enh: Add dataset_utils.py to control the "new" in "new data"
#1661
base: main
dataset_utils.py (new file)
@@ -0,0 +1,315 @@

```python
"""Utilities for dataset comparison and validation."""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Union, Optional, Any
import warnings
from scipy import stats


def compare_datasets(
    dataset1: Union[pd.DataFrame, np.ndarray],
    dataset2: Union[pd.DataFrame, np.ndarray],
    id_column: Optional[str] = None,
    verbose: bool = True,
    feature_distribution: bool = False,
) -> Dict[str, Any]:
    """
    Compare two datasets and return a summary of differences to detect potential data leakage.

    This function helps identify overlapping data between training and evaluation datasets,
    which could lead to overoptimistic model performance estimates.

    Parameters
    ----------
    dataset1 : DataFrame or array
        First dataset for comparison (e.g., training data)
    dataset2 : DataFrame or array
        Second dataset for comparison (e.g., test data)
    id_column : str, optional
        Column name to use for identifying unique rows. If provided and present in both
        datasets, will be used to find overlapping records.
    verbose : bool, default=True
        Whether to print a comparison summary
    feature_distribution : bool, default=False
        Whether to compare statistical distributions of shared numerical features

    Returns
    -------
    Dict
        Dictionary containing comparison results with the following keys:
        - shape_comparison: Information about dataset dimensions
        - column_comparison: Analysis of columns in both datasets
        - overlap_analysis: Information about overlapping data points
        - feature_distribution: Statistical tests comparing distributions (if requested)

    Examples
    --------
    >>> import pandas as pd
    >>> from skore.dataset_utils import compare_datasets
    >>>
    >>> # Basic comparison with ID columns
    >>> train_df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
    >>> test_df = pd.DataFrame({"id": [3, 4, 5], "value": [30, 40, 50]})
    >>> result = compare_datasets(train_df, test_df, id_column="id")

    >>> # Compare data used with report_metrics to check for potential data leakage
    >>> from skore import EstimatorReport
    >>> from sklearn.ensemble import RandomForestClassifier
    >>>
    >>> # First create and train a model with training data
    >>> X_train = pd.DataFrame({"feature1": [1, 2, 3], "feature2": [4, 5, 6]})
    >>> y_train = [0, 0, 1]
    >>> X_new = pd.DataFrame({"feature1": [2, 3, 4], "feature2": [5, 6, 7]})
    >>> y_new = [0, 1, 1]
    >>>
    >>> # Check if "new" data overlaps with training data before using it
    >>> leakage_check = compare_datasets(X_train, X_new)
    >>>
    >>> # Only use for evaluation if overlap is minimal
    >>> model = RandomForestClassifier()
    >>> report = EstimatorReport(model, X_train=X_train, y_train=y_train)
    >>> if leakage_check["overlap_analysis"].get("overlap_percentage", 0) < 10:
    ...     report.report_metrics(X_y=(X_new, y_new))
    ... else:
    ...     print("Warning: Significant overlap detected with training data")
    """
    # Convert numpy arrays to pandas DataFrames if needed
    if isinstance(dataset1, np.ndarray):
        dataset1 = pd.DataFrame(dataset1)
    if isinstance(dataset2, np.ndarray):
        dataset2 = pd.DataFrame(dataset2)

    # Initialize results dictionary
    results = {
        "shape_comparison": {
            "dataset1_shape": dataset1.shape,
            "dataset2_shape": dataset2.shape,
            "same_shape": dataset1.shape == dataset2.shape
        },
        "column_comparison": {},
        "overlap_analysis": {}
    }

    # Compare columns
    dataset1_cols = set(dataset1.columns)
    dataset2_cols = set(dataset2.columns)

    results["column_comparison"] = {
        "shared_columns": list(dataset1_cols.intersection(dataset2_cols)),
        "only_in_dataset1": list(dataset1_cols - dataset2_cols),
        "only_in_dataset2": list(dataset2_cols - dataset1_cols),
        "same_columns": dataset1_cols == dataset2_cols
    }

    # Check for overlapping data
    if id_column and id_column in dataset1.columns and id_column in dataset2.columns:
        # Using specified ID column
        dataset1_ids = set(dataset1[id_column])
        dataset2_ids = set(dataset2[id_column])

        overlap_ids = dataset1_ids.intersection(dataset2_ids)
        results["overlap_analysis"] = {
            "overlapping_ids": list(overlap_ids),
            "overlap_count": len(overlap_ids),
            "overlap_percentage": round(len(overlap_ids) / len(dataset1_ids) * 100, 2)
        }
    else:
        # Try to check for duplicate rows
        try:
            shared_columns = results["column_comparison"]["shared_columns"]
            if shared_columns:
                # Check overlapping rows based on shared columns
                dataset1_subset = dataset1[shared_columns].drop_duplicates()
                dataset2_subset = dataset2[shared_columns].drop_duplicates()

                merged = pd.merge(
                    dataset1_subset, dataset2_subset,
                    on=shared_columns, how='inner'
                )

                overlap_count = len(merged)
                results["overlap_analysis"] = {
                    "overlap_row_count": overlap_count,
                    "overlap_percentage": round(overlap_count / len(dataset1) * 100, 2)
```
Comment on lines +123 to +134
Contributor:
Whenever it's not super plain to do, can you split the big function into sub-functions, please? This way, it's much easier to test all the use cases, and also easier to read.
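To make the suggestion concrete, here is a minimal sketch (not part of the PR) of how the overlap check could be pulled out into a private helper; the name `_analyze_overlap` and its exact signature are assumptions, and the sketch reuses the module-level pandas import:

```python
def _analyze_overlap(dataset1, dataset2, id_column, shared_columns):
    """Hypothetical helper: build the "overlap_analysis" part of the results dict."""
    if id_column and id_column in dataset1.columns and id_column in dataset2.columns:
        # ID-based overlap
        dataset1_ids = set(dataset1[id_column])
        dataset2_ids = set(dataset2[id_column])
        overlap_ids = dataset1_ids.intersection(dataset2_ids)
        return {
            "overlapping_ids": list(overlap_ids),
            "overlap_count": len(overlap_ids),
            "overlap_percentage": round(len(overlap_ids) / len(dataset1_ids) * 100, 2),
        }
    if not shared_columns:
        return {"message": "No shared columns to check for overlapping rows"}
    # Row-based overlap on the shared columns
    merged = pd.merge(
        dataset1[shared_columns].drop_duplicates(),
        dataset2[shared_columns].drop_duplicates(),
        on=shared_columns,
        how="inner",
    )
    return {
        "overlap_row_count": len(merged),
        "overlap_percentage": round(len(merged) / len(dataset1) * 100, 2),
    }
```

compare_datasets would then reduce to a single `results["overlap_analysis"] = _analyze_overlap(...)` call, and each branch can be unit-tested in isolation.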
```python
                }
            else:
                results["overlap_analysis"] = {
                    "message": "No shared columns to check for overlapping rows"
                }
        except Exception as e:
```
Contributor:
What are the difficulties that could trigger a failure here? The user should be warned plainly if there is an interruption somewhere. As I understand it, they would currently have to dig through and read the whole output to notice it.
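One way to surface the failure, sketched here only as a suggestion, is to emit a warning in the except branch in addition to storing the message, so the user does not have to inspect the returned dictionary:

```python
        except Exception as e:
            # Suggested change: warn immediately instead of only recording the message
            warnings.warn(
                f"Overlap analysis failed and was skipped: {e}",
                UserWarning,
            )
            results["overlap_analysis"] = {
                "message": f"Could not analyze overlap: {str(e)}"
            }
```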
```python
            results["overlap_analysis"] = {
                "message": f"Could not analyze overlap: {str(e)}"
            }

    # Compare feature distributions if requested
    if feature_distribution:
        results["feature_distribution"] = {}
        shared_columns = results["column_comparison"]["shared_columns"]

        for col in shared_columns:
            # Check if column is numeric
            if pd.api.types.is_numeric_dtype(dataset1[col]) and pd.api.types.is_numeric_dtype(dataset2[col]):
                try:
```
Contributor:
What are the difficulties that could trigger a failure? The user should be warned plainly if there is an interruption somewhere. As I understand it, they would have to dig through and read the whole output to be able to see it.
```python
                    # Perform Kolmogorov-Smirnov test for distribution comparison
```
Contributor:
Really nice; we can enrich this with other tests later, but it's a good start.
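As one possible later enrichment (not in this PR), shared non-numeric columns could be compared with a chi-square test on their value counts; the helper name below is purely illustrative and assumes both columns are non-empty:

```python
import pandas as pd
from scipy.stats import chi2_contingency


def _compare_categorical_distribution(s1: pd.Series, s2: pd.Series) -> dict:
    """Hypothetical helper: chi-square test on the value counts of a shared categorical column."""
    # Contingency table: one row per dataset, one column per category
    counts = pd.concat(
        [s1.value_counts(), s2.value_counts()],
        axis=1,
        keys=["dataset1", "dataset2"],
    ).fillna(0)
    chi2, p_value, _, _ = chi2_contingency(counts.T.values)
    return {
        "chi2_statistic": chi2,
        "p_value": p_value,
        "same_distribution": p_value > 0.05,  # same threshold as the KS test above
    }
```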
```python
                    ks_stat, p_value = stats.ks_2samp(
                        dataset1[col].dropna().values,
                        dataset2[col].dropna().values
                    )

                    results["feature_distribution"][col] = {
                        "ks_statistic": ks_stat,
                        "p_value": p_value,
                        "same_distribution": p_value > 0.05,  # Common threshold
                        "dataset1_mean": dataset1[col].mean(),
                        "dataset2_mean": dataset2[col].mean(),
                        "dataset1_std": dataset1[col].std(),
                        "dataset2_std": dataset2[col].std(),
                    }
```
Comment on lines +155 to +168
Contributor:
This can also be put in a sub-function.
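A sketch of that extraction, with an assumed helper name; the try/except in the main loop would then wrap a single call:

```python
def _compare_numeric_distribution(s1: pd.Series, s2: pd.Series) -> dict:
    """Hypothetical helper: KS test plus summary statistics for one shared numeric column."""
    ks_stat, p_value = stats.ks_2samp(s1.dropna().values, s2.dropna().values)
    return {
        "ks_statistic": ks_stat,
        "p_value": p_value,
        "same_distribution": p_value > 0.05,  # Common threshold
        "dataset1_mean": s1.mean(),
        "dataset2_mean": s2.mean(),
        "dataset1_std": s1.std(),
        "dataset2_std": s2.std(),
    }
```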
```python
                except Exception as e:
                    results["feature_distribution"][col] = {
                        "error": str(e)
                    }

    # Print summary if verbose
    if verbose:
        print("Dataset Comparison Summary:")
        print(f"- Dataset 1 shape: {results['shape_comparison']['dataset1_shape']}")
        print(f"- Dataset 2 shape: {results['shape_comparison']['dataset2_shape']}")

        print("\nColumn comparison:")
        if results["column_comparison"]["same_columns"]:
            print("- Both datasets have identical columns")
        else:
            if results["column_comparison"]["only_in_dataset1"]:
                print(f"- Columns only in dataset 1: {results['column_comparison']['only_in_dataset1']}")
            if results["column_comparison"]["only_in_dataset2"]:
                print(f"- Columns only in dataset 2: {results['column_comparison']['only_in_dataset2']}")
```
Contributor:
It could be interesting here to also state which columns are common to both datasets.
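For example (a suggestion only), one extra block in the verbose summary could list them:

```python
        # Suggested addition: also list the columns shared by both datasets
        if results["column_comparison"]["shared_columns"]:
            print(f"- Shared columns: {results['column_comparison']['shared_columns']}")
```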
```python
        print("\nOverlap analysis:")
        if "overlap_count" in results["overlap_analysis"]:
            print(f"- {results['overlap_analysis']['overlap_count']} overlapping IDs found")
            print(f"- {results['overlap_analysis']['overlap_percentage']}% of dataset 1 IDs are in dataset 2")
            if results['overlap_analysis']['overlap_count'] > 0:
                warnings.warn(
                    f"Datasets have {results['overlap_analysis']['overlap_count']} overlapping IDs. "
                    f"This may lead to data leakage if one dataset was used for training."
                )
        elif "overlap_row_count" in results["overlap_analysis"]:
            print(f"- {results['overlap_analysis']['overlap_row_count']} overlapping rows found")
            print(f"- {results['overlap_analysis']['overlap_percentage']}% of dataset 1 rows are in dataset 2")
            if results['overlap_analysis']['overlap_row_count'] > 0:
                warnings.warn(
                    f"Datasets have {results['overlap_analysis']['overlap_row_count']} overlapping rows. "
                    f"This may lead to data leakage if one dataset was used for training."
                )
        else:
            print(f"- {results['overlap_analysis']['message']}")

        # Print feature distribution results if available
        if feature_distribution and "feature_distribution" in results:
            print("\nFeature distribution comparison:")
            for col, stats_data in results["feature_distribution"].items():
                if "error" in stats_data:
                    print(f"- {col}: Error in comparison - {stats_data['error']}")
                    continue

                if stats_data["same_distribution"]:
                    print(f"- {col}: Similar distributions (p={stats_data['p_value']:.4f})")
                else:
                    print(f"- {col}: Different distributions (p={stats_data['p_value']:.4f})")
                    print(f"  Dataset 1: mean={stats_data['dataset1_mean']:.4f}, std={stats_data['dataset1_std']:.4f}")
                    print(f"  Dataset 2: mean={stats_data['dataset2_mean']:.4f}, std={stats_data['dataset2_std']:.4f}")

    return results


def check_data_leakage(
    report,
    new_data,
    id_column=None,
    threshold=0.05,
    verbose=True
):
    """
    Check if new data overlaps with training data in a report object.

    This utility helps ensure that when using report_metrics with new data (X_y syntax),
    the data is actually "new" and doesn't overlap significantly with the training data.

    Parameters
    ----------
    report : EstimatorReport or ComparisonReport
        The report object containing a reference to the training data
    new_data : DataFrame or array
        New data to check for overlap with the training data
    id_column : str, optional
        Column name to use for identifying unique rows
    threshold : float, default=0.05
        Maximum acceptable overlap percentage (0.05 = 5%)
    verbose : bool, default=True
        Whether to print a comparison summary

    Returns
    -------
    bool
        True if significant overlap detected (exceeding threshold)
    dict
        Detailed leakage report from compare_datasets

    Examples
    --------
    >>> import pandas as pd
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from skore import EstimatorReport
    >>> from skore.dataset_utils import check_data_leakage
    >>>
    >>> # Create and train model
    >>> X_train = pd.DataFrame({"feature1": [1, 2, 3], "feature2": [4, 5, 6]})
    >>> y_train = [0, 0, 1]
    >>> X_new = pd.DataFrame({"feature1": [2, 3, 4], "feature2": [5, 6, 7]})
    >>> y_new = [0, 1, 1]
    >>>
    >>> model = RandomForestClassifier()
    >>> report = EstimatorReport(model, X_train=X_train, y_train=y_train)
    >>>
    >>> # Check for data leakage before evaluating on "new" data
    >>> has_leakage, leakage_report = check_data_leakage(report, X_new)
    >>>
    >>> if not has_leakage:
    ...     report.report_metrics(X_y=(X_new, y_new))
    ... else:
    ...     print(f"Warning: {leakage_report['overlap_analysis']['overlap_percentage']}% overlap detected")
    """
    import warnings

    # Extract training data from report object
    if hasattr(report, "X_train") and report.X_train is not None:
        # EstimatorReport case
        train_data = report.X_train
    elif hasattr(report, "reports_") and len(report.reports_) > 0 and hasattr(report.reports_[0], "X_train"):
```
Contributor:
I took too long to give you a review; something changed in ComparisonReport: it's now possible to have EstimatorReports with different X_train, as long as the y is the same. Therefore there is no way to know which report the user wants to check.
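One possible way to cope with that ambiguity (sketched here; the helper name and the "check them all" strategy are assumptions, not an agreed design) is to collect every candidate X_train and compare the new data against each of them:

```python
def _training_sets_from_report(report) -> list:
    """Hypothetical helper: collect every candidate X_train held by a report.

    An EstimatorReport yields a single-element list; a ComparisonReport, whose
    sub-reports may now carry different X_train (with a shared y), yields one
    entry per sub-report so the caller can test the new data against each of
    them rather than silently picking the first one.
    """
    if getattr(report, "X_train", None) is not None:
        return [report.X_train]
    if hasattr(report, "reports_"):
        return [
            r.X_train
            for r in report.reports_
            if getattr(r, "X_train", None) is not None
        ]
    return []
```

check_data_leakage could then loop over this list and flag leakage as soon as any training set exceeds the threshold, or report the worst overlap across sub-reports.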
```python
        # ComparisonReport case
        train_data = report.reports_[0].X_train  # Use first report's training data
    else:
        raise ValueError("Could not find training data in the provided report object")

    # Run comparison
    leakage_report = compare_datasets(train_data, new_data, id_column=id_column, verbose=verbose)

    # Determine if overlap exceeds threshold
    overlap_pct = 0
    if "overlap_percentage" in leakage_report["overlap_analysis"]:
        overlap_pct = leakage_report["overlap_analysis"]["overlap_percentage"]
    elif "overlap_row_count" in leakage_report["overlap_analysis"]:
        overlap_pct = leakage_report["overlap_analysis"]["overlap_percentage"]

    significant_overlap = overlap_pct > (threshold * 100)

    if significant_overlap and verbose:
        warnings.warn(
            f"Significant data overlap detected: {overlap_pct:.2f}% of training data "
            f"appears in evaluation data. This exceeds the threshold of {threshold*100:.1f}%. "
            f"This may lead to overly optimistic performance estimates.",
            UserWarning
        )

    return significant_overlap, leakage_report
```
Contributor:
If datasets 1 and 2 have very different lengths (as can happen when dealing with a train and a test set), this percentage can be misleading. Can you either find a more explicit key (all the ideas I have in mind are too long, such as overlap_percentage_to_dataset1), or remove it, please?

Contributor:
Another idea: spell it out completely in the "message" key, and use that.

Contributor:
I see now that you are using it below - let's find a clear name to keep it :)!
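To make the naming suggestion concrete (purely illustrative; the final key name is still open for discussion), the ID-based branch could return both an explicit key and a ready-made message:

```python
def _overlap_summary(overlap_ids: set, dataset1_ids: set) -> dict:
    """Hypothetical helper showing a more explicit key plus a human-readable message."""
    pct = round(len(overlap_ids) / len(dataset1_ids) * 100, 2)
    return {
        "overlap_count": len(overlap_ids),
        # spells out which dataset the percentage refers to
        "overlap_percentage_of_dataset1": pct,
        "message": (
            f"{len(overlap_ids)} of the {len(dataset1_ids)} unique IDs in dataset 1 "
            f"({pct}%) also appear in dataset 2."
        ),
    }
```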