|
| 1 | +# Advanced Examples |
| 2 | + |
| 3 | +## California Housing End-to-End |
| 4 | +```python |
| 5 | +import pandas as pd |
| 6 | +from sklearn.datasets import fetch_california_housing |
| 7 | +from statclean import StatClean |
| 8 | + |
| 9 | +housing = fetch_california_housing() |
| 10 | +df = pd.DataFrame(housing.data, columns=housing.feature_names) |
| 11 | +df['PRICE'] = housing.target |
| 12 | + |
| 13 | +cleaner = StatClean(df, preserve_index=True) |
| 14 | + |
| 15 | +# Analyze & clean selected features |
| 16 | +features = ['MedInc', 'AveRooms', 'PRICE'] |
| 17 | +cleaned_df, info = cleaner.clean_columns(features, method='auto', show_progress=True) |
| 18 | + |
| 19 | +# Multivariate check |
| 20 | +mv_outliers = cleaner.detect_outliers_mahalanobis(['MedInc', 'AveRooms', 'PRICE'], chi2_threshold=0.975) |
| 21 | +print('Multivariate outliers:', mv_outliers.sum()) |
| 22 | + |
| 23 | +# Visualization grid |
| 24 | +figs = cleaner.plot_outlier_analysis(features) |
| 25 | +``` |
| 26 | + |
| 27 | +## Financial Data Preprocessing |
| 28 | +```python |
| 29 | +import pandas as pd |
| 30 | +import numpy as np |
| 31 | +from statclean import StatClean |
| 32 | + |
| 33 | +# Simulate financial returns data |
| 34 | +np.random.seed(42) |
| 35 | +returns = np.random.normal(0.001, 0.02, 1000) # Daily returns |
| 36 | +prices = 100 * np.cumprod(1 + returns) |
| 37 | +volumes = np.random.lognormal(15, 1, 1000) |
| 38 | + |
| 39 | +# Add some outliers (market crashes/spikes) |
| 40 | +returns[250] = -0.15 # Market crash |
| 41 | +returns[500] = 0.08 # Large gain |
| 42 | +volumes[100] = volumes[100] * 50 # Volume spike |
| 43 | + |
| 44 | +df = pd.DataFrame({ |
| 45 | + 'returns': returns, |
| 46 | + 'prices': prices, |
| 47 | + 'volume': volumes, |
| 48 | + 'volatility': pd.Series(returns).rolling(20).std() |
| 49 | +}) |
| 50 | + |
| 51 | +cleaner = StatClean(df.dropna(), preserve_index=True) |
| 52 | + |
| 53 | +# Financial outlier detection with domain-specific thresholds |
| 54 | +financial_features = ['returns', 'volume', 'volatility'] |
| 55 | + |
| 56 | +# Statistical significance testing for returns |
| 57 | +grubbs_results = {} |
| 58 | +for feature in financial_features: |
| 59 | + result = cleaner.grubbs_test(feature, alpha=0.01) # Stricter alpha for finance |
| 60 | + grubbs_results[feature] = result |
| 61 | + print(f"{feature}: Outlier detected = {result['is_outlier']}, p-value = {result['p_value']:.6f}") |
| 62 | + |
| 63 | +# Conservative cleaning with winsorization (preserve extreme but valid movements) |
| 64 | +cleaner.winsorize_outliers_percentile('volume', lower_percentile=1, upper_percentile=99) |
| 65 | +cleaner.winsorize_outliers_percentile('volatility', lower_percentile=5, upper_percentile=95) |
| 66 | + |
| 67 | +# More aggressive cleaning for returns (likely data errors) |
| 68 | +cleaner.remove_outliers_modified_zscore('returns', threshold=4.0) # Conservative threshold |
| 69 | + |
| 70 | +cleaned_df = cleaner.clean_df |
| 71 | +print(f"Original shape: {df.shape}, Cleaned shape: {cleaned_df.shape}") |
| 72 | +``` |
| 73 | + |
| 74 | +## Time Series Sensor Data |
| 75 | +```python |
| 76 | +import pandas as pd |
| 77 | +import numpy as np |
| 78 | +from datetime import datetime, timedelta |
| 79 | +from statclean import StatClean |
| 80 | + |
| 81 | +# Simulate IoT sensor data |
| 82 | +np.random.seed(123) |
| 83 | +dates = pd.date_range(start='2024-01-01', periods=2000, freq='h') |
| 84 | +base_temp = 20 + 10 * np.sin(2 * np.pi * np.arange(2000) / 24) # Daily cycle |
| 85 | +noise = np.random.normal(0, 2, 2000) |
| 86 | +temperatures = base_temp + noise |
| 87 | + |
| 88 | +# Add sensor malfunctions and anomalies |
| 89 | +temperatures[500:510] = -999 # Sensor error (impossible temperature) |
| 90 | +temperatures[1000] = 150 # Sensor spike |
| 91 | +temperatures[1500:1505] = np.nan # Missing readings |
| 92 | + |
| 93 | +humidity = np.clip(50 + 30 * np.sin(2 * np.pi * np.arange(2000) / 24) + np.random.normal(0, 5, 2000), 0, 100) |
| 94 | +pressure = 1013 + np.random.normal(0, 15, 2000) |
| 95 | + |
| 96 | +df = pd.DataFrame({ |
| 97 | + 'timestamp': dates, |
| 98 | + 'temperature': temperatures, |
| 99 | + 'humidity': humidity, |
| 100 | + 'pressure': pressure |
| 101 | +}) |
| 102 | + |
| 103 | +# Handle time series specific preprocessing |
| 104 | +df = df[df['temperature'] > -50]  # Remove impossible sensor readings first (also drops the NaN readings, since NaN comparisons are False) |
| 105 | +cleaner = StatClean(df, preserve_index=True) |
| 106 | + |
| 107 | +# Time series outlier detection with domain knowledge |
| 108 | +sensor_features = ['temperature', 'humidity', 'pressure'] |
| 109 | + |
| 110 | +# Distribution analysis for each sensor |
| 111 | +for feature in sensor_features: |
| 112 | + analysis = cleaner.analyze_distribution(feature) |
| 113 | + print(f"\n{feature} Analysis:") |
| 114 | + print(f" Skewness: {analysis['skewness']:.3f}") |
| 115 | + print(f" Recommended method: {analysis['recommended_method']}") |
| 116 | + |
| 117 | + # Apply recommended transformation if highly skewed |
| 118 | + if abs(analysis['skewness']) > 2: |
| 119 | + cleaner.transform_boxcox(feature) |
| 120 | + |
| 121 | +# Gentle cleaning for sensor data (preserve natural variation) |
| 122 | +cleaned_df, info = cleaner.clean_columns( |
| 123 | + sensor_features, |
| 124 | + method='modified_zscore', # Robust to occasional spikes |
| 125 | + show_progress=True |
| 126 | +) |
| 127 | + |
| 128 | +# Time series specific visualization |
| 129 | +for feature in sensor_features: |
| 130 | + print(f"\n{feature} Cleaning Results:") |
| 131 | + print(f" Method used: {info[feature]['method_used']}") |
| 132 | + print(f" Outliers removed: {info[feature]['outliers_removed']}") |
| 133 | + |
| 134 | +# Generate comprehensive plots for time series data |
| 135 | +figs = cleaner.plot_outlier_analysis(sensor_features) |
| 136 | +``` |
| 137 | + |
| 138 | +## Modified Z-score Visualization |
| 139 | +```python |
| 140 | +outliers = cleaner.detect_outliers_modified_zscore('PRICE') |
| 141 | +cleaner.remove_outliers_modified_zscore('PRICE') |
| 142 | +cleaner.visualize_outliers('PRICE') |
| 143 | +``` |
| 144 | + |
| 145 | +## Method Comparison for Research Data |
| 146 | +```python |
| 147 | +import pandas as pd |
| 148 | +import numpy as np |
| 149 | +from statclean import StatClean |
| 149 | + |
| 150 | +# Simulate experimental research data |
| 151 | +np.random.seed(456) |
| 152 | +df = pd.DataFrame({ |
| 153 | + 'reaction_time': np.random.gamma(2, 0.15, 500), # Skewed distribution |
| 154 | + 'accuracy': np.random.beta(8, 2, 500) * 100, # Bounded data |
| 155 | + 'confidence': np.random.normal(7, 1.5, 500) # Normal-ish data |
| 156 | +}) |
| 157 | + |
| 158 | +# Add some experimental outliers |
| 159 | +df.loc[50:52, 'reaction_time'] *= 5 # Participant distraction |
| 160 | +df.loc[100, 'accuracy'] = 30 # Data entry error |
| 161 | +df.loc[200:205, 'confidence'] = np.nan # Missing responses |
| 162 | + |
| 163 | +cleaner = StatClean(df.dropna(), preserve_index=True) |
| 164 | + |
| 165 | +# Compare detection methods for research validity |
| 166 | +research_features = ['reaction_time', 'accuracy', 'confidence'] |
| 167 | +comparison = cleaner.compare_methods( |
| 168 | + research_features, |
| 169 | + methods=['iqr', 'zscore', 'modified_zscore', 'grubbs'] |
| 170 | +) |
| 171 | + |
| 172 | +# Statistical reporting for publication |
| 173 | +print("Method Agreement Analysis for Research Data:") |
| 174 | +for feature in research_features: |
| 175 | + print(f"\n{feature}:") |
| 176 | + print(f" {comparison[feature]['summary']}") |
| 177 | + |
| 178 | + # Formal statistical tests |
| 179 | + grubbs_result = cleaner.grubbs_test(feature, alpha=0.05) |
| 180 | + dixon_result = cleaner.dixon_q_test(feature, alpha=0.05) |
| 181 | + |
| 182 | + print(f" Grubbs test: p = {grubbs_result['p_value']:.6f}") |
| 183 | + print(f" Dixon Q test: p = {dixon_result['p_value']:.6f}") |
| 184 | + |
| 185 | +# Generate publication-quality report |
| 186 | +summary_report = cleaner.get_summary_report() |
| 187 | +print("\nPublication Summary:") |
| 188 | +print(summary_report) |
| 189 | +``` |
| 190 | + |
| 191 | +[Back to top](#advanced-examples) |
0 commit comments