|
| 1 | +""" |
| 2 | +Script to detect flat sensors in the Shell dataset. |
| 3 | +
|
| 4 | +A "flat" sensor is one where the values show very little or no variation, |
| 5 | +which can negatively impact time series forecasting models. |
| 6 | +
|
| 7 | +This script analyzes sensors using multiple criteria: |
| 8 | +1. Standard deviation (low std = flat) |
| 9 | +2. Unique value count (few unique values = flat) |
| 10 | +3. Coefficient of variation (CV = std/mean) |
| 11 | +4. Range of values (max - min) |
| 12 | +5. Percentage of most common value |
| 13 | +""" |
| 14 | + |
| 15 | +import pandas as pd |
| 16 | +import numpy as np |
| 17 | +from pathlib import Path |
| 18 | +import json |
| 19 | + |
| 20 | + |
def load_data(data_path):
    """Load the preprocessed Shell data from a parquet or CSV file.

    The format is chosen from the file extension: '.parquet' is read with
    pandas' parquet reader, anything else is treated as CSV.
    """
    print(f"Loading data from: {data_path}")

    # Pick the reader once, based on the extension, then apply it.
    reader = pd.read_parquet if str(data_path).endswith('.parquet') else pd.read_csv
    df = reader(data_path)

    print(f"Loaded {len(df):,} rows with {len(df.columns)} columns")
    print(f"Unique sensors: {df['TagName'].nunique()}")

    return df
| 34 | + |
| 35 | + |
def analyze_sensor_variation(df, output_path='flat_sensor_analysis.json'):
    """
    Analyze each sensor for flatness using multiple metrics.

    Args:
        df: DataFrame with columns [TagName, EventTime, Value, ...]
        output_path: Unused; kept for backward compatibility with existing
            callers. Results are returned, not written here.

    Returns:
        DataFrame with one row per sensor (sensors with fewer than two
        valid points are skipped): basic statistics (mean/std/min/max/
        range), unique-value metrics, most-common-value percentage,
        coefficient of variation, and sequential-difference metrics.
    """
    print("\nAnalyzing sensor variation...")

    # Filter out error values (-1 markers)
    df_clean = df[df['Value'] != -1].copy()

    # Remove NaN values (NaN != -1 evaluates True, so NaNs survive the
    # filter above and must be dropped explicitly)
    df_clean = df_clean.dropna(subset=['Value'])

    print(f"After filtering: {len(df_clean):,} rows")

    results = []

    sensors = df_clean['TagName'].unique()
    print(f"Analyzing {len(sensors)} sensors...")

    # Perf fix: the original scanned the whole frame once per sensor
    # (O(sensors * rows)). groupby(sort=False) does a single pass and
    # preserves first-appearance order, matching .unique() above.
    grouped = df_clean.groupby('TagName', sort=False)['Value']

    for i, (sensor, sensor_data) in enumerate(grouped, 1):
        if i % 100 == 0:
            print(f"  Progress: {i}/{len(sensors)} sensors analyzed")

        # Need at least two points to measure variation at all.
        if len(sensor_data) < 2:
            continue

        values = sensor_data.values
        n_points = len(values)

        # Basic statistics (np.std default: population std, ddof=0)
        mean_val = np.mean(values)
        std_val = np.std(values)
        min_val = np.min(values)
        max_val = np.max(values)
        value_range = max_val - min_val

        # Unique values and dominance of the most common one.
        # (values is non-empty here, so np.unique always returns >= 1 value;
        # the original's n_unique == 0 branch was unreachable and is removed.)
        unique_vals, counts = np.unique(values, return_counts=True)
        n_unique = len(unique_vals)
        unique_ratio = n_unique / n_points
        most_common_pct = (np.max(counts) / n_points) * 100

        # Coefficient of variation (normalized standard deviation);
        # guard against a near-zero mean blowing the ratio up.
        if abs(mean_val) > 1e-10:
            cv = abs(std_val / mean_val)
        else:
            cv = 0.0 if std_val < 1e-10 else float('inf')

        # Sequential differences: how much consecutive samples change.
        # diffs is non-empty because n_points >= 2 is guaranteed above.
        diffs = np.diff(values)
        mean_abs_diff = np.mean(np.abs(diffs))
        pct_zero_diff = (np.sum(diffs == 0) / len(diffs)) * 100

        results.append({
            'sensor': sensor,
            'n_points': n_points,
            'mean': mean_val,
            'std': std_val,
            'min': min_val,
            'max': max_val,
            'range': value_range,
            'n_unique': n_unique,
            'unique_ratio': unique_ratio,
            'most_common_pct': most_common_pct,
            # Cap infinite CV at a sentinel so downstream CSV/JSON stay finite
            'cv': cv if not np.isinf(cv) else 999.0,
            'mean_abs_diff': mean_abs_diff,
            'pct_zero_diff': pct_zero_diff
        })

    results_df = pd.DataFrame(results)

    print(f"\nCompleted analysis of {len(results_df)} sensors")

    return results_df
| 130 | + |
| 131 | + |
def identify_flat_sensors(results_df,
                          std_threshold=0.01,
                          unique_ratio_threshold=0.01,
                          cv_threshold=0.01,
                          range_threshold=0.01,
                          most_common_threshold=95.0,
                          zero_diff_threshold=95.0):
    """
    Identify flat sensors based on multiple criteria.

    A sensor is classified as flat if it meets ANY single criterion.

    Args:
        results_df: DataFrame with sensor analysis results
        std_threshold: Maximum standard deviation for flat sensor
        unique_ratio_threshold: Maximum ratio of unique values
        cv_threshold: Maximum coefficient of variation
        range_threshold: Maximum range (max - min)
        most_common_threshold: Minimum percentage of most common value
        zero_diff_threshold: Minimum percentage of zero differences

    Returns:
        Dictionary with keys:
            'flat_sensors': DataFrame of flat sensors, with an added
                'flatness_reasons' column listing the criteria met
            'non_flat_sensors': DataFrame of the remaining sensors
            'criteria': dict mapping criterion name -> boolean mask Series
    """
    print("\nIdentifying flat sensors...")
    print(f"Criteria:")
    print(f"  - Standard deviation <= {std_threshold}")
    print(f"  - Unique value ratio <= {unique_ratio_threshold}")
    print(f"  - Coefficient of variation <= {cv_threshold}")
    print(f"  - Value range <= {range_threshold}")
    print(f"  - Most common value >= {most_common_threshold}%")
    print(f"  - Zero differences >= {zero_diff_threshold}%")

    # One boolean mask per criterion; insertion order defines the order
    # of names in the 'flatness_reasons' string below.
    masks = {
        'low_std': results_df['std'] <= std_threshold,
        'low_unique_ratio': results_df['unique_ratio'] <= unique_ratio_threshold,
        'low_cv': results_df['cv'] <= cv_threshold,
        'low_range': results_df['range'] <= range_threshold,
        'high_common_value': results_df['most_common_pct'] >= most_common_threshold,
        'high_zero_diff': results_df['pct_zero_diff'] >= zero_diff_threshold
    }

    # Combine criteria (sensor is flat if it meets ANY of these criteria)
    flat_mask = pd.Series(False, index=results_df.index)
    for mask in masks.values():
        flat_mask |= mask

    flat_sensors = results_df[flat_mask].copy()
    non_flat_sensors = results_df[~flat_mask].copy()

    # Record which criteria each flat sensor meets, comma-separated.
    # (Replaces the original's nested boolean-of-boolean indexing plus
    # trailing-separator stripping with a direct per-row join.)
    flat_sensors['flatness_reasons'] = [
        ', '.join(name for name, mask in masks.items() if mask.loc[idx])
        for idx in flat_sensors.index
    ]

    # Bug fix: guard the percentage prints against an empty analysis
    # frame (the original raised ZeroDivisionError).
    denom = len(results_df) or 1

    print(f"\nResults:")
    print(f"  Total sensors: {len(results_df)}")
    print(f"  Flat sensors: {len(flat_sensors)} ({len(flat_sensors)/denom*100:.1f}%)")
    print(f"  Non-flat sensors: {len(non_flat_sensors)} ({len(non_flat_sensors)/denom*100:.1f}%)")

    # Count sensors by criterion
    print(f"\nBreakdown by criterion:")
    for criterion, mask in masks.items():
        count = mask.sum()
        print(f"  {criterion}: {count} sensors ({count/denom*100:.1f}%)")

    return {
        'flat_sensors': flat_sensors,
        'non_flat_sensors': non_flat_sensors,
        'criteria': masks
    }
| 210 | + |
| 211 | + |
def save_results(results_df, flat_info, output_dir='preprocessing'):
    """
    Save analysis results to CSV/JSON files and print a short report.

    Args:
        results_df: full per-sensor analysis DataFrame
        flat_info: dict from identify_flat_sensors() with keys
            'flat_sensors', 'non_flat_sensors', and 'criteria'
        output_dir: directory for the output files; created if missing
    """
    output_dir = Path(output_dir)
    # Bug fix: create the output directory up front — to_csv/open raise
    # FileNotFoundError when it does not exist.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save full analysis
    full_results_path = output_dir / 'sensor_variation_analysis.csv'
    results_df.to_csv(full_results_path, index=False)
    print(f"\nFull analysis saved to: {full_results_path}")

    # Save flat sensors list
    flat_sensors_path = output_dir / 'flat_sensors.csv'
    flat_info['flat_sensors'].to_csv(flat_sensors_path, index=False)
    print(f"Flat sensors list saved to: {flat_sensors_path}")

    # Save non-flat sensors list
    non_flat_sensors_path = output_dir / 'non_flat_sensors.csv'
    flat_info['non_flat_sensors'].to_csv(non_flat_sensors_path, index=False)
    print(f"Non-flat sensors list saved to: {non_flat_sensors_path}")

    # Save summary JSON (criterion counts cast to int so numpy integers
    # don't break json serialization; percentage guarded for empty input)
    summary = {
        'total_sensors': len(results_df),
        'flat_sensors_count': len(flat_info['flat_sensors']),
        'non_flat_sensors_count': len(flat_info['non_flat_sensors']),
        'flat_percentage': len(flat_info['flat_sensors']) / (len(results_df) or 1) * 100,
        'flat_sensor_names': flat_info['flat_sensors']['sensor'].tolist(),
        'criterion_counts': {
            criterion: int(mask.sum())
            for criterion, mask in flat_info['criteria'].items()
        }
    }

    summary_path = output_dir / 'flat_sensor_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"Summary saved to: {summary_path}")

    # Print top 10 flattest sensors
    print("\n" + "="*80)
    print("TOP 10 FLATTEST SENSORS (by standard deviation):")
    print("="*80)
    top_flat = flat_info['flat_sensors'].nsmallest(10, 'std')
    for _, row in top_flat.iterrows():
        print(f"\n{row['sensor']}:")
        print(f"  Data points: {row['n_points']:,}")
        print(f"  Mean: {row['mean']:.4f}, Std: {row['std']:.6f}")
        print(f"  Range: [{row['min']:.4f}, {row['max']:.4f}] (span: {row['range']:.6f})")
        print(f"  Unique values: {row['n_unique']} ({row['unique_ratio']*100:.2f}%)")
        print(f"  Most common value: {row['most_common_pct']:.1f}% of data")
        print(f"  Zero differences: {row['pct_zero_diff']:.1f}%")
        print(f"  Flatness reasons: {row['flatness_reasons']}")
| 263 | + |
| 264 | + |
def main():
    """Run the full flat-sensor detection pipeline: load, analyze, classify, save."""
    banner = "=" * 80
    print(banner)
    print("FLAT SENSOR DETECTION FOR SHELL DATASET")
    print(banner)

    # Configuration: input parquet and output directory for the reports
    data_path = 'amos_team_resources/shell/preprocessing/ShellData_preprocessed.parquet'
    output_dir = 'amos_team_resources/shell/preprocessing'

    # Pipeline: load -> per-sensor variation analysis -> flat/non-flat
    # classification -> persist CSV/JSON reports.
    df = load_data(data_path)
    results_df = analyze_sensor_variation(df)
    flat_info = identify_flat_sensors(results_df)
    save_results(results_df, flat_info, output_dir)

    print("\n" + banner)
    print("ANALYSIS COMPLETE")
    print(banner)
| 290 | + |
| 291 | + |
# Entry-point guard: run the pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()