Skip to content

Commit 2e2e960

Browse files
authored
Merge pull request #71 from amosproj/model-fixing
Model fixing
2 parents bb4122b + 41522c9 commit 2e2e960

File tree

9 files changed

+1565
-47
lines changed

9 files changed

+1565
-47
lines changed

amos_team_resources/shell/pipeline_shell_data.py

Lines changed: 351 additions & 20 deletions
Large diffs are not rendered by default.
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
"""
2+
Script to detect flat sensors in the Shell dataset.
3+
4+
A "flat" sensor is one where the values show very little or no variation,
5+
which can negatively impact time series forecasting models.
6+
7+
This script analyzes sensors using multiple criteria:
8+
1. Standard deviation (low std = flat)
9+
2. Unique value count (few unique values = flat)
10+
3. Coefficient of variation (CV = std/mean)
11+
4. Range of values (max - min)
12+
5. Percentage of most common value
13+
"""
14+
15+
import pandas as pd
16+
import numpy as np
17+
from pathlib import Path
18+
import json
19+
20+
21+
def load_data(data_path):
    """Load the preprocessed Shell data from a parquet or CSV file.

    Args:
        data_path: Path to the data file. A '.parquet' suffix selects the
            parquet reader; anything else falls back to CSV.

    Returns:
        DataFrame with the raw sensor readings (expects a 'TagName' column).
    """
    print(f"Loading data from: {data_path}")

    # Pick the reader from the file extension, then load in one call.
    reader = pd.read_parquet if str(data_path).endswith('.parquet') else pd.read_csv
    df = reader(data_path)

    print(f"Loaded {len(df):,} rows with {len(df.columns)} columns")
    print(f"Unique sensors: {df['TagName'].nunique()}")

    return df
34+
35+
36+
def analyze_sensor_variation(df, output_path='flat_sensor_analysis.json'):
    """
    Analyze each sensor for flatness using multiple metrics.

    Args:
        df: DataFrame with columns [TagName, EventTime, Value, ...]
        output_path: Unused; kept for backward compatibility with existing
            callers (results are persisted by save_results instead).

    Returns:
        DataFrame with one row of flatness metrics per sensor: std, value
        range, unique-value ratio, coefficient of variation, percentage of
        the most common value, and percentage of zero sequential diffs.
    """
    print("\nAnalyzing sensor variation...")

    # Filter out error values (-1 markers) and missing readings.
    df_clean = df[df['Value'] != -1].dropna(subset=['Value'])

    print(f"After filtering: {len(df_clean):,} rows")

    results = []

    # groupby makes a single O(n) pass over the data instead of re-scanning
    # the whole frame once per sensor; sort=False preserves the original
    # first-appearance sensor order.
    groups = df_clean.groupby('TagName', sort=False)['Value']
    n_sensors = groups.ngroups
    print(f"Analyzing {n_sensors} sensors...")

    for i, (sensor, sensor_data) in enumerate(groups, 1):
        if i % 100 == 0:
            print(f" Progress: {i}/{n_sensors} sensors analyzed")

        # Need at least two points to say anything about variation.
        if len(sensor_data) < 2:
            continue

        values = sensor_data.values
        n_points = len(values)

        # Basic statistics
        mean_val = np.mean(values)
        std_val = np.std(values)
        min_val = np.min(values)
        max_val = np.max(values)
        value_range = max_val - min_val

        # Unique values and the share of the single most frequent one.
        # (n_points >= 2 guarantees at least one unique value, so no
        # empty-array guard is needed here.)
        unique_vals, counts = np.unique(values, return_counts=True)
        n_unique = len(unique_vals)
        unique_ratio = n_unique / n_points
        most_common_pct = (np.max(counts) / n_points) * 100

        # Coefficient of variation (normalized standard deviation);
        # guard against a near-zero mean to avoid a division blow-up.
        if abs(mean_val) > 1e-10:
            cv = abs(std_val / mean_val)
        else:
            cv = 0.0 if std_val < 1e-10 else float('inf')

        # Sequential differences: how much consecutive values change.
        # len(values) >= 2 here, so diffs is never empty.
        diffs = np.diff(values)
        mean_abs_diff = np.mean(np.abs(diffs))
        pct_zero_diff = (np.sum(diffs == 0) / len(diffs)) * 100

        results.append({
            'sensor': sensor,
            'n_points': n_points,
            'mean': mean_val,
            'std': std_val,
            'min': min_val,
            'max': max_val,
            'range': value_range,
            'n_unique': n_unique,
            'unique_ratio': unique_ratio,
            'most_common_pct': most_common_pct,
            # Cap infinite CV (std > 0 with mean ~ 0) at a sentinel value
            # so downstream CSV/JSON serialization stays clean.
            'cv': cv if not np.isinf(cv) else 999.0,
            'mean_abs_diff': mean_abs_diff,
            'pct_zero_diff': pct_zero_diff
        })

    results_df = pd.DataFrame(results)

    print(f"\nCompleted analysis of {len(results_df)} sensors")

    return results_df
130+
131+
132+
def identify_flat_sensors(results_df,
                          std_threshold=0.01,
                          unique_ratio_threshold=0.01,
                          cv_threshold=0.01,
                          range_threshold=0.01,
                          most_common_threshold=95.0,
                          zero_diff_threshold=95.0):
    """
    Identify flat sensors based on multiple criteria.

    A sensor is classified as flat if it meets ANY single criterion.

    Args:
        results_df: DataFrame with sensor analysis results (needs columns
            std, unique_ratio, cv, range, most_common_pct, pct_zero_diff).
        std_threshold: Maximum standard deviation for flat sensor
        unique_ratio_threshold: Maximum ratio of unique values
        cv_threshold: Maximum coefficient of variation
        range_threshold: Maximum range (max - min)
        most_common_threshold: Minimum percentage of most common value
        zero_diff_threshold: Minimum percentage of zero differences

    Returns:
        Dict with 'flat_sensors' / 'non_flat_sensors' DataFrames (the flat
        one gains a 'flatness_reasons' column) and 'criteria' boolean masks.
    """
    print("\nIdentifying flat sensors...")
    print(f"Criteria:")
    print(f" - Standard deviation <= {std_threshold}")
    print(f" - Unique value ratio <= {unique_ratio_threshold}")
    print(f" - Coefficient of variation <= {cv_threshold}")
    print(f" - Value range <= {range_threshold}")
    print(f" - Most common value >= {most_common_threshold}%")
    print(f" - Zero differences >= {zero_diff_threshold}%")

    # One boolean mask per flatness criterion, index-aligned with results_df.
    masks = {
        'low_std': results_df['std'] <= std_threshold,
        'low_unique_ratio': results_df['unique_ratio'] <= unique_ratio_threshold,
        'low_cv': results_df['cv'] <= cv_threshold,
        'low_range': results_df['range'] <= range_threshold,
        'high_common_value': results_df['most_common_pct'] >= most_common_threshold,
        'high_zero_diff': results_df['pct_zero_diff'] >= zero_diff_threshold
    }

    # Combine criteria: a sensor is flat if it meets ANY of them.
    flat_mask = pd.Series(False, index=results_df.index)
    for mask in masks.values():
        flat_mask |= mask

    flat_sensors = results_df[flat_mask].copy()
    non_flat_sensors = results_df[~flat_mask].copy()

    # Record which criteria each flat sensor met as a comma-separated list,
    # in masks-dict order. This replaces the original's fragile
    # boolean-Series-indexed-by-boolean-Series construction with a
    # straightforward per-row lookup that produces identical strings.
    flat_sensors['flatness_reasons'] = [
        ', '.join(name for name, mask in masks.items() if mask.loc[idx])
        for idx in flat_sensors.index
    ]

    total = len(results_df)
    print(f"\nResults:")
    print(f" Total sensors: {total}")
    if total:  # guard: the original divided by zero on an empty analysis
        print(f" Flat sensors: {len(flat_sensors)} ({len(flat_sensors)/total*100:.1f}%)")
        print(f" Non-flat sensors: {len(non_flat_sensors)} ({len(non_flat_sensors)/total*100:.1f}%)")

        # Count sensors by criterion
        print(f"\nBreakdown by criterion:")
        for criterion, mask in masks.items():
            count = mask.sum()
            print(f" {criterion}: {count} sensors ({count/total*100:.1f}%)")

    return {
        'flat_sensors': flat_sensors,
        'non_flat_sensors': non_flat_sensors,
        'criteria': masks
    }
210+
211+
212+
def save_results(results_df, flat_info, output_dir='preprocessing'):
    """Save analysis results to files.

    Writes four artifacts into output_dir: the full per-sensor analysis
    CSV, flat/non-flat sensor list CSVs, and a JSON summary. Also prints
    the ten flattest sensors (lowest std) for a quick console review.

    Args:
        results_df: Per-sensor metrics from analyze_sensor_variation.
        flat_info: Dict from identify_flat_sensors with keys
            'flat_sensors', 'non_flat_sensors' and 'criteria'.
        output_dir: Directory for the outputs (created if missing).
    """
    output_dir = Path(output_dir)
    # Bug fix: the directory was never created, so every write below failed
    # when output_dir did not already exist.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save full analysis
    full_results_path = output_dir / 'sensor_variation_analysis.csv'
    results_df.to_csv(full_results_path, index=False)
    print(f"\nFull analysis saved to: {full_results_path}")

    # Save flat sensors list
    flat_sensors_path = output_dir / 'flat_sensors.csv'
    flat_info['flat_sensors'].to_csv(flat_sensors_path, index=False)
    print(f"Flat sensors list saved to: {flat_sensors_path}")

    # Save non-flat sensors list
    non_flat_sensors_path = output_dir / 'non_flat_sensors.csv'
    flat_info['non_flat_sensors'].to_csv(non_flat_sensors_path, index=False)
    print(f"Non-flat sensors list saved to: {non_flat_sensors_path}")

    # Save summary JSON (counts, percentage, names, per-criterion counts)
    summary = {
        'total_sensors': len(results_df),
        'flat_sensors_count': len(flat_info['flat_sensors']),
        'non_flat_sensors_count': len(flat_info['non_flat_sensors']),
        'flat_percentage': len(flat_info['flat_sensors']) / len(results_df) * 100,
        'flat_sensor_names': flat_info['flat_sensors']['sensor'].tolist(),
        'criterion_counts': {
            # int() converts numpy integers so json.dump accepts them
            criterion: int(mask.sum())
            for criterion, mask in flat_info['criteria'].items()
        }
    }

    summary_path = output_dir / 'flat_sensor_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"Summary saved to: {summary_path}")

    # Print top 10 flattest sensors
    print("\n" + "="*80)
    print("TOP 10 FLATTEST SENSORS (by standard deviation):")
    print("="*80)
    top_flat = flat_info['flat_sensors'].nsmallest(10, 'std')
    for _, row in top_flat.iterrows():
        print(f"\n{row['sensor']}:")
        print(f" Data points: {row['n_points']:,}")
        print(f" Mean: {row['mean']:.4f}, Std: {row['std']:.6f}")
        print(f" Range: [{row['min']:.4f}, {row['max']:.4f}] (span: {row['range']:.6f})")
        print(f" Unique values: {row['n_unique']} ({row['unique_ratio']*100:.2f}%)")
        print(f" Most common value: {row['most_common_pct']:.1f}% of data")
        print(f" Zero differences: {row['pct_zero_diff']:.1f}%")
        print(f" Flatness reasons: {row['flatness_reasons']}")
263+
264+
265+
def main():
    """Run the full flat-sensor detection pipeline end to end."""
    banner = "=" * 80
    print(banner)
    print("FLAT SENSOR DETECTION FOR SHELL DATASET")
    print(banner)

    # Configuration: preprocessed input file and report output directory.
    data_path = 'amos_team_resources/shell/preprocessing/ShellData_preprocessed.parquet'
    output_dir = 'amos_team_resources/shell/preprocessing'

    # Pipeline: load -> analyze variation -> classify -> persist results.
    sensor_df = load_data(data_path)
    variation_df = analyze_sensor_variation(sensor_df)
    classification = identify_flat_sensors(variation_df)
    save_results(variation_df, classification, output_dir)

    print("\n" + banner)
    print("ANALYSIS COMPLETE")
    print(banner)
290+
291+
292+
# Script entry point: run the detection pipeline only when executed
# directly, so the module can be imported without side effects.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)