Skip to content

Commit 005f050

Browse files
cleaning up data
1 parent 8079896 commit 005f050

17 files changed

+7272
-4431
lines changed

data/data_normalized/bird_probabilities.json

Lines changed: 525 additions & 0 deletions
Large diffs are not rendered by default.

data/data_normalized/master.json

Lines changed: 4915 additions & 4319 deletions
Large diffs are not rendered by default.

data/field_types.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
1+
import json
2+
import pprint
3+
from collections import defaultdict
4+
5+
6+
def get_field_types(data: list) -> defaultdict:
    """Collect the set of Python types observed for every field.

    ``data`` is iterated as a sequence of per-bird mappings (the loaded
    JSON is passed in directly by the script entry point).  When a field's
    value is itself a dict, its inner keys are recorded instead of the
    outer field name.

    The resulting mapping is pretty-printed and also returned so callers
    can inspect it programmatically.

    Args:
        data: Sequence of dicts, one per bird.

    Returns:
        Mapping of field name to the set of value types seen for it.
    """
    fields = defaultdict(set)
    for bird in data:
        for field, val in bird.items():
            if isinstance(val, dict):
                # Nested mapping: record each inner key under its own name.
                for key, nested_val in val.items():
                    fields[key].add(type(nested_val))
            else:
                # Print any record whose "Wetland" value is an int so it
                # can be inspected by hand.
                # NOTE(review): looks like a data-debugging aid - confirm
                # it is still wanted.  bool is a subclass of int, so
                # True/False values match as well.
                if field == "Wetland" and isinstance(val, int):
                    print(bird)
                fields[field].add(type(val))
    pprint.pprint(fields)
    return fields
20+
21+
22+
if __name__ == "__main__":
    # The path is resolved against the current working directory.
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open("data/master.json", "r", encoding="utf-8") as file:
        get_field_types(json.load(file))

data/format_json.py

Lines changed: 99 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -18,21 +18,21 @@
1818

1919
def to_snake_case(s: str) -> str:
    """Convert string to snake_case.

    Handles special cases:
    - "/ (food cost)" → "slash_food_cost"
    - "* (food cost)" → "star_food_cost"
    - Other special characters are removed
    - Runs of spaces and hyphens → a single underscore

    Args:
        s: The original field name.

    Returns:
        The lower-cased snake_case name with no leading or trailing
        underscores.
    """
    # Spell out a free-standing "/" or "*" before punctuation is stripped.
    # Otherwise "/ (food cost)" and "* (food cost)" both collapse to
    # "food_cost" and overwrite each other in the normalized output.
    symbol_names = {"/": "slash", "*": "star"}
    s = re.sub(r"(?<!\S)[/*](?!\S)", lambda m: symbol_names[m.group()], s)
    # Remove special chars except spaces and hyphens
    s = re.sub(r"[^\w\s-]", "", s)
    # Convert spaces and hyphens to underscores
    s = re.sub(r"[\s-]+", "_", s)
    # Lowercase, then remove leading/trailing underscores
    return s.lower().strip("_")
3737

3838

@@ -44,64 +44,114 @@ def normalize_boolean_value(value: Any) -> bool:
4444
return True
4545
if value == "X":
4646
return True
47+
# BLUE DUCK??
48+
if value == 1:
49+
return True
4750
if value is None:
4851
return False
4952
return bool(value)
5053

5154

55+
def normalize_float_value(value) -> float:
    """Normalize a numeric field to a float.

    ``None`` becomes ``0.0``.  Numbers and numeric strings are converted
    with ``float()`` so every normalized value has the same type.

    Args:
        value: Raw field value.

    Returns:
        The value as a float.  A value that cannot be converted is returned
        unchanged (the previous behaviour) rather than raising.
    """
    if value is None:
        return 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        # Best-effort: leave unexpected data untouched instead of aborting
        # the whole normalization run.
        return value
59+
60+
5261
def normalize_master_json(data: List[Dict]) -> List[Dict]:
    """Return a normalized copy of the bird card records from master.json.

    Every key is renamed with ``to_snake_case``.  Values are converted
    according to the *original* key:

    - keys in the boolean set go through ``normalize_boolean_value``
    - keys in the float set go through ``normalize_float_value``
    - a ``None`` "Color" becomes ``"white"``
    - everything else is copied unchanged
    """
    # Fields that can have "X" values and are stored as booleans.
    flag_columns = {
        "Forest", "Grassland", "Wetland",
        "Bonus card",
        "Predator", "Flocking",
        "North America", "Central America", "South America",
        "Europe", "Asia", "Africa", "Oceania",
        "Fan Art Pack?",
        "Anatomist", "Cartographer", "Historian", "Photographer",
        "Backyard Birder", "Bird Bander", "Bird Counter", "Bird Feeder",
        "Diet Specialist", "Enclosure Builder",
        "Endangered Species Protector",
        "Falconer", "Fishery Manager", "Food Web Expert", "Forester",
        "Large Bird Specialist", "Nest Box Builder", "Omnivore Expert",
        "Passerine Specialist", "Platform Builder", "Prairie Manager",
        "Rodentologist", "Small Clutch Specialist", "Viticulturalist",
        "Wetland Scientist", "Wildlife Gardener",
        "/ (food cost)", "* (food cost)",
        "Swift Start", "Automa ban",
    }
    # Fields normalized with normalize_float_value.
    numeric_columns = {
        "Invertebrate", "Seed", "Fish", "Fruit", "Rodent", "Nectar",
        "Wild (food)", "Total food cost",
    }

    def _convert(column, raw):
        # Rules are checked in the same order as the original if/elif chain.
        if column in flag_columns:
            return normalize_boolean_value(raw)
        if column in numeric_columns:
            return normalize_float_value(raw)
        if column == "Color" and raw is None:
            return "white"
        return raw

    return [
        {to_snake_case(column): _convert(column, raw) for column, raw in record.items()}
        for record in data
    ]
93143

94144

95145
def normalize_bonus_json(data: List[Dict]) -> List[Dict]:
96146
"""Normalize bonus card data from bonus.json."""
97147
normalized = []
98-
148+
99149
for item in data:
100150
normalized_item = {}
101-
151+
102152
for key, value in item.items():
103153
new_key = to_snake_case(key)
104-
154+
105155
# Handle special field name mappings
106156
if key == "Bonus card":
107157
new_key = "bonus_card"
@@ -110,32 +160,32 @@ def normalize_bonus_json(data: List[Dict]) -> List[Dict]:
110160
elif key == "%":
111161
new_key = "percentage"
112162
# Convert "-" to null
113-
if value == "-" or value== "variable":
163+
if value == "-" or value == "variable":
114164
value = None
115165

116166
elif key == "VP Average":
117167
new_key = "vp_average"
118168
elif key == "Explanatory text":
119169
new_key = "explanatory_text"
120-
170+
121171
# Handle boolean field
122172
if key == "Automa":
123173
normalized_item[new_key] = normalize_boolean_value(value)
124174
else:
125175
normalized_item[new_key] = value
126-
176+
127177
normalized.append(normalized_item)
128-
178+
129179
return normalized
130180

131181

132182
def normalize_goals_json(data: List[Dict]) -> List[Dict]:
133183
"""Normalize goals data from goals.json."""
134184
normalized = []
135-
185+
136186
for item in data:
137187
normalized_item = {}
138-
188+
139189
for key, value in item.items():
140190
# Handle numeric score keys
141191

@@ -151,15 +201,13 @@ def normalize_goals_json(data: List[Dict]) -> List[Dict]:
151201
else:
152202
new_key = key # Keep id as-is
153203
normalized_item[new_key] = value
154-
155-
204+
156205
normalized.append(normalized_item)
157-
206+
158207
return normalized
159208

160209

161210
def normalize_general_json(data: Dict) -> Dict:
    """Return the general.json payload untouched (no normalization is applied)."""
    return data
164212

165213

@@ -169,45 +217,44 @@ def main():
169217
script_dir = Path(__file__).parent
170218
data_dir = script_dir
171219
output_dir = script_dir / "data_normalized"
172-
220+
173221
# Create output directory if it doesn't exist
174222
output_dir.mkdir(exist_ok=True)
175-
223+
176224
# Define file processors
177225
processors = {
178226
"master.json": normalize_master_json,
179227
"bonus.json": normalize_bonus_json,
180228
"goals.json": normalize_goals_json,
181229
"general.json": normalize_general_json,
182230
}
183-
231+
184232
# Process each file
185233
for filename, processor in processors.items():
186234
input_path = data_dir / filename
187-
235+
188236
if not input_path.exists():
189237
print(f"Warning: {filename} not found, skipping...")
190238
continue
191-
239+
192240
print(f"Processing {filename}...")
193-
241+
194242
# Read input file
195-
with open(input_path, 'r', encoding='utf-8') as f:
243+
with open(input_path, "r", encoding="utf-8") as f:
196244
data = json.load(f)
197-
245+
198246
# Normalize data
199247
normalized_data = processor(data)
200-
248+
201249
# Write output file
202250
output_path = output_dir / filename
203-
with open(output_path, 'w', encoding='utf-8') as f:
251+
with open(output_path, "w", encoding="utf-8") as f:
204252
json.dump(normalized_data, f, indent=2, ensure_ascii=False)
205-
253+
206254
print(f" → Written to {output_path}")
207-
255+
208256
print(f"\nNormalization complete! Output files written to {output_dir}")
209257

210258

211259
if __name__ == "__main__":
212260
main()
213-

data/predator_probabilities.json

Whitespace-only changes.

0 commit comments

Comments
 (0)