-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_data.py
More file actions
105 lines (81 loc) · 3.37 KB
/
Copy pathextract_data.py
File metadata and controls
105 lines (81 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import io
import json
import re
import os
def extract_ial_data(file_path):
    """Extract IAL paper records from the CSV text embedded in a source file.

    The file is read as text and searched for the first triple-quoted
    ``CSV_DATA = ...`` assignment. Its contents are parsed as CSV and each
    row is enriched with three columns derived from ``Title``:

    * ``Year``    - the first run of four digits in the title (a string),
    * ``Session`` - the first English month name in the title,
    * ``Type``    - ``'qp'``, ``'ms'``, ``'er'`` or ``'other'``.

    Rows whose extracted year is ``'2025'`` are dropped.

    Args:
        file_path: Path of the text file containing the ``CSV_DATA`` literal.

    Returns:
        A list of dicts with the keys ``Unit_Code``, ``Title``, ``URL``,
        ``Year``, ``Session`` and ``Type``. Missing values are ``None`` so
        the result serializes to valid JSON. An empty list is returned when
        the file contains no ``CSV_DATA`` block.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Be explicit about the encoding instead of relying on the locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Find the CSV_DATA string
    match = re.search(r'CSV_DATA = """(.*?)"""', content, re.DOTALL)
    if not match:
        return []

    df = pd.read_csv(io.StringIO(match.group(1).strip()))

    # Extract the year from the title and filter out 2025 papers.
    df['Year'] = df['Title'].str.extract(r'(\d{4})', expand=False)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df[df['Year'] != '2025'].copy()

    # Pre-parse Session and Type to save client work.
    df['Session'] = df['Title'].str.extract(
        r'(January|February|March|April|May|June|July|August|September|October|November|December)',
        expand=False,
    )

    def get_type(title):
        # A blank title is read by pandas as NaN (a float), which has no
        # .lower(); classify it as 'other' instead of raising.
        if pd.isna(title):
            return 'other'
        t = str(title).lower()
        if 'question paper' in t:
            return 'qp'
        if 'mark scheme' in t:
            return 'ms'
        if 'examiner report' in t:
            return 'er'
        return 'other'

    df['Type'] = df['Title'].apply(get_type)

    # Select minimal columns
    df = df[['Unit_Code', 'Title', 'URL', 'Year', 'Session', 'Type']]

    # Replace NaN with None for valid JSON. Casting to object first keeps
    # None from being coerced back to NaN in non-object columns.
    df = df.astype(object)
    df = df.where(pd.notnull(df), None)
    return df.to_dict(orient='records')
def extract_cie_data(file_path):
    """Extract CIE paper records from a CSV file.

    Square brackets are stripped from ``Category`` and ``Subject``; only rows
    whose category is ``'AS and A Level'``, ``'IGCSE'`` or ``'O Level'`` and
    whose ``Year`` is not ``2025`` are kept. The ``Extracted_*`` / ``Full_URL``
    columns are renamed to short names and ``Type`` is collapsed to one of
    ``'qp'``, ``'ms'``, ``'er'``, ``'gt'`` or ``'other'``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of dicts with the keys ``Category``, ``Subject``, ``Year``,
        ``Session``, ``Type``, ``Component``, ``URL`` and ``Unit``. Missing
        values are ``None`` so the result serializes to valid JSON. An empty
        list is returned when *file_path* does not exist.
    """
    if not os.path.exists(file_path):
        return []

    df = pd.read_csv(file_path)

    # Filter categories and year in one pass.
    df['Category'] = df['Category'].astype(str).str.replace(r'[\[\]]', '', regex=True)
    keep = df['Category'].isin(['AS and A Level', 'IGCSE', 'O Level']) & (df['Year'] != 2025)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df[keep].copy()

    # Clean Subject. astype(str) would turn a missing subject into the
    # literal text 'nan', so missing entries are restored to NaN afterwards
    # and become None in the output like every other missing value.
    subject = df['Subject']
    stripped = subject.astype(str).str.replace(r'[\[\]]', '', regex=True)
    df['Subject'] = stripped.where(subject.notna())

    # Rename for consistency/minification
    df = df.rename(columns={
        'Extracted_Session': 'Session',
        'Extracted_Type': 'Type',
        'Extracted_Component': 'Component',
        'Full_URL': 'URL',
        'Extracted_UnitCode': 'Unit',
    })

    # Simplify Type
    def simplify_type(t):
        if pd.isna(t):
            return 'other'
        t = str(t).lower()
        if 'question paper' in t:
            return 'qp'
        if 'mark scheme' in t:
            return 'ms'
        if 'examiner report' in t:
            return 'er'
        if 'grade thresholds' in t:
            return 'gt'
        return 'other'

    df['Type'] = df['Type'].apply(simplify_type)

    # Select columns
    df = df[['Category', 'Subject', 'Year', 'Session', 'Type', 'Component', 'URL', 'Unit']]

    # Replace NaN with None for valid JSON. Casting to object first keeps
    # None from being coerced back to NaN in numeric columns.
    df = df.astype(object)
    df = df.where(pd.notnull(df), None)
    return df.to_dict(orient='records')
def _write_records(records, destination):
    """Write *records* to *destination* as JSON indented by two spaces."""
    with open(destination, 'w') as handle:
        json.dump(records, handle, indent=2)


# IAL: parse the CSV embedded in the development script and export it.
# Nothing is written when the extraction yields no records.
ial_data = extract_ial_data('developmentfiles/main-ial.py')
if ial_data:
    _write_records(ial_data, 'public/ial_data.json')
    print(f"Extracted {len(ial_data)} IAL records to public/ial_data.json")

# CIE: same export for the standalone CSV file.
cie_data = extract_cie_data('developmentfiles/cie.csv')
if cie_data:
    _write_records(cie_data, 'public/cie_data.json')
    print(f"Extracted {len(cie_data)} CIE records to public/cie_data.json")