Skip to content

Commit 5bc4fc2

Browse files
committed
Add validation and defensive CSV reads to prevent pandas EmptyDataError in CI, and replace direct Image.LANCZOS access with a fallback
1 parent a481608 commit 5bc4fc2

1 file changed

Lines changed: 69 additions & 10 deletions

File tree

scripts/highlights-download-statistics.py

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,30 @@ def main():
2222
This is done in the github CI before the website is regenerated, after every modification on the main branch.
2323
"""
2424

25-
latest_file, previous_file = get_latest_two_csv_files(folder)
25+
try:
26+
latest_file, previous_file = get_latest_two_csv_files(folder)
27+
except FileNotFoundError as e:
28+
print(f"No valid CSV files found: {e}")
29+
return
30+
31+
# Load the CSV data with defensive handling
32+
try:
33+
current_week_data = pd.read_csv(os.path.join(folder, latest_file))
34+
except pd.errors.EmptyDataError:
35+
print(f"Latest CSV '{latest_file}' is empty or has no columns. Aborting.")
36+
return
37+
except Exception as e:
38+
print(f"Failed to read latest CSV '{latest_file}': {e}")
39+
return
2640

27-
# Load the CSV data
28-
current_week_data = pd.read_csv(os.path.join(folder, latest_file))
29-
previous_week_data = pd.read_csv(os.path.join(folder, previous_file))
41+
try:
42+
previous_week_data = pd.read_csv(os.path.join(folder, previous_file))
43+
except pd.errors.EmptyDataError:
44+
print(f"Previous CSV '{previous_file}' is empty or has no columns. Aborting.")
45+
return
46+
except Exception as e:
47+
print(f"Failed to read previous CSV '{previous_file}': {e}")
48+
return
3049

3150
# Merge data on the 'url' column to compare downloads
3251
merged_data = pd.merge(current_week_data, previous_week_data, on='url', suffixes=('_current', '_previous'))
@@ -58,13 +77,44 @@ def get_latest_two_csv_files(folder):
5877
Get the two most recent CSV files in the specified folder.
5978
"""
6079
# Get all CSV files in the folder
61-
csv_files = [f for f in os.listdir(folder) if f.endswith('.csv')]
80+
csv_candidates = [f for f in os.listdir(folder) if f.endswith('.csv')]
81+
82+
# Parse dates from filenames and ignore files that don't match the expected format
83+
dated_files = []
84+
for f in csv_candidates:
85+
try:
86+
dt = extract_date_from_filename(f)
87+
dated_files.append((f, dt))
88+
except Exception:
89+
continue
6290

6391
# Sort files by date (newest first)
64-
csv_files = sorted(csv_files, key=lambda f: extract_date_from_filename(f), reverse=True)
92+
dated_files.sort(key=lambda t: t[1], reverse=True)
93+
94+
# Collect two valid CSV files (non-empty and parseable)
95+
valid_files = []
96+
for fname, _ in dated_files:
97+
fullpath = os.path.join(folder, fname)
98+
try:
99+
if os.path.getsize(fullpath) == 0:
100+
continue
101+
except OSError:
102+
continue
103+
104+
# Quick parse test: try to read one row to ensure there are columns
105+
try:
106+
pd.read_csv(fullpath, nrows=1)
107+
except Exception:
108+
continue
109+
110+
valid_files.append(fname)
111+
if len(valid_files) >= 2:
112+
break
113+
114+
if len(valid_files) < 2:
115+
raise FileNotFoundError(f"Could not find two valid CSV files in '{folder}'. Found: {valid_files}")
65116

66-
# Return the two most recent files
67-
return csv_files[0], csv_files[1]
117+
return valid_files[0], valid_files[1]
68118

69119
def download_first_pdf_file_from_zenodo(folder, record_id):
70120
"""
@@ -100,7 +150,7 @@ def download_first_pdf_file_from_zenodo(folder, record_id):
100150
pdf = pypdfium2.PdfDocument(file_content)
101151
page = pdf[0]
102152
pil_image = page.render(
103-
scale=2.0,
153+
scale=2,
104154
rotation=0
105155
).to_pil()
106156

@@ -132,7 +182,16 @@ def resize_image(image, height):
132182
"""
133183
aspect_ratio = image.width / image.height
134184
new_width = int(aspect_ratio * height)
135-
return image.resize((new_width, height), Image.LANCZOS)
185+
# Determine resampling method safely to avoid static type-checker attribute errors
186+
resample_method = None
187+
# Prefer the Resampling enum when available (Pillow >= 9.1)
188+
resampling_enum = getattr(Image, "Resampling", None)
189+
if resampling_enum is not None:
190+
resample_method = getattr(resampling_enum, "LANCZOS", None)
191+
# Fallback to legacy attribute via getattr to avoid Pylance attribute warnings
192+
if resample_method is None:
193+
resample_method = getattr(Image, "LANCZOS", getattr(Image, "BICUBIC", None))
194+
return image.resize((new_width, height), resample_method)
136195

137196
# Define the format of your PNG file
138197
def get_latest_png_filename(id):

0 commit comments

Comments
 (0)