Skip to content

Commit 8dc6b28

Browse files
committed
update extracting script
1 parent 6093fbf commit 8dc6b28

File tree

1 file changed

+167
-15
lines changed

1 file changed

+167
-15
lines changed

_scripts/extract_solutions.py

Lines changed: 167 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import argparse
2+
import html
3+
import os
24
import re
35
import time
6+
import urllib.request
47

58
import yaml
69
from selenium import webdriver
@@ -15,6 +18,61 @@ def get_competition_slug(link):
1518
return None
1619

1720

21+
def format_yaml_value(value):
    """Format a value as a YAML double-quoted scalar.

    Args:
        value: Any object; it is converted with ``str()`` before quoting.

    Returns:
        An empty string when ``value`` is ``None`` (so the key is written
        with no value), otherwise the text wrapped in double quotes with
        backslashes, double quotes and common control characters escaped.
    """
    if value is None:
        return ""

    text = str(value)
    # Backslash must be handled first, otherwise the backslashes introduced
    # by the later replacements would themselves be doubled.
    for raw, escaped in (
        ("\\", "\\\\"),
        ('"', '\\"'),
        ("\n", "\\n"),
        ("\r", "\\r"),
        ("\t", "\\t"),
    ):
        text = text.replace(raw, escaped)
    return f'"{text}"'
26+
27+
28+
def format_competition_yaml(comp, indent=" "):
    """Format a competition entry with custom YAML style (unquoted keys, quoted values).

    Args:
        comp: Mapping describing one competition. Only the keys handled
            below are emitted, in a fixed order; ``solutions`` is expected
            to be a list of mappings (or empty / ``None``).
        indent: Whitespace written before the ``- `` list marker of the
            entry.

    Returns:
        The entry as newline-joined text, without a trailing newline.
    """
    list_marker = "- "
    # Keys that continue a "- " item have to start in the same column as the
    # first key after the marker, i.e. be padded by the marker's width.
    # A narrower pad produces misaligned (invalid) YAML.
    continuation = " " * len(list_marker)
    lines = []

    # "number" opens the list item; every other field continues it.
    if "number" in comp:
        lines.append(
            f"{indent}{list_marker}number: {format_yaml_value(comp['number'])}"
        )
        field_indent = indent + continuation
    else:
        field_indent = indent

    # Simple scalar fields, emitted in this fixed order when present.
    simple_keys = (
        "title",
        "desc",
        "kind",
        "prize",
        "team",
        "metric",
        "link",
        "image",
        "year",
        "isHot",
        "done",
    )
    lines.extend(
        f"{field_indent}{key}: {format_yaml_value(comp[key])}"
        for key in simple_keys
        if key in comp
    )

    # The "solutions:" key is written even when the list is empty so the
    # entry keeps a slot that can be filled in later.
    if "solutions" in comp:
        lines.append(f"{field_indent}solutions:")
        item_indent = field_indent + continuation
        for sol in comp["solutions"] or ():
            for position, key in enumerate(("rank", "link", "kind")):
                prefix = list_marker if position == 0 else continuation
                lines.append(
                    f"{item_indent}{prefix}{key}: "
                    f"{format_yaml_value(sol.get(key, ''))}"
                )

    return "\n".join(lines)
74+
75+
1876
def create_driver():
1977
"""Create a headless Chrome driver."""
2078
options = Options()
@@ -27,6 +85,77 @@ def create_driver():
2785
return driver
2886

2987

88+
def build_competition_image_mapping(driver, load_wait=5, scroll_wait=2, scroll_count=3):
    """
    Visit the competitions listing page and build a mapping of slug -> (image_url, comp_id).

    Args:
        driver: Browser driver object; only ``get``, ``execute_script`` and
            ``page_source`` are used.
        load_wait: Seconds to sleep after opening the listing page.
        scroll_wait: Seconds to sleep after each scroll to the page bottom.
        scroll_count: How many times to scroll to the bottom so that more
            competitions get loaded.

    Returns:
        Dict mapping a competition slug to ``(image_url, comp_id)``, where
        ``comp_id`` is the numeric path segment captured from the logo URL.
        Only the first image found for each slug is kept.
    """
    url = "https://www.kaggle.com/competitions?listOption=completed"
    print(f" Building image mapping from: {url}")
    driver.get(url)
    time.sleep(load_wait)

    # Scroll to load more competitions
    for _ in range(scroll_count):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_wait)

    page_source = driver.page_source

    # Compiled once instead of on every chunk.
    slug_pattern = re.compile(r'([a-z0-9-]+)"')
    image_pattern = re.compile(
        r'src="(https://storage\.googleapis\.com/kaggle-competitions/kaggle/(\d+)/logos/[^"]+)"'
    )

    # Each chunk starts right after a competition link, so the slug is at its
    # very beginning and the logo (if any) is searched in the text following it.
    chunks = re.split(r'href="/competitions/', page_source)
    mapping = {}

    for chunk in chunks[1:]:  # Skip first chunk (before first competition)
        slug_match = slug_pattern.match(chunk)
        if not slug_match:
            continue
        slug = slug_match.group(1)
        if slug in mapping:  # Keep first match for each slug
            continue
        # Only look at nearby content (within same card/container)
        img_match = image_pattern.search(chunk[:3000])
        if img_match:
            # The URL comes out of an HTML attribute, so entities such as
            # "&amp;" have to be decoded before it can be requested.
            mapping[slug] = (html.unescape(img_match.group(1)), img_match.group(2))

    print(f" Found {len(mapping)} competition image mappings")
    return mapping
125+
126+
127+
def download_competition_image(competition_slug, output_dir, image_mapping):
    """
    Download the image for a single competition using the pre-built mapping.

    Args:
        competition_slug: Key to look up in ``image_mapping``.
        output_dir: Existing directory the image is written into.
        image_mapping: Dict of slug -> ``(image_url, comp_id)``.

    Returns:
        The filename (``<comp_id>.<ext>``) if the image is already present
        or was downloaded successfully, None otherwise.
    """
    from urllib.parse import urlsplit

    if competition_slug not in image_mapping:
        print(f" No image mapping found for: {competition_slug}")
        return None

    img_url, comp_id = image_mapping[competition_slug]

    # Take the extension from the URL path only, so that dots in the host,
    # in a directory name or in the query string cannot be mistaken for it.
    suffix = os.path.splitext(urlsplit(img_url).path)[1].lower().lstrip(".")
    ext = suffix if suffix in ("png", "jpg", "jpeg", "gif", "svg", "webp") else "png"

    filename = f"{comp_id}.{ext}"
    filepath = os.path.join(output_dir, filename)

    if os.path.exists(filepath):
        print(f" Image exists: {filename}")
        return filename

    # Download next to the target and rename afterwards: an interrupted
    # transfer must not leave a truncated file under the final name, because
    # the existence check above would then accept it on every later run.
    partial_path = filepath + ".part"
    print(f" Downloading image: {filename}")
    try:
        urllib.request.urlretrieve(img_url, partial_path)
        os.replace(partial_path, filepath)
    except Exception as e:
        # Best effort by design: one failed image must not abort the batch.
        print(f" Error downloading image: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            # Nothing was written, or it cannot be removed; either way the
            # final filename was never created.
            pass
        return None

    return filename
157+
158+
30159
def get_kaggle_solutions(driver, competition_slug):
31160
"""
32161
Fetch leaderboard page with Selenium and extract solution links.
@@ -101,9 +230,10 @@ def get_kaggle_solutions(driver, competition_slug):
101230
return solutions
102231

103232

104-
def process_yaml_file(input_path, output_path=None):
233+
def process_yaml_file(input_path, output_path=None, image_dir=None):
105234
"""
106235
Process a YAML file containing Kaggle competitions and fill in solutions.
236+
Optionally download competition images if image_dir is provided.
107237
"""
108238
with open(input_path, "r", encoding="utf-8") as f:
109239
competitions = yaml.safe_load(f)
@@ -114,10 +244,19 @@ def process_yaml_file(input_path, output_path=None):
114244

115245
print(f"Found {len(competitions)} competitions to process.\n")
116246

247+
if image_dir:
248+
os.makedirs(image_dir, exist_ok=True)
249+
print(f"Images will be saved to: {image_dir}\n")
250+
117251
# Create driver once for all competitions
118252
print("Starting browser...")
119253
driver = create_driver()
120254

255+
# Build image mapping if we need to download images
256+
image_mapping = {}
257+
if image_dir:
258+
image_mapping = build_competition_image_mapping(driver)
259+
121260
try:
122261
for i, comp in enumerate(competitions):
123262
title = comp.get("title", "Unknown")
@@ -144,26 +283,27 @@ def process_yaml_file(input_path, output_path=None):
144283
if solutions:
145284
comp["solutions"] = solutions
146285
comp["done"] = "true"
147-
print(f" Found {len(solutions)} solutions.\n")
286+
print(f" Found {len(solutions)} solutions.")
148287
else:
149288
comp["solutions"] = []
150-
print(" No solutions found.\n")
289+
print(" No solutions found.")
290+
291+
# Download image if image_dir is provided
292+
if image_dir:
293+
download_competition_image(competition_slug, image_dir, image_mapping)
294+
295+
print() # Empty line between competitions
151296

152297
finally:
153298
driver.quit()
154299
print("Browser closed.")
155300

156-
output_yaml = yaml.dump(
157-
competitions,
158-
sort_keys=False,
159-
allow_unicode=True,
160-
default_flow_style=False,
161-
default_style='"',
162-
width=1000,
163-
)
301+
# Format output using custom YAML formatter
302+
output_lines = []
303+
for comp in competitions:
304+
output_lines.append(format_competition_yaml(comp))
164305

165-
# Add 2-space indentation for easy copy-paste into parent YAML
166-
indented_yaml = "\n".join(" " + line for line in output_yaml.splitlines())
306+
indented_yaml = "\n".join(output_lines)
167307

168308
if output_path:
169309
with open(output_path, "w", encoding="utf-8") as f:
@@ -180,12 +320,24 @@ def process_yaml_file(input_path, output_path=None):
180320
)
181321
parser.add_argument(
182322
"input_file",
323+
nargs="?",
183324
help="Path to the input YAML file (e.g., kaggle-2026-01-01.txt)",
184325
)
185326
parser.add_argument(
186-
"-o",
187327
"--output",
328+
metavar="OUTPUT",
188329
help="Path to save the output file. If not specified, prints to stdout.",
189330
)
331+
parser.add_argument(
332+
"--images",
333+
metavar="DIR",
334+
help="Extract competition images to the specified directory (e.g., ~/Desktop/images)",
335+
)
190336
args = parser.parse_args()
191-
process_yaml_file(args.input_file, args.output)
337+
338+
if args.input_file:
339+
output_path = os.path.expanduser(args.output) if args.output else None
340+
image_dir = os.path.expanduser(args.images) if args.images else None
341+
process_yaml_file(args.input_file, output_path, image_dir)
342+
else:
343+
parser.print_help()

0 commit comments

Comments
 (0)