11import argparse
2+ import html
3+ import os
24import re
35import time
6+ import urllib .request
47
58import yaml
69from selenium import webdriver
@@ -15,6 +18,61 @@ def get_competition_slug(link):
1518 return None
1619
1720
def format_yaml_value(value):
    """Render *value* as a YAML double-quoted scalar.

    Args:
        value: Any object; it is converted with ``str()`` before quoting.
            ``None`` is rendered as an empty string (no quotes), which YAML
            reads as a null value.

    Returns:
        The quoted text, e.g. ``"abc"``, or ``""`` (empty) for ``None``.
    """
    if value is None:
        return ""

    text = str(value)
    # Backslash must be escaped first so the escapes added below are not
    # themselves doubled.
    text = text.replace("\\", "\\\\").replace('"', '\\"')
    # Keep the scalar on a single line; these are the YAML double-quoted
    # escape sequences for the common control characters.
    text = text.replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
    return f'"{text}"'
26+
27+
def format_competition_yaml(comp, indent="  "):
    """Format one competition entry as a YAML list item.

    Keys are written unquoted and values are rendered through
    ``format_yaml_value``. The first emitted field carries the ``- `` list
    marker; every following field is aligned two columns further in so that
    all keys of the entry sit in the same column.

    Args:
        comp: Mapping describing one competition. Only the known simple
            fields and ``solutions`` are written; other keys are ignored.
        indent: Whitespace placed before the ``- `` list marker, so the
            result can be pasted under a parent YAML key.

    Returns:
        The entry as a newline-joined string (no trailing newline).
    """
    # Fixed output order; "number" comes first so it normally carries the
    # list marker.
    field_order = (
        "number",
        "title",
        "desc",
        "kind",
        "prize",
        "team",
        "metric",
        "link",
        "image",
        "year",
        "isHot",
        "done",
    )
    marker = f"{indent}- "
    continuation = indent + "  "  # same width as the marker
    lines = []

    def emit(text):
        # Only the very first line of the entry opens the list item.
        lines.append((continuation if lines else marker) + text)

    for key in field_order:
        if key in comp:
            emit(f"{key}: {format_yaml_value(comp[key])}")

    if "solutions" in comp:
        # An empty list still produces a bare "solutions:" key.
        emit("solutions:")
        for sol in comp["solutions"]:
            lines.append(
                f"{continuation}  - rank: {format_yaml_value(sol.get('rank', ''))}"
            )
            lines.append(
                f"{continuation}    link: {format_yaml_value(sol.get('link', ''))}"
            )
            lines.append(
                f"{continuation}    kind: {format_yaml_value(sol.get('kind', ''))}"
            )

    return "\n".join(lines)
74+
75+
1876def create_driver ():
1977 """Create a headless Chrome driver."""
2078 options = Options ()
@@ -27,6 +85,77 @@ def create_driver():
2785 return driver
2886
2987
def _parse_competition_images(page_source, search_window=3000):
    """Extract ``slug -> (image_url, comp_id)`` pairs from listing-page HTML."""
    logo_pattern = (
        r'src="(https://storage\.googleapis\.com/kaggle-competitions/kaggle/'
        r'(\d+)/logos/[^"]+)"'
    )
    mapping = {}

    # Everything before the first competition link is irrelevant, hence [1:].
    for chunk in re.split(r'href="/competitions/', page_source)[1:]:
        slug_match = re.match(r'([a-z0-9-]+)"', chunk)
        if slug_match is None:
            continue
        slug = slug_match.group(1)
        if slug in mapping:
            # Keep the first image found for each slug.
            continue

        # Only look at the markup shortly after the link (same card).
        img_match = re.search(logo_pattern, chunk[:search_window])
        if img_match is None:
            continue

        # The URL comes from an HTML attribute, so entities such as &amp;
        # must be decoded before it can be requested.
        mapping[slug] = (html.unescape(img_match.group(1)), img_match.group(2))

    return mapping


def build_competition_image_mapping(
    driver, initial_wait=5, scroll_count=3, scroll_wait=2
):
    """Build a mapping of competition slug -> (image_url, comp_id).

    Loads the completed-competitions listing page in *driver*, scrolls to the
    bottom a few times so more entries are rendered, then parses the page
    source for competition links and the logo image that follows each one.

    Args:
        driver: Browser driver providing ``get``, ``execute_script`` and
            ``page_source``.
        initial_wait: Seconds to sleep after the page is requested.
        scroll_count: Number of scroll-to-bottom passes.
        scroll_wait: Seconds to sleep after each scroll.

    Returns:
        Dict mapping each slug to a ``(image_url, comp_id)`` tuple, where
        ``comp_id`` is the numeric path segment taken from the logo URL.
    """
    url = "https://www.kaggle.com/competitions?listOption=completed"
    print(f" Building image mapping from: {url}")
    driver.get(url)
    time.sleep(initial_wait)

    # Scroll to load more competitions
    for _ in range(scroll_count):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_wait)

    mapping = _parse_competition_images(driver.page_source)

    print(f" Found {len(mapping)} competition image mappings")
    return mapping
125+
126+
def download_competition_image(competition_slug, output_dir, image_mapping):
    """Download the image for one competition using the pre-built mapping.

    The file is stored as ``<comp_id>.<ext>`` inside *output_dir*. If a file
    with that name already exists, nothing is downloaded.

    Args:
        competition_slug: Key to look up in *image_mapping*.
        output_dir: Directory the image is written to.
        image_mapping: Dict of slug -> ``(image_url, comp_id)``.

    Returns:
        The image filename on success (including when it already existed),
        or ``None`` if the slug is unknown or the download failed.
    """
    if competition_slug not in image_mapping:
        print(f" No image mapping found for: {competition_slug}")
        return None

    img_url, comp_id = image_mapping[competition_slug]

    # Take the extension from the URL; fall back to png when none is found.
    ext_match = re.search(r"\.(png|jpg|jpeg|gif|svg|webp)", img_url.lower())
    ext = ext_match.group(1) if ext_match else "png"

    filename = f"{comp_id}.{ext}"
    filepath = os.path.join(output_dir, filename)

    if os.path.exists(filepath):
        print(f" Image exists: {filename}")
        return filename

    print(f" Downloading image: {filename}")
    # Download to a side file first: an interrupted transfer must not leave
    # a truncated file at the final path, where the exists-check above would
    # treat it as a finished download on the next run.
    part_path = filepath + ".part"
    try:
        urllib.request.urlretrieve(img_url, part_path)
        os.replace(part_path, filepath)
    except Exception as e:
        # Best-effort per-competition step: report and carry on with the rest.
        print(f" Error downloading image: {e}")
        try:
            os.remove(part_path)
        except OSError:
            pass  # nothing was written, or it cannot be cleaned up
        return None

    return filename
157+
158+
30159def get_kaggle_solutions (driver , competition_slug ):
31160 """
32161 Fetch leaderboard page with Selenium and extract solution links.
@@ -101,9 +230,10 @@ def get_kaggle_solutions(driver, competition_slug):
101230 return solutions
102231
103232
104- def process_yaml_file (input_path , output_path = None ):
233+ def process_yaml_file (input_path , output_path = None , image_dir = None ):
105234 """
106235 Process a YAML file containing Kaggle competitions and fill in solutions.
236+ Optionally download competition images if image_dir is provided.
107237 """
108238 with open (input_path , "r" , encoding = "utf-8" ) as f :
109239 competitions = yaml .safe_load (f )
@@ -114,10 +244,19 @@ def process_yaml_file(input_path, output_path=None):
114244
115245 print (f"Found { len (competitions )} competitions to process.\n " )
116246
247+ if image_dir :
248+ os .makedirs (image_dir , exist_ok = True )
249+ print (f"Images will be saved to: { image_dir } \n " )
250+
117251 # Create driver once for all competitions
118252 print ("Starting browser..." )
119253 driver = create_driver ()
120254
255+ # Build image mapping if we need to download images
256+ image_mapping = {}
257+ if image_dir :
258+ image_mapping = build_competition_image_mapping (driver )
259+
121260 try :
122261 for i , comp in enumerate (competitions ):
123262 title = comp .get ("title" , "Unknown" )
@@ -144,26 +283,27 @@ def process_yaml_file(input_path, output_path=None):
144283 if solutions :
145284 comp ["solutions" ] = solutions
146285 comp ["done" ] = "true"
147- print (f" Found { len (solutions )} solutions.\n " )
286+ print (f" Found { len (solutions )} solutions." )
148287 else :
149288 comp ["solutions" ] = []
150- print (" No solutions found.\n " )
289+ print (" No solutions found." )
290+
291+ # Download image if image_dir is provided
292+ if image_dir :
293+ download_competition_image (competition_slug , image_dir , image_mapping )
294+
295+ print () # Empty line between competitions
151296
152297 finally :
153298 driver .quit ()
154299 print ("Browser closed." )
155300
156- output_yaml = yaml .dump (
157- competitions ,
158- sort_keys = False ,
159- allow_unicode = True ,
160- default_flow_style = False ,
161- default_style = '"' ,
162- width = 1000 ,
163- )
301+ # Format output using custom YAML formatter
302+ output_lines = []
303+ for comp in competitions :
304+ output_lines .append (format_competition_yaml (comp ))
164305
165- # Add 2-space indentation for easy copy-paste into parent YAML
166- indented_yaml = "\n " .join (" " + line for line in output_yaml .splitlines ())
306+ indented_yaml = "\n " .join (output_lines )
167307
168308 if output_path :
169309 with open (output_path , "w" , encoding = "utf-8" ) as f :
@@ -180,12 +320,24 @@ def process_yaml_file(input_path, output_path=None):
180320 )
181321 parser .add_argument (
182322 "input_file" ,
323+ nargs = "?" ,
183324 help = "Path to the input YAML file (e.g., kaggle-2026-01-01.txt)" ,
184325 )
185326 parser .add_argument (
186- "-o" ,
187327 "--output" ,
328+ metavar = "OUTPUT" ,
188329 help = "Path to save the output file. If not specified, prints to stdout." ,
189330 )
331+ parser .add_argument (
332+ "--images" ,
333+ metavar = "DIR" ,
334+ help = "Extract competition images to the specified directory (e.g., ~/Desktop/images)" ,
335+ )
190336 args = parser .parse_args ()
191- process_yaml_file (args .input_file , args .output )
337+
338+ if args .input_file :
339+ output_path = os .path .expanduser (args .output ) if args .output else None
340+ image_dir = os .path .expanduser (args .images ) if args .images else None
341+ process_yaml_file (args .input_file , output_path , image_dir )
342+ else :
343+ parser .print_help ()
0 commit comments