# Serializes console output across worker threads and also guards the
# shared match bookkeeping below.
print_lock = Lock()

# Running tally of matches found; worker threads update it while holding
# print_lock (see download_and_check_file).
matches_found = dict(count=0, files=[])
def get_file_list(figshare_article_id):
    """Fetch the list of ALL files from the Figshare API (handles pagination).

    Args:
        figshare_article_id: Figshare article id used to build the
            /v2/articles/<id>/files endpoint URL.

    Returns:
        list[dict]: one metadata dict per file, concatenated across all pages.

    Exits the process with status 1 on any network/HTTP error.
    """
    base_url = f"https://api.figshare.com/v2/articles/{figshare_article_id}/files"
    all_files = []
    page = 1
    page_size = 100  # Max allowed by Figshare API

    try:
        while True:
            params = {"page": page, "page_size": page_size}
            # timeout added: without it a stalled connection hangs the whole
            # run; 60s matches the timeout used in download_and_check_file.
            response = requests.get(base_url, params=params, timeout=60)
            response.raise_for_status()
            files = response.json()

            if not files:
                break

            all_files.extend(files)
            print(f"  Fetched page {page}: {len(files)} files (total so far: {len(all_files)})")

            if len(files) < page_size:
                # Short page means this was the last page.
                break

            page += 1

        return all_files
    except requests.exceptions.RequestException as e:
        print(f"Error fetching file list: {e}")
        sys.exit(1)
5250
51+
5352def download_and_check_file (file_info , search_string , download_dir , idx , total ):
5453 """
5554 Download a file, check for search string while streaming, and handle accordingly.
5655 Returns (found, file_name) tuple.
5756 """
58- file_name = file_info [' name' ]
59- file_url = file_info [' download_url' ]
57+ file_name = file_info [" name" ]
58+ file_url = file_info [" download_url" ]
6059 file_path = os .path .join (download_dir , file_name )
61-
60+
6261 with print_lock :
63- print (f"[{ idx } /{ total } ] Downloading: { file_name } ({ file_info ['size' ] / (1024 * 1024 ):.2f} MB)..." )
64-
62+ print (
63+ f"[{ idx } /{ total } ] Downloading: { file_name } ({ file_info ['size' ] / (1024 * 1024 ):.2f} MB)..."
64+ )
65+
6566 try :
6667 # Download file with streaming
6768 response = requests .get (file_url , stream = True , timeout = 60 )
6869 response .raise_for_status ()
69-
70+
7071 # Search while downloading (more efficient for large files)
7172 found = False
7273 chunk_size = 8192
73- buffer = b''
74- search_bytes = search_string .encode (' utf-8' )
75-
76- with open (file_path , 'wb' ) as f :
74+ buffer = b""
75+ search_bytes = search_string .encode (" utf-8" )
76+
77+ with open (file_path , "wb" ) as f :
7778 for chunk in response .iter_content (chunk_size = chunk_size ):
7879 f .write (chunk )
79-
80+
8081 # Search in overlapping buffer to catch strings across chunk boundaries
8182 buffer += chunk
8283 if search_bytes in buffer :
8384 found = True
8485 # Continue downloading but we know we found it
85-
86+
8687 # Keep last part of buffer for overlap check
8788 if len (buffer ) > len (search_bytes ) * 2 :
88- buffer = buffer [- (len (search_bytes ) * 2 ):]
89-
89+ buffer = buffer [- (len (search_bytes ) * 2 ) :]
90+
9091 if found :
9192 with print_lock :
9293 print (f" ✓ FOUND '{ search_string } ' in { file_name } !" )
9394 print (f" File saved at: { file_path } " )
94- matches_found [' count' ] += 1
95- matches_found [' files' ].append (file_name )
95+ matches_found [" count" ] += 1
96+ matches_found [" files" ].append (file_name )
9697 return (True , file_name )
9798 else :
9899 with print_lock :
99100 print (f" String not found in { file_name } . Deleting..." )
100101 os .remove (file_path )
101102 return (False , None )
102-
103+
103104 except requests .exceptions .RequestException as e :
104105 with print_lock :
105106 print (f" Error downloading { file_name } : { e } " )
@@ -113,43 +114,44 @@ def download_and_check_file(file_info, search_string, download_dir, idx, total):
113114 os .remove (file_path )
114115 return (False , None )
115116
117+
116118def main ():
117119 FIGSHARE_ARTICLE_ID = "7376003" # english dataset, change for other languages
118120 SEARCH_STRING = "2052702.7345.7345"
119121 DOWNLOAD_DIR = "./wikiconv_downloads"
120122 MAX_WORKERS = 10 # Adjust based on your server's bandwidth and CPU
121-
123+
122124 print ("=" * 60 )
123125 print ("WikiConv File Finder (Parallel - Keep All Matches)" )
124126 print ("=" * 60 )
125127 print (f"Search string: '{ SEARCH_STRING } '" )
126128 print (f"Download directory: { DOWNLOAD_DIR } " )
127129 print (f"Parallel workers: { MAX_WORKERS } " )
128130 print ()
129-
131+
130132 # Create download directory
131133 Path (DOWNLOAD_DIR ).mkdir (parents = True , exist_ok = True )
132-
134+
133135 # Get file list
134136 print ("Fetching file list from Figshare..." )
135137 files = get_file_list (FIGSHARE_ARTICLE_ID )
136-
138+
137139 if not files :
138140 print ("No files found!" )
139141 sys .exit (1 )
140-
142+
141143 print (f"Found { len (files )} files." )
142144 print ()
143-
145+
144146 start_time = time .time ()
145-
147+
146148 # Process files in parallel
147149 START_INDEX = 1 # 1-based index, meaning skip first 88
148150 if START_INDEX > len (files ):
149151 print (f"Start index ({ START_INDEX } ) is beyond available files ({ len (files )} ). Exiting." )
150152 sys .exit (1 )
151153
152- files_to_process = files [START_INDEX - 1 :] # slice from the 89th file onward
154+ files_to_process = files [START_INDEX - 1 :] # slice from the 89th file onward
153155 total_files = len (files_to_process )
154156 print (f"Processing files { START_INDEX } –{ len (files )} ({ total_files } total)...\n " )
155157
@@ -162,35 +164,38 @@ def main():
162164 file_info ,
163165 SEARCH_STRING ,
164166 DOWNLOAD_DIR ,
165- idx + START_INDEX - 1 ,
166- len (files )
167+ idx + START_INDEX - 1 ,
168+ len (files ),
167169 ): file_info
168170 for idx , file_info in enumerate (files_to_process , 1 )
169171 }
170-
172+
171173 # process completed tasks
172174 for future in as_completed (future_to_file ):
173175 found , file_name = future .result ()
174176 completed += 1
175-
177+
176178 if completed % 50 == 0 :
177179 with print_lock :
178- print (f"\n --- Progress: { completed } /{ len (files )} files processed, { matches_found ['count' ]} matches found ---\n " )
179-
180+ print (
181+ f"\n --- Progress: { completed } /{ len (files )} files processed, { matches_found ['count' ]} matches found ---\n "
182+ )
183+
180184 elapsed = time .time () - start_time
181185 print ()
182186 print ("=" * 60 )
183187 print (f"COMPLETED: Processed all { len (files )} files." )
184188 print (f"Matches found: { matches_found ['count' ]} " )
185- if matches_found [' files' ]:
189+ if matches_found [" files" ]:
186190 print (f"\n Files containing '{ SEARCH_STRING } ':" )
187- for match_file in matches_found [' files' ]:
191+ for match_file in matches_found [" files" ]:
188192 print (f" - { match_file } " )
189193 else :
190194 print (f"\n Search string '{ SEARCH_STRING } ' was NOT found in any file." )
191195 print (f"\n Time elapsed: { elapsed :.2f} seconds" )
192196 print (f"Average: { elapsed / len (files ):.2f} seconds per file" )
193197 print ("=" * 60 )
194198
199+
# Script entry point: run the parallel download-and-search when executed
# directly (not when imported as a module).
if __name__ == "__main__":
    main()