@@ -184,6 +184,14 @@ def download(
184184 print ("Dataset already exists at {}" .format (dataset_path ))
185185 dataset_path = os .path .join (downloaded_paths [name ], name )
186186
187+ # Check if the corpus directory exists, otherwise use the parent directory
188+ if needs_download :
189+ potential_corpus_dir = os .path .join (os .path .dirname (dataset_path ), name )
190+ if os .path .exists (potential_corpus_dir ):
191+ dataset_path = potential_corpus_dir
192+ else :
193+ dataset_path = os .path .dirname (dataset_path )
194+
187195 return dataset_path
188196
189197
@@ -276,19 +284,37 @@ def _download_helper(
276284 os .mkdir (corpus_dir )
277285 zipf .extractall (corpus_dir )
278286
279- elif url .lower ().endswith (".corpus" ) or url .lower ().endswith (".zip" ):
280- # print(dataset_path)
287+ elif (
288+ url .lower ().endswith (".corpus" )
289+ or url .lower ().endswith (".corpus.zip" )
290+ or url .lower ().endswith (".zip" )
291+ ):
281292 with zipfile .ZipFile (dataset_path , "r" ) as zipf :
282- zipf .extractall (os .path .dirname (dataset_path ))
293+ # Check if the zip contains a directory with the corpus name
294+ zip_contents = zipf .namelist ()
295+ has_corpus_dir = any (item .startswith (f"{ name } /" ) for item in zip_contents )
296+
297+ if has_corpus_dir :
298+ # If zip contains a corpus directory, extract to parent directory
299+ zipf .extractall (os .path .dirname (dataset_path ))
300+ else :
301+ # If zip contains files at root level, create corpus directory and extract there
302+ corpus_dir = os .path .join (os .path .dirname (dataset_path ), name )
303+ if not os .path .exists (corpus_dir ):
304+ os .mkdir (corpus_dir )
305+ zipf .extractall (corpus_dir )
283306
284307 if verbose :
285308 print ("Done" )
286309 # for Corpus objects only: check the Corpus version
287310 if is_corpus :
288311 with open (downloadeds_path , "a" ) as f :
289- fn = os .path .join (
290- os .path .dirname (dataset_path ), name
291- ) # os.path.join(os.path.dirname(data), name)
312+ # Check if the corpus directory exists, otherwise use the parent directory
313+ potential_corpus_dir = os .path .join (os .path .dirname (dataset_path ), name )
314+ if os .path .exists (potential_corpus_dir ):
315+ fn = potential_corpus_dir
316+ else :
317+ fn = os .path .dirname (dataset_path )
292318 f .write (
293319 "{}$#${}$#${}\n " .format (
294320 name , os .path .realpath (os .path .dirname (dataset_path ) + "/" ), corpus_version (fn )
0 commit comments