Skip to content

Commit 84f03b1

Browse files
authored
Corpus download issues (#297)
1 parent f224f22 commit 84f03b1

File tree

1 file changed

+32
-6
lines changed

1 file changed

+32
-6
lines changed

convokit/util.py

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,14 @@ def download(
184184
print("Dataset already exists at {}".format(dataset_path))
185185
dataset_path = os.path.join(downloaded_paths[name], name)
186186

187+
# Check if the corpus directory exists, otherwise use the parent directory
188+
if needs_download:
189+
potential_corpus_dir = os.path.join(os.path.dirname(dataset_path), name)
190+
if os.path.exists(potential_corpus_dir):
191+
dataset_path = potential_corpus_dir
192+
else:
193+
dataset_path = os.path.dirname(dataset_path)
194+
187195
return dataset_path
188196

189197

@@ -276,19 +284,37 @@ def _download_helper(
276284
os.mkdir(corpus_dir)
277285
zipf.extractall(corpus_dir)
278286

279-
elif url.lower().endswith(".corpus") or url.lower().endswith(".zip"):
280-
# print(dataset_path)
287+
elif (
288+
url.lower().endswith(".corpus")
289+
or url.lower().endswith(".corpus.zip")
290+
or url.lower().endswith(".zip")
291+
):
281292
with zipfile.ZipFile(dataset_path, "r") as zipf:
282-
zipf.extractall(os.path.dirname(dataset_path))
293+
# Check if the zip contains a directory with the corpus name
294+
zip_contents = zipf.namelist()
295+
has_corpus_dir = any(item.startswith(f"{name}/") for item in zip_contents)
296+
297+
if has_corpus_dir:
298+
# If zip contains a corpus directory, extract to parent directory
299+
zipf.extractall(os.path.dirname(dataset_path))
300+
else:
301+
# If zip contains files at root level, create corpus directory and extract there
302+
corpus_dir = os.path.join(os.path.dirname(dataset_path), name)
303+
if not os.path.exists(corpus_dir):
304+
os.mkdir(corpus_dir)
305+
zipf.extractall(corpus_dir)
283306

284307
if verbose:
285308
print("Done")
286309
# for Corpus objects only: check the Corpus version
287310
if is_corpus:
288311
with open(downloadeds_path, "a") as f:
289-
fn = os.path.join(
290-
os.path.dirname(dataset_path), name
291-
) # os.path.join(os.path.dirname(data), name)
312+
# Check if the corpus directory exists, otherwise use the parent directory
313+
potential_corpus_dir = os.path.join(os.path.dirname(dataset_path), name)
314+
if os.path.exists(potential_corpus_dir):
315+
fn = potential_corpus_dir
316+
else:
317+
fn = os.path.dirname(dataset_path)
292318
f.write(
293319
"{}$#${}$#${}\n".format(
294320
name, os.path.realpath(os.path.dirname(dataset_path) + "/"), corpus_version(fn)

0 commit comments

Comments
 (0)