Skip to content

Commit 37c4027

Browse files
authored
Ingest NLPAICS (#5017)
1 parent 045dac1 commit 37c4027

File tree

3 files changed

+309
-2
lines changed

3 files changed

+309
-2
lines changed

bin/ingest.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,8 @@ def find_book():
325325
if os.path.basename(pdf_file).startswith("."):
326326
continue
327327

328-
# names are {abbrev}{number}.pdf
329-
match = re.match(r".*(\d+)\.pdf", pdf_file)
328+
# names are {abbrev}{number}.pdf, but may also have Anthology new-style IDs
329+
match = re.match(r".*?(\d+)\.pdf", pdf_file)
330330

331331
if match is not None:
332332
paper_num = int(match[1])
@@ -387,6 +387,13 @@ def find_book():
387387
log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
388388
shutil.copyfile(attachment_file_path, dest_path)
389389

390+
if paper_num not in volume:
391+
print(f"Fatal: no key {paper_num} in volume", file=sys.stderr)
392+
import json
393+
394+
print(json.dumps(volume, indent=2), file=sys.stderr)
395+
sys.exit(1)
396+
390397
volume[paper_num]["attachments"].append((dest_path, type_))
391398

392399
# create xml

0 commit comments

Comments
 (0)