Ingestion with the new library · Issue #4338 · acl-org/acl-anthology

This issue tracks what needs to be done to be able to ingest new volumes using the new Python library.

Provide convenience functions to create new collections, volumes, and papers. 🟢
- Basic functionality for this has been added; how it should interact with the different indices is still an open question.
Add function to generate new bibkeys. 🟢
Add function to add new files, with checksum calculation. 🟢
Move normalization logic into the library. 🟡
Move LaTeX conversion into the library. 🟡
- Do we need our custom latex_to_unicode? Can we use pylatexenc instead? Should this be added as MarkupText.from_latex?
Make XML serialization produce minimal diffs by respecting the order of elements in the existing XML file. 🟠

Sketch how an adapted ingestion script should function, roughly:

Lines 306 to 398 in 431c8e9

    
           collection_id = str(year) + "." + venue 
        
           if (collection := anthology.collections.get(collection_id)) is None: 
        
               collection = anthology.collections.create(collection_id) 
        
           papers = [] 
        
           for xml in sorted(args.root_dir.glob("*.xml")): 
        
               paper_dict, issue_info, issue, volume = process_xml(xml, is_tacl) 
        
               if paper_dict["title"].startswith("Erratum: “"): 
        
                   continue 
        
               pdf_path = xml.parent / xml.with_suffix(".pdf").name 
        
               if not pdf_path.is_file(): 
        
                   logging.error(f"Missing pdf for {pdf_path}") 
        
                   sys.exit(1) 
        
               papers.append((paper_dict, pdf_path, issue_info, issue)) 
        
               pdf_destination = Path(args.pdfs_dir) 
        
               pdf_destination = pdf_destination / "pdf" / venue 
        
               pdf_destination.mkdir(parents=True, exist_ok=True) 
        
           # MIT Press does assign its IDs in page order, so we have to sort by page 
        
           def sort_papers_by_page(paper_tuple):
               """Return a sort key ordering papers by their numeric start page.

               ``paper_tuple[0]`` is the paper dict; its ``"pages"`` entry is a
               sequence whose first item is the start page.  The pages are
               strings (they are later combined with ``"-".join``), so comparing
               them directly would sort lexicographically and place "100" before
               "99".  The key therefore compares start pages as integers.

               Start pages that cannot be parsed as integers sort after all
               numeric ones, ordered by their text, so that mixed input never
               raises a ``TypeError`` during sorting.
               """
               startpage = paper_tuple[0]["pages"][0]
               try:
                   # Leading 0 groups all numeric pages before non-numeric ones.
                   return (0, int(startpage), "")
               except (TypeError, ValueError):
                   return (1, 0, str(startpage))
        
           for paper_dict, pdf_path, issue_info, issue in sorted( 
        
               papers, key=sort_papers_by_page 
        
           ): 
        
               issue = issue or "1" 
        
               if (volume := collection.get(issue)) is None: 
        
                   logging.info(f"New issue: {issue_info}") 
        
                   if venue == "cl": 
        
                       month = issue_info.split()[-2]  # blah blah blah month year 
        
                       if month not in MONTHS.values(): 
        
                           logging.error("Unknown month: " + month) 
        
                   else: 
        
                       month = None 
        
                   volume = collection.create_volume( 
        
                       issue, 
        
                       title=MarkupText.from_string(issue_info),  # TODO: from LaTeX? 
        
                       type=VolumeType.JOURNAL, 
        
                       year=str(year), 
        
                       month=month, 
        
                       publisher="MIT Press", 
        
                       address="Cambridge, MA", 
        
                       venue_ids=[venue], 
        
                       journal_volume=volume, 
        
                       journal_issue=issue, 
        
                   ) 
        
               # Check if the paper is already present in the volume 
        
               if any(paper.get("doi") == paper_dict["doi"] for paper in volume.papers()): 
        
                   logging.info(f"Skipping existing paper with DOI {paper_dict['doi']}") 
        
                   continue 
        
               # TODO: encapsulate 
        
               from acl_anthology.collections import Paper 
        
               from acl_anthology.people import NameSpecification, Name 
        
               paper = Paper( 
        
                   id=...,  # should be determined automatically 
        
                   parent=volume, 
        
                   bibkey="",  # should be determined automatically 
        
                   title=MarkupText.from_string(paper_dict["title"]),  # TODO: from LaTeX? 
        
                   abstract=MarkupText.from_string(paper_dict["abstract"]),  # TODO: from LaTeX? 
        
                   doi=paper_dict["doi"], 
        
                   pages="-".join(paper_dict["pages"]),  # TODO 
        
                   authors=[ 
        
                       NameSpecification(Name.from_dict(author)) 
        
                       for author in paper_dict["authors"] 
        
                   ], 
        
               ) 
        
               anth_id = paper.full_id 
        
               destination = pdf_destination / f"{anth_id}.pdf" 
        
               print(f"Copying {pdf_path} to {destination}") 
        
               shutil.copyfile(pdf_path, destination) 
        
               # TODO: 
        
               from acl_anthology.files import PDFReference 
        
               paper.pdf = PDFReference(name=f"{anth_id}.pdf", checksum=...) 
        
               # checksum = compute_hash_from_file(pdf_path) 
        
               # TODO: Normalization needs to happen within the library 
        
               # for oldnode in papernode: 
        
               #    normalize(oldnode, informat="latex") 
        
           # All serialization to XML happens here 
        
           collection.save()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Ingestion with the new library #4338

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

	collection_id = str(year) + "." + venue
	if (collection := anthology.collections.get(collection_id)) is None:
	collection = anthology.collections.create(collection_id)

	papers = []
	for xml in sorted(args.root_dir.glob("*.xml")):
	paper_dict, issue_info, issue, volume = process_xml(xml, is_tacl)
	if paper_dict["title"].startswith("Erratum: “"):
	continue

	pdf_path = xml.parent / xml.with_suffix(".pdf").name
	if not pdf_path.is_file():
	logging.error(f"Missing pdf for {pdf_path}")
	sys.exit(1)

	papers.append((paper_dict, pdf_path, issue_info, issue))

	pdf_destination = Path(args.pdfs_dir)
	pdf_destination = pdf_destination / "pdf" / venue
	pdf_destination.mkdir(parents=True, exist_ok=True)

	# MIT Press does assign its IDs in page order, so we have to sort by page
	def sort_papers_by_page(paper_tuple):
	    """Return a sort key ordering papers by their numeric start page.

	    ``paper_tuple[0]`` is the paper dict; its ``"pages"`` entry is a
	    sequence whose first item is the start page.  The pages are strings
	    (they are later combined with ``"-".join``), so comparing them
	    directly would sort lexicographically and place "100" before "99".
	    The key therefore compares start pages as integers.

	    Start pages that cannot be parsed as integers sort after all numeric
	    ones, ordered by their text, so that mixed input never raises a
	    ``TypeError`` during sorting.
	    """
	    startpage = paper_tuple[0]["pages"][0]
	    try:
	        # Leading 0 groups all numeric pages before non-numeric ones.
	        return (0, int(startpage), "")
	    except (TypeError, ValueError):
	        return (1, 0, str(startpage))

	for paper_dict, pdf_path, issue_info, issue in sorted(
	papers, key=sort_papers_by_page
	):
	issue = issue or "1"
	if (volume := collection.get(issue)) is None:
	logging.info(f"New issue: {issue_info}")

	if venue == "cl":
	month = issue_info.split()[-2] # blah blah blah month year
	if month not in MONTHS.values():
	logging.error("Unknown month: " + month)
	else:
	month = None

	volume = collection.create_volume(
	issue,
	title=MarkupText.from_string(issue_info), # TODO: from LaTeX?
	type=VolumeType.JOURNAL,
	year=str(year),
	month=month,
	publisher="MIT Press",
	address="Cambridge, MA",
	venue_ids=[venue],
	journal_volume=volume,
	journal_issue=issue,
	)

	# Check if the paper is already present in the volume
	if any(paper.get("doi") == paper_dict["doi"] for paper in volume.papers()):
	logging.info(f"Skipping existing paper with DOI {paper_dict['doi']}")
	continue

	# TODO: encapsulate
	from acl_anthology.collections import Paper
	from acl_anthology.people import NameSpecification, Name

	paper = Paper(
	id=..., # should be determined automatically
	parent=volume,
	bibkey="", # should be determined automatically
	title=MarkupText.from_string(paper_dict["title"]), # TODO: from LaTeX?
	abstract=MarkupText.from_string(paper_dict["abstract"]), # TODO: from LaTeX?
	doi=paper_dict["doi"],
	pages="-".join(paper_dict["pages"]), # TODO
	authors=[
	NameSpecification(Name.from_dict(author))
	for author in paper_dict["authors"]
	],
	)

	anth_id = paper.full_id
	destination = pdf_destination / f"{anth_id}.pdf"
	print(f"Copying {pdf_path} to {destination}")
	shutil.copyfile(pdf_path, destination)

	# TODO:
	from acl_anthology.files import PDFReference

	paper.pdf = PDFReference(name=f"{anth_id}.pdf", checksum=...)
	# checksum = compute_hash_from_file(pdf_path)

	# TODO: Normalization needs to happen within the library
	# for oldnode in papernode:
	# normalize(oldnode, informat="latex")

	# All serialization to XML happens here
	collection.save()

Ingestion with the new library #4338

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions