Skip to content

Commit e86f5ff

Browse files
Ingest arxiv data
1 parent fd72614 commit e86f5ff

File tree

2 files changed, with 19 additions and 13 deletions.

db/enrichments.sqlite3

568 KB
Binary file not shown.

lib/tasks/enrichment.rake

Lines changed: 19 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -26,30 +26,36 @@ namespace :enrichment do
2626

2727
desc "Ingest ARXIV data"
2828
task ingest_arxiv: :environment do
29-
CSV.foreach("lib/data/20250426_arxiv_sample_3_matches.csv", headers: true) do |row|
29+
file = File.read("lib/data/20250615_arxiv_preprint_matching_results.json")
30+
data = JSON.parse(file)
31+
count = 0
32+
33+
data.each do |item|
34+
count += 1
35+
36+
break if count == 2001
37+
3038
enrichment = Enrichment.new(
31-
doi: row["input_doi"],
39+
doi: item["input_doi"],
3240
source: "COMET",
33-
process: "10.000/FAKE.PROCESS",
34-
field: "types",
35-
action: "update",
41+
process: "10.0000/FAKE.PROCESS",
42+
field: "relatedIdentifiers",
43+
action: "insert",
44+
original_value: nil,
3645
enriched_value: {
37-
ris: "GEN",
38-
bibtex: "misc",
39-
citeproc: "article",
40-
schemaOrg: "CreativeWork",
41-
resourceType: "Article",
42-
resourceTypeGeneral: "Dataset",
46+
relationType: "Preprint",
47+
relatedIdentifier: item["matched_doi"],
48+
relatedIdentifierType: "DOI",
4349
},
4450
created: Time.current.utc,
4551
updated: Time.current.utc,
4652
produced: Time.current.utc - 5.days,
4753
)
4854

4955
if enrichment.save
50-
puts("Created enrichment for #{row["input_doi"]}")
56+
puts("Created enrichment for #{item["input_doi"]}")
5157
else
52-
puts("Failed to create enrichment for #{row["input_doi"]}")
58+
puts("Failed to create enrichment for #{item["input_doi"]}")
5359
puts(enrichment.errors.full_messages.join(","))
5460
end
5561
end

0 commit comments

Comments
 (0)