Skip to content

Commit 29df90c

Browse files
Fix up arxiv preprint data
1 parent e86f5ff commit 29df90c

File tree

2 files changed

+51
-17
lines changed

2 files changed

+51
-17
lines changed

db/enrichments.sqlite3

1.54 MB
Binary file not shown.

lib/tasks/enrichment.rake

Lines changed: 51 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -26,41 +26,75 @@ namespace :enrichment do
2626

2727
desc "Ingest ARXIV data"
2828
task ingest_arxiv: :environment do
29-
file = File.read("lib/data/20250615_arxiv_preprint_matching_results.json")
30-
data = JSON.parse(file)
29+
csv_path = Rails.root.join("lib/data/arxiv_preprint_matching.csv")
3130
count = 0
3231

33-
data.each do |item|
32+
CSV.foreach(csv_path, headers: true) do |row|
3433
count += 1
3534

3635
break if count == 2001
3736

37+
item = row.to_hash
38+
3839
enrichment = Enrichment.new(
39-
doi: item["input_doi"],
40-
source: "COMET",
41-
process: "10.0000/FAKE.PROCESS",
42-
field: "relatedIdentifiers",
43-
action: "insert",
44-
original_value: nil,
45-
enriched_value: {
46-
relationType: "Preprint",
47-
relatedIdentifier: item["matched_doi"],
48-
relatedIdentifierType: "DOI",
49-
},
40+
doi: item["doi"],
41+
source: item["source"],
42+
process: item["process"],
43+
field: item["field"],
44+
action: item["action"],
45+
original_value: item["originalValue"],
46+
enriched_value: JSON.parse(item["enrichedValue"]),
5047
created: Time.current.utc,
5148
updated: Time.current.utc,
52-
produced: Time.current.utc - 5.days,
49+
produced: item["produced"],
5350
)
5451

5552
if enrichment.save
56-
puts("Created enrichment for #{item["input_doi"]}")
53+
puts("Created enrichment for #{item["doi"]}")
5754
else
58-
puts("Failed to create enrichment for #{item["input_doi"]}")
55+
puts("Failed to create enrichment for #{item["doi"]}")
5956
puts(enrichment.errors.full_messages.join(","))
6057
end
6158
end
6259
end
6360

61+
# desc "Ingest ARXIV data"
62+
# task ingest_arxiv: :environment do
63+
# file = File.read("lib/data/20250615_arxiv_preprint_matching_results.json")
64+
# data = JSON.parse(file)
65+
# count = 0
66+
67+
# data.each do |item|
68+
# count += 1
69+
70+
# break if count == 2001
71+
72+
# enrichment = Enrichment.new(
73+
# doi: item["input_doi"],
74+
# source: "COMET",
75+
# process: "10.0000/FAKE.PROCESS",
76+
# field: "relatedIdentifiers",
77+
# action: "insert",
78+
# original_value: nil,
79+
# enriched_value: {
80+
# relationType: "Preprint",
81+
# relatedIdentifier: item["matched_doi"],
82+
# relatedIdentifierType: "DOI",
83+
# },
84+
# created: Time.current.utc,
85+
# updated: Time.current.utc,
86+
# produced: Time.current.utc - 5.days,
87+
# )
88+
89+
# if enrichment.save
90+
# puts("Created enrichment for #{item["input_doi"]}")
91+
# else
92+
# puts("Failed to create enrichment for #{item["input_doi"]}")
93+
# puts(enrichment.errors.full_messages.join(","))
94+
# end
95+
# end
96+
# end
97+
6498
desc "Ingest procedural resource type"
6599
task ingest_procedural_resource_type: :environment do
66100
file = File.read("lib/data/datacite_procedural_resource_type_general_reclassifications_datacite_lookup_format.json")

0 commit comments

Comments
 (0)