-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathenrichment.rake
More file actions
130 lines (112 loc) · 3.53 KB
/
enrichment.rake
File metadata and controls
130 lines (112 loc) · 3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# frozen_string_literal: true
require "csv"
require "json"
namespace :enrichment do
# Creates the `enrichments` table on the connection configured as
# :enrichments. The statement uses IF NOT EXISTS, so running the task when the
# table is already present leaves it untouched.
desc "Create the enrichments sqlite database table"
task create_sqlite_table: :environment do
  # Squiggly heredoc strips the common indent; every SQL line is kept at the
  # same depth so the statement text sent to the database has no leading
  # whitespace.
  create_table_sql = <<~SQL
    CREATE TABLE IF NOT EXISTS enrichments (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    doi TEXT,
    source TEXT,
    process TEXT,
    field TEXT,
    action TEXT,
    original_value TEXT,
    enriched_value TEXT,
    created DATETIME,
    updated DATETIME,
    produced DATETIME
    );
  SQL

  # Point ActiveRecord::Base at the :enrichments configuration before running
  # the DDL on its connection.
  ActiveRecord::Base.establish_connection(:enrichments)
  ActiveRecord::Base.connection.execute(create_table_sql)
end
# Ingests rows from lib/data/arxiv_preprint_matching.csv as Enrichment records.
#
# Only the first 2000 data rows are processed. Each row's "enrichedValue"
# column is parsed as JSON before being assigned; a row whose value is missing
# or is not valid JSON is reported and skipped instead of aborting the run.
# The outcome of every row is printed to stdout.
desc "Ingest ARXIV data"
task ingest_arxiv: :environment do
  max_rows = 2000
  csv_path = Rails.root.join("lib/data/arxiv_preprint_matching.csv")

  # CSV.foreach without a block returns an Enumerator; lazy.take stops reading
  # the file once max_rows rows have been yielded.
  CSV.foreach(csv_path, headers: true).lazy.take(max_rows).each do |row|
    item = row.to_hash

    begin
      enriched_value = JSON.parse(item["enrichedValue"])
    rescue JSON::ParserError, TypeError => e
      # TypeError: the column is nil (empty or absent); ParserError: malformed JSON.
      puts("Failed to create enrichment for #{item["doi"]}")
      puts("Invalid enrichedValue: #{e.message}")
      next
    end

    # Single timestamp per row so created and updated are exactly equal.
    now = Time.current.utc
    enrichment = Enrichment.new(
      doi: item["doi"],
      source: item["source"],
      process: item["process"],
      field: item["field"],
      action: item["action"],
      original_value: item["originalValue"],
      enriched_value: enriched_value,
      created: now,
      updated: now,
      produced: item["produced"],
    )

    if enrichment.save
      puts("Created enrichment for #{item["doi"]}")
    else
      puts("Failed to create enrichment for #{item["doi"]}")
      puts(enrichment.errors.full_messages.join(","))
    end
  end
end
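# Disabled alternative implementation of ingest_arxiv: it reads the JSON
# results file instead of the CSV and hard-codes source/process/field/action.
# desc "Ingest ARXIV data"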
# task ingest_arxiv: :environment do
# file = File.read("lib/data/20250615_arxiv_preprint_matching_results.json")
# data = JSON.parse(file)
# count = 0
# data.each do |item|
# count += 1
# break if count == 2001
# enrichment = Enrichment.new(
# doi: item["input_doi"],
# source: "COMET",
# process: "10.0000/FAKE.PROCESS",
# field: "relatedIdentifiers",
# action: "insert",
# original_value: nil,
# enriched_value: {
# relationType: "Preprint",
# relatedIdentifier: item["matched_doi"],
# relatedIdentifierType: "DOI",
# },
# created: Time.current.utc,
# updated: Time.current.utc,
# produced: Time.current.utc - 5.days,
# )
# if enrichment.save
# puts("Created enrichment for #{item["input_doi"]}")
# else
# puts("Failed to create enrichment for #{item["input_doi"]}")
# puts(enrichment.errors.full_messages.join(","))
# end
# end
# end
# Ingests resource-type reclassifications from the JSON file under lib/data as
# Enrichment records with field "types" and action "update".
#
# Only the first 2000 entries of the parsed JSON are processed. Each entry's
# "currentTypes" becomes original_value and "reclassifiedTypes" becomes
# enriched_value. The outcome of every entry is printed to stdout.
desc "Ingest procedural resource type"
task ingest_procedural_resource_type: :environment do
  max_items = 2000
  # Resolve against Rails.root (as ingest_arxiv does) so the task does not
  # depend on the process's current working directory.
  json_path = Rails.root.join(
    "lib/data/datacite_procedural_resource_type_general_reclassifications_datacite_lookup_format.json",
  )
  records = JSON.parse(File.read(json_path))

  records.first(max_items).each do |item|
    # Single timestamp per entry: created == updated, produced is 5 days earlier.
    now = Time.current.utc
    enrichment = Enrichment.new(
      doi: item["doi"],
      source: "COMET",
      process: "10.0000/FAKE.PROCESS",
      field: "types",
      action: "update",
      original_value: item["currentTypes"],
      enriched_value: item["reclassifiedTypes"],
      created: now,
      updated: now,
      produced: now - 5.days,
    )

    if enrichment.save
      puts("Created enrichment for #{item["doi"]}")
    else
      puts("Failed to create enrichment for #{item["doi"]}")
      puts(enrichment.errors.full_messages.join(","))
    end
  end
end
end