Skip to content

Commit c26be66

Browse files
Merge pull request #19 from datacite/add-indexing-logic
Add indexing logic
2 parents 7b658e5 + 3dc5c5a commit c26be66

File tree

9 files changed

+362
-16
lines changed

9 files changed

+362
-16
lines changed

app/constants/relation_types.rb

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# frozen_string_literal: true
2+
3+
# Shared vocabularies of event relation type ids and related source ids.
#
# All ids are kebab-case strings. Every constant is frozen so that modules
# mixing this in cannot mutate the shared lists by accident.
module RelationTypes
  # Relation types treated as references.
  REFERENCE_RELATION_TYPES = %w[
    cites
    is-supplemented-by
    references
  ].freeze

  # Relation types treated as citations (the inverse of the reference types).
  CITATION_RELATION_TYPES = %w[
    is-cited-by
    is-supplement-to
    is-referenced-by
  ].freeze

  # Union of the reference and citation relation types.
  INCLUDED_RELATION_TYPES = (REFERENCE_RELATION_TYPES | CITATION_RELATION_TYPES).freeze

  # Part/whole relation types.
  PART_RELATION_TYPES = %w[
    is-part-of
    has-part
  ].freeze

  NEW_RELATION_TYPES = %w[
    is-reply-to
    is-translation-of
    is-published-in
  ].freeze

  # NOTE: three ids were previously misspelled ("is-coutinued-by",
  # "is-variant-from-of", "is-obsolete-by"). They now follow the DataCite
  # relationType names IsContinuedBy, IsVariantFormOf and IsObsoletedBy.
  RELATIONS_RELATION_TYPES = %w[
    compiles
    is-compiled-by
    documents
    is-documented-by
    has-metadata
    is-metadata-for
    is-derived-from
    is-source-of
    reviews
    is-reviewed-by
    requires
    is-required-by
    continues
    is-continued-by
    has-version
    is-version-of
    has-part
    is-part-of
    is-variant-form-of
    is-original-form-of
    is-identical-to
    obsoletes
    is-obsoleted-by
    is-new-version-of
    is-previous-version-of
    describes
    is-described-by
  ].freeze

  # Every relation type listed above, without duplicates (Array#| already
  # removes duplicates, so no extra uniq is needed).
  ALL_RELATION_TYPES = (
    RELATIONS_RELATION_TYPES |
    NEW_RELATION_TYPES |
    CITATION_RELATION_TYPES |
    REFERENCE_RELATION_TYPES
  ).freeze

  # Relation types that are neither reference/citation nor part/whole types.
  OTHER_RELATION_TYPES = (
    (RELATIONS_RELATION_TYPES | NEW_RELATION_TYPES) -
    INCLUDED_RELATION_TYPES -
    PART_RELATION_TYPES
  ).freeze

  RELATED_SOURCE_IDS = %w[
    datacite-related
    datacite-crossref
    crossref
  ].freeze
end

app/jobs/event_index_job.rb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# frozen_string_literal: true
2+
3+
# Background job that writes one record's document to the search index.
class EventIndexJob < ApplicationJob
  queue_as :events_index

  # These errors are handled by logging their message.
  rescue_from ActiveJob::DeserializationError,
              SocketError,
              Elasticsearch::Transport::Transport::Errors::BadRequest,
              Elasticsearch::Transport::Transport::Error do |exception|
    Rails.logger.error(exception.message)
  end

  # Indexes the given record and logs an error when the index response does
  # not report the document as "created" or "updated".
  #
  # @param obj [Object] record responding to `__elasticsearch__`
  def perform(obj)
    result = obj.__elasticsearch__.index_document
    return if %w[created updated].include?(result["result"])

    Rails.logger.error("[Events:EventIndexJob] OpenSearch Error: #{result.inspect}")
  end
end
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# frozen_string_literal: true
2+
3+
# Builds the search-index document for an event and derives its computed
# fields (doi, orcid, issn, prefix, citation_year, ...).
#
# The including class must provide the plain event attributes read below
# (uuid, subj_id, obj_id, relation_type_id, occurred_at, updated_at, ...) as
# well as `subj_hash` and `obj_hash`, the parsed subject/object metadata.
module EventIndexHandler
  include RelationTypes
  extend ActiveSupport::Concern

  # Used to prepare the event record for indexing.
  #
  # @param _options [Hash] accepted for interface compatibility; ignored
  # @return [Hash] the document to index, keyed by field name
  def as_indexed_json(_options = {})
    {
      "uuid" => uuid,
      "subj_id" => subj_id,
      "obj_id" => obj_id,
      "subj" => subj_hash.merge(cache_key: subj_cache_key),
      "obj" => obj_hash.merge(cache_key: obj_cache_key),
      "source_doi" => source_doi,
      "target_doi" => target_doi,
      "source_relation_type_id" => source_relation_type_id,
      "target_relation_type_id" => target_relation_type_id,
      "doi" => doi,
      "orcid" => orcid,
      "issn" => issn,
      "prefix" => prefix,
      "subtype" => subtype,
      "citation_type" => citation_type,
      "source_id" => source_id,
      "source_token" => source_token,
      "message_action" => message_action,
      "relation_type_id" => relation_type_id,
      "registrant_id" => registrant_id,
      "access_method" => access_method,
      "metric_type" => metric_type,
      "total" => total,
      "license" => license,
      "error_messages" => error_messages,
      "aasm_state" => aasm_state,
      "state_event" => state_event,
      "year_month" => year_month,
      "created_at" => created_at,
      "updated_at" => updated_at,
      "indexed_at" => indexed_at,
      "occurred_at" => occurred_at,
      "citation_id" => citation_id,
      "citation_year" => citation_year,
      "cache_key" => cache_key,
    }
  end

  # @return [String] cache key for the subject; falls back to the current
  #   time when the subject metadata has no "dateModified"
  def subj_cache_key
    timestamp = subj_hash["dateModified"] || Time.zone.now.iso8601
    "objects/#{subj_id}-#{timestamp}"
  end

  # @return [String] cache key for the object; falls back to the current
  #   time when the object metadata has no "dateModified"
  def obj_cache_key
    timestamp = obj_hash["dateModified"] || Time.zone.now.iso8601
    "objects/#{obj_id}-#{timestamp}"
  end

  # Collects every DOI connected to the event, in this order: subject and
  # object proxy identifiers, subject and object funder ids, then the subject
  # and object ids themselves.
  #
  # @return [Array<String>]
  def doi
    bare_doi = %r{\A10\.\d{4,5}/.+\z}

    # grep without a block returns the matching identifiers themselves. The
    # previous block form returned Regexp.last_match(1), which is always nil
    # because the pattern has no capture group.
    Array.wrap(subj_hash["proxyIdentifiers"]).grep(bare_doi) +
      Array.wrap(obj_hash["proxyIdentifiers"]).grep(bare_doi) +
      Array.wrap(subj_hash["funder"]).map { |f| DoiUtilities.doi_from_url(f["@id"]) }.compact +
      Array.wrap(obj_hash["funder"]).map { |f| DoiUtilities.doi_from_url(f["@id"]) }.compact +
      [DoiUtilities.doi_from_url(subj_id), DoiUtilities.doi_from_url(obj_id)].compact
  end

  # Collects ORCID ids from subject/object authors and from the subject and
  # object ids.
  #
  # @return [Array<String>]
  def orcid
    Array.wrap(subj_hash["author"]).map { |f| OrcidUtilities.orcid_from_url(f["@id"]) }.compact +
      Array.wrap(obj_hash["author"]).map { |f| OrcidUtilities.orcid_from_url(f["@id"]) }.compact +
      [OrcidUtilities.orcid_from_url(subj_id), OrcidUtilities.orcid_from_url(obj_id)].compact
  end

  # @return [Array, nil] ISSNs found under "periodical" in the subject and
  #   object metadata; nil when the lookup raises TypeError (dig raises it
  #   when "periodical" is a value that cannot be dug into, e.g. a String)
  def issn
    Array.wrap(subj_hash.dig("periodical", "issn")).compact +
      Array.wrap(obj_hash.dig("periodical", "issn")).compact
  rescue TypeError
    nil
  end

  # @return [Array<String>] flat list with the part before the first "/" of
  #   each entry of #doi (previously the list was wrapped in an extra array)
  def prefix
    doi.map { |d| d.to_s.split("/", 2).first }.compact
  end

  # @return [Array] the "@type" values of subject and object metadata
  def subtype
    # Read from the parsed obj_hash; the raw `obj` attribute is what
    # obj_hash parses, so indexing it with "@type" does not look up the key.
    [subj_hash["@type"], obj_hash["@type"]].compact
  end

  # @return [String, nil] both "@type" values sorted and joined with "-";
  #   nil when either type is blank or "CreativeWork"
  def citation_type
    types = [subj_hash["@type"], obj_hash["@type"]]
    return if types.any? { |type| type.blank? || type == "CreativeWork" }

    types.sort.join("-")
  end

  # @return [Array] registrant and provider ids of subject and object
  def registrant_id
    [
      subj_hash["registrantId"],
      obj_hash["registrantId"],
      subj_hash["providerId"],
      obj_hash["providerId"],
    ].compact
  end

  # @return [String, nil] the last "-"-separated segment of relation_type_id
  #   when it contains "requests" or "investigations"; nil otherwise
  def access_method
    type_id = relation_type_id.to_s
    return unless /(requests|investigations)/.match?(type_id)

    type_id.split("-").last
  end

  # @return [String, nil] the first three "-"-separated segments of
  #   relation_type_id when it contains "requests" or "investigations";
  #   nil otherwise
  def metric_type
    type_id = relation_type_id.to_s
    return unless /(requests|investigations)/.match?(type_id)

    type_id.split("-", 4)[0..2].join("-")
  end

  # @return [String, nil] occurred_at in UTC truncated to "YYYY-MM"
  def year_month
    occurred_at.utc.iso8601[0..6] if occurred_at.present?
  end

  # @return [String] subject and object ids, sorted and joined with "-"
  def citation_id
    [subj_id, obj_id].sort.join("-")
  end

  # @return [Integer, String] the larger of the subject and object
  #   publication years; "" when relation_type_id is in neither
  #   INCLUDED_RELATION_TYPES nor RELATIONS_RELATION_TYPES
  def citation_year
    if (INCLUDED_RELATION_TYPES + RELATIONS_RELATION_TYPES).exclude?(relation_type_id)
      return ""
    end

    years = [[subj_hash, subj_id], [obj_hash, obj_id]].map do |metadata, id|
      published = metadata["datePublished"] ||
        metadata["date_published"] ||
        date_published(id) ||
        year_month

      # to_s keeps the slice safe when the value is nil or not a String
      # (e.g. an Integer year or a date object); a missing value yields 0.
      published.to_s[0..3].to_i
    end

    years.max
  end

  # @return [String] cache key built from uuid and updated_at (current time
  #   when updated_at is nil)
  def cache_key
    timestamp = updated_at || Time.zone.now

    "events/#{uuid}-#{timestamp.iso8601}"
  end

  # Looks up the stored publication date for a DOI.
  #
  # NOTE(review): the Doi model in this change declares `publication_year`
  # but no `publication_date` attribute; confirm the column read here exists.
  #
  # @param doi [String] DOI or DOI URL
  # @return [Object, nil] the record's publication_date, or nil when no Doi
  #   record is found
  def date_published(doi)
    item = Doi.find_by(doi: DoiUtilities.uppercase_doi_from_url(doi))

    item[:publication_date] if item.present?
  end
end

app/models/concerns/relation_type_handler.rb

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# Method is only invoked via a before_validation callback in the events model.
55

66
module RelationTypeHandler
7+
include RelationTypes
78
extend ActiveSupport::Concern
89

910
def set_source_and_target_doi!
@@ -48,16 +49,4 @@ def set_source_and_target_doi!
4849
self.target_relation_type_id = "part_of"
4950
end
5051
end
51-
52-
REFERENCE_RELATION_TYPES = [
53-
"cites",
54-
"is-supplemented-by",
55-
"references",
56-
].freeze
57-
58-
CITATION_RELATION_TYPES = [
59-
"is-cited-by",
60-
"is-supplement-to",
61-
"is-referenced-by",
62-
].freeze
6352
end

app/models/doi.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# frozen_string_literal: true
2+
3+
# DOI record, backed by the "dataset" table of the datacite mysql database.
class Doi < ApplicationRecord
  self.table_name = "dataset"

  # Typed attributes exposed by this model
  attribute :doi, :string
  attribute :publication_year, :integer
end

app/models/event.rb

Lines changed: 81 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22

33
class Event < ApplicationRecord
44
include RelationTypeHandler
5-
# include Modelable
6-
# include Identifiable
7-
# include Elasticsearch::Model
5+
include EventIndexHandler
6+
include Elasticsearch::Model
87

98
# Attributes
109
attribute :uuid, :text
@@ -41,6 +40,84 @@ class Event < ApplicationRecord
4140
validates :updated_at, presence: true
4241
validates :indexed_at, presence: true
4342

44-
# Callbacks
43+
# Getters
44+
# Parsed form of the `subj` JSON attribute.
#
# Memoized per instance, so later changes to `subj` on the same instance are
# not reflected.
#
# @return [Hash] the decoded JSON (expected to be a JSON object), or an empty
#   hash when `subj` is nil or blank instead of raising from JSON.parse
def subj_hash
  @subj_hash ||= subj.to_s.strip.empty? ? {} : JSON.parse(subj)
end
47+
48+
# Parsed form of the `obj` JSON attribute.
#
# Memoized per instance, so later changes to `obj` on the same instance are
# not reflected.
#
# @return [Hash] the decoded JSON (expected to be a JSON object), or an empty
#   hash when `obj` is nil or blank instead of raising from JSON.parse
def obj_hash
  @obj_hash ||= obj.to_s.strip.empty? ? {} : JSON.parse(obj)
end
51+
52+
# Callback Hooks
4553
before_validation :set_source_and_target_doi!
54+
after_commit -> { EventIndexJob.perform_later(self) }
55+
56+
# OpenSearch Mappings
57+
mapping dynamic: "false" do
58+
indexes :uuid, type: :keyword
59+
indexes :subj_id, type: :keyword
60+
indexes :obj_id, type: :keyword
61+
indexes :doi, type: :keyword
62+
indexes :orcid, type: :keyword
63+
indexes :prefix, type: :keyword
64+
indexes :subtype, type: :keyword
65+
indexes :citation_type, type: :keyword
66+
indexes :issn, type: :keyword
67+
indexes :subj,
68+
type: :object,
69+
properties: {
70+
type: { type: :keyword },
71+
id: { type: :keyword },
72+
uid: { type: :keyword },
73+
proxyIdentifiers: { type: :keyword },
74+
datePublished: {
75+
type: :date,
76+
format: "date_optional_time||yyyy-MM-dd||yyyy-MM||yyyy",
77+
ignore_malformed: true,
78+
},
79+
registrantId: { type: :keyword },
80+
cache_key: { type: :keyword },
81+
}
82+
indexes :obj,
83+
type: :object,
84+
properties: {
85+
type: { type: :keyword },
86+
id: { type: :keyword },
87+
uid: { type: :keyword },
88+
proxyIdentifiers: { type: :keyword },
89+
datePublished: {
90+
type: :date,
91+
format: "date_optional_time||yyyy-MM-dd||yyyy-MM||yyyy",
92+
ignore_malformed: true,
93+
},
94+
registrantId: { type: :keyword },
95+
cache_key: { type: :keyword },
96+
}
97+
indexes :source_doi, type: :keyword
98+
indexes :target_doi, type: :keyword
99+
indexes :source_relation_type_id, type: :keyword
100+
indexes :target_relation_type_id, type: :keyword
101+
indexes :source_id, type: :keyword
102+
indexes :source_token, type: :keyword
103+
indexes :message_action, type: :keyword
104+
indexes :relation_type_id, type: :keyword
105+
indexes :registrant_id, type: :keyword
106+
indexes :access_method, type: :keyword
107+
indexes :metric_type, type: :keyword
108+
indexes :total, type: :integer
109+
indexes :license, type: :text, fields: { keyword: { type: "keyword" } }
110+
indexes :error_messages, type: :object
111+
indexes :callback, type: :text
112+
indexes :aasm_state, type: :keyword
113+
indexes :state_event, type: :keyword
114+
indexes :year_month, type: :keyword
115+
indexes :created_at, type: :date
116+
indexes :updated_at, type: :date
117+
indexes :indexed_at, type: :date
118+
indexes :occurred_at, type: :date
119+
indexes :citation_id, type: :keyword
120+
indexes :citation_year, type: :integer
121+
indexes :cache_key, type: :keyword
122+
end
46123
end

app/utilities/doi_utilities.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,13 @@ def uppercase_doi_from_url(url)
2121
uri.path.gsub(%r{^/}, "").upcase
2222
end
2323
end
24+
25+
# Extracts a lowercase DOI from a bare DOI, a "doi:"-prefixed DOI, or an
# http(s) URL on doi.org, dx.doi.org or handle.test.datacite.org.
#
# @param url [String, nil] candidate DOI or DOI URL
# @return [String, nil] the parsed URL's path without its leading slash,
#   downcased; nil when url does not match the accepted forms
def doi_from_url(url)
  # Dots in the host names are escaped so that only the literal hosts match
  # (an unescaped "." would also accept e.g. "doiXorg").
  doi_pattern = %r{\A(?:(http|https)://(dx\.)?(doi\.org|handle\.test\.datacite\.org)/)?(doi:)?(10\.\d{4,5}/.+)\z}
  return unless doi_pattern.match?(url)

  uri = Addressable::URI.parse(url)
  uri.path.sub(%r{\A/}, "").downcase
end
2432
end
2533
end

0 commit comments

Comments
 (0)