Skip to content

Commit fd8ed52

Browse files
committed
Normalize licenses parsed from RO-Crate to SPDX ID
1 parent 1e18d40 commit fd8ed52

File tree

9 files changed

+58
-12
lines changed

9 files changed

+58
-12
lines changed

app/validators/license_validator.rb

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@ class LicenseValidator < ActiveModel::EachValidator
33
# Validates that +value+ identifies a license known to Seek::License.
#
# A value that Seek::License.find already recognizes is accepted unchanged.
# Otherwise the value is passed through Seek::License.normalize; when that
# yields a result, the result is written back to +attribute+ on the record and
# the record is accepted. If neither succeeds, an error is added to
# +attribute+ using options[:message] or a default message.
#
# @param record the object being validated; must respond to "<attribute>=" and #errors
# @param attribute the name of the attribute under validation
# @param value the current value of that attribute
def validate_each(record, attribute, value)
  return if Seek::License.find(value)

  # Normalize the *incoming value*. Passing the not-yet-assigned local here
  # would always hand nil to normalize, so nothing would ever be recognized.
  normalized = Seek::License.normalize(value)
  if normalized
    record.send("#{attribute}=", normalized)
    return
  end

  record.errors.add(attribute, options[:message] || "isn't a recognized license")
end

lib/seek/license.rb

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,13 +41,24 @@ def self.find(id, source = Seek::License.combined)
4141
end
4242

4343
# Looks up the license ID registered for +uri+ in uri_map.
#
# The lookup key is the downcased URI, so matching is case-insensitive.
#
# @param uri the license URI to look up; nil is tolerated
# @return the matching ID from uri_map, or nil when +uri+ is nil or unknown
def self.uri_to_id(uri)
  # A plain hash lookup tolerated nil; calling downcase on it would raise.
  return nil if uri.nil?

  uri_map[uri.downcase]
end
4646

4747
# Returns whatever +source+ holds under the key +id+, without wrapping it.
# +source+ defaults to Seek::License.combined.
def self.find_as_hash(id, source = Seek::License.combined)
4848
source[id]
4949
end
5050

51+
# Maps a loosely written license reference onto the ID it is registered under.
#
# Surrounding whitespace is ignored. A reference that starts with an http(s)
# scheme is resolved through uri_to_id; anything else is matched against the
# known IDs via normalized_id_map, ignoring case.
#
# @param license the raw license string (an ID or a URI); may be nil or blank
# @return the registered ID, or nil when the input is blank or not recognized
def self.normalize(license)
  return nil if license.blank?

  candidate = license.strip
  # URI schemes are case-insensitive, and uri_to_id already downcases its key,
  # so the scheme check must not be case-sensitive either.
  if candidate.start_with?(/https?:/i)
    uri_to_id(candidate)
  else
    normalized_id_map[candidate.downcase]
  end
end
61+
5162
# True when this license's +id+ equals the NULL_LICENSE constant.
def is_null_license?
5263
id == NULL_LICENSE
5364
end
@@ -84,10 +95,19 @@ def self.uri_map
8495
@uri_map = {}
8596
combined.each do |id, license|
8697
(license['urls'] || []).each do |url|
87-
@uri_map[url] ||= id
98+
@uri_map[url.downcase] ||= id
8899
end
89100
end
90101
@uri_map
91102
end
103+
104+
# Memoized lookup table from each downcased key of +combined+ to the key as
# originally written, used for case-insensitive ID matching.
def self.normalized_id_map
  unless @normalized_id_map
    @normalized_id_map = {}
    combined.each_key { |key| @normalized_id_map[key.downcase] = key }
  end

  @normalized_id_map
end
92112
end
93113
end

lib/seek/workflow_extractors/cff.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def metadata
3939
metadata[:other_creators] = other_creators.join(', ')
4040

4141
metadata[:title] = cff.title if cff.title.present?
42-
metadata[:license] = cff.license if cff.license.present?
42+
metadata[:license] = Seek::License.normalize(cff.license) if cff.license.present?
4343
metadata[:tags] = cff.keywords.map(&:strip) if cff.keywords.present?
4444
metadata[:doi] = cff.doi if cff.doi.present?
4545
metadata[:source_link_url] = cff.url if cff.url.present?

lib/seek/workflow_extractors/cwl.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def parse_metadata(existing_metadata, yaml_or_json_string)
100100
existing_metadata[:description] = cwl['doc']
101101
end
102102
if cwl.key?('s:license')
103-
existing_metadata[:license] = cwl['s:license']
103+
existing_metadata[:license] = Seek::License.normalize(cwl['s:license'])
104104
end
105105

106106
existing_metadata[:internals] = {

lib/seek/workflow_extractors/galaxy.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def metadata
3939
end
4040

4141
metadata[:description] = galaxy['annotation'] if galaxy['annotation'].present?
42-
metadata[:license] = galaxy['license'] if galaxy['license'].present?
42+
metadata[:license] = Seek::License.normalize(galaxy['license']) if galaxy['license'].present?
4343

4444
if galaxy['creator']
4545
people, others = Array(galaxy['creator']).partition { |c| c['class'] == 'Person' }

lib/seek/workflow_extractors/rocrate.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def metadata_from_crate(crate, m)
7474

7575
m[:title] = crate['name'] if crate['name'].present?
7676
m[:description] = crate['description'] if crate['description'].present?
77-
m[:license] = crate['license'] if crate['license'].present?
77+
m[:license] = Seek::License.normalize(crate['license']) if crate['license'].present?
7878

7979
other_creators = []
8080
authors = []
Binary file not shown.

test/unit/license_test.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,4 +171,18 @@ class LicenseTest < ActiveSupport::TestCase
171171
assert_equal 'No license - no permission to use unless the owner grants a licence', license.full_display_title
172172
end
173173

174+
test 'normalize incoming license' do
  # Every one of these spellings (URIs, exact ID, odd casing, stray
  # whitespace) must resolve to the same ID.
  [
    'https://spdx.org/licenses/CC-BY-4.0.html',
    'https://spdx.org/licenses/CC-BY-4.0',
    'https://creativecommons.org/licenses/by/4.0/',
    'https://creativecommons.org/licenses/by/4.0/legalcode',
    'CC-BY-4.0',
    'cc-by-4.0',
    'cc-By-4.0',
    " cc-By-4.0\n "
  ].each do |input|
    assert_equal 'CC-BY-4.0', Seek::License.normalize(input)
  end

  # Unrecognized, nil and empty input have no normalized form.
  ['huh what', nil, ''].each do |input|
    assert_nil Seek::License.normalize(input)
  end
end
187+
174188
end

test/unit/workflow_extraction/ro_crate_extraction_test.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,20 @@ class RoCrateExtractionTest < ActiveSupport::TestCase
127127
assert_equal 'Apache-2.0', metadata[:license]
128128
end
129129

130+
test 'extracts and normalizes URI license from ro-crate-metadata, and ignores Licensee "other" license file' do
  # Two behaviours are covered here:
  #  1. A license given as a URI (e.g. https://opensource.org/licenses/MIT) is normalized to its SPDX ID,
  #     so that the correct license ends up selected in the dropdown.
  #  2. Licensee flags the file `not_a_license.py` as possibly containing a license and returns the
  #     SPDX ID `NOASSERTION`; that result is ignored in favour of the MIT license from the RO-Crate metadata.
  crate_file = open_fixture_file('workflows/ro-crate-with-other-license-file.crate.zip')
  extracted = Seek::WorkflowExtractors::ROCrate.new(crate_file).metadata

  assert_equal 'MIT', extracted[:license]
end
143+
130144
test 'extracts author with affiliation as array' do
131145
# Also note that the following is just a regular RO-Crate, not a Workflow RO-Crate
132146
wf = open_fixture_file('ro_crates/affiliation_array.crate.zip')

0 commit comments

Comments
 (0)