Skip to content

Commit 1c8ce6d

Browse files
authored
Merge pull request #171 from MITLibraries/tco-114-validate-isbns
Validate ISBNs detected via regex
2 parents 7f8bd83 + 624e6fc commit 1c8ce6d

File tree

3 files changed

+101
-8
lines changed

3 files changed

+101
-8
lines changed

.yardopts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
--no-private --protected app/**/*.rb - docs/**/*.md
1+
--private --protected app/**/*.rb - docs/**/*.md

app/models/detector/standard_identifiers.rb

+69-3
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@ def self.table_name_prefix
1717
extend Detector::BulkChecker
1818

1919
# Initialization process will run pattern checkers and strip invalid ISSN detections.
20-
# @param phrase String. Often a `Term.phrase`.
21-
# @return Nothing intentional. Data is written to Hash `@detections` during processing.
20+
#
21+
# @param phrase String. Often a `Term.phrase`.
22+
# @return nil. Data is written to Hash `@detections` during processing. Things technically get
23+
# returned here but it is a side effect and should not be relied on.
2224
def initialize(phrase)
2325
# Every instance starts with an empty Hash of detections.
@detections = {}
2426
# pattern_checker is defined outside this excerpt; presumably it fills @detections using `patterns` — confirm.
pattern_checker(phrase)
2527
# The strip_* steps run after pattern_checker because they prune entries already stored in @detections.
strip_invalid_issns
28+
strip_invalid_isbns
2629
end
2730

2831
# The record method will consult the set of regex-based detectors that are defined in
@@ -53,13 +56,76 @@ def self.record(term)
5356
# Returns a Hash that maps each identifier type (Symbol) to the Regexp used to spot candidates of that type.
def patterns
5457
{
5558
# NOTE(review): in Ruby `^`/`$` anchor at line boundaries, not string boundaries; `\A`/`\z` would be
# needed to require that the entire phrase is a barcode — confirm which is intended.
barcode: /^39080[0-9]{9}$/,
56-
isbn: /\b(ISBN-*(1[03])* *(: ){0,1})*(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
59+
# Thirteen or ten characters from [0-9Xx], each optionally followed by hyphens and/or spaces.
# The check digit is not verified by this expression.
isbn: /\b(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
5760
# Four digits, a hyphen, three digits, then a digit or x/X.
issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
5861
# "pmid:" or "PMID:", an optional whitespace character, then seven or eight digits.
pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
5962
doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
6063
}
6164
end
6265

66+
# strip_invalid_isbns drops the :isbn entry from `@detections` when the detected candidate fails validation.
#
# A regex can only find strings that are shaped like ISBNs; it cannot confirm the check digit. The actual
# verification is delegated to valid_isbn?.
#
# @return nil when nothing is removed, otherwise the removed candidate. The return value is incidental;
#   the intended effect is the change to `@detections`.
def strip_invalid_isbns
  candidate = @detections[:isbn]

  @detections.delete(:isbn) if candidate && !valid_isbn?(candidate)
end
75+
76+
# valid_isbn? removes separators from a candidate, then defers to the 10 or 13 character validator.
#
# The ISBN regex permits both hyphens and spaces between characters, so both are stripped before the
# length is inspected. Removing only hyphens would miscount a space-separated ISBN and reject it.
#
# @param candidate String. A string representation of a regex detected ISBN.
# @return boolean
def valid_isbn?(candidate)
  # In String#delete a hyphen at the end of the selector is a literal hyphen, not a range.
  digits = candidate.delete(' -').chars

  case digits.length
  when 10
    valid_isbn_10?(digits)
  when 13
    valid_isbn_13?(digits)
  else
    # This shouldn't happen, log an error.
    Rails.logger.error("Non-10 or 13 digit sequence detected as ISBN: #{candidate}")
    Sentry.capture_message('Non-10 or 13 digit sequence detected as ISBN')
    false
  end
end
96+
97+
# valid_isbn_10? follows the ISBN 10 specification for validation
# https://en.wikipedia.org/wiki/ISBN#ISBN-10_check_digits
#
# Each character is weighted 10 down to 1 by position and the weighted sum must be divisible by 11.
# 'X' (either case) represents the value 10 and is only accepted as the final, check character.
#
# @param digits Array. An array of strings representing each character from a detected ISBN candidate.
# @return boolean
def valid_isbn_10?(digits)
  # Nine digits followed by a digit or X; anything else (including an X elsewhere) is not an ISBN-10.
  return false unless digits.join.match?(/\A[0-9]{9}[0-9Xx]\z/)

  sum = digits.each_with_index.sum do |digit, index|
    value = digit.casecmp('x').zero? ? 10 : digit.to_i
    value * (10 - index)
  end

  (sum % 11).zero?
end
110+
111+
# valid_isbn_13? follows the ISBN 13 specification for validation
# https://en.wikipedia.org/wiki/ISBN#ISBN-13_check_digit_calculation
#
# Characters are weighted alternately 1 and 3 by position and the weighted sum must be divisible by 10.
# Unlike ISBN-10, an ISBN-13 never contains 'X', so any non-digit character makes the candidate invalid
# (String#to_i would otherwise silently read it as 0).
#
# @param digits Array. An array of strings representing each character from a detected ISBN candidate.
# @return boolean
def valid_isbn_13?(digits)
  return false unless digits.join.match?(/\A[0-9]{13}\z/)

  sum = digits.each_with_index.sum do |digit, index|
    digit.to_i * (index.even? ? 1 : 3)
  end

  (sum % 10).zero?
end
124+
125+
# strip_invalid_issns coordinates the logic to remove ISSNs that are not valid from our list of detected ISSNs
126+
#
127+
# ISSNs cannot be validated via regex. Regex gives us a list of candidates that look like ISSNs. We remove invalid
128+
# ISSNs by following validation specifications defined in the standard.
63129
def strip_invalid_issns
64130
return unless @detections[:issn]
65131

test/models/detector/standard_identifiers_test.rb

+31-4
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,24 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
2323
end
2424
end
2525

26+
test 'ISBN-10 examples with incorrect check digits are not detected' do
  # Wikipedia's sample ISBN-10s with the final (check) character replaced so that validation must fail.
  samples = %w[99921-58-10-1 9971-5-0210-1 960-425-059-1 80-902734-1-1 85-359-0277-1
               1-84356-028-1 0-684-84328-1 0-8044-2957-1 0-85131-041-1 93-86954-21-1 0-943396-04-1
               0-9752298-0-1]

  samples.each do |isbn|
    actual = Detector::StandardIdentifiers.new(isbn).detections

    # The message names the offending sample, since the loop otherwise hides which one failed.
    assert_nil(actual[:isbn], "expected #{isbn} not to be detected as an ISBN")
  end
end
38+
2639
test 'ISBN-13 examples' do
27-
samples = ['978-99921-58-10-7', '979-9971-5-0210-0', '978-960-425-059-0', '979-80-902734-1-6', '978-85-359-0277-5',
28-
'979-1-84356-028-3', '978-0-684-84328-5', '979-0-8044-2957-X', '978-0-85131-041-9', '979-93-86954-21-4',
29-
'978-0-943396-04-2', '979-0-9752298-0-X']
40+
samples = ['978-99921-58-10-4', '978-9971-5-0210-2', '978-960-425-059-2', '978-80-902734-1-2',
41+
'978-85-359-0277-8', '978-1-84356-028-9', '978-0-684-84328-5', '978-0-8044-2957-3',
42+
'978-0-85131-041-1', '978-93-86954-21-3', '978-0-943396-04-0', '978-0-9752298-0-4', '9798531132178',
43+
'9798577456832', '979-8-886-45174-0', '9781319145446']
3044

3145
samples.each do |isbn|
3246
actual = Detector::StandardIdentifiers.new(isbn).detections
@@ -35,8 +49,21 @@ class StandardIdentifiersTest < ActiveSupport::TestCase
3549
end
3650
end
3751

52+
test 'ISBN-13 examples with incorrect check digits are not detected' do
  # ISBN-13 shaped strings whose final (check) digit is wrong, so validation must fail.
  samples = %w[978-99921-58-10-1 978-9971-5-0210-1 978-960-425-059-1 978-80-902734-1-1
               978-85-359-0277-1 978-1-84356-028-1 978-0-684-84328-1 978-0-8044-2957-1
               978-0-85131-041-2 978-93-86954-21-1 978-0-943396-04-1 978-0-9752298-0-1 9798531132171
               9798577456831 979-8-886-45174-1 9781319145441]

  samples.each do |isbn|
    actual = Detector::StandardIdentifiers.new(isbn).detections

    # The message names the offending sample, since the loop otherwise hides which one failed.
    assert_nil(actual[:isbn], "expected #{isbn} not to be detected as an ISBN")
  end
end
64+
3865
test 'not ISBNs' do
39-
samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here']
66+
samples = ['orange cats like popcorn', '1234-6798', 'another ISBN not found here', '99921-58-10-1', '979-8-886-45174-1']
4067

4168
samples.each do |notisbn|
4269
actual = Detector::StandardIdentifiers.new(notisbn).detections

0 commit comments

Comments
 (0)