@@ -17,12 +17,15 @@ def self.table_name_prefix
17
17
extend Detector ::BulkChecker
18
18
19
19
# Builds the detection set for a phrase: runs the pattern checkers, then discards ISSN and ISBN
# candidates that fail validation.
#
# @param phrase String. Often a `Term.phrase`.
# @return nil. Data is written to Hash `@detections` during processing. Whatever the final statement
#   evaluates to is a side effect and should not be relied on.
def initialize(phrase)
  @detections = {}
  pattern_checker(phrase)

  # Regex matches are only candidates; the strip_* steps prune the ones that are not valid identifiers.
  strip_invalid_issns
  strip_invalid_isbns
end
27
30
28
31
# The record method will consult the set of regex-based detectors that are defined in
@@ -53,13 +56,76 @@ def self.record(term)
53
56
# patterns lists the regular expressions used to spot standard identifiers, keyed by identifier type.
#
# The ISBN and ISSN expressions only find candidates that look like those identifiers; a regex cannot
# validate them, so invalid candidates are removed afterwards by the strip_invalid_* methods.
#
# @return Hash. Symbol identifier type => Regexp.
def patterns
  {
    barcode: /^39080[0-9]{9}$/,
    # Matches 13 or 10 ISBN characters, each optionally followed by hyphens and/or spaces.
    isbn: /\b(([0-9Xx][- ]*){13}|([0-9Xx][- ]*){10})\b/,
    issn: /\b[0-9]{4}-[0-9]{3}[0-9xX]\b/,
    pmid: /\b((pmid|PMID):\s?(\d{7,8}))\b/,
    doi: %r{\b10\.(\d+\.*)+/(([^\s.])+\.*)+\b}
  }
end
62
65
66
# strip_invalid_isbns removes the detected ISBN from `@detections` when it does not pass validation.
#
# ISBNs cannot be validated via regex. Regex gives us a candidate that looks like an ISBN; the candidate
# is kept only if `valid_isbn?` accepts it.
#
# @return nil when no ISBN was detected or the ISBN is valid; otherwise the value removed by Hash#delete.
def strip_invalid_isbns
  candidate = @detections[:isbn]
  return unless candidate
  return if valid_isbn?(candidate)

  @detections.delete(:isbn)
end
75
+
76
# valid_isbn? normalizes a candidate and defers to the 10 or 13 character validator as appropriate.
#
# The ISBN detection pattern accepts hyphens and spaces between (and trailing after) the ISBN characters,
# so both separators are removed before the length is inspected. Stripping only hyphens would misroute
# a space-separated ISBN-10 such as "0 306 40615 2" (13 characters including spaces) to the ISBN-13 check.
#
# @param candidate String. A string representation of a regex detected ISBN.
# @return boolean
def valid_isbn?(candidate)
  # The hyphen is placed last in the selector so it is read literally rather than as a range.
  digits = candidate.delete(' -').chars

  case digits.length
  when 10 then valid_isbn_10?(digits)
  when 13 then valid_isbn_13?(digits)
  else
    # This shouldn't happen, log an error.
    Rails.logger.error("Non-10 or 13 digit sequence detected as ISBN: #{candidate}")
    Sentry.capture_message('Non-10 or 13 digit sequence detected as ISBN')
    false
  end
end
96
+
97
# valid_isbn_10? follows the ISBN 10 specification for validation
# https://en.wikipedia.org/wiki/ISBN#ISBN-10_check_digits
#
# Each character is weighted 10 down to 1 by position and the weighted sum must be divisible by 11.
# 'X' stands for the value 10 and is only permitted as the final (check) character; candidates with
# an 'X' anywhere else, or with any other non-digit character, are rejected outright.
#
# @param digits Array. An array of strings representing each character from a detected ISBN candidate.
# @return boolean
def valid_isbn_10?(digits)
  # Without this guard a string such as "xxxxxxxxxx" sums to 550 and would be accepted.
  return false unless digits.join.match?(/\A\d{9}[\dXx]\z/)

  weighted_sum = digits.each_with_index.sum do |digit, index|
    value = digit.casecmp?('x') ? 10 : digit.to_i
    value * (10 - index)
  end

  (weighted_sum % 11).zero?
end
110
+
111
# valid_isbn_13? follows the ISBN 13 specification for validation
# https://en.wikipedia.org/wiki/ISBN#ISBN-13_check_digit_calculation
#
# Characters are weighted alternately 1 and 3 and the weighted sum must be divisible by 10. ISBN-13 has
# no 'X' check character, so any candidate that is not exactly 13 decimal digits is rejected outright.
#
# @param digits Array. An array of strings representing each character from a detected ISBN candidate.
# @return boolean
def valid_isbn_13?(digits)
  # String#to_i turns 'X' (or a space) into 0, which would let a string like "XXXXXXXXXXXXX" pass.
  return false unless digits.join.match?(/\A\d{13}\z/)

  weighted_sum = digits.each_with_index.sum do |digit, index|
    digit.to_i * (index.even? ? 1 : 3)
  end

  (weighted_sum % 10).zero?
end
124
+
125
+ # strip_invalid_issns coordinates the logic to remove ISSNs that are not valid from our list of detected ISSNs
126
+ #
127
+ # ISSNs cannot be validated via regex. Regex gives us a list of candidates that look like ISSNs. We remove invalid
128
+ # ISSNs by following validation specifications defined in the standard.
63
129
def strip_invalid_issns
64
130
return unless @detections [ :issn ]
65
131
0 commit comments