Skip to content

Commit fedf726

Browse files
Merge pull request #95 from NatLibFi/EKIR-232-Add-demarque-classification
Ekir 232 add demarque classification
2 parents 8fd6f5d + 3085801 commit fedf726

4 files changed

Lines changed: 151 additions & 4 deletions

File tree

core/classifier/__init__.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
"""
2+
A classifier module that classifies books and subjects into various categories. This module is called when importing
3+
collections to a library or updating classifications. It is called from core/model/classification.py.
4+
"""
5+
16
# If the genre classification does not match the fiction classification, throw
27
# away the genre classifications.
38
#
@@ -37,13 +42,15 @@ class ClassifierConstants:
3742
BISAC = "BISAC"
3843
BIC = "BIC"
3944
TAG = "tag" # Folksonomic tags.
45+
DEMARQUE = "De Marque"
4046

4147
# Appeal controlled vocabulary developed by NYPL
4248
NYPL_APPEAL = "NYPL Appeal"
4349

4450
GRADE_LEVEL = "Grade level" # "1-2", "Grade 4", "Kindergarten", etc.
4551
AGE_RANGE = "schema:typicalAgeRange" # "0-2", etc.
4652
AXIS_360_AUDIENCE = "Axis 360 Audience"
53+
DEMARQUE_AUDIENCE = "schema:Audience"
4754

4855
# We know this says something about the audience but we're not sure what.
4956
# Could be any of the values from GRADE_LEVEL or AGE_RANGE, plus
@@ -1223,6 +1230,32 @@ def add(self, classification):
12231230
# "Juvenile Fiction".
12241231
self.overdrive_juvenile_generic = classification
12251232

1233+
# E-kirjasto: Since De Marque classifications have target ages for children's and YA books, we want to weigh
1234+
# them more heavily by setting their weights to 1.0. This ensures that those books are classified accordingly.
1235+
if subject.type == "De Marque" and (
1236+
subject.audience == Classifier.AUDIENCE_CHILDREN
1237+
or subject.audience == Classifier.AUDIENCE_YOUNG_ADULT
1238+
):
1239+
if subject.target_age:
1240+
# Set the weight to 1.0 for any target age.
1241+
self.audience_weights = Counter()
1242+
self.audience_weights[subject.audience] += weight * 1.0
1243+
scaled_weight = classification.weight_as_indicator_of_target_age
1244+
target_min = subject.target_age.lower
1245+
target_max = subject.target_age.upper
1246+
if target_min is not None:
1247+
self.target_age_lower_weights[target_min] = 1.0
1248+
if target_max is not None:
1249+
self.target_age_upper_weights[target_max] = 1.0
1250+
# E-kirjasto: Some De Marque adult books were incorrectly classified as children's books. Let's set the
1251+
# weight to 1.0 for any adult audience books.
1252+
if (
1253+
subject.type == "De Marque"
1254+
and subject.audience == Classifier.AUDIENCE_ADULT
1255+
):
1256+
self.audience_weights = Counter()
1257+
self.audience_weights[subject.audience] += weight * 1.0
1258+
12261259
def weigh_metadata(self):
12271260
"""Modify the weights according to the given Work's metadata.
12281261
@@ -1497,12 +1530,10 @@ def target_age(self, audience):
14971530
if target_age_min is None:
14981531
target_age_min = target_age_max
14991532

1500-
if target_age_max is None:
1533+
# Err on the side of setting the minimum age too high but first ensure we have values to compare.
1534+
if target_age_min and target_age_max and target_age_min > target_age_max:
15011535
target_age_max = target_age_min
15021536

1503-
# Err on the side of setting the minimum age too high.
1504-
if target_age_min > target_age_max:
1505-
target_age_max = target_age_min
15061537
return Classifier.range_tuple(target_age_min, target_age_max)
15071538

15081539
def genres(self, fiction, cutoff=0.15):
@@ -1624,6 +1655,7 @@ def consolidate_genre_weights(cls, weights, subgenre_swallows_parent_at=0.03):
16241655
from core.classifier.bic import BICClassifier
16251656
from core.classifier.bisac import BISACClassifier
16261657
from core.classifier.ddc import DeweyDecimalClassifier
1658+
from core.classifier.demarque import DeMarqueClassifier
16271659
from core.classifier.gutenberg import GutenbergBookshelfClassifier
16281660
from core.classifier.keyword import (
16291661
Eg,

core/classifier/demarque.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""Classifier to extract classifications from De Marque data.
2+
"""
3+
from core.classifier import *
4+
5+
6+
class DeMarqueClassifier(Classifier):
    """Derive audience and target age from De Marque ``READxxxx`` subject codes.

    ``READ0001``-``READ0003`` are children's codes, ``READ0004`` and
    ``READ0005`` are young adult codes, and any other identifier that is
    present is treated as adult.
    """

    # Prefix an identifier must have to be accepted as a De Marque code.
    _CODE_PREFIX = "READ"

    # Codes grouped by the audience they indicate.
    _CHILDREN_CODES = frozenset({"READ0001", "READ0002", "READ0003"})
    _YOUNG_ADULT_CODES = frozenset({"READ0004", "READ0005"})

    # (lower, upper) target age for each age-specific code. An upper bound of
    # None means the range is open-ended.
    _TARGET_AGES = {
        "READ0001": (0, 3),
        "READ0002": (4, 7),
        "READ0003": (8, 12),
        "READ0004": (13, 18),
        "READ0005": (17, None),
    }

    @classmethod
    def scrub_identifier(cls, identifier):
        """
        Make sure that the identifier matches with De Marque codes.

        :param identifier: The identifier to be scrubbed. May be None or empty.
        :return: The identifier unchanged if it starts with "READ",
            otherwise None.
        """
        # Guard against a missing identifier instead of raising
        # AttributeError on None.startswith.
        if identifier and identifier.startswith(cls._CODE_PREFIX):
            return identifier
        return None

    @classmethod
    def scrub_name(cls, name):
        """
        Read in the De Marque name of the subject code.

        :param name: The name of the subject.
        :return: The name unchanged, or None if it is empty or missing.
        """
        return name or None

    @classmethod
    def audience(cls, identifier, name):
        """
        Determine the audience based on the given identifier.

        :param identifier: The identifier to check for audience classification.
        :param name: The name associated with the identifier (unused).
        :return: The audience for the identifier, or None when there is no
            identifier to classify.
        """
        # scrub_identifier() turns anything that is not a De Marque code into
        # None; such a subject tells us nothing about the audience, so do not
        # label it as adult.
        if not identifier:
            return None
        if identifier in cls._CHILDREN_CODES:
            return cls.AUDIENCE_CHILDREN
        if identifier in cls._YOUNG_ADULT_CODES:
            return cls.AUDIENCE_YOUNG_ADULT
        return cls.AUDIENCE_ADULT

    @classmethod
    def target_age(cls, identifier, name):
        """
        Determine the target age range based on the given identifier.

        :param identifier: The identifier to check for target age classification.
        :param name: The name associated with the identifier (unused).
        :return: A (lower, upper) tuple for the age-specific codes, or None
            for any other identifier.
        """
        return cls._TARGET_AGES.get(identifier)
60+
61+
62+
# Register DeMarqueClassifier in the classifier lookup table under the
# De Marque subject type.
Classifier.classifiers[Classifier.DEMARQUE] = DeMarqueClassifier

core/model/classification.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ class Subject(Base):
5959
TAG: str = Classifier.TAG # Folksonomic tags.
6060
FREEFORM_AUDIENCE: str = Classifier.FREEFORM_AUDIENCE
6161
NYPL_APPEAL = Classifier.NYPL_APPEAL
62+
DEMARQUE = Classifier.DEMARQUE
6263

6364
# Types with terms that are suitable for search.
6465
TYPES_FOR_SEARCH = [FAST, OVERDRIVE, BISAC, TAG]
@@ -92,6 +93,7 @@ class Subject(Base):
9293
"http://www.bisg.org/standards/bisac_subject/": BISAC,
9394
# Feedbooks uses a modified BISAC which we know how to handle.
9495
"http://www.feedbooks.com/categories": BISAC,
96+
"http://schema.org/Audience": DEMARQUE,
9597
}
9698

9799
uri_lookup = dict()
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from core.classifier import DeMarqueClassifier
2+
3+
4+
class MockSubject:
    """Minimal stand-in for a subject: just an identifier and a name."""

    def __init__(self, identifier, name):
        # Store both constructor arguments unchanged.
        self.identifier, self.name = identifier, name
8+
9+
10+
class TestDeMarqueClassifier:
    """Tests for DeMarqueClassifier's scrubbing, audience and target-age logic."""

    def _subject(self, identifier, name):
        """Build a MockSubject and fill in genre, audience, target_age and
        fiction from the tuple returned by DeMarqueClassifier.classify().
        """
        subject = MockSubject(identifier, name)
        (
            subject.genre,
            subject.audience,
            subject.target_age,
            subject.fiction,
        ) = DeMarqueClassifier.classify(subject)
        return subject

    def test_scrub_identifier(self):
        """Make sure that the identifier matches with De Marque codes."""
        assert "READ0000" == DeMarqueClassifier.scrub_identifier("READ0000")

        # An identifier that does not start with "READ" is rejected: the
        # scrubbed value is None, not the original string.
        assert DeMarqueClassifier.scrub_identifier("RRRR0000") is None

    def test_scrub_name(self):
        """A non-empty subject name is returned unchanged; an empty name
        becomes None.
        """
        assert "Early childhood" == DeMarqueClassifier.scrub_name("Early childhood")
        assert DeMarqueClassifier.scrub_name("") is None

    def test_audience(self):
        """Test that the correct audience is returned for each identifier."""
        assert "Children" == DeMarqueClassifier.audience("READ0001", "Early childhood")
        assert "Children" == DeMarqueClassifier.audience("READ0002", "Beginner reader")
        assert "Children" == DeMarqueClassifier.audience("READ0003", "Advanced reader")
        assert "Young Adult" == DeMarqueClassifier.audience("READ0004", "Teen")
        assert "Young Adult" == DeMarqueClassifier.audience("READ0005", "Young adult")
        assert "Adult" == DeMarqueClassifier.audience("READ0000", "Adult")

    def test_target_age(self):
        """Test that the correct target age range is returned for each identifier. Adult books do not have a target age
        range."""
        assert (0, 3) == DeMarqueClassifier.target_age("READ0001", "Early childhood")
        assert (4, 7) == DeMarqueClassifier.target_age("READ0002", "Beginner reader")
        assert (8, 12) == DeMarqueClassifier.target_age("READ0003", "Advanced reader")
        assert (13, 18) == DeMarqueClassifier.target_age("READ0004", "Teen")
        assert (17, None) == DeMarqueClassifier.target_age("READ0005", "Young Adult")

        # Codes without an age-specific mapping (e.g. adult) yield no range.
        assert DeMarqueClassifier.target_age("READ0000", "Adult") is None

0 commit comments

Comments
 (0)