Skip to content

Commit c8b69bc

Browse files
EKIR-569 Stop using weights in classification (#221)
* EKIR-569 Remake WorkClassifier constructor * EKIR-569 Remake the add() -> prepare_classification() Rebuilt the whole function due to ditching weights. WorkClassifier attributes are modified based on the type of subject. Added some smaller functions to do the actual modifications. * EKIR-569 Update classify_work() * EKIR-569 Remove weights from _fiction() * EKIR-569 Remake the logic for _audience() * EKIR-569 Remove weights from _genres() * EKIR-569 Update _target_age() * EKIR-569 Update docs * EKIR-569 Remove weights * EKIR-569 Rename function * EKIR-569 Rename function * EKIR-569 prepare not add * EKIR-569 Remove constants * EKIR-569 Remove irrelevant classifiers * EKIR-569 Remove weight * EKIR-569 Random changes * EKIR-569 Remove weights * EKIR-569 Remove weight from SubjectData * EKIR-569 Remove weights * EKIR-569 Update test * EKIR-569 Remove more weight references * EKIR-569 Remove InterestLevelClassifier * EKIR-569 Lint * EKIR-569 Remove InterestLevelClassifier * EKIR-569 Document classification * EKIR-569 Adjust target age * EKIR-569 Set fiction=True as default * EKIR-569 Adjust audience
1 parent e93621c commit c8b69bc

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

53 files changed

+1329
-5051
lines changed

api/admin/controller/work_editor.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,6 @@ def classifications(self, identifier_type, identifier):
433433
.join(Subject)
434434
.join(DataSource)
435435
.filter(Classification.identifier_id == identifier_id)
436-
.order_by(Classification.weight.desc())
437436
.all()
438437
)
439438

@@ -445,7 +444,6 @@ def classifications(self, identifier_type, identifier):
445444
"type": result.subject.type,
446445
"name": result.subject.identifier,
447446
"source": result.data_source.name,
448-
"weight": result.weight,
449447
}
450448
)
451449
)
@@ -500,7 +498,6 @@ def edit_classifications(self, identifier_type, identifier):
500498
data_source=staff_data_source,
501499
subject_type=Subject.SCHEMA_AUDIENCE,
502500
subject_identifier=new_audience,
503-
weight=WorkController.STAFF_WEIGHT,
504501
)
505502

506503
# Update target age if present
@@ -542,7 +539,6 @@ def edit_classifications(self, identifier_type, identifier):
542539
data_source=staff_data_source,
543540
subject_type=Subject.SCHEMA_AGE_RANGE,
544541
subject_identifier=age_range_identifier,
545-
weight=WorkController.STAFF_WEIGHT * 100,
546542
)
547543

548544
# Update fiction status
@@ -561,7 +557,6 @@ def edit_classifications(self, identifier_type, identifier):
561557
data_source=staff_data_source,
562558
subject_type=Subject.SIMPLIFIED_FICTION_STATUS,
563559
subject_identifier=fiction_term,
564-
weight=WorkController.STAFF_WEIGHT,
565560
)
566561
classification.subject.fiction = new_fiction
567562

@@ -592,7 +587,6 @@ def edit_classifications(self, identifier_type, identifier):
592587
data_source=staff_data_source,
593588
subject_type=Subject.SIMPLIFIED_GENRE,
594589
subject_identifier=genre,
595-
weight=WorkController.STAFF_WEIGHT,
596590
)
597591

598592
# add NONE genre classification if we aren't keeping any genres
@@ -601,7 +595,6 @@ def edit_classifications(self, identifier_type, identifier):
601595
data_source=staff_data_source,
602596
subject_type=Subject.SIMPLIFIED_GENRE,
603597
subject_identifier=SimplifiedGenreClassifier.NONE,
604-
weight=WorkController.STAFF_WEIGHT,
605598
)
606599
else:
607600
# otherwise delete existing NONE genre classification

api/axis.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@
5454
TimestampData,
5555
)
5656
from core.model import (
57-
Classification,
5857
Collection,
5958
Contributor,
6059
DataSource,
@@ -1091,7 +1090,6 @@ def extract_bibliographic(
10911090
type=Subject.BISAC,
10921091
identifier=None,
10931092
name=subject_identifier,
1094-
weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
10951093
)
10961094
)
10971095

@@ -1108,14 +1106,6 @@ def extract_bibliographic(
11081106
imprint = self.text_of_optional_subtag(element, "axis:imprint", ns)
11091107

11101108
audience = self.text_of_optional_subtag(element, "axis:audience", ns)
1111-
if audience:
1112-
subjects.append(
1113-
SubjectData(
1114-
type=Subject.AXIS_360_AUDIENCE,
1115-
identifier=audience,
1116-
weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
1117-
)
1118-
)
11191109

11201110
language = self.text_of_subtag(element, "axis:language", ns)
11211111

api/bibliotheca.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@
5353
)
5454
from core.model import (
5555
CirculationEvent,
56-
Classification,
5756
Collection,
5857
Contributor,
5958
DataSource,
@@ -703,7 +702,6 @@ def parse_genre_string(self, s: str | None) -> list[SubjectData]:
703702
Subject.BISAC,
704703
None,
705704
i,
706-
weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
707705
)
708706
)
709707
return genres

api/enki.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
TimestampData,
4141
)
4242
from core.model import (
43-
Classification,
4443
Collection,
4544
DataSource,
4645
DeliveryMechanism,
@@ -699,7 +698,6 @@ def extract_bibliographic(self, element: Mapping[str, str]) -> Metadata:
699698
SubjectData(
700699
Subject.TAG,
701700
topic,
702-
weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
703701
)
704702
)
705703
seen_topics.add(topic)

api/metadata/novelist.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,6 @@ def lookup_info_to_metadata(
418418
if feature_content:
419419
series_info = feature_content.get("SeriesInfo")
420420
appeals_info = feature_content.get("Appeals")
421-
lexile_info = feature_content.get("LexileInfo")
422421
goodreads_info = feature_content.get("GoodReads")
423422
recommendations_info = feature_content.get("SimilarTitles")
424423

@@ -446,11 +445,6 @@ def lookup_info_to_metadata(
446445
if extracted_genres:
447446
break
448447

449-
if lexile_info:
450-
metadata.subjects.append(
451-
SubjectData(Subject.LEXILE_SCORE, lexile_info["Lexile"])
452-
)
453-
454448
if goodreads_info:
455449
metadata.measurements.append(
456450
MeasurementData(Measurement.RATING, goodreads_info["average_rating"])

api/overdrive.py

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@
6464
TimestampData,
6565
)
6666
from core.model import (
67-
Classification,
6867
Collection,
6968
Contributor,
7069
Credential,
@@ -2446,10 +2445,6 @@ def book_info_to_metadata(
24462445
overdrive_id = book["id"]
24472446
primary_identifier = IdentifierData(Identifier.OVERDRIVE_ID, overdrive_id)
24482447

2449-
# If we trust classification data, we'll give it this weight.
2450-
# Otherwise we'll probably give it a fraction of this weight.
2451-
trusted_weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
2452-
24532448
duration: int | None = None
24542449

24552450
if include_bibliographic:
@@ -2488,23 +2483,11 @@ def book_info_to_metadata(
24882483
contributors.append(contributor)
24892484

24902485
subjects = []
2491-
for sub in book.get("subjects", []):
2492-
subject = SubjectData(
2493-
type=Subject.OVERDRIVE,
2494-
identifier=sub["value"],
2495-
weight=trusted_weight,
2496-
)
2497-
subjects.append(subject)
24982486

24992487
for sub in book.get("keywords", []):
25002488
subject = SubjectData(
25012489
type=Subject.TAG,
25022490
identifier=sub["value"],
2503-
# We don't use TRUSTED_DISTRIBUTOR_WEIGHT because
2504-
# we don't know where the tags come from --
2505-
# probably Overdrive users -- and they're
2506-
# frequently wrong.
2507-
weight=1,
25082491
)
25092492
subjects.append(subject)
25102493

@@ -2518,7 +2501,6 @@ def book_info_to_metadata(
25182501
subject = SubjectData(
25192502
type=Subject.GRADE_LEVEL,
25202503
identifier=i["value"],
2521-
weight=trusted_weight / 10,
25222504
)
25232505
subjects.append(subject)
25242506

@@ -2543,30 +2525,6 @@ def book_info_to_metadata(
25432525
MeasurementData(Measurement.AWARDS, str(num_awards))
25442526
)
25452527

2546-
for name, subject_type in (
2547-
("ATOS", Subject.ATOS_SCORE),
2548-
("lexileScore", Subject.LEXILE_SCORE),
2549-
("interestLevel", Subject.INTEREST_LEVEL),
2550-
):
2551-
if not name in book:
2552-
continue
2553-
identifier = str(book[name])
2554-
subjects.append(
2555-
SubjectData(
2556-
type=subject_type, identifier=identifier, weight=trusted_weight
2557-
)
2558-
)
2559-
2560-
for grade_level_info in book.get("gradeLevels", []):
2561-
grade_level = grade_level_info.get("value")
2562-
subjects.append(
2563-
SubjectData(
2564-
type=Subject.GRADE_LEVEL,
2565-
identifier=grade_level,
2566-
weight=trusted_weight,
2567-
)
2568-
)
2569-
25702528
identifiers = []
25712529
links = []
25722530
sample_hrefs = set()

0 commit comments

Comments (0)