Skip to content

Commit 56738d8

Browse files
committed
Fix concept uniqueness
Concepts were designed to be unique with respect to a lower-cased name, but that was not consistently enforced in the code, and it prevented looking up the "equivalent" concept by name. The "cached" normalized name enables that and the uniqueness constraint works with this new field.
1 parent a6a65aa commit 56738d8

3 files changed

Lines changed: 16 additions & 6 deletions

File tree

web/concepts/models.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
11
import logging
22

3-
from concepts.utils import UnionFind
3+
from concepts.utils import UnionFind, normalize_concept_name
44
from django.db import models
55
from django.db.models.functions import Lower
66
from django.db.utils import IntegrityError
77

88

99
class Concept(models.Model):
1010
name = models.CharField(max_length=200, null=True)
11+
normal_name = models.CharField(max_length=200, null=True)
1112
description = models.TextField(null=True)
1213

1314
class Meta:
1415
ordering = ["name", "description"]
1516
constraints = [
16-
models.UniqueConstraint(Lower("name").desc(), name="unique_lower_name")
17+
models.UniqueConstraint(fields=["normal_name"], name="unique_normal_name")
1718
]
1819

1920

@@ -34,7 +35,7 @@ def create_singleton_concepts(self):
3435
logging.WARNING,
3536
f" A concept named '{new_concept.name}' already exists.",
3637
)
37-
new_concept = Concept.objects.get(name__iexact=new_concept.name)
38+
new_concept = Concept.objects.get(normal_name=new_concept.normal_name)
3839
item.concept = new_concept
3940
item.save()
4041

@@ -45,16 +46,17 @@ def take_first(lst):
4546
components = UnionFind(self.all(), Link.objects.all().to_tuples())
4647
for concept_items in components.get_item_components(sort_key=Item.Source.key()):
4748
name = take_first([item.name for item in concept_items])
49+
normal_name = normalize_concept_name(name)
4850
description = take_first([item.description for item in concept_items])
49-
new_concept = Concept(name=name, description=description)
51+
new_concept = Concept(name=name, normal_name=normal_name, description=description)
5052
try:
5153
new_concept.save()
5254
except IntegrityError:
5355
logging.log(
5456
logging.WARNING,
5557
f" A concept named '{new_concept.name}' already exists.",
5658
)
57-
new_concept = Concept.objects.get(name=name)
59+
new_concept = Concept.objects.get(normal_name=normal_name)
5860
for item in concept_items:
5961
item.concept = new_concept
6062
item.save()
@@ -127,7 +129,7 @@ def get_linked_item_urls(self):
127129
return [i.get_url() for i in self.get_linked_items()]
128130

129131
def to_concept(self):
130-
return Concept(name=self.name, description=self.description)
132+
return Concept(name=self.name, normal_name=normalize_concept_name(self.name), description=self.description)
131133

132134
def __str__(self):
133135
if self.name:

web/concepts/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from typing import Any, Dict, List, Tuple
22

3+
from unidecode import unidecode
4+
35

46
class UnionFind:
57
item_to_element: Dict[Any, int] = {}
@@ -57,3 +59,8 @@ def elements_to_sorted_items(elements: List[int]):
5759
return items
5860

5961
return list(map(elements_to_sorted_items, self.components.values()))
62+
63+
64+
def normalize_concept_name(name: "str | None") -> "str | None":
    """Return the normalized form of a concept name.

    The name is transliterated to ASCII with ``unidecode``, underscores and
    spaces are replaced by hyphens, the result is lower-cased, and every
    character that is neither alphanumeric nor a hyphen is dropped.

    Args:
        name: The display name to normalize, or ``None`` for an unnamed
            concept.

    Returns:
        The normalized name, or ``None`` when ``name`` is ``None``.
    """
    # unidecode expects a string; an unnamed concept simply has no
    # normalized name, so pass None through instead of failing.
    if name is None:
        return None
    hyphenated = unidecode(name).replace("_", "-").replace(" ", "-").lower()
    return "".join(ch for ch in hyphenated if ch.isalnum() or ch == "-")

web/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ requests~=2.32.5
44
spacy~=3.7.0 --prefer-binary
55
scispacy~=0.6.2
66
python-decouple~=3.8
7+
unidecode~=1.4.0
78

89
# LLM dependencies (optional, install based on which LLM you want to use)
910
# For paid APIs:

0 commit comments

Comments
 (0)