Skip to content

Commit 230bdbf

Browse files
perierc (Charles Perier) and alexgarel
authored
feat(parser,backend,frontend): enable extended taxonomies (#429)
* enable extension of taxonomies
* fix edge case where node merged with external node
* handle external nodes on frontend
* chore: resync-pr
* resolve comments
* reintroduce deleted test
* move the external taxonomies parsing logic to taxonomy_parser.py

---------

Co-authored-by: Charles Perier <[email protected]>
Co-authored-by: Alex Garel <[email protected]>
1 parent: ca0512c; commit: 230bdbf

23 files changed: +551 -162 lines changed

backend/editor/entries.py

+49 -22
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
Database helper functions for API
33
"""
44

5+
import asyncio
56
import logging
67
import re
78
import shutil
@@ -34,6 +35,7 @@
3435
)
3536
from .models.node_models import EntryNodeCreate
3637
from .models.project_models import ProjectCreate, ProjectEdit, ProjectStatus
38+
from .settings import EXTERNAL_TAXONOMIES
3739

3840
log = logging.getLogger(__name__)
3941

@@ -50,9 +52,8 @@ def __init__(self, branch_name, taxonomy_name):
5052
self.branch_name = branch_name
5153
self.project_name = "p_" + taxonomy_name + "_" + branch_name
5254

53-
@property
54-
def taxonomy_path_in_repository(self):
55-
return utils.taxonomy_path_in_repository(self.taxonomy_name)
55+
def taxonomy_path_in_repository(self, taxonomy_name):
56+
return utils.taxonomy_path_in_repository(taxonomy_name)
5657

5758
def get_label(self, id):
5859
"""
@@ -86,29 +87,34 @@ async def get_local_taxonomy_file(self, tmpdir: str, uploadfile: UploadFile):
8687
await run_in_threadpool(shutil.copyfileobj, uploadfile.file, f)
8788
return filepath
8889

89-
async def get_github_taxonomy_file(self, tmpdir: str):
90+
async def get_github_taxonomy_file(self, tmpdir: str, taxonomy_name: str):
9091
async with TransactionCtx():
91-
filepath = f"{tmpdir}/{self.taxonomy_name}.txt"
92+
filepath = f"{tmpdir}/{taxonomy_name}.txt"
93+
path_in_repository = self.taxonomy_path_in_repository(taxonomy_name)
9294
target_url = (
9395
f"https://raw.githubusercontent.com/{settings.repo_uri}"
94-
f"/main/{self.taxonomy_path_in_repository}"
96+
f"/main/{path_in_repository}"
9597
)
9698
try:
99+
# get taxonomy file
97100
await run_in_threadpool(urllib.request.urlretrieve, target_url, filepath)
98-
github_object = GithubOperations(self.taxonomy_name, self.branch_name)
99-
commit_sha = (await github_object.get_branch("main")).commit.sha
100-
file_sha = await github_object.get_file_sha()
101-
await edit_project(
102-
self.project_name,
103-
ProjectEdit(
104-
github_checkout_commit_sha=commit_sha, github_file_latest_sha=file_sha
105-
),
106-
)
101+
if taxonomy_name == self.taxonomy_name:
102+
# this is the taxonomy we want to edit
103+
# track the current commit to know where to start the PR from
104+
github_object = GithubOperations(self.taxonomy_name, self.branch_name)
105+
commit_sha = (await github_object.get_branch("main")).commit.sha
106+
file_sha = await github_object.get_file_sha()
107+
await edit_project(
108+
self.project_name,
109+
ProjectEdit(
110+
github_checkout_commit_sha=commit_sha, github_file_latest_sha=file_sha
111+
),
112+
)
107113
return filepath
108114
except Exception as e:
109115
raise TaxonomyImportError() from e
110116

111-
def parse_taxonomy(self, filepath: str):
117+
def parse_taxonomy(self, main_filepath: str, other_filepaths: list[str] | None = None):
112118
"""
113119
Helper function to call the Open Food Facts Python Taxonomy Parser
114120
"""
@@ -117,7 +123,7 @@ def parse_taxonomy(self, filepath: str):
117123
parser_object = parser.Parser(session)
118124
try:
119125
# Parse taxonomy with given file name and branch name
120-
parser_object(filepath, self.branch_name, self.taxonomy_name)
126+
parser_object(main_filepath, other_filepaths, self.branch_name, self.taxonomy_name)
121127
except Exception as e:
122128
# outer exception handler will put project status to FAILED
123129
raise TaxonomyParsingError() from e
@@ -126,11 +132,14 @@ async def get_and_parse_taxonomy(self, uploadfile: UploadFile | None = None):
126132
try:
127133
with tempfile.TemporaryDirectory(prefix="taxonomy-") as tmpdir:
128134
filepath = await (
129-
self.get_github_taxonomy_file(tmpdir)
135+
self.get_github_taxonomy_file(tmpdir, self.taxonomy_name)
130136
if uploadfile is None
131137
else self.get_local_taxonomy_file(tmpdir, uploadfile)
132138
)
133-
await run_in_threadpool(self.parse_taxonomy, filepath)
139+
other_filepaths = None
140+
if self.taxonomy_name in EXTERNAL_TAXONOMIES:
141+
other_filepaths = await self.fetch_external_taxonomy_files(tmpdir)
142+
await run_in_threadpool(self.parse_taxonomy, filepath, other_filepaths)
134143
async with TransactionCtx():
135144
error_node = await get_error_node(self.project_name)
136145
errors_count = len(error_node.errors) if error_node else 0
@@ -149,6 +158,25 @@ async def get_and_parse_taxonomy(self, uploadfile: UploadFile | None = None):
149158
log.exception(e)
150159
raise e
151160

161+
async def fetch_external_taxonomy_files(self, tmpdir: str) -> list[str]:
162+
"""
163+
Helper function to fetch external taxonomies concurrently from Github
164+
"""
165+
external_taxonomy_filepaths = []
166+
tasks = []
167+
168+
# Create tasks for each external taxonomy and store them in a list
169+
for external_taxonomy in EXTERNAL_TAXONOMIES[self.taxonomy_name]:
170+
task = asyncio.create_task(self.get_github_taxonomy_file(tmpdir, external_taxonomy))
171+
tasks.append(task)
172+
173+
# Wait for all tasks to complete concurrently
174+
for task in tasks:
175+
external_filepath = await task
176+
external_taxonomy_filepaths.append(external_filepath)
177+
178+
return external_taxonomy_filepaths
179+
152180
async def import_taxonomy(
153181
self,
154182
description: str,
@@ -689,9 +717,8 @@ async def full_text_search(self, text):
689717
where score_ > 0
690718
return node, score_ as score
691719
}
692-
with node.id as node, score
693-
RETURN node, sum(score) as score
694-
720+
WITH node.id AS node_id, node.is_external AS is_external, score
721+
RETURN {id: node_id, is_external: is_external} AS node, sum(score) AS score
695722
ORDER BY score DESC
696723
"""
697724
_result = await get_current_transaction().run(query, params)

backend/editor/settings.py

+11
Original file line number | Diff line number | Diff line change
@@ -6,3 +6,14 @@
66
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
77
access_token = os.environ.get("GITHUB_PAT")
88
repo_uri = os.environ.get("REPO_URI", "openfoodfacts/openfoodfacts-server")
9+
10+
EXTERNAL_TAXONOMIES = {
11+
"food_ingredients": [
12+
"additives_classes",
13+
"additives",
14+
"minerals",
15+
"vitamins",
16+
"nucleotides",
17+
"other_nutritional_substances",
18+
],
19+
}

parser/openfoodfacts_taxonomy_parser/parser/parser.py

+17 -6
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label
2828
elif node_data.get_node_type() == NodeType.STOPWORDS:
2929
type_label = "STOPWORDS"
3030
else:
31-
raise ValueError(f"ENTRY nodes should not be passed to this function")
31+
raise ValueError("ENTRY nodes should not be passed to this function")
3232

3333
node_tags_queries = [f"{key} : ${key}" for key in node_data.tags]
3434

@@ -70,7 +70,7 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
7070

7171
for entry_node in entry_nodes:
7272
if entry_node.get_node_type() != NodeType.ENTRY:
73-
raise ValueError(f"Only ENTRY nodes should be passed to this function")
73+
raise ValueError("Only ENTRY nodes should be passed to this function")
7474
seen_properties_and_tags_and_comments.update(entry_node.tags)
7575
seen_properties_and_tags_and_comments.update(entry_node.properties)
7676
seen_properties_and_tags_and_comments.update(entry_node.comments)
@@ -79,11 +79,13 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str):
7979
f"{key} : entry_node.{key}" for key in seen_properties_and_tags_and_comments
8080
]
8181

82-
base_properties_query = f"""
82+
base_properties_query = """
8383
id: entry_node.id,
8484
preceding_lines: entry_node.preceding_lines,
8585
src_position: entry_node.src_position,
86-
main_language: entry_node.main_language
86+
main_language: entry_node.main_language,
87+
is_external: entry_node.is_external,
88+
original_taxonomy: entry_node.original_taxonomy
8789
"""
8890

8991
properties_query = ",\n".join([base_properties_query, *additional_properties_queries])
@@ -234,14 +236,23 @@ def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name
234236
self._create_child_links(taxonomy.child_links, project_label)
235237
self._create_previous_links(taxonomy.previous_links, project_label)
236238

237-
def __call__(self, filename: str, branch_name: str, taxonomy_name: str):
239+
def __call__(
240+
self,
241+
main_filename: str,
242+
external_filenames: list[str] | None,
243+
branch_name: str,
244+
taxonomy_name: str,
245+
):
238246
"""Process the file"""
239247
start_time = timeit.default_timer()
240248

241249
branch_name = normalize_text(branch_name, char="_")
242250
taxonomy_parser = TaxonomyParser()
243251
try:
244-
taxonomy = taxonomy_parser.parse_file(filename, self.parser_logger)
252+
taxonomy = taxonomy_parser.parse_file(
253+
main_filename, external_filenames, self.parser_logger
254+
)
255+
245256
self._write_to_database(taxonomy, taxonomy_name, branch_name)
246257

247258
self.parser_logger.info(

parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py

+36 -4
Original file line number | Diff line number | Diff line change
@@ -35,14 +35,18 @@ class NodeData:
3535
# to keep track of comments just above the current line
3636
# during parsing of an entry, to be able to add them
3737
# to the right property or tag when possible
38-
comments_stack: list[(int, str)] = field(default_factory=list)
38+
comments_stack: list[tuple[int, str]] = field(default_factory=list)
39+
is_external: bool = False # True if the node comes from another taxonomy
40+
original_taxonomy: str | None = None # the name of the taxonomy the node comes from
3941

4042
def to_dict(self):
4143
return {
4244
"id": self.id,
4345
"main_language": self.main_language,
4446
"preceding_lines": self.preceding_lines,
4547
"src_position": self.src_position,
48+
"is_external": self.is_external,
49+
"original_taxonomy": self.original_taxonomy,
4650
**self.properties,
4751
**self.tags,
4852
**self.comments,
@@ -464,6 +468,14 @@ def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[Node
464468
for node in entry_nodes:
465469
if node.id in ids_to_nodes:
466470
first_node = ids_to_nodes[node.id]
471+
if first_node.is_external:
472+
# we don't want to merge a node with an external node;
473+
# the external node gets a new id with its original taxonomy name
474+
# and the new one becomes the new "first node"
475+
first_node.id += f"@{first_node.original_taxonomy}"
476+
unique_entry_nodes.append(node)
477+
ids_to_nodes[node.id] = node
478+
continue
467479
for key, value in node.tags.items():
468480
if not key.startswith("tags_ids_"):
469481
# union of the tags
@@ -496,11 +508,25 @@ def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[Node
496508
ids_to_nodes[node.id] = node
497509
return unique_entry_nodes
498510

499-
def _create_taxonomy(self, filename: str) -> Taxonomy:
511+
def _create_taxonomy(
512+
self, filename: str, external_filenames: list[str] | None = None
513+
) -> Taxonomy:
500514
"""Create the taxonomy from the file"""
515+
# parse external taxonomies if any, and add their entry nodes to the main taxonomy
516+
external_entry_nodes = []
517+
for external_filename in external_filenames or []:
518+
external_taxonomy_parser = TaxonomyParser()
519+
external_taxonomy = external_taxonomy_parser.parse_file(
520+
external_filename, None, self.parser_logger
521+
)
522+
external_entry_nodes.extend(external_taxonomy.entry_nodes)
523+
for node in external_entry_nodes:
524+
node.is_external = True
525+
501526
self.parser_logger.info(f"Parsing {filename}")
502527
harvested_header_data, entries_start_line = self._header_harvest(filename)
503528
entry_nodes: list[NodeData] = []
529+
entry_nodes.extend(external_entry_nodes)
504530
other_nodes = [
505531
NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1)
506532
]
@@ -509,6 +535,7 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
509535
harvested_data = self._harvest_entries(filename, entries_start_line)
510536
for entry in harvested_data:
511537
if entry.get_node_type() == NodeType.ENTRY:
538+
entry.original_taxonomy = filename.split("/")[-1]
512539
entry_nodes.append(entry)
513540
else:
514541
other_nodes.append(entry)
@@ -534,13 +561,18 @@ def _create_taxonomy(self, filename: str) -> Taxonomy:
534561
child_links=child_links,
535562
)
536563

537-
def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -> Taxonomy:
564+
def parse_file(
565+
self,
566+
filename: str,
567+
external_filenames: list[str] | None = None,
568+
logger: ParserConsoleLogger | None = None,
569+
) -> Taxonomy:
538570
if logger:
539571
self.parser_logger = logger
540572
"""Process the file into a Taxonomy object"""
541573
start_time = timeit.default_timer()
542574
filename = normalize_filename(filename)
543-
taxonomy = self._create_taxonomy(filename)
575+
taxonomy = self._create_taxonomy(filename, external_filenames)
544576
self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.")
545577
self.parser_logger.info(
546578
f"Found {len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)} nodes"

parser/tests/data/test.txt

+1
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@ synonyms:en:passion fruit, passionfruit
66

77
synonyms:fr:fruit de la passion, maracuja, passion
88

9+
<en:milk
910
en:yogurts, yoghurts
1011
fr:yaourts, yoghourts, yogourts
1112

parser/tests/data/test_external1.txt

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# External taxonomy 1
2+
3+
synonyms:en:fiber, fibre
4+
5+
stopwords:fr:aux, au, de, le, du, la, a, et
6+
7+
# comment
8+
en:milk
9+
fr:lait
10+
11+
en:honey
12+
fr:miel
13+
14+
<en:milk
15+
en:oat milk
16+
17+
<en:milk
18+
en:almond milk

parser/tests/data/test_external2.txt

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# External taxonomy 2
2+
3+
# comment
4+
en:coffee
5+
fr:café
6+
7+
en:tomato
8+
fr:tomate
9+
10+
<en:coffee
11+
en:cappuccino
12+
13+
<en:coffee
14+
en:latte

0 commit comments

Comments
 (0)