Skip to content

Commit 9c87d1b

Browse files
feat: Add advanced search API (#438)
* feat: add advanced search on entry nodes API * feat: add pagination on search API * fix: add text search order in search API * fix: fix multi-word search * refactor: enable multi-word whitespaced search * chore: update Client SDK * chore: update frontend to support new API * style: lint backend * feat: return filters AST in search API call * refactor: simplify filter search term parsing * refactor: respond to comments * chore: regenerate SDK * style: lint * docs: add function doc * fix: reorder search response * refactor: add parsed query string to search response * docs: add code docs * docs: add filter docs
1 parent e60b525 commit 9c87d1b

23 files changed

+909
-241
lines changed

backend/editor/api.py

+18-15
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@
88
# Required imports
99
# ------------------------------------------------------------------------------------#
1010
from datetime import datetime
11-
from typing import Optional
11+
from typing import Annotated, Optional
1212

1313
# FastAPI
1414
from fastapi import (
1515
BackgroundTasks,
1616
FastAPI,
1717
Form,
1818
HTTPException,
19+
Query,
1920
Request,
2021
Response,
2122
UploadFile,
@@ -31,7 +32,7 @@
3132
from . import graph_db
3233

3334
# Controller imports
34-
from .controllers import project_controller
35+
from .controllers import project_controller, search_controller
3536
from .entries import TaxonomyGraph
3637

3738
# Custom exceptions
@@ -40,6 +41,7 @@
4041
# Data model imports
4142
from .models.node_models import EntryNodeCreate, ErrorNode, Footer, Header, NodeType
4243
from .models.project_models import Project, ProjectEdit, ProjectStatus
44+
from .models.search_models import EntryNodeSearchResult
4345
from .scheduler import scheduler_lifespan
4446

4547
# -----------------------------------------------------------------------------------#
@@ -231,16 +233,6 @@ async def find_one_entry_children(response: Response, branch: str, taxonomy_name
231233
return one_entry_children
232234

233235

234-
@app.get("/{taxonomy_name}/{branch}/entry")
235-
async def find_all_entries(response: Response, branch: str, taxonomy_name: str):
236-
"""
237-
Get all entries within taxonomy
238-
"""
239-
taxonomy = TaxonomyGraph(branch, taxonomy_name)
240-
all_entries = await taxonomy.get_all_nodes("ENTRY")
241-
return all_entries
242-
243-
244236
@app.get("/{taxonomy_name}/{branch}/synonym/{synonym}")
245237
async def find_one_synonym(response: Response, branch: str, taxonomy_name: str, synonym: str):
246238
"""
@@ -317,10 +309,21 @@ async def find_all_errors(branch: str, taxonomy_name: str) -> ErrorNode:
317309
return result
318310

319311

320-
@app.get("/{taxonomy_name}/{branch}/search")
321-
async def search_node(response: Response, branch: str, taxonomy_name: str, query: str):
312+
@app.get("/{taxonomy_name}/{branch}/nodes/entry")
313+
async def search_entry_nodes(
314+
branch: str,
315+
taxonomy_name: str,
316+
q: Annotated[
317+
str,
318+
Query(
319+
description="The search query string to filter down the returned entry nodes.\
320+
Example: is:root language:en not(language):fr"
321+
),
322+
] = "",
323+
page: int = 1,
324+
) -> EntryNodeSearchResult:
322325
taxonomy = TaxonomyGraph(branch, taxonomy_name)
323-
result = await taxonomy.full_text_search(query)
326+
result = await search_controller.search_entry_nodes(taxonomy.project_name, q, page)
324327
return result
325328

326329

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
import math
2+
from dataclasses import dataclass
3+
4+
from openfoodfacts_taxonomy_parser import utils as parser_utils
5+
from pydantic import ValidationError
6+
7+
from ..graph_db import get_current_transaction
8+
from ..models.node_models import EntryNode
9+
from ..models.search_models import (
10+
CypherQuery,
11+
EntryNodeSearchResult,
12+
FilterSearchTerm,
13+
FilterSearchTermValidator,
14+
)
15+
16+
17+
def get_query_param_name_prefix(index: int) -> str:
    """Return the Cypher query parameter name used for the filter term at *index*."""
    return "value_" + str(index)
19+
20+
21+
@dataclass(frozen=True)
class Query:
    """Parsed, validated form of a raw search query (built by `validate_query`)."""

    # Project identifier, used as a node label in the generated Cypher queries
    project_id: str
    # All terms in parsed form (filters re-serialized), used to echo the query back
    search_terms: list[str]
    # Terms that trigger a full-text search on a node's tags
    name_search_terms: list[str]
    # Terms of the form `filter:value` that passed validation
    filter_search_terms: list[FilterSearchTerm]
27+
28+
29+
def split_query_into_search_terms(query: str) -> list[str]:
    """
    Split a raw query into search terms.

    Terms are separated by whitespace that is not inside double quotes, so
    `parent:"en:apple juice"` stays a single term.

    Returns an empty list for a blank query. (Fix: the previous version
    returned `[""]` for a blank query, injecting a useless empty name
    search term downstream.)
    """
    query = query.strip()
    search_terms = []

    inside_quotes = False
    term_start = 0

    for term_end in range(len(query)):
        if query[term_end] == '"':
            # Quotes toggle quoted state; spaces inside quotes do not split.
            inside_quotes = not inside_quotes
        # An unquoted whitespace ends the current search term
        elif query[term_end] == " " and not inside_quotes:
            # Skip empty spans produced by consecutive spaces
            if term_start != term_end:
                search_terms.append(query[term_start:term_end])
            term_start = term_end + 1

    # Append the trailing term; it is empty only when the whole query is blank
    last_term = query[term_start:]
    if last_term:
        search_terms.append(last_term)

    return search_terms
54+
55+
56+
def parse_filter_search_term(search_term: str) -> FilterSearchTerm | None:
    """
    Parse a filter search term of the format `filter:value`.

    Returns the validated filter model, or None when the term is not a
    well-formed, known filter (the caller then treats it as a name term).
    """
    filter_name, separator, filter_value = search_term.partition(":")
    if not separator:
        # No ":" at all -> not a filter term
        return None

    # Strip surrounding quotes from a quoted value
    if filter_value.startswith('"') and filter_value.endswith('"'):
        filter_value = filter_value[1:-1]

    # Any remaining quote makes the value invalid
    if '"' in filter_value:
        return None

    try:
        # Dispatches to the right filter model according to filter_name
        return FilterSearchTermValidator.validate_python(
            {"filter_type": filter_name, "filter_value": filter_value}
        )
    except ValidationError:
        return None
82+
83+
84+
def validate_query(project_id: str, query: str) -> Query:
    """
    Parse and validate a raw query string into a `Query`.

    A query is composed of search terms separated by whitespaces.
    Each term that parses as a valid `filter:value` pair becomes a filter
    search term; every other term is a name search term (a text search on a
    node's tags).

    A filter search term is of the format `filter:value` where `filter` is a
    valid filter name and `value` is a valid search value for that filter.
    The `value` is surrounded by quotes if it contains whitespaces and
    cannot itself contain quotes.

    The possible filters are:
    - `is`: `root`, `external` and `not:external` are the only possible values.
      It allows to filter on the root and external nodes.
    - `language`: the value is a language code. It allows to filter on
      if the language exists or not on the node.
      You can negate the filter with the not:lc syntax.
    - `parent`: the value is a node's id. It allows to filter on if the node
      is a parent of the node with the given id.
    - `child`: the value is a node's id. It allows to filter on if the node
      is a child of the node with the given id.
    - `ancestor`: the value is a node's id. It allows to filter on if the
      node is an ancestor of the node with the given id.
    - `descendant`: the value is a node's id. It allows to filter on if the
      node is a descendant of the node with the given id.
    - `property`: the value is a property name and an optional value
      (property_name:value). It allows to filter on if the node has the given
      property and if the property has the given value if it is provided.
      You can add the `not:inherited:` prefix to the filter to negate it or
      to also search on parent nodes for inherited properties.

    Examples:
    - "is:root language:en not(language):fr property:inherited:vegan:en:yes"
    - "is:not:external parent:"en:apple juice" descendant:en:juices "fruit concentrate""
    """
    parsed_search_terms: list[str] = []
    name_search_terms: list[str] = []
    filter_search_terms: list[FilterSearchTerm] = []

    for term in split_query_into_search_terms(query):
        parsed_filter = parse_filter_search_term(term)
        if parsed_filter is None:
            # Not a recognized filter -> free-text name search term
            name_search_terms.append(term)
            parsed_search_terms.append(term)
        else:
            filter_search_terms.append(parsed_filter)
            parsed_search_terms.append(parsed_filter.to_query_string())

    return Query(project_id, parsed_search_terms, name_search_terms, filter_search_terms)
136+
137+
138+
def _get_token_query(token: str) -> str:
139+
"""
140+
Returns the lucene query for a token.
141+
The tokens are additive and the fuzziness of the search depends on the length of the token.
142+
"""
143+
144+
token = "+" + token
145+
if len(token) > 10:
146+
return token + "~2"
147+
elif len(token) > 4:
148+
return token + "~1"
149+
else:
150+
return token
151+
152+
153+
def build_lucene_name_search_query(search_value: str) -> str | None:
    """
    Build the Lucene query for a name search term, or None if it is empty.

    Two kinds of searches can be triggered:
    - a value in the format `language_code:raw_search_value` searches the
      tags_ids_{language_code} index
    - anything else searches the tags_ids index

    A `raw_search_value` surrounded by quotes is searched exactly; otherwise
    the search is fuzzy for values longer than 4 characters (the edit
    distance depends on the length of the search value).
    """
    # Detect an optional two-letter language prefix ("en:...")
    language_code = None
    if len(search_value) > 2 and search_value[2] == ":" and search_value[0:2].isalpha():
        language_code, search_value = search_value.split(":", maxsplit=1)
        language_code = language_code.lower()

    if search_value.startswith('"') and search_value.endswith('"'):
        # Quoted value -> exact search (empty quotes yield no query)
        search_query = search_value if len(search_value) > 2 else None
    else:
        if language_code is not None:
            normalized_text = parser_utils.normalize_text(search_value, language_code)
        else:
            normalized_text = parser_utils.normalize_text(search_value)

        if normalized_text.strip() == "":
            # Normalization left nothing to search for
            search_query = None
        else:
            # One additive, possibly fuzzy, sub-query per token
            tokens = normalized_text.split("-")
            search_query = "(" + " ".join(_get_token_query(token) for token in tokens) + ")"

    if search_query is None:
        return None

    if language_code is not None:
        search_query = f"tags_ids_{language_code}:{search_query}"

    return search_query
197+
198+
199+
def build_cypher_query(query: Query, skip: int, limit: int) -> tuple[str, str, dict[str, str]]:
    """
    Translate a parsed `Query` into Cypher.

    Returns a tuple `(page_query, count_query, query_params)`:
    - `page_query` fetches one result page (with SKIP/LIMIT) plus the total count
    - `count_query` returns only the total count of matching nodes
    - `query_params` holds the parameters shared by both queries
    """
    # build part of the query doing full text search
    lucene_name_search_queries = list(
        filter(
            lambda q: q is not None, map(build_lucene_name_search_query, query.name_search_terms)
        )
    )

    # build part of the query for filter:value members
    cypher_filter_search_terms = [
        term.build_cypher_query(get_query_param_name_prefix(index))
        for index, term in enumerate(query.filter_search_terms)
    ]

    # Default ordering when there is no full-text score to order by
    full_text_search_query, order_clause = "", "WITH n ORDER BY n.is_external, n.id"
    query_params = {}

    if lucene_name_search_queries:
        SEARCH_QUERY_PARAM_NAME = "search_query"
        # Drop low-relevance full-text matches
        MIN_SEARCH_SCORE = 0.1

        # Collect the ids of full-text matches up-front; the MATCH below is
        # then filtered down to those ids
        full_text_search_query = f"""
            CALL db.index.fulltext.queryNodes("{query.project_id}_SearchTagsIds",
                ${SEARCH_QUERY_PARAM_NAME})
            YIELD node, score
            WHERE score > {MIN_SEARCH_SCORE}
            WITH node.id AS nodeId
            WITH COLLECT(nodeId) AS nodeIds
        """
        # All name terms must match (AND semantics)
        query_params[SEARCH_QUERY_PARAM_NAME] = " AND ".join(lucene_name_search_queries)

        # Preserve the full-text relevance order (nodeIds is score-ordered)
        order_clause = (
            "WITH n, apoc.coll.indexOf(nodeIds, n.id) AS index ORDER BY index, n.is_external"
        )

        name_filter_search_term = "n.id IN nodeIds"
        cypher_filter_search_terms.append(CypherQuery(name_filter_search_term))

    # Merge the parameters contributed by each filter term
    for cypher_filter_search_term in cypher_filter_search_terms:
        query_params |= cypher_filter_search_term.params

    combined_filter_query = (
        f"WHERE {' AND '.join([cypher_query.query for cypher_query in cypher_filter_search_terms])}"
        if cypher_filter_search_terms
        else ""
    )

    base_query = f"""
        {full_text_search_query}
        MATCH (n:{query.project_id}:ENTRY)
        {combined_filter_query}
    """

    # Page query: order, keep the total count alongside, then slice the page
    page_subquery = f"""
        {order_clause}
        WITH collect(n) AS nodeList, count(n) AS nodeCount
        UNWIND nodeList AS node
        WITH node, nodeCount
        SKIP {skip} LIMIT {limit}
        WITH collect(node) AS nodeList, nodeCount
        RETURN nodeList, nodeCount;
    """

    count_subquery = """
        RETURN count(n) AS nodeCount;
    """

    page_query = base_query + page_subquery
    count_query = base_query + count_subquery

    return page_query, count_query, query_params
270+
271+
272+
async def search_entry_nodes(project_id: str, raw_query: str, page: int) -> EntryNodeSearchResult:
    """
    Search for entry nodes in the database.

    The raw query string is parsed and validated (see `validate_query`),
    translated to Cypher, and executed in the current transaction. Results
    are paginated with a fixed page size of 50 nodes; `page` is 1-based.
    """
    query = validate_query(project_id, raw_query)

    parsed_query_string = " ".join(query.search_terms)
    # For better UX on the search bar
    if parsed_query_string != "":
        parsed_query_string += " "

    PAGE_LENGTH = 50
    # Clamp to the first page for page numbers < 1
    skip = max(0, (page - 1) * PAGE_LENGTH)

    cypher_query = build_cypher_query(query, skip, PAGE_LENGTH)

    page_query, count_query, query_params = cypher_query

    result = await get_current_transaction().run(page_query, query_params)
    search_result = await result.single()

    # The page query yields no row when the page is empty (no matches, or
    # SKIP went past the last result); fall back to the count query so the
    # response still reports the total number of matching nodes.
    if search_result is None:
        count_result = await get_current_transaction().run(count_query, query_params)
        node_count = (await count_result.single())["nodeCount"]
        return EntryNodeSearchResult(
            node_count=node_count,
            page_count=math.ceil(node_count / PAGE_LENGTH),
            q=parsed_query_string,
            filters=query.filter_search_terms,
        )

    node_count, nodes = search_result["nodeCount"], search_result["nodeList"]
    return EntryNodeSearchResult(
        node_count=node_count,
        page_count=math.ceil(node_count / PAGE_LENGTH),
        q=parsed_query_string,
        filters=query.filter_search_terms,
        nodes=[EntryNode(**node) for node in nodes],
    )

0 commit comments

Comments
 (0)