
Commit b062f67

Browse files
Authored and committed by RayBB and mystic-06
fastapi search improve model validation and add api contract tests (internetarchive#11549)
* move processing of 'fields' queryparam to pydantic
* remove mode, it's already default
* refactor: Move `q` parameter validation into `PublicQueryOptions` field validator and use `HTTPException` for errors.
* feat: Add JSON parsing error handling for the `query` parameter in the search endpoint and include new tests for valid and invalid JSON inputs.
* feat: Add JSON query parameter with validation to `WorkSearchScheme` and remove manual parsing from `search_json` endpoint.
* refactor: encapsulate sort and spellcheck_count query parameters within AdditionalEndpointQueryOptions.
* refactor: Unify search endpoint parameters into a single `SearchRequestParams` model using `Query()` and add OpenAPI documentation tests.
* refactor: Exclude offset from the search query dictionary and add a test for pagination parameter handling.
* feat: add description and examples to the search language parameter
* feat: Add descriptions and examples to search query facet fields.
* move q back up
* refactor: simplify `Field` default assignments and ensure sorted default `fields`
* refactor: replace HTTPException with ValueError in Pydantic field validators
* docs: add "search" tag to `/search.json` endpoint
* feat: Use BeforeValidator to parse comma-separated fields string into a list in search request parameters.
* feat: Parse search query parameter as a JSON dictionary and refactor its validation.
* test: replace query alias test with precedence test for search parameters
* feat: introduce Pydantic SearchResponse model for `/search.json` endpoint and update tests accordingly
* feat: Add API contract tests to compare FastAPI and webpy search endpoints, introducing shared test fixtures and updating existing FastAPI tests.
* feat: Remove explicit `page` and `limit` addition to search query dictionary and corresponding test.
* feat: Standardize boolean query parameters to string literals, add `author_facet` as an alias for `author_key`, and enhance API contract testing with dynamic parameter generation.
* add all search fields programmatically
* test: rename arbitrary query parameter `osp_count` to `osp_count_fake` in search test.
* refactor: Remove dynamic field addition by inlining WorkSearchScheme fields into PublicQueryOptions and add a synchronization test.
* feat: refactor work search query parameter handling.
1 parent bc8e1eb commit b062f67
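
Illustrative only: a minimal sketch of the kind of request the unified SearchRequestParams model (see the search.py diff below) is meant to validate. The parameter names (q, fields, language, sort, spellcheck_count, page, limit) and the response keys (numFound, docs) come from this commit; the concrete values and the use of the third-party requests library are assumptions for the example.

import requests  # third-party HTTP client, used here purely for illustration

resp = requests.get(
    "https://openlibrary.org/search.json",
    params={
        "q": "city planning",               # free-text query, checked by parse_q_string
        "fields": "key,title,author_name",  # comma-separated; split by a BeforeValidator
        "language": ["eng", "spa"],         # repeated ?language=... facet values
        "sort": "new",                      # assumed sort value
        "spellcheck_count": 3,
        "page": 1,
        "limit": 20,
    },
    timeout=30,
)
resp.raise_for_status()
data = resp.json()
print(data["numFound"], len(data["docs"]))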

File tree

7 files changed: +628, -174 lines


openlibrary/fastapi/search.py

Lines changed: 162 additions & 78 deletions
@@ -1,11 +1,18 @@
 from __future__ import annotations
 
 import json
-from typing import Annotated, Any
-
-from fastapi import APIRouter, Depends, Query, Request
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel, Field, model_validator
+from typing import Annotated, Any, Literal, Self
+
+from fastapi import APIRouter, Query, Request
+from pydantic import (
+    BaseModel,
+    BeforeValidator,
+    ConfigDict,
+    Field,
+    computed_field,
+    field_validator,
+    model_validator,
+)
 
 from openlibrary.core.fulltext import fulltext_search_async
 from openlibrary.plugins.inside.code import RESULTS_PER_PAGE
@@ -21,12 +28,16 @@
 
 # Ideally this will go in a models files, we'll move it for the 2nd endpoint
 class Pagination(BaseModel):
-    limit: Annotated[int, Query(ge=0)] = 100
-    offset: Annotated[int | None, Query(ge=0)] = None
-    page: Annotated[int | None, Query(ge=1)] = None
+    """Reusable pagination parameters for API endpoints."""
+
+    limit: int = Field(100, ge=0, description="Maximum number of results to return.")
+    offset: int | None = Field(
+        None, ge=0, description="Number of results to skip.", exclude=True
+    )
+    page: int | None = Field(None, ge=1, description="Page number (1-indexed).")
 
     @model_validator(mode='after')
-    def normalize_pagination(self) -> Pagination:
+    def normalize_pagination(self) -> Self:
         if self.offset is not None:
             self.page = None
         elif self.page is None:
@@ -36,12 +47,12 @@ def normalize_pagination(self) -> Pagination:
 
 class PublicQueryOptions(BaseModel):
     """
-    This class has all the parameters that are passed as "query"
+    All parameters (and Pagination) that will be passed to the query.
     """
 
-    q: str = Query("", description="The search query, like keyword.")
+    q: str = Field("", description="The search query string.")
 
-    # from check_params in works.py
+    # from public_api_params in works.py
     title: str | None = None
     publisher: str | None = None
     oclc: str | None = None
@@ -52,86 +63,159 @@ class PublicQueryOptions(BaseModel):
     person: str | None = None
     time: str | None = None
     # from workscheme facet_fields
-    has_fulltext: bool | None = None
-    public_scan_b: bool | None = None
+    has_fulltext: Literal["true", "false"] | None = None
+    public_scan_b: list[Literal["true", "false"]] = []
 
-    """
-    The day will come when someone asks, why do we have Field wrapping Query
-    The answer seems to be:
-    1. Depends(): Tells FastAPI to explode the Pydantic model into individual arguments (dependency injection).
-    2. Field(Query([])): Overrides the default behavior for lists. It forces FastAPI to look for ?author_key=...
-    in the URL query string instead of expecting a JSON array in the request body.
-    The Field part is needed because FastAPI's default guess for lists inside Pydantic models is wrong for this use case.
-    It guesses "JSON Body," and you have to manually correct it to "Query String."
-    See: https://github.com/internetarchive/openlibrary/pull/11517#issuecomment-3584196385
-    """
-    author_key: list[str] = Field(Query([]))
-    subject_facet: list[str] = Field(Query([]))
-    person_facet: list[str] = Field(Query([]))
-    place_facet: list[str] = Field(Query([]))
-    time_facet: list[str] = Field(Query([]))
-    first_publish_year: list[str] = Field(Query([]))
-    publisher_facet: list[str] = Field(Query([]))
-    language: list[str] = Field(Query([]))
-    author_facet: list[str] = Field(Query([]))
-
-
-@router.get("/search.json")
+    # List fields (facets)
+    author_key: list[str] = Field(
+        [], description="Filter by author key.", examples=["OL1394244A"]
+    )
+    subject_facet: list[str] = Field(
+        [], description="Filter by subject.", examples=["Fiction", "City planning"]
+    )
+    person_facet: list[str] = Field(
+        [],
+        description="Filter by person. Not the author but the person who is the subject of the work.",
+        examples=["Jane Jacobs (1916-2006)", "Cory Doctorow"],
+    )
+    place_facet: list[str] = Field(
+        [], description="Filter by place.", examples=["New York", "Xiamen Shi"]
+    )
+    time_facet: list[str] = Field(
+        [],
+        description="Filter by time. It can be formatted many ways.",
+        examples=["20th century", "To 70 A.D."],
+    )
+    first_publish_year: list[str] = Field(
+        [], description="Filter by first publish year.", examples=["2020"]
+    )
+    publisher_facet: list[str] = Field(
+        [], description="Filter by publisher.", examples=["Urban Land Institute"]
+    )
+    language: list[str] = Field(
+        [],
+        description="Filter by language using three-letter language codes.",
+        examples={
+            "english": {
+                "summary": "English",
+                "description": "Returns results in English",
+                "value": ["eng"],
+            },
+            "spanish": {
+                "summary": "Spanish",
+                "description": "Returns results in Spanish",
+                "value": ["spa"],
+            },
+            "english_and_spanish": {
+                "summary": "English + Spanish",
+                "description": "Bilingual results",
+                "value": ["eng", "spa"],
+            },
+        },
+    )
+    author_facet: list[str] = Field(
+        [],
+        description="(alias for author_key) Filter by author key.",
+        examples=["OL1394244A"],
+    )
+
+    isbn: str | None = None
+    author: str | None = None
+
+    @field_validator('q')
+    @classmethod
+    def parse_q_string(cls, v: str) -> str:
+        if q_error := validate_search_json_query(v):
+            raise ValueError(q_error)
+        return v
+
+
+class SearchRequestParams(PublicQueryOptions, Pagination):
+    fields: Annotated[list[str], BeforeValidator(parse_fields_string)] = Field(
+        ",".join(sorted(WorkSearchScheme.default_fetched_fields)),
+        description="The fields to return.",
+    )
+    query: Annotated[dict[str, Any], BeforeValidator(parse_query_json)] = Field(
+        None, description="A full JSON encoded solr query.", examples=['{"q": "mark"}']
+    )
+    sort: str | None = Field(None, description="The sort order of results.")
+    spellcheck_count: int | None = Field(
+        default_spellcheck_count,
+        description="The number of spellcheck suggestions.",
+    )
+
+    def parse_fields_string(v: str | list[str]) -> list[str]:
+        if isinstance(v, str):
+            v = [v]
+        return [f.strip() for item in v for f in str(item).split(",") if f.strip()]
+
+    def parse_query_json(v: str) -> dict[str, Any]:
+        try:
+            return json.loads(v)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in 'query' parameter: {e}")
+
+    @computed_field
+    def selected_query(self) -> dict[str, Any]:
+        if isinstance(self.query, dict):
+            return self.query
+        else:
+            # Include all fields that belong to PublicQueryOptions (the search query part)
+            # This automatically excludes SearchRequestParams-specific fields like fields, sort, etc.
+            query_fields = set(PublicQueryOptions.model_fields.keys())
+            # Add dynamically handled fields from WorkSearchScheme
+            query_fields |= WorkSearchScheme.all_fields
+            query_fields |= set(WorkSearchScheme.field_name_map.keys())
+            q = self.model_dump(include=query_fields, exclude_none=True)
+            return q
+
+
+class SearchResponse(BaseModel):
+    """The response from a (books) search query."""
+
+    model_config = ConfigDict(extra='allow')
+
+    numFound: int
+    start: int
+    numFoundExact: bool
+    num_found: int
+
+    documentation_url: str = "https://openlibrary.org/dev/docs/api/search"
+    q: str
+    offset: int | None
+
+    # Defined last so it renders at the bottom.
+    # We use dict[str, Any] to avoid documenting the internal book fields.
+    docs: list[dict[str, Any]] = []
+
+
+@router.get("/search.json", tags=["search"], response_model=SearchResponse)
 async def search_json(
     request: Request,
-    pagination: Annotated[Pagination, Depends()],
-    public_query_options: Annotated[PublicQueryOptions, Depends()],
-    sort: str | None = Query(None, description="The sort order of results."),
-    fields: str | None = Query(None, description="The fields to return."),
-    spellcheck_count: int | None = Query(
-        default_spellcheck_count, description="The number of spellcheck suggestions."
-    ),
-    query_str: str | None = Query(
-        None, alias="query", description="A full JSON encoded solr query."
-    ),
-):
+    params: Annotated[SearchRequestParams, Query()],
+) -> Any:
     """
     Performs a search for documents based on the provided query.
     """
-    query: dict[str, Any] = {}
-    if query_str:
-        query = json.loads(query_str)
-    else:
-        # In an ideal world, we would pass the model unstead of the dict but that's a big refactoring down the line
-        query = public_query_options.model_dump(exclude_none=True)
-    query.update({"page": pagination.page, "limit": pagination.limit})
-
-    _fields: list[str] = list(WorkSearchScheme.default_fetched_fields)
-    if fields:
-        _fields = fields.split(',')
-
-    if q_error := validate_search_json_query(public_query_options.q):
-        return JSONResponse(status_code=422, content={"error": q_error})
-
-    response = await work_search_async(
-        query,
-        sort=sort,
-        page=pagination.page,
-        offset=pagination.offset,
-        limit=pagination.limit,
-        fields=_fields,
+    raw_response = await work_search_async(
+        params.selected_query,
+        sort=params.sort,
+        page=params.page,
+        offset=params.offset,
+        limit=params.limit,
+        fields=params.fields,
         # We do not support returning facets from /search.json,
         # so disable it. This makes it much faster.
         facet=False,
-        spellcheck_count=spellcheck_count,
+        spellcheck_count=params.spellcheck_count,
         request_label='BOOK_SEARCH_API',
         lang=request.state.lang,
     )
 
-    response['documentation_url'] = "https://openlibrary.org/dev/docs/api/search"
-    response['q'] = public_query_options.q
-    response['offset'] = pagination.offset
-
-    # Put docs at the end of the response
-    docs = response.pop('docs', [])
-    response['docs'] = docs
+    raw_response['q'] = params.q
+    raw_response['offset'] = params.offset
 
-    return response
+    return raw_response
 
 
 @router.get("/search/inside.json")
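
For readers skimming the diff above, here is a self-contained sketch of the two validation patterns the new SearchRequestParams leans on: a BeforeValidator that splits a comma-separated fields string into a list, and a JSON parser that raises ValueError so the framework reports it as a validation error. This is a simplified stand-in written against Pydantic v2, not the project's actual model; names like ParamsSketch are invented for the example.

import json
from typing import Annotated, Any

from pydantic import BaseModel, BeforeValidator, Field, ValidationError


def split_fields(v: str | list[str]) -> list[str]:
    # Accept "a,b" or ["a", "b,c"] and normalize to a flat list of field names.
    if isinstance(v, str):
        v = [v]
    return [f.strip() for item in v for f in str(item).split(",") if f.strip()]


def parse_json_dict(v: str | dict[str, Any]) -> dict[str, Any]:
    # Turn a JSON string into a dict; a ValueError here surfaces as a validation error.
    if isinstance(v, dict):
        return v
    try:
        return json.loads(v)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON: {e}") from e


class ParamsSketch(BaseModel):
    fields: Annotated[list[str], BeforeValidator(split_fields)] = Field(default=["key"])
    query: Annotated[dict[str, Any] | None, BeforeValidator(parse_json_dict)] = None


print(ParamsSketch(fields="key, title,author_name").fields)  # ['key', 'title', 'author_name']
print(ParamsSketch(query='{"q": "mark"}').query)             # {'q': 'mark'}

try:
    ParamsSketch(query="{not json")
except ValidationError as err:
    print(err.errors()[0]["msg"])  # "Value error, Invalid JSON: ..."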

openlibrary/plugins/worksearch/code.py

Lines changed: 1 addition & 0 deletions
@@ -1264,6 +1264,7 @@ class search_json(delegate.page):
     def GET(self):
         i = web.input(
             author_key=[],
+            author_facet=[],
             subject_facet=[],
             person_facet=[],
             place_facet=[],

openlibrary/plugins/worksearch/schemes/works.py

Lines changed: 22 additions & 22 deletions
@@ -238,7 +238,8 @@ class WorkSearchScheme(SearchScheme):
            ): lambda: f'ebook_access:[* TO {get_fulltext_min()}]',
        }
    )
-    check_params = frozenset(
+    # These are extra public api params on top of facets, which are also public
+    public_api_params = frozenset(
         {
             'title',
             'publisher',
@@ -250,6 +251,8 @@
             'person',
             'time',
             'author_key',
+            'author',
+            'isbn',
         }
     )
 
@@ -287,27 +290,24 @@ def transform_user_query(
 
     def build_q_from_params(self, params: dict[str, Any]) -> str:
         q_list = []
-        if 'author' in params:
-            v = params['author'].strip()
-            m = re_author_key.search(v)
-            if m:
-                q_list.append(f"author_key:({m.group(1)})")
-            else:
-                v = fully_escape_query(v)
-                q_list.append(f"(author_name:({v}) OR author_alternative_name:({v}))")
-
-        # support web.input fields being either a list or string
-        # when default values used
-        q_list += [
-            f'{k}:({fully_escape_query(val)})'
-            for k in (self.check_params & set(params))
-            for val in (params[k] if isinstance(params[k], list) else [params[k]])
-        ]
-
-        if params.get('isbn'):
-            q_list.append(
-                'isbn:(%s)' % (normalize_isbn(params['isbn']) or params['isbn'])
-            )
+        for k in self.public_api_params & set(params):
+            values = params[k] if isinstance(params[k], list) else [params[k]]
+            for val in values:
+                if k == 'author':
+                    v = val.strip()
+                    m = re_author_key.search(v)
+                    if m:
+                        q_list.append(f"author_key:({m.group(1)})")
+                    else:
+                        v = fully_escape_query(v)
+                        q_list.append(
+                            f"(author_name:({v}) OR author_alternative_name:({v}))"
+                        )
+                elif k == 'isbn':
+                    normalized = normalize_isbn(val)
+                    q_list.append(f'isbn:({normalized or val})')
+                else:
+                    q_list.append(f'{k}:({fully_escape_query(val)})')
 
         return ' AND '.join(q_list)
 
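
To make the rewritten build_q_from_params loop concrete, here is a rough, standalone approximation of what it produces. re_author_key, fully_escape_query, and normalize_isbn are replaced with simplified stand-ins, so real escaping and ISBN normalization will differ; clause order also depends on set iteration.

import re
from typing import Any

# Simplified stand-ins for the openlibrary helpers, not the real implementations.
RE_AUTHOR_KEY = re.compile(r'(OL\d+A)')
PUBLIC_API_PARAMS = frozenset({'title', 'publisher', 'author', 'isbn', 'author_key'})


def escape(v: str) -> str:
    return v.replace('"', '\\"')  # placeholder for fully_escape_query


def normalize_isbn(v: str) -> str | None:
    digits = ''.join(c for c in v if c.isdigit() or c in 'xX')
    return digits if len(digits) in (10, 13) else None


def build_q_from_params(params: dict[str, Any]) -> str:
    q_list = []
    for k in PUBLIC_API_PARAMS & set(params):
        values = params[k] if isinstance(params[k], list) else [params[k]]
        for val in values:
            if k == 'author':
                m = RE_AUTHOR_KEY.search(val.strip())
                if m:
                    q_list.append(f"author_key:({m.group(1)})")
                else:
                    v = escape(val.strip())
                    q_list.append(f"(author_name:({v}) OR author_alternative_name:({v}))")
            elif k == 'isbn':
                q_list.append(f'isbn:({normalize_isbn(val) or val})')
            else:
                q_list.append(f'{k}:({escape(val)})')
    return ' AND '.join(q_list)


print(build_q_from_params({'author': 'OL1394244A', 'isbn': '0-14-032872-1'}))
# Typically: author_key:(OL1394244A) AND isbn:(0140328721)  (clause order may vary)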