Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8a68790
move processing of 'fields' queryparam to pydantic
RayBB Dec 5, 2025
c5a7b22
remove mode, it's already default
RayBB Dec 5, 2025
9ae433a
refactor: Move `q` parameter validation into `PublicQueryOptions` fie…
RayBB Dec 5, 2025
d1cb2da
feat: Add JSON parsing error handling for the `query` parameter in th…
RayBB Dec 5, 2025
df5c438
feat: Add JSON query parameter with validation to `WorkSearchScheme` …
RayBB Dec 5, 2025
138405d
refactor: encapsulate sort and spellcheck_count query parameters with…
RayBB Dec 5, 2025
a4a7c17
refactor: Unify search endpoint parameters into a single `SearchReque…
RayBB Dec 5, 2025
463211f
refactor: Exclude offset from the search query dictionary and add a t…
RayBB Dec 5, 2025
6e973ea
feat: add description and examples to the search language parameter
RayBB Dec 5, 2025
6616f18
feat: Add descriptions and examples to search query facet fields.
RayBB Dec 5, 2025
1b2a6e1
move q back up
RayBB Dec 5, 2025
5c5a18e
refactor: simplify `Field` default assignments and ensure sorted defa…
RayBB Dec 5, 2025
2eb7533
refactor: replace HTTPException with ValueError in Pydantic field val…
RayBB Dec 5, 2025
1b32833
docs: add "search" tag to `/search.json` endpoint
RayBB Dec 5, 2025
a8c54cb
feat: Use BeforeValidator to parse comma-separated fields string into…
RayBB Dec 5, 2025
dce8305
feat: Parse search query parameter as a JSON dictionary and refactor …
RayBB Dec 5, 2025
745c782
test: replace query alias test with precedence test for search parame…
RayBB Dec 5, 2025
df7f55a
feat: introduce Pydantic SearchResponse model for `/search.json` endp…
RayBB Dec 5, 2025
3181dbf
feat: Add API contract tests to compare FastAPI and webpy search endp…
RayBB Dec 6, 2025
4ecb9de
feat: Remove explicit `page` and `limit` addition to search query dic…
RayBB Dec 6, 2025
7f9d60e
feat: Standardize boolean query parameters to string literals, add `a…
RayBB Dec 7, 2025
05ed95a
add all search fields programmatically
RayBB Dec 12, 2025
d5e26a6
test: rename arbitrary query parameter `osp_count` to `osp_count_fake…
RayBB Dec 12, 2025
affc787
refactor: Remove dynamic field addition by inlining WorkSearchScheme …
RayBB Dec 12, 2025
b548926
feat: refactor work search query parameter handling.
RayBB Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 162 additions & 78 deletions openlibrary/fastapi/search.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from __future__ import annotations

import json
from typing import Annotated, Any

from fastapi import APIRouter, Depends, Query, Request
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field, model_validator
from typing import Annotated, Any, Literal, Self

from fastapi import APIRouter, Query, Request
from pydantic import (
BaseModel,
BeforeValidator,
ConfigDict,
Field,
computed_field,
field_validator,
model_validator,
)

from openlibrary.core.fulltext import fulltext_search_async
from openlibrary.plugins.inside.code import RESULTS_PER_PAGE
Expand All @@ -21,12 +28,16 @@

# Ideally this will go in a models file; we'll move it for the 2nd endpoint
class Pagination(BaseModel):
limit: Annotated[int, Query(ge=0)] = 100
offset: Annotated[int | None, Query(ge=0)] = None
page: Annotated[int | None, Query(ge=1)] = None
"""Reusable pagination parameters for API endpoints."""

limit: int = Field(100, ge=0, description="Maximum number of results to return.")
offset: int | None = Field(
None, ge=0, description="Number of results to skip.", exclude=True
)
page: int | None = Field(None, ge=1, description="Page number (1-indexed).")

@model_validator(mode='after')
def normalize_pagination(self) -> Pagination:
def normalize_pagination(self) -> Self:
if self.offset is not None:
self.page = None
elif self.page is None:
Expand All @@ -36,12 +47,12 @@ def normalize_pagination(self) -> Pagination:

class PublicQueryOptions(BaseModel):
"""
This class has all the parameters that are passed as "query"
All parameters (and Pagination) that will be passed to the query.
"""

q: str = Query("", description="The search query, like keyword.")
q: str = Field("", description="The search query string.")

# from check_params in works.py
# from public_api_params in works.py
title: str | None = None
publisher: str | None = None
oclc: str | None = None
Expand All @@ -52,86 +63,159 @@ class PublicQueryOptions(BaseModel):
person: str | None = None
time: str | None = None
# from workscheme facet_fields
has_fulltext: bool | None = None
public_scan_b: bool | None = None
has_fulltext: Literal["true", "false"] | None = None
public_scan_b: list[Literal["true", "false"]] = []

"""
The day will come when someone asks, why do we have Field wrapping Query
The answer seems to be:
1. Depends(): Tells FastAPI to explode the Pydantic model into individual arguments (dependency injection).
2. Field(Query([])): Overrides the default behavior for lists. It forces FastAPI to look for ?author_key=...
in the URL query string instead of expecting a JSON array in the request body.
The Field part is needed because FastAPI's default guess for lists inside Pydantic models is wrong for this use case.
It guesses "JSON Body," and you have to manually correct it to "Query String."
See: https://github.com/internetarchive/openlibrary/pull/11517#issuecomment-3584196385
"""
author_key: list[str] = Field(Query([]))
subject_facet: list[str] = Field(Query([]))
person_facet: list[str] = Field(Query([]))
place_facet: list[str] = Field(Query([]))
time_facet: list[str] = Field(Query([]))
first_publish_year: list[str] = Field(Query([]))
publisher_facet: list[str] = Field(Query([]))
language: list[str] = Field(Query([]))
author_facet: list[str] = Field(Query([]))


@router.get("/search.json")
# List fields (facets)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# List fields (facets)
# List fields (facets). Note the examples power some of the unit tests.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't bother since the unit tests powered by the examples will be deleted once the webpy endpoint is deleted, and the examples don't really need justification.

author_key: list[str] = Field(
[], description="Filter by author key.", examples=["OL1394244A"]
)
subject_facet: list[str] = Field(
[], description="Filter by subject.", examples=["Fiction", "City planning"]
)
person_facet: list[str] = Field(
[],
description="Filter by person. Not the author but the person who is the subject of the work.",
examples=["Jane Jacobs (1916-2006)", "Cory Doctorow"],
)
place_facet: list[str] = Field(
[], description="Filter by place.", examples=["New York", "Xiamen Shi"]
)
time_facet: list[str] = Field(
[],
description="Filter by time. It can be formatted many ways.",
examples=["20th century", "To 70 A.D."],
)
first_publish_year: list[str] = Field(
[], description="Filter by first publish year.", examples=["2020"]
)
publisher_facet: list[str] = Field(
[], description="Filter by publisher.", examples=["Urban Land Institute"]
)
language: list[str] = Field(
[],
description="Filter by language using three-letter language codes.",
examples={
"english": {
"summary": "English",
"description": "Returns results in English",
"value": ["eng"],
},
"spanish": {
"summary": "Spanish",
"description": "Returns results in Spanish",
"value": ["spa"],
},
"english_and_spanish": {
"summary": "English + Spanish",
"description": "Bilingual results",
"value": ["eng", "spa"],
},
},
)
author_facet: list[str] = Field(
[],
description="(alias for author_key) Filter by author key.",
examples=["OL1394244A"],
)

isbn: str | None = None
author: str | None = None

@field_validator('q')
@classmethod
def parse_q_string(cls, v: str) -> str:
    """Validate the raw `q` search string.

    Raising ValueError here (rather than HTTPException) lets Pydantic
    surface the problem as a standard 422 validation error.
    """
    # validate_search_json_query returns an error message, or a falsy
    # value when the query is acceptable.
    if q_error := validate_search_json_query(v):
        raise ValueError(q_error)
    return v


class SearchRequestParams(PublicQueryOptions, Pagination):
fields: Annotated[list[str], BeforeValidator(parse_fields_string)] = Field(
",".join(sorted(WorkSearchScheme.default_fetched_fields)),
description="The fields to return.",
)
query: Annotated[dict[str, Any], BeforeValidator(parse_query_json)] = Field(
None, description="A full JSON encoded solr query.", examples=['{"q": "mark"}']
)
sort: str | None = Field(None, description="The sort order of results.")
spellcheck_count: int | None = Field(
default_spellcheck_count,
description="The number of spellcheck suggestions.",
)

def parse_fields_string(v: str | list[str]) -> list[str]:
if isinstance(v, str):
v = [v]
return [f.strip() for item in v for f in str(item).split(",") if f.strip()]

def parse_query_json(v: str) -> dict[str, Any]:
    """Parse the raw `query` query-parameter string as JSON.

    Raises:
        ValueError: if the string is not valid JSON; Pydantic surfaces
            this as a 422 validation error.
    """
    try:
        return json.loads(v)
    except json.JSONDecodeError as e:
        # Chain the original decode error (`from e`) so the root cause is
        # preserved in tracebacks instead of being silently discarded.
        raise ValueError(f"Invalid JSON in 'query' parameter: {e}") from e

@computed_field
def selected_query(self) -> dict[str, Any]:
    """The effective search query dict: a JSON `query` param, when
    provided, takes precedence over the individual field parameters."""
    if isinstance(self.query, dict):
        return self.query
    else:
        # Include all fields that belong to PublicQueryOptions (the search query part)
        # This automatically excludes SearchRequestParams-specific fields like fields, sort, etc.
        query_fields = set(PublicQueryOptions.model_fields.keys())
        # Add dynamically handled fields from WorkSearchScheme
        query_fields |= WorkSearchScheme.all_fields
        query_fields |= set(WorkSearchScheme.field_name_map.keys())
        # exclude_none drops unset optional params so they don't pollute
        # the Solr query; NOTE(review): fields defaulting to [] are still
        # included — presumably downstream treats empty lists as no-ops.
        q = self.model_dump(include=query_fields, exclude_none=True)
        return q


class SearchResponse(BaseModel):
    """The response from a (books) search query."""

    # extra='allow' passes through any additional keys returned by the
    # search backend without declaring them here.
    model_config = ConfigDict(extra='allow')

    numFound: int
    start: int
    numFoundExact: bool
    # NOTE(review): duplicates numFound under a snake_case name —
    # presumably kept for backward compatibility; confirm with consumers.
    num_found: int

    documentation_url: str = "https://openlibrary.org/dev/docs/api/search"
    q: str
    offset: int | None

    # Defined last so it renders at the bottom.
    # We use dict[str, Any] to avoid documenting the internal book fields.
    # (Mutable default is safe: pydantic deep-copies field defaults per instance.)
    docs: list[dict[str, Any]] = []


@router.get("/search.json", tags=["search"], response_model=SearchResponse)
async def search_json(
request: Request,
pagination: Annotated[Pagination, Depends()],
public_query_options: Annotated[PublicQueryOptions, Depends()],
sort: str | None = Query(None, description="The sort order of results."),
fields: str | None = Query(None, description="The fields to return."),
spellcheck_count: int | None = Query(
default_spellcheck_count, description="The number of spellcheck suggestions."
),
query_str: str | None = Query(
None, alias="query", description="A full JSON encoded solr query."
),
):
params: Annotated[SearchRequestParams, Query()],
) -> Any:
"""
Performs a search for documents based on the provided query.
"""
query: dict[str, Any] = {}
if query_str:
query = json.loads(query_str)
else:
# In an ideal world, we would pass the model instead of the dict but that's a big refactoring down the line
query = public_query_options.model_dump(exclude_none=True)
query.update({"page": pagination.page, "limit": pagination.limit})

_fields: list[str] = list(WorkSearchScheme.default_fetched_fields)
if fields:
_fields = fields.split(',')

if q_error := validate_search_json_query(public_query_options.q):
return JSONResponse(status_code=422, content={"error": q_error})

response = await work_search_async(
query,
sort=sort,
page=pagination.page,
offset=pagination.offset,
limit=pagination.limit,
fields=_fields,
raw_response = await work_search_async(
params.selected_query,
sort=params.sort,
page=params.page,
offset=params.offset,
limit=params.limit,
fields=params.fields,
# We do not support returning facets from /search.json,
# so disable it. This makes it much faster.
facet=False,
spellcheck_count=spellcheck_count,
spellcheck_count=params.spellcheck_count,
request_label='BOOK_SEARCH_API',
lang=request.state.lang,
)

response['documentation_url'] = "https://openlibrary.org/dev/docs/api/search"
response['q'] = public_query_options.q
response['offset'] = pagination.offset

# Put docs at the end of the response
docs = response.pop('docs', [])
response['docs'] = docs
raw_response['q'] = params.q
raw_response['offset'] = params.offset

return response
return raw_response


@router.get("/search/inside.json")
Expand Down
1 change: 1 addition & 0 deletions openlibrary/plugins/worksearch/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -1264,6 +1264,7 @@ class search_json(delegate.page):
def GET(self):
i = web.input(
author_key=[],
author_facet=[],
subject_facet=[],
person_facet=[],
place_facet=[],
Expand Down
44 changes: 22 additions & 22 deletions openlibrary/plugins/worksearch/schemes/works.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,8 @@ class WorkSearchScheme(SearchScheme):
): lambda: f'ebook_access:[* TO {get_fulltext_min()}]',
}
)
check_params = frozenset(
# These are extra public api params on top of facets, which are also public
public_api_params = frozenset(
{
'title',
'publisher',
Expand All @@ -250,6 +251,8 @@ class WorkSearchScheme(SearchScheme):
'person',
'time',
'author_key',
'author',
'isbn',
}
)

Expand Down Expand Up @@ -287,27 +290,24 @@ def transform_user_query(

def build_q_from_params(self, params: dict[str, Any]) -> str:
q_list = []
if 'author' in params:
v = params['author'].strip()
m = re_author_key.search(v)
if m:
q_list.append(f"author_key:({m.group(1)})")
else:
v = fully_escape_query(v)
q_list.append(f"(author_name:({v}) OR author_alternative_name:({v}))")

# support web.input fields being either a list or string
# when default values used
q_list += [
f'{k}:({fully_escape_query(val)})'
for k in (self.check_params & set(params))
for val in (params[k] if isinstance(params[k], list) else [params[k]])
]

if params.get('isbn'):
q_list.append(
'isbn:(%s)' % (normalize_isbn(params['isbn']) or params['isbn'])
)
for k in self.public_api_params & set(params):
values = params[k] if isinstance(params[k], list) else [params[k]]
for val in values:
if k == 'author':
v = val.strip()
m = re_author_key.search(v)
if m:
q_list.append(f"author_key:({m.group(1)})")
else:
v = fully_escape_query(v)
q_list.append(
f"(author_name:({v}) OR author_alternative_name:({v}))"
)
elif k == 'isbn':
normalized = normalize_isbn(val)
q_list.append(f'isbn:({normalized or val})')
else:
q_list.append(f'{k}:({fully_escape_query(val)})')

return ' AND '.join(q_list)

Expand Down
Loading
Loading