Skip to content

Commit f75aab1

Browse files
Merge pull request open5e#750 from open5e/moody/fuzzy-search
Add fuzzy and vector search to v2/search endpoint
2 parents 1b900da + 6725642 commit f75aab1

File tree

13 files changed

+1442
-178
lines changed

13 files changed

+1442
-178
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM python:3.11-alpine
1+
FROM python:3.11-slim
22

33
RUN mkdir -p /opt/services/open5e-api
44
WORKDIR /opt/services/open5e-api

Pipfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ requests = "*"
1313
whitenoise = "*"
1414
gunicorn = "*"
1515
drf-spectacular = {extras = ["sidecar"], version = "*"}
16+
numpy = "*"
17+
rapidfuzz = "*"
18+
scikit-learn = "*"
1619

1720
[dev-packages]
1821
pytest = "*"

Pipfile.lock

Lines changed: 394 additions & 122 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ To use the search function, you must build the search index by running the above
9292
pipenv run python manage.py quicksetup
9393
```
9494

95-
9695
## Run
9796

9897
Run the server locally. This server is only for development and shall __not__ be used in production. The server will be available at `http://localhost:8000`.

api/management/commands/quicksetup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ def clean_dir() ->None:
100100
shutil.rmtree(Path(settings.STATIC_ROOT))
101101
if Path(settings.DATABASES['default']['NAME']).exists():
102102
Path(settings.DATABASES['default']['NAME']).unlink()
103+
vector_index = Path('server/vector_index.pkl')
104+
if vector_index.exists():
105+
vector_index.unlink()
103106

104107
def import_v1() -> None:
105108
"""Import the v1 apps' database models."""

scripts/search_benchmark.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
#!/usr/bin/env python3
2+
3+
import os
4+
import sys
5+
import django
6+
import time
7+
import requests
8+
from statistics import mean
9+
10+
# Django setup
# Point Django at the project settings unless the caller already chose one.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'server.settings')
# Make the repository root (the parent of this scripts/ directory) importable.
# It is derived from this file's location so the script works on any checkout,
# not only on one developer's machine.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # Add parent directory
django.setup()
15+
16+
def time_search(query, params=None, *, url="http://127.0.0.1:8000/v2/search/", timeout=30):
    """Time a single search request and summarise its response.

    Args:
        query: Search string sent as the ``query`` parameter.
        params: Optional dict of extra query parameters; merged over the
            default ``{"query": query}``.
        url: Search endpoint to call. Defaults to the local dev server.
        timeout: Seconds to wait for the server before giving up, so a hung
            server cannot stall the benchmark indefinitely.

    Returns:
        ``(elapsed_ms, info)`` on an HTTP 200 response, where ``info`` is a
        dict with ``total_results`` and ``exact_matches``. On any failure
        (transport error, non-200 status, non-JSON body) returns
        ``(None, "Error: ...")``.
    """
    search_params = {"query": query}
    if params:
        search_params.update(params)

    # perf_counter is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted.
    started = time.perf_counter()
    try:
        response = requests.get(url, params=search_params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report transport failures the same way as HTTP errors so the
        # caller can simply skip this sample.
        return None, f"Error: {exc}"
    elapsed_ms = (time.perf_counter() - started) * 1000  # Convert to milliseconds

    if response.status_code != 200:
        return None, f"Error: {response.status_code}"

    try:
        data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        return None, "Error: response body is not valid JSON"

    # Just get the total count
    metadata = data.get('search_metadata', {})

    return elapsed_ms, {
        'total_results': data.get('count', 0),
        'exact_matches': metadata.get('exact_matches', False),
    }
42+
43+
def _benchmark_mode(query, params=None, runs=3):
    """Run *query* ``runs`` times in one search mode.

    Returns ``(average_ms, info)``: the mean time of the successful runs
    (0 when every run failed) and the result info of the first successful
    run (``None`` when every run failed).
    """
    timings = []
    first_info = None
    for _ in range(runs):
        timing, info = time_search(query, params)
        if timing is not None:
            timings.append(timing)
            if first_info is None:
                first_info = info
    average = mean(timings) if timings else 0
    return average, first_info


def _format_results(info):
    """Render a result-info dict as its total count, or "Error" if missing."""
    if not info:
        return "Error"
    return f"{info['total_results']}"


def run_performance_test():
    """Run simplified performance comparison of search modes.

    Each test query is sent three times in each of three modes (default,
    fuzzy-strict, vector-strict). Progress is printed per query, then a
    Markdown table of average times and total result counts, then a legend.
    """

    # Test queries covering different scenarios
    test_queries = [
        "fire",           # Common term, many exact matches
        "sword",          # Common term, many exact matches
        "heal",           # Common term, some matches
        "dragon",         # Common term, many matches
        "fireball",       # Specific spell, exact matches
        "teleport",       # Specific spell, exact matches
        "magic weapon",   # Multi-word, exact matches
        "chromatic orb",  # Specific spell, few matches
        "firbal",         # Typo, fuzzy fallback
        "dragn",          # Typo, fuzzy fallback
    ]

    results_table = []

    print("Search Mode Performance Comparison")
    print("=" * 80)
    print("Testing: Default (exact + fuzzy fallback) vs Fuzzy-strict vs Vector-strict")
    print()

    for query in test_queries:
        print(f"Testing '{query}'...")

        # Modes are measured in a fixed order: default, fuzzy-strict, vector-strict.
        default_avg, default_info = _benchmark_mode(query)
        fuzzy_avg, fuzzy_info = _benchmark_mode(query, {"strict": "true", "fuzzy": "true"})
        vector_avg, vector_info = _benchmark_mode(query, {"strict": "true", "vector": "true"})

        results_table.append({
            'query': query,
            'default_time': default_avg,
            'default_results': _format_results(default_info),
            'fuzzy_time': fuzzy_avg,
            'fuzzy_results': _format_results(fuzzy_info),
            'vector_time': vector_avg,
            'vector_results': _format_results(vector_info),
        })

    # Output markdown table
    print("\nResults (Markdown Table):")
    print("=" * 80)
    print("| Query | Default Mode | | Fuzzy-Strict | | Vector-Strict | |")
    print("|-------|-------------|---|-------------|---|-------------|---|")
    print("| | Time (ms) | Results | Time (ms) | Results | Time (ms) | Results |")

    for row in results_table:
        print(f"| {row['query']} | {row['default_time']:.1f} | {row['default_results']} | "
              f"{row['fuzzy_time']:.1f} | {row['fuzzy_results']} | "
              f"{row['vector_time']:.1f} | {row['vector_results']} |")

    print()
    print("Legend:")
    print("- Default Mode: Runs exact search, falls back to fuzzy if exact finds 0 results")
    print("- Fuzzy-Strict: Only fuzzy search")
    print("- Vector-Strict: Only vector search")
    print("- Results: Total matches found across all pages (API returns 50 per page)")
140+
141+
if __name__ == "__main__":
    print("Starting simplified search performance comparison...")
    print("Make sure the Django dev server is running on http://127.0.0.1:8000")
    print()

    try:
        # Quick connectivity test. The timeout keeps a hung server from
        # stalling the script before the benchmark even starts.
        response = requests.get(
            "http://127.0.0.1:8000/v2/search/",
            params={"query": "test"},
            timeout=30,
        )
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        print("Cannot connect to server. Please start the Django dev server first:")
        print("python manage.py runserver")
        sys.exit(1)

    if response.status_code != 200:
        print(f"Server connectivity issue: {response.status_code}")
        sys.exit(1)

    run_performance_test()

search/apps.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,33 @@
1+
import pickle
2+
from pathlib import Path
13
from django.apps import AppConfig
24

35

46
class SearchConfig(AppConfig):
    """App configuration for the ``search`` app; also holds the vector index."""

    default_auto_field = 'django.db.models.BigAutoField'
    name = 'search'
    # Unpickled vector index shared through the class, or None when no index
    # file was found or it failed to load.
    vector_index = None

    def ready(self):
        """Load the pickled vector index from disk into ``vector_index``.

        The path is relative, so it is resolved against the process's current
        working directory. Any failure is reported on stdout and leaves
        ``vector_index`` as None rather than raising.
        """
        index_file = Path('server/vector_index.pkl')

        if not index_file.exists():
            print(f"Vector index file not found: {index_file}")
            SearchConfig.vector_index = None
            return

        try:
            print("Loading vector search index...")
            # NOTE(review): pickle.load executes whatever the file contains;
            # this file must only ever come from a trusted local build.
            with index_file.open('rb') as handle:
                SearchConfig.vector_index = pickle.load(handle)

            # Print basic info about the loaded index
            loaded = SearchConfig.vector_index
            if loaded and 'matrix' in loaded:
                print(f"Vector index loaded successfully: {loaded['matrix'].shape}")
            else:
                print("Vector index loaded but missing expected components")
        except Exception as exc:
            print(f"Failed to load vector index: {exc}")
            SearchConfig.vector_index = None

search/management/commands/buildindex.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11

22
import argparse
3+
import pickle
4+
5+
from pathlib import Path
36

47
from django.core.management import call_command
58
from django.core.management.base import BaseCommand
@@ -9,6 +12,7 @@
912
from api import models as v1
1013
from api_v2 import models as v2
1114
from search import models as search
15+
from sklearn.feature_extraction.text import TfidfVectorizer
1216

1317
class Command(BaseCommand):
1418
"""Implementation for the `manage.py `index_v1` subcommand."""
@@ -102,6 +106,20 @@ def load_index(self):
102106
"SELECT document_pk,object_pk,object_name,object_model,text,schema_version " +
103107
"FROM search_searchresult")
104108

109+
def build_vector_index(self):
    """Build a TF-IDF matrix over all search results and pickle it to disk.

    Each document is the row's ``object_name`` followed by its ``text``.
    The pickled dict holds ``ids``, ``names``, ``matrix`` and ``vectorizer``.
    Does nothing when there are no search results.
    """
    rows = list(search.SearchResult.objects.all().order_by("id"))
    if not rows:
        return

    # Fill the three parallel lists in one pass so they stay index-aligned.
    ids, names, docs = [], [], []
    for row in rows:
        ids.append(row.id)
        names.append(row.object_name)
        docs.append(f"{row.object_name} {row.text}")

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(docs)

    payload = {"ids": ids, "names": names, "matrix": matrix, "vectorizer": vectorizer}
    with Path("server/vector_index.pkl").open("wb") as out_file:
        pickle.dump(payload, out_file)
122+
105123
def check_fts_enabled(self):
106124
#import sqlite3
107125
with connection.cursor() as cursor:
@@ -153,5 +171,8 @@ def handle(self, *args, **options):
153171
# Take the content table's current data and load it into the index.
154172
self.load_index()
155173

174+
# Also build the vector search index from the loaded content.
175+
self.build_vector_index()
176+
156177
# Unload content table (saves storage space.)
157178
self.unload_all_content()

search/serializers.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ class SearchResultSerializer(serializers.ModelSerializer):
1515
object = serializers.SerializerMethodField(method_name='get_object')
1616
document = serializers.SerializerMethodField(method_name='get_document')
1717
route = serializers.SerializerMethodField(method_name='get_route')
18+
match_type = serializers.CharField(read_only=True)
19+
matched_term = serializers.CharField(read_only=True, allow_null=True)
20+
match_score = serializers.FloatField(read_only=True, allow_null=True)
1821

1922
class Meta:
2023
model = models.SearchResult
@@ -27,7 +30,10 @@ class Meta:
2730
'schema_version',
2831
'route',
2932
'text',
30-
'highlighted']
33+
'highlighted',
34+
'match_type',
35+
'matched_term',
36+
'match_score']
3137

3238
# The following override is replaced in a postprocessing hook defined in oas.py. I couldn't figure out how to get a oneOf in here
3339
@extend_schema_field(OpenApiTypes.STR)

0 commit comments

Comments
 (0)