Skip to content

Commit f82c7f8

Browse files
committed
-moved fetch to be per item, not fetch all then process
1 parent 7fd7197 commit f82c7f8

File tree

8 files changed

+93
-50
lines changed

8 files changed

+93
-50
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,8 @@ start:
99

1010
compute-concepts:
1111
python ./web/manage.py compute_concepts
12+
13+
fix-files:
14+
python3 -m black .
15+
python3 -m isort .
16+
python3 -m flake8 .

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
black~=25.9.0
2-
isort~=4.2.5
2+
isort~=5.12.0
33
flake8~=7.3.0
44

55
-r ./web/requirements.txt

web/categorizer/categorizer_service.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import json
22
import logging
33
import re
4-
from concepts.models import Item, CategorizerResult
4+
55
from categorizer.llm_service import LLMService, LLMType
6+
from concepts.models import CategorizerResult, Item
67

78
# Free LLM types to use for categorization
89
LLM_JUDGE_POOL = [
@@ -33,27 +34,30 @@ def categorize_items(self, limit=None):
3334
queryset = queryset[:limit]
3435

3536
total = queryset.count()
36-
self.logger.info(f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs")
37+
self.logger.info(
38+
f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs"
39+
)
3740

3841
for i, item in enumerate(queryset):
39-
self.logger.info(
40-
f"Processing item {i + 1}/{total}: {item.identifier}"
41-
)
42+
self.logger.info(f"Processing item {i + 1}/{total}: {item.identifier}")
4243
self.categorize_item(item)
4344

4445
self.logger.info("Categorization complete")
4546

4647
def categorize_item(
4748
self,
4849
item,
49-
predicate: str = "Is the given concept a mathematical concept, given the name, description, keywords, and article text?"
50+
predicate: str = "Is the given concept a mathematical concept,"
51+
" given the name, description, "
52+
"keywords, and article text?",
5053
):
5154
"""
5255
Categorize a single item using all free LLM types.
5356
5457
Args:
5558
item: Item instance to categorize
56-
predicate: The question to evaluate (default: checks if it's a mathematical concept)
59+
predicate: The question to evaluate (default: checks if it's
60+
a mathematical concept)
5761
5862
Returns:
5963
List of categorization results from all LLMs
@@ -68,7 +72,10 @@ def categorize_item(
6872
try:
6973
self.logger.info(f"Calling {llm_type.value} for {item.name}")
7074
raw_result = self.llm_service.call_llm(llm_type, prompt)
71-
self.logger.info(f"Categorized {item.name} with {llm_type.value}: {raw_result[:100]}...")
75+
self.logger.info(
76+
f"Categorized {item.name} with {llm_type.value}: "
77+
f"{raw_result[:100]}..."
78+
)
7279

7380
parsed_result = self._parse_categorization_result(raw_result)
7481

@@ -89,7 +96,9 @@ def categorize_item(
8996

9097
results.append(parsed_result)
9198
except Exception as e:
92-
self.logger.error(f"Failed to categorize {item.name} with {llm_type.value}: {e}")
99+
self.logger.error(
100+
f"Failed to categorize {item.name} with {llm_type.value}: {e}"
101+
)
93102
# Continue with other LLMs even if one fails?
94103
continue
95104

@@ -106,7 +115,8 @@ def _build_categorization_prompt(self, item, predicate: str):
106115
Returns:
107116
Formatted prompt string
108117
"""
109-
system_prompt = """You are a categorization judge. Your task is to evaluate whether a given concept satisfies a specific predicate.
118+
system_prompt = """You are a categorization judge. Your task is to
119+
evaluate whether a given concept satisfies a specific predicate.
110120
111121
You must respond with a structured answer containing:
112122
1. answer: true or false (boolean)
@@ -173,7 +183,9 @@ def _parse_categorization_result(self, result: str) -> dict:
173183
parsed = json.loads(result)
174184

175185
if "answer" not in parsed or "confidence" not in parsed:
176-
raise ValueError("Response missing required fields 'answer' or 'confidence'")
186+
raise ValueError(
187+
"Response missing required fields 'answer' or 'confidence'"
188+
)
177189

178190
answer = parsed["answer"]
179191
if isinstance(answer, str):
@@ -183,10 +195,7 @@ def _parse_categorization_result(self, result: str) -> dict:
183195
if not 0 <= confidence <= 100:
184196
raise ValueError(f"Confidence must be between 0-100, got {confidence}")
185197

186-
return {
187-
"answer": bool(answer),
188-
"confidence": confidence
189-
}
198+
return {"answer": bool(answer), "confidence": confidence}
190199

191200
except json.JSONDecodeError as e:
192201
self.logger.error(f"Failed to parse JSON response: {result}")

web/categorizer/llm_service.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,24 @@ class LLMService:
2828
def __init__(self):
2929
self.logger = logging.getLogger(__name__)
3030
self.llm_handlers = {
31-
LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(llm_type, prompt),
32-
LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(llm_type, prompt),
33-
LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthropic(prompt),
34-
LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_huggingface("google/flan-t5-base", prompt),
35-
LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_huggingface("gpt2", prompt),
36-
LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_huggingface("microsoft/DialoGPT-medium", prompt),
31+
LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(
32+
llm_type, prompt
33+
),
34+
LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(
35+
llm_type, prompt
36+
),
37+
LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthrpc(
38+
prompt
39+
),
40+
LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_hgf(
41+
"google/flan-t5-base", prompt
42+
),
43+
LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_hgf(
44+
"gpt2", prompt
45+
),
46+
LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_hgf(
47+
"microsoft/DialoGPT-medium", prompt
48+
),
3749
LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt),
3850
}
3951

@@ -92,7 +104,7 @@ def _call_openai(self, llm_type: LLMType, prompt: str) -> str:
92104
self.logger.error(f"OpenAI API call failed: {e}")
93105
raise
94106

95-
def _call_anthropic(self, prompt: str) -> str:
107+
def _call_anthrpc(self, prompt: str) -> str:
96108
"""Call Anthropic Claude API"""
97109
try:
98110
import anthropic
@@ -121,7 +133,7 @@ def _call_anthropic(self, prompt: str) -> str:
121133
self.logger.error(f"Anthropic API call failed: {e}")
122134
raise
123135

124-
def _call_huggingface(self, model_id: str, prompt: str) -> str:
136+
def _call_hgf(self, model_id: str, prompt: str) -> str:
125137
"""
126138
Call HuggingFace models using langchain.
127139
@@ -156,7 +168,11 @@ def _call_huggingface(self, model_id: str, prompt: str) -> str:
156168
# Create the HuggingFace pipeline
157169
hf = HuggingFacePipeline.from_model_id(
158170
model_id=model_id,
159-
task="text-generation" if "gpt" in model_id.lower() else "text2text-generation",
171+
task=(
172+
"text-generation"
173+
if "gpt" in model_id.lower()
174+
else "text2text-generation"
175+
),
160176
pipeline_kwargs=pipeline_kwargs,
161177
)
162178

web/categorizer/management/commands/categorize.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from django.core.management.base import BaseCommand
21
from categorizer.categorizer_service import CategorizerService
2+
from django.core.management.base import BaseCommand
33

44

55
class Command(BaseCommand):
@@ -18,7 +18,10 @@ def handle(self, *args, **options):
1818

1919
service = CategorizerService()
2020

21-
self.stdout.write("Using all free LLMs: huggingface_flan_t5, huggingface_gpt2, huggingface_dialogpt")
21+
self.stdout.write(
22+
"Using all free LLMs: huggingface_flan_t5, "
23+
"huggingface_gpt2, huggingface_dialogpt"
24+
)
2225
if limit:
2326
self.stdout.write(f"Categorizing up to {limit} items...")
2427
else:
@@ -28,6 +31,4 @@ def handle(self, *args, **options):
2831
service.categorize_items(limit=limit)
2932
self.stdout.write(self.style.SUCCESS("Categorization complete!"))
3033
except Exception as e:
31-
self.stdout.write(
32-
self.style.ERROR(f"Categorization failed: {e}")
33-
)
34+
self.stdout.write(self.style.ERROR(f"Categorization failed: {e}"))

web/concepts/models.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,7 @@ class Meta:
188188
]
189189

190190
def __str__(self):
191-
return f"{self.item} - {self.llm_type}: {self.result_answer} ({self.result_confidence}%)"
191+
return (
192+
f"{self.item} - {self.llm_type}: "
193+
f"{self.result_answer} ({self.result_confidence}%)"
194+
)

web/slurper/keyword_util.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
# TODO SST: Move to readme.md
44
# Load the scientific English model from scispacy
55
# Note: You need to download this model first with:
6-
# make install-scispacy
7-
# Or directly:
8-
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
6+
# make install-scispacy
97

108
# Lazy-loaded spaCy model
119
_nlp = None

web/slurper/source_wikidata.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from django.db.utils import IntegrityError
88
from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem
99

10-
1110
# Wikipedia API contact email (required by Wikipedia API guidelines)
1211
# Set to None to disable Wikipedia article fetching
1312
WIKIPEDIA_CONTACT_EMAIL = None
@@ -26,12 +25,15 @@
2625
# - Excludes humans (FILTER NOT EXISTS)
2726
# - Label service: Automatically fetches English labels and descriptions
2827
#
29-
# The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers.
28+
# The class fetches mathematical concepts from Wikidata while
29+
# filtering out unwanted items like people and natural numbers.
30+
3031

3132
class WikidataSlurper:
3233
SPARQL_URL = "https://query.wikidata.org/sparql"
3334

34-
SPARQL_QUERY_OPTIONS = """
35+
SPARQL_QUERY_OPTIONS = (
36+
"""
3537
OPTIONAL
3638
{ ?item wdt:P18 ?image . }
3739
OPTIONAL
@@ -44,7 +46,9 @@ class WikidataSlurper:
4446
{ ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") }
4547
# except for natural numbers and positive integers
4648
FILTER NOT EXISTS {
47-
VALUES ?excludedType { """ + " ".join(KNOWN_EXCLUDED_CATEGORIES) + """ }
49+
VALUES ?excludedType { """
50+
+ " ".join(KNOWN_EXCLUDED_CATEGORIES)
51+
+ """ }
4852
?item wdt:P31 ?excludedType .
4953
}
5054
# except for humans
@@ -53,6 +57,7 @@ class WikidataSlurper:
5357
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
5458
}
5559
"""
60+
)
5661

5762
def __init__(self, source, query, limit=None):
5863
self.source = source
@@ -71,16 +76,13 @@ def __init__(self, source, query, limit=None):
7176
+ self.SPARQL_QUERY_OPTIONS
7277
+ """
7378
GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """
74-
+ " ".join(
75-
[f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()]
76-
)
79+
+ " ".join([f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()])
7780
+ """
7881
"""
7982
+ (f"LIMIT {limit}" if limit is not None else "")
8083
)
8184
self.raw_data = self.fetch_json()
8285

83-
8486
def _sparql_source_vars_select(self):
8587
def to_var(source_dict):
8688
return " ?" + source_dict["json_key"]
@@ -112,8 +114,10 @@ def fetch_article(self, json_item, index=None, total=None):
112114
if not _missing_email_logged:
113115
logging.log(
114116
logging.WARNING,
115-
"WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. "
116-
"Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.",
117+
"WIKIPEDIA_CONTACT_EMAIL is not set. "
118+
"Wikipedia article fetching is disabled. "
119+
"Please set WIKIPEDIA_CONTACT_EMAIL at the top of "
120+
"source_wikidata.py to enable article fetching.",
117121
)
118122
_missing_email_logged = True
119123
return None
@@ -156,22 +160,26 @@ def fetch_article(self, json_item, index=None, total=None):
156160
time.sleep(0.01)
157161

158162
# Timeout: (connect_timeout, read_timeout) in seconds
159-
response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30))
163+
response = requests.get(
164+
api_url, params=params, headers=headers, timeout=(5, 30)
165+
)
160166

161167
# Handle rate limiting
162168
if response.status_code in (429, 403):
163169
if attempt < max_retries - 1:
164170
logging.log(
165171
logging.WARNING,
166-
f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})",
172+
f"Rate limited for {article_title}, retrying in "
173+
f"{retry_delay}s (attempt {attempt + 1}/{max_retries})",
167174
)
168175
time.sleep(retry_delay)
169176
retry_delay *= 2 # Exponential backoff
170177
continue
171178
else:
172179
logging.log(
173180
logging.ERROR,
174-
f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). Skipping article.",
181+
f"Failed to fetch {article_title} after "
182+
f"{max_retries} attempts (rate limited). Skipping article.",
175183
)
176184
break
177185

@@ -180,7 +188,8 @@ def fetch_article(self, json_item, index=None, total=None):
180188
if not response.text:
181189
logging.log(
182190
logging.WARNING,
183-
f"Empty response for Wikipedia article: {article_title}. Skipping article.",
191+
f"Empty response for Wikipedia article: "
192+
f"{article_title}. Skipping article.",
184193
)
185194
break
186195

@@ -200,14 +209,16 @@ def fetch_article(self, json_item, index=None, total=None):
200209
if attempt < max_retries - 1:
201210
logging.log(
202211
logging.WARNING,
203-
f"Request failed for {article_title}: {e}, retrying in {retry_delay}s",
212+
f"Request failed for {article_title}: "
213+
f"{e}, retrying in {retry_delay}s",
204214
)
205215
time.sleep(retry_delay)
206216
retry_delay *= 2
207217
else:
208218
logging.log(
209219
logging.ERROR,
210-
f"Failed to fetch {article_title} after {max_retries} attempts: {e}. Skipping article.",
220+
f"Failed to fetch {article_title}"
221+
f" after {max_retries} attempts: {e}. Skipping article.",
211222
)
212223
if not success and "wp_en" in json_item:
213224
logging.log(

0 commit comments

Comments
 (0)