Skip to content

Commit f82c7f8

Browse files
committed
-moved fetch to be per item, not fetch all then process
1 parent 7fd7197 commit f82c7f8

File tree

8 files changed

+93
-50
lines changed

8 files changed

+93
-50
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,8 @@ start:
99

1010
compute-concepts:
1111
python ./web/manage.py compute_concepts
12+
13+
fix-files:
14+
python3 -m black .
15+
python3 -m isort .
16+
python3 -m flake8 .

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
black~=25.9.0
2-
isort~=4.2.5
2+
isort~=5.12.0
33
flake8~=7.3.0
44

55
-r ./web/requirements.txt

web/categorizer/categorizer_service.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import json
22
import logging
33
import re
4-
from concepts.models import Item, CategorizerResult
4+
55
from categorizer.llm_service import LLMService, LLMType
6+
from concepts.models import CategorizerResult, Item
67

78
# Free LLM types to use for categorization
89
LLM_JUDGE_POOL = [
@@ -33,27 +34,30 @@ def categorize_items(self, limit=None):
3334
queryset = queryset[:limit]
3435

3536
total = queryset.count()
36-
self.logger.info(f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs")
37+
self.logger.info(
38+
f"Categorizing {total} items using {len(LLM_JUDGE_POOL)} free LLMs"
39+
)
3740

3841
for i, item in enumerate(queryset):
39-
self.logger.info(
40-
f"Processing item {i + 1}/{total}: {item.identifier}"
41-
)
42+
self.logger.info(f"Processing item {i + 1}/{total}: {item.identifier}")
4243
self.categorize_item(item)
4344

4445
self.logger.info("Categorization complete")
4546

4647
def categorize_item(
4748
self,
4849
item,
49-
predicate: str = "Is the given concept a mathematical concept, given the name, description, keywords, and article text?"
50+
predicate: str = "Is the given concept a mathematical concept,"
51+
" given the name, description, "
52+
"keywords, and article text?",
5053
):
5154
"""
5255
Categorize a single item using all free LLM types.
5356
5457
Args:
5558
item: Item instance to categorize
56-
predicate: The question to evaluate (default: checks if it's a mathematical concept)
59+
predicate: The question to evaluate (default: checks if it's
60+
a mathematical concept)
5761
5862
Returns:
5963
List of categorization results from all LLMs
@@ -68,7 +72,10 @@ def categorize_item(
6872
try:
6973
self.logger.info(f"Calling {llm_type.value} for {item.name}")
7074
raw_result = self.llm_service.call_llm(llm_type, prompt)
71-
self.logger.info(f"Categorized {item.name} with {llm_type.value}: {raw_result[:100]}...")
75+
self.logger.info(
76+
f"Categorized {item.name} with {llm_type.value}: "
77+
f"{raw_result[:100]}..."
78+
)
7279

7380
parsed_result = self._parse_categorization_result(raw_result)
7481

@@ -89,7 +96,9 @@ def categorize_item(
8996

9097
results.append(parsed_result)
9198
except Exception as e:
92-
self.logger.error(f"Failed to categorize {item.name} with {llm_type.value}: {e}")
99+
self.logger.error(
100+
f"Failed to categorize {item.name} with {llm_type.value}: {e}"
101+
)
93102
# Continue with other LLMs even if one fails?
94103
continue
95104

@@ -106,7 +115,8 @@ def _build_categorization_prompt(self, item, predicate: str):
106115
Returns:
107116
Formatted prompt string
108117
"""
109-
system_prompt = """You are a categorization judge. Your task is to evaluate whether a given concept satisfies a specific predicate.
118+
system_prompt = """You are a categorization judge. Your task is to
119+
evaluate whether a given concept satisfies a specific predicate.
110120
111121
You must respond with a structured answer containing:
112122
1. answer: true or false (boolean)
@@ -173,7 +183,9 @@ def _parse_categorization_result(self, result: str) -> dict:
173183
parsed = json.loads(result)
174184

175185
if "answer" not in parsed or "confidence" not in parsed:
176-
raise ValueError("Response missing required fields 'answer' or 'confidence'")
186+
raise ValueError(
187+
"Response missing required fields 'answer' or 'confidence'"
188+
)
177189

178190
answer = parsed["answer"]
179191
if isinstance(answer, str):
@@ -183,10 +195,7 @@ def _parse_categorization_result(self, result: str) -> dict:
183195
if not 0 <= confidence <= 100:
184196
raise ValueError(f"Confidence must be between 0-100, got {confidence}")
185197

186-
return {
187-
"answer": bool(answer),
188-
"confidence": confidence
189-
}
198+
return {"answer": bool(answer), "confidence": confidence}
190199

191200
except json.JSONDecodeError as e:
192201
self.logger.error(f"Failed to parse JSON response: {result}")

web/categorizer/llm_service.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,24 @@ class LLMService:
2828
def __init__(self):
2929
self.logger = logging.getLogger(__name__)
3030
self.llm_handlers = {
31-
LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(llm_type, prompt),
32-
LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(llm_type, prompt),
33-
LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthropic(prompt),
34-
LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_huggingface("google/flan-t5-base", prompt),
35-
LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_huggingface("gpt2", prompt),
36-
LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_huggingface("microsoft/DialoGPT-medium", prompt),
31+
LLMType.OPENAI_GPT4: lambda llm_type, prompt: self._call_openai(
32+
llm_type, prompt
33+
),
34+
LLMType.OPENAI_GPT35: lambda llm_type, prompt: self._call_openai(
35+
llm_type, prompt
36+
),
37+
LLMType.ANTHROPIC_CLAUDE: lambda llm_type, prompt: self._call_anthrpc(
38+
prompt
39+
),
40+
LLMType.HUGGINGFACE_FLAN_T5: lambda llm_type, prompt: self._call_hgf(
41+
"google/flan-t5-base", prompt
42+
),
43+
LLMType.HUGGINGFACE_GPT2: lambda llm_type, prompt: self._call_hgf(
44+
"gpt2", prompt
45+
),
46+
LLMType.HUGGINGFACE_DIALOGPT: lambda llm_type, prompt: self._call_hgf(
47+
"microsoft/DialoGPT-medium", prompt
48+
),
3749
LLMType.OLLAMA: lambda llm_type, prompt: self._call_ollama(prompt),
3850
}
3951

@@ -92,7 +104,7 @@ def _call_openai(self, llm_type: LLMType, prompt: str) -> str:
92104
self.logger.error(f"OpenAI API call failed: {e}")
93105
raise
94106

95-
def _call_anthropic(self, prompt: str) -> str:
107+
def _call_anthrpc(self, prompt: str) -> str:
96108
"""Call Anthropic Claude API"""
97109
try:
98110
import anthropic
@@ -121,7 +133,7 @@ def _call_anthropic(self, prompt: str) -> str:
121133
self.logger.error(f"Anthropic API call failed: {e}")
122134
raise
123135

124-
def _call_huggingface(self, model_id: str, prompt: str) -> str:
136+
def _call_hgf(self, model_id: str, prompt: str) -> str:
125137
"""
126138
Call HuggingFace models using langchain.
127139
@@ -156,7 +168,11 @@ def _call_huggingface(self, model_id: str, prompt: str) -> str:
156168
# Create the HuggingFace pipeline
157169
hf = HuggingFacePipeline.from_model_id(
158170
model_id=model_id,
159-
task="text-generation" if "gpt" in model_id.lower() else "text2text-generation",
171+
task=(
172+
"text-generation"
173+
if "gpt" in model_id.lower()
174+
else "text2text-generation"
175+
),
160176
pipeline_kwargs=pipeline_kwargs,
161177
)
162178

web/categorizer/management/commands/categorize.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from django.core.management.base import BaseCommand
21
from categorizer.categorizer_service import CategorizerService
2+
from django.core.management.base import BaseCommand
33

44

55
class Command(BaseCommand):
@@ -18,7 +18,10 @@ def handle(self, *args, **options):
1818

1919
service = CategorizerService()
2020

21-
self.stdout.write("Using all free LLMs: huggingface_flan_t5, huggingface_gpt2, huggingface_dialogpt")
21+
self.stdout.write(
22+
"Using all free LLMs: huggingface_flan_t5, "
23+
"huggingface_gpt2, huggingface_dialogpt"
24+
)
2225
if limit:
2326
self.stdout.write(f"Categorizing up to {limit} items...")
2427
else:
@@ -28,6 +31,4 @@ def handle(self, *args, **options):
2831
service.categorize_items(limit=limit)
2932
self.stdout.write(self.style.SUCCESS("Categorization complete!"))
3033
except Exception as e:
31-
self.stdout.write(
32-
self.style.ERROR(f"Categorization failed: {e}")
33-
)
34+
self.stdout.write(self.style.ERROR(f"Categorization failed: {e}"))

web/concepts/models.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,7 @@ class Meta:
188188
]
189189

190190
def __str__(self):
191-
return f"{self.item} - {self.llm_type}: {self.result_answer} ({self.result_confidence}%)"
191+
return (
192+
f"{self.item} - {self.llm_type}: "
193+
f"{self.result_answer} ({self.result_confidence}%)"
194+
)

web/slurper/keyword_util.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
# TODO SST: Move to readme.md
44
# Load the scientific English model from scispacy
55
# Note: You need to download this model first with:
6-
# make install-scispacy
7-
# Or directly:
8-
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_lg-0.5.4.tar.gz
6+
# make install-scispacy
97

108
# Lazy-loaded spaCy model
119
_nlp = None

web/slurper/source_wikidata.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from django.db.utils import IntegrityError
88
from slurper.wd_raw_item import WD_OTHER_SOURCES, BaseWdRawItem
99

10-
1110
# Wikipedia API contact email (required by Wikipedia API guidelines)
1211
# Set to None to disable Wikipedia article fetching
1312
WIKIPEDIA_CONTACT_EMAIL = None
@@ -26,12 +25,15 @@
2625
# - Excludes humans (FILTER NOT EXISTS)
2726
# - Label service: Automatically fetches English labels and descriptions
2827
#
29-
# The class fetches mathematical concepts from Wikidata while filtering out unwanted items like people and natural numbers.
28+
# The class fetches mathematical concepts from Wikidata while
29+
# filtering out unwanted items like people and natural numbers.
30+
3031

3132
class WikidataSlurper:
3233
SPARQL_URL = "https://query.wikidata.org/sparql"
3334

34-
SPARQL_QUERY_OPTIONS = """
35+
SPARQL_QUERY_OPTIONS = (
36+
"""
3537
OPTIONAL
3638
{ ?item wdt:P18 ?image . }
3739
OPTIONAL
@@ -44,7 +46,9 @@ class WikidataSlurper:
4446
{ ?item skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en") }
4547
# except for natural numbers and positive integers
4648
FILTER NOT EXISTS {
47-
VALUES ?excludedType { """ + " ".join(KNOWN_EXCLUDED_CATEGORIES) + """ }
49+
VALUES ?excludedType { """
50+
+ " ".join(KNOWN_EXCLUDED_CATEGORIES)
51+
+ """ }
4852
?item wdt:P31 ?excludedType .
4953
}
5054
# except for humans
@@ -53,6 +57,7 @@ class WikidataSlurper:
5357
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
5458
}
5559
"""
60+
)
5661

5762
def __init__(self, source, query, limit=None):
5863
self.source = source
@@ -71,16 +76,13 @@ def __init__(self, source, query, limit=None):
7176
+ self.SPARQL_QUERY_OPTIONS
7277
+ """
7378
GROUP BY ?item ?itemLabel ?itemDescription ?image ?wp_en """
74-
+ " ".join(
75-
[f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()]
76-
)
79+
+ " ".join([f"?{src['json_key']}" for src in WD_OTHER_SOURCES.values()])
7780
+ """
7881
"""
7982
+ (f"LIMIT {limit}" if limit is not None else "")
8083
)
8184
self.raw_data = self.fetch_json()
8285

83-
8486
def _sparql_source_vars_select(self):
8587
def to_var(source_dict):
8688
return " ?" + source_dict["json_key"]
@@ -112,8 +114,10 @@ def fetch_article(self, json_item, index=None, total=None):
112114
if not _missing_email_logged:
113115
logging.log(
114116
logging.WARNING,
115-
"WIKIPEDIA_CONTACT_EMAIL is not set. Wikipedia article fetching is disabled. "
116-
"Please set WIKIPEDIA_CONTACT_EMAIL at the top of source_wikidata.py to enable article fetching.",
117+
"WIKIPEDIA_CONTACT_EMAIL is not set. "
118+
"Wikipedia article fetching is disabled. "
119+
"Please set WIKIPEDIA_CONTACT_EMAIL at the top of "
120+
"source_wikidata.py to enable article fetching.",
117121
)
118122
_missing_email_logged = True
119123
return None
@@ -156,22 +160,26 @@ def fetch_article(self, json_item, index=None, total=None):
156160
time.sleep(0.01)
157161

158162
# Timeout: (connect_timeout, read_timeout) in seconds
159-
response = requests.get(api_url, params=params, headers=headers, timeout=(5, 30))
163+
response = requests.get(
164+
api_url, params=params, headers=headers, timeout=(5, 30)
165+
)
160166

161167
# Handle rate limiting
162168
if response.status_code in (429, 403):
163169
if attempt < max_retries - 1:
164170
logging.log(
165171
logging.WARNING,
166-
f"Rate limited for {article_title}, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})",
172+
f"Rate limited for {article_title}, retrying in "
173+
f"{retry_delay}s (attempt {attempt + 1}/{max_retries})",
167174
)
168175
time.sleep(retry_delay)
169176
retry_delay *= 2 # Exponential backoff
170177
continue
171178
else:
172179
logging.log(
173180
logging.ERROR,
174-
f"Failed to fetch {article_title} after {max_retries} attempts (rate limited). Skipping article.",
181+
f"Failed to fetch {article_title} after "
182+
f"{max_retries} attempts (rate limited). Skipping article.",
175183
)
176184
break
177185

@@ -180,7 +188,8 @@ def fetch_article(self, json_item, index=None, total=None):
180188
if not response.text:
181189
logging.log(
182190
logging.WARNING,
183-
f"Empty response for Wikipedia article: {article_title}. Skipping article.",
191+
f"Empty response for Wikipedia article: "
192+
f"{article_title}. Skipping article.",
184193
)
185194
break
186195

@@ -200,14 +209,16 @@ def fetch_article(self, json_item, index=None, total=None):
200209
if attempt < max_retries - 1:
201210
logging.log(
202211
logging.WARNING,
203-
f"Request failed for {article_title}: {e}, retrying in {retry_delay}s",
212+
f"Request failed for {article_title}: "
213+
f"{e}, retrying in {retry_delay}s",
204214
)
205215
time.sleep(retry_delay)
206216
retry_delay *= 2
207217
else:
208218
logging.log(
209219
logging.ERROR,
210-
f"Failed to fetch {article_title} after {max_retries} attempts: {e}. Skipping article.",
220+
f"Failed to fetch {article_title}"
221+
f" after {max_retries} attempts: {e}. Skipping article.",
211222
)
212223
if not success and "wp_en" in json_item:
213224
logging.log(

0 commit comments

Comments
 (0)