-updated other needed flags

Stanoja · Stanoja · commit 455b4ccd899a · 2026-04-06T20:54:59.000+02:00
-added data fetch for needed ids and meta from the sources
-added test scripts
diff --git a/test-001.sh b/test-001.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Test 001: categorize 500 Wikidata math items with --fetch (entity data + external IDs)
+cd "$(dirname "$0")/web" || exit 1
+
+session_name="test-001-$(date +%Y%m%d)"
+
+python manage.py categorize \
+  --source=Wd \
+  --domain=math \
+  --limit=500 \
+  --fetch \
+  --session-name="$session_name"
diff --git a/test-002.sh b/test-002.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Test 002: categorize 500 Wikidata math items without --fetch (no external IDs)
+cd "$(dirname "$0")/web" || exit 1
+
+session_name="test-002-$(date +%Y%m%d)"
+
+python manage.py categorize \
+  --source=Wd \
+  --domain=math \
+  --limit=500 \
+  --session-name="$session_name"
diff --git a/test-003.sh b/test-003.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Test 003: categorize 500 Wikidata physics items (no --fetch)
+cd "$(dirname "$0")/web" || exit 1
+
+session_name="test-003-$(date +%Y%m%d)"
+
+python manage.py categorize \
+  --source=Wd \
+  --domain=phys \
+  --limit=500 \
+  --session-name="$session_name"
diff --git a/web/categorizer/categorizer_service.py b/web/categorizer/categorizer_service.py
@@ -8,6 +8,7 @@
     parse_categorization_result,
     parse_categorization_result_with_reasoning,
 )
+from categorizer.wikidata_fetch_service import WikidataFetchService
 from concepts.models import CategorizerResult, Item
 
 # Free LLM types to use for categorization
@@ -39,9 +40,16 @@ class CategorizerService:
     def __init__(self):
         self.logger = logging.getLogger(__name__)
         self.llm_service = LLMService()
+        self.wikidata_fetch_service = WikidataFetchService()
 
     def categorize_items(
-        self, limit=None, judge_pool="low", domain=None, source=None, session_name=None
+        self,
+        limit=None,
+        judge_pool="low",
+        domain=None,
+        source=None,
+        fetch=False,
+        session_name=None,
     ):
         """
         Categorize items from the database using all free LLM types.
@@ -90,14 +98,19 @@ def categorize_items(
             f"{len(pool)} LLMs)"
         )
 
+        use_other_ids = fetch
+
         total_start = time.perf_counter()
         for i, item in enumerate(items_to_process):
             self.logger.info(f"Processing item {i + 1}/{to_process}: {item.identifier}")
+            if fetch:
+                self.wikidata_fetch_service.fetch_and_store_meta(item)
             self.categorize_item(
                 item,
                 pool=pool,
                 session_name=session_name,
                 judge_pool=judge_pool,
+                use_other_ids=use_other_ids,
             )
 
         total_elapsed = time.perf_counter() - total_start
@@ -112,6 +125,7 @@ def categorize_item(
         pool=None,
         session_name=None,
         judge_pool="low",
+        use_other_ids=True,
     ):
         """
         Categorize a single item using all free LLM types.
@@ -123,6 +137,7 @@ def categorize_item(
             pool: List of LLMType to use (defaults to LLM_JUDGE_POOL)
             session_name: Optional session name to tag results
             judge_pool: Which pool tier ("low", "local", or "high")
+            use_other_ids: Include external IDs from meta in prompt
 
         Returns:
             List of categorization results from all LLMs
@@ -135,7 +150,7 @@ def categorize_item(
         self.logger.debug(f"Categorizing: {item.name}")
 
         prompt = build_categorization_prompt(
-            item, predicate, with_reasoning=use_reasoning
+            item, predicate, with_reasoning=use_reasoning, use_other_ids=use_other_ids
         )
 
         results = []
diff --git a/web/categorizer/management/commands/categorize.py b/web/categorizer/management/commands/categorize.py
@@ -35,6 +35,12 @@ def add_arguments(self, parser):
             choices=Item.Source.values,
             help="Filter by source (e.g. Wd, nL, MW, PW, EoM, WpEN, AUm)",
         )
+        parser.add_argument(
+            "--fetch",
+            action="store_true",
+            default=False,
+            help="Fetch entity data from the source API if missing in item.meta",
+        )
         parser.add_argument(
             "--session-name",
             type=str,
@@ -47,6 +53,7 @@ def handle(self, *args, **options):
         judge_pool = options.get("judge_pool")
         domain = options.get("domain")
         source = options.get("source")
+        fetch = options.get("fetch")
         session_name = options.get("session_name")
 
         service = CategorizerService()
@@ -69,6 +76,7 @@ def handle(self, *args, **options):
                 judge_pool=judge_pool,
                 domain=domain,
                 source=source,
+                fetch=fetch,
                 session_name=session_name,
             )
             self.stdout.write(self.style.SUCCESS("Categorization complete!"))
diff --git a/web/categorizer/prompts.py b/web/categorizer/prompts.py
@@ -12,6 +12,13 @@
 SYSTEM_PROMPT_WITH_REASONING = """You are a categorization judge. Your task is to
          evaluate whether a given concept satisfies a specific predicate.
 
+Be careful with concepts from adjacent domains such as physics, computer science,
+or engineering. A concept should only be classified as mathematical if it is
+primarily mathematical in nature. Concepts that merely use mathematics as a tool
+(e.g. quantum mechanics, signal processing) should not be considered mathematical
+concepts. When in doubt, consider whether the concept originates from or is
+primarily studied within mathematics.
+
 You must respond with a structured answer containing:
 1. answer: yes or no
 2. confidence: a number from 0 to 100 (representing your confidence percentage)
@@ -24,14 +31,18 @@
 """
 
 
-def build_categorization_prompt(item, predicate, with_reasoning=False):
+def build_categorization_prompt(
+    item, predicate, with_reasoning=False, use_other_ids=True
+):
     """
     Build a prompt for evaluating a concept against a predicate.
 
     Args:
         item: Item instance to categorize
         predicate: The question/predicate to evaluate
         with_reasoning: If True, ask for reasoning in the response
+        use_other_ids: If True, include external IDs from item.meta
+            (only applies to Wikidata items)
 
     Returns:
         Formatted prompt string
@@ -54,6 +65,11 @@ def build_categorization_prompt(item, predicate, with_reasoning=False):
         article_text = item.article_text[:1000]
         item_info_parts.append(f"Article text: {article_text}")
 
+    if use_other_ids:
+        other_ids = _get_other_ids(item)
+        if other_ids:
+            item_info_parts.append(f"External IDs: {other_ids}")
+
     item_info = "\n".join(item_info_parts)
 
     prompt = f"""{system_prompt}
@@ -73,3 +89,33 @@ def build_categorization_prompt(item, predicate, with_reasoning=False):
 Please provide your evaluation in the format specified above."""
 
     return prompt
+
+
+_OTHER_ID_KEYS = {  # item.meta key -> label shown in the prompt
+    "mathworld_id": "MathWorld ID",
+    "nlab_id": "nLab ID",
+    "proofwiki_id": "ProofWiki ID",
+    "eom_id": "Encyclopedia of Mathematics ID",
+}
+
+
+def _get_other_ids(item):
+    """Return the item's external IDs as "Label: value, ..." text, or None.
+
+    None is returned for non-Wikidata items, for empty, unparseable or
+    non-object ``item.meta``, and when no ``_OTHER_ID_KEYS`` key has a value.
+    """
+    import json
+
+    from concepts.models import Item
+
+    if item.source != Item.Source.WIKIDATA or not item.meta:
+        return None
+    try:
+        meta = json.loads(item.meta)
+    except (ValueError, TypeError):  # ValueError covers json.JSONDecodeError
+        return None
+    if not isinstance(meta, dict):  # valid JSON may be a list/scalar without .get()
+        return None
+    pairs = ((label, meta.get(key)) for key, label in _OTHER_ID_KEYS.items())
+    return ", ".join(f"{label}: {value}" for label, value in pairs if value) or None
diff --git a/web/categorizer/tests.py b/web/categorizer/tests.py
@@ -0,0 +1,54 @@
+import json
+
+from categorizer.wikidata_fetch_service import WikidataFetchService
+from django.test import TestCase
+
+
+class WikidataFetchServiceTest(TestCase):
+    """
+    Live test: calls the Wikidata API through WikidataFetchService.fetch_entity.
+    Run with: python manage.py test categorizer.tests.WikidataFetchServiceTest
+    """
+
+    def setUp(self):
+        self.service = WikidataFetchService()
+
+    def test_fetch_entity_q2261345(self):
+        """Fetch Q2261345, check its top-level sections, and dump it to stdout."""
+        qid = "Q2261345"
+        banner = "=" * 60
+        entity = self.service.fetch_entity(qid)
+
+        self.assertIsNotNone(entity, "Expected entity data, got None")
+
+        # The sections read below must all be present in the response.
+        for section in ("labels", "descriptions", "claims", "sitelinks"):
+            self.assertIn(section, entity)
+
+        # Everything from here on is console output for manual inspection.
+        print(f"\n{banner}")
+        print(f"Entity: {qid}")
+        print(banner)
+
+        en_label = entity["labels"].get("en", {}).get("value")
+        print(f"Label: {en_label}")
+
+        en_description = entity["descriptions"].get("en", {}).get("value")
+        print(f"Description: {en_description}")
+
+        claims_by_property = entity["claims"]
+        print(f"\nClaims ({len(claims_by_property)} properties):")
+        for prop_id, statements in claims_by_property.items():
+            values = [
+                claim.get("mainsnak", {}).get("datavalue", {}).get("value", "N/A")
+                for claim in statements
+            ]
+            print(f"  {prop_id}: {values}")
+
+        sitelinks = entity["sitelinks"]
+        print(f"\nSitelinks ({len(sitelinks)} wikis):")
+        for site, link in sitelinks.items():
+            print(f"  {site}: {link['title']}")
+
+        print(f"\n{banner}")
+        print(f"Full JSON:\n{json.dumps(entity, indent=2)[:3000]}")
diff --git a/web/categorizer/wikidata_fetch_service.py b/web/categorizer/wikidata_fetch_service.py