1- from app .schemas .classification_schemas import ExtractedFile
2- from uuid import UUID
31import hdbscan
42import numpy as np
5- from app .core .litellm import EmbeddingModelType , LLMClient
6- from app .services .classification_service import ClassificationService
73from sklearn .preprocessing import normalize
84
5+ from app .core .litellm import LLMClient
6+ from app .schemas .classification_schemas import ExtractedFile
7+
98
109async def create_classifications (
1110 extracted_files : list [ExtractedFile ],
@@ -26,19 +25,21 @@ async def create_classifications(
2625 valid_files .append (file )
2726
2827 if len (embeddings ) < 3 :
29- print (f"Not enough files for clustering ({ len (embeddings )} ), returning initial classifications" )
28+ print (
29+ f"Not enough files for clustering ({ len (embeddings )} ), returning initial classifications"
30+ )
3031 return initialClassifications
31-
32+
3233 embeddings_array = np .array (embeddings )
3334
3435 # Normalize embeddings so that cosine similarity ≈ euclidean distance
35- normalized_embeddings = normalize (embeddings_array ) # L2 normalization
36+ normalized_embeddings = normalize (embeddings_array ) # L2 normalization
3637
3738 clusterer = hdbscan .HDBSCAN (
38- min_cluster_size = 2 ,
39- min_samples = 1 ,
40- metric = ' euclidean' ,
41- cluster_selection_method = ' eom'
39+ min_cluster_size = 2 ,
40+ min_samples = 1 ,
41+ metric = " euclidean" ,
42+ cluster_selection_method = " eom" ,
4243 )
4344
4445 cluster_labels = clusterer .fit_predict (normalized_embeddings )
@@ -55,29 +56,28 @@ async def create_classifications(
5556 outliers = clusters .pop (- 1 , []) # Remove -1 cluster if it exists
5657 print (f"Found { len (clusters )} clusters, { len (outliers )} outliers" )
5758
58-
5959 client = LLMClient ()
6060 classification_names = []
6161
6262 for cluster_id , files_in_cluster in clusters .items ():
6363 print (f"Analyzing cluster { cluster_id } with { len (files_in_cluster )} files..." )
64-
64+
6565 # Get sample documents from cluster (up to 5 for context)
6666 sample_texts = []
6767 for file in files_in_cluster [:5 ]:
6868 text = _extract_text_from_file (file )
6969 sample_texts .append (text [:500 ]) # Limit text length
70-
70+
7171 # Use LLM to name the cluster
7272 prompt = f"""Analyze these similar documents and provide a single, concise classification name.
7373
7474 Sample documents from this cluster:
7575
76- { chr (10 ).join (f"Document { i + 1 } : { text } " for i , text in enumerate (sample_texts ))}
76+ { chr (10 ).join (f"Document { i + 1 } : { text } " for i , text in enumerate (sample_texts ))}
7777
78- What type of documents are these? Respond with ONLY the category name (e.g., "Invoice", "Purchase Order", "Quote").
78+ What type of documents are these? Respond with ONLY the category name.
7979 Do not include any explanation or punctuation."""
80-
80+
8181 try :
8282 response = await client .chat (prompt , temperature = 0.3 , max_tokens = 50 )
8383 category_name = response .choices [0 ].message .content .strip ()
@@ -101,7 +101,7 @@ async def create_classifications(
101101 { text }
102102
103103 Respond with ONLY the category name."""
104-
104+
105105 try :
106106 response = await client .chat (prompt , temperature = 0.3 , max_tokens = 50 )
107107 category_name = response .choices [0 ].message .content .strip ()
@@ -117,34 +117,32 @@ async def create_classifications(
117117 fallback_name = f"Document Type Outlier { i } "
118118 print (f" → Outlier named: { fallback_name } " )
119119
120-
121120 all_classifications = classification_names + initialClassifications
122- final_classifications = list (set (all_classifications ))
123-
121+ final_classifications = list (set (all_classifications ))
122+
124123 print (f"Final classifications: { final_classifications } " )
125124 return final_classifications
126125
127126
def _extract_text_from_file(file: ExtractedFile) -> str:
    """Build a flat, human-readable text summary of an extracted file.

    Used to feed a short textual representation of the file into the
    clustering / LLM-naming pipeline. Only scalar values are included;
    nested dicts and lists inside ``extracted_data`` are omitted.
    """
    segments: list[str] = []

    # Lead with the filename when one is present (empty/None names are skipped).
    if file.name:
        segments.append(f"Filename: {file.name}")

    data = file.extracted_data
    if isinstance(data, dict):
        # Keep only scalar fields; nested structures are deliberately dropped.
        segments.extend(
            f"{key}: {value}"
            for key, value in data.items()
            if not isinstance(value, (dict, list))
        )
    elif isinstance(data, list):
        # Preview at most the first five items to keep the summary short.
        preview = ", ".join(str(item) for item in data[:5])
        segments.append(f"Items: {preview}")
    else:
        # Fallback: stringify whatever scalar payload we were given.
        segments.append(str(data))

    return " ".join(segments)
0 commit comments