Commit 19f8e44

Update dependencies. Add optional re-ranker cutoff. Improve error message when doc path doesn't exist

1 parent fd3466b

File tree: 7 files changed (+1304, -1644 lines)

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -147,3 +147,4 @@ dmypy.json

 # Pyre type checker
 .pyre/
+.github/

docs/configure_model.rst

Lines changed: 1 addition & 7 deletions

@@ -20,12 +20,6 @@ An example of OpenAI gpt4o-mini is shown below:
    :language: yaml


-llamacpp
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. literalinclude:: ../sample_templates/llm/llamacpp.yaml
-   :language: yaml
-
 Ollama + Litellm
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -46,4 +40,4 @@ Reference

 .. automodule:: llmsearch.models.config
    :members:
-
+

sample_templates/generic/config_template.yaml

Lines changed: 2 additions & 0 deletions

@@ -75,6 +75,8 @@ semantic_search:

   query_prefix: "query: " # Often queries have to be prefixed for embedding models, such as e5

+  score_cutoff: -3.0 # Optional reranker score cutoff. Documents below this score will be excluded from the returned document list
+
   hyde:
     enabled: False
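
The new score_cutoff key is optional: when it is omitted, no documents are filtered on reranker score. A minimal sketch of reading the key from a config fragment (this assumes PyYAML and uses an illustrative fragment, not the project's actual loader):

import yaml  # PyYAML, assumed here for illustration

fragment = """
semantic_search:
  query_prefix: "query: "
  score_cutoff: -3.0
"""

cfg = yaml.safe_load(fragment)
# .get() returns None when the key is absent, i.e. no cutoff is applied.
cutoff = cfg["semantic_search"].get("score_cutoff")
print(cutoff)  # -3.0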

sample_templates/llm/litellm.yaml

Lines changed: 1 addition & 1 deletion

@@ -16,4 +16,4 @@ llm:
   temperature: 0.2
   model: "any"
   api_key: "any"
-  base_url: "http://0.0.0.0:8000"
+  base_url: "http://0.0.0.0:8000"

src/llmsearch/config.py

Lines changed: 12 additions & 1 deletion

@@ -128,6 +128,14 @@ def validate_extension(cls, value):
             )
         return value

+    @field_validator("doc_path")
+    def validate_path(cls, value):
+        path = Path(value)
+        if not path.exists():
+            raise TypeError("Provided doc_path doesn't exist.")
+        if not path.is_dir():
+            raise TypeError("Provided doc_path is not a directory.")
+        return value

 class EmbedddingsSpladeConfig(BaseModel):
     n_batch: int = 3

@@ -274,7 +282,10 @@ class SemanticSearchConfig(BaseModel):
     max_k: int = 15
     """Maximum number of documents to retrieve for dense OR sparse embedding (if using both, number of documents will be k*2)"""

-    max_char_size: int = 2048
+    score_cutoff: Optional[float] = None
+    """Documents with score less than specified will be excluded from relevant documents"""
+
+    max_char_size: int = 16384
     """Maximum character size for query + documents to fit into context window of LLM."""

     query_prefix: str = ""
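
The config change adds a doc_path validator with pydantic's field_validator (a pydantic v2 API, matching the existing validate_extension validator), so a missing or non-directory path is rejected when the configuration is loaded rather than failing later during indexing, and it exposes score_cutoff as an optional float. A self-contained sketch of the same pattern; the model name and the standalone usage are illustrative, not llmsearch's actual config classes:

from pathlib import Path
from typing import Optional

from pydantic import BaseModel, field_validator


class DocPathSketch(BaseModel):
    """Illustrative stand-in for the llmsearch config model."""

    doc_path: str
    score_cutoff: Optional[float] = None  # None disables reranker score filtering
    max_char_size: int = 16384

    @field_validator("doc_path")
    def validate_path(cls, value):
        # Same checks as in the commit: the path must exist and be a directory.
        path = Path(value)
        if not path.exists():
            raise TypeError("Provided doc_path doesn't exist.")
        if not path.is_dir():
            raise TypeError("Provided doc_path is not a directory.")
        return value


try:
    DocPathSketch(doc_path="/no/such/folder")
except Exception as err:  # the exact exception type surfaced depends on the pydantic version
    print(f"config rejected: {err}")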

src/llmsearch/ranking.py

Lines changed: 6 additions & 0 deletions

@@ -2,6 +2,7 @@
 # from llmsearch.utils import LLMBundle
 from typing import List, Tuple

+from sentence_transformers.util import semantic_search
 import torch
 from loguru import logger
 from sentence_transformers.cross_encoder import CrossEncoder

@@ -183,6 +184,11 @@ def get_relevant_documents(
     len_ = 0

     for doc in docs:
+        # Skip document with lower than cutoff score, if specified
+        if config.score_cutoff is not None and doc.metadata['score'] < config.score_cutoff:
+            logger.info(f"Skipping document {doc.metadata['document_id']} with score: {doc.metadata['score']}")
+            continue
+        # if doc.metadata['score']
         doc_length = len(doc.page_content)
         if len_ + doc_length < config.max_char_size - offset_max_chars:
             most_relevant_docs.append(doc)
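
The cutoff is applied before the character-budget check, so low-scoring documents never consume context-window space. A standalone sketch of that selection loop with stand-in types; SimpleDoc and select_docs are illustrative names, and the real get_relevant_documents does more than shown here:

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class SimpleDoc:
    """Stand-in for a retrieved document carrying a reranker score in metadata."""

    page_content: str
    metadata: dict = field(default_factory=dict)


def select_docs(
    docs: List[SimpleDoc],
    max_char_size: int,
    score_cutoff: Optional[float] = None,
    offset_max_chars: int = 0,
) -> List[SimpleDoc]:
    selected: List[SimpleDoc] = []
    used = 0
    for doc in docs:
        # Skip documents whose reranker score falls below the optional cutoff.
        if score_cutoff is not None and doc.metadata["score"] < score_cutoff:
            continue
        # Keep documents while they still fit into the character budget.
        if used + len(doc.page_content) < max_char_size - offset_max_chars:
            selected.append(doc)
            used += len(doc.page_content)
    return selected


docs = [
    SimpleDoc("strong match", {"score": 2.1, "document_id": "a"}),
    SimpleDoc("weak match", {"score": -5.0, "document_id": "b"}),
]
print([d.metadata["document_id"] for d in select_docs(docs, max_char_size=16384, score_cutoff=-3.0)])
# -> ['a']; with score_cutoff=None both documents would be kept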

uv.lock

Lines changed: 1281 additions & 1635 deletions
Some generated files are not rendered by default.
