Merge pull request #32 from OrianeLanfranchi/tag-documents

wherka-ama · web-flow · commit d7c9f22398d2 · 2025-04-16T11:18:05.000+02:00
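Tag documents: add an optional `tag` setting to the indexer configuration (the file scanner falls back to "default" when it is not set) and propagate it from scanned files through documents and chunks into the ChromaDB metadata.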
diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
@@ -18,6 +18,9 @@ definitions:
             type: dict
             required: False
             schema:
+              tag:
+                type: string
+                required: False
               api_url:
                 type: string
                 regex: '^http.*'
@@ -119,6 +122,9 @@ definitions:
               field_mapping:
                 type: dict
                 schema:
+                  tag:
+                    type: string
+                    required: False
                   document_id:
                     type: string
                     required: False
diff --git a/src/docs2vecs/subcommands/indexer/document/chunk.py b/src/docs2vecs/subcommands/indexer/document/chunk.py
@@ -4,6 +4,7 @@ def FromDict(dict):
         chunk = Chunk()
         chunk.document_id = dict["document_id"]
         chunk.document_name = dict["document_name"]
+        chunk.tag = dict["tag"]
         chunk.content = dict["content"]
         chunk.chunk_id = dict["chunk_id"]
         chunk.source_link = dict["source_link"]
@@ -13,6 +14,7 @@ def FromDict(dict):
     def __init__(self):
         self.document_id = None
         self.document_name = None
+        self.tag = None
         self.content = None
         self.chunk_id = None
         self.source_link = None
@@ -33,6 +35,7 @@ def to_dict(self):
         return {
             "document_id": self.document_id,
             "document_name": self.document_name,
+            "tag": self.tag,
             "content": self.content,
             "chunk_id": self.chunk_id,
             "source_link": self.source_link,
diff --git a/src/docs2vecs/subcommands/indexer/document/document.py b/src/docs2vecs/subcommands/indexer/document/document.py
@@ -2,9 +2,10 @@
 
 
 class Document:
-    def __init__(self, filename: str, source_url: str = "", text: str = ""):
+    def __init__(self, filename: str, source_url: str = "", tag: str = "", text: str = ""):
         self.filename: str = filename
         self.source_url: str = source_url
+        self.tag = tag
         self.text: str = text
         self.chunks: set[Chunk] = set()
 
diff --git a/src/docs2vecs/subcommands/indexer/skills/chromadb_vector_store_skill.py b/src/docs2vecs/subcommands/indexer/skills/chromadb_vector_store_skill.py
@@ -29,7 +29,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
                 ids=[chunk.chunk_id for chunk in doc.chunks],
                 embeddings=[chunk.embedding for chunk in doc.chunks],
                 documents=[chunk.content for chunk in doc.chunks],
-                metadatas=[{"source": chunk.source_link} for chunk in doc.chunks],
+                metadatas=[{"source": chunk.source_link, "tags": doc.tag} for chunk in doc.chunks],
             )
 
         return input
diff --git a/src/docs2vecs/subcommands/indexer/skills/default_file_reader.py b/src/docs2vecs/subcommands/indexer/skills/default_file_reader.py
@@ -58,6 +58,7 @@ def run(self, documents: Optional[List[Document]]) -> List[Document]:
         result = []
         for doc in documents:
             file_path = Path(doc.filename)
+            file_tag = doc.tag
             if not file_path.exists():
                 self.logger.info(f"File not found: {file_path}")
                 continue
@@ -71,6 +72,8 @@ def run(self, documents: Optional[List[Document]]) -> List[Document]:
 
             try:
                 loaded_docs = handler(file_path)
+                for loaded_doc in loaded_docs:
+                    loaded_doc.tag = file_tag
                 result.extend(loaded_docs)
                 self.logger.info(f"Successfully read file: {file_path}")
             except Exception as e:
diff --git a/src/docs2vecs/subcommands/indexer/skills/file_scanner_skill.py b/src/docs2vecs/subcommands/indexer/skills/file_scanner_skill.py
@@ -23,6 +23,7 @@ def __init__(self, skill_config: dict, global_config: Config) -> None:
         self._path = Path(self._config["path"]).expanduser().resolve()
         self._recursive = self._config.get("recursive", False)
         self._filter = self._config.get("filter", [])
+        self.tag = self._config.get("tag", "default")
 
     def run(self, documents: Optional[List[Document]]) -> List[Document]:
         """Scan directory and return list of Documents with file paths.
@@ -51,7 +52,7 @@ def run(self, documents: Optional[List[Document]]) -> List[Document]:
             # Keep if matches any include pattern
             if not self._filter or any(fnmatch.fnmatch(file_path.name, pattern) for pattern in self._filter):
                 # Add file as document
-                doc = Document(filename=file_path)
+                doc = Document(filename=file_path, tag=self.tag)
                 result.append(doc)
 
         for doc in result:
diff --git a/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py b/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py
@@ -42,6 +42,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
                 chunk = Chunk()
                 chunk.document_id = hashlib.sha256(text.encode()).hexdigest()
                 chunk.document_name = Path(doc.filename).name
+                chunk.tag = doc.tag
                 chunk.content = text
                 chunk.chunk_id = chunk.document_id
                 chunk.source_link = doc.source_url
diff --git a/src/docs2vecs/subcommands/indexer/skills/semantic_splitter_skill.py b/src/docs2vecs/subcommands/indexer/skills/semantic_splitter_skill.py
@@ -53,6 +53,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
                 chunk = Chunk()
                 chunk.document_id = hashlib.sha256(text.encode()).hexdigest()
                 chunk.document_name = Path(doc.filename).name
+                chunk.tag = doc.tag
                 chunk.content = text
                 chunk.chunk_id = node.id_
                 chunk.source_link = doc.source_url
diff --git a/src/logs/indexer_skills.log b/src/logs/indexer_skills.log

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -29,7 +29,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:` |
| `29` | `29` | `ids=[chunk.chunk_id for chunk in doc.chunks],` |
| `30` | `30` | `embeddings=[chunk.embedding for chunk in doc.chunks],` |
| `31` | `31` | `documents=[chunk.content for chunk in doc.chunks],` |
| `32` | | `- metadatas=[{"source": chunk.source_link} for chunk in doc.chunks],` |
| | `32` | `+ metadatas=[{"source": chunk.source_link, "tags": doc.tag} for chunk in doc.chunks],` |
| `33` | `33` | `)` |
| `34` | `34` | |
| `35` | `35` | `return input` |