Skip to content

Commit d7c9f22

Browse files
authored
Merge pull request #32 from OrianeLanfranchi/tag-documents
Tag documents
2 parents 0ae5e1b + 9d7d40a commit d7c9f22

9 files changed

Lines changed: 19 additions & 106 deletions

File tree

src/docs2vecs/subcommands/indexer/config/config_schema.yaml

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,9 @@ definitions:
1818
type: dict
1919
required: False
2020
schema:
21+
tag:
22+
type: string
23+
required: False
2124
api_url:
2225
type: string
2326
regex: '^http.*'
@@ -119,6 +122,9 @@ definitions:
119122
field_mapping:
120123
type: dict
121124
schema:
125+
tag:
126+
type: string
127+
required: False
122128
document_id:
123129
type: string
124130
required: False

src/docs2vecs/subcommands/indexer/document/chunk.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ def FromDict(dict):
44
chunk = Chunk()
55
chunk.document_id = dict["document_id"]
66
chunk.document_name = dict["document_name"]
7+
chunk.tag = dict["tag"]
78
chunk.content = dict["content"]
89
chunk.chunk_id = dict["chunk_id"]
910
chunk.source_link = dict["source_link"]
@@ -13,6 +14,7 @@ def FromDict(dict):
1314
def __init__(self):
1415
self.document_id = None
1516
self.document_name = None
17+
self.tag = None
1618
self.content = None
1719
self.chunk_id = None
1820
self.source_link = None
@@ -33,6 +35,7 @@ def to_dict(self):
3335
return {
3436
"document_id": self.document_id,
3537
"document_name": self.document_name,
38+
"tag": self.tag,
3639
"content": self.content,
3740
"chunk_id": self.chunk_id,
3841
"source_link": self.source_link,

src/docs2vecs/subcommands/indexer/document/document.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,10 @@
22

33

44
class Document:
5-
def __init__(self, filename: str, source_url: str = "", text: str = ""):
5+
def __init__(self, filename: str, source_url: str = "", tag: str = "", text: str = ""):
66
self.filename: str = filename
77
self.source_url: str = source_url
8+
self.tag = tag
89
self.text: str = text
910
self.chunks: set[Chunk] = set()
1011

src/docs2vecs/subcommands/indexer/skills/chromadb_vector_store_skill.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
2929
ids=[chunk.chunk_id for chunk in doc.chunks],
3030
embeddings=[chunk.embedding for chunk in doc.chunks],
3131
documents=[chunk.content for chunk in doc.chunks],
32-
metadatas=[{"source": chunk.source_link} for chunk in doc.chunks],
32+
metadatas=[{"source": chunk.source_link, "tags": doc.tag} for chunk in doc.chunks],
3333
)
3434

3535
return input

src/docs2vecs/subcommands/indexer/skills/default_file_reader.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ def run(self, documents: Optional[List[Document]]) -> List[Document]:
5858
result = []
5959
for doc in documents:
6060
file_path = Path(doc.filename)
61+
file_tag = doc.tag
6162
if not file_path.exists():
6263
self.logger.info(f"File not found: {file_path}")
6364
continue
@@ -71,6 +72,8 @@ def run(self, documents: Optional[List[Document]]) -> List[Document]:
7172

7273
try:
7374
loaded_docs = handler(file_path)
75+
for loaded_doc in loaded_docs:
76+
loaded_doc.tag = file_tag
7477
result.extend(loaded_docs)
7578
self.logger.info(f"Successfully read file: {file_path}")
7679
except Exception as e:

src/docs2vecs/subcommands/indexer/skills/file_scanner_skill.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@ def __init__(self, skill_config: dict, global_config: Config) -> None:
2323
self._path = Path(self._config["path"]).expanduser().resolve()
2424
self._recursive = self._config.get("recursive", False)
2525
self._filter = self._config.get("filter", [])
26+
self.tag = self._config.get("tag", "default")
2627

2728
def run(self, documents: Optional[List[Document]]) -> List[Document]:
2829
"""Scan directory and return list of Documents with file paths.
@@ -51,7 +52,7 @@ def run(self, documents: Optional[List[Document]]) -> List[Document]:
5152
# Keep if matches any include pattern
5253
if not self._filter or any(fnmatch.fnmatch(file_path.name, pattern) for pattern in self._filter):
5354
# Add file as document
54-
doc = Document(filename=file_path)
55+
doc = Document(filename=file_path, tag=self.tag)
5556
result.append(doc)
5657

5758
for doc in result:

src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
4242
chunk = Chunk()
4343
chunk.document_id = hashlib.sha256(text.encode()).hexdigest()
4444
chunk.document_name = Path(doc.filename).name
45+
chunk.tag = doc.tag
4546
chunk.content = text
4647
chunk.chunk_id = chunk.document_id
4748
chunk.source_link = doc.source_url

src/docs2vecs/subcommands/indexer/skills/semantic_splitter_skill.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
5353
chunk = Chunk()
5454
chunk.document_id = hashlib.sha256(text.encode()).hexdigest()
5555
chunk.document_name = Path(doc.filename).name
56+
chunk.tag = doc.tag
5657
chunk.content = text
5758
chunk.chunk_id = node.id_
5859
chunk.source_link = doc.source_url

src/logs/indexer_skills.log

Lines changed: 0 additions & 103 deletions
This file was deleted.

0 commit comments

Comments
 (0)