Skip to content

Commit bbb2c6c

Browse files
authored
Merge pull request AmadeusITGroup#63 from martipath/new_skills
Added json_writer_skill
2 parents 47d5c26 + acd931c commit bbb2c6c

4 files changed

Lines changed: 80 additions & 1 deletion

File tree

src/docs2vecs/subcommands/indexer/config/config_schema.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ definitions:
1010
schema:
1111
type:
1212
type: string
13-
allowed: ['exporter', 'embedding', 'vector-store', 'uploader', 'splitter', 'integrated_vec', 'file-scanner', 'file-reader', 'loader']
13+
allowed: ['exporter', 'embedding', 'vector-store', 'uploader', 'splitter', 'integrated_vec', 'file-scanner', 'file-reader', 'loader', 'writer']
1414
required: True
1515
name:
1616
type: string
@@ -105,6 +105,10 @@ definitions:
105105
type: integer
106106
required: False
107107
min: 0
108+
# JSONWriterSkill params
109+
output_path:
110+
type: string
111+
required: False
108112
# ConfluenceFAQSplitter params
109113
min_heading_level:
110114
type: integer

src/docs2vecs/subcommands/indexer/skills/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .faiss_vector_store_skill import FaissVectorStoreSkill
1616
from .teams_qna_loader_skill import TeamsQnALoaderSkill
1717
from .confluence_faq_splitter_skill import ConfluenceFAQSplitter
18+
from .json_writer_skill import JSONWriterSkill
1819

1920

2021
__all__ = [
@@ -35,4 +36,5 @@
3536
"FaissVectorStoreSkill",
3637
"TeamsQnALoaderSkill",
3738
"ConfluenceFAQSplitter",
39+
"JSONWriterSkill",
3840
]

src/docs2vecs/subcommands/indexer/skills/factory.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from docs2vecs.subcommands.indexer.skills import FaissVectorStoreSkill
1919
from docs2vecs.subcommands.indexer.skills.confluence_faq_splitter_skill import ConfluenceFAQSplitter
2020
from docs2vecs.subcommands.indexer.skills.teams_qna_loader_skill import TeamsQnALoaderSkill
21+
from docs2vecs.subcommands.indexer.skills.json_writer_skill import JSONWriterSkill
2122

2223

2324
class SkillType(StrEnum):
@@ -29,6 +30,7 @@ class SkillType(StrEnum):
2930
UPLOADER = "uploader"
3031
SPLITTER = "splitter"
3132
LOADER = "loader"
33+
WRITER = "writer"
3234

3335

3436
class AvailableSkillName(StrEnum):
@@ -63,6 +65,9 @@ class AvailableSkillName(StrEnum):
6365
JIRA_LOADER = "jira-loader"
6466
TEAMS_QNA_LOADER = "teams-qna-loader"
6567

68+
# writers
69+
JSON_WRITER = "json-writer"
70+
6671

6772
AVAILABLE_SKILLS = {
6873
SkillType.EXPORTER: {
@@ -92,6 +97,9 @@ class AvailableSkillName(StrEnum):
9297
AvailableSkillName.JIRA_LOADER: JiraLoaderSkill,
9398
AvailableSkillName.TEAMS_QNA_LOADER: TeamsQnALoaderSkill,
9499
},
100+
SkillType.WRITER: {
101+
AvailableSkillName.JSON_WRITER: JSONWriterSkill,
102+
},
95103
}
96104

97105

src/docs2vecs/subcommands/indexer/skills/json_writer_skill.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""Skill that extracts chunk content from Documents and writes it to a JSON file.
2+
3+
Use this skill at any point in a pipeline to capture intermediate state,
4+
e.g. after a splitter, so the output can be checksummed for change detection
5+
without running expensive downstream skills like embedding and indexing.
6+
7+
Only the chunk text content is written as a sorted JSON array of strings —
8+
volatile metadata like filenames, document IDs, and timestamps are excluded
9+
so the checksum remains stable when the underlying text hasn't changed.
10+
"""
11+
12+
import json
13+
import os
14+
from typing import List, Optional
15+
16+
from docs2vecs.subcommands.indexer.config.config import Config
17+
from docs2vecs.subcommands.indexer.document import Document
18+
from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
19+
20+
21+
class JSONWriterSkill(IndexerSkill):
    """Dump the text of every non-empty chunk to a JSON file and pass documents on.

    The file holds a single flat JSON array of strings, one entry per chunk
    whose ``content`` is truthy, ordered with the default string sort so that
    identical text always yields an identical file (useful for checksumming).
    The documents themselves are not modified and are handed back to the
    caller for any downstream skill.

    Config params:
        output_path (str): Destination of the JSON file. Falls back to
                           ``data/pipeline_output.json`` when not configured.
                           Missing parent directories are created on write.
    """

    def __init__(self, skill_config: dict, global_config: Config) -> None:
        """Initialise the base skill and resolve the output file location."""
        super().__init__(skill_config, global_config)
        # ``self._config`` is provided by the base class initialiser.
        self._output_path = self._config.get("output_path", "data/pipeline_output.json")

    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
        """Write the sorted chunk texts of *input* to the configured JSON file.

        Args:
            input: Documents whose chunks should be exported. ``None`` or an
                empty collection means there is nothing to export.

        Returns:
            The same ``input`` object when it is non-empty; otherwise an empty
            list. No file is touched in the empty case.
        """
        if input:
            # Keep only the text of chunks that actually carry content, and
            # sort it so the resulting file is deterministic.
            texts = sorted(
                piece.content
                for document in input
                for piece in document.chunks
                if piece.content
            )

            # A bare filename has no directory component; use the CWD then.
            target_dir = os.path.dirname(self._output_path) or "."
            os.makedirs(target_dir, exist_ok=True)

            with open(self._output_path, "w", encoding="utf-8") as handle:
                json.dump(texts, handle, indent=2, ensure_ascii=False)

            self.logger.info(
                "Wrote %d chunk content entries to %s",
                len(texts),
                self._output_path,
            )

            # Pass-through: the documents remain available to later skills.
            return input

        self.logger.warning("JSONWriterSkill received no input — nothing to write.")
        return input or []

0 commit comments

Comments
 (0)