Skip to content
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
56cbc91
Refactor for multi index in redis
mdciri Feb 5, 2026
a0fe349
Copilot terraform skills
uolter Feb 6, 2026
24ad7a4
[CAI-747] add parser app (#1996)
anemone008 Feb 5, 2026
542cd2c
[CAI-748] Add Puppeteer (#1999)
anemone008 Feb 6, 2026
e779b68
Update scripts to accept structured documents to create a dedicated i…
mdciri Feb 9, 2026
15b8b95
Update github action according to the python script
mdciri Feb 9, 2026
f096b08
Update chatbot create index workflow to accept input types
mdciri Feb 9, 2026
062d3b7
Update documents to read structured data into llamaindex documents
mdciri Feb 6, 2026
af6dc9c
Update apps/chatbot-index/src/modules/settings.py
mdciri Feb 10, 2026
7128817
Update apps/chatbot-index/src/modules/documents.py
mdciri Feb 10, 2026
17badb0
Update .github/actions/chatbot/action.yaml
mdciri Feb 10, 2026
f50a388
Fix the input and output types
mdciri Feb 10, 2026
0df9e63
Update structured llamaindex documents id
mdciri Feb 10, 2026
338efd6
Add assertion for the inputs
mdciri Feb 10, 2026
d1a8d4d
Update .changeset/eager-colts-smile.md
mdciri Feb 10, 2026
57d348f
Update apps/chatbot-index/src/modules/create_vector_index.py
mdciri Feb 10, 2026
4b315f0
Update document to safe load jsons
mdciri Feb 10, 2026
01ed416
Merge branch 'CAI-741-refactor-for-new-index' of github.com:pagopa/de…
mdciri Feb 10, 2026
dc9663a
Merge branch 'main' into CAI-741-refactor-for-new-index
mdciri Feb 10, 2026
74d9b35
Update cleaning redis when creating index given an index id
mdciri Feb 10, 2026
8966f99
Update .github/workflows/chatbot_create_index.yaml
mdciri Feb 10, 2026
6025cc3
Substitute s3 client with s3 resource
mdciri Feb 10, 2026
7a2232a
Update apps/chatbot-index/src/modules/vector_index.py
mdciri Feb 10, 2026
e2547c5
Update apps/chatbot-index/src/modules/documents.py
mdciri Feb 10, 2026
15187d3
Update apps/chatbot-index/src/modules/codec.py
mdciri Feb 10, 2026
bc02597
Update documents json loads
mdciri Feb 10, 2026
149725b
fix docker compose
batdevis Feb 11, 2026
ac5bd1f
Merge branch 'CAI-741-refactor-for-new-index' of github.com:pagopa/de…
batdevis Feb 11, 2026
4f70244
Add to the github workflow and action the index-id to create as env var
mdciri Feb 11, 2026
c3f2e8b
Merge branch 'CAI-741-refactor-for-new-index' of github.com:pagopa/de…
mdciri Feb 11, 2026
cda13ba
Update class name to be more generic
mdciri Feb 11, 2026
779967b
Update class name to be more generic
mdciri Feb 11, 2026
14e0a75
Merge branch 'main' into CAI-741-refactor-for-new-index
mdciri Feb 11, 2026
873f81e
Update workflow inputs
mdciri Feb 11, 2026
34dbc95
Update create-index service in compose.yaml
mdciri Feb 11, 2026
f3271e0
Update dockerfile and scripts
mdciri Feb 11, 2026
5b91eff
Update codec return
mdciri Feb 11, 2026
6c65f57
fix s3 init on create index
batdevis Feb 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/eager-colts-smile.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"chatbot-index": minor
---

Create index now also considers structured data
18 changes: 17 additions & 1 deletion .github/actions/chatbot/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,22 @@ inputs:
description: the name of the chatbot lambda function
required: true
default: ''
static:
description: 'Whether to create vector index using static data'
required: true
default: 'true'
dynamic:
description: 'Whether to create vector index using dynamic data'
required: true
default: 'true'
api:
description: 'Whether to create vector index using api data'
required: true
default: 'true'
structured:
description: 'Whether to create vector index using structured data'
required: true
default: 'false'

runs:
using: "composite"
Expand Down Expand Up @@ -42,4 +58,4 @@ runs:
shell: bash
run: |
cd apps/chatbot-index
PYTHONPATH=. poetry run python src/modules/create_vector_index.py
PYTHONPATH=. poetry run python src/modules/create_vector_index.py ${{ inputs.static == 'true' && ' --static' || '' }} ${{ inputs.dynamic == 'true' && ' --dynamic' || '' }} ${{ inputs.api == 'true' && ' --api' || '' }} ${{ inputs.structured == 'true' && ' --structured' || '' }} --clean-redis
14 changes: 14 additions & 0 deletions .github/workflows/chatbot_create_index.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ on:
- dev
- uat
- prod
action_type:
description: 'The type of action to perform'
type: choice
required: true
default: structured
options:
- api
- dynamic
- static
- structured

# Allows external webhook trigger
repository_dispatch:
Expand Down Expand Up @@ -81,4 +91,8 @@ jobs:
- name: Chatbot Vector Index
uses: ./.github/actions/chatbot
with:
api: ${{ github.event.inputs.action_type == 'api' && 'true' || 'false' }}
chatbot_lambda_name: ${{ vars.CHATBOT_LAMBDA_NAME }}
dynamic: ${{ github.event.inputs.action_type == 'dynamic' && 'true' || 'false' }}
static: ${{ github.event.inputs.action_type == 'static' && 'true' || 'false' }}
structured: ${{ github.event.inputs.action_type == 'structured' && 'true' || 'false' }}
1 change: 0 additions & 1 deletion apps/chatbot-index/config/params.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
vector_index:
index_id: discovery-index
chunk_size: 1024
chunk_overlap: 20
12 changes: 12 additions & 0 deletions apps/chatbot-index/src/modules/codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import json
from typing import Any


def safe_json_load(value: Any) -> Any:
    """Decode ``value`` as JSON when it is a string; otherwise return it as is.

    Args:
        value: Any object. Only ``str`` instances are handed to the JSON parser.

    Returns:
        The parsed Python object when ``value`` is a string holding valid JSON;
        in every other case (non-string input, or a string that is not valid
        JSON) the original ``value`` is returned unchanged.
    """
    # Non-string inputs are passed through without being touched.
    if not isinstance(value, str):
        return value

    try:
        parsed = json.loads(value)
    except json.JSONDecodeError:
        # Not valid JSON: fall back to the raw string instead of raising.
        return value
    return parsed
53 changes: 52 additions & 1 deletion apps/chatbot-index/src/modules/create_vector_index.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import argparse

from src.modules.logger import get_logger
from src.modules.vector_index import DiscoveryVectorIndex
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DiscoveryVectorIndex is not present in src.modules.vector_index

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have an idea, you could rename the class : )


Expand All @@ -8,4 +10,53 @@

if __name__ == "__main__":
    # Command-line entry point: parse the document-source flags, validate the
    # combination, then build the vector index exactly once with those flags.
    # (The index must not be built before the arguments are parsed, otherwise
    # the selected sources and --clean-redis would be ignored.)
    parser = argparse.ArgumentParser(
        description="Create a vector index for the chatbot."
    )
    parser.add_argument(
        "--static",
        action="store_true",
        help="Include static documents in the index",
    )
    parser.add_argument(
        "--dynamic",
        action="store_true",
        help="Include dynamic documents in the index",
    )
    parser.add_argument(
        "--api",
        action="store_true",
        help="Include API documents in the index",
    )
    parser.add_argument(
        "--structured",
        action="store_true",
        help="Include structured documents in the index",
    )
    parser.add_argument(
        "--clean-redis",
        action="store_true",
        help="Clean the Redis database before building the index",
    )
    args = parser.parse_args()

    # Validate source selection:
    # - Structured documents cannot be combined with static, dynamic, or API documents.
    # - At least one document source must be selected.
    # parser.error() prints the message with the usage text and exits.
    has_unstructured_source = args.static or args.dynamic or args.api
    if args.structured and has_unstructured_source:
        parser.error(
            "Structured documents cannot be combined with static, dynamic, or API documents."
        )
    if not args.structured and not has_unstructured_source:
        parser.error(
            "No document sources selected. Use one or more of --static, --dynamic, --api, or --structured."
        )

    VECTOR_INDEX.create_index(
        static=args.static,
        dynamic=args.dynamic,
        api=args.api,
        structured=args.structured,
        clean_redis=args.clean_redis,
    )
99 changes: 86 additions & 13 deletions apps/chatbot-index/src/modules/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from src.modules.logger import get_logger
from src.modules.settings import SETTINGS, AWS_SESSION
from src.modules.codec import safe_json_load


logging.getLogger("botocore").setLevel(logging.ERROR)
Expand All @@ -33,6 +34,7 @@
RELEASE_NOTES_FOLDER_FILEPATH = "release-notes-dirNames.json"
PRODUCTS_S3_FILEPATH = "synced-products-response.json"
APIS_DATA_S3_FILEPATH = "synced-apis-data-response.json"
EXTRACTOR_FOLDER = "extractor"


class StaticMetadata(BaseModel):
Expand Down Expand Up @@ -109,7 +111,7 @@ def get_folders_list(
s3_content = read_file_from_s3(filepath)
if s3_content:
try:
folders_content = json.loads(s3_content)
folders_content = safe_json_load(s3_content)
except Exception as e:
LOGGER.warning(f"Failed to decode {filepath}: {e}")
folders_content = {"dirNames": []}
Expand Down Expand Up @@ -145,7 +147,7 @@ def get_one_metadata_from_s3(
os.path.join(docs_parent_folder, folder_name, "metadata.json")
)
try:
folder_metadata = json.loads(s3_content) if s3_content else {}
folder_metadata = safe_json_load(s3_content) if s3_content else {}
except Exception as e:
LOGGER.warning(
f"Failed to decode metadata.json in folder {docs_parent_folder}/{folder_name}: {e}"
Expand Down Expand Up @@ -185,7 +187,7 @@ def get_metadata_from_s3(
s3_content = read_file_from_s3(
os.path.join(docs_parent_folder, folder_name, "metadata.json")
)
folder_metadata = json.loads(s3_content) if s3_content else []
folder_metadata = safe_json_load(s3_content) if s3_content else []
except Exception as e:
LOGGER.warning(
f"Failed to decode metadata.json in folder {docs_parent_folder}/{folder_name}: {e}"
Expand Down Expand Up @@ -221,7 +223,7 @@ def get_product_list(file_path: str | None = None) -> List[str]:
s3_content = read_file_from_s3(file_path)
product_list = []
if s3_content:
products = json.loads(s3_content)
products = safe_json_load(s3_content)
for product in products:
try:
if product["attributes"]["isVisible"]:
Expand Down Expand Up @@ -343,7 +345,7 @@ def get_apidata(file_path: str | None = None) -> dict:
s3_data = read_file_from_s3(file_path)
if not s3_data:
raise ValueError("API data content is empty.")
return json.loads(s3_data)
return safe_json_load(s3_data)


def read_api_url(url: str) -> str:
Expand All @@ -362,7 +364,7 @@ def read_api_url(url: str) -> str:
if url.endswith(".yaml") or url.endswith(".yml"):
data = yaml.safe_load(response.text)
elif url.endswith(".json"):
data = json.loads(response.text)
data = safe_json_load(response.text)
else:
raise ValueError("Unsupported file format. Use .yaml, .yml, or .json.")

Expand Down Expand Up @@ -606,20 +608,91 @@ def get_dynamic_docs(dynamic_metadata: List[DynamicMetadata]) -> List[Document]:
return dynamic_docs


def get_documents() -> List[Document]:
def get_structured_docs(parent_folder: str, bucket_name: str) -> List[Document]:
    """
    Fetches structured documents from a specified S3 bucket and parent folder.

    Every object whose key starts with ``<parent_folder>/<EXTRACTOR_FOLDER>`` and
    ends with ``.json`` (case-insensitive) is read and converted into a Document.
    Objects whose content is not a JSON object (invalid JSON, empty content, or a
    JSON array/scalar) are skipped with a warning instead of aborting the run.

    Args:
        parent_folder (str): The parent folder in the S3 bucket where the structured documents are located.
        bucket_name (str): The name of the S3 bucket to fetch the structured documents from.
    Returns:
        List[Document]: A list of Document objects containing the content and metadata of the structured documents
    """
    # S3 keys always use "/" as separator, independently of the host OS,
    # so build them with posixpath rather than os.path.
    import posixpath

    s3_resource = AWS_SESSION.resource("s3")
    bucket = s3_resource.Bucket(bucket_name)
    prefix = posixpath.join(parent_folder, EXTRACTOR_FOLDER)

    structured_docs = []
    for obj in bucket.objects.filter(Prefix=prefix):
        json_file_path = obj.key
        if not json_file_path.lower().endswith(".json"):
            continue

        s3_content = safe_json_load(read_file_from_s3(json_file_path, bucket_name))
        # safe_json_load returns its input unchanged when it cannot be parsed,
        # so anything other than a dict cannot be mapped onto a Document.
        if not isinstance(s3_content, dict):
            LOGGER.warning(
                f"Skipping {json_file_path}: content is not a valid JSON object."
            )
            continue

        # The document id is "<last folder>/<file name>", or just the file
        # name when the key has no folder component.
        filename = "/".join(json_file_path.split("/")[-2:])

        structured_docs.append(
            Document(
                id_=filename,
                text=s3_content.get("text", ""),
                metadata={
                    "filepath": json_file_path,
                    "language": s3_content.get("language", ""),
                    "lastmod": s3_content.get("lastmod", ""),
                    "title": s3_content.get("title", ""),
                },
            )
        )

    return structured_docs


def get_documents(
    index_id: str,
    static: bool,
    dynamic: bool,
    api: bool,
    structured: bool,
) -> List[Document]:
    """
    Fetches documents from the selected sources (static, dynamic, API, structured).

    Only the sources whose flag is set are fetched; nothing is read for a
    source that is disabled.

    Args:
        index_id (str): The identifier for the index to which the documents will be added.
            It is used as the parent folder when reading structured documents.
        static (bool): Flag indicating whether to include static documents in the index
        dynamic (bool): Flag indicating whether to include dynamic documents in the index
        api (bool): Flag indicating whether to include API documentation in the index
        structured (bool): Flag indicating whether to include structured documents in the index
    Returns:
        List[Document]: A list of Document objects containing the content and metadata.
    """

    docs = []

    # Static metadata is needed by both the static and the dynamic branch;
    # it is fetched at most once and only when one of them is enabled.
    static_metadata = None

    if static:
        static_metadata = get_static_metadata()
        static_docs = get_static_docs(static_metadata)
        docs.extend(static_docs)

    if dynamic:
        if static_metadata is None:
            static_metadata = get_static_metadata()
        dynamic_metadata = get_dynamic_metadata(static_metadata)
        dynamic_docs = get_dynamic_docs(dynamic_metadata)
        docs.extend(dynamic_docs)

    if api:
        api_docs = get_api_docs()
        docs.extend(api_docs)

    if structured:
        structured_docs = get_structured_docs(index_id, SETTINGS.bucket_static_content)
        docs.extend(structured_docs)

    LOGGER.info(f"Total number of fetched documents: {len(docs)}")
    return docs
2 changes: 1 addition & 1 deletion apps/chatbot-index/src/modules/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class ChatbotSettings(BaseSettings):
# vector index and docs params
chunk_overlap: int = PARAMS["vector_index"]["chunk_overlap"]
chunk_size: int = PARAMS["vector_index"]["chunk_size"]
index_id: str = PARAMS["vector_index"]["index_id"]
index_id: str = os.getenv("CHB_INDEX_ID", "discovery-index")
bucket_static_content: str = os.getenv(
"CHB_AWS_S3_BUCKET_NAME_STATIC_CONTENT", "devportal-d-website-static-content"
)
Expand Down
Loading