Skip to content

Commit 1dbf9e3

Browse files
feat(caas): port production Context-as-a-Service features from internal repo
New source files:
- enrichment.py (124L): MetadataEnricher solving the Flat Chunk Fallacy
- pragmatic_truth.py (434L): SourceDetector, ConflictDetector, CitationFormatter
- gateway/trust_gateway.py (455L): Enterprise-grade private cloud router with audit
- ingestion/structure_parser.py (183L): Content tier assignment (High/Medium/Low)

Updated init files:
- gateway/__init__.py: Re-export TrustGateway and security models
- ingestion/__init__.py: Re-export StructureParser

New test files:
- test_pragmatic_truth.py (305L): Source detection and conflict resolution tests
- test_context_triad.py (364L): Hot/warm/cold context management tests

106 tests pass (+14 new), 0 regressions from pre-existing baseline.

Closes #58

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 97a5cb4 commit 1dbf9e3

File tree

8 files changed

+1972
-3
lines changed

8 files changed

+1972
-3
lines changed
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
Metadata enrichment module for contextual injection.
6+
7+
Solves the "Flat Chunk Fallacy" by enriching chunks with parent metadata.
8+
Instead of storing isolated chunks like "It increased by 5%", we store:
9+
"[Document: Q3 Earnings] [Chapter: Revenue] [Section: North America] It increased by 5%."
10+
11+
This ensures the vector carries the weight of its context.
12+
"""
13+
14+
from typing import List, Optional
15+
from caas.models import Section, Document, DocumentType
16+
17+
18+
class MetadataEnricher:
    """
    Enriches sections with parent metadata for contextual awareness.

    Transforms isolated chunks into context-aware chunks by injecting
    hierarchical metadata (document, type, chapter/parent, section) as a
    bracketed prefix in front of each section's content, e.g.::

        [Document: Q3 Earnings] [Chapter: Revenue] [Section: North America] It increased by 5%.
    """

    def enrich_sections(self, document: Document) -> List[Section]:
        """
        Enrich all sections in a document with metadata prefixes.

        Args:
            document: Document with sections to enrich

        Returns:
            List of enriched section copies, in the same order as
            ``document.sections``
        """
        return [
            self._enrich_section(section, document)
            for section in document.sections
        ]

    def _enrich_section(self, section: Section, document: Document) -> Section:
        """
        Enrich a single section with metadata prefix.

        Args:
            section: Section to enrich
            document: Parent document (supplies the title and detected type)

        Returns:
            A copy of ``section`` whose content carries the metadata prefix
        """
        metadata_prefix = self._build_metadata_prefix(
            section, document.title, document.detected_type
        )

        # Work on a copy so the caller's section keeps its raw content.
        # Note: Using model_copy() from Pydantic v2 (we're on v2.5.0)
        enriched_section = section.model_copy()
        enriched_section.content = f"{metadata_prefix} {section.content}"

        return enriched_section

    def get_enriched_chunk(
        self,
        section: Section,
        document_title: str,
        document_type: Optional[DocumentType] = None,
        include_type: bool = True
    ) -> str:
        """
        Get an enriched chunk string for a section.

        Useful for building enriched context on-the-fly without modifying
        the stored section.

        Args:
            section: Section to enrich
            document_title: Title of parent document
            document_type: Type of document (optional)
            include_type: Whether to include document type in prefix

        Returns:
            Enriched chunk string: the metadata prefix, one space, then the
            section content
        """
        metadata_prefix = self._build_metadata_prefix(
            section,
            document_title,
            # Dropping the type here is how include_type=False is honoured.
            document_type if include_type else None,
        )
        return f"{metadata_prefix} {section.content}"

    def _build_metadata_prefix(
        self,
        section: Section,
        document_title: str,
        document_type: Optional[DocumentType] = None,
    ) -> str:
        """
        Build the bracketed metadata prefix shared by all enrichment paths.

        Keeping this in one place guarantees that stored sections and
        on-the-fly chunks are prefixed identically.

        Args:
            section: Section whose chapter/parent/title are read
            document_title: Title of parent document
            document_type: Type of document; skipped when falsy or when its
                value is "unknown"

        Returns:
            Space-separated prefix such as
            "[Document: X] [Type: Y] [Chapter: Z] [Section: T]"
        """
        metadata_parts = [f"[Document: {document_title}]"]

        # Add document type if meaningful ("unknown" carries no signal)
        if document_type and document_type.value != "unknown":
            doc_type_display = document_type.value.replace("_", " ").title()
            metadata_parts.append(f"[Type: {doc_type_display}]")

        # Add hierarchical context; chapter wins over parent section
        if section.chapter:
            metadata_parts.append(f"[Chapter: {section.chapter}]")
        elif section.parent_section:
            metadata_parts.append(f"[Parent: {section.parent_section}]")

        # Add current section
        metadata_parts.append(f"[Section: {section.title}]")

        return " ".join(metadata_parts)
Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,27 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT License.
3-
# Community Edition — basic context/memory management
43
"""
5-
Gateway sub-package — placeholder after Trust Gateway removal.
4+
Trust Gateway: Enterprise-Grade Private Cloud Router
5+
6+
The enterprise-ready gateway that can be deployed on-premises or in private cloud
7+
to address CISO concerns about data security and privacy.
68
"""
79

8-
__all__: list = []
10+
from caas.gateway.trust_gateway import (
11+
TrustGateway,
12+
DeploymentMode,
13+
SecurityPolicy,
14+
SecurityLevel,
15+
AuditLog,
16+
DataRetentionPolicy
17+
)
18+
19+
__all__ = [
20+
"TrustGateway",
21+
"DeploymentMode",
22+
"SecurityPolicy",
23+
"SecurityLevel",
24+
"AuditLog",
25+
"DataRetentionPolicy"
26+
]
927

0 commit comments

Comments
 (0)