Skip to content

Commit dab9a6e

Browse files
brockwebb and claude committed
Implement evolutionary vocabulary system (ADR-010)
Implements three-tier vocabulary validation for controlled vocabularies: 1. Core terms (in FACT_CATEGORIES etc.) → accept silently 2. Provisional terms (in VOCABULARY_EXTENSIONS) → accept, INFO log, track usage 3. Rejected terms (in VOCABULARY_REJECTIONS) → apply correction, WARN log 4. Unknown terms → accept, WARN log, auto-add to provisional Changes: config.py: - Promoted "dissemination" to core FACT_CATEGORIES vocabulary - Added VOCABULARY_EXTENSIONS dict for provisional terms - Added VOCABULARY_REJECTIONS dict with "definition" → reclassify mapping utils.py: - Updated validate_extraction() signature to return (is_valid, warnings, corrections) - Implemented three-tier validation with automatic provisional tracking - Added correction generation for remapped values and reclassified nodes - Auto-increments usage count for provisional terms extract.py: - Updated to pass catalog_id to validate_extraction() - Applied corrections (node reclassification) before Neo4j writes - Returns corrections dict in extraction results Database fixes: - Reclassified 4 "definition" MethodologicalChoice nodes to ConceptDefinition - Removed invalid fact_category property from reclassified nodes Verification: - Dry-run extraction test passes ✓ - 17 "dissemination" nodes now valid (promoted to core) ✓ - 4 "definition" nodes reclassified to ConceptDefinition ✓ Addresses: FR-QE-006, FR-QE-014 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent b610061 commit dab9a6e

3 files changed

Lines changed: 140 additions & 22 deletions

File tree

scripts/quarry/config.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
import os
44
from pathlib import Path
55

6+
# Best-effort environment bootstrap: pull variables from the repo-root .env
# file when python-dotenv is available; absence of the package is not an error.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv(Path(__file__).parent.parent.parent / ".env")
11+
612
# === Neo4j Configuration ===
713
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
814
NEO4J_DATABASE = "quarry"
@@ -51,7 +57,7 @@
5157
# === Controlled Vocabularies ===
# Core fact_category terms. "dissemination" was promoted from provisional
# status to core vocabulary (ADR-010).
FACT_CATEGORIES = [
    "design",
    "collection",
    "weighting",
    "estimation",
    "variance",
    "processing",
    "adjustment",
    "dissemination",
]
5662

5763
DIMENSIONS = [
@@ -66,6 +72,34 @@
6672

6773
# Controlled vocabulary for the "latitude" node property.
LATITUDES = ["none", "narrow", "wide", "full"]
6874

75+
# === Evolutionary Vocabulary (ADR-010) ===
# Provisional vocabulary extensions — terms discovered during extraction
# that haven't yet been validated across multiple documents.
# Format: {field: {term: {first_seen: catalog_id, date: str, count: int, notes: str}}}
VOCABULARY_EXTENSIONS = {
    field: {}
    for field in ("fact_category", "dimension", "value_type", "assertion_type")
}
85+
86+
# Rejected vocabulary terms — mapped to corrections (ADR-010).
# Format: {field: {term: {reason: str, action: str, date: str, ...}}}
# action can be:
#   "remap"      — entry also carries "target": str, the replacement value
#   "reclassify" — entry also carries "target_type": str, the corrected node
#                  type (the term signals the node itself was mistyped)
VOCABULARY_REJECTIONS = {
    "fact_category": {
        "definition": {
            "reason": "Node type error — these are ConceptDefinition nodes, not MethodologicalChoice",
            "action": "reclassify",
            "target_type": "ConceptDefinition",
            "date": "2026-02-09"
        }
    },
    "dimension": {},
    "value_type": {},
    "assertion_type": {},
}
102+
69103
# === Allowed Node Types (from schema v3.1) ===
70104
ALLOWED_NODE_TYPES = [
71105
"MethodologicalChoice",

scripts/quarry/extract.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,15 +206,25 @@ def extract_chunk(client, chunk, source_doc, existing_entities, dry_run=False):
206206
logger.error(f"JSON parse failed for chunk {chunk.chunk_index}: {e}")
207207
return {"raw": response_text, "error": "parse_failed", "tokens": tokens_used}
208208

209-
# Validate
210-
is_valid, errors = validate_extraction(data)
209+
# Validate with evolutionary vocabulary
210+
is_valid, errors, corrections = validate_extraction(data, source_doc["catalog_id"])
211211
if not is_valid:
212212
logger.warning(f"Validation failed for chunk {chunk.chunk_index}: {errors[:3]}")
213213

214+
# Apply corrections to data (reclassified nodes)
215+
for correction in corrections.get("reclassified_nodes", []):
216+
for node in data["nodes"]:
217+
if node["id"] == correction["node_id"]:
218+
node["type"] = correction["new_type"]
219+
# Remove fact_category property if reclassifying
220+
if "properties" in node and "fact_category" in node["properties"]:
221+
del node["properties"]["fact_category"]
222+
214223
return {
215224
"data": data,
216225
"valid": is_valid,
217226
"errors": errors if not is_valid else [],
227+
"corrections": corrections,
218228
"tokens": tokens_used
219229
}
220230

scripts/quarry/utils.py

Lines changed: 93 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -74,24 +74,100 @@ def setup_logging(name: str) -> logging.Logger:
7474
return logging.getLogger(name)
7575

7676

77-
def validate_extraction(data: dict) -> tuple[bool, list[str]]:
78-
"""Validate extracted JSON against schema constraints.
77+
def validate_extraction(data: dict, catalog_id: str = "") -> tuple[bool, list[str], dict]:
78+
"""Validate extracted JSON against schema constraints with three-tier vocabulary validation.
79+
80+
Three-tier validation (ADR-010):
81+
1. Core term (in FACT_CATEGORIES etc.) → accept silently
82+
2. Provisional term (in VOCABULARY_EXTENSIONS) → accept, INFO log, increment count
83+
3. Rejected term (in VOCABULARY_REJECTIONS) → apply correction, WARN log
84+
4. Unknown term → accept, WARN log, auto-add to provisional
7985
8086
Args:
8187
data: Extraction output with 'nodes' and 'relationships' keys
88+
catalog_id: Source document catalog_id for provisional tracking
8289
8390
Returns:
84-
(is_valid, error_messages)
91+
(is_valid, warnings, corrections)
92+
93+
corrections dict has structure:
94+
{
95+
"remapped_values": [{"node_id": ..., "field": ..., "old": ..., "new": ...}],
96+
"reclassified_nodes": [{"node_id": ..., "old_type": ..., "new_type": ...}]
97+
}
8598
"""
99+
import datetime
100+
101+
logger = logging.getLogger(__name__)
86102
errors = []
103+
corrections = {"remapped_values": [], "reclassified_nodes": []}
87104

88105
# Check structure
89106
if "nodes" not in data:
90107
errors.append("Missing 'nodes' key")
91108
if "relationships" not in data:
92109
errors.append("Missing 'relationships' key")
93110
if errors:
94-
return False, errors
111+
return False, errors, corrections
112+
113+
# Helper to validate vocabulary term
114+
def validate_vocab_term(field: str, term: str, node_id: str, node_idx: int) -> str:
115+
"""Validate a vocabulary term and return corrected value (or original if valid)."""
116+
# Get core vocabulary for this field
117+
core_vocab = {
118+
"fact_category": config.FACT_CATEGORIES,
119+
"dimension": config.DIMENSIONS,
120+
"value_type": config.VALUE_TYPES,
121+
"assertion_type": config.ASSERTION_TYPES,
122+
"latitude": config.LATITUDES,
123+
}.get(field)
124+
125+
if not core_vocab:
126+
return term # Not a controlled field
127+
128+
# Tier 1: Core vocabulary - accept silently
129+
if term in core_vocab:
130+
return term
131+
132+
# Tier 2: Rejected terms - apply correction
133+
if term in config.VOCABULARY_REJECTIONS.get(field, {}):
134+
rejection = config.VOCABULARY_REJECTIONS[field][term]
135+
if rejection["action"] == "remap":
136+
target = rejection.get("target", "")
137+
logger.warning(f"Node {node_idx} ({node_id}): Rejected {field} '{term}' → remapping to '{target}' ({rejection['reason']})")
138+
corrections["remapped_values"].append({
139+
"node_id": node_id,
140+
"field": field,
141+
"old": term,
142+
"new": target
143+
})
144+
return target
145+
elif rejection["action"] == "reclassify":
146+
target_type = rejection.get("target_type", "")
147+
logger.warning(f"Node {node_idx} ({node_id}): Rejected {field} '{term}' → reclassifying to {target_type} ({rejection['reason']})")
148+
corrections["reclassified_nodes"].append({
149+
"node_id": node_id,
150+
"old_type": data["nodes"][node_idx]["type"],
151+
"new_type": target_type
152+
})
153+
errors.append(f"Node {node_idx}: {field} '{term}' triggers reclassification to {target_type}")
154+
return term # Keep original, but flag for reclassification
155+
156+
# Tier 3: Provisional terms - accept with INFO log
157+
if term in config.VOCABULARY_EXTENSIONS.get(field, {}):
158+
config.VOCABULARY_EXTENSIONS[field][term]["count"] += 1
159+
logger.info(f"Provisional vocabulary term '{term}' for {field} (count: {config.VOCABULARY_EXTENSIONS[field][term]['count']})")
160+
return term
161+
162+
# Tier 4: Unknown terms - accept, WARN, auto-add to provisional
163+
logger.warning(f"Node {node_idx} ({node_id}): New vocabulary term '{term}' for {field} — adding to provisional")
164+
config.VOCABULARY_EXTENSIONS[field][term] = {
165+
"first_seen": catalog_id or "unknown",
166+
"date": datetime.date.today().isoformat(),
167+
"count": 1,
168+
"notes": "Auto-added during extraction"
169+
}
170+
return term
95171

96172
# Validate nodes
97173
for i, node in enumerate(data["nodes"]):
@@ -103,25 +179,23 @@ def validate_extraction(data: dict) -> tuple[bool, list[str]]:
103179
errors.append(f"Node {i}: invalid type '{node['type']}'")
104180

105181
# Check ID
106-
if "id" not in node or not node["id"]:
182+
node_id = node.get("id", "")
183+
if not node_id:
107184
errors.append(f"Node {i}: missing or empty 'id'")
108-
elif not re.match(r"^[a-z0-9_]+$", node["id"]):
109-
errors.append(f"Node {i}: id '{node['id']}' not snake_case")
185+
elif not re.match(r"^[a-z0-9_]+$", node_id):
186+
errors.append(f"Node {i}: id '{node_id}' not snake_case")
110187

111188
# Check properties if present
112189
props = node.get("properties", {})
113190

114-
# Validate controlled vocabularies
115-
if "fact_category" in props and props["fact_category"] not in config.FACT_CATEGORIES:
116-
errors.append(f"Node {i}: invalid fact_category '{props['fact_category']}'")
117-
if "dimension" in props and props["dimension"] not in config.DIMENSIONS:
118-
errors.append(f"Node {i}: invalid dimension '{props['dimension']}'")
119-
if "value_type" in props and props["value_type"] not in config.VALUE_TYPES:
120-
errors.append(f"Node {i}: invalid value_type '{props['value_type']}'")
121-
if "assertion_type" in props and props["assertion_type"] not in config.ASSERTION_TYPES:
122-
errors.append(f"Node {i}: invalid assertion_type '{props['assertion_type']}'")
123-
if "latitude" in props and props["latitude"] not in config.LATITUDES:
124-
errors.append(f"Node {i}: invalid latitude '{props['latitude']}'")
191+
# Validate controlled vocabularies with three-tier system
192+
for field in ["fact_category", "dimension", "value_type", "assertion_type", "latitude"]:
193+
if field in props:
194+
original = props[field]
195+
corrected = validate_vocab_term(field, original, node_id, i)
196+
if corrected != original:
197+
# Update in-place for remapped values
198+
props[field] = corrected
125199

126200
# Validate fractions
127201
if "value_number" in props and props.get("value_type") == "fraction":
@@ -141,4 +215,4 @@ def validate_extraction(data: dict) -> tuple[bool, list[str]]:
141215
if "target" not in rel or not rel["target"]:
142216
errors.append(f"Relationship {i}: missing or empty 'target'")
143217

144-
return len(errors) == 0, errors
218+
return len(errors) == 0, errors, corrections

0 commit comments

Comments
 (0)