@@ -74,24 +74,100 @@ def setup_logging(name: str) -> logging.Logger:
7474 return logging .getLogger (name )
7575
7676
77- def validate_extraction (data : dict ) -> tuple [bool , list [str ]]:
78- """Validate extracted JSON against schema constraints.
77+ def validate_extraction (data : dict , catalog_id : str = "" ) -> tuple [bool , list [str ], dict ]:
78+ """Validate extracted JSON against schema constraints with three-tier vocabulary validation.
79+
80+ Three-tier validation (ADR-010):
81+ 1. Core term (in FACT_CATEGORIES etc.) → accept silently
82+ 2. Provisional term (in VOCABULARY_EXTENSIONS) → accept, INFO log, increment count
83+ 3. Rejected term (in VOCABULARY_REJECTIONS) → apply correction, WARN log
84+ 4. Unknown term → accept, WARN log, auto-add to provisional
7985
8086 Args:
8187 data: Extraction output with 'nodes' and 'relationships' keys
88+ catalog_id: Source document catalog_id for provisional tracking
8289
8390 Returns:
84- (is_valid, error_messages)
91+ (is_valid, warnings, corrections)
92+
93+ corrections dict has structure:
94+ {
95+ "remapped_values": [{"node_id": ..., "field": ..., "old": ..., "new": ...}],
96+ "reclassified_nodes": [{"node_id": ..., "old_type": ..., "new_type": ...}]
97+ }
8598 """
99+ import datetime
100+
101+ logger = logging .getLogger (__name__ )
86102 errors = []
103+ corrections = {"remapped_values" : [], "reclassified_nodes" : []}
87104
88105 # Check structure
89106 if "nodes" not in data :
90107 errors .append ("Missing 'nodes' key" )
91108 if "relationships" not in data :
92109 errors .append ("Missing 'relationships' key" )
93110 if errors :
94- return False , errors
111+ return False , errors , corrections
112+
113+ # Helper to validate vocabulary term
def validate_vocab_term(field: str, term: str, node_id: str, node_idx: int) -> str:
    """Validate one controlled-vocabulary value and return the value to store.

    This helper is a closure: it reads ``config``, ``logger``, ``data``,
    ``catalog_id`` and ``datetime`` from the enclosing scope and appends to
    the enclosing ``errors`` list and ``corrections`` dict.

    Checks are applied in this order (first match wins):

    1. Core term (listed in the ``config`` vocabulary for ``field``):
       returned unchanged, nothing logged.
    2. Rejected term (listed in ``config.VOCABULARY_REJECTIONS[field]``):
       * ``remap``: the rule's ``target`` is returned and the change is
         recorded in ``corrections["remapped_values"]``.
       * ``reclassify``: the original term is returned, the node is recorded
         in ``corrections["reclassified_nodes"]`` and an error is appended.
       * any other action: an error is appended and the term is returned
         unchanged; it is never promoted to the provisional vocabulary.
    3. Provisional term (listed in ``config.VOCABULARY_EXTENSIONS[field]``):
       returned unchanged, usage count incremented, INFO log.
    4. Unknown term: returned unchanged, WARNING log, and registered as a
       new provisional term.

    Args:
        field: Property name being validated (e.g. ``"fact_category"``).
        term: Extracted value for that property.
        node_id: Identifier of the node, used in logs and corrections.
        node_idx: Position of the node in ``data["nodes"]``.

    Returns:
        The remap target when a remap rule applies, otherwise ``term``.
    """
    core_vocab = {
        "fact_category": config.FACT_CATEGORIES,
        "dimension": config.DIMENSIONS,
        "value_type": config.VALUE_TYPES,
        "assertion_type": config.ASSERTION_TYPES,
        "latitude": config.LATITUDES,
    }.get(field)

    if not core_vocab:
        return term  # Not a controlled field

    # Extracted JSON may carry a list/dict where a scalar is expected; the
    # membership tests below would raise TypeError on an unhashable value,
    # so report it as a validation error instead of crashing.
    try:
        hash(term)
    except TypeError:
        errors.append(
            f"Node {node_idx}: {field} must be a scalar value, "
            f"got {type(term).__name__}"
        )
        return term

    # Core vocabulary - accept silently
    if term in core_vocab:
        return term

    # Rejected terms - apply the configured correction
    rejection = config.VOCABULARY_REJECTIONS.get(field, {}).get(term)
    if rejection is not None:
        action = rejection.get("action")
        reason = rejection.get("reason", "")

        if action == "remap":
            target = rejection.get("target", "")
            logger.warning(
                f"Node {node_idx} ({node_id}): Rejected {field} '{term}' "
                f"→ remapping to '{target}' ({reason})"
            )
            corrections["remapped_values"].append({
                "node_id": node_id,
                "field": field,
                "old": term,
                "new": target,
            })
            return target

        if action == "reclassify":
            target_type = rejection.get("target_type", "")
            logger.warning(
                f"Node {node_idx} ({node_id}): Rejected {field} '{term}' "
                f"→ reclassifying to {target_type} ({reason})"
            )
            corrections["reclassified_nodes"].append({
                "node_id": node_id,
                # .get(): a node with no 'type' must not raise KeyError here.
                "old_type": data["nodes"][node_idx].get("type"),
                "new_type": target_type,
            })
            errors.append(
                f"Node {node_idx}: {field} '{term}' triggers reclassification to {target_type}"
            )
            return term  # Keep original, but flag for reclassification

        # A rejected term must never fall through and be accepted or
        # auto-registered as provisional just because its rule is malformed.
        logger.warning(
            f"Node {node_idx} ({node_id}): Rejected {field} '{term}' "
            f"has unsupported action '{action}'"
        )
        errors.append(
            f"Node {node_idx}: rejected {field} '{term}' has unsupported action '{action}'"
        )
        return term

    # setdefault(): VOCABULARY_EXTENSIONS may have no bucket for this field
    # yet; create it so the auto-add below cannot raise KeyError.
    extensions = config.VOCABULARY_EXTENSIONS.setdefault(field, {})

    # Provisional terms - accept with INFO log
    if term in extensions:
        entry = extensions[term]
        entry["count"] = entry.get("count", 0) + 1
        logger.info(
            f"Provisional vocabulary term '{term}' for {field} (count: {entry['count']})"
        )
        return term

    # Unknown terms - accept, WARN, auto-add to provisional
    logger.warning(
        f"Node {node_idx} ({node_id}): New vocabulary term '{term}' for {field} "
        f"— adding to provisional"
    )
    extensions[term] = {
        "first_seen": catalog_id or "unknown",
        "date": datetime.date.today().isoformat(),
        "count": 1,
        "notes": "Auto-added during extraction",
    }
    return term
95171
96172 # Validate nodes
97173 for i , node in enumerate (data ["nodes" ]):
@@ -103,25 +179,23 @@ def validate_extraction(data: dict) -> tuple[bool, list[str]]:
103179 errors .append (f"Node { i } : invalid type '{ node ['type' ]} '" )
104180
105181 # Check ID
106- if "id" not in node or not node ["id" ]:
182+ node_id = node .get ("id" , "" )
183+ if not node_id :
107184 errors .append (f"Node { i } : missing or empty 'id'" )
108- elif not re .match (r"^[a-z0-9_]+$" , node [ "id" ] ):
109- errors .append (f"Node { i } : id '{ node [ 'id' ] } ' not snake_case" )
185+ elif not re .match (r"^[a-z0-9_]+$" , node_id ):
186+ errors .append (f"Node { i } : id '{ node_id } ' not snake_case" )
110187
111188 # Check properties if present
112189 props = node .get ("properties" , {})
113190
114- # Validate controlled vocabularies
115- if "fact_category" in props and props ["fact_category" ] not in config .FACT_CATEGORIES :
116- errors .append (f"Node { i } : invalid fact_category '{ props ['fact_category' ]} '" )
117- if "dimension" in props and props ["dimension" ] not in config .DIMENSIONS :
118- errors .append (f"Node { i } : invalid dimension '{ props ['dimension' ]} '" )
119- if "value_type" in props and props ["value_type" ] not in config .VALUE_TYPES :
120- errors .append (f"Node { i } : invalid value_type '{ props ['value_type' ]} '" )
121- if "assertion_type" in props and props ["assertion_type" ] not in config .ASSERTION_TYPES :
122- errors .append (f"Node { i } : invalid assertion_type '{ props ['assertion_type' ]} '" )
123- if "latitude" in props and props ["latitude" ] not in config .LATITUDES :
124- errors .append (f"Node { i } : invalid latitude '{ props ['latitude' ]} '" )
191+ # Validate controlled vocabularies with three-tier system
192+ for field in ["fact_category" , "dimension" , "value_type" , "assertion_type" , "latitude" ]:
193+ if field in props :
194+ original = props [field ]
195+ corrected = validate_vocab_term (field , original , node_id , i )
196+ if corrected != original :
197+ # Update in-place for remapped values
198+ props [field ] = corrected
125199
126200 # Validate fractions
127201 if "value_number" in props and props .get ("value_type" ) == "fraction" :
@@ -141,4 +215,4 @@ def validate_extraction(data: dict) -> tuple[bool, list[str]]:
141215 if "target" not in rel or not rel ["target" ]:
142216 errors .append (f"Relationship { i } : missing or empty 'target'" )
143217
144- return len (errors ) == 0 , errors
218+ return len (errors ) == 0 , errors , corrections
0 commit comments