GeneralizedNotationNotation/src/gnn/validation.py at main · ActiveInferenceInstitute/GeneralizedNotationNotation · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#!/usr/bin/env python3
"""
GNN Validation Strategy Module

This module provides comprehensive validation strategies for GNN models
with multiple validation levels and extensible validation rules.
"""

import logging
from pathlib import Path
from typing import Any, Dict, List

from .schema_validator import GNNValidator
from .types import ValidationLevel, ValidationResult

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class ValidationStrategy:
    """
    Comprehensive validation with multiple levels and strategies.

    Supports validation levels from basic syntax checking to
    research-grade semantic validation with round-trip testing.

    The configured level is kept as a plain string in ``validation_level``
    and converted to a ``ValidationLevel`` member on demand. All levels
    except BASIC are delegated to the ``GNNValidator`` stored under
    ``validators['gnn']``; when that validator could not be created, the
    built-in basic checks are used instead and a warning is attached to the
    result.
    """

    # File extension (lower-case, including the dot) -> format label.
    _FORMAT_BY_EXTENSION = {
        '.md': 'markdown',
        '.json': 'json',
        '.xml': 'xml',
        '.yaml': 'yaml',
        '.yml': 'yaml',
        '.pkl': 'pickle',
        '.pickle': 'pickle',
        '.gnn': 'gnn',
    }

    # Format labels that get a parser-based syntax check in basic validation.
    _STRUCTURED_FORMATS = ('json', 'xml', 'yaml')

    # Substrings looked for when scanning text files for GNN content.
    _GNN_MARKERS = ('ModelName', 'StateSpaceBlock', 'Connections', 'Parameters')

    def __init__(self):
        self.validation_level = "standard"
        # Stored by configure(); not read anywhere else in this class.
        self.enable_strict_checking = False
        self.validators = {}
        self._initialize_validators()

    def configure(self, validation_level: str = "standard",
                  enable_strict_checking: bool = False):
        """
        Configure validation parameters.

        Args:
            validation_level: Name of the level to validate at. Unknown names
                are tolerated and resolve to STANDARD (see ``_resolve_level``).
            enable_strict_checking: Flag stored on the instance.
        """
        self.validation_level = validation_level
        self.enable_strict_checking = enable_strict_checking

        # Resolve once, then push the level to every validator that exposes
        # a ``validation_level`` attribute (a ``None`` placeholder does not).
        resolved_level = self._resolve_level(validation_level)
        for validator in self.validators.values():
            if hasattr(validator, 'validation_level'):
                validator.validation_level = resolved_level

    def _resolve_level(self, level) -> "ValidationLevel":
        """
        Convert a level name into a ``ValidationLevel`` member without raising.

        The member values of ``ValidationLevel`` are defined outside this
        module, so several spellings are tried: the upper-cased text (the
        lookup this class has always used), the text as given, the
        lower-cased text, and finally a lookup by member name. If none of
        them match, a warning is logged and STANDARD is returned.

        Args:
            level: A ``ValidationLevel`` member or a level name.

        Returns:
            The matching member, or ``ValidationLevel.STANDARD`` as fallback.
        """
        if isinstance(level, ValidationLevel):
            return level

        text = str(level).strip()
        # Upper-cased value first so inputs that resolved before still
        # resolve to exactly the same member.
        for candidate in (text.upper(), text, text.lower()):
            try:
                return ValidationLevel(candidate)
            except ValueError:
                continue

        try:
            return ValidationLevel[text.upper().replace('-', '_')]
        except (KeyError, TypeError):
            logger.warning(
                "Unknown validation level %r; falling back to STANDARD", level
            )
            return ValidationLevel.STANDARD

    def _initialize_validators(self):
        """
        Initialize validation components.

        Creates the primary GNN validator. If construction fails for any
        reason, the failure is logged and ``validators['gnn']`` is set to
        ``None`` so later calls use the basic fallback checks.
        """
        try:
            # Primary GNN validator
            self.validators['gnn'] = GNNValidator(
                validation_level=ValidationLevel.STANDARD,
                enable_round_trip_testing=False
            )
            logger.debug("GNN validator initialized")

        except Exception as e:
            logger.error(
                "Could not initialize GNN validator: %s; falling back to basic validation",
                e,
            )
            self.validators['gnn'] = None

    def validate_files(self, files: List[Path]) -> Dict[Path, "ValidationResult"]:
        """
        Validate multiple files based on configured level.

        A failure while validating one file never aborts the batch: the
        exception is recorded as an invalid result for that file.

        Args:
            files: List of file paths to validate

        Returns:
            Dictionary mapping file paths to validation results
        """
        results = {}

        logger.info(
            "Validating %d files at level: %s", len(files), self.validation_level
        )

        for file_path in files:
            try:
                result = self.validate_file(file_path)
                results[file_path] = result

                # Log validation outcome
                if result.is_valid:
                    logger.debug("✓ %s - Valid", file_path.name)
                else:
                    logger.warning(
                        "✗ %s - Invalid (%d errors)",
                        file_path.name, len(result.errors),
                    )

            except Exception as e:
                # Create error result for files that couldn't be validated.
                # The level is resolved tolerantly so that a bad level name
                # cannot raise again from inside this handler.
                error_result = ValidationResult(
                    is_valid=False,
                    validation_level=self._resolve_level(self.validation_level),
                    format_tested="unknown"
                )
                error_result.errors.append(f"Validation failed: {e}")
                results[file_path] = error_result
                logger.error("Failed to validate %s: %s", file_path, e)

        # Log summary
        valid_count = sum(1 for r in results.values() if r.is_valid)
        logger.info(
            "Validation complete: %d/%d files valid", valid_count, len(files)
        )

        return results

    def validate_file(self, file_path: Path) -> "ValidationResult":
        """
        Validate a single file based on configured level.

        Args:
            file_path: Path to file to validate

        Returns:
            ValidationResult with comprehensive validation data. A missing
            file yields an invalid result carrying a "File not found" error.
        """
        validation_level_enum = self._resolve_level(self.validation_level)

        if not file_path.exists():
            result = ValidationResult(
                is_valid=False,
                validation_level=validation_level_enum
            )
            result.errors.append(f"File not found: {file_path}")
            return result

        # Determine validation approach based on level
        if validation_level_enum == ValidationLevel.BASIC:
            return self._validate_basic(file_path)
        elif validation_level_enum == ValidationLevel.STANDARD:
            return self._validate_standard(file_path)
        elif validation_level_enum == ValidationLevel.STRICT:
            return self._validate_strict(file_path)
        elif validation_level_enum == ValidationLevel.RESEARCH:
            return self._validate_research(file_path)
        elif validation_level_enum == ValidationLevel.ROUND_TRIP:
            return self._validate_round_trip(file_path)
        else:
            return self._validate_standard(file_path)  # Default recovery

    def _validate_basic(self, file_path: Path) -> "ValidationResult":
        """
        Basic validation - file accessibility and format detection.

        Records the file size and detected format in ``result.metadata``,
        warns on empty files, and runs either the structured-format or the
        text-format check depending on the detected format. Any exception
        marks the result invalid instead of propagating.
        """
        result = ValidationResult(
            is_valid=True,
            validation_level=ValidationLevel.BASIC
        )

        try:
            # Check file accessibility
            file_size = file_path.stat().st_size
            if file_size == 0:
                result.warnings.append("File is empty")

            # Detect format
            format_detected = self._detect_file_format(file_path)
            result.format_tested = format_detected

            # Basic content check
            if format_detected in self._STRUCTURED_FORMATS:
                self._validate_structured_format_basic(file_path, result)
            else:
                self._validate_text_format_basic(file_path, result)

            result.metadata['file_size'] = file_size
            result.metadata['format_detected'] = format_detected

        except Exception as e:
            result.errors.append(f"Basic validation failed: {e}")
            result.is_valid = False

        return result

    def _basic_fallback(self, file_path: Path, level: "ValidationLevel",
                        note: str) -> "ValidationResult":
        """Run basic validation, relabel it as *level* and attach *note* as a warning."""
        result = self._validate_basic(file_path)
        result.validation_level = level
        result.warnings.append(note)
        return result

    def _validate_with_fallback(self, file_path: Path, level: "ValidationLevel") -> "ValidationResult":
        """Delegate to GNN validator or fall back to basic validation."""
        gnn_validator = self.validators.get('gnn')
        if gnn_validator:
            return gnn_validator.validate_file(file_path, level)
        return self._basic_fallback(
            file_path, level,
            "Using basic validation - GNN validator unavailable",
        )

    def _validate_standard(self, file_path: Path) -> "ValidationResult":
        """Standard validation - structure and basic semantics."""
        return self._validate_with_fallback(file_path, ValidationLevel.STANDARD)

    def _validate_strict(self, file_path: Path) -> "ValidationResult":
        """Strict validation - enhanced semantics and consistency."""
        return self._validate_with_fallback(file_path, ValidationLevel.STRICT)

    def _validate_research(self, file_path: Path) -> "ValidationResult":
        """Research-grade validation - comprehensive analysis."""
        return self._validate_with_fallback(file_path, ValidationLevel.RESEARCH)

    def _validate_round_trip(self, file_path: Path) -> "ValidationResult":
        """
        Round-trip validation - semantic preservation testing.

        Temporarily switches ``enable_round_trip_testing`` on for the shared
        GNN validator and restores the previous value afterwards, even when
        validation raises.
        """
        gnn_validator = self.validators.get('gnn')
        if not gnn_validator:
            return self._basic_fallback(
                file_path, ValidationLevel.ROUND_TRIP,
                "Round-trip testing unavailable - GNN validator missing",
            )

        # Enable round-trip testing for this validation only
        original_setting = gnn_validator.enable_round_trip_testing
        gnn_validator.enable_round_trip_testing = True
        try:
            return gnn_validator.validate_file(file_path, ValidationLevel.ROUND_TRIP)
        finally:
            # Restore original setting
            gnn_validator.enable_round_trip_testing = original_setting

    def _detect_file_format(self, file_path: Path) -> str:
        """
        Detect file format from the file extension.

        Only the (case-insensitive) suffix is inspected; the file content is
        not read.

        Returns:
            The format label for the suffix, or ``'unknown'``.
        """
        return self._FORMAT_BY_EXTENSION.get(file_path.suffix.lower(), 'unknown')

    def _validate_structured_format_basic(self, file_path: Path, result: "ValidationResult"):
        """
        Basic validation for structured formats (JSON, XML, YAML).

        Parses the file with the parser matching ``result.format_tested`` and
        records the outcome on *result*. A parse or read failure marks the
        result invalid; a missing PyYAML only produces a warning.
        """
        try:
            content = file_path.read_text(encoding='utf-8')
            format_type = result.format_tested

            if format_type == 'json':
                import json
                json.loads(content)
                result.suggestions.append("JSON format is valid")

            elif format_type == 'xml':
                import xml.etree.ElementTree as ET  # nosec B405 - GNN files are researcher-generated, not untrusted input
                ET.fromstring(content)  # nosec B314 - GNN files are researcher-generated, not untrusted input
                result.suggestions.append("XML format is valid")

            elif format_type == 'yaml':
                try:
                    import yaml
                    yaml.safe_load(content)
                    result.suggestions.append("YAML format is valid")
                except ImportError:
                    result.warnings.append("Cannot validate YAML - PyYAML not available")

        except Exception as e:
            result.errors.append(f"Format validation failed: {e}")
            result.is_valid = False

    def _validate_text_format_basic(self, file_path: Path, result: "ValidationResult"):
        """
        Basic validation for text formats (Markdown, GNN).

        Looks for well-known GNN marker names and counts heading lines that
        start with ``##``. Findings are recorded as suggestions, warnings and
        ``metadata['section_count']``; undecodable content only produces a
        warning.
        """
        try:
            content = file_path.read_text(encoding='utf-8')

            # Check for basic GNN markers
            found_markers = [marker for marker in self._GNN_MARKERS if marker in content]

            if found_markers:
                result.suggestions.append(f"GNN markers found: {found_markers}")
            else:
                result.warnings.append("No standard GNN markers found")

            # Check for section structure. Count heading *lines* rather than
            # raw '##' substrings, so a '####' heading is one section (not
            # two) and a '##' in the middle of a line is not a section.
            section_count = sum(
                1 for line in content.splitlines()
                if line.lstrip().startswith('##')
            )
            if section_count:
                result.metadata['section_count'] = section_count
                if section_count >= 3:
                    result.suggestions.append(f"Well-structured document ({section_count} sections)")
                else:
                    result.warnings.append("Document has few sections")

        except UnicodeDecodeError:
            result.warnings.append("File contains non-UTF-8 content")
        except Exception as e:
            result.errors.append(f"Text validation failed: {e}")
            result.is_valid = False

    def get_validation_summary(self, results: Dict[Path, "ValidationResult"]) -> Dict[str, Any]:
        """
        Get comprehensive validation summary.

        Args:
            results: Mapping of file paths to their validation results.

        Returns:
            Dictionary with file counts, the configured level, a per-format
            count, error and warning counts grouped by the text before the
            first ``:`` (``'General'`` when there is none), and
            ``success_rate`` as a percentage (``0.0`` for no results).
        """
        summary = {
            'total_files': len(results),
            'valid_files': 0,
            'invalid_files': 0,
            'validation_level': self.validation_level,
            'format_distribution': {},
            'error_summary': {},
            'warning_summary': {},
        }

        def _tally(bucket: Dict[str, int], message: str) -> None:
            # Group messages by their prefix up to the first colon.
            key = message.split(':')[0] if ':' in message else 'General'
            bucket[key] = bucket.get(key, 0) + 1

        for result in results.values():
            if result.is_valid:
                summary['valid_files'] += 1
            else:
                summary['invalid_files'] += 1

            # Format distribution
            fmt = result.format_tested or 'unknown'
            summary['format_distribution'][fmt] = summary['format_distribution'].get(fmt, 0) + 1

            # Error patterns
            for error in result.errors:
                _tally(summary['error_summary'], error)

            # Warning patterns
            for warning in result.warnings:
                _tally(summary['warning_summary'], warning)

        total = summary['total_files']
        summary['success_rate'] = (summary['valid_files'] / total) * 100 if total > 0 else 0.0

        return summary