Simplify FileAnalyzer

DrGeoff · DrGeoff · commit bf43078b84e3 · 2025-08-15T22:50:54.000-05:00
diff --git a/src/compiletools/cake.py b/src/compiletools/cake.py
@@ -159,7 +159,7 @@ def _copyexes(self):
     def _callmakefile(self):
         makefile_creator = compiletools.makefile.MakefileCreator(self.args, self.hunter)
         makefilename = makefile_creator.create()
-        compiletools.wrappedos.makedirs(self.namer.executable_dir())
+        os.makedirs(self.namer.executable_dir(), exist_ok=True)
         cmd = ["make"]
         if self.args.verbose <= 1:
             cmd.append("-s")
diff --git a/src/compiletools/diskcache.py b/src/compiletools/diskcache.py
@@ -46,7 +46,7 @@ def __init__(self, cache_identifier, deps_mode=False, magic_mode=False):
         self.magic_mode = magic_mode
         self.cachedir = compiletools.dirnamer.user_cache_dir(appname="ct")
         if self.cachedir != "None":
-            compiletools.wrappedos.makedirs(self.cachedir)
+            os.makedirs(self.cachedir, exist_ok=True)
 
         # Keep a copy of the cachefile in memory to reduce disk IO
         # Call it "cache" to match the memoizer "cache" (for ease of clearing)
@@ -187,7 +187,7 @@ def _refresh_cache(self, filename, cachefile, func, *args):
             # that we are currently interested in.
             newargs = args[:-1] + (filename,)
             result = func(*newargs)
-            compiletools.wrappedos.makedirs(compiletools.wrappedos.dirname(cachefile))
+            os.makedirs(compiletools.wrappedos.dirname(cachefile), exist_ok=True)
             with open(cachefile, mode="wb") as cf:
                 pickle.dump(result, cf)
             self.cache[cachefile] = result
diff --git a/src/compiletools/file_analyzer.py b/src/compiletools/file_analyzer.py
@@ -8,6 +8,7 @@
 import re
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from functools import lru_cache
 from typing import Dict, List, Optional, Union
 from io import open
 
@@ -66,6 +67,19 @@ class LegacyFileAnalyzer(FileAnalyzer):
     
     def analyze(self) -> FileAnalysisResult:
         """Analyze file using regex patterns for compatibility."""
+        try:
+            mtime = compiletools.wrappedos.getmtime(self.filepath)
+        except OSError:
+            # getmtime failed (file missing or inaccessible); return an empty result directly
+            return FileAnalysisResult(
+                text="", include_positions=[], magic_positions=[],
+                directive_positions={}, bytes_analyzed=0, was_truncated=False
+            )
+        return self._cached_analyze(mtime)
+    
+    @lru_cache(maxsize=None)
+    def _cached_analyze(self, mtime: float) -> FileAnalysisResult:
+        """Cached analysis implementation."""
         if not os.path.exists(self.filepath):
             return FileAnalysisResult(
                 text="", include_positions=[], magic_positions=[],
@@ -84,7 +98,7 @@ def analyze(self) -> FileAnalysisResult:
                 else:
                     text = f.read(self.max_read_size)
                     bytes_analyzed = len(text.encode('utf-8'))
-                    was_truncated = bytes_analyzed >= self.max_read_size
+                    was_truncated = not read_entire_file and file_size > bytes_analyzed
                     
         except (IOError, OSError):
             return FileAnalysisResult(
@@ -131,10 +145,27 @@ def _find_magic_positions(self, text: str) -> List[int]:
         pattern = re.compile(r'^[\s]*//#([A-Za-z_][A-Za-z0-9_-]*)\s*=', re.MULTILINE)
         
         for match in pattern.finditer(text):
-            positions.append(match.start())
+            pos = match.start()
+            # Check if this position is inside a multi-line block comment
+            if not self._is_inside_block_comment_legacy(text, pos):
+                positions.append(pos)
             
         return positions
         
+    def _is_inside_block_comment_legacy(self, text: str, pos: int) -> bool:
+        """Check if position is inside a multi-line block comment (Legacy version)."""
+        # Find the most recent /* before this position
+        last_block_start = text.rfind('/*', 0, pos)
+        if last_block_start != -1:
+            # Found a /* before this position
+            # Check if there's a closing */ between the /* and our position
+            last_block_end = text.rfind('*/', last_block_start, pos)
+            if last_block_end == -1:
+                # No closing */ found, so we're inside the block comment
+                return True
+                
+        return False
+        
     def _find_directive_positions(self, text: str) -> Dict[str, List[int]]:
         """Find positions of all preprocessor directives by type."""
         directive_positions = {}
@@ -167,6 +198,19 @@ def __init__(self, filepath: str, max_read_size: int = 0, verbose: int = 0):
     
     def analyze(self) -> FileAnalysisResult:
         """Analyze file using StringZilla SIMD optimization."""
+        try:
+            mtime = compiletools.wrappedos.getmtime(self.filepath)
+        except OSError:
+            # getmtime failed (file missing or inaccessible); return an empty result directly
+            return FileAnalysisResult(
+                text="", include_positions=[], magic_positions=[],
+                directive_positions={}, bytes_analyzed=0, was_truncated=False
+            )
+        return self._cached_analyze(mtime)
+    
+    @lru_cache(maxsize=None)
+    def _cached_analyze(self, mtime: float) -> FileAnalysisResult:
+        """Cached analysis implementation."""
         if not self._stringzilla_available:
             raise RuntimeError("StringZilla not available")
             
@@ -183,27 +227,28 @@ def analyze(self) -> FileAnalysisResult:
             read_entire_file = self._should_read_entire_file(file_size)
             
             if read_entire_file:
-                # Memory-map entire file
-                str_file = Str(File(self.filepath))
-                text = str(str_file)
+                # Memory-map entire file and keep as Str for SIMD operations
+                str_text = Str(File(self.filepath))
+                text = str(str_text)  # Convert to string only for return value
                 bytes_analyzed = len(text.encode('utf-8'))
                 was_truncated = False
             else:
                 # Read limited amount
                 with open(self.filepath, encoding="utf-8", errors="ignore") as f:
                     text = f.read(self.max_read_size)
                     bytes_analyzed = len(text.encode('utf-8'))
-                    was_truncated = bytes_analyzed >= self.max_read_size
+                    was_truncated = not read_entire_file and file_size > bytes_analyzed
+                # Create Str for limited read case
+                str_text = Str(text)
                     
         except (IOError, OSError):
             return FileAnalysisResult(
                 text="", include_positions=[], magic_positions=[],
                 directive_positions={}, bytes_analyzed=0, was_truncated=False
             )
             
-        # Use StringZilla for fast pattern detection on raw text
+        # Use StringZilla SIMD operations directly on str_text
         # Note: Conditional compilation should be handled by the caller
-        str_text = Str(text)
         include_positions = self._find_include_positions_simd(str_text)
         magic_positions = self._find_magic_positions_simd(str_text)
         directive_positions = self._find_directive_positions_simd(str_text)
@@ -228,18 +273,42 @@ def _find_include_positions_simd(self, str_text) -> List[int]:
             if pos == -1:
                 break
                 
-            # Verify this is a real include (not in comment)
-            line_start = str_text.rfind('\n', 0, pos) + 1
-            line_prefix = str(str_text[line_start:pos]).strip()
-            
-            # Check if line starts with // or is inside /* */
-            if not line_prefix.startswith('//') and '/*' not in line_prefix:
+            # Check if this #include is inside a comment
+            if not self._is_position_commented(str_text, pos):
                 positions.append(pos)
                 
             start = pos + 8  # len('#include')
             
         return positions
         
+    def _is_position_commented(self, str_text, pos: int) -> bool:
+        """Check if position is inside a comment (single-line or multi-line block)."""
+        # Check for single-line comment on current line
+        line_start = str_text.rfind('\n', 0, pos) + 1
+        # Use StringZilla slice directly for efficiency
+        line_prefix_slice = str_text[line_start:pos]
+        
+        # Look for // in the line prefix using SIMD
+        comment_pos = line_prefix_slice.find('//')
+        if comment_pos != -1:
+            # Check if there's only whitespace before //
+            before_comment = str(line_prefix_slice[:comment_pos]).strip()
+            if before_comment == '':
+                return True
+            
+        # Check for multi-line block comment
+        # Find the most recent /* before this position
+        last_block_start = str_text.rfind('/*', 0, pos)
+        if last_block_start != -1:
+            # Found a /* before this position
+            # Check if there's a closing */ between the /* and our position
+            last_block_end = str_text.rfind('*/', last_block_start, pos)
+            if last_block_end == -1:
+                # No closing */ found, so we're inside the block comment
+                return True
+                
+        return False
+        
     def _find_magic_positions_simd(self, str_text) -> List[int]:
         """Find positions of all //#KEY=value patterns using StringZilla."""
         positions = []
@@ -253,29 +322,49 @@ def _find_magic_positions_simd(self, str_text) -> List[int]:
                 
             # Check if this //# is at start of line (after optional whitespace)
             line_start = str_text.rfind('\n', 0, pos) + 1
-            line_prefix = str(str_text[line_start:pos]).strip()
+            line_prefix_slice = str_text[line_start:pos]
             
-            if line_prefix == '':  # Only whitespace before //#
-                # Look for KEY=value pattern after //#
-                after_hash = pos + 3
-                line_end = str_text.find('\n', after_hash)
-                if line_end == -1:
-                    line_end = len(str_text)
-                    
-                line_content = str(str_text[after_hash:line_end])
-                if '=' in line_content:
-                    # Verify it matches KEY=value pattern with valid identifier
-                    key_part = line_content.split('=')[0].strip()
-                    # Key must start with letter or underscore, contain only alphanumeric and underscores/dashes
-                    if (key_part and 
-                        (key_part[0].isalpha() or key_part[0] == '_') and 
-                        all(c.isalnum() or c in '_-' for c in key_part)):
-                        positions.append(pos)
+            # Convert the prefix slice to str so strip() can check that it is only whitespace
+            if str(line_prefix_slice).strip() == '':  # Only whitespace before //#
+                # Skip matches inside a block comment (unusual for a line starting with //#, but possible)
+                if not self._is_inside_block_comment(str_text, pos):
+                    # Look for KEY=value pattern after //#
+                    after_hash = pos + 3
+                    line_end = str_text.find('\n', after_hash)
+                    if line_end == -1:
+                        line_end = len(str_text)
+                        
+                    # Use StringZilla slice and find = using SIMD
+                    line_content_slice = str_text[after_hash:line_end]
+                    equals_pos = line_content_slice.find('=')
+                    if equals_pos != -1:
+                        # Extract key part using StringZilla slice
+                        key_slice = line_content_slice[:equals_pos]
+                        key_part = str(key_slice).strip()
+                        # Key must start with letter or underscore, contain only alphanumeric and underscores/dashes
+                        if (key_part and 
+                            (key_part[0].isalpha() or key_part[0] == '_') and 
+                            all(c.isalnum() or c in '_-' for c in key_part)):
+                            positions.append(pos)
                     
             start = pos + 3  # len('//#')
             
         return positions
         
+    def _is_inside_block_comment(self, str_text, pos: int) -> bool:
+        """Check if position is inside a multi-line block comment."""
+        # Find the most recent /* before this position
+        last_block_start = str_text.rfind('/*', 0, pos)
+        if last_block_start != -1:
+            # Found a /* before this position
+            # Check if there's a closing */ between the /* and our position
+            last_block_end = str_text.rfind('*/', last_block_start, pos)
+            if last_block_end == -1:
+                # No closing */ found, so we're inside the block comment
+                return True
+                
+        return False
+        
     def _find_directive_positions_simd(self, str_text) -> Dict[str, List[int]]:
         """Find positions of all preprocessor directives using StringZilla."""
         directive_positions = {}
@@ -329,4 +418,6 @@ def create_file_analyzer(filepath: str, max_read_size: int = 0, verbose: int = 0
     except ImportError:
         if verbose >= 3:
             print("StringZilla not available, using legacy file analyzer")
-        return LegacyFileAnalyzer(filepath, max_read_size, verbose)
+        return LegacyFileAnalyzer(filepath, max_read_size, verbose)
+
+
diff --git a/src/compiletools/makefile.py b/src/compiletools/makefile.py
@@ -426,7 +426,7 @@ def create(self):
 
         # Find the realpaths of the given filenames (to avoid this being
         # duplicated many times)
-        compiletools.wrappedos.makedirs(self.namer.executable_dir())
+        os.makedirs(self.namer.executable_dir(), exist_ok=True)
         rule = self._create_all_rule()
         self.rules[rule.target] = rule
         buildoutputs = self._gather_build_outputs()
diff --git a/src/compiletools/samples/lotsofmagic/lotsofmagic.cpp b/src/compiletools/samples/lotsofmagic/lotsofmagic.cpp
@@ -15,6 +15,16 @@
 //#INVALID PATTERN (no =)
 // //#123=invalid_start_with_number (starts with number - invalid identifier)
 
+/*
+   Multi-line comment test cases:
+   #include <stdlib.h>
+   The above include should NOT be detected as it's inside a block comment.
+   Also test magic flags in comments:
+   //#COMMENTED_FLAG=should_not_be_detected
+*/
+
+/* Single line block comment with #include <math.h> should also be ignored */
+
 int main(int argc, char* argv[])
 {
     std::cout << std::abs(-2) << "\n";
diff --git a/src/compiletools/test_file_analyzer.py b/src/compiletools/test_file_analyzer.py
diff --git a/src/compiletools/wrappedos.py b/src/compiletools/wrappedos.py