88import re
99from abc import ABC , abstractmethod
1010from dataclasses import dataclass
11+ from functools import lru_cache
1112from typing import Dict , List , Optional , Union
1213from io import open
1314
@@ -66,6 +67,19 @@ class LegacyFileAnalyzer(FileAnalyzer):
6667
def analyze(self) -> FileAnalysisResult:
    """Analyze file using regex patterns for compatibility.

    The file's modification time is looked up first and handed to
    ``_cached_analyze``, so the mtime takes part in that method's cache key.

    Returns:
        The result of ``_cached_analyze`` for the current mtime, or an empty
        ``FileAnalysisResult`` when the modification time cannot be read.
    """
    try:
        modified_at = compiletools.wrappedos.getmtime(self.filepath)
    except OSError:
        # The mtime lookup failed (e.g. the file does not exist): report an
        # empty analysis directly instead of going through the cache.
        return FileAnalysisResult(
            text="",
            include_positions=[],
            magic_positions=[],
            directive_positions={},
            bytes_analyzed=0,
            was_truncated=False,
        )
    else:
        return self._cached_analyze(modified_at)
79+
80+ @lru_cache (maxsize = None )
81+ def _cached_analyze (self , mtime : float ) -> FileAnalysisResult :
82+ """Cached analysis implementation."""
6983 if not os .path .exists (self .filepath ):
7084 return FileAnalysisResult (
7185 text = "" , include_positions = [], magic_positions = [],
@@ -84,7 +98,7 @@ def analyze(self) -> FileAnalysisResult:
8498 else :
8599 text = f .read (self .max_read_size )
86100 bytes_analyzed = len (text .encode ('utf-8' ))
87- was_truncated = bytes_analyzed >= self . max_read_size
101+ was_truncated = not read_entire_file and file_size > bytes_analyzed
88102
89103 except (IOError , OSError ):
90104 return FileAnalysisResult (
@@ -131,10 +145,27 @@ def _find_magic_positions(self, text: str) -> List[int]:
131145 pattern = re .compile (r'^[\s]*//#([A-Za-z_][A-Za-z0-9_-]*)\s*=' , re .MULTILINE )
132146
133147 for match in pattern .finditer (text ):
134- positions .append (match .start ())
148+ pos = match .start ()
149+ # Check if this position is inside a multi-line block comment
150+ if not self ._is_inside_block_comment_legacy (text , pos ):
151+ positions .append (pos )
135152
136153 return positions
137154
155+ def _is_inside_block_comment_legacy (self , text : str , pos : int ) -> bool :
156+ """Check if position is inside a multi-line block comment (Legacy version)."""
157+ # Find the most recent /* and */ before this position
158+ last_block_start = text .rfind ('/*' , 0 , pos )
159+ if last_block_start != - 1 :
160+ # Found a /* before this position
161+ # Check if there's a closing */ between the /* and our position
162+ last_block_end = text .rfind ('*/' , last_block_start , pos )
163+ if last_block_end == - 1 :
164+ # No closing */ found, so we're inside the block comment
165+ return True
166+
167+ return False
168+
138169 def _find_directive_positions (self , text : str ) -> Dict [str , List [int ]]:
139170 """Find positions of all preprocessor directives by type."""
140171 directive_positions = {}
@@ -167,6 +198,19 @@ def __init__(self, filepath: str, max_read_size: int = 0, verbose: int = 0):
167198
def analyze(self) -> FileAnalysisResult:
    """Analyze file using StringZilla SIMD optimization.

    Reads the file's modification time and forwards it to
    ``_cached_analyze`` so that the mtime is part of the cached call's
    arguments.

    Returns:
        Whatever ``_cached_analyze`` returns for the current mtime, or an
        empty ``FileAnalysisResult`` if the modification time lookup raises
        ``OSError``.
    """
    try:
        file_mtime = compiletools.wrappedos.getmtime(self.filepath)
    except OSError:
        # Could not stat the file (e.g. it does not exist): hand back an
        # empty result without consulting the cache.
        empty_result = FileAnalysisResult(
            text="",
            include_positions=[],
            magic_positions=[],
            directive_positions={},
            bytes_analyzed=0,
            was_truncated=False,
        )
        return empty_result

    return self._cached_analyze(file_mtime)
210+
211+ @lru_cache (maxsize = None )
212+ def _cached_analyze (self , mtime : float ) -> FileAnalysisResult :
213+ """Cached analysis implementation."""
170214 if not self ._stringzilla_available :
171215 raise RuntimeError ("StringZilla not available" )
172216
@@ -183,27 +227,28 @@ def analyze(self) -> FileAnalysisResult:
183227 read_entire_file = self ._should_read_entire_file (file_size )
184228
185229 if read_entire_file :
186- # Memory-map entire file
187- str_file = Str (File (self .filepath ))
188- text = str (str_file )
230+ # Memory-map entire file and keep as Str for SIMD operations
231+ str_text = Str (File (self .filepath ))
232+ text = str (str_text ) # Convert to string only for return value
189233 bytes_analyzed = len (text .encode ('utf-8' ))
190234 was_truncated = False
191235 else :
192236 # Read limited amount
193237 with open (self .filepath , encoding = "utf-8" , errors = "ignore" ) as f :
194238 text = f .read (self .max_read_size )
195239 bytes_analyzed = len (text .encode ('utf-8' ))
196- was_truncated = bytes_analyzed >= self .max_read_size
240+ was_truncated = not read_entire_file and file_size > bytes_analyzed
241+ # Create Str for limited read case
242+ str_text = Str (text )
197243
198244 except (IOError , OSError ):
199245 return FileAnalysisResult (
200246 text = "" , include_positions = [], magic_positions = [],
201247 directive_positions = {}, bytes_analyzed = 0 , was_truncated = False
202248 )
203249
204- # Use StringZilla for fast pattern detection on raw text
250+ # Use StringZilla SIMD operations directly on str_text
205251 # Note: Conditional compilation should be handled by the caller
206- str_text = Str (text )
207252 include_positions = self ._find_include_positions_simd (str_text )
208253 magic_positions = self ._find_magic_positions_simd (str_text )
209254 directive_positions = self ._find_directive_positions_simd (str_text )
@@ -228,18 +273,42 @@ def _find_include_positions_simd(self, str_text) -> List[int]:
228273 if pos == - 1 :
229274 break
230275
231- # Verify this is a real include (not in comment)
232- line_start = str_text .rfind ('\n ' , 0 , pos ) + 1
233- line_prefix = str (str_text [line_start :pos ]).strip ()
234-
235- # Check if line starts with // or is inside /* */
236- if not line_prefix .startswith ('//' ) and '/*' not in line_prefix :
276+ # Check if this #include is inside a comment
277+ if not self ._is_position_commented (str_text , pos ):
237278 positions .append (pos )
238279
239280 start = pos + 8 # len('#include')
240281
241282 return positions
242283
284+ def _is_position_commented (self , str_text , pos : int ) -> bool :
285+ """Check if position is inside a comment (single-line or multi-line block)."""
286+ # Check for single-line comment on current line
287+ line_start = str_text .rfind ('\n ' , 0 , pos ) + 1
288+ # Use StringZilla slice directly for efficiency
289+ line_prefix_slice = str_text [line_start :pos ]
290+
291+ # Look for // in the line prefix using SIMD
292+ comment_pos = line_prefix_slice .find ('//' )
293+ if comment_pos != - 1 :
294+ # Check if there's only whitespace before //
295+ before_comment = str (line_prefix_slice [:comment_pos ]).strip ()
296+ if before_comment == '' :
297+ return True
298+
299+ # Check for multi-line block comment
300+ # Find the most recent /* and */ before this position
301+ last_block_start = str_text .rfind ('/*' , 0 , pos )
302+ if last_block_start != - 1 :
303+ # Found a /* before this position
304+ # Check if there's a closing */ between the /* and our position
305+ last_block_end = str_text .rfind ('*/' , last_block_start , pos )
306+ if last_block_end == - 1 :
307+ # No closing */ found, so we're inside the block comment
308+ return True
309+
310+ return False
311+
243312 def _find_magic_positions_simd (self , str_text ) -> List [int ]:
244313 """Find positions of all //#KEY=value patterns using StringZilla."""
245314 positions = []
@@ -253,29 +322,49 @@ def _find_magic_positions_simd(self, str_text) -> List[int]:
253322
254323 # Check if this //# is at start of line (after optional whitespace)
255324 line_start = str_text .rfind ('\n ' , 0 , pos ) + 1
256- line_prefix = str ( str_text [line_start :pos ]). strip ()
325+ line_prefix_slice = str_text [line_start :pos ]
257326
258- if line_prefix == '' : # Only whitespace before //#
259- # Look for KEY=value pattern after //#
260- after_hash = pos + 3
261- line_end = str_text .find ('\n ' , after_hash )
262- if line_end == - 1 :
263- line_end = len (str_text )
264-
265- line_content = str (str_text [after_hash :line_end ])
266- if '=' in line_content :
267- # Verify it matches KEY=value pattern with valid identifier
268- key_part = line_content .split ('=' )[0 ].strip ()
269- # Key must start with letter or underscore, contain only alphanumeric and underscores/dashes
270- if (key_part and
271- (key_part [0 ].isalpha () or key_part [0 ] == '_' ) and
272- all (c .isalnum () or c in '_-' for c in key_part )):
273- positions .append (pos )
327+ # Use StringZilla to check if only whitespace (convert only when necessary)
328+ if str (line_prefix_slice ).strip () == '' : # Only whitespace before //#
329+ # Check if we're inside a block comment (though //# starting a line is usually not)
330+ if not self ._is_inside_block_comment (str_text , pos ):
331+ # Look for KEY=value pattern after //#
332+ after_hash = pos + 3
333+ line_end = str_text .find ('\n ' , after_hash )
334+ if line_end == - 1 :
335+ line_end = len (str_text )
336+
337+ # Use StringZilla slice and find = using SIMD
338+ line_content_slice = str_text [after_hash :line_end ]
339+ equals_pos = line_content_slice .find ('=' )
340+ if equals_pos != - 1 :
341+ # Extract key part using StringZilla slice
342+ key_slice = line_content_slice [:equals_pos ]
343+ key_part = str (key_slice ).strip ()
344+ # Key must start with letter or underscore, contain only alphanumeric and underscores/dashes
345+ if (key_part and
346+ (key_part [0 ].isalpha () or key_part [0 ] == '_' ) and
347+ all (c .isalnum () or c in '_-' for c in key_part )):
348+ positions .append (pos )
274349
275350 start = pos + 3 # len('//#')
276351
277352 return positions
278353
354+ def _is_inside_block_comment (self , str_text , pos : int ) -> bool :
355+ """Check if position is inside a multi-line block comment."""
356+ # Find the most recent /* and */ before this position
357+ last_block_start = str_text .rfind ('/*' , 0 , pos )
358+ if last_block_start != - 1 :
359+ # Found a /* before this position
360+ # Check if there's a closing */ between the /* and our position
361+ last_block_end = str_text .rfind ('*/' , last_block_start , pos )
362+ if last_block_end == - 1 :
363+ # No closing */ found, so we're inside the block comment
364+ return True
365+
366+ return False
367+
279368 def _find_directive_positions_simd (self , str_text ) -> Dict [str , List [int ]]:
280369 """Find positions of all preprocessor directives using StringZilla."""
281370 directive_positions = {}
@@ -329,4 +418,6 @@ def create_file_analyzer(filepath: str, max_read_size: int = 0, verbose: int = 0
329418 except ImportError :
330419 if verbose >= 3 :
331420 print ("StringZilla not available, using legacy file analyzer" )
332- return LegacyFileAnalyzer (filepath , max_read_size , verbose )
421+ return LegacyFileAnalyzer (filepath , max_read_size , verbose )
422+
423+
0 commit comments