Skip to content

Commit bf43078

Browse files
author
DrGeoff
committed
Simplify FileAnalyzer
1 parent 4990eb1 commit bf43078

File tree

7 files changed

+227
-49
lines changed

7 files changed

+227
-49
lines changed

src/compiletools/cake.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ def _copyexes(self):
159159
def _callmakefile(self):
160160
makefile_creator = compiletools.makefile.MakefileCreator(self.args, self.hunter)
161161
makefilename = makefile_creator.create()
162-
compiletools.wrappedos.makedirs(self.namer.executable_dir())
162+
os.makedirs(self.namer.executable_dir(), exist_ok=True)
163163
cmd = ["make"]
164164
if self.args.verbose <= 1:
165165
cmd.append("-s")

src/compiletools/diskcache.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def __init__(self, cache_identifier, deps_mode=False, magic_mode=False):
4646
self.magic_mode = magic_mode
4747
self.cachedir = compiletools.dirnamer.user_cache_dir(appname="ct")
4848
if self.cachedir != "None":
49-
compiletools.wrappedos.makedirs(self.cachedir)
49+
os.makedirs(self.cachedir, exist_ok=True)
5050

5151
# Keep a copy of the cachefile in memory to reduce disk IO
5252
# Call it "cache" to match the memoizer "cache" (for ease of clearing)
@@ -187,7 +187,7 @@ def _refresh_cache(self, filename, cachefile, func, *args):
187187
# that we are currently interested in.
188188
newargs = args[:-1] + (filename,)
189189
result = func(*newargs)
190-
compiletools.wrappedos.makedirs(compiletools.wrappedos.dirname(cachefile))
190+
os.makedirs(compiletools.wrappedos.dirname(cachefile), exist_ok=True)
191191
with open(cachefile, mode="wb") as cf:
192192
pickle.dump(result, cf)
193193
self.cache[cachefile] = result

src/compiletools/file_analyzer.py

Lines changed: 123 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import re
99
from abc import ABC, abstractmethod
1010
from dataclasses import dataclass
11+
from functools import lru_cache
1112
from typing import Dict, List, Optional, Union
1213
from io import open
1314

@@ -66,6 +67,19 @@ class LegacyFileAnalyzer(FileAnalyzer):
6667

6768
def analyze(self) -> FileAnalysisResult:
6869
"""Analyze file using regex patterns for compatibility."""
70+
try:
71+
mtime = compiletools.wrappedos.getmtime(self.filepath)
72+
except OSError:
73+
# File doesn't exist, return empty result directly
74+
return FileAnalysisResult(
75+
text="", include_positions=[], magic_positions=[],
76+
directive_positions={}, bytes_analyzed=0, was_truncated=False
77+
)
78+
return self._cached_analyze(mtime)
79+
80+
@lru_cache(maxsize=None)
81+
def _cached_analyze(self, mtime: float) -> FileAnalysisResult:
82+
"""Cached analysis implementation."""
6983
if not os.path.exists(self.filepath):
7084
return FileAnalysisResult(
7185
text="", include_positions=[], magic_positions=[],
@@ -84,7 +98,7 @@ def analyze(self) -> FileAnalysisResult:
8498
else:
8599
text = f.read(self.max_read_size)
86100
bytes_analyzed = len(text.encode('utf-8'))
87-
was_truncated = bytes_analyzed >= self.max_read_size
101+
was_truncated = not read_entire_file and file_size > bytes_analyzed
88102

89103
except (IOError, OSError):
90104
return FileAnalysisResult(
@@ -131,10 +145,27 @@ def _find_magic_positions(self, text: str) -> List[int]:
131145
pattern = re.compile(r'^[\s]*//#([A-Za-z_][A-Za-z0-9_-]*)\s*=', re.MULTILINE)
132146

133147
for match in pattern.finditer(text):
134-
positions.append(match.start())
148+
pos = match.start()
149+
# Check if this position is inside a multi-line block comment
150+
if not self._is_inside_block_comment_legacy(text, pos):
151+
positions.append(pos)
135152

136153
return positions
137154

155+
def _is_inside_block_comment_legacy(self, text: str, pos: int) -> bool:
156+
"""Check if position is inside a multi-line block comment (Legacy version)."""
157+
# Find the most recent /* and */ before this position
158+
last_block_start = text.rfind('/*', 0, pos)
159+
if last_block_start != -1:
160+
# Found a /* before this position
161+
# Check if there's a closing */ between the /* and our position
162+
last_block_end = text.rfind('*/', last_block_start, pos)
163+
if last_block_end == -1:
164+
# No closing */ found, so we're inside the block comment
165+
return True
166+
167+
return False
168+
138169
def _find_directive_positions(self, text: str) -> Dict[str, List[int]]:
139170
"""Find positions of all preprocessor directives by type."""
140171
directive_positions = {}
@@ -167,6 +198,19 @@ def __init__(self, filepath: str, max_read_size: int = 0, verbose: int = 0):
167198

168199
def analyze(self) -> FileAnalysisResult:
169200
"""Analyze file using StringZilla SIMD optimization."""
201+
try:
202+
mtime = compiletools.wrappedos.getmtime(self.filepath)
203+
except OSError:
204+
# File doesn't exist, return empty result directly
205+
return FileAnalysisResult(
206+
text="", include_positions=[], magic_positions=[],
207+
directive_positions={}, bytes_analyzed=0, was_truncated=False
208+
)
209+
return self._cached_analyze(mtime)
210+
211+
@lru_cache(maxsize=None)
212+
def _cached_analyze(self, mtime: float) -> FileAnalysisResult:
213+
"""Cached analysis implementation."""
170214
if not self._stringzilla_available:
171215
raise RuntimeError("StringZilla not available")
172216

@@ -183,27 +227,28 @@ def analyze(self) -> FileAnalysisResult:
183227
read_entire_file = self._should_read_entire_file(file_size)
184228

185229
if read_entire_file:
186-
# Memory-map entire file
187-
str_file = Str(File(self.filepath))
188-
text = str(str_file)
230+
# Memory-map entire file and keep as Str for SIMD operations
231+
str_text = Str(File(self.filepath))
232+
text = str(str_text) # Convert to string only for return value
189233
bytes_analyzed = len(text.encode('utf-8'))
190234
was_truncated = False
191235
else:
192236
# Read limited amount
193237
with open(self.filepath, encoding="utf-8", errors="ignore") as f:
194238
text = f.read(self.max_read_size)
195239
bytes_analyzed = len(text.encode('utf-8'))
196-
was_truncated = bytes_analyzed >= self.max_read_size
240+
was_truncated = not read_entire_file and file_size > bytes_analyzed
241+
# Create Str for limited read case
242+
str_text = Str(text)
197243

198244
except (IOError, OSError):
199245
return FileAnalysisResult(
200246
text="", include_positions=[], magic_positions=[],
201247
directive_positions={}, bytes_analyzed=0, was_truncated=False
202248
)
203249

204-
# Use StringZilla for fast pattern detection on raw text
250+
# Use StringZilla SIMD operations directly on str_text
205251
# Note: Conditional compilation should be handled by the caller
206-
str_text = Str(text)
207252
include_positions = self._find_include_positions_simd(str_text)
208253
magic_positions = self._find_magic_positions_simd(str_text)
209254
directive_positions = self._find_directive_positions_simd(str_text)
@@ -228,18 +273,42 @@ def _find_include_positions_simd(self, str_text) -> List[int]:
228273
if pos == -1:
229274
break
230275

231-
# Verify this is a real include (not in comment)
232-
line_start = str_text.rfind('\n', 0, pos) + 1
233-
line_prefix = str(str_text[line_start:pos]).strip()
234-
235-
# Check if line starts with // or is inside /* */
236-
if not line_prefix.startswith('//') and '/*' not in line_prefix:
276+
# Check if this #include is inside a comment
277+
if not self._is_position_commented(str_text, pos):
237278
positions.append(pos)
238279

239280
start = pos + 8 # len('#include')
240281

241282
return positions
242283

284+
def _is_position_commented(self, str_text, pos: int) -> bool:
285+
"""Check if position is inside a comment (single-line or multi-line block)."""
286+
# Check for single-line comment on current line
287+
line_start = str_text.rfind('\n', 0, pos) + 1
288+
# Use StringZilla slice directly for efficiency
289+
line_prefix_slice = str_text[line_start:pos]
290+
291+
# Look for // in the line prefix using SIMD
292+
comment_pos = line_prefix_slice.find('//')
293+
if comment_pos != -1:
294+
# Check if there's only whitespace before //
295+
before_comment = str(line_prefix_slice[:comment_pos]).strip()
296+
if before_comment == '':
297+
return True
298+
299+
# Check for multi-line block comment
300+
# Find the most recent /* and */ before this position
301+
last_block_start = str_text.rfind('/*', 0, pos)
302+
if last_block_start != -1:
303+
# Found a /* before this position
304+
# Check if there's a closing */ between the /* and our position
305+
last_block_end = str_text.rfind('*/', last_block_start, pos)
306+
if last_block_end == -1:
307+
# No closing */ found, so we're inside the block comment
308+
return True
309+
310+
return False
311+
243312
def _find_magic_positions_simd(self, str_text) -> List[int]:
244313
"""Find positions of all //#KEY=value patterns using StringZilla."""
245314
positions = []
@@ -253,29 +322,49 @@ def _find_magic_positions_simd(self, str_text) -> List[int]:
253322

254323
# Check if this //# is at start of line (after optional whitespace)
255324
line_start = str_text.rfind('\n', 0, pos) + 1
256-
line_prefix = str(str_text[line_start:pos]).strip()
325+
line_prefix_slice = str_text[line_start:pos]
257326

258-
if line_prefix == '': # Only whitespace before //#
259-
# Look for KEY=value pattern after //#
260-
after_hash = pos + 3
261-
line_end = str_text.find('\n', after_hash)
262-
if line_end == -1:
263-
line_end = len(str_text)
264-
265-
line_content = str(str_text[after_hash:line_end])
266-
if '=' in line_content:
267-
# Verify it matches KEY=value pattern with valid identifier
268-
key_part = line_content.split('=')[0].strip()
269-
# Key must start with letter or underscore, contain only alphanumeric and underscores/dashes
270-
if (key_part and
271-
(key_part[0].isalpha() or key_part[0] == '_') and
272-
all(c.isalnum() or c in '_-' for c in key_part)):
273-
positions.append(pos)
327+
# Use StringZilla to check if only whitespace (convert only when necessary)
328+
if str(line_prefix_slice).strip() == '': # Only whitespace before //#
329+
# Check if we're inside a block comment (though //# starting a line is usually not)
330+
if not self._is_inside_block_comment(str_text, pos):
331+
# Look for KEY=value pattern after //#
332+
after_hash = pos + 3
333+
line_end = str_text.find('\n', after_hash)
334+
if line_end == -1:
335+
line_end = len(str_text)
336+
337+
# Use StringZilla slice and find = using SIMD
338+
line_content_slice = str_text[after_hash:line_end]
339+
equals_pos = line_content_slice.find('=')
340+
if equals_pos != -1:
341+
# Extract key part using StringZilla slice
342+
key_slice = line_content_slice[:equals_pos]
343+
key_part = str(key_slice).strip()
344+
# Key must start with letter or underscore, contain only alphanumeric and underscores/dashes
345+
if (key_part and
346+
(key_part[0].isalpha() or key_part[0] == '_') and
347+
all(c.isalnum() or c in '_-' for c in key_part)):
348+
positions.append(pos)
274349

275350
start = pos + 3 # len('//#')
276351

277352
return positions
278353

354+
def _is_inside_block_comment(self, str_text, pos: int) -> bool:
355+
"""Check if position is inside a multi-line block comment."""
356+
# Find the most recent /* and */ before this position
357+
last_block_start = str_text.rfind('/*', 0, pos)
358+
if last_block_start != -1:
359+
# Found a /* before this position
360+
# Check if there's a closing */ between the /* and our position
361+
last_block_end = str_text.rfind('*/', last_block_start, pos)
362+
if last_block_end == -1:
363+
# No closing */ found, so we're inside the block comment
364+
return True
365+
366+
return False
367+
279368
def _find_directive_positions_simd(self, str_text) -> Dict[str, List[int]]:
280369
"""Find positions of all preprocessor directives using StringZilla."""
281370
directive_positions = {}
@@ -329,4 +418,6 @@ def create_file_analyzer(filepath: str, max_read_size: int = 0, verbose: int = 0
329418
except ImportError:
330419
if verbose >= 3:
331420
print("StringZilla not available, using legacy file analyzer")
332-
return LegacyFileAnalyzer(filepath, max_read_size, verbose)
421+
return LegacyFileAnalyzer(filepath, max_read_size, verbose)
422+
423+

src/compiletools/makefile.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -426,7 +426,7 @@ def create(self):
426426

427427
# Find the realpaths of the given filenames (to avoid this being
428428
# duplicated many times)
429-
compiletools.wrappedos.makedirs(self.namer.executable_dir())
429+
os.makedirs(self.namer.executable_dir(), exist_ok=True)
430430
rule = self._create_all_rule()
431431
self.rules[rule.target] = rule
432432
buildoutputs = self._gather_build_outputs()

src/compiletools/samples/lotsofmagic/lotsofmagic.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,16 @@
1515
//#INVALID PATTERN (no =)
1616
// //#123=invalid_start_with_number (starts with number - invalid identifier)
1717

18+
/*
19+
Multi-line comment test cases:
20+
#include <stdlib.h>
21+
The above include should NOT be detected as it's inside a block comment.
22+
Also test magic flags in comments:
23+
//#COMMENTED_FLAG=should_not_be_detected
24+
*/
25+
26+
/* Single line block comment with #include <math.h> should also be ignored */
27+
1828
int main(int argc, char* argv[])
1929
{
2030
std::cout << std::abs(-2) << "\n";

0 commit comments

Comments
 (0)