|
# Optional detector filter: when the query supplies a "detectors" list, keep only
# the @DETECTORS entries whose {name} passes listContains() against that list
# (presumably a membership test -- listContains is defined outside this view),
# and abort if nothing is left.
49 | 49 | if (defined $query->{detectors}) {
|
50 | 50 | @DETECTORS = grep { &listContains($query->{detectors}, $_->{name}) } @DETECTORS;
|
51 | 51 | if (not @DETECTORS) {
|
52 |
| - die "Error, no detectors matched names <@{$query->{detectors}}>\n"; |
| 52 | + die "Error, no available detectors matched names <@{$query->{detectors}}>\n"; |
53 | 53 | }
|
54 | 54 | }
|
55 | 55 | my @detectorNames = map { $_->{name} } @DETECTORS;
|
56 | 56 | &log("Using detectors <@detectorNames>");
|
57 | 57 |
|
# Pattern variants start from the defaults returned by getPatternVariants().
# A "patternVariants" list in the query can only narrow those defaults (grep),
# never add to them.
# NOTE(review): unlike the detector filter above, an empty result is not treated
# as an error here -- confirm that running with no variants is intended.
| 58 | +my @PATTERN_VARIANTS = &getPatternVariants(); |
| 59 | +if (defined $query->{patternVariants}) { |
| 60 | + @PATTERN_VARIANTS = grep { &listContains($query->{patternVariants}, $_) } @PATTERN_VARIANTS; |
| 61 | +} |
| 62 | +&log("Using pattern variants <@PATTERN_VARIANTS>"); |
| 63 | + |
58 | 64 | # Define limits on each detector.
|
59 | 65 | my $ONE_MB_IN_KB = 1*1024; # ulimit -m and -v accept units of KB.
|
# NOTE(review): despite its name, this value is memoryLimit * 1024, i.e. KB if
# memoryLimit is given in MB (presumably -- TODO confirm against the query schema).
# It is -1 when the query sets no memory limit.
60 | 66 | my $memoryLimitInBytes = (defined $query->{memoryLimit}) ? int($query->{memoryLimit}) * $ONE_MB_IN_KB : -1;
|
61 | 67 |
|
# Shell fragments for the limits; each is the empty string when the matching
# query field is absent. timeLimit is passed to timeout(1) with an "s" suffix.
# NOTE(review): timeLimit is interpolated into the command text without
# validation; how these fragments are consumed is not visible in this chunk.
62 | 68 | my $limitTime = (defined $query->{timeLimit}) ? "timeout $query->{timeLimit}s" : "";
|
63 | 69 | my $ulimitMemory = (defined $query->{memoryLimit}) ? "ulimit -m $memoryLimitInBytes; ulimit -v $memoryLimitInBytes;" : "";
|
64 | 70 |
|
65 |
# Build the list of patterns to hand to the detectors: the original pattern plus
# whatever extra forms expandPatternSpaceForDetectors() derives from it. The new
# form of the call passes the selected variants by reference.
| -my @patternsToTry = &expandPatternSpaceForDetectors($query->{pattern}); |
| 71 | +my @patternsToTry = &expandPatternSpaceForDetectors($query->{pattern}, \@PATTERN_VARIANTS); |
66 | 72 |
|
67 | 73 | # This will contain N_DETECTORS * scalar(@patternsToTry) opinions.
|
68 | 74 | my @detectorOpinions;
|
@@ -216,36 +222,68 @@ sub writeToFile {
|
216 | 222 | return $args{file};
|
217 | 223 | }
|
218 | 224 |
|
# getPatternVariants: return the default list of pattern variants to try.
# Takes no arguments; returns the list ("leftanchor", "allCurlies").
# NOTE(review): the top-level code only filters this list against the query's
# "patternVariants", so a variant missing from these defaults (e.g. "bigCurlies")
# cannot be selected that way -- confirm whether that is intended.
| 225 | +sub getPatternVariants { |
| 226 | + # The defaults are aggressive: we assume any finding will be dynamically validated in the languages of interest. |
| 227 | + # "leftanchor" is the most conservative variant; "bigCurlies" is still pretty conservative, but it is not one of the defaults. |
| 228 | + return ("leftanchor", "allCurlies"); |
| 229 | +} |
| 230 | + |
219 | 231 | sub expandPatternSpaceForDetectors {
|
220 |
| - my ($pattern) = @_; |
| 232 | + my ($pattern, $patternVariantList) = @_; |
| 233 | + |
| 234 | + my %dedupVariants; |
| 235 | + for my $variant (@$patternVariantList) { |
| 236 | + $dedupVariants{$variant} = 1; |
| 237 | + } |
| 238 | + my @variants = keys %dedupVariants; |
221 | 239 |
|
222 | 240 | my @patternsToTry = ($pattern);
|
223 | 241 |
|
224 |
| - # If pattern is unanchored, a backtracking regex engine will run the loop: |
225 |
| - # for (1 .. n): |
226 |
| - # _match(regex, substr) |
227 |
| - # This means that if each match is linear-time, the worst-case behavior is quadratic. |
228 |
| - # For example, /a+$/ is quadratic in Node.js. |
229 |
| - # The detectors don't seem to acknowledge this loop. |
230 |
| - # We can simulate it by prefixing un-anchored regexes with '^(.*?)'. |
231 |
| - # This is also how a linear-time engine scans all starting indices in parallel; see Cox's writings. |
232 |
| - if (substr($query->{pattern}, 0, 1) ne "^") { |
233 |
| - my $anchoredPattern = "^(.*?)$query->{pattern}"; |
234 |
| - push @patternsToTry, $anchoredPattern; |
| 242 | + if (&listContains(\@variants, "leftanchor")) { |
| 243 | + &log("Variant: leftanchor"); |
| 244 | + # If pattern is unanchored, a backtracking regex engine will run the loop: |
| 245 | + # for (1 .. n): |
| 246 | + # _match(regex, substr) |
| 247 | + # This means that if each match is linear-time, the worst-case behavior is quadratic. |
| 248 | + # For example, /a+$/ is quadratic in Node.js. |
| 249 | + # The detectors don't seem to acknowledge this loop. |
| 250 | + # We can simulate it by prefixing un-anchored regexes with '^(.*?)'. |
| 251 | + # This is also how a linear-time engine scans all starting indices in parallel; see Cox's writings. |
| 252 | + if (substr($query->{pattern}, 0, 1) ne "^") { |
| 253 | + my $anchoredPattern = "^(.*?)$query->{pattern}"; |
| 254 | + push @patternsToTry, $anchoredPattern; |
| 255 | + } |
235 | 256 | }
|
236 | 257 |
|
237 |
| - # If pattern contains curlies "{\d*,\d*}", the detectors may time out due to graph expansion. |
238 |
| - # We can try a more general pattern with "*" and "+" instead. |
239 |
| - # The detectors might give false positives but that's OK, that's what the validate stage is for. |
240 |
| - # I'm not being careful about escaped curly braces, so let's hope there are no meta-regexes here. |
241 |
| - my $genericCurlies = $query->{pattern}; |
242 |
| - # {0, and {, both mean "0 or more" |
243 |
| - $genericCurlies =~ s/{0,\d*}/\*/g; |
244 |
| - $genericCurlies =~ s/{,\d*}/\*/g; |
245 |
| - # {[1-9] means "1 or more" |
246 |
| - $genericCurlies =~ s/{[1-9]\d*,\d*}/\+/g; |
247 |
| - if ($genericCurlies ne $pattern) { |
248 |
| - push @patternsToTry, $genericCurlies; |
| 258 | + if (&listContains(\@variants, "allCurlies") or &listContains(\@variants, "bigCurlies")) { |
| 259 | + # If pattern contains curlies "{\d*,\d*}", the detectors may time out due to graph expansion. |
| 260 | + # We can try a more general pattern with "*" and "+" instead. |
| 261 | + # The detectors might give false positives but that's OK, that's what the validate stage is for. |
| 262 | + # I'm not being careful about escaped curly braces, so let's hope there are no meta-regexes here. |
| 263 | + my $curlyThreshold; |
| 264 | + if (&listContains(\@variants, "allCurlies")) { |
| 265 | + &log("Variant: allCurlies"); |
| 266 | + $curlyThreshold = 0; |
| 267 | + } |
| 268 | + elsif (&listContains(\@variants, "bigCurlies")) { |
| 269 | + &log("Variant: bigCurlies"); |
| 270 | + $curlyThreshold = 100; # Probably overly generous, but false positives are Bad. |
| 271 | + } |
| 272 | + |
| 273 | + my $decurlied = $query->{pattern}; |
| 274 | + $decurlied =~ s/\{(\d+),(\d+)\}/$2 > $curlyThreshold ? ($1 > 0 ? "+" : "*") : "{$1,$2}"/ge; |
| 275 | + $decurlied =~ s/\{,(\d+)\}/$1 > $curlyThreshold ? "*" : "{,$1}"/ge; |
| 276 | + $decurlied =~ s/\{(\d+),\}/$1 > 0 ? "+" : "*"/ge; |
| 277 | + |
| 278 | + my $genericCurlies = $query->{pattern}; |
| 279 | + # {0, and {, both mean "0 or more" |
| 280 | + $genericCurlies =~ s/{0,\d*}/\*/g; |
| 281 | + $genericCurlies =~ s/{,\d*}/\*/g; |
| 282 | + # {[1-9] means "1 or more" |
| 283 | + $genericCurlies =~ s/{[1-9]\d*,\d*}/\+/g; |
| 284 | + if ($genericCurlies ne $pattern) { |
| 285 | + push @patternsToTry, $genericCurlies; |
| 286 | + } |
249 | 287 | }
|
250 | 288 |
|
251 | 289 | return @patternsToTry;
|
|
0 commit comments