Skip to content

Conversation

@wrmedford
Copy link

  • Added native RE2.Set binding (iterable patterns, flags/anchor parsing, match/test/toString plus flags/sources/source/size/anchor props) and exported it from the addon.
  • Hoisted regex translation/escaping into shared helpers and reused across constructors.
  • Documented the new API in README, expanded type definitions, and added functional + perf-oriented tests (including TS coverage).

Closes #43

@uhop uhop self-assigned this Nov 23, 2025
@wrmedford
Copy link
Author

scenario patterns inputs avgLen matches RE2.Set (ms) RE2 list (ms) RegExp list (ms) notes
multi-small-50-hits 50 4000 19 4000 3.711 8.526 1.045 4k small strings, many hits
multi-small-50-nohits 50 4000 17 0 2.116 80.921 6.809 4k small strings, no hits
multi-small-200-hits 200 4000 20 4000 2.500 7.008 0.356 4k small strings, many hits
multi-small-200-nohits 200 4000 18 0 1.632 324.565 19.840 4k small strings, no hits
single-long-50-hits 50 1 1677 50 0.095 0.294 0.046 ~3 KB string, all tokens present
single-long-50-nohits 50 1 1674 0 0.012 0.246 0.038 ~3 KB string, no tokens
single-long-200-hits 200 1 6977 200 0.134 2.647 0.362 ~7 KB string, all tokens present
single-long-200-nohits 200 1 6974 0 0.033 3.906 0.469 ~8 KB string, no tokens

Ran a small bench across different implementations. This tends to perform best in situations where your patterns do not match.

@uhop
Copy link
Owner

uhop commented Nov 24, 2025

@wrmedford: Ran a small bench across different implementations.

Could you share code of your benchmark? I plan to add a benchmarking utility.

@wrmedford
Copy link
Author

@wrmedford: Ran a small bench across different implementations.

Could you share code of your benchmark? I plan to add a benchmarking utility.

'use strict';

const RE2 = require('./re2');

function makePatterns(n) {
  const arr = [];
  for (let i = 0; i < n; ++i) arr.push('token' + i + '(?:[a-z]+)?');
  return arr;
}

function makeInputs(patternCount, count, withHits) {
  const arr = [];
  for (let j = 0; j < count; ++j) {
    if (withHits) {
      arr.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' token' + (j % patternCount) + ' tail');
    } else {
      arr.push('xx' + (j % patternCount) + ' ' + (j & 7) + ' tok' + (j % patternCount) + ' tail');
    }
  }
  return arr;
}

function makeLongAllHits(patternCount) {
  const parts = [];
  for (let i = 0; i < patternCount; ++i) parts.push('some prefix ' + i + ' token' + i + ' suffix ' + (i & 7));
  return parts.join(' | ');
}

function makeLongNoHits(patternCount) {
  const parts = [];
  for (let i = 0; i < patternCount; ++i) parts.push('item' + i + ' nohit ' + (i & 7));
  return parts.join(' | ').repeat(2);
}

function measure(fn) {
  const start = process.hrtime.bigint();
  const result = fn();
  const ms = Number(process.hrtime.bigint() - start) / 1e6;
  return { timeMs: ms, result };
}

const configs = [
  { name: 'multi-small-50-hits', patterns: 50, inputs: makeInputs(50, 4000, true), mode: 'multi', desc: '4k small strings, many hits' },
  { name: 'multi-small-50-nohits', patterns: 50, inputs: makeInputs(50, 4000, false), mode: 'multi', desc: '4k small strings, no hits' },
  { name: 'multi-small-200-hits', patterns: 200, inputs: makeInputs(200, 4000, true), mode: 'multi', desc: '4k small strings, many hits' },
  { name: 'multi-small-200-nohits', patterns: 200, inputs: makeInputs(200, 4000, false), mode: 'multi', desc: '4k small strings, no hits' },
  { name: 'single-long-50-hits', patterns: 50, inputs: [makeLongAllHits(50)], mode: 'single', desc: '~3 KB string, all tokens present' },
  { name: 'single-long-50-nohits', patterns: 50, inputs: [makeLongNoHits(50)], mode: 'single', desc: '~3 KB string, no tokens' },
  { name: 'single-long-200-hits', patterns: 200, inputs: [makeLongAllHits(200)], mode: 'single', desc: '~7 KB string, all tokens present' },
  { name: 'single-long-200-nohits', patterns: 200, inputs: [makeLongNoHits(200)], mode: 'single', desc: '~8 KB string, no tokens' },
];

const results = [];

for (const cfg of configs) {
  const patterns = makePatterns(cfg.patterns);
  const set = new RE2.Set(patterns);
  const re2List = patterns.map((p) => new RE2(p));
  const jsList = patterns.map((p) => new RegExp(p));

  const avgLen = cfg.inputs.reduce((n, s) => n + s.length, 0) / cfg.inputs.length;

  if (cfg.mode === 'multi') {
    const setRes = measure(() => {
      let m = 0;
      for (const s of cfg.inputs) m += set.test(s) ? 1 : 0;
      return m;
    });
    const re2Res = measure(() => {
      let m = 0;
      for (const s of cfg.inputs) {
        for (const re of re2List) { if (re.test(s)) { ++m; break; } }
      }
      return m;
    });
    const jsRes = measure(() => {
      let m = 0;
      for (const s of cfg.inputs) {
        for (const re of jsList) { if (re.test(s)) { ++m; break; } }
      }
      return m;
    });
    results.push({ ...cfg, avgLen, inputsCount: cfg.inputs.length, set: setRes.timeMs, re2: re2Res.timeMs, js: jsRes.timeMs, matches: setRes.result });
  } else {
    const setRes = measure(() => set.match(cfg.inputs[0]).length);
    const re2Res = measure(() => re2List.reduce((n, re) => n + (re.test(cfg.inputs[0]) ? 1 : 0), 0));
    const jsRes = measure(() => jsList.reduce((n, re) => n + (re.test(cfg.inputs[0]) ? 1 : 0), 0));
    results.push({ ...cfg, avgLen, inputsCount: cfg.inputs.length, set: setRes.timeMs, re2: re2Res.timeMs, js: jsRes.timeMs, matches: setRes.result });
  }
}

console.table(results.map(r => ({
  scenario: r.name,
  patterns: r.patterns,
  inputs: r.inputsCount,
  avgInputLen: Math.round(r.avgLen),
  matches: r.matches,
  setMs: r.set.toFixed(3),
  re2Ms: r.re2.toFixed(3),
  jsMs: r.js.toFixed(3),
  desc: r.desc,
})));

Let me know if you hit any issues.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Exposing RE2::Set

2 participants