Normative: add RegExp.escape (#3382)

ljharb · ljharb · commit 1867fff2a871 · 2024-08-28T14:13:03.000-07:00
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -10,7 +10,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
-        name: 'nvm install lts/* && npm ci'
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
         with:
           node-version: lts/*
           use-npm-ci: true
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -14,7 +14,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
-        name: 'nvm install lts/* && npm ci'
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
         with:
           node-version: lts/*
           use-npm-ci: true
diff --git a/.github/workflows/enforce-format.yml b/.github/workflows/enforce-format.yml
@@ -10,7 +10,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
-        name: 'nvm install lts/* && npm ci'
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
         with:
           node-version: lts/*
           use-npm-ci: true
diff --git a/.github/workflows/ipr.yml b/.github/workflows/ipr.yml
@@ -16,7 +16,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
-        name: 'nvm install lts/* && npm ci'
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
         with:
           node-version: lts/*
           use-npm-ci: true
diff --git a/.github/workflows/preview-build.yml b/.github/workflows/preview-build.yml
@@ -10,7 +10,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
-        name: 'nvm install lts/* && npm ci'
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
         with:
           node-version: lts/*
           use-npm-ci: true
diff --git a/.github/workflows/preview.yml b/.github/workflows/preview.yml
@@ -20,7 +20,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
-        name: 'nvm install lts/* && npm ci'
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
         with:
           node-version: lts/*
           use-npm-ci: true
diff --git a/.github/workflows/publish-biblio.yml b/.github/workflows/publish-biblio.yml
@@ -24,7 +24,7 @@ jobs:
           registry-url: 'https://registry.npmjs.org'
 
       - name: Install dependencies
-        run: npm ci
+        run: npm ci --no-audit
 
       - name: Publish biblio
         run: scripts/publish-biblio.sh
diff --git a/.github/workflows/spellcheck.yml b/.github/workflows/spellcheck.yml
@@ -0,0 +1,24 @@
+name: 'ecma-262'
+
+on: [pull_request]
+
+jobs:
+  spellcheck:
+    name: 'check for newly-introduced spelling errors'
+    runs-on: ubuntu-latest
+
+    steps:
+      - run: sudo apt-get install aspell # scripts/spellcheck.mjs shells out to aspell
+      - uses: actions/checkout@v3
+        with:
+          # Number of commits to fetch. 0 indicates all history for all branches and tags.
+          # Default: 1
+          fetch-depth: 0
+      - uses: ljharb/actions/node/install@d9f477827ed71a259056764107f74afc29febcae
+        name: 'nvm install lts/* && npm ci --no-audit'
+        env:
+          NPM_CONFIG_AUDIT: false
+        with:
+          node-version: lts/*
+          use-npm-ci: true
+      - run: node scripts/spellcheck.mjs origin/"${GITHUB_BASE_REF}" # this argument is read by the script as BASE_REF
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@ node_modules/
 out/
 vendor/esmeta
 test*.js
+aspell.txt
 
 # lockfiles we don't use are ignored
 npm-shrinkwrap.json
diff --git a/scripts/spellcheck.mjs b/scripts/spellcheck.mjs
@@ -0,0 +1,72 @@
+import { promisify } from 'node:util';
+import { exec } from 'node:child_process';
+const execP = promisify(exec);
+import { writeFile } from 'node:fs/promises';
+
+const MIN_WORD_SIZE = 3; // shorter words are dropped from the generated dictionary and from the reported misspellings
+
+const BASE_REF = process.argv[2]; // git ref whose copy of spec.html is the baseline for previously used words
+const ASPELL_OPTS = [ // flags shared by every aspell invocation in this script
+  '--add-html-check=alt,title,caption,variants',
+  '--ignore-case',
+  '--master=en_GB-ize',
+  '--mode=html',
+  '--run-together',
+  '--run-together-limit=99',
+  '--run-together-min=2',
+  'list',
+].join(' ');
+
+function makeDict(words) { // formats `words` as a `personal_ws-1.1 en <count>` header line followed by one word per line
+  return `personal_ws-1.1 en ${words.length}\n${words.join('\n')}`;
+}
+
+function lines(text) { // splits text on '\n'; the guard makes empty text yield [] instead of ['']
+  if (text.length === 0) return [];
+  return text.split('\n');
+}
+
+console.log(`base ref: ${BASE_REF}`);
+
+let { stdout } = await execP(`git show "${BASE_REF}":spec.html | aspell ${ASPELL_OPTS} | sort -fu`);
+
+let existingWords = lines(stdout.trim()); // words aspell flags in the base ref's copy of spec.html
+
+let existingComponents = Array.from(new Set( // distinct lowercased word parts of 3+ letters (word-initial lowercase run or capitalised run)
+  existingWords
+    .flatMap(word => [...word.matchAll(/(?:^[a-z]|[A-Z])[a-z]{2,}/g)])
+    .map(([w]) => w.toLowerCase())
+));
+
+({ stdout } = await execP(`echo ${existingComponents.map(w => JSON.stringify(w)).join(' ')} | aspell ${ASPELL_OPTS} | sort -fu`));
+
+let existingComponentsReduced = lines(stdout.trim()); // the parts aspell still flags when checked on their own
+
+await writeFile('aspell.txt', makeDict(existingComponentsReduced)); // seed the personal word list with those parts
+
+({ stdout } = await execP(`echo ${existingWords.map(w => JSON.stringify(w)).join(' ')} | aspell --personal=./aspell.txt ${ASPELL_OPTS}`));
+
+let novel = [...existingComponentsReduced, ...lines(stdout.trim())].filter(w => w.length >= MIN_WORD_SIZE); // flagged parts plus base words still flagged with those parts allowed
+novel.sort();
+console.log(`\npreviously used novel words: ${novel.join(', ')}`);
+await writeFile('aspell.txt', makeDict(novel)); // final word list of previously used words
+
+({ stdout } = await execP(`aspell --personal=./aspell.txt ${ASPELL_OPTS} list <spec.html | sort -u`));
+let misspellings = lines(stdout.trim()).filter(w => w.length >= MIN_WORD_SIZE); // words in the local spec.html that the word list does not cover
+
+if (misspellings.length > 0) {
+  console.log(`\nmisspellings: ${misspellings.join(', ')}`);
+  let pattern = misspellings.map(w => `-e ${JSON.stringify(w)}`).join(' --or ');
+  ({ stdout } = await execP(`git grep --line-number --column --fixed-strings --only-matching ${pattern} -- spec.html`));
+
+  console.log('');
+
+  let info = lines(stdout.trim());
+  for (let warning of info) {
+    let [match, file, line, col, typo] = warning.match(/^([^:]+):(\d+):(\d+):(.*)$/);
+    let title = 'Potential Typo';
+    let message = `${JSON.stringify(typo)} is not a previously used word or composed of previously used words. Perhaps it is a typo?`;
+    // https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message
+    console.log(`::warning file=${file},line=${line},endLine=${line},col=${col},endColumn=${col + typo.length},title=${title}::${message}`);
+  }
+}
diff --git a/spec.html b/spec.html
@@ -37827,6 +37827,63 @@ <h1>Properties of the RegExp Constructor</h1>
         <li>has the following properties:</li>
       </ul>
 
+      <emu-clause id="sec-regexp.escape">
+        <h1>RegExp.escape ( _S_ )</h1>
+        <p>This function returns a copy of _S_ in which characters that are potentially special in a regular expression |Pattern| have been replaced by equivalent escape sequences.</p>
+        <p>It performs the following steps when called:</p>
+
+        <emu-alg>
+          1. If _S_ is not a String, throw a *TypeError* exception.
+          1. Let _escaped_ be the empty String.
+          1. Let _cpList_ be StringToCodePoints(_S_).
+          1. For each code point _c_ of _cpList_, do
+            1. If _escaped_ is the empty String and _c_ is matched by either |DecimalDigit| or |AsciiLetter|, then
+              1. NOTE: Escaping a leading digit ensures that output corresponds with pattern text which may be used after a `\0` character escape or a |DecimalEscape| such as `\1` and still match _S_ rather than be interpreted as an extension of the preceding escape sequence. Escaping a leading ASCII letter does the same for the context after `\c`.
+              1. Let _numericValue_ be the numeric value of _c_.
+              1. Let _hex_ be Number::toString(𝔽(_numericValue_), 16).
+              1. Assert: The length of _hex_ is 2.
+              1. Set _escaped_ to the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS), *"x"*, and _hex_.
+            1. Else,
+              1. Set _escaped_ to the string-concatenation of _escaped_ and EncodeForRegExpEscape(_c_).
+          1. Return _escaped_.
+        </emu-alg>
+
+        <emu-note>
+          <p>Despite having similar names, EscapeRegExpPattern and `RegExp.escape` do not perform similar actions. The former escapes a pattern for representation as a string, while this function escapes a string for representation inside a pattern.</p>
+        </emu-note>
+
+        <emu-clause id="sec-encodeforregexpescape" type="abstract operation">
+          <h1>
+            EncodeForRegExpEscape (
+              _c_: a code point,
+            ): a String
+          </h1>
+          <dl class="header">
+            <dt>description</dt>
+            <dd>It returns a string representing a |Pattern| for matching _c_. If _c_ is white space or an ASCII punctuator, the returned value is an escape sequence. Otherwise, the returned value is a string representation of _c_ itself.</dd>
+          </dl>
+
+          <emu-alg>
+            1. If _c_ is matched by |SyntaxCharacter| or _c_ is U+002F (SOLIDUS), then
+              1. Return the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS) and UTF16EncodeCodePoint(_c_).
+            1. Else if _c_ is the code point listed in some cell of the “Code Point” column of <emu-xref href="#table-controlescape-code-point-values"></emu-xref>, then
+              1. Return the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS) and the string in the “ControlEscape” column of the row whose “Code Point” column contains _c_.
+            1. Let _otherPunctuators_ be the string-concatenation of *",-=&lt;>#&amp;!%:;@~'`"* and the code unit 0x0022 (QUOTATION MARK).
+            1. Let _toEscape_ be StringToCodePoints(_otherPunctuators_).
+            1. If _toEscape_ contains _c_, _c_ is matched by either |WhiteSpace| or |LineTerminator|, or _c_ has the same numeric value as a leading surrogate or trailing surrogate, then
+              1. If the numeric value of _c_ ≤ 0xFF, then
+                1. Let _hex_ be Number::toString(𝔽(the numeric value of _c_), 16).
+                1. Return the string-concatenation of the code unit 0x005C (REVERSE SOLIDUS), *"x"*, and StringPad(_hex_, 2, *"0"*, ~start~).
+              1. Let _escaped_ be the empty String.
+              1. Let _codeUnits_ be UTF16EncodeCodePoint(_c_).
+              1. For each code unit _cu_ of _codeUnits_, do
+                1. Set _escaped_ to the string-concatenation of _escaped_ and UnicodeEscape(_cu_).
+              1. Return _escaped_.
+            1. Return UTF16EncodeCodePoint(_c_).
+          </emu-alg>
+        </emu-clause>
+      </emu-clause>
+
       <emu-clause id="sec-regexp.prototype">
         <h1>RegExp.prototype</h1>
         <p>The initial value of `RegExp.prototype` is the RegExp prototype object.</p>