Skip to content

Commit f66121d

Browse files
authored
feat: support Unicode properties added in ES2023 (#59)
1 parent 434a53b commit f66121d

13 files changed

+166
-82
lines changed

.github/workflows/cron.yml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Weekly job that regenerates the Unicode data files and fails when the
# regenerated output differs from what is committed.
name: cron

on:
  schedule:
    # Five-field cron expression: minute 0, hour 0, any day, any month, weekday 0.
    - cron: "0 0 * * 0"

jobs:
  check-resource-update:
    name: check-resource-update
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Install Node.js
        uses: actions/setup-node@v3
        with:
          node-version: 18

      - name: Install Packages
        run: npm install

      - name: Update
        run: npm run update:unicode

      # Stage everything, then let `--exit-code` turn any staged difference
      # against HEAD into a non-zero exit status for this step.
      - name: Check changes
        run: git add --all && git diff-index --cached HEAD --stat --exit-code

scripts/fetch-lines.ts

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/**
 * Download a text resource and yield it line by line.
 *
 * The whole body is read into memory before the first line is yielded.
 * Lines are split on `\n`, tolerating a preceding `\r`; a payload that ends
 * with a line break therefore yields a final empty string.
 *
 * @param url Address of the text resource to download.
 * @returns An async iterable over the lines of the response body.
 * @throws {Error} When the server answers with a non-2xx status, so that an
 * error page is never mistaken for data.
 */
export async function* fetchLines(url: string): AsyncIterable<string> {
    const response = await fetch(url)
    if (!response.ok) {
        throw new Error(
            `Failed to fetch ${url}: ${response.status} ${response.statusText}`,
        )
    }
    const body = await response.text()
    yield* body.split(/\r?\n/u)
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import { getLatestUnicodePropertyValues } from "./get-latest-unicode-property-values"
2+
3+
/**
 * Yield every alias of every property value whose property alias is `gc`,
 * in the order `getLatestUnicodePropertyValues()` produces them.
 */
export async function* getLatestUnicodeGeneralCategoryValues(): AsyncIterable<string> {
    for await (const {
        propertyAlias,
        aliases,
    } of getLatestUnicodePropertyValues()) {
        if (propertyAlias === "gc") {
            yield* aliases
        }
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import { fetchLines } from "./fetch-lines"
2+
3+
const DB_URL =
    "https://unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt"
const logger = console

/** One data row of the file at `DB_URL`. */
export type UnicodePropertyValue = {
    /** First field of the row (callers in this project filter on `gc` and `sc`). */
    propertyAlias: string
    /** All names in the row: the third field first, then the second, then any further fields. */
    aliases: string[]
    /** Third field of the row. */
    canonical: string
}

// Filled only after one complete pass over the downloaded file, so a consumer
// that stops iterating early never leaves a truncated cache behind.
let cache: UnicodePropertyValue[] | undefined = undefined

/**
 * Parse a single line of the data file.
 *
 * @returns The parsed row, or `undefined` for lines that carry no complete row
 * (blank lines, comment lines, rows with fewer than three fields).
 */
function parsePropertyValueLine(
    line: string,
): UnicodePropertyValue | undefined {
    const [data = ""] = line.split("#") // strip comments
    const fields = data.split(";").map((field) => field.trim())
    if (fields.length < 3) {
        // Without this guard `canonical` would be `undefined` and break the
        // declared type of the yielded value.
        return undefined
    }
    const [propertyAlias, alias, canonical, ...remaining] = fields
    return {
        propertyAlias,
        aliases: [canonical, alias, ...remaining],
        canonical,
    }
}

/**
 * Yield every property value row of the data file at `DB_URL`.
 *
 * The first complete iteration downloads and parses the file and stores the
 * result in a module-level cache; later calls replay the cache without
 * touching the network (and without logging a fetch).
 *
 * @returns An async iterable over the parsed rows, in file order.
 */
export async function* getLatestUnicodePropertyValues(): AsyncIterable<UnicodePropertyValue> {
    if (cache) {
        yield* cache
        return
    }

    logger.log("Fetching data... (%s)", DB_URL)
    const newCache: UnicodePropertyValue[] = []
    for await (const line of fetchLines(DB_URL)) {
        const value = parsePropertyValueLine(line)
        if (value === undefined) {
            continue
        }
        newCache.push(value)
        yield value
    }
    cache = newCache
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import { getLatestUnicodePropertyValues } from "./get-latest-unicode-property-values"
2+
3+
/**
 * Yield every alias of every property value whose property alias is `sc`,
 * in the order `getLatestUnicodePropertyValues()` produces them.
 */
export async function* getLatestUnicodeScriptValues(): AsyncIterable<string> {
    for await (const {
        propertyAlias,
        aliases,
    } of getLatestUnicodePropertyValues()) {
        if (propertyAlias === "sc") {
            yield* aliases
        }
    }
}

scripts/update-unicode-ids.ts

+4-31
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import fs from "fs"
2-
import http from "http"
32
import { ESLint } from "eslint"
3+
import { fetchLines } from "./fetch-lines"
44

5-
const DB_URL = "http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt"
5+
const DB_URL = "https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt"
66
const FILE_PATH = "src/unicode/ids.ts"
77
const ID_START = /^([0-9a-z]+)(?:\.\.([0-9a-z]+))?[^;]*; ID_Start /iu
88
const ID_CONTINUE = /^([0-9a-z]+)(?:\.\.([0-9a-z]+))?[^;]*; ID_Continue /iu
@@ -19,7 +19,7 @@ const main = async () => {
1919
const idContinueLarge: [number, number][] = []
2020

2121
logger.log("Fetching data... (%s)", DB_URL)
22-
await processEachLine((line) => {
22+
for await (const line of fetchLines(DB_URL)) {
2323
let m: RegExpExecArray | null = null
2424
if (banner === "") {
2525
logger.log("Processing data... (%s)", line.slice(2))
@@ -42,7 +42,7 @@ const main = async () => {
4242
idContinueLarge.push([min, max])
4343
}
4444
}
45-
})
45+
}
4646

4747
logger.log("Normalizing data...")
4848
normalizeRanges(idStartSmall)
@@ -127,33 +127,6 @@ main().catch((err) => {
127127
process.exitCode = 1
128128
})
129129

130-
function processEachLine(processLine: (line: string) => void): Promise<void> {
131-
return new Promise((resolve, reject) => {
132-
http.get(DB_URL, (res) => {
133-
let buffer = ""
134-
res.setEncoding("utf8")
135-
res.on("data", (chunk) => {
136-
const lines = (buffer + String(chunk)).split("\n")
137-
if (lines.length === 1) {
138-
buffer = lines[0]
139-
} else {
140-
buffer = lines.pop()!
141-
for (const line of lines) {
142-
processLine(line)
143-
}
144-
}
145-
})
146-
res.on("end", () => {
147-
if (buffer) {
148-
processLine(buffer)
149-
}
150-
resolve()
151-
})
152-
res.on("error", reject)
153-
}).on("error", reject)
154-
})
155-
}
156-
157130
function normalizeRanges(ranges: [number, number][]): void {
158131
for (let i = ranges.length - 1; i >= 1; --i) {
159132
const currRange = ranges[i]

scripts/update-unicode-properties.ts

+58-25
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ import fs from "fs"
22
import type { DOMWindow } from "jsdom"
33
import { JSDOM } from "jsdom"
44
import { ESLint } from "eslint"
5+
import { getLatestUnicodeGeneralCategoryValues } from "./get-latest-unicode-general-category-values"
6+
import { getLatestUnicodeScriptValues } from "./get-latest-unicode-script-values"
57

68
const DATA_SOURCES = [
79
{
@@ -43,8 +45,8 @@ const DATA_SOURCES = [
4345
url: "https://tc39.es/ecma262/multipage/text-processing.html",
4446
version: 2023,
4547
binProperties: "#table-binary-unicode-properties",
46-
gcValues: "#table-unicode-general-category-values",
47-
scValues: "#table-unicode-script-values",
48+
gcValues: getLatestUnicodeGeneralCategoryValues,
49+
scValues: getLatestUnicodeScriptValues,
4850
},
4951
]
5052
const FILE_PATH = "src/unicode/properties.ts"
@@ -96,13 +98,21 @@ type Datum = {
9698
} while (window == null)
9799

98100
logger.log("Parsing tables")
99-
datum.binProperties = collectValues(
101+
datum.binProperties = await collectValues(
100102
window,
101103
binProperties,
102104
existing.binProperties,
103105
)
104-
datum.gcValues = collectValues(window, gcValues, existing.gcValues)
105-
datum.scValues = collectValues(window, scValues, existing.scValues)
106+
datum.gcValues = await collectValues(
107+
window,
108+
gcValues,
109+
existing.gcValues,
110+
)
111+
datum.scValues = await collectValues(
112+
window,
113+
scValues,
114+
existing.scValues,
115+
)
106116

107117
logger.log("Done")
108118
}
@@ -169,32 +179,55 @@ export function isValidLoneUnicodeProperty(version: number, value: string): bool
169179
process.exitCode = 1
170180
})
171181

172-
function collectValues(
182+
async function collectValues(
173183
window: DOMWindow,
174-
id: string,
184+
idSelectorOrProvider: string | (() => AsyncIterable<string>),
175185
existingSet: Set<string>,
176-
): string[] {
177-
const selector = `${id} td:nth-child(1) code`
178-
const nodes = window.document.querySelectorAll(selector)
179-
const values = Array.from(nodes, (node) => node.textContent ?? "")
180-
.filter((value) => {
181-
if (existingSet.has(value)) {
182-
return false
183-
}
184-
existingSet.add(value)
185-
return true
186-
})
187-
.sort(undefined)
186+
): Promise<string[]> {
187+
const getValues =
188+
typeof idSelectorOrProvider === "function"
189+
? idSelectorOrProvider
190+
: function* (): Iterable<string> {
191+
const selector = `${idSelectorOrProvider} td:nth-child(1) code`
192+
const nodes = window.document.querySelectorAll(selector)
193+
if (nodes.length === 0) {
194+
throw new Error(`No nodes found for selector ${selector}`)
195+
}
196+
logger.log(
197+
"%o nodes of %o were found.",
198+
nodes.length,
199+
selector,
200+
)
201+
for (const node of Array.from(nodes)) {
202+
yield node.textContent ?? ""
203+
}
204+
}
205+
206+
const missing = new Set(existingSet)
207+
const values = new Set<string>()
208+
let allCount = 0
209+
210+
for await (const value of getValues()) {
211+
allCount++
212+
missing.delete(value)
213+
if (existingSet.has(value)) {
214+
continue
215+
}
216+
existingSet.add(value)
217+
values.add(value)
218+
}
219+
220+
if (missing.size > 0) {
221+
throw new Error(`Missing values: ${Array.from(missing).join(", ")}`)
222+
}
188223

189224
logger.log(
190-
"%o nodes of %o were found, then %o adopted and %o ignored as duplication.",
191-
nodes.length,
192-
selector,
193-
values.length,
194-
nodes.length - values.length,
225+
"%o adopted and %o ignored as duplication.",
226+
values.size,
227+
allCount - values.size,
195228
)
196229

197-
return values
230+
return [...values].sort((a, b) => (a > b ? 1 : a < b ? -1 : 0))
198231
}
199232

200233
function makeClassDeclarationCode(versions: string[]): string {

src/ecma-versions.ts

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ export type EcmaVersion =
88
| 2020
99
| 2021
1010
| 2022
11+
| 2023

src/parser.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class RegExpParserState {
4545

4646
public constructor(options?: RegExpParser.Options) {
4747
this.strict = Boolean(options?.strict)
48-
this.ecmaVersion = options?.ecmaVersion ?? 2022
48+
this.ecmaVersion = options?.ecmaVersion ?? 2023
4949
}
5050

5151
public get pattern(): Pattern {
@@ -519,12 +519,13 @@ export namespace RegExpParser {
519519
strict?: boolean
520520

521521
/**
522-
* ECMAScript version. Default is `2022`.
522+
* ECMAScript version. Default is `2023`.
523523
* - `2015` added `u` and `y` flags.
524524
* - `2018` added `s` flag, Named Capturing Group, Lookbehind Assertion,
525525
* and Unicode Property Escape.
526526
* - `2019`, `2020`, and `2021` added more valid Unicode Property Escapes.
527527
* - `2022` added `d` flag.
528+
* - `2023` added more valid Unicode Property Escapes.
528529
*/
529530
ecmaVersion?: EcmaVersion
530531
}

src/unicode/properties.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ const scValueSets = new DataSet(
9494
"Elym Elymaic Hmnp Nand Nandinagari Nyiakeng_Puachue_Hmong Wancho Wcho",
9595
"Chorasmian Chrs Diak Dives_Akuru Khitan_Small_Script Kits Yezi Yezidi",
9696
"Cpmn Cypro_Minoan Old_Uyghur Ougr Tangsa Tnsa Toto Vith Vithkuqi",
97-
"",
97+
"Hrkt Katakana_Or_Hiragana Kawi Nag_Mundari Nagm Unknown Zzzz",
9898
)
9999
const binPropertySets = new DataSet(
100100
"AHex ASCII ASCII_Hex_Digit Alpha Alphabetic Any Assigned Bidi_C Bidi_Control Bidi_M Bidi_Mirrored CI CWCF CWCM CWKCF CWL CWT CWU Case_Ignorable Cased Changes_When_Casefolded Changes_When_Casemapped Changes_When_Lowercased Changes_When_NFKC_Casefolded Changes_When_Titlecased Changes_When_Uppercased DI Dash Default_Ignorable_Code_Point Dep Deprecated Dia Diacritic Emoji Emoji_Component Emoji_Modifier Emoji_Modifier_Base Emoji_Presentation Ext Extender Gr_Base Gr_Ext Grapheme_Base Grapheme_Extend Hex Hex_Digit IDC IDS IDSB IDST IDS_Binary_Operator IDS_Trinary_Operator ID_Continue ID_Start Ideo Ideographic Join_C Join_Control LOE Logical_Order_Exception Lower Lowercase Math NChar Noncharacter_Code_Point Pat_Syn Pat_WS Pattern_Syntax Pattern_White_Space QMark Quotation_Mark RI Radical Regional_Indicator SD STerm Sentence_Terminal Soft_Dotted Term Terminal_Punctuation UIdeo Unified_Ideograph Upper Uppercase VS Variation_Selector White_Space XIDC XIDS XID_Continue XID_Start space",
@@ -119,7 +119,8 @@ export function isValidUnicodeProperty(
119119
(version >= 2019 && scValueSets.es2019.has(value)) ||
120120
(version >= 2020 && scValueSets.es2020.has(value)) ||
121121
(version >= 2021 && scValueSets.es2021.has(value)) ||
122-
(version >= 2022 && scValueSets.es2022.has(value))
122+
(version >= 2022 && scValueSets.es2022.has(value)) ||
123+
(version >= 2023 && scValueSets.es2023.has(value))
123124
)
124125
}
125126
return false

src/validator.ts

+3-2
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,13 @@ export namespace RegExpValidator {
127127
strict?: boolean
128128

129129
/**
130-
* ECMAScript version. Default is `2022`.
130+
* ECMAScript version. Default is `2023`.
131131
* - `2015` added `u` and `y` flags.
132132
* - `2018` added `s` flag, Named Capturing Group, Lookbehind Assertion,
133133
* and Unicode Property Escape.
134134
* - `2019`, `2020`, and `2021` added more valid Unicode Property Escapes.
135135
* - `2022` added `d` flag.
136+
* - `2023` added more valid Unicode Property Escapes.
136137
*/
137138
ecmaVersion?: EcmaVersion
138139

@@ -614,7 +615,7 @@ export class RegExpValidator {
614615
}
615616

616617
private get ecmaVersion() {
617-
return this._options.ecmaVersion ?? 2022
618+
return this._options.ecmaVersion ?? 2023
618619
}
619620

620621
private onLiteralEnter(start: number): void {

test/fixtures/parser/literal.ts

+2-10
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,13 @@
11
import fs from "fs"
22
import path from "path"
3+
import type { EcmaVersion } from "../../../src/ecma-versions"
34

45
type FixtureData = Record<
56
string,
67
{
78
options: {
89
strict: boolean
9-
ecmaVersion:
10-
| 5
11-
| 2015
12-
| 2016
13-
| 2017
14-
| 2018
15-
| 2019
16-
| 2020
17-
| 2021
18-
| 2022
10+
ecmaVersion: EcmaVersion
1911
}
2012
patterns: Record<
2113
string,

test/fixtures/visitor/index.ts

+2-10
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,13 @@
11
import fs from "fs"
22
import path from "path"
3+
import type { EcmaVersion } from "../../../src/ecma-versions"
34

45
type FixtureData = Record<
56
string,
67
{
78
options: {
89
strict?: boolean
9-
ecmaVersion?:
10-
| 5
11-
| 2015
12-
| 2016
13-
| 2017
14-
| 2018
15-
| 2019
16-
| 2020
17-
| 2021
18-
| 2022
10+
ecmaVersion?: EcmaVersion
1911
}
2012
patterns: Record<string, string[]>
2113
}

0 commit comments

Comments
 (0)