diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..f85be1bb --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,22 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "tsx", + "type": "node", + "request": "launch", + // Debug current file in VSCode + "program": "${file}", + "runtimeExecutable": "tsx", + "console": "integratedTerminal", + "internalConsoleOptions": "neverOpen", + // Files to exclude from debugger (e.g. call stack) + "skipFiles": [ + // Node.js internal core modules + "/**", + // Ignore all dependencies (optional) + "${workspaceFolder}/node_modules/**", + ] + } + ] +} diff --git a/bench/benchmark-operations.js b/bench/benchmark-operations.js index e1d61052..be59a6f5 100644 --- a/bench/benchmark-operations.js +++ b/bench/benchmark-operations.js @@ -2,14 +2,15 @@ import { createHash } from "node:crypto"; import { summary, group, bench, run, do_not_optimize } from "mitata"; -import { XMLParser } from "fast-xml-parser"; /** - * @module Case study whether to use URLSearchParams or manual string concat for simple search params. + * @module Case study whether to use URLSearchParams or manual string concat for simple search params and some other micro benchmarks to determine how we should do things. + * + * benchmarks marked with `.baseline(true)` are the ones that mark methods that we use in the code. 
*/ summary(() => { - group(() => { + group("building search params", () => { function buildSearchParamsURLSP( amzCredential, date, @@ -154,10 +155,10 @@ summary(() => { undefined, ); } - }); + }).baseline(true); }); - group(() => { + group("building search params v2", () => { const options = { prefix: "/", maxKeys: 100, @@ -186,20 +187,19 @@ summary(() => { let s = "list-type=2"; if (options.prefix) { - // biome-ignore lint/style/useTemplate: + // biome-ignore lint/style/useTemplate: this is what we're benchmarking s += "&prefix=" + encodeURIComponent(options.prefix); } if (options.startAfter) { - // biome-ignore lint/style/useTemplate: + // biome-ignore lint/style/useTemplate: this is what we're benchmarking s += "&start-after=" + encodeURIComponent(options.startAfter); } if (options.maxKeys) { - // biome-ignore lint/style/useTemplate: + // biome-ignore lint/style/useTemplate: this is what we're benchmarking s += "&max-keys=" + options.maxKeys; // no encoding needed, since it's a number } if (options.continuationToken) { s += - // biome-ignore lint/style/useTemplate: "&continuation-token=" + encodeURIComponent(options.continuationToken); } @@ -222,10 +222,12 @@ summary(() => { s += `&continuation-token=${encodeURIComponent(options.continuationToken)}`; } const _ = s; - }).gc("once"); + }) + .gc("once") + .baseline(true); }); - group(() => { + group("computing sha256 of parts", () => { function signUpdate(method, path, query, host) { return createHash("sha256") .update(method) @@ -267,8 +269,9 @@ summary(() => { "localhost:1337", ); } - }); - bench("update calls", () => { + }).baseline(true); + + bench("consecutive update calls", () => { for (let i = 0; i < 1000; ++i) { signUpdate( "GET", @@ -292,7 +295,7 @@ summary(() => { }); }); - group(() => { + group("joining strings", () => { const headers = [ ["host"].sort(), ["host", "x-amz-date"].sort(), @@ -319,6 +322,7 @@ summary(() => { return h.join(";"); } + /** @param {string[]} h */ function concat(h) { let res = 
h.length > 0 ? h[0] : ""; for (let i = 1; i < h.length; ++i) { @@ -329,18 +333,18 @@ summary(() => { bench("string concat join", () => { for (let i = 0; i < headers.length; ++i) { - const x = concat(headers[i]); + const _ = concat(headers[i]); } - }); + }).baseline(true); bench("array string join", () => { for (let i = 0; i < headers.length; ++i) { - const x = join(headers[i]); + const _ = join(headers[i]); } }); }); - group(() => { + group("substring vs check if string is empty before append", () => { // Which is faster, always adding a & and substring(1) or check if we need a preceeding & on every append? bench("substring", () => { @@ -358,9 +362,10 @@ summary(() => { a += "&uploadId=12323456432"; - const q = a.substring(1); + const _ = a.substring(1); } - }); + }).baseline(true); + bench("conditional", () => { for (let i = 0; i < 1000; ++i) { let a = ""; @@ -418,60 +423,31 @@ summary(() => { // What is faster, passing an empty object as a default or accepting undefined and use safe-navigation? // -> This is probably hard to benchmark and the results are pretty close -> we don't care - group(() => { - bench("allocation", () => { - for (let i = 0; i < 1000; ++i) { - do_not_optimize(fnWithDefaultParam()); - do_not_optimize(fnWithDefaultParam({ a: true })); - do_not_optimize(fnWithDefaultParam({ a: true, b: true })); - do_not_optimize(fnWithDefaultParam()); - do_not_optimize(fnWithDefaultParam()); - do_not_optimize(fnWithDefaultParam()); - } - }); - bench("conditional", () => { - for (let i = 0; i < 1000; ++i) { - do_not_optimize(fnWithOptionalParam()); - do_not_optimize(fnWithOptionalParam({ a: true })); - do_not_optimize(fnWithOptionalParam({ a: true, b: true })); - do_not_optimize(fnWithOptionalParam()); - do_not_optimize(fnWithOptionalParam()); - do_not_optimize(fnWithOptionalParam()); - } - }); - }); - - group(() => { - // Do we want to pass a buffer to our XML parser? 
Undici offers a buffer directly, which could - // improve throughput due to an encoding step getting skipped - - const s = `test-bucket583ea250-5016-48e5-8b26-b3ce0d9e5822/foo-key-9000tWA7cuzMIElE_sIi8weNVQJdxXnxZI9mhRT3hi9Xuaeqv4DjyteO64y_o4SuJP_E0Uf-D4Mzqeno7eWIakTtmlgabUjQ3uko2TE9Qv5BpztLPVqqJKEQnhulwkgLzcOs031000false"4715e35cf900ae14837e3c098e87d522"2025-06-20T13:58:01.000Z16291456"ce1b200f8c97447474929b722ed93b00"2025-06-20T13:58:02.000Z26291456"3bc3be0b850eacf461ec036374616058"2025-06-20T13:58:02.000Z31048576webfile75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06awebfile75aa57f09aa0c8caeab4f8c24e99d10f8e7faeebf76c078efc7c6caea54ba06aSTANDARD`; - const b = Buffer.from(s, "ascii"); - - // -> buffer and string perform basically the same - // maybe we should use a buffer via undici, because undici could skip the string decoding - - const xmlParser = new XMLParser({ - ignoreAttributes: true, - isArray: (_, jPath) => - jPath === "ListMultipartUploadsResult.Upload" || - jPath === "ListBucketResult.Contents" || - jPath === "ListPartsResult.Part" || - jPath === "DeleteResult.Deleted" || - jPath === "DeleteResult.Error", - }); - - bench("parse string with fxp", () => { - for (let i = 0; i < 10000; ++i) { - xmlParser.parse(s); - } - }); - bench("parse buffer with fxp", () => { - for (let i = 0; i < 10000; ++i) { - xmlParser.parse(b); - } - }); - }); + group( + "empty object as default param vs undefined param + safe navigation", + () => { + bench("allocation (param = {})", () => { + for (let i = 0; i < 1000; ++i) { + do_not_optimize(fnWithDefaultParam()); + do_not_optimize(fnWithDefaultParam({ a: true })); + do_not_optimize(fnWithDefaultParam({ a: true, b: true })); + do_not_optimize(fnWithDefaultParam()); + do_not_optimize(fnWithDefaultParam()); + do_not_optimize(fnWithDefaultParam()); + } + }).baseline(true); + bench("conditional (safe navigation, param?.value)", () => { + for (let i = 0; i < 1000; ++i) { + do_not_optimize(fnWithOptionalParam()); + 
do_not_optimize(fnWithOptionalParam({ a: true })); + do_not_optimize(fnWithOptionalParam({ a: true, b: true })); + do_not_optimize(fnWithOptionalParam()); + do_not_optimize(fnWithOptionalParam()); + do_not_optimize(fnWithOptionalParam()); + } + }); + }, + ); }); }); diff --git a/bench/xml.ts b/bench/xml.ts index 77a235fc..29e78903 100644 --- a/bench/xml.ts +++ b/bench/xml.ts @@ -3,7 +3,7 @@ import { summary, group, bench, run, barplot } from "mitata"; import { XMLParser } from "fast-xml-parser"; import * as s3mini from "./s3mini-xml.ts"; -// import { parseListPartsResult as runtimeGeneratedParser } from "../src/parsers.ts"; +import { parseListPartsResult as runtimeGeneratedParser } from "../src/parsers.ts"; summary(() => { barplot(() => { @@ -41,13 +41,11 @@ summary(() => { }); */ - /* bench("custom parser (runtime-generated)", () => { for (let i = 0; i < 10000; ++i) { runtimeGeneratedParser(s); } }).baseline(true); - */ bench("xml parser of s3mini", () => { for (let i = 0; i < 10000; ++i) { diff --git a/src/S3Client.ts b/src/S3Client.ts index acd42bef..c71062f9 100644 --- a/src/S3Client.ts +++ b/src/S3Client.ts @@ -4,7 +4,7 @@ import { XMLParser, XMLBuilder } from "fast-xml-parser"; import S3File from "./S3File.ts"; import S3Error from "./S3Error.ts"; -import S3BucketEntry from "./S3BucketEntry.ts"; +import type S3BucketEntry from "./S3BucketEntry.ts"; import KeyCache from "./KeyCache.ts"; import * as amzDate from "./AmzDate.ts"; import * as sign from "./sign.ts"; @@ -47,6 +47,17 @@ import { } from "./encode.ts"; import type { Readable } from "node:stream"; +import { + parseListPartsResult, + parseListBucketResult, + parseInitiateMultipartUploadResult, + parseListMultipartUploadsResult, + parseCompleteMultipartUploadResult, + parseDeleteResult, + parseGetBucketCorsResult, + parseCopyObjectResult, +} from "./parsers.ts"; + export const kWrite = Symbol("kWrite"); export const kStream = Symbol("kStream"); export const kSignedRequest = Symbol("kSignedRequest"); @@ 
-54,12 +65,6 @@ export const kGetEffectiveParams = Symbol("kGetEffectiveParams"); const xmlParser = new XMLParser({ ignoreAttributes: true, - isArray: (_, jPath) => - jPath === "ListMultipartUploadsResult.Upload" || - jPath === "ListBucketResult.Contents" || - jPath === "ListPartsResult.Part" || - jPath === "DeleteResult.Deleted" || - jPath === "DeleteResult.Error", }); const xmlBuilder = new XMLBuilder({ attributeNamePrefix: "$", @@ -362,20 +367,6 @@ export type ListPartsResult = { storageClass?: StorageClass; checksumAlgorithm?: ChecksumAlgorithm; checksumType?: ChecksumType; - - // TODO - // initiator: unknown; - // - // string - // string - // - - // TODO - // owner: unknown; - // - // string - // string - // }; export type ListObjectsResult = { @@ -713,16 +704,9 @@ export default class S3Client { } const text = await response.body.text(); - const res = ensureParsedXml(text).CopyObjectResult ?? {}; - - return { - etag: res.ETag, - lastModified: res.LastModified ? new Date(res.LastModified) : undefined, - checksumCRC32: res.ChecksumCRC32, - checksumCRC32C: res.ChecksumCRC32C, - checksumSHA1: res.ChecksumSHA1, - checksumSHA256: res.ChecksumSHA256, - }; + + // biome-ignore lint/suspicious/noExplicitAny: PoC + return parseCopyObjectResult(text) as any as CopyObjectResult; } //#region multipart uploads @@ -752,13 +736,10 @@ export default class S3Client { } const text = await response.body.text(); - const res = ensureParsedXml(text).InitiateMultipartUploadResult ?? {}; - return { - bucket: res.Bucket, - key: res.Key, - uploadId: res.UploadId, - }; + // biome-ignore lint/suspicious/noExplicitAny: PoC + return (parseInitiateMultipartUploadResult(text) as any) + .result as CreateMultipartUploadResult; } /** @@ -830,34 +811,9 @@ export default class S3Client { } const text = await response.body.text(); - const root = ensureParsedXml(text).ListMultipartUploadsResult ?? 
{}; - - return { - bucket: root.Bucket || undefined, - delimiter: root.Delimiter || undefined, - prefix: root.Prefix || undefined, - keyMarker: root.KeyMarker || undefined, - uploadIdMarker: root.UploadIdMarker || undefined, - nextKeyMarker: root.NextKeyMarker || undefined, - nextUploadIdMarker: root.NextUploadIdMarker || undefined, - maxUploads: root.MaxUploads ?? 1000, // not using || to not override 0; caution: minio supports 10000(!) - isTruncated: root.IsTruncated === "true", - uploads: - root.Upload?.map( - // biome-ignore lint/suspicious/noExplicitAny: we're parsing here - (u: any) => - ({ - key: u.Key || undefined, - uploadId: u.UploadId || undefined, - // TODO: Initiator - // TODO: Owner - storageClass: u.StorageClass || undefined, - checksumAlgorithm: u.ChecksumAlgorithm || undefined, - checksumType: u.ChecksumType || undefined, - initiated: u.Initiated ? new Date(u.Initiated) : undefined, - }) satisfies MultipartUpload, - ) ?? [], - }; + // biome-ignore lint/suspicious/noExplicitAny: PoC + return (parseListMultipartUploadsResult(text) as any) + .result as ListMultipartUploadsResult; } /** @@ -939,20 +895,9 @@ export default class S3Client { throw await getResponseError(response, path); } const text = await response.body.text(); - const res = ensureParsedXml(text).CompleteMultipartUploadResult ?? 
{}; - - return { - location: res.Location || undefined, - bucket: res.Bucket || undefined, - key: res.Key || undefined, - etag: res.ETag || undefined, - checksumCRC32: res.ChecksumCRC32 || undefined, - checksumCRC32C: res.ChecksumCRC32C || undefined, - checksumCRC64NVME: res.ChecksumCRC64NVME || undefined, - checksumSHA1: res.ChecksumSHA1 || undefined, - checksumSHA256: res.ChecksumSHA256 || undefined, - checksumType: res.ChecksumType || undefined, - }; + + // biome-ignore lint/suspicious/noExplicitAny: PoC + return (parseCompleteMultipartUploadResult(text) as any).result; } /** @@ -1065,26 +1010,8 @@ export default class S3Client { if (response.statusCode === 200) { const text = await response.body.text(); - const root = ensureParsedXml(text).ListPartsResult ?? {}; - return { - bucket: root.Bucket, - key: root.Key, - uploadId: root.UploadId, - partNumberMarker: root.PartNumberMarker ?? undefined, - nextPartNumberMarker: root.NextPartNumberMarker ?? undefined, - maxParts: root.MaxParts ?? 1000, - isTruncated: root.IsTruncated ?? false, - parts: - // biome-ignore lint/suspicious/noExplicitAny: parsing code - root.Part?.map((part: any) => ({ - etag: part.ETag, - lastModified: part.LastModified - ? new Date(part.LastModified) - : undefined, - partNumber: part.PartNumber ?? undefined, - size: part.Size ?? undefined, - })) ?? 
[], - }; + // biome-ignore lint/suspicious/noExplicitAny: POC + return (parseListPartsResult(text) as any).result; } throw await getResponseError(response, path); @@ -1328,15 +1255,20 @@ export default class S3Client { ); if (response.statusCode !== 200) { - // undici docs state that we should dump the body if not used - response.body.dump(); // dump's floating promise should not throw - throw fromStatusCode(response.statusCode, ""); + // garage returns 204 instead of 404 + // fix submitted here: https://git.deuxfleurs.fr/Deuxfleurs/garage/pulls/1096 + // This workaround should be removed as soon as garage fixed the compat issue + throw fromStatusCode( + response.statusCode === 204 ? 404 : response.statusCode, + "", + ); } - // const text = await response.body.text(); - // console.log(text) + const text = await response.body.text(); - throw new Error("Not implemented"); + // biome-ignore lint/suspicious/noExplicitAny: PoC + return (parseGetBucketCorsResult(text) as any) + .result as GetBucketCorsResult; } /** @@ -1481,25 +1413,15 @@ export default class S3Client { } const text = await response.body.text(); - - const res = ensureParsedXml(text).ListBucketResult ?? {}; - if (!res) { + try { + // biome-ignore lint/suspicious/noExplicitAny: PoC + return (parseListBucketResult(text) as any).result; + } catch (cause) { throw new S3Error("Unknown", "", { message: "Could not read bucket contents.", + cause, }); } - - return { - name: res.Name, - prefix: res.Prefix, - startAfter: res.StartAfter, - isTruncated: res.IsTruncated, - continuationToken: res.ContinuationToken, - maxKeys: res.MaxKeys, - keyCount: res.KeyCount, - nextContinuationToken: res.NextContinuationToken, - contents: res.Contents?.map(S3BucketEntry.parse) ?? 
[], - }; } //#endregion @@ -1541,11 +1463,12 @@ export default class S3Client { if (response.statusCode === 200) { const text = await response.body.text(); - // biome-ignore lint/suspicious/noExplicitAny: parsing + // biome-ignore lint/suspicious/noExplicitAny: PoC let deleteResult: any; try { // Quite mode omits all deleted elements, so it will be parsed as "", wich we need to coalasce to null/undefined - deleteResult = ensureParsedXml(text).DeleteResult ?? {}; + // biome-ignore lint/suspicious/noExplicitAny: PoC + deleteResult = (parseDeleteResult(text) as any).result; } catch (cause) { // Possible according to AWS docs throw new S3Error("Unknown", "", { @@ -1554,16 +1477,7 @@ export default class S3Client { }); } - const errors = - // biome-ignore lint/suspicious/noExplicitAny: parsing - deleteResult.Error?.map((e: any) => ({ - code: e.Code, - key: e.Key, - message: e.Message, - versionId: e.VersionId, - })) ?? []; - - return { errors }; + return { errors: deleteResult.errors }; } if (400 <= response.statusCode && response.statusCode < 500) { @@ -1943,22 +1857,3 @@ export function buildSearchParams( return res; } - -// biome-ignore lint/suspicious/noExplicitAny: parsing result is just unknown -function ensureParsedXml(text: string): any { - try { - const r = xmlParser.parse(text); - if (!r) { - throw new S3Error("Unknown", "", { - message: "S3 service responded with empty XML.", - }); - } - return r; - } catch (cause) { - // Possible according to AWS docs - throw new S3Error("Unknown", "", { - message: "S3 service responded with invalid XML.", - cause, - }); - } -} diff --git a/src/parsers.ts b/src/parsers.ts new file mode 100644 index 00000000..e9dc6c46 --- /dev/null +++ b/src/parsers.ts @@ -0,0 +1,395 @@ +import { buildParser } from "./xml-parser/generator.ts"; + +const initiator = { + type: "object", + tagName: "Initiator", + optional: true, + children: { + displayName: { type: "string", tagName: "DisplayName" }, + id: { type: "string", tagName: "ID" }, + }, 
+} as const; +const owner = { + type: "object", + tagName: "Owner", + optional: true, + children: { + displayName: { type: "string", tagName: "DisplayName" }, + id: { type: "string", tagName: "ID" }, + }, +} as const; + +const optionalChecksumBag = { + checksumCRC: { + type: "string", + tagName: "ChecksumCRC", + optional: true, + }, + checksumCRC32: { + type: "string", + tagName: "ChecksumCRC32", + optional: true, + }, + checksumCRC32C: { + type: "string", + tagName: "ChecksumCRC32C", + optional: true, + }, + checksumCRC64NVME: { + type: "string", + tagName: "ChecksumCRC64NVME", + optional: true, + }, + checksumSHA1: { + type: "string", + tagName: "ChecksumSHA1", + optional: true, + }, + checksumSHA256: { + type: "string", + tagName: "ChecksumSHA256", + optional: true, + }, + checksumType: { + type: "string", + tagName: "ChecksumType", + optional: true, + }, +} as const; + +export const listPartsResultSpec = { + type: "root", + children: { + result: { + type: "object", + tagName: "ListPartsResult", + children: { + bucket: { type: "string", tagName: "Bucket" }, + key: { type: "string", tagName: "Key" }, + uploadId: { type: "string", tagName: "UploadId" }, + storageClass: { type: "string", tagName: "StorageClass" }, + checksumAlgorithm: { + type: "string", + tagName: "ChecksumAlgorithm", + optional: true, + emptyIsAbsent: true, + }, + checksumType: { + type: "string", + tagName: "ChecksumType", + optional: true, + emptyIsAbsent: true, + }, + partNumberMarker: { + type: "integer", + tagName: "PartNumberMarker", + optional: true, + }, + nextPartNumberMarker: { + type: "integer", + tagName: "NextPartNumberMarker", + optional: true, + }, + maxParts: { type: "integer", tagName: "MaxParts" }, + isTruncated: { + type: "boolean", + tagName: "IsTruncated", + defaultValue: false, + }, + initiator, + owner, + parts: { + type: "array", + tagName: "Part", + defaultEmpty: true, + item: { + type: "object", + children: { + etag: { type: "string", tagName: "ETag" }, + lastModified: { 
type: "date", tagName: "LastModified" }, + partNumber: { type: "integer", tagName: "PartNumber" }, + size: { type: "integer", tagName: "Size" }, + }, + }, + }, + }, + }, + }, +} as const; + +export const parseListPartsResult = buildParser(listPartsResultSpec); + +export const parseListBucketResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "ListBucketResult", + children: { + name: { type: "string", tagName: "Name" }, + prefix: { type: "string", tagName: "Prefix" }, + startAfter: { type: "string", tagName: "StartAfter", optional: true }, + isTruncated: { + type: "boolean", + tagName: "IsTruncated", + defaultValue: false, + }, + continuationToken: { + type: "string", + tagName: "ContinuationToken", + optional: true, + }, + nextContinuationToken: { + type: "string", + tagName: "NextContinuationToken", + optional: true, + }, + maxKeys: { type: "integer", tagName: "MaxKeys", defaultValue: 1000 }, + keyCount: { type: "integer", tagName: "KeyCount" }, + contents: { + type: "array", + tagName: "Contents", + defaultEmpty: true, + item: { + type: "object", + children: { + //#region returned by ceph + rgwxTag: { type: "ignored", tagName: "RgwxTag", optional: true }, + type: { type: "ignored", tagName: "Type", optional: true }, + //#endregion + + key: { type: "string", tagName: "Key" }, + size: { type: "integer", tagName: "Size" }, + lastModified: { type: "date", tagName: "LastModified" }, + etag: { type: "string", tagName: "ETag" }, + storageClass: { type: "string", tagName: "StorageClass" }, + checksumAlgorithm: { + type: "string", + tagName: "ChecksumAlgorithm", + optional: true, + }, + checksumType: { + type: "string", + tagName: "ChecksumType", + optional: true, + }, + }, + }, + }, + }, + }, + }, +}); + +export const parseInitiateMultipartUploadResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "InitiateMultipartUploadResult", + children: { + bucket: { type: "string", tagName: "Bucket" 
}, + key: { type: "string", tagName: "Key" }, + uploadId: { type: "string", tagName: "UploadId" }, + }, + }, + }, +}); + +export const parseListMultipartUploadsResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "ListMultipartUploadsResult", + children: { + bucket: { type: "string", tagName: "Bucket" }, + keyMarker: { + type: "string", + tagName: "KeyMarker", + optional: true, + emptyIsAbsent: true, + }, + uploadIdMarker: { + type: "string", + tagName: "UploadIdMarker", + optional: true, + emptyIsAbsent: true, + }, + nextKeyMarker: { + type: "string", + tagName: "NextKeyMarker", + optional: true, + }, + prefix: { + type: "string", + tagName: "Prefix", + optional: true, + emptyIsAbsent: true, + }, + delimiter: { + type: "string", + tagName: "Delimiter", + optional: true, + emptyIsAbsent: true, + }, + nextUploadIdMarker: { + type: "string", + tagName: "NextUploadIdMarker", + optional: true, + emptyIsAbsent: true, + }, + maxUploads: { + type: "integer", + tagName: "MaxUploads", + defaultValue: 1000, + }, + isTruncated: { + type: "boolean", + tagName: "IsTruncated", + defaultValue: false, + }, + + uploads: { + type: "array", + tagName: "Upload", + defaultEmpty: true, + item: { + type: "object", + children: { + checksumAlgorithm: { + type: "string", + tagName: "ChecksumAlgorithm", + optional: true, + }, + checksumType: { + type: "string", + tagName: "ChecksumType", + optional: true, + }, + initiated: { + type: "date", + tagName: "Initiated", + }, + initiator, + owner, + storageClass: { type: "string", tagName: "StorageClass" }, + key: { type: "string", tagName: "Key" }, + uploadId: { type: "string", tagName: "UploadId" }, + }, + }, + }, + }, + }, + }, +}); + +export const parseCompleteMultipartUploadResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "CompleteMultipartUploadResult", + children: { + location: { type: "string", tagName: "Location" }, + bucket: { type: "string", tagName: 
"Bucket" }, + key: { type: "string", tagName: "Key" }, + etag: { type: "string", tagName: "ETag" }, + ...optionalChecksumBag, + }, + }, + }, +}); + +export const parseDeleteResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "DeleteResult", + children: { + errors: { + type: "array", + optional: true, + defaultEmpty: true, + item: { + type: "object", + children: { + code: { type: "string", tagName: "Code" }, + message: { type: "string", tagName: "Message" }, + key: { type: "string", tagName: "Key" }, + etag: { type: "string", tagName: "ETag" }, + versionId: { type: "string", tagName: "VersionId" }, + }, + }, + }, + }, + }, + }, +}); + +export const parseGetBucketCorsResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "CORSConfiguration", + children: { + rules: { + type: "array", + tagName: "CORSRule", + optional: true, + defaultEmpty: true, + item: { + type: "object", + children: { + allowedOrigins: { + type: "array", + tagName: "AllowedOrigin", + item: { type: "string" }, + }, + allowedMethods: { + type: "array", + tagName: "AllowedMethod", + item: { type: "string" }, + }, + allowedHeaders: { + type: "array", + tagName: "AllowedHeader", + optional: true, + item: { type: "string" }, + }, + exposeHeaders: { + type: "array", + tagName: "ExposeHeader", + optional: true, + item: { type: "string" }, + }, + maxAgeSeconds: { + type: "integer", + tagName: "MaxAgeSeconds", + optional: true, + }, + id: { type: "string", tagName: "ID", optional: true }, + }, + }, + }, + }, + }, + }, +}); + +export const parseCopyObjectResult = buildParser({ + type: "root", + children: { + result: { + type: "object", + tagName: "CopyObjectResult", + children: { + etag: { type: "string", tagName: "ETag" }, + lastModified: { type: "date", tagName: "LastModified" }, + ...optionalChecksumBag, + }, + }, + }, +}); diff --git a/src/xml-parser/README.md b/src/xml-parser/README.md new file mode 100644 index 
00000000..00336b5c --- /dev/null +++ b/src/xml-parser/README.md @@ -0,0 +1,17 @@ +# xml-parser +lean-s3 seems to re-invent the wheel by shipping its own XML parser. + +## Why? +`fast-xml-parser` is kind of slow and cannot take advantage of the schema we know beforehand. We use a custom parser that uses runtime-code-gen to create a parser that is tailored to a specific response. It's also possible to intentionally not implement stuff that is not needed in the context of S3. + +Not only will a specialized parser be faster, but it can also rename object fields and validate in the same step, rendering post-processing of the parsed output unnecessary. + +## Limitations +This parser only supports what's needed to parse S3 responses. It intentionally does not cover the entirety of xml. Namely: +- CDATA +- Attributes (they are skipped entirely) +- XXE +- XSD +- Comments +- Using `'` as quotes +- and probably much more diff --git a/src/xml-parser/cli-generator.ts b/src/xml-parser/cli-generator.ts new file mode 100644 index 00000000..ec92ce73 --- /dev/null +++ b/src/xml-parser/cli-generator.ts @@ -0,0 +1,24 @@ +/** + * @module Used to generate a static file containing a parser module for debugging purposes. 
+ * Invoke using: + * ```sh + * tsx ./cli-generator.ts > parse.js + * ``` + */ + +import { buildStaticParserSourceWithText } from "./generator.ts"; + +const source = buildStaticParserSourceWithText( + { + type: "root", + children: { + note: { + type: "object", + children: {}, + }, + }, + }, + ``, +); + +console.log(source); diff --git a/src/xml-parser/generator.ts b/src/xml-parser/generator.ts new file mode 100644 index 00000000..8ecf2669 --- /dev/null +++ b/src/xml-parser/generator.ts @@ -0,0 +1,391 @@ +import * as rt from "./runtime.ts"; + +function emitSpecParser( + spec: ParseSpec | RootSpec, + tagName: string, + globals: Map, +): string { + if (globals.has(spec)) { + return ""; + } + + switch (spec.type) { + case "ignored": + case "string": + case "integer": + case "boolean": + case "date": + return ""; // these are built-in + case "object": + return emitObjectParser(spec, tagName, globals); + case "array": + return emitSpecParser(spec.item, tagName, globals); + case "root": + return emitRootParser(spec, globals); + } +} +function emitParserCall( + spec: ParseSpec, + tagName: string, + globals: Map, +): string { + switch (spec.type) { + case "ignored": + return `this.parseIgnoredTag(${asLiteral(tagName)})`; + case "string": + return `(this.parseStringTag(${asLiteral(tagName)})${spec.emptyIsAbsent ? " || undefined" : ""})`; + case "integer": + return `this.parseIntegerTag(${asLiteral(tagName)})`; + case "boolean": + return `this.parseBooleanTag(${asLiteral(tagName)})`; + case "date": + return `this.parseDateTag(${asLiteral(tagName)})`; + case "object": + return `this.${globals.get(spec)}()`; + case "array": + return ""; // arrays handled differently + } +} + +function emitChildParsers( + spec: RootSpec | ObjectSpec, + globals: Map, +): string { + let code = ""; + for (const [childName, childSpec] of Object.entries(spec.children)) { + const childTagName = childSpec.tagName ?? 
childName; + code += emitSpecParser(childSpec, childTagName, globals); + } + return code; +} + +function emitChildFieldInit(children: Record>) { + return Object.entries(children) + .map( + ([n, childSpec]) => + `${n}: ${ + childSpec.type === "array" && childSpec.defaultEmpty + ? "[]" + : childSpec.type === "boolean" && + typeof childSpec.defaultValue === "boolean" && + !childSpec.optional + ? childSpec.defaultValue.toString() + : childSpec.type === "string" && + typeof childSpec.defaultValue === "string" && + !childSpec.optional + ? childSpec.defaultValue + : childSpec.type === "integer" && + typeof childSpec.defaultValue === "number" && + !childSpec.optional + ? childSpec.defaultValue.toString() + : childSpec.type === "date" && + childSpec.defaultValue instanceof Date && + !childSpec.optional + ? `new Date("${childSpec.defaultValue.toISOString()}")` + : "undefined" + },`, + ) + .join("\n\t\t"); +} + +function emitResultAssignment( + resultField: string, + spec: ParseSpec, + fieldName: string, + globals: Map, +) { + return spec.type === "array" + ? spec.defaultEmpty + ? `${resultField}.${fieldName}.push(${emitParserCall(spec.item, spec.tagName ?? fieldName, globals)})` + : `(${resultField}.${fieldName} ??= []).push(${emitParserCall(spec.item, spec.tagName ?? fieldName, globals)})` + : `${resultField}.${fieldName} = ${emitParserCall(spec, spec.tagName ?? 
fieldName, globals)}`; +} + +function emitObjectParser( + spec: ObjectSpec, + tagName: string, + globals: Map, +): string { + const parseFn = `fn_${globals.size}_${tagName}`; + globals.set(spec, parseFn); + + const { children } = spec; + + return ` +${emitChildParsers(spec, globals)} +${parseFn}() { + // Init structure entirely, so v8 can create a single hidden class + const res = { + ${emitChildFieldInit(children)} + }; + + this.parseIdentifier(${asLiteral(tagName)}); + + if (this.token === ${rt.TokenKind.endSelfClosing} /* TokenKind.endSelfClosing */) { + this.nextToken(); + ${emitObjectInvariants(children).join("\n\t\t\t\t")} + return res; + } + + this.parseExpected(${rt.TokenKind.endTag} /* TokenKind.endTag */); + + while (true) { + switch (this.token) { + case ${rt.TokenKind.startClosingTag} /* TokenKind.startClosingTag */: + this.nextToken(); // consume TokenKind.startClosingTag + + this.parseIdentifier(${asLiteral(tagName)}); + this.parseExpected(${rt.TokenKind.endTag} /* TokenKind.endTag */); + ${emitObjectInvariants(children).join("\n\t\t\t\t")} + return res; + case ${rt.TokenKind.eof}: + throw new Error(\`Unterminated tag: "${tagName}"\`); + ${ + Object.keys(children).length > 0 + ? ` + case ${rt.TokenKind.startTag}: { + this.nextToken(); // consume TokenKind.startTag + + switch (this.scanner.getTokenValueEncoded()) { + ${Object.entries(children) + .map( + ([name, childSpec]) => + `case ${asLiteral(childSpec.tagName ?? 
name)}: + ${emitResultAssignment("res", childSpec, name, globals)}; + break;`, + ) + .join("\n\t\t\t\t\t")} + default: + throw new Error(\`Unexpected tag identifier: \${this.scanner.getTokenValueEncoded()}\`); + } + break; + } + ` + : "" + } + default: + throw new Error(\`Unhandled token kind: \${this.token}\`); + } + } +} +`.trimStart(); +} + +function emitRootParser( + spec: RootSpec, + globals: Map, +): string { + const parseFn = `parse_${globals.size}`; + globals.set(spec, parseFn); + + const { children } = spec; + + return ` +${emitChildParsers(spec, globals)} +${parseFn}() { + // Init structure entirely, so v8 can create a single hidden class + const res = { + ${emitChildFieldInit(children)} + }; + + while (true) { + switch (this.token) { + case ${rt.TokenKind.eof} /* TokenKind.eof */: + ${emitObjectInvariants(children).join("\n\t\t\t\t")} + return res; + ${ + Object.keys(children).length > 0 + ? ` + case ${rt.TokenKind.startTag} /* TokenKind.startTag */: { + this.nextToken(); // consume TokenKind.startTag + + switch (this.scanner.getTokenValueEncoded()) { + ${Object.entries(children) + .map( + ([name, childSpec]) => + `case ${asLiteral(childSpec.tagName ?? name)}: + ${emitResultAssignment("res", childSpec, name, globals)}; + break;`, + ) + .join("\n\t\t\t\t\t")} + default: + throw new Error(\`Unexpected tag identifier: \${this.scanner.getTokenValueEncoded()}\`); + } + break; + } + ` + : "" + } + default: + throw new Error(\`Unhandled token kind: \${this.token}\`); + } + } +} +`.trimStart(); +} + +function emitObjectInvariants( + children: (ObjectSpec | RootSpec)["children"], +): string[] { + return Object.entries(children) + .map(([name, childSpec]) => + childSpec.optional || + (childSpec.type === "array" && childSpec.defaultEmpty) + ? undefined + : `if (res.${name} === undefined) throw new TypeError(\`Value for field "${name}" was required but not present (expected as tag name "${childSpec.tagName ?? 
name}").\`);`, + ) + .filter(s => s !== undefined); +} + +function asLiteral(value: string): string { + return `"${value}"`; // TODO: Escaping +} +function _asIdentifier(value: string): string { + return value; // TODO: Escaping +} + +type ParseSpec = + | ObjectSpec + | ArraySpec + | StringSpec + | IgnoredSpec + | BooleanSpec + | IntegerSpec + | DateSpec; + +type RootSpec = { + type: "root"; + // tagName: string; + optional?: boolean; + children: Record>; +}; +type ObjectSpec = { + type: "object"; + tagName?: string; + optional?: boolean; + children: Record>; +}; +type ArraySpec = { + type: "array"; + tagName?: string; + optional?: boolean; + defaultEmpty?: boolean; + item: ParseSpec; +}; +type IgnoredSpec = { + type: "ignored"; + tagName?: string; + optional: true; +}; +type StringSpec = { + type: "string"; + tagName?: string; + optional?: boolean; + defaultValue?: string; + emptyIsAbsent?: boolean; +}; +type BooleanSpec = { + type: "boolean"; + tagName?: string; + optional?: boolean; + defaultValue?: boolean; +}; +type IntegerSpec = { + type: "integer"; + tagName?: string; + optional?: boolean; + defaultValue?: number; +}; +type DateSpec = { + type: "date"; + tagName?: string; + optional?: boolean; + defaultValue?: Date; +}; + +/* +type ParsedRoot>, T extends RootSpec> = { + [k in keyof T["children"]]: ParsedType; +}; + +type ParsedObject>, T extends ObjectSpec> = { + [k in keyof T["children"]]: ParsedType; +}; + +type ParsedType> = T extends StringSpec + ? string + : T extends BooleanSpec + ? boolean + : T extends IntegerSpec + ? number + : T extends DateSpec + ? Date + : T extends ObjectSpec + ? 
ParsedObject> + : never; + +function buildParser>, V extends T>( + rootSpec: RootSpec, +): Parser>> { + throw new Error("Not implemented"); +} +*/ + +type Parser = (text: string) => T; + +export function buildStaticParserSource( + rootSpec: RootSpec, +): string { + const globals = new Map(); + const parsingCode = emitSpecParser(rootSpec, "", globals); + const rootParseFunctionName = globals.get(rootSpec); + globals.clear(); // make sure we clear all references (even though this map won't survive past this function) + + return ` +import * as rt from "./runtime.ts"; +class GeneratedParser extends rt.Parser { + ${parsingCode} +} +export default (text) => new GeneratedParser(text).${rootParseFunctionName}(); +`.trimStart(); +} + +export function buildStaticParserSourceWithText( + rootSpec: RootSpec, + text: string, +): string { + const globals = new Map(); + const parsingCode = emitSpecParser(rootSpec, "", globals); + const rootParseFunctionName = globals.get(rootSpec); + globals.clear(); // make sure we clear all references (even though this map won't survive past this function) + + return ` +import * as rt from "./runtime.ts"; +class GeneratedParser extends rt.Parser { + ${parsingCode} +} +new GeneratedParser(text).${rootParseFunctionName}(\`${text}\`); +`.trimStart(); +} + +export function buildParser( + rootSpec: RootSpec, +): Parser { + const globals = new Map(); + const parsingCode = emitSpecParser(rootSpec, "", globals); + const rootParseFunctionName = globals.get(rootSpec); + globals.clear(); // make sure we clear all references (even though this map won't survive past this function) + + return new Function( + "rt", + ` +return (() => { + +class GeneratedParser extends rt.Parser { + ${parsingCode} +} +return (text) => new GeneratedParser(text).${rootParseFunctionName}(); +})() +`.trim(), + )(rt) as Parser; +} diff --git a/src/xml-parser/parse.js b/src/xml-parser/parse.js new file mode 100644 index 00000000..74b29b86 --- /dev/null +++ b/src/xml-parser/parse.js 
@@ -0,0 +1,73 @@ +import * as rt from "./runtime.ts"; +class GeneratedParser extends rt.Parser { + fn_1_note() { + // Init structure entirely, so v8 can create a single hidden class + const res = {}; + + this.parseIdentifier("note"); + + if (this.token === 4 /* TokenKind.endSelfClosing */) { + this.nextToken(); + + return res; + } + + this.parseExpected(2 /* TokenKind.endTag */); + + while (true) { + switch (this.token) { + case 3 /* TokenKind.startClosingTag */: + this.nextToken(); // consume TokenKind.startClosingTag + + this.parseIdentifier("note"); + this.parseExpected(2 /* TokenKind.endTag */); + + return res; + case 0: + throw new Error(`Unterminated tag: "note"`); + + default: + throw new Error(`Unhandled token kind: ${this.token}`); + } + } + } + + parse_0() { + // Init structure entirely, so v8 can create a single hidden class + const res = { + note: undefined, + }; + + while (true) { + switch (this.token) { + case 0 /* TokenKind.eof */: + if (res.note === undefined) + throw new TypeError( + `Value for field "note" was required but not present (expected as tag name "note").`, + ); + return res; + + case 1 /* TokenKind.startTag */: { + this.nextToken(); // consume TokenKind.startTag + + switch (this.scanner.getTokenValueEncoded()) { + case "note": + res.note = this.fn_1_note(); + break; + default: + throw new Error( + `Unexpected tag identifier: ${this.scanner.getTokenValueEncoded()}`, + ); + } + break; + } + + default: + throw new Error(`Unhandled token kind: ${this.token}`); + } + } + } +} +new GeneratedParser(text).parse_0( + ``, +); diff --git a/src/xml-parser/parse.test.ts b/src/xml-parser/parse.test.ts new file mode 100644 index 00000000..b35069f3 --- /dev/null +++ b/src/xml-parser/parse.test.ts @@ -0,0 +1,212 @@ +import { describe, test } from "node:test"; +import { expect } from "expect"; +import { buildParser } from "./generator.ts"; + +describe("xml parsing", () => { + test("optional preamble", () => { + const parse = buildParser({ + type: "root", 
+ children: { + note: { + type: "object", + children: {}, + }, + }, + }); + + expect( + parse(``), + ).toStrictEqual({ note: {} }); + expect( + parse(``), + ).toStrictEqual({ note: {} }); + expect( + parse(` + + + `), + ).toStrictEqual({ note: {} }); + expect( + parse(` + + `), + ).toStrictEqual({ note: {} }); + expect(parse(``)).toStrictEqual({ note: {} }); + }); + + test("empty + self-closing tag", () => { + const parse = buildParser({ + type: "root", + children: { + note: { + type: "object", + children: {}, + }, + }, + }); + + expect(parse(``)).toStrictEqual({ note: {} }); + expect(parse(``)).toStrictEqual({ note: {} }); + expect(parse(``)).toStrictEqual({ note: {} }); + + const parse2 = buildParser({ + type: "root", + children: { + note: { + type: "string", + }, + }, + }); + expect(parse2(``)).toStrictEqual({ note: "" }); + expect(() => parse2(``)).toThrow( + new Error( + `Value for field "note" was required but not present (expected as tag name "note").`, + ), + ); + expect(() => parse2(``)).toThrow( + new Error( + `Value for field "note" was required but not present (expected as tag name "note").`, + ), + ); + }); + + describe("attributes", () => { + const parse = buildParser({ + type: "root", + children: { + user: { + type: "object", + children: {}, + }, + }, + }); + + test("skips attributes", () => { + const xml = ``; + expect(parse(xml)).toStrictEqual({ user: {} }); + }); + test("skipes quotes in attributes", () => { + const xml = ``; + expect(parse(xml)).toStrictEqual({ user: {} }); + }); + test("skipps apostrophe (') in attributes", () => { + const xml = ``; + expect(parse(xml)).toStrictEqual({ user: {} }); + }); + }); + + const noteSchema = { + type: "root", + children: { + note: { + type: "object", + children: { + to: { + type: "string", + }, + from: { + type: "string", + }, + }, + }, + }, + } as const; + + test("parses a simple XML string", () => { + const parse = buildParser(noteSchema); + const xml = `AliceBob`; + const doc = parse(xml); + + 
expect(doc).toStrictEqual({
+			note: {
+				from: "Bob",
+				to: "Alice",
+			},
+		});
+	});
+
+	test("handles malformed XML", () => {
+		const parse = buildParser(noteSchema);
+		const malformed = `<note><to>Alice</to><from>Bob</from>`; // missing closing </note>
+		expect(() => parse(malformed)).toThrow();
+	});
+
+	describe("entity decoding", () => {
+		const parse = buildParser({
+			type: "root",
+			children: {
+				code: {
+					type: "string",
+					tagName: "Code",
+				},
+			},
+		});
+
+		test("parses ampersand (&) correctly", () => {
+			const doc = parse(`<Code>Tom &amp; Jerry</Code>`);
+			expect(doc).toStrictEqual({
+				code: "Tom & Jerry",
+			});
+		});
+
+		test("parses less-than (<) and greater-than (>)", () => {
+			const doc = parse(`<Code>&lt;div&gt;Hello&lt;/div&gt;</Code>`);
+			expect(doc).toStrictEqual({
+				code: "<div>Hello</div>",
+			});
+		});
+
+		test("parses mixed escape characters", () => {
+			const doc = parse(
+				`<Code>&quot;Use &lt; and &gt; for tags,&quot; she said &amp; left.</Code>`,
+			);
+			expect(doc).toStrictEqual({
+				code: `"Use < and > for tags," she said & left.`,
+			});
+		});
+
+		test("raw quotes in texts", () => {
+			const doc = parse(`<Code>"etag-value"</Code>`);
+			expect(doc).toStrictEqual({
+				code: `"etag-value"`,
+			});
+		});
+
+		test("leading and trailing equals", () => {
+			expect(parse(`<Code>=equal</Code>`)).toStrictEqual({
+				code: `=equal`,
+			});
+			expect(parse(`<Code>equal=</Code>`)).toStrictEqual({
+				code: `equal=`,
+			});
+			expect(parse(`<Code>=equal=</Code>`)).toStrictEqual({
+				code: `=equal=`,
+			});
+		});
+
+		test("leading and trailing raw quotes", () => {
+			expect(parse(`<Code>=equal"</Code>`)).toStrictEqual({
+				code: `=equal"`,
+			});
+			expect(parse(`<Code>"equal=</Code>`)).toStrictEqual({
+				code: `"equal=`,
+			});
+			expect(parse(`<Code>"=equal="</Code>`)).toStrictEqual({
+				code: `"=equal="`,
+			});
+		});
+
+		test("leading and trailing slash", () => {
+			expect(parse(`<Code>/equal</Code>`)).toStrictEqual({
+				code: `/equal`,
+			});
+			expect(parse(`<Code>equal/</Code>`)).toStrictEqual({
+				code: `equal/`,
+			});
+			expect(parse(`<Code>/equal/</Code>`)).toStrictEqual({
+				code: `/equal/`,
+			});
+		});
+	});
+});
diff --git a/src/xml-parser/runtime.ts b/src/xml-parser/runtime.ts
new file mode 100644
index 00000000..76469f47
--- /dev/null
+++ b/src/xml-parser/runtime.ts
@@ -0,0 +1,443 @@
+/** biome-ignore-all lint/suspicious/noAssignInExpressions: ok here */
+
+export class Parser {
+	scanner: Scanner;
+	token!: TokenKind;
+
+	nextToken = () => {
+		this.token = this.scanner.scan();
+	};
+
+	constructor(text: string) {
+		this.scanner = new Scanner(text);
+		this.nextToken();
+	}
+
+	//#region primitives
+
+	/** Assumes {@link TokenKind.startTag} was already consumed.
*/ + parseIgnoredTag(tagName: string): void { + this.parseIdentifier(tagName); + + if (this.token === TokenKind.endSelfClosing) { + this.nextToken(); + return; + } + + this.parseExpected(TokenKind.endTag); + + if (this.token === TokenKind.startClosingTag) { + this.nextToken(); + this.parseIdentifier(tagName); + this.parseExpected(TokenKind.endTag); + return; + } + + if (this.token !== TokenKind.textNode) { + throw new Error(`Expected text content for tag "${tagName}".`); + } + + this.nextToken(); + this.parseClosingTag(tagName); + } + + /** Assumes {@link TokenKind.startTag} was already consumed. */ + parseStringTag(tagName: string): string | undefined { + this.parseIdentifier(tagName); + + if (this.token === TokenKind.endSelfClosing) { + this.nextToken(); + return undefined; + } + + this.parseExpected(TokenKind.endTag); + + if (this.token === TokenKind.startClosingTag) { + this.nextToken(); + this.parseIdentifier(tagName); + this.parseExpected(TokenKind.endTag); + return ""; + } + + if (this.token !== TokenKind.textNode) { + throw new Error(`Expected text content for tag "${tagName}".`); + } + + const value = this.scanner.getTokenValueDecoded(); + this.nextToken(); + + this.parseClosingTag(tagName); + return value; + } + + /** Assumes {@link TokenKind.startTag} was already consumed. */ + parseDateTag(tagName: string): Date | undefined { + const value = this.parseStringTag(tagName); + if (value === undefined) { + return undefined; + } + + const r = new Date(value); + if (Number.isNaN(r.getTime())) { + throw new Error(`Expected valid date time: "${value}"`); + } + return r; + } + + /** Assumes {@link TokenKind.startTag} was already consumed. 
*/ + parseIntegerTag(tagName: string): number | undefined { + const value = this.parseStringTag(tagName); + if (value === undefined) { + return undefined; + } + + const n = Number(value); + if (!Number.isInteger(n)) { + throw new Error(`Value is not an integer: "${value}"`); + } + return n; + } + + /** Assumes {@link TokenKind.startTag} was already consumed. */ + parseBooleanTag(tagName: string): boolean | undefined { + const value = this.parseStringTag(tagName); + return value === undefined + ? undefined + : value === "true" + ? true + : value === "false" + ? false + : undefined; + } + + //#endregion + + parseClosingTag(tagName: string): void { + this.parseExpected(TokenKind.startClosingTag); + this.parseIdentifier(tagName); + this.parseExpected(TokenKind.endTag); + } + + parseExpected(expected: TokenKind): void { + if (this.token !== expected) { + throw new Error(`Wrong token, expected: ${expected}, got: ${this.token}`); + } + this.nextToken(); + } + + parseIdentifier(identifier: string): void { + if (this.token !== TokenKind.identifier) { + throw new Error( + `Wrong token, expected: ${TokenKind.identifier}, got: ${this.token}`, + ); + } + if (this.scanner.getTokenValueEncoded() !== identifier) { + throw new Error( + `Expected identifier: ${identifier}, got: ${this.scanner.getTokenValueEncoded()}`, + ); + } + this.nextToken(); + } +} + +/** + * biome-ignore lint/suspicious/noConstEnum: Normally, we'd avoid using TS enums due to its incompability with JS. + * But we want to inline its values into the switch-cases and still have readable code. + * + * @remarks This enum cannot be used in runtime code, since it's `const` and will not exist in the parsing stage. 
Values have to be inlined by the generator
+ */
+export const enum TokenKind {
+	eof = 0,
+	startTag = 1, // <
+	endTag = 2, // >
+	startClosingTag = 3, // </
+	endSelfClosing = 4, // />
+	identifier = 5,
+	textNode = 6,
+}
+
+const entityPattern = /&(quot|apos|lt|gt|amp);/g;
+const entityMap = {
+	"&quot;": '"',
+	"&apos;": "'",
+	"&lt;": "<",
+	"&gt;": ">",
+	"&amp;": "&",
+} as const;
+
+/**
+ * biome-ignore lint/suspicious/noConstEnum: Normally, we'd avoid using TS enums due to its incompatibility with JS.
+ * But we want to inline its values into the switch-cases and still have readable code.
+ */
+const enum CharCode {
+	lessThan = 0x3c,
+	greaterThan = 0x3e,
+	slash = 0x2f,
+	equals = 0x3d,
+	doubleQuote = 0x22,
+	A = 0x41,
+	Z = 0x5a,
+	a = 0x61,
+	z = 0x7a,
+	_ = 0x5f,
+	_0 = 0x30,
+	_9 = 0x39,
+	tab = 0x09,
+	space = 0x20,
+	lineFeed = 0x0a,
+	carriageReturn = 0x0d,
+	verticalTab = 0x0b, // \v
+	formFeed = 0x0c, // \f
+	nonBreakingSpace = 0xa0, // &nbsp;
+	lineSeparator = 0x2028,
+	paragraphSeparator = 0x2029,
+	nextLine = 0x85,
+	exclamationMark = 0x21,
+	questionMark = 0x3f,
+	minus = 0x2d,
+}
+
+class Scanner {
+	startPos: number;
+	pos: number;
+	end: number;
+	text: string;
+
+	inTag = false;
+
+	token = -1;
+
+	tokenValueStart = -1;
+	tokenValueEnd = -1;
+
+	getTokenValueEncoded() {
+		return this.text.substring(this.tokenValueStart, this.tokenValueEnd);
+	}
+	getTokenValueDecoded() {
+		return this.getTokenValueEncoded().replace(
+			entityPattern,
+			m => entityMap[m as keyof typeof entityMap] ??
m, + ); + } + + constructor(text: string) { + // Number(text); // collapse rope structure of V8 + this.startPos = 0; + this.pos = 0; + this.end = text.length; + this.text = text; + } + + scan(): TokenKind { + this.startPos = this.pos; + + while (true) { + if (this.pos >= this.end) { + return (this.token = TokenKind.eof); + } + + const ch = this.text.charCodeAt(this.pos); + switch (ch) { + case CharCode.lineFeed: + case CharCode.carriageReturn: + case CharCode.lineSeparator: + case CharCode.paragraphSeparator: + case CharCode.nextLine: + case CharCode.tab: + case CharCode.verticalTab: + case CharCode.formFeed: + case CharCode.space: + case CharCode.nonBreakingSpace: + ++this.pos; + continue; + case CharCode.equals: { + if (this.inTag) { + // equals are skipped in the handler for the identifier + throw new Error( + "Equals cannot appear in a tag without a leading identifier.", + ); + } + + const textNode = this.#scanTextNode(); + if (textNode === undefined) { + continue; + } + return textNode; + } + case CharCode.lessThan: + ++this.pos; + + this.inTag = true; + + if (this.pos < this.end) { + switch (this.text.charCodeAt(this.pos)) { + case CharCode.slash: + ++this.pos; + return (this.token = TokenKind.startClosingTag); + case CharCode.questionMark: + this.inTag = false; + this.#skipPreamble(); + continue; + default: + break; + } + } + return (this.token = TokenKind.startTag); + case CharCode.greaterThan: + ++this.pos; + this.inTag = false; + return (this.token = TokenKind.endTag); + case CharCode.slash: + if (!this.inTag) { + const textNode = this.#scanTextNode(); + if (textNode === undefined) { + continue; + } + return textNode; + } + + ++this.pos; + if (this.pos < this.end) { + const nextChar = this.text.charCodeAt(this.pos); + if (nextChar === CharCode.greaterThan) { + ++this.pos; + return (this.token = TokenKind.endSelfClosing); + } + } + return (this.token = TokenKind.endTag); + + case CharCode.doubleQuote: { + if (this.inTag) { + // quotes are skipped in the 
handler for the identifier + throw new Error( + "Double quotes cannot appear in a tag without a leading equals.", + ); + } + const textNode = this.#scanTextNode(); + if (textNode === undefined) { + continue; + } + return textNode; + } + default: + if (!this.inTag) { + return this.#scanTextNode(); + } + + if (isIdentifierStart(ch)) { + // We actually don't care about attributes, just skip them entirely in this case + const token = this.#scanIdentifier(); + + if (this.text.charCodeAt(this.pos) === CharCode.equals) { + ++this.pos; // consume = + if (this.text.charCodeAt(this.pos) !== CharCode.doubleQuote) { + throw new Error("Equals must be followed by a quoted string."); + } + this.skipQuotedString(); + continue; + } + return token; + } + continue; + } + } + } + + #scanTextNode(): TokenKind.textNode { + // Read text node + let tokenValueStart = this.pos; + while (isWhitespace(this.text.charCodeAt(this.pos))) { + ++tokenValueStart; + } + + this.pos = this.text.indexOf("<", tokenValueStart + 1); + if (this.pos === -1) { + throw new Error("Unterminated text node."); + } + + let tokenValueEnd = this.pos; + do { + --tokenValueEnd; + } while (isWhitespace(this.text.charCodeAt(tokenValueEnd))); + ++tokenValueEnd; + + this.tokenValueStart = tokenValueStart; + this.tokenValueEnd = tokenValueEnd; + return (this.token = TokenKind.textNode); + } + + skipQuotedString() { + ++this.pos; // consume opening " + + this.pos = this.text.indexOf('"', this.pos); + if (this.pos === -1) { + throw new Error("Unterminated quote."); + } + + ++this.pos; // consume closing " + } + + #skipIdentifier(): void { + ++this.pos; // consume first char + while ( + this.pos < this.end && + isIdentifierPart(this.text.charCodeAt(this.pos)) + ) { + ++this.pos; + } + } + + #scanIdentifier(): TokenKind.identifier { + const identifierStart = this.pos; + this.#skipIdentifier(); + this.tokenValueStart = identifierStart; + this.tokenValueEnd = this.pos; + return (this.token = TokenKind.identifier); + } + + 
#skipPreamble(): void { + ++this.pos; // consume ? + + const closingIndex = this.text.indexOf(">", this.pos); + if (closingIndex === -1) { + throw new Error("Unterminated XML preamble."); + } + const questionMarkIndex = closingIndex - 1; + if (this.text.charCodeAt(questionMarkIndex) !== CharCode.questionMark) { + throw new Error("Unterminated XML preamble."); + } + + this.pos = closingIndex + 1; // consume > + } +} + +function isIdentifierStart(ch: number): boolean { + return ( + (ch >= CharCode.A && ch <= CharCode.Z) || + (ch >= CharCode.a && ch <= CharCode.z) || + ch === CharCode._ + ); +} + +function isIdentifierPart(ch: number): boolean { + return ( + (ch >= CharCode.A && ch <= CharCode.Z) || + (ch >= CharCode.a && ch <= CharCode.z) || + ch === CharCode._ || + (ch >= CharCode._0 && ch <= CharCode._9) + ); +} +function isWhitespace(ch: number): boolean { + return ( + ch === CharCode.space || + ch === CharCode.tab || + ch === CharCode.lineFeed || + ch === CharCode.carriageReturn || + ch === CharCode.verticalTab || + ch === CharCode.formFeed || + ch === CharCode.nonBreakingSpace || + ch === CharCode.lineSeparator || + ch === CharCode.paragraphSeparator || + ch === CharCode.nextLine + ); +}