justinwilaby
diff --git a/‎.eslintrc.js
Lines changed: 14 additions & 16 deletions b/‎.eslintrc.js
Lines changed: 14 additions & 16 deletions
diff --git a/‎.github/workflows/ci.yaml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/ci.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎ENCODING_SPEC.md
Lines changed: 122 additions & 0 deletions b/‎ENCODING_SPEC.md
Lines changed: 122 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 20 additions & 28 deletions b/‎README.md
Lines changed: 20 additions & 28 deletions
diff --git a/‎jest.config.js
Lines changed: 5 additions & 7 deletions b/‎jest.config.js
Lines changed: 5 additions & 7 deletions
@@ -1,17 +1,15 @@
-module.exports = {
-    root: true,
-    parser: '@typescript-eslint/parser',
-    plugins: [
-        '@typescript-eslint'
-    ],
-    extends: [
-        'eslint:recommended',
-        'plugin:@typescript-eslint/recommended'
-    ],
-    rules: {
-        'no-bitwise': 'off',
-        '@typescript-eslint/no-this-alias': 'off',
-        '@typescript-eslint/no-explicit-any': 'off',
-        '@typescript-eslint/no-inferrable-types': 'warn'
-    }
+export const root = true;
+export const parser = '@typescript-eslint/parser';
+export const plugins = [
+  '@typescript-eslint'
+];
+export const extendsConfig = [
+  'eslint:recommended',
+  'plugin:@typescript-eslint/recommended'
+];
+export const rules = {
+  'no-bitwise': 'off',
+  '@typescript-eslint/no-this-alias': 'off',
+  '@typescript-eslint/no-explicit-any': 'off',
+  '@typescript-eslint/no-inferrable-types': 'warn'
 };
@@ -43,7 +43,7 @@ jobs:
         run: npm install
 
       - name: Build and test
-        run: npm run build && npm run test
+        run: npm run build && npm run coverage
 
       - name: Cache Node.js modules
         uses: actions/cache@v2
 
@@ -0,0 +1,122 @@
+# Custom Encoding Specification for FFI Boundary Crossing
+## Overview
+This document describes the custom encoding format used for serializing the Tag struct in Rust to a `Vec<u8>` for crossing FFI (Foreign Function Interface) boundaries. This encoding format ensures that the data can be efficiently transferred and reconstructed on the other side of the FFI boundary.
+
+## Tag Struct
+The Tag struct contains the following fields:
+
+- open_start: `[u32; 2]`
+- open_end: `[u32; 2]`
+- close_start: `[u32; 2]`
+- close_end: `[u32; 2]`
+- self_closing: `bool`
+- name: `Vec<u8>`
+- attributes: `Vec<Attribute>`
+- text_nodes: `Vec<Text>`
+### Encoding Format
+The encoding format is a binary representation of the Tag struct, with the following layout:
+
+1. ### Header (8 bytes):
+  - attributes_start: `u32` (4 bytes) - The starting byte offset of the attributes section.
+  - text_nodes_start: `u32` (4 bytes) - The starting byte offset of the text nodes section.
+
+2. ### Tag Data:
+  - open_start: `[u32; 2]` (8 bytes)
+  - open_end: `[u32; 2]` (8 bytes)
+  - close_start: `[u32; 2]` (8 bytes)
+  - close_end: `[u32; 2]` (8 bytes)
+  - self_closing: `u8` (1 byte)
+  - name_length: `u32` (4 bytes) - The length of the name field.
+  - name: `Vec<u8>` (variable length) - The UTF-8 encoded bytes of the tag name.
+
+3. ### Attributes Section:
+  - attributes_count: `u32` (4 bytes) - The number of attributes.
+  - For each attribute:
+      - attribute_length: `u32` (4 bytes) - The length of the encoded attribute.
+      - attribute_data: `Vec<u8>` (variable length) - The encoded attribute data.
+
+
+4. ### Text Nodes Section:
+  - text_nodes_count: `u32` (4 bytes) - The number of text nodes.
+  - For each text node:
+      - text_length: `u32` (4 bytes) - The length of the encoded text node.
+      - text_data: `Vec<u8>` (variable length) - The encoded text node data.
+
+## Encoding Process
+The encoding process involves serializing each field of the Tag struct into a `Vec<u8>` in the specified order. The following Rust code demonstrates the encoding process:
+```rust
+impl Encode<Vec<u8>> for Tag {
+    #[inline]
+    fn encode(&self) -> Vec<u8> {
+        let mut v = vec![0, 0, 0, 0, 0, 0, 0, 0];
+        let name_bytes = self.name.as_slice();
+
+        v.reserve(name_bytes.len() + 37);
+        // known byte length - 8 bytes per [u32; 2]
+        v.extend_from_slice(u32_to_u8(&self.open_start));
+        v.extend_from_slice(u32_to_u8(&self.open_end));
+
+        v.extend_from_slice(u32_to_u8(&self.close_start));
+        v.extend_from_slice(u32_to_u8(&self.close_end));
+        // bool - 1 byte
+        v.push(self.self_closing as u8);
+        // length of the name - 4 bytes
+        v.extend_from_slice(u32_to_u8(&[self.name.len() as u32]));
+        // name_bytes.len() bytes
+        v.extend_from_slice(name_bytes);
+
+        // write the starting location for the attributes at bytes 0..4
+        v.splice(0..4, u32_to_u8(&[v.len() as u32]).to_vec());
+        // write the number of attributes
+        v.extend_from_slice(u32_to_u8(&[self.attributes.len() as u32]));
+        // Encode and write the attributes
+        for a in &self.attributes {
+            let mut attr = a.encode();
+            let len = attr.len();
+            v.reserve(len + 4);
+            // write the length of this attribute
+            v.extend_from_slice(u32_to_u8(&[len as u32]));
+            v.append(&mut attr);
+        }
+
+        // write the starting location for the text node at bytes 4..8
+        v.splice(4..8, u32_to_u8(&[v.len() as u32]).to_vec());
+        // write the number of text nodes
+        v.extend_from_slice(u32_to_u8(&[self.text_nodes.len() as u32]));
+        // encode and write the text nodes
+        for t in &self.text_nodes {
+            let mut text = t.encode();
+            let len = text.len();
+            v.reserve(len + 4);
+            // write the length of this text node
+            v.extend_from_slice(u32_to_u8(&[len as u32]));
+            v.append(&mut text);
+        }
+        v
+    }
+}
+```
+## Decoding Process
+The decoding process involves reconstructing the Tag struct from the binary representation. The following steps outline the decoding process:
+
+1. Read the header to get the starting offsets for the attributes and text nodes sections.
+2. Read the tag data fields.
+3. Read the attributes section using the starting offset.
+4. Read the text nodes section using the starting offset.
+
+The decoding process should ensure that the data is read in the same order as it was written during encoding.
+
+## Why (serde-)wasm-bindgen Was Not Used
+While wasm-bindgen is a powerful tool that facilitates high-level interactions between Rust and JavaScript, it was not used in this project for the following performance-related reasons:
+
+1. **Lazy read of individual fields from a Uint8Array**: The decoding strategy does not require crossing the JS-wasm boundary each time a field is read (which is expensive), nor does it need to construct all field values at once on the JS side. Instead, the encoded data is a fixed structure on both the Rust and JS sides and resides in linear memory. The data for each field is at a known address within the Uint8Array and can be read lazily via getters. This means that if you receive a Tag from the parser and only need to read `tag.name`, the `name` is decoded at the time it is read while leaving all other fields encoded. Your CPU overhead is limited to decoding only the fields that are accessed and only at the time they are accessed.
+
+1. **Performance Overhead**: wasm-bindgen introduces significant overhead due to automatic type conversion and memory management. For performance-critical applications, this overhead can impact the overall efficiency of the system. By using a custom encoding format, we can minimize this overhead and achieve better performance.
+
+1. **Fine-Grained Control**: Custom encoding provides fine-grained control over the serialization and deserialization process. This allows for optimizations specific to the application's needs, such as minimizing the size of the encoded data and reducing the number of memory allocations.
+
+1. **Compactness**: The custom encoding format is designed to be compact, reducing memory usage and transmission time. This is particularly important for applications that need to transfer large amounts of data across the FFI boundary.
+
+1. **Avoiding Dependencies**: By not relying on wasm-bindgen, we avoid adding an additional dependency to the project. This can simplify the build process and reduce potential compatibility issues with other tools and libraries.
@@ -16,17 +16,16 @@ Suitable for [LSP](https://langserver.org/) implementations, sax-wasm provides l
 document for elements, attributes and text node which provides the raw building blocks for linting, transpilation and lexing.
 
 ## Benchmarks (Node v22.12.0 / 2.7 GHz Quad-Core Intel Core i7)
-All parsers are tested using a large XML document (1 MB) containing a variety of elements and is streamed when supported
-by the parser. This attempts to recreate the best real-world use case for parsing XML. Other libraries test benchmarks using a
-very small XML fragment such as `<foo bar="baz">quux</foo>` which does not hit all code branches responsible for processing the
-document and heavily skews the results in their favor.
-
-| Parser with Advanced Features                                                              | time/ms (lower is better) | JS     | Runs in browser |
-|--------------------------------------------------------------------------------------------|--------------------------:|:------:|:---------------:|
-| [sax-wasm](https://github.com/justinwilaby/sax-wasm)                                       |                    19.20 | ☑      | ☑               |
-| [sax-js](https://github.com/isaacs/sax-js)                                                 |                    64.23 | ☑      | ☑*              |
-| [ltx](https://github.com/xmppjs/ltx)                                                       |                    21.54 | ☑      | ☑               |
-| [node-xml](https://github.com/dylang/node-xml)                                             |                    87.06 | ☑      | ☐               |
+All parsers are tested using a large XML document (3 MB) containing a variety of elements. The document is streamed from memory to remove variations in disk access latency and to focus on benchmarking the parser alone. Other libraries run their benchmarks using a very small XML fragment such as `<foo bar="baz">quux</foo>`, which does not hit all code branches responsible for processing the document and heavily skews the results in their favor.
+
+| Parser with Advanced Features                                                              | time/ms (lower is better)| JS     | Runs in browser |
+|--------------------------------------------------------------------------------------------|-------------------------:|:------:|:---------------:|
+| [sax-wasm](https://github.com/justinwilaby/sax-wasm)                                       |                    0.466 | ☑      | ☑               |
+| [saxes](https://github.com/lddubeau/saxes)                                                 |                    0.868 | ☑      | ☑               |
+| [ltx(using Saxes as the parser)](https://github.com/xmppjs/ltx)                            |                    0.881 | ☑      | ☑               |
+| [node-xml](https://github.com/dylang/node-xml)                                             |                    1.549 | ☑      | ☐               |
+| [node-expat](https://github.com/xmppo/node-expat)                                          |                    1.551 | ☑      | ☐               |
+| [sax-js](https://github.com/isaacs/sax-js)                                                 |                    1.869 | ☑      | ☑*              |
 <sub>*built for node but *should* run in the browser</sub>
 
 ## Installation
@@ -40,25 +39,17 @@ import path from 'path';
 import { fileURLToPath } from 'url';
 import { SaxEventType, SAXParser } from 'sax-wasm';
 
-// Get the path to the WebAssembly binary and load it
-const __filename = fileURLToPath(import.meta.url);
-const __dirname = path.dirname(__filename);
-const saxPath = path.resolve(__dirname, 'node_modules/sax-wasm/lib/sax-wasm.wasm');
-const saxWasmBuffer = fs.readFileSync(saxPath);
+const wasmUrl = new URL(import.meta.resolve('sax-wasm/lib/sax-wasm.wasm'));
+const saxWasm = await readFile(wasmUrl);
+const parser = new SAXParser(SaxEventType.Cdata | SaxEventType.OpenTag);
 
-// Instantiate
-const parser = new SAXParser(SaxEventType.Attribute | SaxEventType.OpenTag);
-
-// Instantiate and prepare the wasm for parsing
-const ready = await parser.prepareWasm(saxWasmBuffer);
-if (ready) {
-  // stream from a file in the current directory
-  const readable = fs.createReadStream(path.resolve(__dirname, 'path/to/document.xml'), options);
+if (await parser.prepareWasm(saxWasm)) {
+  const xmlPath = import.meta.resolve('../src/xml.xml');
+  const readable = createReadStream(new URL(xmlPath));
   const webReadable = Readable.toWeb(readable);
-
   for await (const [event, detail] of parser.parse(webReadable.getReader())) {
-    if (event === SaxEventType.Attribute) {
-      // process attribute
+    if (event === SaxEventType.Cdata) {
+      // process Cdata
     } else {
       // process open tag
     }
@@ -75,7 +66,8 @@ under the hood to load the wasm.
 import { SaxEventType, SAXParser } from 'sax-wasm';
 
 // Fetch the WebAssembly binary
-const response = fetch('path/to/sax-wasm.wasm');
+const wasmUrl = new URL(import.meta.resolve('sax-wasm/lib/sax-wasm.wasm'));
+const response = fetch(wasmUrl);
 
 // Instantiate
 const parser = new SAXParser(SaxEventType.Attribute | SaxEventType.OpenTag);
 
@@ -1,7 +1,5 @@
-/** @type {import('ts-jest/dist/types').InitialOptionsTsJest} */
-module.exports = {
-    preset: 'ts-jest',
-    testEnvironment: 'node',
-    coverageProvider: 'v8',
-    collectCoverage: true
-};
+/** @type {import('ts-jest/dist/types').DefaultEsmPreset} */
+export const preset = 'ts-jest';
+export const testEnvironment = 'node';
+export const coverageProvider = 'v8';
+export const collectCoverage = true;