26 changes: 26 additions & 0 deletions README.md
@@ -40,6 +40,8 @@ Alternatively, you can use it via a CDN as follows:

## Usage

### Basic Usage

```javascript
import { Tokenizer } from "@huggingface/tokenizers";

@@ -57,6 +59,30 @@ const encoded = tokenizer.encode("Hello World"); // { ids: [9906, 4435], tokens:
const decoded = tokenizer.decode(encoded.ids); // 'Hello World'
```
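
The object returned by `encode` is typed as `Encoding`. A minimal sketch of its shape, assuming only the fields exercised by this PR's tests (`ids`, `tokens`, `attention_mask`):

```typescript
import type { Encoding } from "@huggingface/tokenizers";

// Parallel arrays: one id / token / mask entry per position.
const enc: Encoding = {
  ids: [1, 2, 3],
  tokens: ["hello", "world", "!"],
  attention_mask: [1, 1, 1],
};
```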

### Advanced Usage - Component Exports

You can also import individual tokenizer components from dedicated subpath exports, similar to the Python `tokenizers` library:

```typescript
import { Tokenizer } from "@huggingface/tokenizers";
import type { Encoding } from "@huggingface/tokenizers";
import { MetaspacePreTokenizer, Whitespace } from "@huggingface/tokenizers/pre-tokenizers";
import { BPE } from "@huggingface/tokenizers/models";
import { Lowercase, StripAccents } from "@huggingface/tokenizers/normalizers";
import { BPEDecoder } from "@huggingface/tokenizers/decoders";
import { TemplateProcessing } from "@huggingface/tokenizers/post-processors";

// Use these components to build custom tokenizers or handle specific use cases
const metaspace = new MetaspacePreTokenizer({
  type: "Metaspace",
  replacement: "▁",
  add_prefix_space: true,
});

// Type your encoding results (`tokenizer` is an instance built as in Basic Usage above)
const encoded: Encoding = tokenizer.encode("Hello World");
```
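
The standalone components can also be run directly. A short sketch based on the `pre_tokenize_text` behavior covered by this PR's tests (expected outputs shown in comments):

```typescript
import {
  MetaspacePreTokenizer,
  Whitespace,
} from "@huggingface/tokenizers/pre-tokenizers";

// Metaspace swaps spaces for the replacement character and, with
// add_prefix_space, prepends one to the text.
const metaspace = new MetaspacePreTokenizer({
  type: "Metaspace",
  replacement: "▁",
  add_prefix_space: true,
});
metaspace.pre_tokenize_text("hello world"); // ["▁hello▁world"]

// Whitespace splits on word and punctuation boundaries.
const whitespace = new Whitespace();
whitespace.pre_tokenize_text("hello world!"); // ["hello", "world", "!"]
```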


## Requirements

This library expects two files from Hugging Face models:
1 change: 1 addition & 0 deletions scripts/build.mjs
@@ -49,3 +49,4 @@ await build("dist/tokenizers.mjs");
await build("dist/tokenizers.cjs");
await build("dist/tokenizers.min.mjs");
await build("dist/tokenizers.min.cjs");

11 changes: 11 additions & 0 deletions src/decoders.ts
@@ -0,0 +1,11 @@
export { default as Decoder } from "./core/Decoder";
export { default as ByteLevelDecoder } from "./core/decoder/ByteLevelDecoder";
export { default as ReplaceDecoder } from "./core/decoder/ReplaceDecoder";
export { default as WordPieceDecoder } from "./core/decoder/WordPieceDecoder";
export { default as ByteFallback } from "./core/decoder/ByteFallback";
export { default as FuseDecoder } from "./core/decoder/FuseDecoder";
export { default as StripDecoder } from "./core/decoder/StripDecoder";
export { default as MetaspaceDecoder } from "./core/decoder/MetaspaceDecoder";
export { default as BPEDecoder } from "./core/decoder/BPEDecoder";
export { default as CTCDecoder } from "./core/decoder/CTCDecoder";
export { default as DecoderSequence } from "./core/decoder/DecoderSequence";
60 changes: 60 additions & 0 deletions src/index.ts
@@ -1 +1,61 @@
export { default as Tokenizer } from "./core/Tokenizer";
export type { Encoding } from "./static/types";

// Decoders
export {
Decoder,
ByteLevelDecoder,
ReplaceDecoder,
WordPieceDecoder,
ByteFallback,
FuseDecoder,
StripDecoder,
MetaspaceDecoder,
BPEDecoder,
CTCDecoder,
DecoderSequence,
} from "./decoders";

// Models
export { Model, BPE, Unigram, WordPiece } from "./models";

// Normalizers
export {
Normalizer,
BertNormalizer,
NFD,
NFKD,
NFC,
NFKC,
NormalizerSequence,
Lowercase,
Prepend,
Strip,
StripAccents,
Precompiled,
Replace,
} from "./normalizers";

// Pre-tokenizers
export {
PreTokenizer,
BertPreTokenizer,
ByteLevelPreTokenizer,
Digits,
MetaspacePreTokenizer,
Punctuation,
PreTokenizerSequence,
Split,
Whitespace,
WhitespaceSplit,
} from "./pre-tokenizers";

// Post-processors
export {
PostProcessor,
BertProcessing,
ByteLevelPostProcessor,
RobertaProcessing,
PostProcessorSequence,
TemplateProcessing,
} from "./post-processors";
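
Since `src/index.ts` re-exports every component group, everything is also reachable from the package root; a sketch mirroring the imports in `tests/package-exports.test.ts` below:

```typescript
import {
  Tokenizer,
  MetaspacePreTokenizer,
  Whitespace,
  BPE,
  Lowercase,
  StripAccents,
  BPEDecoder,
  TemplateProcessing,
} from "@huggingface/tokenizers";
import type { Encoding } from "@huggingface/tokenizers";
```
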
4 changes: 4 additions & 0 deletions src/models.ts
@@ -0,0 +1,4 @@
export { default as Model } from "./core/TokenizerModel";
export { default as BPE } from "./core/tokenizerModelImplementations/BPE";
export { default as Unigram } from "./core/tokenizerModelImplementations/Unigram";
export { default as WordPiece } from "./core/tokenizerModelImplementations/WordPieceTokenizer";
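
A construction sketch for the `BPE` export, reusing the toy `vocab`/`merges` from `tests/exports.test.ts` (illustrative values only; real models typically load these from a `tokenizer.json`):

```typescript
import { BPE } from "@huggingface/tokenizers/models";

// Toy vocabulary and a single merge rule, as in this PR's tests.
const bpe = new BPE({
  type: "BPE",
  vocab: { a: 0, b: 1, c: 2 },
  merges: [["a", "b"]],
  unk_token: "<unk>",
  ignore_merges: false,
});
```
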
13 changes: 13 additions & 0 deletions src/normalizers.ts
@@ -0,0 +1,13 @@
export { default as Normalizer } from "./core/Normalizer";
export { default as BertNormalizer } from "./core/normalizer/BertNormalizer";
export { default as NFD } from "./core/normalizer/NFD";
export { default as NFKD } from "./core/normalizer/NFKD";
export { default as NFC } from "./core/normalizer/NFC";
export { default as NFKC } from "./core/normalizer/NFKC";
export { default as NormalizerSequence } from "./core/normalizer/NormalizerSequence";
export { default as Lowercase } from "./core/normalizer/Lowercase";
export { default as Prepend } from "./core/normalizer/Prepend";
export { default as Strip } from "./core/normalizer/StripNormalizer";
export { default as StripAccents } from "./core/normalizer/StripAccents";
export { default as Precompiled } from "./core/normalizer/Precompiled";
export { default as Replace } from "./core/normalizer/Replace";
6 changes: 6 additions & 0 deletions src/post-processors.ts
@@ -0,0 +1,6 @@
export { default as PostProcessor } from "./core/PostProcessor";
export { default as BertProcessing } from "./core/postProcessor/BertProcessing";
export { default as ByteLevelPostProcessor } from "./core/postProcessor/ByteLevelPostProcessor";
export { default as RobertaProcessing } from "./core/postProcessor/RobertaProcessing";
export { default as PostProcessorSequence } from "./core/postProcessor/PostProcessorSequence";
export { default as TemplateProcessing } from "./core/postProcessor/TemplateProcessing";
10 changes: 10 additions & 0 deletions src/pre-tokenizers.ts
@@ -0,0 +1,10 @@
export { default as PreTokenizer } from "./core/PreTokenizer";
export { default as BertPreTokenizer } from "./core/preTokenizer/BertPreTokenizer";
export { default as ByteLevelPreTokenizer } from "./core/preTokenizer/ByteLevelPreTokenizer";
export { default as Digits } from "./core/preTokenizer/DigitsPreTokenizer";
export { default as MetaspacePreTokenizer } from "./core/preTokenizer/MetaspacePreTokenizer";
export { default as Punctuation } from "./core/preTokenizer/PunctuationPreTokenizer";
export { default as PreTokenizerSequence } from "./core/preTokenizer/PreTokenizerSequence";
export { default as Split } from "./core/preTokenizer/SplitPreTokenizer";
export { default as Whitespace } from "./core/preTokenizer/WhitespacePreTokenizer";
export { default as WhitespaceSplit } from "./core/preTokenizer/WhitespaceSplit";
89 changes: 89 additions & 0 deletions tests/exports.test.ts
@@ -0,0 +1,89 @@
import { Tokenizer } from "../src";
import type { Encoding } from "../src";
import { MetaspacePreTokenizer, Whitespace } from "../src";
import { BPE } from "../src";

describe("Additional exports", () => {
describe("Main exports", () => {
it("should export Tokenizer", () => {
expect(Tokenizer).toBeDefined();
});

it("should export Encoding type (compile-time test)", () => {
// This test verifies that the Encoding type can be used
const encoding: Encoding = {
ids: [1, 2, 3],
tokens: ["hello", "world", "!"],
attention_mask: [1, 1, 1],
};
expect(encoding.ids).toEqual([1, 2, 3]);
});
});

describe("Pre-tokenizer exports", () => {
it("should export MetaspacePreTokenizer", () => {
expect(MetaspacePreTokenizer).toBeDefined();
const metaspace = new MetaspacePreTokenizer({
type: "Metaspace",
replacement: "▁",
add_prefix_space: true,
});
expect(metaspace).toBeInstanceOf(MetaspacePreTokenizer);
});

it("should export Whitespace pre-tokenizer", () => {
expect(Whitespace).toBeDefined();
const whitespace = new Whitespace();
expect(whitespace).toBeInstanceOf(Whitespace);
});

it("MetaspacePreTokenizer should work correctly", () => {
const metaspace = new MetaspacePreTokenizer({
type: "Metaspace",
replacement: "▁",
add_prefix_space: true,
});
const result = metaspace.pre_tokenize_text("hello world");
expect(result).toEqual(["▁hello▁world"]);
});

it("Whitespace pre-tokenizer should work correctly", () => {
const whitespace = new Whitespace();
const result = whitespace.pre_tokenize_text("hello world!");
expect(result).toEqual(["hello", "world", "!"]);
});
});

describe("Model exports", () => {
it("should export BPE model", () => {
expect(BPE).toBeDefined();
});

it("BPE model should be instantiable", () => {
const bpe = new BPE({
type: "BPE",
vocab: { a: 0, b: 1, c: 2 },
merges: [["a", "b"]],
unk_token: "<unk>",
ignore_merges: false,
});
expect(bpe).toBeInstanceOf(BPE);
});
});

describe("Integration test - import from main export", () => {
it("should support importing everything from main export", async () => {
// All exports should be available from the main index
const {
Tokenizer: T1,
MetaspacePreTokenizer: M1,
Whitespace: W1,
BPE: B1,
} = await import("../src/index");
expect(T1).toBeDefined();
expect(M1).toBeDefined();
expect(W1).toBeDefined();
expect(B1).toBeDefined();
});
});
});
48 changes: 48 additions & 0 deletions tests/package-exports.test.ts
@@ -0,0 +1,48 @@
// Integration test to verify all exports are available from main export
import { Tokenizer } from "../src";
import type { Encoding } from "../src";
import {
MetaspacePreTokenizer,
Whitespace,
BPE,
Lowercase,
StripAccents,
BPEDecoder,
TemplateProcessing,
} from "../src";

describe("Main export integration", () => {
it("should import Tokenizer and Encoding", () => {
expect(Tokenizer).toBeDefined();

// Encoding is a type-only export
const enc: Encoding = {
ids: [1, 2, 3],
tokens: ["a", "b", "c"],
attention_mask: [1, 1, 1],
};
expect(enc.ids).toHaveLength(3);
});

it("should import pre-tokenizers from main export", () => {
expect(MetaspacePreTokenizer).toBeDefined();
expect(Whitespace).toBeDefined();
});

it("should import models from main export", () => {
expect(BPE).toBeDefined();
});

it("should import normalizers from main export", () => {
expect(Lowercase).toBeDefined();
expect(StripAccents).toBeDefined();
});

it("should import decoders from main export", () => {
expect(BPEDecoder).toBeDefined();
});

it("should import post-processors from main export", () => {
expect(TemplateProcessing).toBeDefined();
});
});
9 changes: 8 additions & 1 deletion tsconfig.build.json
@@ -8,5 +8,12 @@
"outDir": "types",
"noEmit": false
},
"include": ["src/index.ts"]
"include": [
"src/index.ts",
"src/pre-tokenizers.ts",
"src/models.ts",
"src/normalizers.ts",
"src/decoders.ts",
"src/post-processors.ts"
]
}