Skip to content

Commit 23ca660

Browse files
Copilot authored and xenova committed
Add additional exports for Encoding, pre-tokenizers, and models
Co-authored-by: xenova <[email protected]>
1 parent 0dab200 commit 23ca660

File tree

7 files changed

+151
-7
lines changed

7 files changed

+151
-7
lines changed

package.json

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,28 @@
1818
"import": "./dist/tokenizers.min.mjs"
1919
},
2020
"default": "./dist/tokenizers.min.mjs"
21+
},
22+
"./pre-tokenizers": {
23+
"types": "./types/pre-tokenizers.d.ts",
24+
"node": {
25+
"require": "./dist/pre-tokenizers.min.cjs",
26+
"import": "./dist/pre-tokenizers.min.mjs"
27+
},
28+
"browser": {
29+
"import": "./dist/pre-tokenizers.min.mjs"
30+
},
31+
"default": "./dist/pre-tokenizers.min.mjs"
32+
},
33+
"./models": {
34+
"types": "./types/models.d.ts",
35+
"node": {
36+
"require": "./dist/models.min.cjs",
37+
"import": "./dist/models.min.mjs"
38+
},
39+
"browser": {
40+
"import": "./dist/models.min.mjs"
41+
},
42+
"default": "./dist/models.min.mjs"
2143
}
2244
},
2345
"files": [

scripts/build.mjs

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ const reportSize = (outfile) => {
2525
console.log(`⚡ Done\n`);
2626
};
2727

28-
const build = async (outfile) => {
28+
const build = async (entryPoint, outfile) => {
2929
const format = outfile.endsWith(".mjs") ? "esm" : "cjs";
3030
const minifyOptions = /\.min\.[cm]js$/.test(outfile)
3131
? { minify: true, minifySyntax: true }
@@ -35,7 +35,7 @@ const build = async (outfile) => {
3535
bundle: true,
3636
treeShaking: true,
3737
logLevel: "silent",
38-
entryPoints: ["src/index.ts"],
38+
entryPoints: [entryPoint],
3939
platform: "neutral",
4040
metafile: true,
4141
format,
@@ -45,7 +45,17 @@ const build = async (outfile) => {
4545
reportSize(outfile);
4646
};
4747

48-
await build("dist/tokenizers.mjs");
49-
await build("dist/tokenizers.cjs");
50-
await build("dist/tokenizers.min.mjs");
51-
await build("dist/tokenizers.min.cjs");
48+
await build("src/index.ts", "dist/tokenizers.mjs");
49+
await build("src/index.ts", "dist/tokenizers.cjs");
50+
await build("src/index.ts", "dist/tokenizers.min.mjs");
51+
await build("src/index.ts", "dist/tokenizers.min.cjs");
52+
53+
await build("src/pre-tokenizers.ts", "dist/pre-tokenizers.mjs");
54+
await build("src/pre-tokenizers.ts", "dist/pre-tokenizers.cjs");
55+
await build("src/pre-tokenizers.ts", "dist/pre-tokenizers.min.mjs");
56+
await build("src/pre-tokenizers.ts", "dist/pre-tokenizers.min.cjs");
57+
58+
await build("src/models.ts", "dist/models.mjs");
59+
await build("src/models.ts", "dist/models.cjs");
60+
await build("src/models.ts", "dist/models.min.mjs");
61+
await build("src/models.ts", "dist/models.min.cjs");

src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
export { default as Tokenizer } from "./core/Tokenizer";
2+
export type { Encoding } from "./static/types";

src/models.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
export { default as BPE } from "./core/tokenizerModelImplementations/BPE";
2+
export { default as LegacyTokenizerModel } from "./core/tokenizerModelImplementations/LegacyTokenizerModel";
3+
export { default as Unigram } from "./core/tokenizerModelImplementations/Unigram";
4+
export { default as WordPiece } from "./core/tokenizerModelImplementations/WordPieceTokenizer";
5+
export { default as WordPieceTokenizer } from "./core/tokenizerModelImplementations/WordPieceTokenizer";

src/pre-tokenizers.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
export { default as BertPreTokenizer } from "./core/preTokenizer/BertPreTokenizer";
2+
export { default as ByteLevelPreTokenizer } from "./core/preTokenizer/ByteLevelPreTokenizer";
3+
export { default as DigitsPreTokenizer } from "./core/preTokenizer/DigitsPreTokenizer";
4+
export { default as Metaspace } from "./core/preTokenizer/MetaspacePreTokenizer";
5+
export { default as MetaspacePreTokenizer } from "./core/preTokenizer/MetaspacePreTokenizer";
6+
export { default as PreTokenizerSequence } from "./core/preTokenizer/PreTokenizerSequence";
7+
export { default as PunctuationPreTokenizer } from "./core/preTokenizer/PunctuationPreTokenizer";
8+
export { default as ReplacePreTokenizer } from "./core/preTokenizer/ReplacePreTokenizer";
9+
export { default as SplitPreTokenizer } from "./core/preTokenizer/SplitPreTokenizer";
10+
export { default as Whitespace } from "./core/preTokenizer/WhitespacePreTokenizer";
11+
export { default as WhitespacePreTokenizer } from "./core/preTokenizer/WhitespacePreTokenizer";
12+
export { default as WhitespaceSplit } from "./core/preTokenizer/WhitespaceSplit";

tests/exports.test.ts

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import { Tokenizer } from "../src";
2+
import type { Encoding } from "../src";
3+
import { Metaspace, Whitespace } from "../src/pre-tokenizers";
4+
import { BPE } from "../src/models";
5+
6+
describe("Additional exports", () => {
7+
describe("Main exports", () => {
8+
it("should export Tokenizer", () => {
9+
expect(Tokenizer).toBeDefined();
10+
});
11+
12+
it("should export Encoding type (compile-time test)", () => {
13+
// This test verifies that the Encoding type can be used
14+
const encoding: Encoding = {
15+
ids: [1, 2, 3],
16+
tokens: ["hello", "world", "!"],
17+
attention_mask: [1, 1, 1],
18+
};
19+
expect(encoding.ids).toEqual([1, 2, 3]);
20+
});
21+
});
22+
23+
describe("Pre-tokenizer exports", () => {
24+
it("should export Metaspace pre-tokenizer", () => {
25+
expect(Metaspace).toBeDefined();
26+
const metaspace = new Metaspace({
27+
type: "Metaspace",
28+
replacement: "▁",
29+
add_prefix_space: true,
30+
});
31+
expect(metaspace).toBeInstanceOf(Metaspace);
32+
});
33+
34+
it("should export Whitespace pre-tokenizer", () => {
35+
expect(Whitespace).toBeDefined();
36+
const whitespace = new Whitespace();
37+
expect(whitespace).toBeInstanceOf(Whitespace);
38+
});
39+
40+
it("Metaspace pre-tokenizer should work correctly", () => {
41+
const metaspace = new Metaspace({
42+
type: "Metaspace",
43+
replacement: "▁",
44+
add_prefix_space: true,
45+
});
46+
const result = metaspace.pre_tokenize_text("hello world");
47+
expect(result).toEqual(["▁hello▁world"]);
48+
});
49+
50+
it("Whitespace pre-tokenizer should work correctly", () => {
51+
const whitespace = new Whitespace();
52+
const result = whitespace.pre_tokenize_text("hello world!");
53+
expect(result).toEqual(["hello", "world", "!"]);
54+
});
55+
});
56+
57+
describe("Model exports", () => {
58+
it("should export BPE model", () => {
59+
expect(BPE).toBeDefined();
60+
});
61+
62+
it("BPE model should be instantiable", () => {
63+
const bpe = new BPE({
64+
type: "BPE",
65+
vocab: { a: 0, b: 1, c: 2 },
66+
merges: [["a", "b"]],
67+
unk_token: "<unk>",
68+
ignore_merges: false,
69+
});
70+
expect(bpe).toBeInstanceOf(BPE);
71+
});
72+
});
73+
74+
describe("Integration test - import paths", () => {
75+
it("should support the documented import syntax", async () => {
76+
// This test verifies that the documented import paths work
77+
// import { Tokenizer, Encoding } from "@huggingface/tokenizers";
78+
const { Tokenizer: T1 } = await import("../src/index");
79+
expect(T1).toBeDefined();
80+
// Encoding is a type-only export, so we can't test it at runtime
81+
82+
// import { Metaspace, Whitespace } from "@huggingface/tokenizers/pre-tokenizers";
83+
const { Metaspace: M1, Whitespace: W1 } = await import(
84+
"../src/pre-tokenizers"
85+
);
86+
expect(M1).toBeDefined();
87+
expect(W1).toBeDefined();
88+
89+
// import { BPE } from "@huggingface/tokenizers/models";
90+
const { BPE: B1 } = await import("../src/models");
91+
expect(B1).toBeDefined();
92+
});
93+
});
94+
});

tsconfig.build.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@
88
"outDir": "types",
99
"noEmit": false
1010
},
11-
"include": ["src/index.ts"]
11+
"include": ["src/index.ts", "src/pre-tokenizers.ts", "src/models.ts"]
1212
}

0 commit comments

Comments (0)