diff --git a/.gitignore b/.gitignore index e53d6d8..528dff7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ *.model *.zip .vscode/ +__pycache__/ +*.so \ No newline at end of file diff --git a/bytepiece-py/.gitignore b/bytepiece-py/.gitignore deleted file mode 100644 index 02981ea..0000000 --- a/bytepiece-py/.gitignore +++ /dev/null @@ -1,13 +0,0 @@ -# python generated files -__pycache__/ -*.py[oc] -build/ -dist/ -wheels/ -*.egg-info -# Rust -target/ - -# venv -.venv -*.so \ No newline at end of file diff --git a/bytepiece-py/.python-version b/bytepiece-py/.python-version index 8531a3b..2c07333 100644 --- a/bytepiece-py/.python-version +++ b/bytepiece-py/.python-version @@ -1 +1 @@ -3.12.2 +3.11 diff --git a/bytepiece-py/Cargo.toml b/bytepiece-py/Cargo.toml index 5f2e942..8c3a428 100644 --- a/bytepiece-py/Cargo.toml +++ b/bytepiece-py/Cargo.toml @@ -1,13 +1,15 @@ [package] -name = "bytepiece-py" +name = "bytepiece_py" version = "0.1.0" edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] -name = "bytepiece_py" +name = "_core" +# "cdylib" is necessary to produce a shared library for Python to import from. crate-type = ["cdylib"] [dependencies] -pyo3 = "0.21" +# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so) +# "abi3-py39" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.9 +pyo3 = { version = "0.23.3", features = ["extension-module", "abi3-py39"] } bytepiece = { workspace = true } \ No newline at end of file diff --git a/bytepiece-py/README.md b/bytepiece-py/README.md index dceae27..0d758c9 100644 --- a/bytepiece-py/README.md +++ b/bytepiece-py/README.md @@ -1,3 +1 @@ -# bytepiece-py - -Describe your project here. \ No newline at end of file +? \ No newline at end of file diff --git a/scripts/bench.py b/bytepiece-py/examples/bench.py similarity index 55% rename from scripts/bench.py rename to bytepiece-py/examples/bench.py index 7b30d6c..d3cd5aa 100644 --- a/scripts/bench.py +++ b/bytepiece-py/examples/bench.py @@ -1,4 +1,11 @@ -import bytepiece +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "rs-bytepiece", +# "bytepiece-py", +# ] +# /// +# import bytepiece import timeit import bytepiece_py import rs_bytepiece @@ -7,21 +14,22 @@ TEXT = "BytePiece是一个Byte-based的Unigram分词器,纯Python实现,更加易读和易拓展。由于采用了新的训练算法,所以压缩率通常比现有Tokenizer更高,同时支持多进程加速训练。此外,它直接操作文本的UTF-8 Bytes,几乎不进行任何的预处理,所以更加纯粹和语言无关。" -MODEL = "models/bytepiece_80k.model" +MODEL = "../models/bytepiece_80k.model" -t1 = bytepiece.Tokenizer(MODEL) +# t1 = bytepiece.Tokenizer(MODEL) t2 = bytepiece_py.Tokenizer(MODEL) t3 = rs_bytepiece.Tokenizer(MODEL) -assert t1.encode(TEXT) == t2.encode(TEXT) -assert t1.decode(t1.encode(TEXT)) == t2.decode(t2.encode(TEXT)) -print(t1.encode(TEXT)) +# assert t1.encode(TEXT) == t2.encode(TEXT) +# assert t1.decode(t1.encode(TEXT)) == t2.decode(t2.encode(TEXT)) +# print(t1.encode(TEXT)) print(t2.encode(TEXT)) +print(t3.encode(TEXT)) -print('bytepiece:') -print(timeit.timeit("t1.encode(TEXT)", globals=globals(), number=10000)) -print('bytepiece-py (ours)') +# print('bytepiece:') +# print(timeit.timeit("t1.encode(TEXT)", globals=globals(), number=10000)) +print("bytepiece-py (ours)") print(timeit.timeit("t2.encode(TEXT)", globals=globals(), number=10000)) -print('rs-bytepiece') +print("rs-bytepiece") print(timeit.timeit("t3.encode(TEXT)", globals=globals(), number=10000)) diff --git a/bytepiece-py/pyproject.toml b/bytepiece-py/pyproject.toml index c0910f3..3ed0cb0 100644 --- a/bytepiece-py/pyproject.toml +++ b/bytepiece-py/pyproject.toml @@ -1,28 +1,25 @@ [project] name = "bytepiece-py" -version = "0.2.1" +version = "0.1.1" description = "Add your description here" +readme = "README.md" authors = [ { name = "SunDoge", email = "triplez0@outlook.com" } ] +requires-python = ">=3.8" dependencies = [] -readme = "README.md" -requires-python = ">= 3.8" + +[tool.maturin] +module-name = "bytepiece_py._core" +python-packages = ["bytepiece_py"] +python-source = "src" [build-system] -requires = ["maturin>=1.2,<2.0"] +requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" -[tool.rye] -managed = true -dev-dependencies = [ - "pip>=24.0", +[dependency-groups] +dev = [ + "maturin>=1.7.8", + "rs-bytepiece>=0.2.2", ] - -[tool.maturin] -python-source = "python" -module-name = "bytepiece_py._lowlevel" -features = ["pyo3/extension-module"] - -[tool.rye.scripts] -dev = "maturin develop --skip-install" diff --git a/bytepiece-py/requirements-dev.lock b/bytepiece-py/requirements-dev.lock deleted file mode 100644 index 33d4094..0000000 --- a/bytepiece-py/requirements-dev.lock +++ /dev/null @@ -1,11 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: false -# with-sources: false - --e file:. -pip==24.0 diff --git a/bytepiece-py/requirements.lock b/bytepiece-py/requirements.lock deleted file mode 100644 index 8f23096..0000000 --- a/bytepiece-py/requirements.lock +++ /dev/null @@ -1,10 +0,0 @@ -# generated by rye -# use `rye lock` or `rye sync` to update this lockfile -# -# last locked with the following flags: -# pre: false -# features: [] -# all-features: false -# with-sources: false - --e file:. diff --git a/bytepiece-py/python/bytepiece_py/__init__.py b/bytepiece-py/src/bytepiece_py/__init__.py similarity index 88% rename from bytepiece-py/python/bytepiece_py/__init__.py rename to bytepiece-py/src/bytepiece_py/__init__.py index 688b684..7db316e 100644 --- a/bytepiece-py/python/bytepiece_py/__init__.py +++ b/bytepiece-py/src/bytepiece_py/__init__.py @@ -1,7 +1,7 @@ import unicodedata from typing import Dict, List, Tuple, Union -from bytepiece_py import _lowlevel +from bytepiece_py import _core def normalize(text: str) -> bytes: @@ -11,9 +11,9 @@ def normalize(text: str) -> bytes: class Tokenizer: def __init__(self, pieces: Union[str, Dict[str, Tuple[str, int, str]]]) -> None: if isinstance(pieces, str): - self._tokenizer = _lowlevel._Tokenizer.from_path(pieces) + self._tokenizer = _core._Tokenizer.from_path(pieces) else: - self._tokenizer = _lowlevel._Tokenizer(pieces) + self._tokenizer = _core._Tokenizer(pieces) def encode( self, diff --git a/bytepiece-py/src/bytepiece_py/_core.pyi b/bytepiece-py/src/bytepiece_py/_core.pyi new file mode 100644 index 0000000..f7f26a9 --- /dev/null +++ b/bytepiece-py/src/bytepiece_py/_core.pyi @@ -0,0 +1,3 @@ +from __future__ import annotations + +def hello_from_bin() -> str: ... diff --git a/bytepiece-py/src/bytepiece_py/py.typed b/bytepiece-py/src/bytepiece_py/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/bytepiece-py/src/lib.rs b/bytepiece-py/src/lib.rs index 6dd865a..53ca65e 100644 --- a/bytepiece-py/src/lib.rs +++ b/bytepiece-py/src/lib.rs @@ -2,11 +2,12 @@ mod error; mod tokenizer; use pyo3::prelude::*; -use tokenizer::_Tokenizer; -/// A Python module implemented in Rust. +/// A Python module implemented in Rust. The name of this function must match +/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to +/// import the module. #[pymodule] -fn _lowlevel(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_class::<_Tokenizer>()?; +fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; Ok(()) } diff --git a/bytepiece-py/src/tokenizer.rs b/bytepiece-py/src/tokenizer.rs index f2ea8bc..e2ac2f3 100644 --- a/bytepiece-py/src/tokenizer.rs +++ b/bytepiece-py/src/tokenizer.rs @@ -37,10 +37,7 @@ impl _Tokenizer { ) -> Vec> { let bs = text.as_bytes(); let tokens = py.allow_threads(|| self.inner.tokenize(&bs, alpha)); - tokens - .into_iter() - .map(|bs| PyBytes::new_bound(py, bs)) - .collect() + tokens.into_iter().map(|bs| PyBytes::new(py, bs)).collect() } #[pyo3(signature = (text, add_bos = false, add_eos = false, alpha = -1.0))] @@ -58,11 +55,11 @@ impl _Tokenizer { pub fn decode<'py>(&self, py: Python<'py>, ids: Vec) -> Result> { let res = py.allow_threads(|| self.inner.decode(&ids))?; - Ok(PyBytes::new_bound(py, &res)) + Ok(PyBytes::new(py, &res)) } pub fn id_to_piece<'py>(&self, py: Python<'py>, id: usize) -> Bound<'py, PyBytes> { - PyBytes::new_bound(py, self.inner.id_to_piece(id)) + PyBytes::new(py, self.inner.id_to_piece(id)) } pub fn piece_to_id(&self, piece: &Bound) -> usize { diff --git a/bytepiece-py/uv.lock b/bytepiece-py/uv.lock new file mode 100644 index 0000000..8969113 --- /dev/null +++ b/bytepiece-py/uv.lock @@ -0,0 +1,94 @@ +version = 1 +requires-python = ">=3.8" + +[[package]] +name = "bytepiece-py" +version = "0.1.1" +source = { editable = "." } + +[package.dev-dependencies] +dev = [ + { name = "maturin" }, + { name = "rs-bytepiece" }, +] + +[package.metadata] + +[package.metadata.requires-dev] +dev = [ + { name = "maturin", specifier = ">=1.7.8" }, + { name = "rs-bytepiece", specifier = ">=0.2.2" }, +] + +[[package]] +name = "maturin" +version = "1.7.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/1e/085ddc0e5b08ae7af7a743a0dd6ed06b22a1332288488f1a333137885150/maturin-1.7.8.tar.gz", hash = "sha256:649c6ef3f0fa4c5f596140d761dc5a4d577c485cc32fb5b9b344a8280352880d", size = 195704 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/ed/c8bb26e91c879e418ae1b01630722ed20b6fe0e6755be8d538d83666f136/maturin-1.7.8-py3-none-linux_armv6l.whl", hash = "sha256:c6950fd2790acd93265e1501cea66f9249cff19724654424ca75a3b17ebb315b", size = 7515691 }, + { url = "https://files.pythonhosted.org/packages/38/7a/573f969315f0b92a09a0a565d45e98812c87796e2e19a7856159ab234faf/maturin-1.7.8-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f98288d5c382bacf0c076871dfd50c38f1eb2248f417551e98dd6f47f6ee8afa", size = 14434454 }, + { url = "https://files.pythonhosted.org/packages/a6/17/46834841fbf19231487f185e68b95ca348cc05cce49be8787e0bc7e9dc47/maturin-1.7.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b2d4e0f674ca29864e6b86c2eb9fee8236d1c7496c25f7300e34229272468f4c", size = 7509122 }, + { url = "https://files.pythonhosted.org/packages/c1/8f/bf8b4871eb390a4baef2e0bb5016852c7c0311a9772e2945534cfa2ee40e/maturin-1.7.8-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:6cafb17bf57822bdc04423d9e3e766d42918d474848fe9833e397267514ba891", size = 7598870 }, + { url = "https://files.pythonhosted.org/packages/dc/43/c842be67a7c59568082345249b956138ae93d0b2474fb41c186ce26d05e1/maturin-1.7.8-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:2b2bdee0c3a84696b3a809054c43ead1a04b7b3321cbd5b8f5676e4ba4691d0f", size = 7932310 }, + { url = "https://files.pythonhosted.org/packages/12/12/42435d05f2d6c75eb621751e6f021d29eb34d18e3b9c5c94d828744c2d54/maturin-1.7.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:b8188b71259fc2bc568d9c8acc186fcfed96f42539bcb55b8e6f4ec26e411f37", size = 7321964 }, + { url = "https://files.pythonhosted.org/packages/b4/26/f3272ee985ebf9b3e8c4cd4f4efb022af1e12c9f53aed0dcc9a255399f4e/maturin-1.7.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:a4f58c2a53c2958a1bf090960b08b28e676136cd88ac2f5dfdcf1b14ea54ec06", size = 7408613 }, + { url = "https://files.pythonhosted.org/packages/36/7d/be27bcc7d3ac6e6c2136a8ec0cc56f227a292d6cfdde55e095b6c0aa24a9/maturin-1.7.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:c5d6c0c631d1fc646cd3834795e6cfd72ab4271d289df7e0f911261a02bec75f", size = 9496974 }, + { url = "https://files.pythonhosted.org/packages/e1/e8/0d7323e9a31c11edf69c4473d73eca74803ce3e2390abf8ae3ac7eb10b04/maturin-1.7.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c23664d19dadcbf800ef70f26afb2e0485a985c62889930934f019c565534c23", size = 10828401 }, + { url = "https://files.pythonhosted.org/packages/7e/82/5080e052c0d8c9872f6d4b94cae84c17ed7f2ea270d709210ea6445b655f/maturin-1.7.8-py3-none-win32.whl", hash = "sha256:403eebf1afa6f19b49425f089e39c53b8e597bc86a47f3a76e828dc78d27fa80", size = 6845240 }, + { url = "https://files.pythonhosted.org/packages/6d/c9/9b162361ded893f36038c2f8ac6a972ec441c11df8d17c440997eb28090f/maturin-1.7.8-py3-none-win_amd64.whl", hash = "sha256:1ce48d007438b895f8665314b6748ac0dab31e4f32049a60b52281dd2dccbdde", size = 7762332 }, + { url = "https://files.pythonhosted.org/packages/fa/40/46d4742db742f69a7fe0054cd7c82bc79b2d70cb8c91f7e737e75c28a5f3/maturin-1.7.8-py3-none-win_arm64.whl", hash = "sha256:cc92a62953205e8945b6cfe6943d6a8576a4442d30d9c67141f944f4f4640e62", size = 6501353 }, +] + +[[package]] +name = "rs-bytepiece" +version = "0.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8a/22/b6bbac87677550e256e049e97b2cc4a75aa33bd12bb81ac7c4709d407178/rs_bytepiece-0.2.2.tar.gz", hash = "sha256:f86206004808f118d7581fe314387c6d0c45566cd34fda346c439c3392e912cd", size = 1241456 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/63/809ac38242cf82098144777c8f82a94290d15aa04f9c45dbb57da7a78be8/rs_bytepiece-0.2.2-cp37-abi3-macosx_10_7_x86_64.whl", hash = "sha256:14338a8c8573df2ac4dc83567a58722d18984f678ce3426237d58e689288df1d", size = 2295816 }, + { url = "https://files.pythonhosted.org/packages/14/61/0a22cb90c845829383640d2a9ef17ab0286e9cef9189b7346194b5df2b72/rs_bytepiece-0.2.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2405a38e0a03985fabeb025eae65fcafcf5aae7c5f9b065c10cc17b72ae5e723", size = 3463328 }, + { url = "https://files.pythonhosted.org/packages/62/30/dea969abe55a7cf936c69b32b5ddda7eb09687300be9fa8923535da0d9f6/rs_bytepiece-0.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:2e6a9f8b78bc24d4856b00240c82392ebe6008b110ba482e22c00f48f81b4e6c", size = 3819501 }, +] + +[[package]] +name = "tomli" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 }, + { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 }, + { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 }, + { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 }, + { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 }, + { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 }, + { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 }, + { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 }, + { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 }, + { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 }, + { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 }, + { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 }, + { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 }, + { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 }, + { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 }, + { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 }, + { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 }, + { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 }, + { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 }, + { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 }, + { url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708 }, + { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582 }, + { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543 }, + { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691 }, + { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170 }, + { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530 }, + { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666 }, + { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954 }, + { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724 }, + { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383 }, + { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, +] diff --git a/bytepiece/Cargo.toml b/bytepiece/Cargo.toml index 8b92d2f..b8b3625 100644 --- a/bytepiece/Cargo.toml +++ b/bytepiece/Cargo.toml @@ -16,13 +16,13 @@ documentation = "https://docs.rs/bytepiece" [dependencies] aho-corasick = "1.1" base64-simd = "0.8.0" -fastrand = "2.0.1" -once_cell = "1.19.0" -ouroboros = "0.18.3" -regex = "1.9" +fastrand = "2.3.0" +once_cell = "1.20.2" +ouroboros = "0.18.4" +regex = "1.11.1" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -thiserror = "1.0" +thiserror = "2.0" [features] diff --git a/bytepiece/README.md b/bytepiece/README.md deleted file mode 100644 index d7566d9..0000000 --- a/bytepiece/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# bytepiece - diff --git a/scripts/download_model.py b/scripts/download_model.py index 8dd820d..067c3bc 100644 --- a/scripts/download_model.py +++ b/scripts/download_model.py @@ -1,3 +1,11 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "requests", +# ] +# /// + + import argparse import os import requests @@ -5,15 +13,15 @@ from zipfile import ZipFile MODELS = [ - 'bytepiece_80k', - 'bytepiece_160k', + "bytepiece_80k", + "bytepiece_160k", ] BASE_URL = "https://github.com/bojone/bytepiece/raw/main/models/" DEST_DIR = "models" parser = argparse.ArgumentParser() -parser.add_argument('-m', '--model', choices=MODELS, default=MODELS[0]) +parser.add_argument("-m", "--model", choices=MODELS, default=MODELS[0]) args = parser.parse_args() @@ -21,7 +29,7 @@ dest_dir.mkdir(exist_ok=True) model_name: str = args.model -model_filename = model_name + '.zip' +model_filename = model_name + ".zip" model_url = BASE_URL + model_filename model_file_path = dest_dir / model_filename @@ -30,10 +38,10 @@ if not model_file_path.exists(): resp = requests.get(model_url) - with model_file_path.open('wb') as f: + with model_file_path.open("wb") as f: f.write(resp.content) else: - print(f'{model_file_path} exists') + print(f"{model_file_path} exists") with ZipFile(model_file_path) as zip_file: zip_file.extractall(dest_dir)