2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@
*.model
*.zip
.vscode/
__pycache__/
*.so
13 changes: 0 additions & 13 deletions bytepiece-py/.gitignore

This file was deleted.

2 changes: 1 addition & 1 deletion bytepiece-py/.python-version
@@ -1 +1 @@
3.12.2
3.11
10 changes: 6 additions & 4 deletions bytepiece-py/Cargo.toml
@@ -1,13 +1,15 @@
[package]
name = "bytepiece-py"
name = "bytepiece_py"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "bytepiece_py"
name = "_core"
# "cdylib" is necessary to produce a shared library for Python to import from.
crate-type = ["cdylib"]

[dependencies]
pyo3 = "0.21"
# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
# "abi3-py39" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.9
pyo3 = { version = "0.23.3", features = ["extension-module", "abi3-py39"] }
bytepiece = { workspace = true }
4 changes: 1 addition & 3 deletions bytepiece-py/README.md
@@ -1,3 +1 @@
# bytepiece-py

Describe your project here.
?
28 changes: 18 additions & 10 deletions scripts/bench.py → bytepiece-py/examples/bench.py
@@ -1,4 +1,11 @@
import bytepiece
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "rs-bytepiece",
# "bytepiece-py",
# ]
# ///
# import bytepiece
import timeit
import bytepiece_py
import rs_bytepiece
@@ -7,21 +14,22 @@
TEXT = "BytePiece是一个Byte-based的Unigram分词器,纯Python实现,更加易读和易拓展。由于采用了新的训练算法,所以压缩率通常比现有Tokenizer更高,同时支持多进程加速训练。此外,它直接操作文本的UTF-8 Bytes,几乎不进行任何的预处理,所以更加纯粹和语言无关。"


MODEL = "models/bytepiece_80k.model"
MODEL = "../models/bytepiece_80k.model"

t1 = bytepiece.Tokenizer(MODEL)
# t1 = bytepiece.Tokenizer(MODEL)
t2 = bytepiece_py.Tokenizer(MODEL)
t3 = rs_bytepiece.Tokenizer(MODEL)

assert t1.encode(TEXT) == t2.encode(TEXT)
assert t1.decode(t1.encode(TEXT)) == t2.decode(t2.encode(TEXT))
print(t1.encode(TEXT))
# assert t1.encode(TEXT) == t2.encode(TEXT)
# assert t1.decode(t1.encode(TEXT)) == t2.decode(t2.encode(TEXT))
# print(t1.encode(TEXT))
print(t2.encode(TEXT))
print(t3.encode(TEXT))


print('bytepiece:')
print(timeit.timeit("t1.encode(TEXT)", globals=globals(), number=10000))
print('bytepiece-py (ours)')
# print('bytepiece:')
# print(timeit.timeit("t1.encode(TEXT)", globals=globals(), number=10000))
print("bytepiece-py (ours)")
print(timeit.timeit("t2.encode(TEXT)", globals=globals(), number=10000))
print('rs-bytepiece')
print("rs-bytepiece")
print(timeit.timeit("t3.encode(TEXT)", globals=globals(), number=10000))
29 changes: 13 additions & 16 deletions bytepiece-py/pyproject.toml
@@ -1,28 +1,25 @@
[project]
name = "bytepiece-py"
version = "0.2.1"
version = "0.1.1"
description = "Add your description here"
readme = "README.md"
authors = [
{ name = "SunDoge", email = "[email protected]" }
]
requires-python = ">=3.8"
dependencies = []
readme = "README.md"
requires-python = ">= 3.8"

[tool.maturin]
module-name = "bytepiece_py._core"
python-packages = ["bytepiece_py"]
python-source = "src"

[build-system]
requires = ["maturin>=1.2,<2.0"]
requires = ["maturin>=1.0,<2.0"]
build-backend = "maturin"

[tool.rye]
managed = true
dev-dependencies = [
"pip>=24.0",
[dependency-groups]
dev = [
"maturin>=1.7.8",
"rs-bytepiece>=0.2.2",
]

[tool.maturin]
python-source = "python"
module-name = "bytepiece_py._lowlevel"
features = ["pyo3/extension-module"]

[tool.rye.scripts]
dev = "maturin develop --skip-install"
11 changes: 0 additions & 11 deletions bytepiece-py/requirements-dev.lock

This file was deleted.

10 changes: 0 additions & 10 deletions bytepiece-py/requirements.lock

This file was deleted.

@@ -1,7 +1,7 @@
import unicodedata
from typing import Dict, List, Tuple, Union

from bytepiece_py import _lowlevel
from bytepiece_py import _core


def normalize(text: str) -> bytes:
@@ -11,9 +11,9 @@ def normalize(text: str) -> bytes:
class Tokenizer:
def __init__(self, pieces: Union[str, Dict[str, Tuple[str, int, str]]]) -> None:
if isinstance(pieces, str):
self._tokenizer = _lowlevel._Tokenizer.from_path(pieces)
self._tokenizer = _core._Tokenizer.from_path(pieces)
else:
self._tokenizer = _lowlevel._Tokenizer(pieces)
self._tokenizer = _core._Tokenizer(pieces)

def encode(
self,
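A usage sketch of the wrapper above, not part of the PR: the model path is a placeholder, the import assumes Tokenizer is re-exported from the package root, and the encode/decode keyword arguments are taken from the Rust _Tokenizer signatures in tokenizer.rs on the assumption that the wrapper forwards them unchanged:

from bytepiece_py import Tokenizer

tok = Tokenizer("models/bytepiece_80k.model")   # or pass a dict of pieces directly
ids = tok.encode("hello BytePiece", add_bos=False, add_eos=False, alpha=-1.0)
raw = tok.decode(ids)                           # assumed to return raw UTF-8 bytes
print(raw.decode("utf-8"))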
3 changes: 3 additions & 0 deletions bytepiece-py/src/bytepiece_py/_core.pyi
@@ -0,0 +1,3 @@
from __future__ import annotations

def hello_from_bin() -> str: ...
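The generated stub above still carries the project-template placeholder hello_from_bin. For reference, a fuller stub sketched from the methods visible in tokenizer.rs and in the Python wrapper could look like the following; the signatures are inferred from the diff rather than taken from the PR, and the return type of encode in particular is a guess:

from __future__ import annotations

class _Tokenizer:
    def __init__(self, pieces: dict) -> None: ...
    @staticmethod
    def from_path(path: str) -> _Tokenizer: ...
    def tokenize(self, text: str, alpha: float = -1.0) -> list[bytes]: ...
    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False, alpha: float = -1.0) -> list[int]: ...
    def decode(self, ids: list[int]) -> bytes: ...
    def id_to_piece(self, id: int) -> bytes: ...
    def piece_to_id(self, piece: bytes) -> int: ...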
Empty file.
9 changes: 5 additions & 4 deletions bytepiece-py/src/lib.rs
@@ -2,11 +2,12 @@ mod error;
mod tokenizer;

use pyo3::prelude::*;
use tokenizer::_Tokenizer;

/// A Python module implemented in Rust.
/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
#[pymodule]
fn _lowlevel(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<_Tokenizer>()?;
fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<tokenizer::_Tokenizer>()?;
Ok(())
}
9 changes: 3 additions & 6 deletions bytepiece-py/src/tokenizer.rs
@@ -37,10 +37,7 @@ impl _Tokenizer {
) -> Vec<Bound<'py, PyBytes>> {
let bs = text.as_bytes();
let tokens = py.allow_threads(|| self.inner.tokenize(&bs, alpha));
tokens
.into_iter()
.map(|bs| PyBytes::new_bound(py, bs))
.collect()
tokens.into_iter().map(|bs| PyBytes::new(py, bs)).collect()
}

#[pyo3(signature = (text, add_bos = false, add_eos = false, alpha = -1.0))]
@@ -58,11 +55,11 @@

pub fn decode<'py>(&self, py: Python<'py>, ids: Vec<usize>) -> Result<Bound<'py, PyBytes>> {
let res = py.allow_threads(|| self.inner.decode(&ids))?;
Ok(PyBytes::new_bound(py, &res))
Ok(PyBytes::new(py, &res))
}

pub fn id_to_piece<'py>(&self, py: Python<'py>, id: usize) -> Bound<'py, PyBytes> {
PyBytes::new_bound(py, self.inner.id_to_piece(id))
PyBytes::new(py, self.inner.id_to_piece(id))
}

pub fn piece_to_id(&self, piece: &Bound<PyBytes>) -> usize {