diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
index b4f3086e..79a37494 100644
--- a/.github/workflows/cifuzz.yml
+++ b/.github/workflows/cifuzz.yml
@@ -23,7 +23,7 @@ jobs:
         dry-run: false
         language: c++
     - name: Upload Crash
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       if: failure() && steps.build.outcome == 'success'
       with:
         name: artifacts
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index b7a2e499..b1cbcf73 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -1,76 +1,99 @@
 name: CI for general build
-
+# This workflow handles the general build process including CMake configuration,
+# C++ build, Python wrapper compilation, and testing across multiple platforms
 on:
-  push:
-    branches: [ master ]
-    tags:
-      - 'v*'
   pull_request:
     branches: [ master ]
+    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+
+# Prevent concurrent workflow runs on the same PR
+concurrency:
+  group: cmake-${{ github.event_name }}-${{ github.event.pull_request.number || github.sha }}
+  cancel-in-progress: false
+
+env:
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PYTHONPATH: ${{ github.workspace }}/build/root/lib
+  LD_LIBRARY_PATH: ${{ github.workspace }}/build/root/lib
 
 permissions:
-  contents: read
+  contents: write
+  pull-requests: write
+  actions: write
+  checks: write
+  id-token: write
 
 jobs:
   build:
+    # Only run on pull requests from forks, or on manual dispatch
+    if: (github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork) || github.event_name == 'workflow_dispatch'
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, ubuntu-20.04, windows-latest, macOS-11 ]
+        os: [ ubuntu-latest ]
         arch: [ x64 ]
-        include:
-          - os: windows-latest
-            arch: x86
     runs-on: ${{ matrix.os }}
-    permissions:
-      contents: write # svenstaro/upload-release-action
+    # Permissions are inherited from the workflow level
     steps:
+    - name: Install Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y cmake build-essential swig python3-dev
     - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+      with:
+        fetch-depth: 2
     - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
       with:
         python-version: '3.x'
         architecture: ${{matrix.arch}}
-    - name: Config for Windows
-      if: runner.os == 'Windows'
+    - name: Configure CMake
       run: |
-        if ("${{matrix.arch}}" -eq "x64") {
-          $msbuildPlatform = "x64"
-        } else {
-          $msbuildPlatform = "Win32"
-        }
-        cmake -A $msbuildPlatform -B ${{github.workspace}}/build -DSPM_BUILD_TEST=ON -DSPM_ENABLE_SHARED=OFF -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
-
-    - name: Config for Unix
-      if: runner.os != 'Windows'
-      run: cmake -B ${{github.workspace}}/build -DSPM_BUILD_TEST=ON -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
-      env:
-        CMAKE_OSX_ARCHITECTURES: arm64;x86_64
+        echo "Configuring CMake build..."
+        cmake -B ${{github.workspace}}/build \
+          -DSPM_BUILD_TEST=ON \
+          -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/build/root
 
     - name: Build
-      run: cmake --build ${{github.workspace}}/build --config Release --target install --parallel 8
+      run: |
+        echo "Building with CMake..."
+        cmake --build ${{github.workspace}}/build --config Release --target install --parallel 8
 
     - name: Test
       working-directory: ${{github.workspace}}/build
-      run: ctest -C Release --output-on-failure
+      run: |
+        echo "Running tests..."
+        ctest -C Release --output-on-failure -V
 
     - name: Package
       working-directory: ${{github.workspace}}/build
-      run: cpack
+      run: |
+        echo "Creating package..."
+        cpack -V
 
-    - name: Build Python wrapper
+    - name: Build Python wrapper (Unix)
+      if: runner.os != 'Windows'
       working-directory: ${{github.workspace}}/python
+      shell: bash
       run: |
+        python -m pip install --upgrade pip setuptools wheel
+        python -m pip install build pytest
         python -m pip install --require-hashes --no-dependencies -r ../.github/workflows/requirements/base.txt
-        python setup.py build
-        python setup.py bdist_wheel
-        python -m pytest
+        # Ensure we have the built C++ library in the Python path
+        echo "PYTHONPATH=${{github.workspace}}/build/root/lib" >> $GITHUB_ENV
+        echo "LD_LIBRARY_PATH=${{github.workspace}}/build/root/lib" >> $GITHUB_ENV
+        python setup.py build -v
+        python setup.py bdist_wheel -v
+        python -m pytest -v --log-cli-level=INFO
 
-    - name: Upload artifcacts
-      uses: actions/upload-artifact@v3
+    - name: Upload artifacts
+      uses: actions/upload-artifact@v4
       with:
-        name: artifcacts
+        name: artifacts-${{ matrix.os }}-${{ matrix.arch }}
         path: ./build/*.7z
 
     - name: Upload Release Assets
diff --git a/.github/workflows/wheel.yml b/.github/workflows/wheel.yml
index a41cac95..9d8fbb5a 100644
--- a/.github/workflows/wheel.yml
+++ b/.github/workflows/wheel.yml
@@ -8,8 +8,16 @@ on:
   pull_request:
     branches: [ master ]
 
+concurrency:
+  group: wheel-${{ github.event.pull_request.number || github.sha }}-${{ github.event.pull_request.head.ref || github.ref_name }}
+  cancel-in-progress: false
+
 permissions:
-  contents: read
+  contents: write
+  pull-requests: write
+  actions: write
+  checks: write
+  issues: write
 
 jobs:
   build_wheels:
@@ -18,16 +26,16 @@ jobs:
       digests-macos: ${{ steps.hash-macos.outputs.digests }}
       digests-windows: ${{ steps.hash-windows.outputs.digests }}
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macOS-11]
     runs-on: ${{ matrix.os }}
     name: Build wheels on ${{ matrix.os }}
-    permissions:
-      contents: write # svenstaro/upload-release-action
-
     steps:
     - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+      with:
+        fetch-depth: 0
     - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
       with:
         python-version: "3.x"
@@ -70,7 +78,8 @@ jobs:
         CIBW_ARCHS_MACOS: x86_64 universal2 arm64
         CIBW_ARCHS_WINDOWS: auto ARM64
         CIBW_SKIP: "pp* *-musllinux_*"
-        CIBW_BUILD_VERBOSITY: 1
+        CIBW_TEST_SKIP: "*-win_arm64 *_aarch64 *-macosx_arm64"
+        CIBW_BUILD_VERBOSITY: 2
 
     - name: Build sdist archive
       working-directory: ${{github.workspace}}/python
@@ -91,9 +100,9 @@ jobs:
       run: cp -f dist/*.tar.gz wheelhouse/
 
     - name: Upload artifact
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
-        name: artifacts
+        name: artifacts-${{ matrix.os }}
        path: |
          ./python/wheelhouse/*.whl
          ./python/wheelhouse/*.tar.gz
@@ -124,7 +133,7 @@ jobs:
       if: runner.os == 'Windows'
       run: echo "digests=$(sha256sum ./python/wheelhouse/* | base64 -w0)" >> $GITHUB_OUTPUT
 
-  gather-disgests:
+  gather-digests:
     needs: [build_wheels]
     outputs:
       digests: ${{ steps.hash.outputs.digests }}
@@ -138,19 +147,26 @@
        WINDOWS_DIGESTS: "${{ needs.build_wheels.outputs.digests-windows }}"
      run: |
        set -euo pipefail
-        echo "$LINUX_DIGESTS" | base64 -d > checksums.txt
-        echo "$MACOS_DIGESTS" | base64 -d >> checksums.txt
-        echo "$WINDOWS_DIGESTS" | base64 -d >> checksums.txt
+        touch checksums.txt
+        if [ ! -z "${LINUX_DIGESTS:-}" ]; then
+          echo "$LINUX_DIGESTS" | base64 -d >> checksums.txt
+        fi
+        if [ ! -z "${MACOS_DIGESTS:-}" ]; then
+          echo "$MACOS_DIGESTS" | base64 -d >> checksums.txt
+        fi
+        if [ ! -z "${WINDOWS_DIGESTS:-}" ]; then
+          echo "$WINDOWS_DIGESTS" | base64 -d >> checksums.txt
+        fi
        echo "digests=$(cat checksums.txt | base64 -w0)" >> $GITHUB_OUTPUT
 
   provenance:
     if: startsWith(github.ref, 'refs/tags/')
-    needs: [build_wheels, gather-disgests]
+    needs: [build_wheels, gather-digests]
     permissions:
       actions: read # To read the workflow path.
       id-token: write # To sign the provenance.
       contents: write # To add assets to a release.
     uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.0.0
     with:
-      base64-subjects: "${{ needs.gather-disgests.outputs.digests }}"
+      base64-subjects: "${{ needs.gather-digests.outputs.digests }}"
       upload-assets: true # Optional: Upload to a new release
diff --git a/README.md b/README.md
index 76acb3e6..a3548a6d 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
 SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
 Neural Network-based text generation systems where the vocabulary size
 is predetermined prior to the neural model training. SentencePiece implements
-**subword units** (e.g., **byte-pair-encoding (BPE)** [[Sennrich et al.](https://www.aclweb.org/anthology/P16-1162)]) and
+**subword units** (e.g., **byte-pair-encoding (BPE)** [[Sennrich et al.](https://aclanthology.org/P16-1162)]) and
 **unigram language model** [[Kudo.](https://arxiv.org/abs/1804.10959)])
 with the extension of direct training from raw sentences. SentencePiece allows us to make a purely end-to-end
 system that does not depend on language-specific pre/postprocessing.
diff --git a/python/setup.py b/python/setup.py
index f7d8cf1e..c4226b02 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -24,8 +24,14 @@
 from setuptools.command.build_ext import build_ext as _build_ext
 from setuptools.command.build_py import build_py as _build_py
 
+# Add the source directory to the Python path
+package_root = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(package_root, 'src', 'sentencepiece'))
 sys.path.append(os.path.join('.', 'test'))
 
+# Import version directly from the package
+from _version import __version__
+
 
 def long_description():
   with codecs.open('README.md', 'r', 'utf-8') as f:
@@ -33,9 +39,6 @@ def long_description():
   return long_description
 
 
-exec(open('src/sentencepiece/_version.py').read())
-
-
 def run_pkg_config(section, pkg_config_path=None):
   try:
     cmd = 'pkg-config sentencepiece --{}'.format(section)
@@ -192,6 +195,7 @@ def get_win_arch():
     license='Apache',
     platforms='Unix',
     py_modules=[
+        'sentencepiece/_init',
         'sentencepiece/__init__',
         'sentencepiece/_version',
         'sentencepiece/sentencepiece_model_pb2',
diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py
index e028957d..a4b17bec 100644
--- a/python/src/sentencepiece/__init__.py
+++ b/python/src/sentencepiece/__init__.py
@@ -4,18 +4,71 @@
 # Do not make changes to this file unless you know what you are doing - modify
 # the SWIG interface file instead.
 
-from sys import version_info as _swig_python_version_info
-# Import the low-level C/C++ module
+# First import initialization module to set up paths
 if __package__ or "." in __name__:
-    from . import _sentencepiece
+    from . 
import _init else: - import _sentencepiece + import _init + +import re +import csv +import sys +import os +from io import StringIO +from io import BytesIO +from sys import version_info as _swig_python_version_info try: import builtins as __builtin__ except ImportError: import __builtin__ +# Module state tracking +_sentencepiece_module = None +_module_loading = False +_module_initialized = False +_registration_complete = False +_registration_in_progress = False +_module_load_attempted = False +_loading_lock = False + +def _load_sentencepiece(): + """Load and cache the SWIG module with proper initialization checks.""" + global _sentencepiece_module, _module_loading, _module_initialized + global _registration_complete, _registration_in_progress, _module_load_attempted + global _loading_lock + + # Return cached module if already loaded + if _sentencepiece_module is not None and _module_initialized: + return _sentencepiece_module + + # Prevent circular imports during loading + if _module_loading or _loading_lock: + if _module_load_attempted: + raise ImportError("Circular import detected while loading _sentencepiece") + return None + + try: + _loading_lock = True + _module_loading = True + _module_load_attempted = True + + # Import SWIG module based on package context + if __package__ or "." in __name__: + from . import _sentencepiece as _sp + else: + import _sentencepiece as _sp + + _sentencepiece_module = _sp + _module_initialized = True + return _sentencepiece_module + except ImportError as e: + raise ImportError(f"Failed to load _sentencepiece module: {e}") + finally: + _module_loading = False + _loading_lock = False + _module_load_attempted = False + def _swig_repr(self): try: strthis = "proxy of " + self.this.__repr__() @@ -23,7 +76,6 @@ def _swig_repr(self): strthis = "" return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,) - def _swig_setattr_nondynamic_instance_variable(set): def set_instance_attr(self, name, value): if name == "this": @@ -36,7 +88,6 @@ def set_instance_attr(self, name, value): raise AttributeError("You cannot add instance attributes to %s" % self) return set_instance_attr - def _swig_setattr_nondynamic_class_variable(set): def set_class_attr(cls, name, value): if hasattr(cls, name) and not isinstance(getattr(cls, name), property): @@ -45,47 +96,70 @@ def set_class_attr(cls, name, value): raise AttributeError("You cannot add class attributes to %s" % cls) return set_class_attr - def _swig_add_metaclass(metaclass): """Class decorator for adding a metaclass to a SWIG wrapped class - a slimmed down version of six.add_metaclass""" def wrapper(cls): return metaclass(cls.__name__, cls.__bases__, cls.__dict__.copy()) return wrapper - class _SwigNonDynamicMeta(type): """Meta class to enforce nondynamic attributes (no new attributes) for a class""" __setattr__ = _swig_setattr_nondynamic_class_variable(type.__setattr__) - +# Define all classes before any registrations class ImmutableSentencePieceText_ImmutableSentencePiece(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") __repr__ = _swig_repr def __init__(self): - _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText_ImmutableSentencePiece()) - __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText_ImmutableSentencePiece + self.this = None + self._initialized = False + + def _initialize(self): + if self._initialized: + return + try: + _sp = 
_load_sentencepiece() + if not hasattr(_sp, 'new_ImmutableSentencePieceText_ImmutableSentencePiece'): + raise ImportError("SWIG module not properly initialized") + self.this = _sp.new_ImmutableSentencePieceText_ImmutableSentencePiece() + self._initialized = True + except ImportError as e: + raise RuntimeError(f"Failed to initialize: {e}") + + def _ensure_initialized(self): + if not self._initialized: + self._initialize() + + __swig_destroy__ = property(lambda self: _load_sentencepiece().delete_ImmutableSentencePieceText_ImmutableSentencePiece if self._initialized else None) def _piece(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__piece(self) def _surface(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__surface(self) def _id(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__id(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__id(self) def _begin(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__begin(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__begin(self) def _end(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__end(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__end(self) def _surface_as_bytes(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__surface_as_bytes(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__surface_as_bytes(self) def _piece_as_bytes(self): - return _sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece__piece_as_bytes(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_ImmutableSentencePiece__piece_as_bytes(self) piece = property(_piece) piece_as_bytes = property(_piece_as_bytes) @@ -96,49 +170,73 @@ def _piece_as_bytes(self): end = property(_end) def __str__(self): - return ('piece: \"{}\"\n' - 'id: {}\n' - 'surface: \"{}\"\n' - 'begin: {}\n' - 'end: {}\n').format(self.piece, self.id, self.surface, - self.begin, self.end) + self._ensure_initialized() + return ('piece: \"{}\"\n' + 'id: {}\n' + 'surface: \"{}\"\n' + 'begin: {}\n' + 'end: {}\n').format(self.piece, self.id, self.surface, + self.begin, self.end) def __eq__(self, other): - return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end + self._ensure_initialized() + return self.piece == other.piece and self.id == other.id and self.surface == other.surface and self.begin == other.begin and self.end == other.end def __hash__(self): - return hash(str(self)) + self._ensure_initialized() + return hash(str(self)) __repr__ = __str__ - -# Register ImmutableSentencePieceText_ImmutableSentencePiece in _sentencepiece: -_sentencepiece.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece) class ImmutableSentencePieceText(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership 
flag") __repr__ = _swig_repr def __init__(self): - _sentencepiece.ImmutableSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableSentencePieceText()) - __swig_destroy__ = _sentencepiece.delete_ImmutableSentencePieceText + self.this = None + self._initialized = False + + def _initialize(self): + if self._initialized: + return + try: + _sp = _load_sentencepiece() + if not hasattr(_sp, 'new_ImmutableSentencePieceText'): + raise ImportError("SWIG module not properly initialized") + self.this = _sp.new_ImmutableSentencePieceText() + self._initialized = True + except ImportError as e: + raise RuntimeError(f"Failed to initialize: {e}") + + def _ensure_initialized(self): + if not self._initialized: + self._initialize() + + __swig_destroy__ = property(lambda self: _load_sentencepiece().delete_ImmutableSentencePieceText if self._initialized else None) def _pieces_size(self): - return _sentencepiece.ImmutableSentencePieceText__pieces_size(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText__pieces_size(self) def _pieces(self, index): - return _sentencepiece.ImmutableSentencePieceText__pieces(self, index) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText__pieces(self, index) def _text(self): - return _sentencepiece.ImmutableSentencePieceText__text(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText__text(self) def _score(self): - return _sentencepiece.ImmutableSentencePieceText__score(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText__score(self) def SerializeAsString(self): - return _sentencepiece.ImmutableSentencePieceText_SerializeAsString(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText_SerializeAsString(self) def _text_as_bytes(self): - return _sentencepiece.ImmutableSentencePieceText__text_as_bytes(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableSentencePieceText__text_as_bytes(self) text = property(_text) text_as_bytes = property(_text_as_bytes) @@ -185,24 +283,45 @@ def __str__(self): __repr__ = __str__ -# Register ImmutableSentencePieceText in _sentencepiece: -_sentencepiece.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) +# Registration will be handled by _initialize_all_registrations() after all classes are defined + class ImmutableNBestSentencePieceText(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") __repr__ = _swig_repr def __init__(self): - _sentencepiece.ImmutableNBestSentencePieceText_swiginit(self, _sentencepiece.new_ImmutableNBestSentencePieceText()) - __swig_destroy__ = _sentencepiece.delete_ImmutableNBestSentencePieceText + self.this = None + self._initialized = False + + def _initialize(self): + if self._initialized: + return + try: + _sp = _load_sentencepiece() + if not hasattr(_sp, 'new_ImmutableNBestSentencePieceText'): + raise ImportError("SWIG module not properly initialized") + self.this = _sp.new_ImmutableNBestSentencePieceText() + self._initialized = True + except ImportError as e: + raise RuntimeError(f"Failed to initialize: {e}") + + def _ensure_initialized(self): + if not self._initialized: + self._initialize() + + __swig_destroy__ = property(lambda self: _load_sentencepiece().delete_ImmutableNBestSentencePieceText if self._initialized else None) def _nbests_size(self): - return _sentencepiece.ImmutableNBestSentencePieceText__nbests_size(self) + 
self._ensure_initialized() + return _load_sentencepiece().ImmutableNBestSentencePieceText__nbests_size(self) def _nbests(self, index): - return _sentencepiece.ImmutableNBestSentencePieceText__nbests(self, index) + self._ensure_initialized() + return _load_sentencepiece().ImmutableNBestSentencePieceText__nbests(self, index) def SerializeAsString(self): - return _sentencepiece.ImmutableNBestSentencePieceText_SerializeAsString(self) + self._ensure_initialized() + return _load_sentencepiece().ImmutableNBestSentencePieceText_SerializeAsString(self) class ImmutableSentencePieceTextIterator: def __init__(self, proto): @@ -243,182 +362,183 @@ def __str__(self): # Register ImmutableNBestSentencePieceText in _sentencepiece: -_sentencepiece.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) +_load_sentencepiece().ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) class SentencePieceProcessor(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") __repr__ = _swig_repr def __init__(self): - _sentencepiece.SentencePieceProcessor_swiginit(self, _sentencepiece.new_SentencePieceProcessor()) - __swig_destroy__ = _sentencepiece.delete_SentencePieceProcessor + _sp = _load_sentencepiece() + _sp.SentencePieceProcessor_swiginit(self, _sp.new_SentencePieceProcessor()) + __swig_destroy__ = property(lambda self: _load_sentencepiece().delete_SentencePieceProcessor) def LoadFromSerializedProto(self, serialized): - return _sentencepiece.SentencePieceProcessor_LoadFromSerializedProto(self, serialized) + return _load_sentencepiece().SentencePieceProcessor_LoadFromSerializedProto(self, serialized) def SetEncodeExtraOptions(self, extra_option): - return _sentencepiece.SentencePieceProcessor_SetEncodeExtraOptions(self, extra_option) + return _load_sentencepiece().SentencePieceProcessor_SetEncodeExtraOptions(self, extra_option) def SetDecodeExtraOptions(self, extra_option): - return _sentencepiece.SentencePieceProcessor_SetDecodeExtraOptions(self, extra_option) + return _load_sentencepiece().SentencePieceProcessor_SetDecodeExtraOptions(self, extra_option) def SetVocabulary(self, valid_vocab): - return _sentencepiece.SentencePieceProcessor_SetVocabulary(self, valid_vocab) + return _load_sentencepiece().SentencePieceProcessor_SetVocabulary(self, valid_vocab) def ResetVocabulary(self): - return _sentencepiece.SentencePieceProcessor_ResetVocabulary(self) + return _load_sentencepiece().SentencePieceProcessor_ResetVocabulary(self) def LoadVocabulary(self, filename, threshold): - return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold) + return _load_sentencepiece().SentencePieceProcessor_LoadVocabulary(self, filename, threshold) def CalculateEntropy(self, *args): - return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args) + return _load_sentencepiece().SentencePieceProcessor_CalculateEntropy(self, *args) def GetPieceSize(self): - return _sentencepiece.SentencePieceProcessor_GetPieceSize(self) + return _load_sentencepiece().SentencePieceProcessor_GetPieceSize(self) def PieceToId(self, piece): - return _sentencepiece.SentencePieceProcessor_PieceToId(self, piece) + return _load_sentencepiece().SentencePieceProcessor_PieceToId(self, piece) def IdToPiece(self, id): - return _sentencepiece.SentencePieceProcessor_IdToPiece(self, id) + return _load_sentencepiece().SentencePieceProcessor_IdToPiece(self, id) def GetScore(self, id): - return 
_sentencepiece.SentencePieceProcessor_GetScore(self, id) + return _load_sentencepiece().SentencePieceProcessor_GetScore(self, id) def IsUnknown(self, id): - return _sentencepiece.SentencePieceProcessor_IsUnknown(self, id) + return _load_sentencepiece().SentencePieceProcessor_IsUnknown(self, id) def IsControl(self, id): - return _sentencepiece.SentencePieceProcessor_IsControl(self, id) + return _load_sentencepiece().SentencePieceProcessor_IsControl(self, id) def IsUnused(self, id): - return _sentencepiece.SentencePieceProcessor_IsUnused(self, id) + return _load_sentencepiece().SentencePieceProcessor_IsUnused(self, id) def IsByte(self, id): - return _sentencepiece.SentencePieceProcessor_IsByte(self, id) + return _load_sentencepiece().SentencePieceProcessor_IsByte(self, id) def unk_id(self): - return _sentencepiece.SentencePieceProcessor_unk_id(self) + return _load_sentencepiece().SentencePieceProcessor_unk_id(self) def bos_id(self): - return _sentencepiece.SentencePieceProcessor_bos_id(self) + return _load_sentencepiece().SentencePieceProcessor_bos_id(self) def eos_id(self): - return _sentencepiece.SentencePieceProcessor_eos_id(self) + return _load_sentencepiece().SentencePieceProcessor_eos_id(self) def pad_id(self): - return _sentencepiece.SentencePieceProcessor_pad_id(self) + return _load_sentencepiece().SentencePieceProcessor_pad_id(self) def serialized_model_proto(self): - return _sentencepiece.SentencePieceProcessor_serialized_model_proto(self) + return _load_sentencepiece().SentencePieceProcessor_serialized_model_proto(self) def LoadFromFile(self, arg): - return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) + return _load_sentencepiece().SentencePieceProcessor_LoadFromFile(self, arg) def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsPieces(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsSerializedProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsImmutableProto(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, 
nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsIdsBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsPiecesBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsSerializedProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__EncodeAsImmutableProtoBatch(self, ins, num_threads, enable_sampling, nbest_size, alpha, add_bos, add_eos, reverse, emit_unk_piece) def _DecodeIds(self, ids): - return _sentencepiece.SentencePieceProcessor__DecodeIds(self, ids) + return _load_sentencepiece().SentencePieceProcessor__DecodeIds(self, ids) def _DecodeIdsAsBytes(self, ids): - return _sentencepiece.SentencePieceProcessor__DecodeIdsAsBytes(self, ids) + return _load_sentencepiece().SentencePieceProcessor__DecodeIdsAsBytes(self, ids) def _DecodePieces(self, pieces): - return _sentencepiece.SentencePieceProcessor__DecodePieces(self, pieces) + return _load_sentencepiece().SentencePieceProcessor__DecodePieces(self, pieces) def _DecodeIdsAsSerializedProto(self, ids): - return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProto(self, ids) + return _load_sentencepiece().SentencePieceProcessor__DecodeIdsAsSerializedProto(self, ids) def _DecodePiecesAsSerializedProto(self, pieces): - return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces) + return _load_sentencepiece().SentencePieceProcessor__DecodePiecesAsSerializedProto(self, pieces) def _DecodeIdsAsImmutableProto(self, ids): - return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProto(self, ids) + return _load_sentencepiece().SentencePieceProcessor__DecodeIdsAsImmutableProto(self, ids) def _DecodePiecesAsImmutableProto(self, pieces): - return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProto(self, pieces) + return _load_sentencepiece().SentencePieceProcessor__DecodePiecesAsImmutableProto(self, pieces) def _DecodeIdsBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads) + return 
_load_sentencepiece().SentencePieceProcessor__DecodeIdsBatch(self, ins, num_threads) def _DecodeIdsAsBytesBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodeIdsAsBytesBatch(self, ins, num_threads) + return _load_sentencepiece().SentencePieceProcessor__DecodeIdsAsBytesBatch(self, ins, num_threads) def _DecodeIdsAsSerializedProtoBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads) + return _load_sentencepiece().SentencePieceProcessor__DecodeIdsAsSerializedProtoBatch(self, ins, num_threads) def _DecodeIdsAsImmutableProtoBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(self, ins, num_threads) + return _load_sentencepiece().SentencePieceProcessor__DecodeIdsAsImmutableProtoBatch(self, ins, num_threads) def _DecodePiecesBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads) + return _load_sentencepiece().SentencePieceProcessor__DecodePiecesBatch(self, ins, num_threads) def _DecodePiecesAsSerializedProtoBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads) + return _load_sentencepiece().SentencePieceProcessor__DecodePiecesAsSerializedProtoBatch(self, ins, num_threads) def _DecodePiecesAsImmutableProtoBatch(self, ins, num_threads): - return _sentencepiece.SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(self, ins, num_threads) + return _load_sentencepiece().SentencePieceProcessor__DecodePiecesAsImmutableProtoBatch(self, ins, num_threads) def _NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__NBestEncodeAsIds(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) def _NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__NBestEncodeAsPieces(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) def _NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__NBestEncodeAsSerializedProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) def _NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__NBestEncodeAsImmutableProto(self, text, nbest_size, add_bos, add_eos, reverse, emit_unk_piece) def _SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + return 
_load_sentencepiece().SentencePieceProcessor__SampleEncodeAndScoreAsIds(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) def _SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__SampleEncodeAndScoreAsPieces(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) def _SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__SampleEncodeAndScoreAsSerializedProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) def _SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece): - return _sentencepiece.SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) + return _load_sentencepiece().SentencePieceProcessor__SampleEncodeAndScoreAsImmutableProto(self, text, num_samples, alpha, wor, include_best, add_bos, add_eos, reverse, emit_unk_piece) def _Normalize(self, text): - return _sentencepiece.SentencePieceProcessor__Normalize(self, text) + return _load_sentencepiece().SentencePieceProcessor__Normalize(self, text) def _NormalizeWithOffsets(self, text): - return _sentencepiece.SentencePieceProcessor__NormalizeWithOffsets(self, text) + return _load_sentencepiece().SentencePieceProcessor__NormalizeWithOffsets(self, text) def _CalculateEntropy(self, text, alpha): - return _sentencepiece.SentencePieceProcessor__CalculateEntropy(self, text, alpha) + return _load_sentencepiece().SentencePieceProcessor__CalculateEntropy(self, text, alpha) def _CalculateEntropyBatch(self, ins, alpha, num_threads): - return _sentencepiece.SentencePieceProcessor__CalculateEntropyBatch(self, ins, alpha, num_threads) + return _load_sentencepiece().SentencePieceProcessor__CalculateEntropyBatch(self, ins, alpha, num_threads) def _OverrideNormalizerSpec(self, args): - return _sentencepiece.SentencePieceProcessor__OverrideNormalizerSpec(self, args) + return _load_sentencepiece().SentencePieceProcessor__OverrideNormalizerSpec(self, args) def Init(self, model_file=None, @@ -961,14 +1081,22 @@ def Load(self, model_file=None, model_proto=None): return self.LoadFromFile(model_file) -# Register SentencePieceProcessor in _sentencepiece: -_sentencepiece.SentencePieceProcessor_swigregister(SentencePieceProcessor) +# Define registration functions that will be called after all classes are defined +def _register_processor(): + _load_sentencepiece().SentencePieceProcessor_swigregister(SentencePieceProcessor) + +def _register_trainer(): + _load_sentencepiece().SentencePieceTrainer_swigregister(SentencePieceTrainer) + +def _register_normalizer(): + _load_sentencepiece().SentencePieceNormalizer_swigregister(SentencePieceNormalizer) def SetRandomGeneratorSeed(seed): - return _sentencepiece.SetRandomGeneratorSeed(seed) + return 
_load_sentencepiece().SetRandomGeneratorSeed(seed) def SetMinLogLevel(v): - return _sentencepiece.SetMinLogLevel(v) + return _load_sentencepiece().SetMinLogLevel(v) + class SentencePieceTrainer(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") @@ -978,23 +1106,23 @@ def __init__(self, *args, **kwargs): @staticmethod def _TrainFromString(arg): - return _sentencepiece.SentencePieceTrainer__TrainFromString(arg) + return _load_sentencepiece().SentencePieceTrainer__TrainFromString(arg) @staticmethod def _TrainFromMap(args): - return _sentencepiece.SentencePieceTrainer__TrainFromMap(args) + return _load_sentencepiece().SentencePieceTrainer__TrainFromMap(args) @staticmethod def _TrainFromMap2(args, iter): - return _sentencepiece.SentencePieceTrainer__TrainFromMap2(args, iter) + return _load_sentencepiece().SentencePieceTrainer__TrainFromMap2(args, iter) @staticmethod def _TrainFromMap3(args): - return _sentencepiece.SentencePieceTrainer__TrainFromMap3(args) + return _load_sentencepiece().SentencePieceTrainer__TrainFromMap3(args) @staticmethod def _TrainFromMap4(args, iter): - return _sentencepiece.SentencePieceTrainer__TrainFromMap4(args, iter) + return _load_sentencepiece().SentencePieceTrainer__TrainFromMap4(args, iter) @staticmethod def _Train(arg=None, **kwargs): @@ -1046,40 +1174,63 @@ def Train(arg=None, logstream=None, **kwargs): with _LogStream(ostream=logstream): SentencePieceTrainer._Train(arg=arg, **kwargs) - -# Register SentencePieceTrainer in _sentencepiece: -_sentencepiece.SentencePieceTrainer_swigregister(SentencePieceTrainer) class SentencePieceNormalizer(object): thisown = property(lambda x: x.this.own(), lambda x, v: x.this.own(v), doc="The membership flag") __repr__ = _swig_repr def __init__(self): - _sentencepiece.SentencePieceNormalizer_swiginit(self, _sentencepiece.new_SentencePieceNormalizer()) - __swig_destroy__ = _sentencepiece.delete_SentencePieceNormalizer + self.this = None + self._initialized = False + + def _initialize(self): + if self._initialized: + return + try: + _sp = _load_sentencepiece() + if not hasattr(_sp, 'new_SentencePieceNormalizer'): + raise ImportError("SWIG module not properly initialized") + self.this = _sp.new_SentencePieceNormalizer() + self._initialized = True + except ImportError as e: + raise RuntimeError(f"Failed to initialize: {e}") + + def _ensure_initialized(self): + if not self._initialized: + self._initialize() + + __swig_destroy__ = property(lambda self: _load_sentencepiece().delete_SentencePieceNormalizer if self._initialized else None) def LoadFromSerializedProto(self, serialized): - return _sentencepiece.SentencePieceNormalizer_LoadFromSerializedProto(self, serialized) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer_LoadFromSerializedProto(self, serialized) def LoadFromRuleTSV(self, filename): - return _sentencepiece.SentencePieceNormalizer_LoadFromRuleTSV(self, filename) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer_LoadFromRuleTSV(self, filename) def LoadFromRuleName(self, name): - return _sentencepiece.SentencePieceNormalizer_LoadFromRuleName(self, name) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer_LoadFromRuleName(self, name) def serialized_model_proto(self): - return _sentencepiece.SentencePieceNormalizer_serialized_model_proto(self) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer_serialized_model_proto(self) def 
LoadFromFile(self, arg): - return _sentencepiece.SentencePieceNormalizer_LoadFromFile(self, arg) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer_LoadFromFile(self, arg) def _Normalize(self, text): - return _sentencepiece.SentencePieceNormalizer__Normalize(self, text) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer__Normalize(self, text) def _NormalizeWithOffsets(self, text): - return _sentencepiece.SentencePieceNormalizer__NormalizeWithOffsets(self, text) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer__NormalizeWithOffsets(self, text) def _SetProtoField(self, name, value): - return _sentencepiece.SentencePieceNormalizer__SetProtoField(self, name, value) + self._ensure_initialized() + return _load_sentencepiece().SentencePieceNormalizer__SetProtoField(self, name, value) def Init(self, model_file=None, @@ -1100,7 +1251,7 @@ def Init(self, escape_whitespaces: escape whitespaces. remove_extra_whitespaces: remove extra whitespaces. """ - + self._ensure_initialized() _sentencepiece_normalizer_init_native(self) if model_file: @@ -1131,6 +1282,7 @@ def _normalize(text): def __getstate__(self): + self._ensure_initialized() return self.serialized_model_proto() @@ -1138,10 +1290,99 @@ def __setstate__(self, serialized_model_proto): self.__init__() self.LoadFromSerializedProto(serialized_model_proto) +# Global initialization and registration state +_module_loading = False +_module_initialized = False +_registration_complete = False +_registration_in_progress = False + +def _register_immutable_classes(): + """Register immutable classes in the correct order.""" + global _registration_complete, _registration_in_progress + if _registration_complete: + return True + + if _registration_in_progress: + return False -# Register SentencePieceNormalizer in _sentencepiece: -_sentencepiece.SentencePieceNormalizer_swigregister(SentencePieceNormalizer) + try: + _registration_in_progress = True + _sp = _load_sentencepiece() + # Register immutable classes in dependency order + if not hasattr(_sp, 'ImmutableSentencePieceText_ImmutableSentencePiece_swigregister'): + _registration_in_progress = False + return False + + # Ensure all required registration functions exist + required_funcs = [ + 'ImmutableSentencePieceText_ImmutableSentencePiece_swigregister', + 'ImmutableSentencePieceText_swigregister', + 'ImmutableNBestSentencePieceText_swigregister' + ] + if not all(hasattr(_sp, func) for func in required_funcs): + _registration_in_progress = False + return False + + # Register in dependency order + _sp.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece) + _sp.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) + _sp.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) + + _registration_complete = True + _registration_in_progress = False + return True + except ImportError as e: + _registration_in_progress = False + raise ImportError(f"Failed to load SWIG module during immutable class registration: {e}") + except AttributeError as e: + _registration_in_progress = False + raise ImportError(f"Failed to register immutable classes - missing SWIG attributes: {e}") + +def _initialize_all_registrations(): + """Initialize all registrations after classes are defined.""" + global _module_initialized, _registration_complete, _registration_in_progress + + if _module_initialized and _registration_complete: + return # Prevent 
double initialization + + if _registration_in_progress: + return # Prevent recursive initialization + try: + _registration_in_progress = True + # Ensure SWIG module is loaded first + _sp = _load_sentencepiece() + if _sp is None: + _registration_in_progress = False + raise ImportError("Failed to load SWIG module") + + # Register immutable classes first + if not _register_immutable_classes(): + _registration_in_progress = False + raise ImportError("Failed to register immutable classes") + + # Register processor classes in order, with dependency checks + processor_registrations = [ + ('SentencePieceProcessor_swigregister', _register_processor), + ('SentencePieceTrainer_swigregister', _register_trainer), + ('SentencePieceNormalizer_swigregister', _register_normalizer) + ] + + for attr, register_func in processor_registrations: + if hasattr(_sp, attr): + register_func() + + _module_initialized = True + _registration_in_progress = False + except ImportError as e: + _registration_in_progress = False + raise ImportError(f"Failed to initialize registrations: {e}") + except Exception as e: + _registration_in_progress = False + raise ImportError(f"Unexpected error during registration initialization: {e}") + +# Initialize all registrations after classes are defined +_initialize_all_registrations() import re import csv @@ -1181,6 +1422,51 @@ def _batched_func(self, arg): setattr(classname, name, _batched_func) +def _register_all_classes(): + """Register all SWIG-generated classes after they are fully defined.""" + global _registration_complete, _registration_in_progress, _registration_lock + if _registration_complete: + return + + # First ensure module is fully loaded without registration + _registration_lock = True + try: + # Load module without registrations first + _sp = None + for _ in range(2): # Try twice to handle potential circular imports + _sp = _load_sentencepiece() + if _sp is not None: + break + + if _sp is None: + raise ImportError("Failed to load _sentencepiece module") + + # Now that module is loaded, perform registrations + _registration_in_progress = True + try: + # Register immutable classes first, with retries + for _ in range(2): + try: + _sp.ImmutableSentencePieceText_ImmutableSentencePiece_swigregister(ImmutableSentencePieceText_ImmutableSentencePiece) + _sp.ImmutableSentencePieceText_swigregister(ImmutableSentencePieceText) + _sp.ImmutableNBestSentencePieceText_swigregister(ImmutableNBestSentencePieceText) + break + except (AttributeError, ImportError): + continue + + # Register processor classes after immutables + _sp.SentencePieceProcessor_swigregister(SentencePieceProcessor) + _sp.SentencePieceTrainer_swigregister(SentencePieceTrainer) + _sp.SentencePieceNormalizer_swigregister(SentencePieceNormalizer) + _registration_complete = True + finally: + _registration_in_progress = False + finally: + _registration_lock = False + +# Delay registration until after all classes are defined +_register_all_classes() + _sentencepiece_processor_init_native = SentencePieceProcessor.__init__ _sentencepiece_normalizer_init_native = SentencePieceNormalizer.__init__ setattr(SentencePieceProcessor, '__init__', SentencePieceProcessor.Init) @@ -1220,5 +1506,3 @@ def __exit__(self, type, value, traceback): os.dup2(self.orig_stream_dup, self.orig_stream_fileno) os.close(self.orig_stream_dup) self.ostream.close() - - diff --git a/python/src/sentencepiece/_init.py b/python/src/sentencepiece/_init.py new file mode 100644 index 00000000..dcd20d70 --- /dev/null +++ b/python/src/sentencepiece/_init.py @@ 
-0,0 +1,23 @@
+"""
+SentencePiece Python Module Initialization
+This file handles the proper initialization sequence for the SentencePiece module.
+"""
+import os
+import sys
+from pathlib import Path
+
+def initialize_module():
+    """Initialize the SentencePiece module by setting up the proper import paths."""
+    # Add the directory containing _sentencepiece to Python path if needed
+    module_dir = Path(__file__).parent
+    if str(module_dir) not in sys.path:
+        sys.path.insert(0, str(module_dir))
+
+    # Set LD_LIBRARY_PATH for Linux systems if needed
+    if sys.platform.startswith('linux'):
+        lib_path = os.environ.get('LD_LIBRARY_PATH', '')
+        if str(module_dir) not in lib_path:
+            os.environ['LD_LIBRARY_PATH'] = f"{module_dir}:{lib_path}"
+
+# Initialize the module when imported
+initialize_module()
diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i
index 76417d0e..634dbd7a 100644
--- a/python/src/sentencepiece/sentencepiece.i
+++ b/python/src/sentencepiece/sentencepiece.i
@@ -1916,6 +1916,8 @@ inline void InitNumThreads(const std::vector &ins, int *num_threads) {
 
 %pythoncode %{
+from ._version import __version__
+
 
 import re
 import csv
 import sys
@@ -1974,8 +1976,6 @@ _add_snake_case(SentencePieceNormalizer)
 set_random_generator_seed = SetRandomGeneratorSeed
 set_min_log_level = SetMinLogLevel
 
-from ._version import __version__
-
 class _LogStream(object):
   def __init__(self, ostream=None):
     self.ostream = ostream
diff --git a/python/test/sentencepiece_test.py b/python/test/sentencepiece_test.py
index b043cc2b..9aeb4a67 100755
--- a/python/test/sentencepiece_test.py
+++ b/python/test/sentencepiece_test.py
@@ -15,15 +15,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.!
+import os
 import sys
-
-sys.path.insert(0, 'src')
-
-from collections import defaultdict
+import unittest
 import io
-import os
 import pickle
-import unittest
+from collections import defaultdict
+
+# Ensure proper module initialization by using absolute imports
+if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'src')):
+  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
 
 import sentencepiece as spm
 
 print('VERSION={}'.format(spm.__version__))
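Note (not part of the diff): the rewritten `__init__.py` routes wrapper calls through the lazy `_load_sentencepiece()` loader, and the test change above only adjusts `sys.path` before `import sentencepiece as spm`. The sketch below is a minimal smoke check of the public API that the lazy loader has to keep working; the model path `m.model` is a placeholder for any trained SentencePiece model.

```python
# Minimal smoke check (assumes a trained model file exists at m.model).
import sentencepiece as spm

print(spm.__version__)  # version re-exported from sentencepiece/_version.py

# Constructing the processor is what first exercises the lazily loaded SWIG module.
sp = spm.SentencePieceProcessor(model_file='m.model')

ids = sp.encode('Hello world', out_type=int)  # token ids
print(ids)
print(sp.decode(ids))  # round-trips back to the original text (modulo normalization)
```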