[core] Avoid key bytes OOM in ClusteringFileRewriter.sortAndRewriteFile #314
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
################################################################################
name: Python Check Code Style and Test

# Triggered only by changes under paimon-python/ or to this workflow file;
# Markdown files are excluded by the negated pattern.
on:
  push:
    paths:
      - 'paimon-python/**'
      - '!**/*.md'
      - '.github/workflows/paimon-python-checks.yml'
  pull_request:
    paths:
      - 'paimon-python/**'
      - '!**/*.md'
      - '.github/workflows/paimon-python-checks.yml'

# Quoted so that no YAML loader retypes the version values.
env:
  JDK_VERSION: '8'
  MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true
  LUMINA_DATA_VERSION: '0.1.0'

# Group runs by workflow, event type and PR number; when there is no PR number
# the run id is used instead, so such runs never share a group.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
  cancel-in-progress: true

jobs:
  # Lint + test on a subset of Python versions (3.6, 3.10 and 3.11), not every one.
  lint-python:
    runs-on: ubuntu-latest
    container: "python:${{ matrix.python-version }}-slim"
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.6.15', '3.10', '3.11']
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Set up JDK ${{ env.JDK_VERSION }}
        uses: actions/setup-java@v5
        with:
          java-version: ${{ env.JDK_VERSION }}
          distribution: 'temurin'

      - name: Set up Maven
        uses: stCarolas/setup-maven@v5
        with:
          maven-version: '3.8.8'

      - name: Install system dependencies
        shell: bash
        run: |
          apt-get update && apt-get install -y \
            build-essential \
            git \
            curl \
            pkg-config \
            libssl-dev \
            && apt-get clean \
            && rm -rf /var/lib/apt/lists/*

      - name: Verify Java and Maven installation
        run: |
          java -version
          mvn -version

      - name: Install Rust toolchain
        run: |
          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal
          echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"

      - name: Build Tantivy native library
        run: |
          cd paimon-tantivy/paimon-tantivy-jni/rust
          cargo build --release

      # Copy the freshly built shared library into the JNI module's native
      # resources directory.
      - name: Copy Tantivy native library to resources
        run: |
          RESOURCE_DIR=paimon-tantivy/paimon-tantivy-jni/src/main/resources/native/linux-amd64
          mkdir -p "${RESOURCE_DIR}"
          cp paimon-tantivy/paimon-tantivy-jni/rust/target/release/libtantivy_jni.so "${RESOURCE_DIR}/"

      - name: Verify Python version
        run: python --version

      # Install into a throw-away --target directory: this only checks that the
      # requirements can be installed on this interpreter.
      - name: Verify requirements.txt dependencies can be installed
        shell: bash
        run: |
          cd paimon-python
          python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || {
            echo "ERROR: Failed to resolve dependencies from dev/requirements.txt"
            rm -rf "$TEMP_DIR"
            exit 1
          }
          rm -rf "$TEMP_DIR"
          echo "✓ dev/requirements.txt can be resolved on this Python"

      - name: Build Java
        run: |
          echo "Start compiling modules"
          mvn -T 2C -B -ntp clean install -DskipTests

      # Python 3.6 gets its own pinned dependency set; the other versions share
      # the set in the else branch. df -h brackets the install to log disk usage.
      - name: Install Python dependencies
        shell: bash
        run: |
          df -h
          if [[ "${{ matrix.python-version }}" == "3.6.15" ]]; then
            python -m pip install --upgrade pip==21.3.1
            python --version
            # Redirect order matters: stdout is discarded, stderr still reaches the job log.
            python -m pip install --no-cache-dir \
              pyroaring readerwriterlock==1.0.9 'fsspec==2021.10.1' 'cachetools==4.2.4' 'ossfs==2021.8.0' \
              pyarrow==6.0.1 pandas==1.1.5 'polars==0.9.12' 'fastavro==1.4.7' zstandard==0.19.0 \
              dataclasses==0.8.0 flake8 pytest py4j==0.10.9.9 requests \
              parameterized==0.8.1 2>&1 >/dev/null
            python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION }}' -i https://pypi.org/simple/
          else
            python -m pip install --upgrade pip
            pip install torch --index-url https://download.pytorch.org/whl/cpu
            python -m pip install \
              pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 \
              ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 \
              duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 cramjam flake8==4.0.1 \
              pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0
            python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION }}' -i https://pypi.org/simple/
            # vortex-data is installed only on Python 3.11 and newer.
            if python -c "import sys; sys.exit(0 if sys.version_info >= (3, 11) else 1)"; then
              python -m pip install vortex-data
            fi
          fi
          df -h

      - name: Build and install tantivy-py from source
        if: matrix.python-version != '3.6.15'
        shell: bash
        run: |
          # Quoted so bash cannot treat [patchelf] as a glob pattern.
          pip install 'maturin[patchelf]'
          git clone -b support_directory https://github.com/JingsongLi/tantivy-py.git /tmp/tantivy-py
          cd /tmp/tantivy-py
          maturin build --release
          pip install target/wheels/tantivy-*.whl

      # Relies on the maturin installed by the previous step.
      - name: Build and install pypaimon-rust from source
        if: matrix.python-version != '3.6.15'
        shell: bash
        run: |
          git clone https://github.com/apache/paimon-rust.git /tmp/paimon-rust
          cd /tmp/paimon-rust/bindings/python
          maturin build --release -o dist
          pip install dist/pypaimon_rust-*.whl
          pip install 'datafusion>=52'

      # pytest_torch is excluded (-e) here; the torch_test job includes it (-i).
      - name: Run lint-python.sh
        shell: bash
        run: |
          chmod +x paimon-python/dev/lint-python.sh
          ./paimon-python/dev/lint-python.sh -e pytest_torch

  # Runs lint-python.sh with only the pytest_torch stage included, on Python 3.10.
  torch_test:
    runs-on: ubuntu-latest
    container: "python:3.10-slim"
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Install system dependencies
        shell: bash
        run: |
          apt-get update && apt-get install -y \
            build-essential \
            git \
            curl \
            && apt-get clean \
            && rm -rf /var/lib/apt/lists/*

      - name: Verify Python version
        run: python --version

      - name: Install Python dependencies
        shell: bash
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          python -m pip install \
            pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 \
            ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 \
            duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 pylance==0.39.0 flake8==4.0.1 \
            pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0
          python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION }}' -i https://pypi.org/simple/

      - name: Run lint-python.sh
        shell: bash
        run: |
          chmod +x paimon-python/dev/lint-python.sh
          ./paimon-python/dev/lint-python.sh -i pytest_torch

  # One job: check dev/requirements.txt on each Python version (3.8 to 3.13) in
  # sequence, then run the Ray version compatibility test on 3.10.
  requirement_version_compatible_test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Install system dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y --no-install-recommends \
            build-essential git curl cmake \
            && sudo rm -rf /var/lib/apt/lists/*

      # Each pair below sets up one interpreter, then installs the requirements
      # into a throw-away --target directory.
      - name: Set up Python 3.8
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'
      - name: Resolve requirements (3.8)
        run: |
          cd paimon-python && python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || { rm -rf "$TEMP_DIR"; exit 1; }
          rm -rf "$TEMP_DIR" && echo "✓ dev/requirements.txt OK on Python 3.8"

      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
      - name: Resolve requirements (3.9)
        run: |
          cd paimon-python && python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || { rm -rf "$TEMP_DIR"; exit 1; }
          rm -rf "$TEMP_DIR" && echo "✓ dev/requirements.txt OK on Python 3.9"

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Resolve requirements (3.10)
        run: |
          cd paimon-python && python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || { rm -rf "$TEMP_DIR"; exit 1; }
          rm -rf "$TEMP_DIR" && echo "✓ dev/requirements.txt OK on Python 3.10"

      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Resolve requirements (3.11)
        run: |
          cd paimon-python && python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || { rm -rf "$TEMP_DIR"; exit 1; }
          rm -rf "$TEMP_DIR" && echo "✓ dev/requirements.txt OK on Python 3.11"

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Resolve requirements (3.12)
        run: |
          cd paimon-python && python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || { rm -rf "$TEMP_DIR"; exit 1; }
          rm -rf "$TEMP_DIR" && echo "✓ dev/requirements.txt OK on Python 3.12"

      - name: Set up Python 3.13
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Resolve requirements (3.13)
        run: |
          cd paimon-python && python -m pip install --upgrade pip
          TEMP_DIR=$(mktemp -d)
          python -m pip install -r dev/requirements.txt --target "$TEMP_DIR" || { rm -rf "$TEMP_DIR"; exit 1; }
          rm -rf "$TEMP_DIR" && echo "✓ dev/requirements.txt OK on Python 3.13"

      - name: Setup Python 3.10 for Ray test
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      # Ray itself is not installed here; the next step installs one Ray version
      # at a time.
      - name: Install base Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install torch --index-url https://download.pytorch.org/whl/cpu
          python -m pip install --no-cache-dir \
            pyroaring readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 \
            fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 \
            numpy==1.24.3 pandas==2.0.3 cramjam pytest~=7.0 py4j==0.10.9.9 requests \
            parameterized==0.9.0 packaging
          python -m pip install 'lumina-data>=${{ env.LUMINA_DATA_VERSION }}' -i https://pypi.org/simple/

      # For each Ray version: install it, assert the imported version matches,
      # run the Ray data tests, then uninstall before the next iteration.
      - name: Test Ray version compatibility
        run: |
          cd paimon-python
          echo "=========================================="
          echo "Testing Ray version compatibility"
          echo "=========================================="
          for ray_version in 2.44.0 2.48.0 2.53.0; do
            echo "Testing Ray version: $ray_version"
            python -m pip install --no-cache-dir -q ray==$ray_version
            python -c "import ray; print(f'Ray version: {ray.__version__}')"
            python -c "from packaging.version import parse; import ray; assert parse(ray.__version__) == parse('$ray_version'), f'Expected Ray $ray_version, got {ray.__version__}'"
            python -m pytest pypaimon/tests/ray_data_test.py::RayDataTest -v --tb=short || {
              echo "Tests failed for Ray $ray_version"; python -m pip uninstall -y ray; exit 1;
            }
            python -m pip uninstall -y ray
          done
        env:
          # Puts the in-tree paimon-python directory on Python's import path.
          PYTHONPATH: ${{ github.workspace }}/paimon-python