From f01b9c644e98b59c06bb0be131239a475e69aa6b Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 10:10:22 -0400 Subject: [PATCH 01/15] updates --- .github/workflows/perform-release.yml | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index 6f82b26c6..7cdff207b 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -197,26 +197,13 @@ jobs: # build failure prevents any artifact from being published. # ══════════════════════════════════════════════════════════════════════ - pypi-publish: - name: Publish Python Wheels - if: ${{ !inputs.dry-run && !cancelled() && !failure() }} - needs: - - pypi-build - - nvingest-docker-build - - helm-build - uses: ./.github/workflows/reusable-pypi-publish.yml - secrets: - ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }} - ARTIFACTORY_USERNAME: ${{ secrets.ARTIFACTORY_USERNAME }} - ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} - nvingest-docker-publish: name: Publish nv-ingest Docker Image if: ${{ !inputs.dry-run && !cancelled() && !failure() }} needs: - determine-version - - pypi-build - nvingest-docker-build + - pypi-build - helm-build runs-on: linux-large-disk outputs: @@ -279,8 +266,8 @@ jobs: if: ${{ !inputs.dry-run && !inputs.skip-helm-chart && !cancelled() && !failure() }} needs: - determine-version - - pypi-build - nvingest-docker-build + - pypi-build - helm-build runs-on: ubuntu-latest env: @@ -337,6 +324,19 @@ jobs: --name nv-ingest \ --version "${{ needs.determine-version.outputs.version }}" + pypi-publish: + name: Publish Python Wheels (last — PyPI versions are immutable) + if: ${{ !inputs.dry-run && !cancelled() && !failure() }} + needs: + - pypi-build + - nvingest-docker-publish + - helm-publish + uses: ./.github/workflows/reusable-pypi-publish.yml + secrets: + ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }} + ARTIFACTORY_USERNAME: ${{ secrets.ARTIFACTORY_USERNAME }} + ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} + # ── Tag Release ─────────────────────────────────────────────────────── tag-release: name: Tag Release @@ -435,7 +435,7 @@ jobs: elif [ "$PYPI_BUILD_RESULT" != "success" ]; then MSG+="\n$(status_emoji "$PYPI_BUILD_RESULT") *PyPI Wheels* — Build: ${PYPI_BUILD_RESULT}" else - MSG+="\n$(status_emoji "$PYPI_PUBLISH_RESULT") *PyPI Wheels* — Publish blocked (other build failed)" + MSG+="\n$(status_emoji "$PYPI_PUBLISH_RESULT") *PyPI Wheels* — Publish blocked (other publish failed)" fi MSG+="\n \`nv-ingest-api\` \`nv-ingest-client\` \`nv-ingest\` \`nemo-retriever\`" From a6d354aa2fd6732feffd99d06b680e16235611a3 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 10:54:23 -0400 Subject: [PATCH 02/15] updates --- .github/workflows/perform-release.yml | 80 +++++++++++++-------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index 7cdff207b..ad6a39fda 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -138,6 +138,18 @@ jobs: cache-to: type=gha,scope=nvingest,mode=max cache-from: type=gha,scope=nvingest + - name: Export Docker image + run: | + docker save "${{ steps.meta.outputs.image }}" | gzip -1 > nv-ingest-docker-image.tar.gz + ls -lh nv-ingest-docker-image.tar.gz + + - name: Upload Docker image artifact + uses: actions/upload-artifact@v4 + with: + name: nv-ingest-docker-image + path: nv-ingest-docker-image.tar.gz + if-no-files-found: error + helm-build: name: Build Helm Chart if: ${{ !inputs.skip-helm-chart }} @@ -149,6 +161,11 @@ jobs: with: ref: ${{ needs.determine-version.outputs.source-ref }} + - name: Overlay CI scripts from workflow branch + run: | + git fetch --depth=1 origin "${{ github.ref_name }}" + git checkout FETCH_HEAD -- ci/scripts/ + - name: Setup Helm uses: azure/setup-helm@v4 @@ -191,6 +208,13 @@ jobs: --version "${{ needs.determine-version.outputs.version }}" \ --dry-run + - name: Upload Helm chart artifact + uses: actions/upload-artifact@v4 + with: + name: helm-chart + path: nv-ingest-*.tgz + if-no-files-found: error + # ══════════════════════════════════════════════════════════════════════ # Publish Phase — runs only after ALL builds succeed and dry-run is # off. Every publish job depends on every build job so that a single @@ -207,59 +231,30 @@ jobs: - helm-build runs-on: linux-large-disk outputs: - image: ${{ steps.meta.outputs.image }} + image: ${{ needs.nvingest-docker-build.outputs.image }} steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ needs.determine-version.outputs.source-ref }} - - name: Setup Docker Buildx - uses: ./.github/actions/setup-docker-buildx - with: - use-qemu: 'true' - platforms: 'linux/amd64,linux/arm64' - - name: Login to NGC uses: ./.github/actions/docker-login-ngc with: password: ${{ secrets.DOCKER_PASSWORD }} - - name: Set image metadata - id: meta - run: | - if [ -z "$DOCKER_REGISTRY" ]; then - echo "::error::DOCKER_REGISTRY secret is not set" - exit 1 - fi - IMAGE="${DOCKER_REGISTRY}/nv-ingest:${{ needs.determine-version.outputs.version }}" - echo "image=${IMAGE}" >> $GITHUB_OUTPUT - echo "Image tag: ${IMAGE}" - env: - DOCKER_REGISTRY: ${{ secrets.DOCKER_REGISTRY }} + - name: Download Docker image artifact + uses: actions/download-artifact@v4 + with: + name: nv-ingest-docker-image - - name: Create HF token file - env: - HF_ACCESS_TOKEN: ${{ secrets.HF_ACCESS_TOKEN }} + - name: Load and push image run: | - mkdir -p ./scripts/private_local - if [ -n "${HF_ACCESS_TOKEN}" ]; then - printf '%s' "${HF_ACCESS_TOKEN}" > ./scripts/private_local/hf_token.txt - fi - - - name: Build and push multi-platform image - uses: docker/build-push-action@v5 - with: - context: . - push: true - target: runtime - platforms: linux/amd64,linux/arm64 - build-args: | - DOWNLOAD_LLAMA_TOKENIZER=True - GIT_COMMIT=${{ github.sha }} - tags: ${{ steps.meta.outputs.image }} - secret-files: hf_token=./scripts/private_local/hf_token.txt - cache-from: type=gha,scope=nvingest + IMAGE="${{ needs.nvingest-docker-build.outputs.image }}" + echo "Loading image from tarball..." + gunzip -c nv-ingest-docker-image.tar.gz | docker load + echo "Pushing ${IMAGE}..." + docker push "${IMAGE}" helm-publish: name: Publish Helm Chart @@ -281,6 +276,11 @@ jobs: with: ref: ${{ needs.determine-version.outputs.source-ref }} + - name: Overlay CI scripts from workflow branch + run: | + git fetch --depth=1 origin "${{ github.ref_name }}" + git checkout FETCH_HEAD -- ci/scripts/ + - name: Setup Helm uses: azure/setup-helm@v4 From db42b0cc2128f86f84c4da527185f5e0cf0bd7a0 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 11:00:56 -0400 Subject: [PATCH 03/15] Update github action versions --- .github/workflows/perform-release.yml | 24 ++++++++++----------- .github/workflows/release-helm.yml | 4 ++-- .github/workflows/reusable-pypi-build.yml | 6 +++--- .github/workflows/reusable-pypi-publish.yml | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index ad6a39fda..05889334e 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -23,7 +23,7 @@ on: description: '⛔ SKIP Helm chart — disable Helm build and publish entirely' required: false type: boolean - default: true + default: false release-type: description: 'Release type for PyPI packages' required: false @@ -95,7 +95,7 @@ jobs: image: ${{ steps.meta.outputs.image }} steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ needs.determine-version.outputs.source-ref }} @@ -123,7 +123,7 @@ jobs: fi - name: Build image (validate) - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . push: false @@ -144,7 +144,7 @@ jobs: ls -lh nv-ingest-docker-image.tar.gz - name: Upload Docker image artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: nv-ingest-docker-image path: nv-ingest-docker-image.tar.gz @@ -157,7 +157,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ needs.determine-version.outputs.source-ref }} @@ -192,7 +192,7 @@ jobs: helm dependency build helm/ - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.12' @@ -209,7 +209,7 @@ jobs: --dry-run - name: Upload Helm chart artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: helm-chart path: nv-ingest-*.tgz @@ -234,7 +234,7 @@ jobs: image: ${{ needs.nvingest-docker-build.outputs.image }} steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ needs.determine-version.outputs.source-ref }} @@ -244,7 +244,7 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} - name: Download Docker image artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: name: nv-ingest-docker-image @@ -272,7 +272,7 @@ jobs: NGC_CLI_FORMAT_TYPE: json steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ needs.determine-version.outputs.source-ref }} @@ -291,7 +291,7 @@ jobs: | tar xz -C /usr/local/bin helm-docs - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.12' @@ -349,7 +349,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ needs.determine-version.outputs.source-ref }} diff --git a/.github/workflows/release-helm.yml b/.github/workflows/release-helm.yml index f3307c1a7..cc772f771 100644 --- a/.github/workflows/release-helm.yml +++ b/.github/workflows/release-helm.yml @@ -44,7 +44,7 @@ jobs: NGC_CLI_FORMAT_TYPE: json steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ inputs.source-ref }} @@ -58,7 +58,7 @@ jobs: | tar xz -C /usr/local/bin helm-docs - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.12' diff --git a/.github/workflows/reusable-pypi-build.yml b/.github/workflows/reusable-pypi-build.yml index 7f9947d1d..e009b26ff 100644 --- a/.github/workflows/reusable-pypi-build.yml +++ b/.github/workflows/reusable-pypi-build.yml @@ -36,7 +36,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ inputs.source-ref }} @@ -52,7 +52,7 @@ jobs: echo "Building version: $VERSION" - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.12' @@ -108,7 +108,7 @@ jobs: python -m build - name: Upload wheel artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: python-wheels path: | diff --git a/.github/workflows/reusable-pypi-publish.yml b/.github/workflows/reusable-pypi-publish.yml index 8d3601e01..23e46f164 100644 --- a/.github/workflows/reusable-pypi-publish.yml +++ b/.github/workflows/reusable-pypi-publish.yml @@ -16,7 +16,7 @@ jobs: steps: - name: Download wheel artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: name: python-wheels path: ./dist From e9f5da7851a5cde0a05b0305c9f76a368c0b1a4a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 11:11:17 -0400 Subject: [PATCH 04/15] Update github action versions --- .github/workflows/perform-release.yml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index 05889334e..377fc4cd4 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -19,6 +19,11 @@ on: required: false type: boolean default: false + skip-docker: + description: '⛔ SKIP Docker — disable Docker build and publish entirely' + required: false + type: boolean + default: false skip-helm-chart: description: '⛔ SKIP Helm chart — disable Helm build and publish entirely' required: false @@ -89,6 +94,7 @@ jobs: nvingest-docker-build: name: Build nv-ingest Docker Image + if: ${{ !inputs.skip-docker }} needs: determine-version runs-on: linux-large-disk outputs: @@ -223,7 +229,7 @@ jobs: nvingest-docker-publish: name: Publish nv-ingest Docker Image - if: ${{ !inputs.dry-run && !cancelled() && !failure() }} + if: ${{ !inputs.dry-run && !inputs.skip-docker && !cancelled() && !failure() }} needs: - determine-version - nvingest-docker-build @@ -319,8 +325,8 @@ jobs: NGC_CLI_API_KEY: ${{ secrets.NVIDIA_API_KEY }} run: | python ci/scripts/release_helm_chart.py \ - --org nvidian \ - --team nemo-llm \ + --org ${{ secrets.NGC_ORG }} \ + --team ${{ secrets.NGC_TEAM }} \ --name nv-ingest \ --version "${{ needs.determine-version.outputs.version }}" @@ -389,6 +395,7 @@ jobs: HELM_BUILD_RESULT: ${{ needs.helm-build.result }} HELM_PUBLISH_RESULT: ${{ needs.helm-publish.result }} TAG_RESULT: ${{ needs.tag-release.result }} + SKIP_DOCKER: ${{ inputs.skip-docker }} SKIP_HELM: ${{ inputs.skip-helm-chart }} NVINGEST_IMAGE: ${{ needs.nvingest-docker-publish.outputs.image || needs.nvingest-docker-build.outputs.image }} RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} @@ -440,7 +447,9 @@ jobs: MSG+="\n \`nv-ingest-api\` \`nv-ingest-client\` \`nv-ingest\` \`nemo-retriever\`" # — nv-ingest Docker — - if [ "$DRY_RUN" = "true" ]; then + if [ "$SKIP_DOCKER" = "true" ]; then + MSG+="\n:fast_forward: *nv-ingest Docker* — Disabled (skip-docker)" + elif [ "$DRY_RUN" = "true" ]; then MSG+="\n$(status_emoji "$NVINGEST_BUILD_RESULT") *nv-ingest Docker* — Built and validated (not pushed)" elif [ "$NVINGEST_PUBLISH_RESULT" = "success" ]; then MSG+="\n:white_check_mark: *nv-ingest Docker* — \`${NVINGEST_IMAGE}\`" From 02971c17799c9dddc552ee12e09d3f788e0c6b6a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 11:46:43 -0400 Subject: [PATCH 05/15] Update to PEP 639 compliant wheels for LICENSE files --- api/pyproject.toml | 4 +- client/pyproject.toml | 3 +- nemo_retriever/LICENSE | 201 ++++++++++++++++++++++++++++++++++ nemo_retriever/pyproject.toml | 4 +- src/pyproject.toml | 4 +- 5 files changed, 209 insertions(+), 7 deletions(-) create mode 100644 nemo_retriever/LICENSE diff --git a/api/pyproject.toml b/api/pyproject.toml index d2bb1f633..3e1da46d8 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -13,10 +13,10 @@ readme = "README.md" authors = [ {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} ] -license = {text = "Apache-2.0"} +license = "Apache-2.0" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] dependencies = [ diff --git a/client/pyproject.toml b/client/pyproject.toml index 1db428311..7ffa4d8ce 100644 --- a/client/pyproject.toml +++ b/client/pyproject.toml @@ -13,7 +13,8 @@ readme = "README.md" authors = [ {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} ] -license = {text = "Apache-2.0"} +license = "Apache-2.0" +license-files = ["LICENSE"] requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3.11", diff --git a/nemo_retriever/LICENSE b/nemo_retriever/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/nemo_retriever/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index ea22099fb..a500eb1ef 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -15,11 +15,11 @@ requires-python = ">=3.12" authors = [ { name = "Jeremy Dyer", email = "jdyer@nvidia.com" }, ] -license = { text = "Apache-2.0" } +license = "Apache-2.0" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.12", - "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] dependencies = [ diff --git a/src/pyproject.toml b/src/pyproject.toml index b8fbd942e..817af365b 100644 --- a/src/pyproject.toml +++ b/src/pyproject.toml @@ -13,10 +13,10 @@ readme = "README.md" authors = [ {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} ] -license = {text = "Apache-2.0"} +license = "Apache-2.0" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] dependencies = [ From dcb4459c95c9bb04c6900add34d491d0d4fdd593 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 11:55:47 -0400 Subject: [PATCH 06/15] updates --- .github/workflows/perform-release.yml | 1 + .github/workflows/reusable-pypi-build.yml | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index 377fc4cd4..386ce00ec 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -90,6 +90,7 @@ jobs: version: ${{ needs.determine-version.outputs.version }} release-type: ${{ inputs.release-type }} source-ref: ${{ needs.determine-version.outputs.source-ref }} + workflow-ref: ${{ github.ref_name }} runner: linux-large-disk nvingest-docker-build: diff --git a/.github/workflows/reusable-pypi-build.yml b/.github/workflows/reusable-pypi-build.yml index e009b26ff..914750992 100644 --- a/.github/workflows/reusable-pypi-build.yml +++ b/.github/workflows/reusable-pypi-build.yml @@ -18,6 +18,11 @@ on: required: false type: string default: 'main' + workflow-ref: + description: 'Git ref of the workflow branch (used to overlay pyproject.toml files)' + required: false + type: string + default: '' runner: description: 'GitHub runner to use' required: false @@ -40,6 +45,12 @@ jobs: with: ref: ${{ inputs.source-ref }} + - name: Overlay build config from workflow branch + if: ${{ inputs.workflow-ref != '' && inputs.workflow-ref != inputs.source-ref }} + run: | + git fetch --depth=1 origin "${{ inputs.workflow-ref }}" + git checkout FETCH_HEAD -- api/pyproject.toml client/pyproject.toml src/pyproject.toml nemo_retriever/pyproject.toml + - name: Determine version id: set-version run: | From f22679b1c00ed9a6a64b95ddd159a1d3bc89d3ad Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 12:03:33 -0400 Subject: [PATCH 07/15] updates --- .github/workflows/reusable-pypi-publish.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/reusable-pypi-publish.yml b/.github/workflows/reusable-pypi-publish.yml index 23e46f164..d6ac755d5 100644 --- a/.github/workflows/reusable-pypi-publish.yml +++ b/.github/workflows/reusable-pypi-publish.yml @@ -21,6 +21,11 @@ jobs: name: python-wheels path: ./dist + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + - name: Install twine run: pip install 'twine>=6.1' From 5cba5ebe9cfa6699d55394bbee5d3c2b9783ef81 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 12:16:52 -0400 Subject: [PATCH 08/15] updates --- .github/workflows/reusable-pypi-publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/reusable-pypi-publish.yml b/.github/workflows/reusable-pypi-publish.yml index d6ac755d5..0a585eea4 100644 --- a/.github/workflows/reusable-pypi-publish.yml +++ b/.github/workflows/reusable-pypi-publish.yml @@ -36,7 +36,7 @@ jobs: ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} run: | # Publish all wheels - twine upload \ + twine upload --verbose \ --repository-url $ARTIFACTORY_URL \ -u $ARTIFACTORY_USERNAME \ -p $ARTIFACTORY_PASSWORD \ From b12455cd67350561fc7788df4f07f38bf4bc569b Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 14:31:31 -0400 Subject: [PATCH 09/15] Properly acquire the docker image name --- .github/workflows/perform-release.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index 386ce00ec..5d9f69747 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -238,7 +238,7 @@ jobs: - helm-build runs-on: linux-large-disk outputs: - image: ${{ needs.nvingest-docker-build.outputs.image }} + image: ${{ steps.push.outputs.image }} steps: - name: Checkout code uses: actions/checkout@v5 @@ -256,10 +256,17 @@ jobs: name: nv-ingest-docker-image - name: Load and push image + id: push run: | - IMAGE="${{ needs.nvingest-docker-build.outputs.image }}" echo "Loading image from tarball..." - gunzip -c nv-ingest-docker-image.tar.gz | docker load + LOAD_OUTPUT=$(gunzip -c nv-ingest-docker-image.tar.gz | docker load) + echo "$LOAD_OUTPUT" + IMAGE=$(echo "$LOAD_OUTPUT" | sed -n 's/^Loaded image: //p') + if [ -z "$IMAGE" ]; then + echo "::error::Failed to parse image name from docker load output" + exit 1 + fi + echo "image=${IMAGE}" >> $GITHUB_OUTPUT echo "Pushing ${IMAGE}..." docker push "${IMAGE}" From 90d3b4daa26d4e4f499b84f58f93077e8569d8c2 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 15:56:43 -0400 Subject: [PATCH 10/15] class up slack announcement --- .github/workflows/perform-release.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index 5d9f69747..addd44a69 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -398,6 +398,7 @@ jobs: SOURCE_BRANCH: ${{ needs.determine-version.outputs.source-ref }} PYPI_BUILD_RESULT: ${{ needs.pypi-build.result }} PYPI_PUBLISH_RESULT: ${{ needs.pypi-publish.result }} + PYPI_VERSION: ${{ needs.pypi-build.outputs.version }} NVINGEST_BUILD_RESULT: ${{ needs.nvingest-docker-build.result }} NVINGEST_PUBLISH_RESULT: ${{ needs.nvingest-docker-publish.result }} HELM_BUILD_RESULT: ${{ needs.helm-build.result }} @@ -406,7 +407,9 @@ jobs: SKIP_DOCKER: ${{ inputs.skip-docker }} SKIP_HELM: ${{ inputs.skip-helm-chart }} NVINGEST_IMAGE: ${{ needs.nvingest-docker-publish.outputs.image || needs.nvingest-docker-build.outputs.image }} + REPO_URL: ${{ github.server_url }}/${{ github.repository }} RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }} run: | status_emoji() { case "$1" in @@ -434,6 +437,9 @@ jobs: HEADER=":rocket: *Release ${VERSION} Published* :rocket:" fi + PIP_VER="${PYPI_VERSION:-${VERSION}}" + TAG_URL="${REPO_URL}/releases/tag/${VERSION}" + MSG="${HEADER}" MSG+="\n" MSG+="\n*Version:* \`${VERSION}\`" @@ -452,7 +458,10 @@ jobs: else MSG+="\n$(status_emoji "$PYPI_PUBLISH_RESULT") *PyPI Wheels* — Publish blocked (other publish failed)" fi - MSG+="\n \`nv-ingest-api\` \`nv-ingest-client\` \`nv-ingest\` \`nemo-retriever\`" + MSG+="\n \`nv-ingest-api==${PIP_VER}\` \`nv-ingest-client==${PIP_VER}\` \`nv-ingest==${PIP_VER}\` \`nemo-retriever==${PIP_VER}\`" + MSG+="\n" + MSG+="\n *Quick install:*" + MSG+="\n \`\`\`pip install --index-url ${ARTIFACTORY_URL} nv-ingest-api==${PIP_VER} nv-ingest-client==${PIP_VER} nv-ingest==${PIP_VER} nemo-retriever==${PIP_VER}\`\`\`" # — nv-ingest Docker — if [ "$SKIP_DOCKER" = "true" ]; then @@ -461,6 +470,7 @@ jobs: MSG+="\n$(status_emoji "$NVINGEST_BUILD_RESULT") *nv-ingest Docker* — Built and validated (not pushed)" elif [ "$NVINGEST_PUBLISH_RESULT" = "success" ]; then MSG+="\n:white_check_mark: *nv-ingest Docker* — \`${NVINGEST_IMAGE}\`" + MSG+="\n \`\`\`docker pull ${NVINGEST_IMAGE}\`\`\`" elif [ "$NVINGEST_BUILD_RESULT" != "success" ]; then MSG+="\n$(status_emoji "$NVINGEST_BUILD_RESULT") *nv-ingest Docker* — Build: ${NVINGEST_BUILD_RESULT}" else @@ -485,7 +495,7 @@ jobs: if [ "$DRY_RUN" = "true" ]; then MSG+="\n:fast_forward: *Git Tag* — Skipped (dry run)" elif [ "$TAG_RESULT" = "success" ]; then - MSG+="\n:white_check_mark: *Git Tag* — \`${VERSION}\`" + MSG+="\n:white_check_mark: *Git Tag* — <${TAG_URL}|\`${VERSION}\`>" else MSG+="\n$(status_emoji "$TAG_RESULT") *Git Tag* — ${TAG_RESULT}" fi From 54dee222f29a7e6f2fdee2ecc90c72bff1a5422a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 16:25:23 -0400 Subject: [PATCH 11/15] Add step to install UV and install packages --- .github/workflows/retriever-unit-tests.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/retriever-unit-tests.yml b/.github/workflows/retriever-unit-tests.yml index e26d93328..46136184c 100644 --- a/.github/workflows/retriever-unit-tests.yml +++ b/.github/workflows/retriever-unit-tests.yml @@ -19,11 +19,16 @@ jobs: with: python-version: "3.12" + - name: Install uv + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + - name: Install unit test dependencies run: | - python -m pip install --upgrade pip - python -m pip install pytest pandas pydantic pyyaml typer scikit-learn - python -m pip install api/ + uv pip install --system -e src/ -e api/ -e client/ + uv pip install --system -e nemo_retriever/[dev] \ + --extra-index-url https://test.pypi.org/simple/ \ - name: Run retriever unit tests env: From 9ef315e58133cd95ea31e3c9b9b118e4d5b59617 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 16:28:47 -0400 Subject: [PATCH 12/15] Remove extra-index-url --- .github/workflows/retriever-unit-tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/retriever-unit-tests.yml b/.github/workflows/retriever-unit-tests.yml index 46136184c..87fb7ee25 100644 --- a/.github/workflows/retriever-unit-tests.yml +++ b/.github/workflows/retriever-unit-tests.yml @@ -27,8 +27,7 @@ jobs: - name: Install unit test dependencies run: | uv pip install --system -e src/ -e api/ -e client/ - uv pip install --system -e nemo_retriever/[dev] \ - --extra-index-url https://test.pypi.org/simple/ \ + uv pip install --system -e nemo_retriever - name: Run retriever unit tests env: From 34453ac0cbfd79d7bc690a5dc29afc352ef79208 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 16:35:48 -0400 Subject: [PATCH 13/15] Remove dead unit tests --- nemo_retriever/tests/test_batch_pipeline.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/nemo_retriever/tests/test_batch_pipeline.py b/nemo_retriever/tests/test_batch_pipeline.py index 2d18d92bb..6dfc913a6 100644 --- a/nemo_retriever/tests/test_batch_pipeline.py +++ b/nemo_retriever/tests/test_batch_pipeline.py @@ -1,23 +1,6 @@ -import pytest - -pytest.importorskip("ray") - -from nemo_retriever.examples.batch_pipeline import _count_materialized_rows from nemo_retriever.utils.input_files import resolve_input_patterns -class _DatasetWithoutLen: - def count(self) -> int: - return 42 - - def __len__(self) -> int: - raise AssertionError("__len__ should not be used") - - -def test_count_materialized_rows_prefers_dataset_count() -> None: - assert _count_materialized_rows(_DatasetWithoutLen()) == 42 - - def test_resolve_input_file_patterns_recurses_for_directory_inputs(tmp_path) -> None: dataset_dir = tmp_path / "earnings_consulting" dataset_dir.mkdir() From 29a99ea6f9850c34a5977fd4d91070e69c79042f Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 16:40:18 -0400 Subject: [PATCH 14/15] Fix failing unit tests --- nemo_retriever/tests/test_html_convert.py | 8 ++++---- nemo_retriever/tests/test_txt_split.py | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/nemo_retriever/tests/test_html_convert.py b/nemo_retriever/tests/test_html_convert.py index e558a4b29..47dc746b4 100644 --- a/nemo_retriever/tests/test_html_convert.py +++ b/nemo_retriever/tests/test_html_convert.py @@ -12,6 +12,7 @@ import pytest from nemo_retriever.html.convert import html_bytes_to_chunks_df, html_file_to_chunks_df, html_to_markdown +from nemo_retriever.params import HtmlChunkParams def test_html_to_markdown_str(): @@ -49,8 +50,7 @@ def test_html_file_to_chunks_df(tmp_path: Path): ) df = html_file_to_chunks_df( str(f), - max_tokens=512, - overlap_tokens=0, + params=HtmlChunkParams(max_tokens=512, overlap_tokens=0), ) assert isinstance(df, pd.DataFrame) assert "text" in df.columns and "path" in df.columns and "page_number" in df.columns and "metadata" in df.columns @@ -67,7 +67,7 @@ def test_html_file_to_chunks_df_empty_content(tmp_path: Path): pytest.importorskip("transformers") f = tmp_path / "empty.html" f.write_text("", encoding="utf-8") - df = html_file_to_chunks_df(str(f), max_tokens=512) + df = html_file_to_chunks_df(str(f), params=HtmlChunkParams(max_tokens=512)) assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["text", "path", "page_number", "metadata"] assert len(df) == 0 @@ -78,7 +78,7 @@ def test_html_bytes_to_chunks_df(tmp_path: Path): pytest.importorskip("transformers") html_bytes = b"

Chunk content from bytes.

" path = str(tmp_path / "virtual.html") - df = html_bytes_to_chunks_df(html_bytes, path, max_tokens=512, overlap_tokens=0) + df = html_bytes_to_chunks_df(html_bytes, path, params=HtmlChunkParams(max_tokens=512, overlap_tokens=0)) assert isinstance(df, pd.DataFrame) assert "text" in df.columns and "path" in df.columns and "page_number" in df.columns and "metadata" in df.columns assert len(df) >= 1 diff --git a/nemo_retriever/tests/test_txt_split.py b/nemo_retriever/tests/test_txt_split.py index 212c94813..9785e137d 100644 --- a/nemo_retriever/tests/test_txt_split.py +++ b/nemo_retriever/tests/test_txt_split.py @@ -12,7 +12,7 @@ import pandas as pd import pytest -from nemo_retriever.txt.split import split_text_by_tokens, txt_file_to_chunks_df +from nemo_retriever.txt.split import split_text_by_tokens, txt_file_to_chunks_df, TextChunkParams class _MockTokenizer: @@ -63,8 +63,7 @@ def test_txt_file_to_chunks_df(tmp_path: Path): f.write_text("First paragraph here. Second paragraph there.", encoding="utf-8") df = txt_file_to_chunks_df( str(f), - max_tokens=512, - overlap_tokens=0, + params=TextChunkParams(max_tokens=512, overlap_tokens=0), ) assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["text", "path", "page_number", "metadata"] @@ -79,7 +78,7 @@ def test_txt_file_to_chunks_df_empty_file(tmp_path: Path): pytest.importorskip("transformers") f = tmp_path / "empty.txt" f.write_text("", encoding="utf-8") - df = txt_file_to_chunks_df(str(f), max_tokens=512) + df = txt_file_to_chunks_df(str(f), params=TextChunkParams(max_tokens=512)) assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["text", "path", "page_number", "metadata"] assert len(df) == 0 From a292d890bafd4aadee9e026ed21e8905053423d0 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 11 Mar 2026 16:43:35 -0400 Subject: [PATCH 15/15] Fix failing unit tests --- nemo_retriever/tests/test_txt_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_retriever/tests/test_txt_split.py b/nemo_retriever/tests/test_txt_split.py index 9785e137d..966fff242 100644 --- a/nemo_retriever/tests/test_txt_split.py +++ b/nemo_retriever/tests/test_txt_split.py @@ -66,7 +66,7 @@ def test_txt_file_to_chunks_df(tmp_path: Path): params=TextChunkParams(max_tokens=512, overlap_tokens=0), ) assert isinstance(df, pd.DataFrame) - assert list(df.columns) == ["text", "path", "page_number", "metadata"] + assert list(df.columns) == ["text", "content", "path", "page_number", "metadata"] assert len(df) >= 1 assert df["path"].iloc[0] == str(f.resolve()) assert df["page_number"].iloc[0] >= 1