diff --git a/.github/workflows/ci-go.yml b/.github/workflows/ci-go.yml index fd25637..b6fa92c 100644 --- a/.github/workflows/ci-go.yml +++ b/.github/workflows/ci-go.yml @@ -26,18 +26,13 @@ permissions: jobs: test: - name: test (go${{ matrix.go }} on ${{ matrix.os }}) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - go: ["1.26"] + name: test (go 1.26) + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-go@v6 with: - go-version: ${{ matrix.go }} + go-version: "1.26" cache: true - name: Install task uses: arduino/setup-task@v2 diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index 7bb4683..6c90f36 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -26,20 +26,15 @@ permissions: jobs: test: - name: test (py${{ matrix.python }} on ${{ matrix.os }}) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - python: ["3.10", "3.11", "3.12", "3.13"] + name: test (python 3.10) + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: astral-sh/setup-uv@v6 with: enable-cache: true - name: Pin Python - run: uv python install ${{ matrix.python }} + run: uv python install 3.10 - name: Install task uses: arduino/setup-task@v2 with: diff --git a/.github/workflows/ci-typescript.yml b/.github/workflows/ci-typescript.yml index 2476989..0ca6fb4 100644 --- a/.github/workflows/ci-typescript.yml +++ b/.github/workflows/ci-typescript.yml @@ -30,21 +30,14 @@ permissions: jobs: test: - name: test (node${{ matrix.node }} on ${{ matrix.os }}) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macos-latest] - node: ["20", "22", "24"] + name: test (node 24) + runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: pnpm/action-setup@v4 - with: - version: 11.0.8 - uses: actions/setup-node@v5 with: - node-version: ${{ matrix.node }} + 
node-version: "24" cache: pnpm - name: Install task uses: arduino/setup-task@v2 diff --git a/.github/workflows/publish-python.yml b/.github/workflows/publish-python.yml deleted file mode 100644 index 20bef51..0000000 --- a/.github/workflows/publish-python.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: publish-python - -on: - push: - tags: - - "python-v*" - workflow_dispatch: - -permissions: - contents: read - -jobs: - publish: - name: publish to PyPI - runs-on: ubuntu-latest - environment: - name: pypi - url: https://pypi.org/p/kreuzberg-cloud - permissions: - id-token: write - steps: - - uses: actions/checkout@v6 - - uses: astral-sh/setup-uv@v6 - with: - enable-cache: true - - name: Install task - uses: arduino/setup-task@v2 - with: - version: 3.x - repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Install deps - run: uv sync - - name: Generate client - run: task python:generate - - name: Build wheel + sdist - run: task python:build - - name: Publish to PyPI (trusted publishing) - run: cd packages/python && uv publish --trusted-publishing always diff --git a/.github/workflows/publish-typescript.yml b/.github/workflows/publish-typescript.yml deleted file mode 100644 index e9092bd..0000000 --- a/.github/workflows/publish-typescript.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: publish-typescript - -on: - push: - tags: - - "ts-v*" - workflow_dispatch: - -permissions: - contents: read - -jobs: - publish: - name: publish to npm - runs-on: ubuntu-latest - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@v6 - - uses: pnpm/action-setup@v4 - with: - version: 11.0.8 - - uses: actions/setup-node@v5 - with: - node-version: "20" - registry-url: "https://registry.npmjs.org" - cache: pnpm - - name: Install task - uses: arduino/setup-task@v2 - with: - version: 3.x - repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Install deps - run: pnpm install - - name: Generate client - run: task typescript:generate - - name: Build - run: task typescript:build - - 
name: Publish to npm (with provenance) - run: cd packages/typescript && pnpm publish --access public --provenance --no-git-checks - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..d66c5e0 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,214 @@ +name: release + +on: + push: + tags: ['v*.*.*'] + workflow_dispatch: + inputs: + tag: + description: "Override tag (vX.Y.Z) — required for workflow_dispatch" + required: false + dry_run: + description: "Build everything, skip publishes" + type: boolean + default: false + force_republish: + description: "Republish even if version exists in registry" + type: boolean + default: false + +permissions: + contents: write + id-token: write + +concurrency: + group: release-${{ github.ref }} + cancel-in-progress: false + +jobs: + prepare: + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.metadata.outputs.tag }} + version: ${{ steps.metadata.outputs.version }} + npm_tag: ${{ steps.metadata.outputs.npm_tag }} + dry_run: ${{ steps.metadata.outputs.dry_run }} + force_republish: ${{ steps.metadata.outputs.force_republish }} + release_python: ${{ steps.metadata.outputs.release_python }} + release_node: ${{ steps.metadata.outputs.release_node }} + release_go: ${{ steps.metadata.outputs.release_go }} + steps: + - uses: kreuzberg-dev/actions/prepare-release-metadata@v1 + id: metadata + with: + available-targets: "python,node,go" + + validate-versions: + needs: prepare + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Verify all manifests match the tag + shell: bash + env: + VERSION: ${{ needs.prepare.outputs.version }} + run: | + set -euo pipefail + file_version=$(cat VERSION) + py_version=$(grep -E '^version = ' packages/python/pyproject.toml | head -1 | sed 's/version = "\(.*\)"/\1/') + ts_version=$(python3 -c 'import json; 
print(json.load(open("packages/typescript/package.json"))["version"])') + go_version=$(grep -E '^const Version = ' packages/go/v1/version.go | sed 's/.*"\(.*\)".*/\1/') + for v in "$file_version" "$py_version" "$ts_version" "$go_version"; do + if [ "$v" != "$VERSION" ]; then + echo "version mismatch: tag=$VERSION file=$file_version py=$py_version ts=$ts_version go=$go_version" >&2 + exit 1 + fi + done + echo "all manifests at $VERSION" + + check-pypi: + needs: prepare + if: needs.prepare.outputs.release_python == 'true' + runs-on: ubuntu-latest + outputs: + exists: ${{ steps.check.outputs.exists }} + steps: + - uses: kreuzberg-dev/actions/check-registry@v1 + id: check + with: + registry: pypi + package: kreuzberg-cloud + version: ${{ needs.prepare.outputs.version }} + + check-npm: + needs: prepare + if: needs.prepare.outputs.release_node == 'true' + runs-on: ubuntu-latest + outputs: + exists: ${{ steps.check.outputs.exists }} + steps: + - uses: kreuzberg-dev/actions/check-registry@v1 + id: check + with: + registry: npm + package: "@kreuzberg/cloud" + version: ${{ needs.prepare.outputs.version }} + + build-python: + needs: [prepare, validate-versions] + if: needs.prepare.outputs.release_python == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: kreuzberg-dev/actions/setup-python-env@v1 + with: + python-version: "3.10" + - uses: kreuzberg-dev/actions/install-task@v1 + - name: Generate client + run: task python:generate + - name: Build wheel + sdist + run: task python:build + - uses: actions/upload-artifact@v7 + with: + name: python-dist + path: dist/ + retention-days: 7 + + build-typescript: + needs: [prepare, validate-versions] + if: needs.prepare.outputs.release_node == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: kreuzberg-dev/actions/setup-node-workspace@v1 + - uses: kreuzberg-dev/actions/install-task@v1 + - name: Generate client + run: task typescript:generate + - name: Build + run: task 
typescript:build + - name: Pack + run: cd packages/typescript && pnpm pack --pack-destination ${{ runner.temp }} + - uses: actions/upload-artifact@v7 + with: + name: typescript-dist + path: ${{ runner.temp }}/*.tgz + retention-days: 7 + + publish-pypi: + needs: [prepare, validate-versions, check-pypi, build-python] + if: | + needs.prepare.outputs.release_python == 'true' && + (needs.check-pypi.outputs.exists != 'true' || + needs.prepare.outputs.force_republish == 'true') + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/kreuzberg-cloud + permissions: + id-token: write + steps: + - uses: actions/download-artifact@v7 + with: + name: python-dist + path: dist/ + - uses: kreuzberg-dev/actions/publish-pypi@v1 + with: + packages-dir: dist + dry-run: ${{ needs.prepare.outputs.dry_run }} + + publish-npm: + needs: [prepare, validate-versions, check-npm, build-typescript] + if: | + needs.prepare.outputs.release_node == 'true' && + (needs.check-npm.outputs.exists != 'true' || + needs.prepare.outputs.force_republish == 'true') + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v6 + - uses: actions/download-artifact@v7 + with: + name: typescript-dist + path: packages/typescript/dist-pack/ + - uses: kreuzberg-dev/actions/publish-npm@v1 + with: + packages-dir: packages/typescript/dist-pack + npm-tag: ${{ needs.prepare.outputs.npm_tag }} + provenance: "true" + dry-run: ${{ needs.prepare.outputs.dry_run }} + env: + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} + + tag-go: + needs: [prepare, validate-versions] + if: needs.prepare.outputs.release_go == 'true' + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - uses: kreuzberg-dev/actions/finalize-release@v1 + with: + tag: ${{ needs.prepare.outputs.tag }} + go-module-path: packages/go/v1 + dry-run: ${{ needs.prepare.outputs.dry_run }} + + github-release: + needs: [prepare, 
publish-pypi, publish-npm, tag-go] + if: | + always() && + !cancelled() && + needs.prepare.outputs.dry_run != 'true' + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v6 + - uses: kreuzberg-dev/actions/publish-github-release@v1 + with: + tag: ${{ needs.prepare.outputs.tag }} + generate-notes: "true" diff --git a/.github/workflows/spec-sync.yml b/.github/workflows/spec-sync.yml index 7a7dde9..430258d 100644 --- a/.github/workflows/spec-sync.yml +++ b/.github/workflows/spec-sync.yml @@ -1,6 +1,9 @@ name: spec-sync on: + # Weekly sweep on Monday 06:00 UTC. Source of truth is + # kreuzberg-cloud/services/api/spec/openapi.json — the canonical OpenAPI + # spec emitted by the public extraction API (utoipa-generated). schedule: - cron: "0 6 * * 1" workflow_dispatch: @@ -21,8 +24,16 @@ jobs: repository: kreuzberg-dev/kreuzberg-cloud path: _kreuzberg-cloud ref: main - - name: Copy spec - run: cp _kreuzberg-cloud/frontend/openapi-backend.yaml spec/openapi.yaml + - name: Install yq + run: | + sudo wget -qO /usr/local/bin/yq \ + https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + - name: Convert JSON spec to YAML + run: | + yq -P -oy '.' \ + _kreuzberg-cloud/services/api/spec/openapi.json \ + > spec/openapi.yaml - name: Detect drift id: diff run: | @@ -38,7 +49,9 @@ jobs: commit-message: "chore(spec): sync openapi.yaml from kreuzberg-cloud" title: "chore(spec): sync openapi.yaml from kreuzberg-cloud" body: | - Automated sync of `spec/openapi.yaml` from `kreuzberg-dev/kreuzberg-cloud@main`. + Automated sync of `spec/openapi.yaml` from + `kreuzberg-dev/kreuzberg-cloud@main:services/api/spec/openapi.json` + (the canonical public extraction API spec). Run `task generate` and review per-language diffs before merging. 
branch: chore/spec-sync diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index a3701ea..770ecd1 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -24,11 +24,9 @@ jobs: with: enable-cache: true - uses: pnpm/action-setup@v4 - with: - version: 11.0.8 - uses: actions/setup-node@v5 with: - node-version: "20" + node-version: "24" cache: pnpm - uses: actions/setup-go@v6 with: @@ -40,5 +38,13 @@ jobs: uv sync pnpm install (cd packages/go/v1 && go mod download) + - name: Install task + uses: arduino/setup-task@v2 + with: + version: 3.x + - name: Generate SDK bindings + run: | + task python:generate + task typescript:generate - name: Run prek run: prek run --all-files --show-diff-on-failure diff --git a/.typos.toml b/.typos.toml new file mode 100644 index 0000000..6c156f7 --- /dev/null +++ b/.typos.toml @@ -0,0 +1,20 @@ +# typos configuration — https://github.com/crate-ci/typos + +[default] +extend-ignore-re = [ + # Go module pseudo-versions (hex commit hashes embedded in go.mod / go.sum) + # frequently contain digit-letter substrings that trigger false positives. + "v0\\.0\\.0-[0-9]+-[0-9a-f]+", +] + +[files] +extend-exclude = [ + "go.work.sum", + "**/go.sum", + "**/_generated/**", + "**/dist/**", + "**/node_modules/**", + "**/.venv/**", + "pnpm-lock.yaml", + "uv.lock", +] diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..37dffc6 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +## [0.0.1] - 2026-05-10 + +### Added + +- Python client `kreuzberg-cloud` on PyPI: ergonomic `KreuzbergCloud` and `AsyncKreuzbergCloud` clients with `extract`, `extract_batch`, `get_job`, `wait_for_job`, `extract_and_wait`, plus typed error hierarchy (`AuthError`, `RateLimitError`, `ValidationError`, `NotFoundError`, `ServerError`, `TimeoutError`). +- TypeScript client `@kreuzberg/cloud` on npm: ESM-only client with the same method surface as Python (`extract`, `extractBatch`, `getJob`, `waitForJob`, `extractAndWait`), error hierarchy, retry/backoff config, and full type declarations. +- Go client `github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1`: hand-written interim client with `Extract`, `ExtractBatch`, `GetJob`, `WaitForJob`, `WaitForJobs`, `ExtractAndWait`, `FromSandbox`; idiomatic error hierarchy via `errors.As`. +- Zero-friction sandbox onboarding: `client.create_sandbox_key()` (Py), `KreuzbergCloud.fromSandbox()` (TS), `kreuzbergcloud.FromSandbox(ctx)` (Go) — fetch an anonymous sandbox key (50 pages, 24h) and start extracting without signup. +- Python and TypeScript packages generated from `services/api`'s public extraction OpenAPI spec; the Go client is hand-written in the interim. +- Comprehensive test coverage: 53 tests (Python), 57 tests (TypeScript), ~44 tests (Go). + +[Unreleased]: https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/compare/v0.0.1...HEAD +[0.0.1]: https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/releases/tag/v0.0.1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c8db056 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,83 @@ +# Contributing to kreuzberg-cloud-sdk + +[← Back to README](README.md) + +Official client SDKs for the Kreuzberg Cloud public extraction API, generated from the upstream OpenAPI 3.1 specification. 
+ +## Layout + +``` +packages/ + python/ # PyPI distribution (httpx-based, sync + async) + typescript/ # npm distribution (ESM-only, openapi-fetch) + go/v1/ # Go module — hand-written interim while oapi-codegen 3.1 support is upstream-blocked +spec/ + openapi.yaml # Vendored copy of the API spec +tasks/ # Per-language Taskfile fragments +scripts/ + sync-versions.py # Propagates the root VERSION file into every per-package manifest +VERSION # Single source of truth for the SDK version across all three packages +``` + +## Development + +This repo is part of the [`kreuzberg-dev`](https://github.com/kreuzberg-dev) polyrepo. + +```sh +task setup # install pnpm + uv + Go deps, install pre-commit hooks +task generate # regenerate clients from spec/openapi.yaml +task test # run all language test suites +task lint # prek run --all-files +task build # build all language packages +``` + +The OpenAPI spec is vendored from `kreuzberg-cloud`. The source of truth is the public extraction API spec emitted by `services/api` (utoipa-generated) and committed at `kreuzberg-cloud/services/api/spec/openapi.json`. To refresh: + +```sh +task spec:fetch # copy + JSON→YAML from ../kreuzberg-cloud/services/api/spec/openapi.json +``` + +CI also runs a weekly `spec-sync` workflow that opens an automated PR with the latest snapshot. + +## Versioning + +The single source of truth is the root `VERSION` file. `scripts/sync-versions.py` (run via `task version:sync`) propagates that value into every per-package manifest: + +- `packages/python/pyproject.toml` (`project.version`) +- `packages/python/src/kreuzberg_cloud/__init__.py` (`__version__`) +- `packages/typescript/package.json` (`version`) +- `packages/go/v1/version.go` (`const Version`) + +Go module versions for the module path itself live in git tags only (`packages/go/v1/vX.Y.Z`), created automatically by the release workflow. 
+ +Tasks: + +- `task version:show` — print current version +- `task version:set -- X.Y.Z` — set explicit version, propagate, validate +- `task version:bump:patch` / `bump:minor` / `bump:major` — semver-aware wrappers + +## Releasing + +Releases use a single unified `vX.Y.Z` tag that drives all three publishes from one workflow run. + +1. `task version:set -- X.Y.Z` — propagates the new version to every manifest. +2. `task release:check` — pre-flight: validates semver, runs lint + test + build. +3. Commit: `git commit -am "chore(release): vX.Y.Z"` and open a PR to `main`. +4. After merge to `main`: `git checkout main && git pull && task release:tag` creates the annotated `vX.Y.Z` tag locally (refuses to run on a dirty tree). +5. `git push origin vX.Y.Z` — pushes the tag, triggering `.github/workflows/release.yml`: + - Validates every manifest matches the tag's version. + - Pre-checks PyPI + npm registries; skips already-published versions. + - Builds + publishes Python (PyPI, OIDC trusted publisher) and TypeScript (npm, `--provenance`, org `NPM_TOKEN`). + - Creates the Go module subtag `packages/go/v1/vX.Y.Z` via `kreuzberg-dev/actions/finalize-release@v1`. + - Auto-generates a GitHub Release with notes. +6. `workflow_dispatch` supports `dry_run=true` and `force_republish=true` for staged verification before tagging. + +## Code style & checks + +Run `prek run --all-files` to validate formatting, linting, and type checking across all languages. Pre-commit hooks enforce this automatically. + +## Contact + +- Issues: https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/issues +- Email: contact@kreuzberg.dev +- Discord: https://discord.gg/xt9WY3GnKR diff --git a/README.md b/README.md index 747025f..68f2f57 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,68 @@ # kreuzberg-cloud-sdk -Official client SDKs for the [Kreuzberg Cloud](https://kreuzberg.cloud) API, -generated from the upstream OpenAPI 3.1 specification. +
-| Language | Package | Registry | -|----------|---------|----------| -| Python | `kreuzberg-cloud` | [PyPI](https://pypi.org/project/kreuzberg-cloud/) | -| TypeScript / Node.js | `@kreuzberg/cloud` | [npm](https://www.npmjs.com/package/@kreuzberg/cloud) | -| Go | `github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1` | [pkg.go.dev](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1) | +Kreuzberg Cloud -## Layout +
-``` -packages/ - python/ # PyPI distribution (httpx-based, fully async) - typescript/ # npm distribution (ESM-only, openapi-fetch) - go/v1/ # Go module (oapi-codegen, stdlib net/http) -spec/ - openapi.yaml # Vendored copy of the API spec -tasks/ # Per-language Taskfile fragments -``` +
+ +PyPI +npm +Go Reference +License +Documentation +CI + +
+ +
-## Development +Discord -This repo is part of the [`kreuzberg-dev`](https://github.com/kreuzberg-dev) polyrepo. +
+Official client SDKs for the [Kreuzberg Cloud](https://kreuzberg.cloud) public extraction API. Generated from the upstream OpenAPI 3.1 specification. + +| Language | Package | Registry | Status | +|----------|---------|----------|--------| +| Python | `kreuzberg-cloud` | [PyPI](https://pypi.org/project/kreuzberg-cloud/) | generated (httpx) | +| TypeScript / Node.js | `@kreuzberg/cloud` | [npm](https://www.npmjs.com/package/@kreuzberg/cloud) | generated (openapi-fetch) | +| Go | `github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1` | [pkg.go.dev](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1) | hand-written interim — codegen blocked on [oapi-codegen 3.1 support](https://github.com/oapi-codegen/oapi-codegen/issues/373) | + +## Install + +**Python:** ```sh -task setup # install pnpm + uv + Go deps, install pre-commit hooks -task generate # regenerate clients from spec/openapi.yaml -task lint # prek run --all-files -task test # run all language test suites +pip install kreuzberg-cloud ``` -The OpenAPI spec is vendored from `kreuzberg-cloud`. To refresh: +**TypeScript / Node.js:** +```sh +pnpm add @kreuzberg/cloud +# or npm install @kreuzberg/cloud +# or yarn add @kreuzberg/cloud +``` +**Go:** ```sh -task spec:fetch # copy from ../kreuzberg-cloud/frontend/openapi-backend.yaml +go get github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1 ``` -## Versioning +For language-specific quickstarts, examples, and API documentation, see the per-language READMEs in `packages/{python,typescript,go/v1}/`. 
-Each package versions independently; tags are language-scoped: +## Documentation -- `python-vX.Y.Z` → PyPI -- `ts-vX.Y.Z` → npm -- `go/vX.Y.Z` → Go module proxy +- API & Quickstart: [docs.kreuzberg.cloud](https://docs.kreuzberg.cloud) +- API Reference: [docs.kreuzberg.cloud/reference/api](https://docs.kreuzberg.cloud/reference/api/) +- Sandbox onboarding: zero-friction key issuance documented in the per-language READMEs +- Changelog: [CHANGELOG.md](CHANGELOG.md) -## Contact +## Contributing -- Issues: https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/issues -- Email: contact@kreuzberg.dev +See [CONTRIBUTING.md](CONTRIBUTING.md) for development workflows, repo layout, and release procedures. ## License -MIT © Kreuzberg, Inc. — see [LICENSE](./LICENSE). +MIT — see [LICENSE](LICENSE). diff --git a/Taskfile.yml b/Taskfile.yml index 9f1e7e8..2ea03ba 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -79,6 +79,35 @@ tasks: - task: lint - task: test + release:check: + desc: "Pre-flight: validate version semver, lint, test, build" + cmds: + - | + version=$(cat VERSION) + python3 -c "import re,sys;sys.exit(0 if re.match(r'^\d+\.\d+\.\d+([-+][\w.+-]+)?$','$version') else 1)" || { echo "VERSION '$version' is not valid semver" >&2; exit 1; } + echo "version: $version" + - task: lint + - task: test + - task: build + - echo "ready to tag v$(cat VERSION)" + + release:tag: + desc: "Create the unified release tag (vX.Y.Z) locally, without pushing — refuses if working tree dirty" + cmds: + - | + if [ -n "$(git status --porcelain)" ]; then + echo "working tree is dirty — commit or stash before tagging" >&2 + exit 1 + fi + version=$(cat VERSION) + tag="v$version" + if git rev-parse "$tag" >/dev/null 2>&1; then + echo "tag $tag already exists locally" >&2 + exit 1 + fi + git tag -a "$tag" -m "Release $tag" + echo "created tag $tag — push with: git push origin $tag" + clean: desc: "Remove generated clients and build artifacts" cmds: diff --git a/packages/go/v1/LICENSE b/packages/go/v1/LICENSE new file 
mode 100644 index 0000000..59f7bf8 --- /dev/null +++ b/packages/go/v1/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Kreuzberg, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/go/v1/README.md b/packages/go/v1/README.md index 262daa5..1834b41 100644 --- a/packages/go/v1/README.md +++ b/packages/go/v1/README.md @@ -1,5 +1,28 @@ # kreuzbergcloud +
+ +Kreuzberg Cloud + +
+ +
+ +PyPI +npm +Go Reference +License +Documentation +CI + +
+ +
+ +Discord + +
+ Official Go client for the [Kreuzberg Cloud](https://kreuzberg.cloud) document-processing API. @@ -9,30 +32,170 @@ go get github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1 Requires Go 1.26+. -## Usage +> **Status.** This is a hand-written interim client covering the operations +> the v1 docs Quickstart needs: `Extract`, `ExtractBatch`, `GetJob`, +> `WaitForJob`, `WaitForJobs`, `ExtractAndWait`, and `CreateSandboxKey`. It +> will be replaced with generated bindings when +> [oapi-codegen](https://github.com/oapi-codegen/oapi-codegen) gains OpenAPI +> 3.1 support; the public surface above will be preserved. + +## Quickstart — explicit API key ```go package main import ( "context" + "fmt" "log" + "os" kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" ) func main() { + ctx := context.Background() client, err := kreuzbergcloud.New( - kreuzbergcloud.WithAPIKey("..."), + kreuzbergcloud.WithAPIKey(os.Getenv("KREUZBERG_API_KEY")), + ) + if err != nil { + log.Fatal(err) + } + file, err := os.Open("invoice.pdf") + if err != nil { + log.Fatal(err) + } + defer file.Close() + + result, err := client.ExtractAndWait( + ctx, + kreuzbergcloud.FileSource{Name: "invoice.pdf", Reader: file}, + nil, ) if err != nil { log.Fatal(err) } - _ = client - _ = context.Background() + fmt.Println(result.Content) } ``` +## Batch extraction with parallel waits + +```go +package main + +import ( + "context" + "fmt" + "log" + "os" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func main() { + ctx := context.Background() + client, err := kreuzbergcloud.New( + kreuzbergcloud.WithAPIKey(os.Getenv("KREUZBERG_API_KEY")), + ) + if err != nil { + log.Fatal(err) + } + + paths := []string{"invoice-a.pdf", "invoice-b.pdf"} + var sources []kreuzbergcloud.FileSource + for _, path := range paths { + f, err := os.Open(path) + if err != nil { + log.Fatal(err) + } + defer f.Close() + sources = append(sources, kreuzbergcloud.FileSource{Name: path, Reader: f}) + } + + 
jobs, err := client.ExtractBatch(ctx, sources, nil) + if err != nil { + log.Fatal(err) + } + ids := make([]string, len(jobs)) + for i, job := range jobs { + ids[i] = job.ID + } + results, err := client.WaitForJobs(ctx, ids, nil) + if err != nil { + log.Fatal(err) + } + for i, result := range results { + fmt.Printf("%s -> %d chars\n", paths[i], len(result.Content)) + } +} +``` + +## Zero-friction sandbox + +`FromSandbox` mints a short-lived anonymous API key and hands back a fully +configured client. Useful for demos, docs, and getting-started experiences — +not for production traffic. + +```go +package main + +import ( + "context" + "fmt" + "log" + "os" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func main() { + ctx := context.Background() + client, err := kreuzbergcloud.FromSandbox(ctx) + if err != nil { + log.Fatal(err) + } + file, err := os.Open("invoice.pdf") + if err != nil { + log.Fatal(err) + } + defer file.Close() + result, err := client.ExtractAndWait( + ctx, + kreuzbergcloud.FileSource{Name: "invoice.pdf", Reader: file}, + nil, + ) + if err != nil { + log.Fatal(err) + } + fmt.Println(result.Content) +} +``` + +## Errors + +Every non-2xx response is mapped to a typed error. Use `errors.As` to +discriminate: + +```go +import "errors" + +result, err := client.ExtractAndWait(ctx, file, nil) +var rateLimited *kreuzbergcloud.RateLimitError +if errors.As(err, &rateLimited) { + time.Sleep(rateLimited.RetryAfter) +} +``` + +The full hierarchy is `APIError` (base) plus `AuthError`, `ValidationError`, +`NotFoundError`, `RateLimitError`, `ServerError`, and `TimeoutError` (for +`WaitForJob` deadline expiry, distinct from context cancellation). + +## Documentation + +- API reference: +- OpenAPI spec: + ## License MIT — © Kreuzberg, Inc. 
diff --git a/packages/go/v1/client.go b/packages/go/v1/client.go index 0e0aaa8..30f6d31 100644 --- a/packages/go/v1/client.go +++ b/packages/go/v1/client.go @@ -10,7 +10,7 @@ import ( // DefaultBaseURL is the production endpoint of the Kreuzberg Cloud API. const DefaultBaseURL = "https://api.kreuzberg.cloud" -const userAgent = "kreuzberg-cloud-go/0.0.1" +const userAgent = "kreuzberg-cloud-go/" + Version // Option configures a Client constructed via New. type Option func(*clientConfig) @@ -36,11 +36,26 @@ func WithUserAgent(ua string) Option { return func(c *clientConfig) { c.userAgent = ua } } +// WithTimeout sets a per-request timeout that wraps the caller's context for +// every HTTP call. Zero or negative values disable the wrapper (the caller's +// context governs the deadline). +func WithTimeout(d time.Duration) Option { + return func(c *clientConfig) { c.timeout = d } +} + +// WithRetries sets the maximum number of automatic retry attempts on +// retryable HTTP responses (429, 502, 503, 504). Default: 0 (no retries). +func WithRetries(n int) Option { + return func(c *clientConfig) { c.retries = n } +} + type clientConfig struct { baseURL string apiKey string userAgent string httpClient *http.Client + timeout time.Duration + retries int } // Client is a thin wrapper around the generated openapi-fetch client. It diff --git a/packages/go/v1/errors.go b/packages/go/v1/errors.go new file mode 100644 index 0000000..b6e8d75 --- /dev/null +++ b/packages/go/v1/errors.go @@ -0,0 +1,189 @@ +package kreuzbergcloud + +import ( + "encoding/json" + "errors" + "fmt" + "net/http" + "strconv" + "time" +) + +// APIError is the base error type returned for any non-2xx HTTP response. It +// carries the HTTP status code, a human-readable message extracted from the +// response body when possible, and the raw body for callers that need +// programmatic access to vendor-specific error fields. 
+type APIError struct { + Status int + Message string + Body json.RawMessage +} + +// Error implements the error interface. +func (e *APIError) Error() string { + if e.Message != "" { + return fmt.Sprintf("kreuzberg-cloud: HTTP %d: %s", e.Status, e.Message) + } + return fmt.Sprintf("kreuzberg-cloud: HTTP %d", e.Status) +} + +// AuthError wraps 401 Unauthorized and 403 Forbidden responses. +type AuthError struct{ APIError } + +// Error overrides the embedded APIError.Error to surface the error type. +func (e *AuthError) Error() string { + return "kreuzberg-cloud: authentication failed: " + e.APIError.Error() +} + +// Unwrap exposes the embedded APIError so errors.As(err, &apiErr) works. +func (e *AuthError) Unwrap() error { return &e.APIError } + +// ValidationError wraps 400 Bad Request and 422 Unprocessable Entity responses. +type ValidationError struct{ APIError } + +// Error overrides APIError.Error for type clarity in error messages. +func (e *ValidationError) Error() string { + return "kreuzberg-cloud: validation failed: " + e.APIError.Error() +} + +// Unwrap exposes the embedded APIError. +func (e *ValidationError) Unwrap() error { return &e.APIError } + +// NotFoundError wraps 404 Not Found responses. +type NotFoundError struct{ APIError } + +// Error overrides APIError.Error. +func (e *NotFoundError) Error() string { + return "kreuzberg-cloud: not found: " + e.APIError.Error() +} + +// Unwrap exposes the embedded APIError. +func (e *NotFoundError) Unwrap() error { return &e.APIError } + +// RateLimitError wraps 429 Too Many Requests responses. RetryAfter is parsed +// from the Retry-After header when present (zero duration otherwise). +type RateLimitError struct { + APIError + RetryAfter time.Duration +} + +// Error overrides APIError.Error. 
+func (e *RateLimitError) Error() string { + if e.RetryAfter > 0 { + return fmt.Sprintf("kreuzberg-cloud: rate limited (retry after %s): %s", + e.RetryAfter, e.APIError.Error()) + } + return "kreuzberg-cloud: rate limited: " + e.APIError.Error() +} + +// Unwrap exposes the embedded APIError. +func (e *RateLimitError) Unwrap() error { return &e.APIError } + +// ServerError wraps 5xx responses. +type ServerError struct{ APIError } + +// Error overrides APIError.Error. +func (e *ServerError) Error() string { + return "kreuzberg-cloud: server error: " + e.APIError.Error() +} + +// Unwrap exposes the embedded APIError. +func (e *ServerError) Unwrap() error { return &e.APIError } + +// TimeoutError is returned by [Client.WaitForJob] when the configured +// [WaitOptions.Timeout] elapses before the job reaches a terminal status. It +// is distinct from a context cancellation: callers should use errors.Is or +// errors.As to disambiguate. +type TimeoutError struct { + JobID string + Elapsed time.Duration +} + +// Error implements error. +func (e *TimeoutError) Error() string { + return fmt.Sprintf( + "kreuzberg-cloud: timed out waiting for job %s after %s", + e.JobID, e.Elapsed, + ) +} + +// errorBody is the canonical error envelope used by the API service. +type errorBody struct { + Error string `json:"error,omitempty"` + Message string `json:"message,omitempty"` +} + +// classifyHTTPError converts a non-2xx HTTP response into a typed error. +// +// The body has already been read by the caller — we accept it as raw bytes so +// the caller can close the response without re-buffering. 
+func classifyHTTPError(status int, body []byte, header http.Header) error { + rawBody := json.RawMessage(body) + message := extractMessage(body) + base := APIError{Status: status, Message: message, Body: rawBody} + + switch { + case status == http.StatusUnauthorized || status == http.StatusForbidden: + return &AuthError{APIError: base} + case status == http.StatusBadRequest || status == http.StatusUnprocessableEntity: + return &ValidationError{APIError: base} + case status == http.StatusNotFound: + return &NotFoundError{APIError: base} + case status == http.StatusTooManyRequests: + return &RateLimitError{APIError: base, RetryAfter: parseRetryAfter(header)} + case status >= 500: + return &ServerError{APIError: base} + default: + return &base + } +} + +func extractMessage(body []byte) string { + if len(body) == 0 { + return "" + } + var envelope errorBody + if err := json.Unmarshal(body, &envelope); err == nil { + if envelope.Error != "" { + return envelope.Error + } + if envelope.Message != "" { + return envelope.Message + } + } + return string(body) +} + +func parseRetryAfter(header http.Header) time.Duration { + value := header.Get("Retry-After") + if value == "" { + return 0 + } + if seconds, err := strconv.Atoi(value); err == nil { + return time.Duration(seconds) * time.Second + } + if when, err := http.ParseTime(value); err == nil { + delta := time.Until(when) + if delta > 0 { + return delta + } + } + return 0 +} + +// IsRetryable reports whether an error should trigger a transport-level retry. +// 429 and 502/503/504 are retryable; everything else is terminal. 
+func IsRetryable(err error) bool { + var rate *RateLimitError + if errors.As(err, &rate) { + return true + } + var srv *ServerError + if errors.As(err, &srv) { + switch srv.Status { + case http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout: + return true + } + } + return false +} diff --git a/packages/go/v1/errors_test.go b/packages/go/v1/errors_test.go new file mode 100644 index 0000000..ee4e4fd --- /dev/null +++ b/packages/go/v1/errors_test.go @@ -0,0 +1,150 @@ +package kreuzbergcloud_test + +import ( + "context" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func TestError_AuthErrorOn401(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusUnauthorized, `{"error":"missing token"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var auth *kreuzbergcloud.AuthError + if !asError(err, &auth) { + t.Fatalf("expected AuthError, got %T: %v", err, err) + } + if !strings.Contains(auth.Message, "missing token") { + t.Errorf("Message = %q, want it to contain 'missing token'", auth.Message) + } +} + +func TestError_AuthErrorOn403(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusForbidden, `{"error":"forbidden"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var auth *kreuzbergcloud.AuthError + if !asError(err, &auth) { + t.Fatalf("expected AuthError, got %T: %v", err, err) + } +} + +func TestError_ValidationErrorOn400(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusBadRequest, `{"error":"bad"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var validation 
*kreuzbergcloud.ValidationError + if !asError(err, &validation) { + t.Fatalf("expected ValidationError, got %T: %v", err, err) + } +} + +func TestError_ValidationErrorOn422(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusUnprocessableEntity, `{"error":"unprocessable"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var validation *kreuzbergcloud.ValidationError + if !asError(err, &validation) { + t.Fatalf("expected ValidationError, got %T: %v", err, err) + } +} + +func TestError_NotFoundErrorOn404(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusNotFound, `{"error":"missing"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var notFound *kreuzbergcloud.NotFoundError + if !asError(err, &notFound) { + t.Fatalf("expected NotFoundError, got %T: %v", err, err) + } +} + +func TestError_RateLimitErrorOn429WithRetryAfter(t *testing.T) { + t.Parallel() + server := newStatusServer( + http.StatusTooManyRequests, + `{"error":"slow down"}`, + http.Header{"Retry-After": []string{"3"}}, + ) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var rate *kreuzbergcloud.RateLimitError + if !asError(err, &rate) { + t.Fatalf("expected RateLimitError, got %T: %v", err, err) + } + if rate.RetryAfter != 3*time.Second { + t.Errorf("RetryAfter = %v, want 3s", rate.RetryAfter) + } +} + +func TestError_ServerErrorOn500(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusInternalServerError, `{"error":"oops"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var srv *kreuzbergcloud.ServerError + if !asError(err, &srv) { + 
t.Fatalf("expected ServerError, got %T: %v", err, err) + } +} + +func TestError_BasePropertiesAccessibleViaUnwrap(t *testing.T) { + t.Parallel() + server := newStatusServer(http.StatusBadRequest, `{"error":"bad"}`, nil) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "any") + var apiErr *kreuzbergcloud.APIError + if !asError(err, &apiErr) { + t.Fatalf("errors.As to *APIError failed: %v", err) + } + if apiErr.Status != 400 { + t.Errorf("Status = %d, want 400", apiErr.Status) + } + if !strings.Contains(string(apiErr.Body), "bad") { + t.Errorf("Body = %s, want to contain 'bad'", apiErr.Body) + } +} + +func TestError_TimeoutErrorIsDistinctFromContextCancel(t *testing.T) { + t.Parallel() + timeoutErr := &kreuzbergcloud.TimeoutError{JobID: "j", Elapsed: time.Second} + if !strings.Contains(timeoutErr.Error(), "timed out") { + t.Errorf("TimeoutError.Error() = %q, want to mention 'timed out'", timeoutErr.Error()) + } + if asError(timeoutErr, new(*kreuzbergcloud.APIError)) { + t.Errorf("TimeoutError should not unwrap to *APIError") + } +} + +// newStatusServer returns an *httptest.Server that always responds with the +// given status code, body, and (optional) headers — handy for stamping out +// error-class tests without per-test boilerplate. 
+func newStatusServer(status int, body string, header http.Header) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + for key, values := range header { + for _, value := range values { + w.Header().Add(key, value) + } + } + w.WriteHeader(status) + _, _ = w.Write([]byte(body)) + })) +} diff --git a/packages/go/v1/extract.go b/packages/go/v1/extract.go new file mode 100644 index 0000000..eac3cfc --- /dev/null +++ b/packages/go/v1/extract.go @@ -0,0 +1,151 @@ +package kreuzbergcloud + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "mime/multipart" + "net/textproto" + "strings" +) + +// Extract submits a single document for asynchronous extraction and returns +// the queued [Job]. Use [Client.WaitForJob] (or [Client.ExtractAndWait] for a +// one-shot helper) to obtain the extraction result. +func (c *Client) Extract( + ctx context.Context, + file FileSource, + opts *ExtractionOptions, +) (*Job, error) { + jobs, err := c.ExtractBatch(ctx, []FileSource{file}, opts) + if err != nil { + return nil, err + } + if len(jobs) == 0 { + return nil, fmt.Errorf("kreuzberg-cloud: server accepted extract request but returned no job IDs") + } + return jobs[0], nil +} + +// ExtractBatch submits multiple documents in a single multipart request. +// Each file becomes one Job in the response, in submission order. 
+func (c *Client) ExtractBatch( + ctx context.Context, + files []FileSource, + opts *ExtractionOptions, +) ([]*Job, error) { + if len(files) == 0 { + return nil, fmt.Errorf("kreuzberg-cloud: ExtractBatch requires at least one file") + } + for i, f := range files { + if f.Name == "" { + return nil, fmt.Errorf("kreuzberg-cloud: file %d: Name must not be empty", i) + } + if f.Reader == nil { + return nil, fmt.Errorf("kreuzberg-cloud: file %d (%s): Reader must not be nil", i, f.Name) + } + } + body, contentType, err := buildMultipartBody(files, opts) + if err != nil { + return nil, err + } + var resp extractResponse + spec := requestSpec{ + method: "POST", + path: "/v1/extract", + body: bytes.NewReader(body), + bodyContentType: contentType, + rewindBody: func() (io.Reader, error) { + return bytes.NewReader(body), nil + }, + } + if err := c.doJSON(ctx, spec, &resp); err != nil { + return nil, err + } + if len(resp.JobIDs) != len(files) { + return nil, fmt.Errorf( + "kreuzberg-cloud: expected %d job IDs, got %d", + len(files), len(resp.JobIDs), + ) + } + jobs := make([]*Job, len(resp.JobIDs)) + for i, id := range resp.JobIDs { + filename := "" + if i < len(files) { + filename = files[i].Name + } + jobs[i] = &Job{ID: id, Filename: filename, Status: resp.Status} + } + return jobs, nil +} + +// buildMultipartBody serializes files and optional ExtractionOptions into a +// multipart/form-data body matching the API's documented wire format: +// +// parts: file (one per document) + optional "options" (JSON string) +func buildMultipartBody( + files []FileSource, + opts *ExtractionOptions, +) ([]byte, string, error) { + var buf bytes.Buffer + writer := multipart.NewWriter(&buf) + for _, file := range files { + header := textproto.MIMEHeader{} + header.Set( + "Content-Disposition", + fmt.Sprintf( + `form-data; name="file"; filename=%q`, + strings.ReplaceAll(file.Name, `"`, `\"`), + ), + ) + header.Set("Content-Type", sniffContentType(file.Name)) + part, err := 
writer.CreatePart(header) + if err != nil { + return nil, "", fmt.Errorf("kreuzberg-cloud: creating multipart part: %w", err) + } + if _, err := io.Copy(part, file.Reader); err != nil { + return nil, "", fmt.Errorf( + "kreuzberg-cloud: copying file %q into multipart body: %w", + file.Name, err, + ) + } + } + if opts != nil { + encoded, err := json.Marshal(opts) + if err != nil { + return nil, "", fmt.Errorf("kreuzberg-cloud: encoding options: %w", err) + } + if err := writer.WriteField("options", string(encoded)); err != nil { + return nil, "", fmt.Errorf("kreuzberg-cloud: writing options field: %w", err) + } + } + if err := writer.Close(); err != nil { + return nil, "", fmt.Errorf("kreuzberg-cloud: closing multipart writer: %w", err) + } + return buf.Bytes(), writer.FormDataContentType(), nil +} + +// sniffContentType picks a reasonable multipart Content-Type for a filename. +// We map a small set of common extensions; everything else falls back to the +// generic application/octet-stream which the server accepts. 
+func sniffContentType(filename string) string { + lower := strings.ToLower(filename) + switch { + case strings.HasSuffix(lower, ".pdf"): + return "application/pdf" + case strings.HasSuffix(lower, ".png"): + return "image/png" + case strings.HasSuffix(lower, ".jpg"), strings.HasSuffix(lower, ".jpeg"): + return "image/jpeg" + case strings.HasSuffix(lower, ".txt"): + return "text/plain" + case strings.HasSuffix(lower, ".md"): + return "text/markdown" + case strings.HasSuffix(lower, ".docx"): + return "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + default: + return "application/octet-stream" + } +} diff --git a/packages/go/v1/extract_test.go b/packages/go/v1/extract_test.go new file mode 100644 index 0000000..f036639 --- /dev/null +++ b/packages/go/v1/extract_test.go @@ -0,0 +1,273 @@ +package kreuzbergcloud_test + +import ( + "context" + "encoding/json" + "io" + "mime" + "mime/multipart" + "net/http" + "net/http/httptest" + "strings" + "testing" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func TestExtract_SubmitsSingleFileAndReturnsJob(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/extract" { + t.Errorf("path = %q, want /v1/extract", r.URL.Path) + } + if r.Method != http.MethodPost { + t.Errorf("method = %q, want POST", r.Method) + } + if got := r.Header.Get("Authorization"); got != "Bearer test-key" { + t.Errorf("Authorization = %q, want %q", got, "Bearer test-key") + } + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"job_ids":["job-1"],"status":"pending"}`)) + })) + defer server.Close() + + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL), kreuzbergcloud.WithAPIKey("test-key")) + job, err := client.Extract( + context.Background(), + kreuzbergcloud.FileSource{Name: "invoice.pdf", Reader: strings.NewReader("hello")}, + nil, + ) + if err != nil { + t.Fatalf("Extract: %v", err) 
+ } + if job.ID != "job-1" { + t.Errorf("Job.ID = %q, want job-1", job.ID) + } + if job.Filename != "invoice.pdf" { + t.Errorf("Job.Filename = %q, want invoice.pdf", job.Filename) + } + if job.Status != "pending" { + t.Errorf("Job.Status = %q, want pending", job.Status) + } +} + +func TestExtract_MultipartBodyShape(t *testing.T) { + t.Parallel() + type capture struct { + filenames []string + contentTypes []string + options string + } + var got capture + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mediaType, params, err := mime.ParseMediaType(r.Header.Get("Content-Type")) + if err != nil || !strings.HasPrefix(mediaType, "multipart/") { + t.Errorf("content-type = %q, want multipart/...", r.Header.Get("Content-Type")) + } + reader := multipart.NewReader(r.Body, params["boundary"]) + for { + part, err := reader.NextPart() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("multipart read: %v", err) + } + body, _ := io.ReadAll(part) + switch part.FormName() { + case "file": + got.filenames = append(got.filenames, part.FileName()) + got.contentTypes = append(got.contentTypes, part.Header.Get("Content-Type")) + case "options": + got.options = string(body) + } + } + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"job_ids":["a","b"],"status":"pending"}`)) + })) + defer server.Close() + + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + disable := false + _, err := client.ExtractBatch( + context.Background(), + []kreuzbergcloud.FileSource{ + {Name: "a.pdf", Reader: strings.NewReader("aaaa")}, + {Name: "b.png", Reader: strings.NewReader("bbbb")}, + }, + &kreuzbergcloud.ExtractionOptions{ + ExtractionConfig: &kreuzbergcloud.ExtractionConfig{ + OutputFormat: "markdown", + DisableOCR: &disable, + }, + }, + ) + if err != nil { + t.Fatalf("ExtractBatch: %v", err) + } + want := []string{"a.pdf", "b.png"} + for i, name := range want { + if got.filenames[i] != name { + t.Errorf("filenames[%d] 
= %q, want %q", i, got.filenames[i], name) + } + } + if got.contentTypes[0] != "application/pdf" { + t.Errorf("contentTypes[0] = %q, want application/pdf", got.contentTypes[0]) + } + if got.contentTypes[1] != "image/png" { + t.Errorf("contentTypes[1] = %q, want image/png", got.contentTypes[1]) + } + if !strings.Contains(got.options, `"output_format":"markdown"`) { + t.Errorf("options = %q, missing output_format=markdown", got.options) + } + if !strings.Contains(got.options, `"disable_ocr":false`) { + t.Errorf("options = %q, missing disable_ocr=false", got.options) + } +} + +func TestExtractBatch_RejectsEmptySlice(t *testing.T) { + t.Parallel() + client := mustClient(t, kreuzbergcloud.WithBaseURL("https://example.test")) + if _, err := client.ExtractBatch(context.Background(), nil, nil); err == nil { + t.Errorf("ExtractBatch(nil) returned nil error") + } +} + +func TestExtractBatch_RejectsMissingFilename(t *testing.T) { + t.Parallel() + client := mustClient(t, kreuzbergcloud.WithBaseURL("https://example.test")) + _, err := client.ExtractBatch( + context.Background(), + []kreuzbergcloud.FileSource{{Name: "", Reader: strings.NewReader("x")}}, + nil, + ) + if err == nil { + t.Errorf("ExtractBatch with empty Name returned nil error") + } +} + +func TestExtractBatch_RejectsMissingReader(t *testing.T) { + t.Parallel() + client := mustClient(t, kreuzbergcloud.WithBaseURL("https://example.test")) + _, err := client.ExtractBatch( + context.Background(), + []kreuzbergcloud.FileSource{{Name: "x.pdf", Reader: nil}}, + nil, + ) + if err == nil { + t.Errorf("ExtractBatch with nil Reader returned nil error") + } +} + +func TestExtractBatch_ReturnsJobIDsInOrder(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"job_ids":["x","y","z"],"status":"pending"}`)) + })) + defer server.Close() + client := mustClient(t, 
kreuzbergcloud.WithBaseURL(server.URL)) + jobs, err := client.ExtractBatch( + context.Background(), + []kreuzbergcloud.FileSource{ + {Name: "1.pdf", Reader: strings.NewReader("1")}, + {Name: "2.pdf", Reader: strings.NewReader("2")}, + {Name: "3.pdf", Reader: strings.NewReader("3")}, + }, + nil, + ) + if err != nil { + t.Fatalf("ExtractBatch: %v", err) + } + want := []string{"x", "y", "z"} + wantFiles := []string{"1.pdf", "2.pdf", "3.pdf"} + for i, job := range jobs { + if job.ID != want[i] { + t.Errorf("jobs[%d].ID = %q, want %q", i, job.ID, want[i]) + } + if job.Filename != wantFiles[i] { + t.Errorf("jobs[%d].Filename = %q, want %q", i, job.Filename, wantFiles[i]) + } + } +} + +func TestExtractBatch_ServerReturnsMismatchedJobCount(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"job_ids":["only-one"],"status":"pending"}`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.ExtractBatch( + context.Background(), + []kreuzbergcloud.FileSource{ + {Name: "a.pdf", Reader: strings.NewReader("aa")}, + {Name: "b.pdf", Reader: strings.NewReader("bb")}, + }, + nil, + ) + if err == nil { + t.Errorf("expected error on mismatched job count, got nil") + } +} + +func TestExtract_OptionsAreOptional(t *testing.T) { + t.Parallel() + var receivedOptions bool + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, params, _ := mime.ParseMediaType(r.Header.Get("Content-Type")) + reader := multipart.NewReader(r.Body, params["boundary"]) + for { + part, err := reader.NextPart() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("read multipart: %v", err) + } + if part.FormName() == "options" { + receivedOptions = true + } + } + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"job_ids":["nooptions"],"status":"pending"}`)) 
+ })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + if _, err := client.Extract( + context.Background(), + kreuzbergcloud.FileSource{Name: "a.pdf", Reader: strings.NewReader("a")}, + nil, + ); err != nil { + t.Fatalf("Extract: %v", err) + } + if receivedOptions { + t.Errorf("server received options part when none was passed") + } +} + +func TestExtract_PropagatesAPIError(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _ = json.NewEncoder(w).Encode(map[string]string{"error": "no document"}) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.Extract( + context.Background(), + kreuzbergcloud.FileSource{Name: "x.pdf", Reader: strings.NewReader("x")}, + nil, + ) + if err == nil { + t.Fatalf("expected error, got nil") + } + var validation *kreuzbergcloud.ValidationError + if !asError(err, &validation) { + t.Fatalf("expected ValidationError, got %T: %v", err, err) + } + if !strings.Contains(validation.Message, "no document") { + t.Errorf("error message = %q, want it to contain 'no document'", validation.Message) + } +} diff --git a/packages/go/v1/helpers_test.go b/packages/go/v1/helpers_test.go new file mode 100644 index 0000000..04ad841 --- /dev/null +++ b/packages/go/v1/helpers_test.go @@ -0,0 +1,25 @@ +package kreuzbergcloud_test + +import ( + "errors" + "testing" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +// mustClient builds a Client with the given options, failing the test on +// configuration errors. Used to keep test bodies focused on assertions. +func mustClient(t *testing.T, opts ...kreuzbergcloud.Option) *kreuzbergcloud.Client { + t.Helper() + client, err := kreuzbergcloud.New(opts...) 
+ if err != nil { + t.Fatalf("kreuzbergcloud.New: %v", err) + } + return client +} + +// asError is a generic wrapper around errors.As that returns a bool, so test +// bodies can write `if !asError(err, &target)` without juggling pointer types. +func asError[T error](err error, target *T) bool { + return errors.As(err, target) +} diff --git a/packages/go/v1/http.go b/packages/go/v1/http.go new file mode 100644 index 0000000..3d9301e --- /dev/null +++ b/packages/go/v1/http.go @@ -0,0 +1,204 @@ +package kreuzbergcloud + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +const ( + contentTypeJSON = "application/json" + maxRetryBackoff = 30 * time.Second + initialRetryBackoff = 250 * time.Millisecond + retryBackoffFactor = 2 +) + +// requestSpec describes a single HTTP request issued by the typed helpers +// below. Body is nil for GET; bodyContentType is required when Body is set. +type requestSpec struct { + method string + path string + body io.Reader + bodyContentType string + // rewindBody is invoked before each retry attempt to reset the body + // reader to the start. Required when retries > 0 and body != nil. + rewindBody func() (io.Reader, error) +} + +// doJSON executes spec, decodes a 2xx JSON response into out, and maps +// non-2xx responses to typed error values. out may be nil when the caller +// only cares about the status code. +func (c *Client) doJSON(ctx context.Context, spec requestSpec, out any) error { + body, err := c.do(ctx, spec) + if err != nil { + return err + } + defer closeQuietly(body) + if out == nil { + if _, copyErr := io.Copy(io.Discard, body); copyErr != nil { + return fmt.Errorf("kreuzberg-cloud: discarding response body: %w", copyErr) + } + return nil + } + if err := json.NewDecoder(body).Decode(out); err != nil { + return fmt.Errorf("kreuzberg-cloud: decoding response: %w", err) + } + return nil +} + +// do executes spec and returns the response body for 2xx responses. 
The +// caller MUST close the returned reader. +func (c *Client) do(ctx context.Context, spec requestSpec) (io.ReadCloser, error) { + attempt := 0 + for { + body, err := c.doOnce(ctx, spec) + if err == nil { + return body, nil + } + if attempt >= c.cfg.retries || !shouldRetry(err) || ctx.Err() != nil { + return nil, err + } + delay := nextBackoff(attempt, retryAfter(err)) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(delay): + } + if spec.body != nil { + if spec.rewindBody == nil { + return nil, err + } + rewound, rewindErr := spec.rewindBody() + if rewindErr != nil { + return nil, fmt.Errorf("kreuzberg-cloud: rewinding body for retry: %w", rewindErr) + } + spec.body = rewound + } + attempt++ + } +} + +// doOnce performs a single HTTP round-trip without retries. When a per-call +// timeout is configured, the cancel function is attached to the response body +// (2xx) or invoked immediately on failure — we cannot defer it here because +// 2xx body reads happen after this call returns. +func (c *Client) doOnce(ctx context.Context, spec requestSpec) (io.ReadCloser, error) { + cancel := func() {} + if c.cfg.timeout > 0 { + ctx, cancel = context.WithTimeout(ctx, c.cfg.timeout) + } + return c.doOnceWithCancel(ctx, spec, cancel) +} + +// doOnceWithCancel executes the round-trip; cancel is called once the caller +// has finished reading the response body (2xx) or immediately on failure. 
+func (c *Client) doOnceWithCancel( + ctx context.Context, + spec requestSpec, + cancel context.CancelFunc, +) (io.ReadCloser, error) { + url := c.urlFor(spec.path) + req, err := http.NewRequestWithContext(ctx, spec.method, url, spec.body) + if err != nil { + cancel() + return nil, fmt.Errorf("kreuzberg-cloud: building request: %w", err) + } + if spec.bodyContentType != "" { + req.Header.Set("Content-Type", spec.bodyContentType) + } + req.Header.Set("Accept", contentTypeJSON) + if err := c.authorize(ctx, req); err != nil { + cancel() + return nil, err + } + resp, err := c.cfg.httpClient.Do(req) + if err != nil { + cancel() + return nil, fmt.Errorf("kreuzberg-cloud: %s %s: %w", spec.method, url, err) + } + if resp.StatusCode >= 200 && resp.StatusCode < 300 { + return &cancellingReadCloser{rc: resp.Body, cancel: cancel}, nil + } + defer closeQuietly(resp.Body) + defer cancel() + body, readErr := io.ReadAll(resp.Body) + if readErr != nil { + return nil, fmt.Errorf("kreuzberg-cloud: reading error response body: %w", readErr) + } + return nil, classifyHTTPError(resp.StatusCode, body, resp.Header) +} + +// urlFor concatenates baseURL and path, normalising a single slash boundary. +func (c *Client) urlFor(path string) string { + base := strings.TrimRight(c.cfg.baseURL, "/") + if !strings.HasPrefix(path, "/") { + path = "/" + path + } + return base + path +} + +// shouldRetry mirrors IsRetryable but tolerates wrapped errors. +func shouldRetry(err error) bool { return IsRetryable(err) } + +// retryAfter returns the server-suggested backoff for 429 responses or zero. +func retryAfter(err error) time.Duration { + var rate *RateLimitError + if errors.As(err, &rate) { + return rate.RetryAfter + } + return 0 +} + +// nextBackoff computes the backoff for the given attempt, honoring an +// optional server-suggested Retry-After value. Attempt is zero-based. 
+func nextBackoff(attempt int, suggested time.Duration) time.Duration { + if suggested > 0 { + if suggested > maxRetryBackoff { + return maxRetryBackoff + } + return suggested + } + delay := initialRetryBackoff + for i := 0; i < attempt; i++ { + delay *= retryBackoffFactor + if delay >= maxRetryBackoff { + return maxRetryBackoff + } + } + return delay +} + +// closeQuietly closes c, swallowing the error. We intentionally drop the +// error: the caller has already obtained the data it needs from the response, +// and a Close failure on read is not actionable. +func closeQuietly(c io.Closer) { + if c == nil { + return + } + if err := c.Close(); err != nil { + _ = err + } +} + +// cancellingReadCloser wraps a response body so the per-request cancel func +// fires when the body is closed. This keeps short-lived contexts attached to +// streaming bodies without leaking goroutines. +type cancellingReadCloser struct { + rc io.ReadCloser + cancel context.CancelFunc +} + +func (c *cancellingReadCloser) Read(p []byte) (int, error) { return c.rc.Read(p) } + +func (c *cancellingReadCloser) Close() error { + err := c.rc.Close() + if c.cancel != nil { + c.cancel() + } + return err +} diff --git a/packages/go/v1/jobs.go b/packages/go/v1/jobs.go new file mode 100644 index 0000000..cb48df7 --- /dev/null +++ b/packages/go/v1/jobs.go @@ -0,0 +1,188 @@ +package kreuzbergcloud + +import ( + "context" + "errors" + "fmt" + "sync" + "time" +) + +// Default poll/timeout values used when [WaitOptions] is nil or has zero +// fields. Tuned to be friendly for long-running extractions while remaining +// snappy for fast jobs (sub-second OCR-free PDFs). +const ( + defaultWaitTimeout = 5 * time.Minute + defaultWaitPollInterval = 1 * time.Second + maxWaitPollInterval = 30 * time.Second +) + +// GetJob fetches the current status of a single job by ID. 
+func (c *Client) GetJob(ctx context.Context, jobID string) (*Job, error) { + if jobID == "" { + return nil, fmt.Errorf("kreuzberg-cloud: GetJob requires a non-empty jobID") + } + var job Job + spec := requestSpec{method: "GET", path: "/v1/jobs/" + jobID} + if err := c.doJSON(ctx, spec, &job); err != nil { + return nil, err + } + return &job, nil +} + +// WaitForJob polls GET /v1/jobs/{id} until the job reaches a terminal status +// or the configured timeout elapses. The returned [JobResult] is the same +// payload exposed via Job.Result on success; on a "failed" terminal status +// it returns a typed error wrapping the server-supplied message. +func (c *Client) WaitForJob( + ctx context.Context, + jobID string, + opts *WaitOptions, +) (*JobResult, error) { + options := normaliseWaitOptions(opts) + start := time.Now() + deadline := start.Add(options.Timeout) + pollInterval := options.PollInterval + for { + job, err := c.GetJob(ctx, jobID) + if err != nil { + return nil, err + } + if IsTerminalStatus(job.Status) { + return jobResultFromTerminal(job) + } + if !time.Now().Before(deadline) { + return nil, &TimeoutError{JobID: jobID, Elapsed: time.Since(start)} + } + wait := pollInterval + if remaining := time.Until(deadline); remaining < wait { + wait = remaining + } + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(wait): + } + if options.Backoff == BackoffExponential { + pollInterval = nextPollInterval(pollInterval) + } + } +} + +// WaitForJobs concurrently waits for a slice of job IDs and returns their +// results in submission order. Errors from individual jobs are propagated +// immediately — the first error cancels the remaining waits. 
+func (c *Client) WaitForJobs( + ctx context.Context, + jobIDs []string, + opts *WaitOptions, +) ([]*JobResult, error) { + if len(jobIDs) == 0 { + return nil, nil + } + results := make([]*JobResult, len(jobIDs)) + errs := make([]error, len(jobIDs)) + groupCtx, cancel := context.WithCancel(ctx) + defer cancel() + var waitGroup sync.WaitGroup + for index, jobID := range jobIDs { + waitGroup.Add(1) + go func(i int, id string) { + defer waitGroup.Done() + result, err := c.WaitForJob(groupCtx, id, opts) + results[i] = result + if err != nil { + errs[i] = err + cancel() + } + }(index, jobID) + } + waitGroup.Wait() + for _, err := range errs { + if err != nil && !errors.Is(err, context.Canceled) { + return results, err + } + } + for _, err := range errs { + if err != nil { + return results, err + } + } + return results, nil +} + +// ExtractAndWait is a convenience wrapper that submits a single document and +// blocks until extraction completes, returning the final [JobResult]. The +// extraction options and wait policy can be overridden via opts; either field +// may be nil to accept defaults. 
+func (c *Client) ExtractAndWait( + ctx context.Context, + file FileSource, + opts *ExtractAndWaitOptions, +) (*JobResult, error) { + var extraction *ExtractionOptions + var wait *WaitOptions + if opts != nil { + extraction = opts.Extraction + wait = opts.Wait + } + job, err := c.Extract(ctx, file, extraction) + if err != nil { + return nil, err + } + return c.WaitForJob(ctx, job.ID, wait) +} + +func normaliseWaitOptions(opts *WaitOptions) WaitOptions { + out := WaitOptions{ + Timeout: defaultWaitTimeout, + PollInterval: defaultWaitPollInterval, + Backoff: BackoffExponential, + } + if opts == nil { + return out + } + if opts.Timeout > 0 { + out.Timeout = opts.Timeout + } + if opts.PollInterval > 0 { + out.PollInterval = opts.PollInterval + } + out.Backoff = opts.Backoff + return out +} + +func nextPollInterval(current time.Duration) time.Duration { + next := current * 2 + if next > maxWaitPollInterval { + return maxWaitPollInterval + } + return next +} + +// jobResultFromTerminal converts a terminal Job into a JobResult or error. 
+func jobResultFromTerminal(job *Job) (*JobResult, error) { + switch job.Status { + case JobStatusCompleted, JobStatusPartialSuccess: + if job.Result == nil { + return nil, fmt.Errorf( + "kreuzberg-cloud: job %s reported %s but no result body", + job.ID, job.Status, + ) + } + return job.Result, nil + case JobStatusFailed: + message := "job failed" + if job.Result != nil && job.Result.Content != "" { + message = job.Result.Content + } + return nil, fmt.Errorf("kreuzberg-cloud: job %s failed: %s", job.ID, message) + case JobStatusCancelled: + return nil, fmt.Errorf("kreuzberg-cloud: job %s was canceled", job.ID) + default: + return nil, fmt.Errorf( + "kreuzberg-cloud: job %s reached unrecognized terminal status %q", + job.ID, job.Status, + ) + } +} diff --git a/packages/go/v1/jobs_test.go b/packages/go/v1/jobs_test.go new file mode 100644 index 0000000..8a0e599 --- /dev/null +++ b/packages/go/v1/jobs_test.go @@ -0,0 +1,284 @@ +package kreuzbergcloud_test + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func TestGetJob_ReturnsParsedJob(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/jobs/job-1" { + t.Errorf("path = %q, want /v1/jobs/job-1", r.URL.Path) + } + _, _ = w.Write([]byte(`{ + "id":"job-1", + "filename":"a.pdf", + "status":"completed", + "created_at":"2025-12-21T10:00:00Z", + "result":{"content":"hello world"} + }`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + job, err := client.GetJob(context.Background(), "job-1") + if err != nil { + t.Fatalf("GetJob: %v", err) + } + if job.ID != "job-1" { + t.Errorf("ID = %q, want job-1", job.ID) + } + if job.Status != "completed" { + t.Errorf("Status = %q, want completed", job.Status) + } + if job.Result == nil || 
job.Result.Content != "hello world" { + t.Errorf("Result.Content = %v, want 'hello world'", job.Result) + } + if !job.CreatedAt.Equal(time.Date(2025, 12, 21, 10, 0, 0, 0, time.UTC)) { + t.Errorf("CreatedAt = %v, want 2025-12-21T10:00:00Z", job.CreatedAt) + } +} + +func TestGetJob_RejectsEmptyID(t *testing.T) { + t.Parallel() + client := mustClient(t, kreuzbergcloud.WithBaseURL("https://example.test")) + _, err := client.GetJob(context.Background(), "") + if err == nil { + t.Errorf("GetJob(\"\") returned nil error") + } +} + +func TestGetJob_NotFound(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusNotFound) + _, _ = w.Write([]byte(`{"error":"Job not found"}`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.GetJob(context.Background(), "missing") + var notFound *kreuzbergcloud.NotFoundError + if !asError(err, &notFound) { + t.Fatalf("expected NotFoundError, got %T: %v", err, err) + } + if notFound.Status != http.StatusNotFound { + t.Errorf("Status = %d, want 404", notFound.Status) + } +} + +func TestWaitForJob_ImmediateCompletion(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`{ + "id":"j", + "filename":"a.pdf", + "status":"completed", + "created_at":"2025-12-21T10:00:00Z", + "result":{"content":"done"} + }`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + result, err := client.WaitForJob(context.Background(), "j", nil) + if err != nil { + t.Fatalf("WaitForJob: %v", err) + } + if result.Content != "done" { + t.Errorf("Content = %q, want done", result.Content) + } +} + +func TestWaitForJob_PollsUntilTerminal(t *testing.T) { + t.Parallel() + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ 
*http.Request) { + count := calls.Add(1) + status := "processing" + extra := "" + if count >= 3 { + status = "completed" + extra = `,"result":{"content":"finished"}` + } + fmt.Fprintf( + w, + `{"id":"j","filename":"a.pdf","status":%q,"created_at":"2025-12-21T10:00:00Z"%s}`, + status, extra, + ) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + result, err := client.WaitForJob(context.Background(), "j", &kreuzbergcloud.WaitOptions{ + Timeout: 2 * time.Second, + PollInterval: 5 * time.Millisecond, + Backoff: kreuzbergcloud.BackoffConstant, + }) + if err != nil { + t.Fatalf("WaitForJob: %v", err) + } + if result.Content != "finished" { + t.Errorf("Content = %q, want finished", result.Content) + } + if got := calls.Load(); got < 3 { + t.Errorf("server saw %d calls, want >=3", got) + } +} + +func TestWaitForJob_TimesOut(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`{ + "id":"j","filename":"a.pdf","status":"processing","created_at":"2025-12-21T10:00:00Z" + }`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.WaitForJob(context.Background(), "j", &kreuzbergcloud.WaitOptions{ + Timeout: 40 * time.Millisecond, + PollInterval: 5 * time.Millisecond, + Backoff: kreuzbergcloud.BackoffConstant, + }) + var timeout *kreuzbergcloud.TimeoutError + if !asError(err, &timeout) { + t.Fatalf("expected TimeoutError, got %T: %v", err, err) + } + if timeout.JobID != "j" { + t.Errorf("TimeoutError.JobID = %q, want j", timeout.JobID) + } +} + +func TestWaitForJob_FailedStatusReturnsError(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`{ + "id":"j","filename":"a.pdf","status":"failed","created_at":"2025-12-21T10:00:00Z", + "result":{"content":"OCR engine crashed"} + }`)) + })) + 
defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.WaitForJob(context.Background(), "j", nil) + if err == nil { + t.Fatalf("expected error for failed job") + } + if !strings.Contains(err.Error(), "OCR engine crashed") { + t.Errorf("error %q does not surface server message", err) + } +} + +func TestWaitForJob_ExponentialBackoffIncreases(t *testing.T) { + t.Parallel() + var timestamps []time.Time + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + timestamps = append(timestamps, time.Now()) + status := "processing" + extra := "" + if len(timestamps) >= 4 { + status = "completed" + extra = `,"result":{"content":"done"}` + } + fmt.Fprintf( + w, + `{"id":"j","filename":"a.pdf","status":%q,"created_at":"2025-12-21T10:00:00Z"%s}`, + status, extra, + ) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + if _, err := client.WaitForJob(context.Background(), "j", &kreuzbergcloud.WaitOptions{ + Timeout: 2 * time.Second, + PollInterval: 10 * time.Millisecond, + Backoff: kreuzbergcloud.BackoffExponential, + }); err != nil { + t.Fatalf("WaitForJob: %v", err) + } + if len(timestamps) < 4 { + t.Fatalf("got %d polls, want >=4", len(timestamps)) + } + gap1 := timestamps[1].Sub(timestamps[0]) + gap2 := timestamps[2].Sub(timestamps[1]) + if gap2 < gap1 { + t.Errorf("expected exponential growth, gap1=%v gap2=%v", gap1, gap2) + } +} + +func TestWaitForJobs_ParallelCompletion(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + id := strings.TrimPrefix(r.URL.Path, "/v1/jobs/") + fmt.Fprintf( + w, + `{"id":%q,"filename":"a.pdf","status":"completed","created_at":"2025-12-21T10:00:00Z","result":{"content":%q}}`, + id, "result-"+id, + ) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + results, err := 
client.WaitForJobs(context.Background(), []string{"a", "b", "c"}, nil) + if err != nil { + t.Fatalf("WaitForJobs: %v", err) + } + want := []string{"result-a", "result-b", "result-c"} + for i, result := range results { + if result.Content != want[i] { + t.Errorf("results[%d].Content = %q, want %q", i, result.Content, want[i]) + } + } +} + +func TestExtractAndWait_HappyPath(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPost && r.URL.Path == "/v1/extract": + w.WriteHeader(http.StatusAccepted) + _, _ = w.Write([]byte(`{"job_ids":["jobxyz"],"status":"pending"}`)) + case r.Method == http.MethodGet && r.URL.Path == "/v1/jobs/jobxyz": + _, _ = w.Write([]byte(`{ + "id":"jobxyz","filename":"a.pdf","status":"completed", + "created_at":"2025-12-21T10:00:00Z","result":{"content":"the text"} + }`)) + default: + t.Errorf("unexpected request %s %s", r.Method, r.URL.Path) + } + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + result, err := client.ExtractAndWait( + context.Background(), + kreuzbergcloud.FileSource{Name: "a.pdf", Reader: strings.NewReader("hello")}, + nil, + ) + if err != nil { + t.Fatalf("ExtractAndWait: %v", err) + } + if result.Content != "the text" { + t.Errorf("Content = %q, want 'the text'", result.Content) + } +} + +func TestExtractAndWait_PropagatesExtractError(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"bad input"}`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.ExtractAndWait( + context.Background(), + kreuzbergcloud.FileSource{Name: "a.pdf", Reader: strings.NewReader("hello")}, + nil, + ) + var validation *kreuzbergcloud.ValidationError + if !asError(err, &validation) { 
+ t.Fatalf("expected ValidationError, got %T: %v", err, err) + } +} diff --git a/packages/go/v1/retry_test.go b/packages/go/v1/retry_test.go new file mode 100644 index 0000000..3ef3315 --- /dev/null +++ b/packages/go/v1/retry_test.go @@ -0,0 +1,122 @@ +package kreuzbergcloud_test + +import ( + "context" + "net/http" + "net/http/httptest" + "sync/atomic" + "testing" + "time" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func TestRetry_RetriesOn503(t *testing.T) { + t.Parallel() + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + count := calls.Add(1) + if count < 3 { + w.WriteHeader(http.StatusServiceUnavailable) + _, _ = w.Write([]byte(`{"error":"unavailable"}`)) + return + } + _, _ = w.Write([]byte(`{ + "id":"j","filename":"a.pdf","status":"completed", + "created_at":"2025-12-21T10:00:00Z","result":{"content":"ok"} + }`)) + })) + defer server.Close() + client := mustClient(t, + kreuzbergcloud.WithBaseURL(server.URL), + kreuzbergcloud.WithRetries(5), + ) + job, err := client.GetJob(context.Background(), "j") + if err != nil { + t.Fatalf("GetJob: %v", err) + } + if got := calls.Load(); got != 3 { + t.Errorf("calls = %d, want 3", got) + } + if job.Status != "completed" { + t.Errorf("Status = %q, want completed", job.Status) + } +} + +func TestRetry_RetriesOn429HonoursRetryAfter(t *testing.T) { + t.Parallel() + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + count := calls.Add(1) + if count < 2 { + w.Header().Set("Retry-After", "1") + w.WriteHeader(http.StatusTooManyRequests) + _, _ = w.Write([]byte(`{"error":"slow"}`)) + return + } + _, _ = w.Write([]byte(`{ + "id":"j","filename":"a.pdf","status":"completed", + "created_at":"2025-12-21T10:00:00Z","result":{"content":"ok"} + }`)) + })) + defer server.Close() + start := time.Now() + client := mustClient(t, + kreuzbergcloud.WithBaseURL(server.URL), 
+ kreuzbergcloud.WithRetries(3), + ) + if _, err := client.GetJob(context.Background(), "j"); err != nil { + t.Fatalf("GetJob: %v", err) + } + elapsed := time.Since(start) + // Retry-After 1s should make total elapsed at least ~900ms. + if elapsed < 900*time.Millisecond { + t.Errorf("elapsed = %v, want >= 900ms (Retry-After honored)", elapsed) + } +} + +func TestRetry_GivesUpAfterMaxAttempts(t *testing.T) { + t.Parallel() + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + calls.Add(1) + w.WriteHeader(http.StatusServiceUnavailable) + _, _ = w.Write([]byte(`{"error":"down"}`)) + })) + defer server.Close() + client := mustClient(t, + kreuzbergcloud.WithBaseURL(server.URL), + kreuzbergcloud.WithRetries(2), + ) + _, err := client.GetJob(context.Background(), "j") + var srv *kreuzbergcloud.ServerError + if !asError(err, &srv) { + t.Fatalf("expected ServerError after retries, got %T: %v", err, err) + } + // Original attempt + 2 retries = 3 total. 
+ if got := calls.Load(); got != 3 { + t.Errorf("calls = %d, want 3", got) + } +} + +func TestRetry_DoesNotRetryOn400(t *testing.T) { + t.Parallel() + var calls atomic.Int32 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + calls.Add(1) + w.WriteHeader(http.StatusBadRequest) + _, _ = w.Write([]byte(`{"error":"nope"}`)) + })) + defer server.Close() + client := mustClient(t, + kreuzbergcloud.WithBaseURL(server.URL), + kreuzbergcloud.WithRetries(5), + ) + _, err := client.GetJob(context.Background(), "j") + if err == nil { + t.Fatal("expected error") + } + if got := calls.Load(); got != 1 { + t.Errorf("calls = %d, want 1 (no retries on 400)", got) + } +} diff --git a/packages/go/v1/sandbox.go b/packages/go/v1/sandbox.go new file mode 100644 index 0000000..15342a6 --- /dev/null +++ b/packages/go/v1/sandbox.go @@ -0,0 +1,43 @@ +package kreuzbergcloud + +import ( + "context" + "fmt" +) + +// CreateSandboxKey requests a short-lived anonymous API key from the public +// sandbox endpoint. No prior authentication is required — the API rate-limits +// by IP and rejects abusive callers with 429. +func (c *Client) CreateSandboxKey(ctx context.Context) (*SandboxKey, error) { + var key SandboxKey + spec := requestSpec{method: "POST", path: "/v1/sandbox/key"} + if err := c.doJSON(ctx, spec, &key); err != nil { + return nil, err + } + if key.APIKey == "" { + return nil, fmt.Errorf("kreuzberg-cloud: sandbox key endpoint returned empty api_key") + } + return &key, nil +} + +// FromSandbox is a zero-friction constructor: it builds an anonymous client, +// calls [Client.CreateSandboxKey], and returns a new client preconfigured +// with the resulting API key. Use it for documentation snippets, demo apps, +// and getting-started guides — the issued key is rate-limited and not +// suitable for production. 
+// +// Caller-supplied options are applied to both the provisioning client and the +// final client, so WithBaseURL / WithHTTPClient / WithTimeout flow through. +func FromSandbox(ctx context.Context, opts ...Option) (*Client, error) { + provisioning, err := New(opts...) + if err != nil { + return nil, err + } + key, err := provisioning.CreateSandboxKey(ctx) + if err != nil { + return nil, err + } + finalOpts := append([]Option{}, opts...) + finalOpts = append(finalOpts, WithAPIKey(key.APIKey)) + return New(finalOpts...) +} diff --git a/packages/go/v1/sandbox_test.go b/packages/go/v1/sandbox_test.go new file mode 100644 index 0000000..8f204db --- /dev/null +++ b/packages/go/v1/sandbox_test.go @@ -0,0 +1,122 @@ +package kreuzbergcloud_test + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + kreuzbergcloud "github.com/kreuzberg-dev/kreuzberg-cloud-sdk/go/v1" +) + +func TestCreateSandboxKey_ReturnsParsedKey(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/sandbox/key" { + t.Errorf("path = %q, want /v1/sandbox/key", r.URL.Path) + } + if r.Method != http.MethodPost { + t.Errorf("method = %q, want POST", r.Method) + } + if got := r.Header.Get("Authorization"); got != "" { + t.Errorf("Authorization = %q, want empty (anonymous)", got) + } + _, _ = w.Write([]byte(`{ + "api_key":"sandbox-abc", + "expires_at":"2025-12-21T11:00:00Z", + "pages_remaining":50 + }`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + key, err := client.CreateSandboxKey(context.Background()) + if err != nil { + t.Fatalf("CreateSandboxKey: %v", err) + } + if key.APIKey != "sandbox-abc" { + t.Errorf("APIKey = %q, want sandbox-abc", key.APIKey) + } + if key.PagesRemaining != 50 { + t.Errorf("PagesRemaining = %d, want 50", key.PagesRemaining) + } + want := time.Date(2025, 12, 21, 11, 0, 0, 0, time.UTC) + if 
!key.ExpiresAt.Equal(want) { + t.Errorf("ExpiresAt = %v, want %v", key.ExpiresAt, want) + } +} + +func TestFromSandbox_ReturnsClientWithKey(t *testing.T) { + t.Parallel() + var seen []string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + seen = append(seen, r.URL.Path+"|"+r.Header.Get("Authorization")) + switch r.URL.Path { + case "/v1/sandbox/key": + _, _ = w.Write([]byte(`{ + "api_key":"sb-xyz","expires_at":"2025-12-21T11:00:00Z","pages_remaining":10 + }`)) + case "/v1/jobs/some-id": + _, _ = w.Write([]byte(`{ + "id":"some-id","filename":"a.pdf","status":"completed", + "created_at":"2025-12-21T10:00:00Z","result":{"content":"ok"} + }`)) + default: + t.Errorf("unexpected request: %s", r.URL.Path) + } + })) + defer server.Close() + client, err := kreuzbergcloud.FromSandbox(context.Background(), kreuzbergcloud.WithBaseURL(server.URL)) + if err != nil { + t.Fatalf("FromSandbox: %v", err) + } + if _, err := client.GetJob(context.Background(), "some-id"); err != nil { + t.Fatalf("GetJob: %v", err) + } + if len(seen) != 2 { + t.Fatalf("server saw %d calls, want 2", len(seen)) + } + if !strings.HasPrefix(seen[0], "/v1/sandbox/key|") { + t.Errorf("first call = %q, want /v1/sandbox/key with empty auth", seen[0]) + } + if !strings.HasSuffix(seen[0], "|") { + t.Errorf("first call %q should be anonymous", seen[0]) + } + if seen[1] != "/v1/jobs/some-id|Bearer sb-xyz" { + t.Errorf("second call = %q, want bearer sb-xyz", seen[1]) + } +} + +func TestCreateSandboxKey_RateLimitedReturnsRateLimitError(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Retry-After", "5") + w.WriteHeader(http.StatusTooManyRequests) + fmt.Fprintf(w, `{"error":"too fast"}`) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.CreateSandboxKey(context.Background()) + var rate 
*kreuzbergcloud.RateLimitError + if !asError(err, &rate) { + t.Fatalf("expected RateLimitError, got %T: %v", err, err) + } + if rate.RetryAfter != 5*time.Second { + t.Errorf("RetryAfter = %v, want 5s", rate.RetryAfter) + } +} + +func TestCreateSandboxKey_RejectsEmptyAPIKey(t *testing.T) { + t.Parallel() + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte(`{"api_key":"","expires_at":"2025-12-21T11:00:00Z","pages_remaining":0}`)) + })) + defer server.Close() + client := mustClient(t, kreuzbergcloud.WithBaseURL(server.URL)) + _, err := client.CreateSandboxKey(context.Background()) + if err == nil { + t.Errorf("expected error for empty api_key, got nil") + } +} diff --git a/packages/go/v1/types.go b/packages/go/v1/types.go new file mode 100644 index 0000000..bf0d71b --- /dev/null +++ b/packages/go/v1/types.go @@ -0,0 +1,150 @@ +package kreuzbergcloud + +import ( + "encoding/json" + "io" + "time" +) + +// JobStatus is one of the values returned by the API in [Job.Status]. +// +// We model the status as a plain string rather than a typed enum so that new +// server-side states do not break clients compiled against an older SDK. +const ( + JobStatusAwaitingUpload = "awaiting_upload" + JobStatusPending = "pending" + JobStatusProcessing = "processing" + JobStatusChunking = "chunking" + JobStatusAggregating = "aggregating" + JobStatusCompleted = "completed" + JobStatusPartialSuccess = "partial_success" + JobStatusFailed = "failed" + JobStatusCancelled = "cancelled" //nolint:misspell // server-side enum value +) + +// terminalJobStatuses lists the statuses for which the server will produce no +// further updates. WaitForJob polls until one of these is observed. +var terminalJobStatuses = map[string]struct{}{ + JobStatusCompleted: {}, + JobStatusPartialSuccess: {}, + JobStatusFailed: {}, + JobStatusCancelled: {}, +} + +// IsTerminalStatus reports whether the status will produce no further updates. 
func IsTerminalStatus(status string) bool {
	// Plain membership test against the terminalJobStatuses set; any status
	// not listed there (including unknown ones) is treated as non-terminal.
	_, ok := terminalJobStatuses[status]
	return ok
}

// Job is the lifecycle envelope returned by GET /v1/jobs/{id}. The optional
// Result field is populated once the job reaches a terminal status.
type Job struct {
	ID        string    `json:"id"`
	Filename  string    `json:"filename"`
	Status    string    `json:"status"` // one of the JobStatus* string values
	CreatedAt time.Time `json:"created_at"`
	// ProcessingTimeMs stays nil when the response carries no
	// processing_time_ms field.
	ProcessingTimeMs *int64     `json:"processing_time_ms,omitempty"`
	Result           *JobResult `json:"result,omitempty"`
	// RawResponseBody is excluded from JSON encoding and decoding (tag "-").
	// NOTE(review): presumably filled in by the transport layer with the
	// undecoded response — confirm against doJSON.
	RawResponseBody json.RawMessage `json:"-"`
}

// JobResult is the extraction payload produced on terminal success. The shape
// matches the OpenAPI ExtractionResult schema; rich nested types like Tables
// and Images are deferred to a follow-up — see TODO below.
//
// For a job that ended in the "failed" status, jobResultFromTerminal reads
// Content as the failure message.
//
// TODO(workstream-e-go): expose strongly-typed Tables, Images, Chunks,
// PageContent, and Metadata. Currently held as json.RawMessage so callers can
// access the data without forcing us to mirror every nested struct here.
type JobResult struct {
	Content           string   `json:"content"`
	MimeType          string   `json:"mime_type,omitempty"`
	DetectedLanguages []string `json:"detected_languages,omitempty"`
	QualityScore      *float64 `json:"quality_score,omitempty"` // nil when absent
	// The fields below are kept as undecoded JSON; callers that need them can
	// json.Unmarshal the raw bytes into their own types.
	Pages              json.RawMessage `json:"pages,omitempty"`
	Tables             json.RawMessage `json:"tables,omitempty"`
	Images             json.RawMessage `json:"images,omitempty"`
	Chunks             json.RawMessage `json:"chunks,omitempty"`
	Metadata           json.RawMessage `json:"metadata,omitempty"`
	ProcessingWarnings json.RawMessage `json:"processing_warnings,omitempty"`
}

// Text is a convenience alias for [JobResult.Content]. It is safe to call on
// a nil *JobResult, in which case it returns the empty string.
func (r *JobResult) Text() string {
	if r == nil {
		return ""
	}
	return r.Content
}

// OCRConfig is the minimum viable OCR option set for v1. Additional knobs
// (psm, dpi, custom languages) are deferred — TODO(workstream-e-go).
type OCRConfig struct {
	// Both fields are omitted from the request JSON when empty.
	Backend  string `json:"backend,omitempty"`
	Language string `json:"language,omitempty"`
}

// ExtractionConfig mirrors a small subset of the server-side extraction
// configuration. We expose only the knobs the v1 docs Quickstart needs:
// output format, OCR backend/language, and the force/disable OCR toggles.
//
// TODO(workstream-e-go): expose chunking, content filtering, language
// detection, table extraction, security limits, and presign options.
type ExtractionConfig struct {
	OutputFormat string     `json:"output_format,omitempty"`
	OCR          *OCRConfig `json:"ocr,omitempty"`
	// ForceOCR and DisableOCR are pointers so that "not set" (nil, omitted
	// from the JSON) is distinguishable from an explicit false.
	ForceOCR   *bool `json:"force_ocr,omitempty"`
	DisableOCR *bool `json:"disable_ocr,omitempty"`
}

// ExtractionOptions is the top-level options envelope sent on every extract
// request. It nests an [ExtractionConfig] so the JSON wire shape mirrors the
// REST API and the Python/TypeScript SDKs exactly.
type ExtractionOptions struct {
	ExtractionConfig *ExtractionConfig `json:"extraction_config,omitempty"`
}

// SandboxKey is the response from POST /v1/sandbox/key. The endpoint mints a
// short-lived, anonymous API key for the public sandbox project.
type SandboxKey struct {
	APIKey         string    `json:"api_key"`
	ExpiresAt      time.Time `json:"expires_at"`
	PagesRemaining int64     `json:"pages_remaining"`
}

// BackoffKind selects the polling backoff strategy used by [Client.WaitForJob].
type BackoffKind int

const (
	// BackoffExponential doubles the delay between polls up to a 30s ceiling.
	// It is the zero value of BackoffKind, so a WaitOptions whose Backoff
	// field is left unset selects it.
	BackoffExponential BackoffKind = iota
	// BackoffConstant uses [WaitOptions.PollInterval] for every poll.
	BackoffConstant
)

// WaitOptions configures [Client.WaitForJob]. Zero-value fields fall back to
// sensible defaults (5 minute timeout, 1 second interval, exponential backoff).
type WaitOptions struct {
	Timeout      time.Duration // total time budget for the wait; <= 0 means "use the default"
	PollInterval time.Duration // delay before the second poll; <= 0 means "use the default"
	Backoff      BackoffKind   // how the delay evolves between later polls
}

// ExtractAndWaitOptions configures [Client.ExtractAndWait].
+type ExtractAndWaitOptions struct { + Extraction *ExtractionOptions + Wait *WaitOptions +} + +// FileSource describes a file to upload as a multipart part. Reader is read +// in full; Name is used for Content-Disposition and Content-Type sniffing. +type FileSource struct { + Name string + Reader io.Reader +} + +// extractResponse models the 202 Accepted body of POST /v1/extract. +type extractResponse struct { + JobIDs []string `json:"job_ids"` + Status string `json:"status"` +} diff --git a/packages/go/v1/version.go b/packages/go/v1/version.go new file mode 100644 index 0000000..ee002cf --- /dev/null +++ b/packages/go/v1/version.go @@ -0,0 +1,3 @@ +package kreuzbergcloud + +const Version = "0.0.1" diff --git a/packages/python/LICENSE b/packages/python/LICENSE new file mode 100644 index 0000000..59f7bf8 --- /dev/null +++ b/packages/python/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Kreuzberg, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/packages/python/README.md b/packages/python/README.md index 585c703..4d982ea 100644 --- a/packages/python/README.md +++ b/packages/python/README.md @@ -1,11 +1,35 @@ # kreuzberg-cloud +
+ +Kreuzberg Cloud + +
+ +
+ +PyPI +npm +Go Reference +License +Documentation +CI + +
+ +
+ +Discord + +
+ Official Python client for the [Kreuzberg Cloud](https://kreuzberg.cloud) document-processing API. -- httpx-based, fully async (sync wrappers also exposed) -- Generated from the upstream OpenAPI 3.1 spec +- httpx-based, sync (`KreuzbergCloud`) and async (`AsyncKreuzbergCloud`) surfaces +- Generated from the upstream OpenAPI 3.1 spec, then wrapped in ergonomic helpers - Type-annotated end to end, `py.typed` shipped +- Zero-friction onboarding via `from_sandbox()` — no signup needed for evaluation ## Install @@ -17,19 +41,74 @@ uv add kreuzberg-cloud Requires Python 3.10+. -## Usage +## Quickstart + +### Sync — single file with explicit API key + +```python +from pathlib import Path +from kreuzberg_cloud import KreuzbergCloud + +with KreuzbergCloud(api_key="sk_live_...") as client: + job = client.extract_and_wait(file=Path("invoice.pdf")) + if job.result is not None: + print(job.result.content) +``` + +### Async — batch extract with parallel waits ```python +import asyncio +from pathlib import Path from kreuzberg_cloud import AsyncKreuzbergCloud async def main() -> None: - async with AsyncKreuzbergCloud(api_key="...") as client: - job = await client.extract(file_path="invoice.pdf") - result = await client.wait_for_job(job.id) - print(result.text) + async with AsyncKreuzbergCloud(api_key="sk_live_...") as client: + jobs = await client.extract_batch([Path("a.pdf"), Path("b.pdf"), Path("c.pdf")]) + results = await client.wait_for_jobs([str(j.id) for j in jobs]) + for job in results: + print(job.filename, job.status) + +asyncio.run(main()) +``` + +### Async — sandbox onboarding (no API key required) + +```python +import asyncio +from kreuzberg_cloud import AsyncKreuzbergCloud + +async def main() -> None: + async with await AsyncKreuzbergCloud.from_sandbox() as client: + job = await client.extract_and_wait(file=b"hello world") + print(job.status, job.result and job.result.content) + +asyncio.run(main()) ``` -A blocking client is also available as `KreuzbergCloud` for 
sync code paths. +## Public API + +The following methods are available on both `KreuzbergCloud` (sync) and +`AsyncKreuzbergCloud` (async): + +| Method | Purpose | +|---|---| +| `extract(file=..., options=...)` | Submit one document, get back a `Job`. | +| `extract_batch(files, options=...)` | Submit many documents (parallel for async). | +| `get_job(job_id)` | Fetch current job status / result. | +| `wait_for_job(job_id, timeout=300, ...)` | Poll until terminal status. | +| `wait_for_jobs(job_ids, ...)` | Wait for multiple jobs. | +| `extract_and_wait(file=..., ...)` | Submit + wait in one call. | +| `create_sandbox_key()` | Mint an ephemeral sandbox API key. | +| `from_sandbox()` (classmethod) | Build a client preconfigured with a sandbox key. | + +Errors are raised as one of: +`KreuzbergCloudError` (base), `AuthError`, `ValidationError`, `NotFoundError`, +`RateLimitError` (carries `retry_after`), `ServerError`, `TimeoutError`. + +## Documentation + +Full reference and guides: https://docs.kreuzberg.cloud ## License diff --git a/packages/python/pyproject.toml b/packages/python/pyproject.toml index a58d359..82cd640 100644 --- a/packages/python/pyproject.toml +++ b/packages/python/pyproject.toml @@ -7,17 +7,17 @@ name = "kreuzberg-cloud" version = "0.0.1" description = "Official Python client for the Kreuzberg Cloud document-processing API." 
readme = "README.md" +keywords = [ "client", "cloud", "document-extraction", "kreuzberg", "openapi", "sdk" ] license = "MIT" -license-files = [ "../../LICENSE" ] -authors = [ { name = "Kreuzberg, Inc.", email = "contact@kreuzberg.dev" } ] +license-files = [ "LICENSE" ] maintainers = [ { name = "Kreuzberg, Inc.", email = "contact@kreuzberg.dev" } ] -keywords = [ "kreuzberg", "cloud", "openapi", "client", "sdk", "document-extraction" ] +authors = [ { name = "Kreuzberg, Inc.", email = "contact@kreuzberg.dev" } ] +requires-python = ">=3.10" classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -27,29 +27,23 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", ] -requires-python = ">=3.10" dependencies = [ "attrs>=24.2", "httpx>=0.28.1", "python-dateutil>=2.9", - "typing-extensions>=4.12 ; python_version < '3.11'", + "typing-extensions>=4.12; python_version<'3.11'", ] +urls.Changelog = "https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/blob/main/CHANGELOG.md" +urls.Documentation = "https://docs.kreuzberg.cloud" +urls.Homepage = "https://kreuzberg.cloud" +urls.Issues = "https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/issues" +urls.Repository = "https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk" -[project.urls] -Homepage = "https://kreuzberg.cloud" -Documentation = "https://docs.kreuzberg.cloud" -Repository = "https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk" -Issues = "https://github.com/kreuzberg-dev/kreuzberg-cloud-sdk/issues" - -[tool.hatch.build] -packages = [ "src/kreuzberg_cloud" ] - -[tool.hatch.build.targets.wheel] -packages = [ "src/kreuzberg_cloud" ] - -[tool.hatch.build.targets.sdist] -include = [ +[tool.hatch] 
+build.targets.sdist.include = [ "src/kreuzberg_cloud/**", "README.md", + "LICENSE", "pyproject.toml", ] +build.targets.wheel.packages = [ "src/kreuzberg_cloud" ] diff --git a/packages/python/src/kreuzberg_cloud/__init__.py b/packages/python/src/kreuzberg_cloud/__init__.py index 4f793a1..87faa6c 100644 --- a/packages/python/src/kreuzberg_cloud/__init__.py +++ b/packages/python/src/kreuzberg_cloud/__init__.py @@ -2,13 +2,44 @@ from __future__ import annotations +from kreuzberg_cloud._generated.models.extraction_options import ExtractionOptions +from kreuzberg_cloud._generated.models.extraction_result import ExtractionResult +from kreuzberg_cloud._generated.models.job_response import JobResponse +from kreuzberg_cloud._generated.models.job_status import JobStatus from kreuzberg_cloud.client import AsyncKreuzbergCloud, KreuzbergCloud -from kreuzberg_cloud.errors import KreuzbergCloudError +from kreuzberg_cloud.errors import ( + AuthError, + KreuzbergCloudError, + NotFoundError, + RateLimitError, + ServerError, + TimeoutError, # noqa: A004 — domain-specific timeout, intentionally shadows builtin in this namespace + ValidationError, +) +from kreuzberg_cloud.models import SandboxKey + +# Friendly aliases over the generated types: the API talks about "jobs" and +# "extraction results", so expose the typed models under those names. 
+Job = JobResponse +JobResult = ExtractionResult __all__ = [ "AsyncKreuzbergCloud", + "AuthError", + "ExtractionOptions", + "ExtractionResult", + "Job", + "JobResponse", + "JobResult", + "JobStatus", "KreuzbergCloud", "KreuzbergCloudError", + "NotFoundError", + "RateLimitError", + "SandboxKey", + "ServerError", + "TimeoutError", + "ValidationError", "__version__", ] diff --git a/packages/python/src/kreuzberg_cloud/client.py b/packages/python/src/kreuzberg_cloud/client.py index a3d662f..e012431 100644 --- a/packages/python/src/kreuzberg_cloud/client.py +++ b/packages/python/src/kreuzberg_cloud/client.py @@ -1,30 +1,120 @@ -"""High-level wrappers around the generated kreuzberg-cloud client. +"""High-level wrappers around the kreuzberg-cloud HTTP API. -The generated client (under :mod:`kreuzberg_cloud._generated`) is produced by -``openapi-python-client`` and exposes one function per OpenAPI operation. The -classes in this module wrap it in an idiomatic, ergonomic shape — handling -authentication, base URL configuration, and resource lifecycle — without -hiding the underlying typed models. +The classes in this module wrap the underlying transport in an idiomatic, +ergonomic shape — handling authentication, base URL configuration, resource +lifecycle, multipart upload encoding, and job polling — without hiding the +typed models generated by ``openapi-python-client`` (re-exported from the +package root). 
""" from __future__ import annotations +import asyncio import sys -from typing import TYPE_CHECKING +import time +from pathlib import Path +from typing import TYPE_CHECKING, Any, BinaryIO, Literal import httpx +from kreuzberg_cloud._generated.models.extraction_options import ExtractionOptions +from kreuzberg_cloud._generated.models.job_response import JobResponse +from kreuzberg_cloud.errors import TimeoutError as ClientTimeoutError +from kreuzberg_cloud.errors import raise_for_status +from kreuzberg_cloud.models import SandboxKey + if sys.version_info >= (3, 11): from typing import Self else: from typing_extensions import Self if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Iterable, Mapping from types import TracebackType DEFAULT_BASE_URL = "https://api.kreuzberg.cloud" DEFAULT_TIMEOUT_SECONDS = 30.0 +_USER_AGENT = "kreuzberg-cloud-python/0.0.1" + +_TERMINAL_STATUSES: frozenset[str] = frozenset({"completed", "failed", "cancelled", "partial_success"}) +_FAILED_STATUSES: frozenset[str] = frozenset({"failed", "cancelled"}) + +_DEFAULT_POLL_INTERVAL = 1.0 +_DEFAULT_WAIT_TIMEOUT = 300.0 +_MAX_BACKOFF_INTERVAL = 30.0 +_BACKOFF_FACTOR = 2.0 + +FileInput = Path | bytes | BinaryIO +"""Accepted shapes for a single file argument: filesystem path, raw bytes, or an open binary stream.""" + +OptionsInput = ExtractionOptions | dict[str, Any] | None +"""Accepted shapes for the ``options`` argument: typed model, plain dict, or ``None``.""" + +BackoffStrategy = Literal["constant", "exponential"] + + +def _coerce_options(options: OptionsInput) -> dict[str, Any] | None: + """Normalize an :class:`ExtractionOptions`/dict/``None`` into a plain dict (or ``None``).""" + if options is None: + return None + if isinstance(options, ExtractionOptions): + return options.to_dict() + return dict(options) + + +def _prepare_file_part(file: FileInput) -> tuple[str, bytes | BinaryIO, str]: + """Convert a ``FileInput`` into the ``(filename, payload, 
content-type)`` tuple httpx wants. + + - :class:`Path`: read bytes, infer filename from path, fall back to octet-stream MIME. + - :class:`bytes`: synthetic ``upload.bin`` filename. + - File-like: probe ``.name`` for a filename, fall back to ``upload.bin``. + """ + if isinstance(file, Path): + return (file.name, file.read_bytes(), "application/octet-stream") + if isinstance(file, (bytes, bytearray)): + return ("upload.bin", bytes(file), "application/octet-stream") + name = getattr(file, "name", None) + filename = Path(str(name)).name if isinstance(name, str) and name else "upload.bin" + return (filename, file, "application/octet-stream") + + +def _multipart_files(file: FileInput) -> list[tuple[str, tuple[str, bytes | BinaryIO, str]]]: + """Build the ``files=`` argument for ``httpx`` multipart upload of a single document.""" + return [("files", _prepare_file_part(file))] + + +def _multipart_data(options: OptionsInput) -> dict[str, str] | None: + """Build the ``data=`` argument carrying serialized options as a JSON-encoded multipart part.""" + coerced = _coerce_options(options) + if coerced is None: + return None + import json # local import — keeps stdlib import out of hot path # noqa: PLC0415 + + return {"options": json.dumps(coerced)} + + +def _job_id_from_extract_response(payload: Any) -> str: + """Pluck the first job id out of a ``POST /v1/extract`` response body.""" + if not isinstance(payload, dict): + raise ValueError(f"unexpected extract response shape: {payload!r}") + job_ids = payload.get("job_ids") + if not isinstance(job_ids, list) or not job_ids: + raise ValueError(f"extract response missing job_ids: {payload!r}") + return str(job_ids[0]) + + +def _parse_job(payload: Any) -> JobResponse: + """Parse a ``GET /v1/jobs/{id}`` response body into a typed :class:`JobResponse`.""" + if not isinstance(payload, dict): + raise ValueError(f"unexpected job response shape: {payload!r}") + return JobResponse.from_dict(payload) + + +def _next_interval(current: float, 
backoff: BackoffStrategy) -> float: + """Advance the polling interval according to the configured backoff strategy.""" + if backoff == "constant": + return current + return min(current * _BACKOFF_FACTOR, _MAX_BACKOFF_INTERVAL) class _BaseClient: @@ -41,7 +131,7 @@ def __init__( self._api_key = api_key self._base_url = base_url.rstrip("/") self._timeout = timeout - self._headers: dict[str, str] = {"User-Agent": "kreuzberg-cloud-python/0.0.1"} + self._headers: dict[str, str] = {"User-Agent": _USER_AGENT} if headers: self._headers.update(headers) if api_key is not None: @@ -51,8 +141,12 @@ def __init__( class KreuzbergCloud(_BaseClient): """Synchronous client for the Kreuzberg Cloud API. - This is a placeholder shell — endpoint methods are wired up after the - generated client lands under ``kreuzberg_cloud._generated``. + Use as a context manager to ensure the underlying HTTP connection pool is + closed: + + >>> with KreuzbergCloud(api_key="...") as client: # doctest: +SKIP + ... job = client.extract(file=Path("invoice.pdf")) + ... result = client.wait_for_job(job.id) """ def __init__( @@ -83,12 +177,137 @@ def close(self) -> None: """Close the underlying HTTP transport, releasing connections.""" self._http.close() + def extract(self, *, file: FileInput, options: OptionsInput = None) -> JobResponse: + """Submit a single document for extraction via ``POST /v1/extract`` (multipart). + + Returns a :class:`JobResponse` with ``status='pending'`` — call + :meth:`wait_for_job` (or use :meth:`extract_and_wait`) to obtain the + finished extraction result. + """ + response = self._http.post( + "/v1/extract", + files=_multipart_files(file), + data=_multipart_data(options), + ) + raise_for_status(response) + job_id = _job_id_from_extract_response(response.json()) + return self.get_job(job_id) + + def extract_batch( + self, + files: Iterable[FileInput], + options: OptionsInput = None, + ) -> list[JobResponse]: + """Submit multiple documents for extraction. 
+ + Each file is sent as its own ``POST /v1/extract`` request — this keeps + the per-job ``Job`` representation 1:1 with the request, simplifying + downstream tracking. For the sync client the requests run sequentially. + """ + return [self.extract(file=item, options=options) for item in files] + + def get_job(self, job_id: str) -> JobResponse: + """Fetch a job's current status and (when terminal) its extraction result.""" + response = self._http.get(f"/v1/jobs/{job_id}") + raise_for_status(response) + return _parse_job(response.json()) + + def wait_for_job( + self, + job_id: str, + *, + timeout: float = _DEFAULT_WAIT_TIMEOUT, + poll_interval: float = _DEFAULT_POLL_INTERVAL, + backoff: BackoffStrategy = "exponential", + ) -> JobResponse: + """Poll ``GET /v1/jobs/{id}`` until the job reaches a terminal status or ``timeout`` elapses. + + Raises :class:`kreuzberg_cloud.errors.TimeoutError` if the deadline is + hit before the job reaches a terminal status. Failed/cancelled jobs are + returned to the caller (not raised) — inspect ``job.status`` to branch. 
+ """ + deadline = time.monotonic() + timeout + interval = poll_interval + while True: + job = self.get_job(job_id) + if job.status in _TERMINAL_STATUSES: + return job + now = time.monotonic() + if now >= deadline: + raise ClientTimeoutError( + f"job {job_id} did not reach a terminal status within {timeout}s", + status_code=None, + ) + sleep_for = min(interval, deadline - now) + time.sleep(sleep_for) + interval = _next_interval(interval, backoff) + + def wait_for_jobs( + self, + job_ids: Iterable[str], + *, + timeout: float = _DEFAULT_WAIT_TIMEOUT, + poll_interval: float = _DEFAULT_POLL_INTERVAL, + backoff: BackoffStrategy = "exponential", + ) -> list[JobResponse]: + """Wait for multiple jobs sequentially (sync); see :meth:`AsyncKreuzbergCloud.wait_for_jobs` for parallel.""" + return [ + self.wait_for_job(job_id, timeout=timeout, poll_interval=poll_interval, backoff=backoff) + for job_id in job_ids + ] + + def extract_and_wait( + self, + *, + file: FileInput, + options: OptionsInput = None, + timeout: float = _DEFAULT_WAIT_TIMEOUT, + poll_interval: float = _DEFAULT_POLL_INTERVAL, + backoff: BackoffStrategy = "exponential", + ) -> JobResponse: + """Submit a document and block until extraction completes (or ``timeout`` is hit).""" + job = self.extract(file=file, options=options) + return self.wait_for_job( + str(job.id), + timeout=timeout, + poll_interval=poll_interval, + backoff=backoff, + ) + + def create_sandbox_key(self) -> SandboxKey: + """Mint an ephemeral sandbox API key via ``POST /v1/sandbox/key`` (unauthenticated).""" + response = self._http.post("/v1/sandbox/key") + raise_for_status(response) + return SandboxKey.from_dict(response.json()) + + @classmethod + def from_sandbox( + cls, + *, + base_url: str = DEFAULT_BASE_URL, + timeout: float = DEFAULT_TIMEOUT_SECONDS, + ) -> Self: + """Construct a client preconfigured with a fresh sandbox key. 
+ + Convenience wrapper for the zero-friction onboarding path: spin up a + bare client, mint a sandbox key, and return a new authenticated client. + """ + bootstrap = cls(base_url=base_url, timeout=timeout) + try: + key = bootstrap.create_sandbox_key() + finally: + bootstrap.close() + return cls(api_key=key.api_key, base_url=base_url, timeout=timeout) + class AsyncKreuzbergCloud(_BaseClient): """Asynchronous client for the Kreuzberg Cloud API. - This is a placeholder shell — endpoint methods are wired up after the - generated client lands under ``kreuzberg_cloud._generated``. + Mirrors :class:`KreuzbergCloud` method-for-method; everything is awaitable. + + >>> async with AsyncKreuzbergCloud(api_key="...") as client: # doctest: +SKIP + ... result = await client.extract_and_wait(file=Path("invoice.pdf")) + ... print(result.result) """ def __init__( @@ -118,3 +337,120 @@ async def __aexit__( async def aclose(self) -> None: """Close the underlying async HTTP transport, releasing connections.""" await self._http.aclose() + + async def extract(self, *, file: FileInput, options: OptionsInput = None) -> JobResponse: + """Async equivalent of :meth:`KreuzbergCloud.extract`.""" + response = await self._http.post( + "/v1/extract", + files=_multipart_files(file), + data=_multipart_data(options), + ) + raise_for_status(response) + job_id = _job_id_from_extract_response(response.json()) + return await self.get_job(job_id) + + async def extract_batch( + self, + files: Iterable[FileInput], + options: OptionsInput = None, + ) -> list[JobResponse]: + """Submit multiple documents in parallel via ``asyncio.gather``.""" + materialized = list(files) + coros = [self.extract(file=item, options=options) for item in materialized] + return list(await asyncio.gather(*coros)) + + async def get_job(self, job_id: str) -> JobResponse: + """Async equivalent of :meth:`KreuzbergCloud.get_job`.""" + response = await self._http.get(f"/v1/jobs/{job_id}") + raise_for_status(response) + return 
_parse_job(response.json()) + + async def wait_for_job( + self, + job_id: str, + *, + timeout: float = _DEFAULT_WAIT_TIMEOUT, + poll_interval: float = _DEFAULT_POLL_INTERVAL, + backoff: BackoffStrategy = "exponential", + ) -> JobResponse: + """Async equivalent of :meth:`KreuzbergCloud.wait_for_job`.""" + deadline = time.monotonic() + timeout + interval = poll_interval + while True: + job = await self.get_job(job_id) + if job.status in _TERMINAL_STATUSES: + return job + now = time.monotonic() + if now >= deadline: + raise ClientTimeoutError( + f"job {job_id} did not reach a terminal status within {timeout}s", + status_code=None, + ) + sleep_for = min(interval, deadline - now) + await asyncio.sleep(sleep_for) + interval = _next_interval(interval, backoff) + + async def wait_for_jobs( + self, + job_ids: Iterable[str], + *, + timeout: float = _DEFAULT_WAIT_TIMEOUT, + poll_interval: float = _DEFAULT_POLL_INTERVAL, + backoff: BackoffStrategy = "exponential", + ) -> list[JobResponse]: + """Wait for multiple jobs concurrently.""" + coros = [ + self.wait_for_job(job_id, timeout=timeout, poll_interval=poll_interval, backoff=backoff) + for job_id in job_ids + ] + return list(await asyncio.gather(*coros)) + + async def extract_and_wait( + self, + *, + file: FileInput, + options: OptionsInput = None, + timeout: float = _DEFAULT_WAIT_TIMEOUT, + poll_interval: float = _DEFAULT_POLL_INTERVAL, + backoff: BackoffStrategy = "exponential", + ) -> JobResponse: + """Submit a document and await extraction in a single call.""" + job = await self.extract(file=file, options=options) + return await self.wait_for_job( + str(job.id), + timeout=timeout, + poll_interval=poll_interval, + backoff=backoff, + ) + + async def create_sandbox_key(self) -> SandboxKey: + """Async equivalent of :meth:`KreuzbergCloud.create_sandbox_key`.""" + response = await self._http.post("/v1/sandbox/key") + raise_for_status(response) + return SandboxKey.from_dict(response.json()) + + @classmethod + async def 
from_sandbox( + cls, + *, + base_url: str = DEFAULT_BASE_URL, + timeout: float = DEFAULT_TIMEOUT_SECONDS, + ) -> Self: + """Async equivalent of :meth:`KreuzbergCloud.from_sandbox`.""" + bootstrap = cls(base_url=base_url, timeout=timeout) + try: + key = await bootstrap.create_sandbox_key() + finally: + await bootstrap.aclose() + return cls(api_key=key.api_key, base_url=base_url, timeout=timeout) + + +# Re-export friendly aliases. The OpenAPI-generated symbols are kept available +# for power users who want the underlying types. +__all__ = [ + "AsyncKreuzbergCloud", + "BackoffStrategy", + "FileInput", + "KreuzbergCloud", + "OptionsInput", +] diff --git a/packages/python/src/kreuzberg_cloud/errors.py b/packages/python/src/kreuzberg_cloud/errors.py index b8580fc..0b405e4 100644 --- a/packages/python/src/kreuzberg_cloud/errors.py +++ b/packages/python/src/kreuzberg_cloud/errors.py @@ -2,7 +2,10 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import httpx class KreuzbergCloudError(Exception): @@ -12,3 +15,103 @@ def __init__(self, message: str, *, status_code: int | None = None, payload: Any super().__init__(message) self.status_code = status_code self.payload = payload + + +class AuthError(KreuzbergCloudError): + """Raised when the API rejects the request with HTTP 401 (missing/invalid credentials).""" + + +class ValidationError(KreuzbergCloudError): + """Raised when the API rejects the request with HTTP 400 (malformed input).""" + + +class NotFoundError(KreuzbergCloudError): + """Raised when the API returns HTTP 404 (resource does not exist).""" + + +class RateLimitError(KreuzbergCloudError): + """Raised when the API returns HTTP 429 (rate limit exceeded). + + The optional ``retry_after`` attribute exposes the server-suggested wait + time in seconds, parsed from the ``Retry-After`` response header when + present. 
+ """ + + def __init__( + self, + message: str, + *, + status_code: int | None = 429, + payload: Any | None = None, + retry_after: float | None = None, + ) -> None: + super().__init__(message, status_code=status_code, payload=payload) + self.retry_after = retry_after + + +class ServerError(KreuzbergCloudError): + """Raised when the API returns any HTTP 5xx (server-side failure).""" + + +class TimeoutError(KreuzbergCloudError): # noqa: A001 — intentional shadowing of builtin to surface a domain-specific timeout + """Raised when a client-side wait operation exceeds its budget. + + Distinct from ``httpx.TimeoutException`` (single-request network timeout) + — this is raised by polling helpers like :meth:`wait_for_job` when the + overall deadline passes before the job reaches a terminal state. + """ + + +def _parse_retry_after(value: str | None) -> float | None: + """Parse a ``Retry-After`` header value into seconds, returning ``None`` on failure.""" + if value is None: + return None + try: + return float(value) + except ValueError: + return None + + +def _safe_json(response: httpx.Response) -> Any | None: + """Return ``response.json()`` or ``None`` if the body is not valid JSON.""" + try: + return response.json() + except ValueError: + return None + + +def _extract_message(payload: Any | None, default: str) -> str: + """Pull a useful error message out of the API's structured error payload.""" + if isinstance(payload, dict): + for key in ("message", "error", "detail"): + value = payload.get(key) + if isinstance(value, str) and value: + return value + return default + + +def raise_for_status(response: httpx.Response) -> None: + """Convert a non-2xx httpx response into the matching :class:`KreuzbergCloudError` subclass. + + No-op for 2xx responses. Used by every public client method to normalize + error reporting across the SDK surface. 
+ """ + status = response.status_code + if 200 <= status < 300: + return + + payload = _safe_json(response) + message = _extract_message(payload, f"HTTP {status}") + + if status == 400: + raise ValidationError(message, status_code=status, payload=payload) + if status == 401: + raise AuthError(message, status_code=status, payload=payload) + if status == 404: + raise NotFoundError(message, status_code=status, payload=payload) + if status == 429: + retry_after = _parse_retry_after(response.headers.get("Retry-After")) + raise RateLimitError(message, status_code=status, payload=payload, retry_after=retry_after) + if 500 <= status < 600: + raise ServerError(message, status_code=status, payload=payload) + raise KreuzbergCloudError(message, status_code=status, payload=payload) diff --git a/packages/python/src/kreuzberg_cloud/models.py b/packages/python/src/kreuzberg_cloud/models.py new file mode 100644 index 0000000..1c7a931 --- /dev/null +++ b/packages/python/src/kreuzberg_cloud/models.py @@ -0,0 +1,40 @@ +"""Public models that augment the auto-generated OpenAPI types. + +The auto-generated types under :mod:`kreuzberg_cloud._generated.models` are +re-exported from :mod:`kreuzberg_cloud` under friendlier names. Anything in +this module is hand-written — typically because the relevant endpoint is +not part of the published OpenAPI spec yet. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +from dateutil.parser import isoparse + +if TYPE_CHECKING: + from collections.abc import Mapping + from datetime import datetime + + +@dataclass(frozen=True, slots=True) +class SandboxKey: + """Ephemeral API key returned by ``POST /v1/sandbox/key``. + + Used by :meth:`AsyncKreuzbergCloud.from_sandbox` to spin up a zero-friction + client with a short-lived key (no signup required). 
+ """ + + api_key: str + expires_at: datetime + pages_remaining: int + + @classmethod + def from_dict(cls, payload: Mapping[str, Any]) -> SandboxKey: + """Parse a sandbox-key API payload into a :class:`SandboxKey`.""" + return cls( + api_key=str(payload["api_key"]), + expires_at=isoparse(str(payload["expires_at"])), + pages_remaining=int(payload["pages_remaining"]), + ) diff --git a/packages/python/tests/conftest.py b/packages/python/tests/conftest.py new file mode 100644 index 0000000..0c6461d --- /dev/null +++ b/packages/python/tests/conftest.py @@ -0,0 +1,58 @@ +"""Shared pytest fixtures for the kreuzberg-cloud test suite.""" + +from __future__ import annotations + +import uuid +from typing import Any + +import pytest + +TEST_BASE_URL = "https://api.example.test" +TEST_API_KEY = "test-api-key" + + +def make_job_payload( + *, + job_id: str | None = None, + status: str = "pending", + filename: str = "invoice.pdf", + result: dict[str, Any] | None = None, + processing_time_ms: int | None = None, +) -> dict[str, Any]: + """Build a JobResponse-shaped payload for stubbed responses.""" + payload: dict[str, Any] = { + "id": job_id or str(uuid.uuid4()), + "filename": filename, + "status": status, + "created_at": "2026-05-09T10:00:00Z", + } + if result is not None: + payload["result"] = result + if processing_time_ms is not None: + payload["processing_time_ms"] = processing_time_ms + return payload + + +def make_extract_response(*, job_ids: list[str] | None = None, status: str = "pending") -> dict[str, Any]: + """Build a 202 Accepted body returned by ``POST /v1/extract``.""" + return { + "job_ids": job_ids or [str(uuid.uuid4())], + "status": status, + } + + +def make_extraction_result(*, content: str = "hello world") -> dict[str, Any]: + """Build a minimal ExtractionResult dict (just content + mime_type).""" + return {"content": content, "mime_type": "text/plain"} + + +@pytest.fixture +def base_url() -> str: + """Return the synthetic base URL used by the respx-mocked 
tests.""" + return TEST_BASE_URL + + +@pytest.fixture +def api_key() -> str: + """Return the synthetic API key used by the respx-mocked tests.""" + return TEST_API_KEY diff --git a/packages/python/tests/test_errors.py b/packages/python/tests/test_errors.py new file mode 100644 index 0000000..57448b8 --- /dev/null +++ b/packages/python/tests/test_errors.py @@ -0,0 +1,163 @@ +"""Tests verifying every HTTP error class is raised on the matching status code.""" + +from __future__ import annotations + +import httpx +import pytest +import respx + +from kreuzberg_cloud import ( + AsyncKreuzbergCloud, + AuthError, + KreuzbergCloud, + KreuzbergCloudError, + NotFoundError, + RateLimitError, + ServerError, + ValidationError, +) + + +@respx.mock +def test_400_raises_validation_error_sync(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(400, json={"error": "bad_request", "message": "missing file"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(ValidationError) as exc_info: + client.extract(file=b"x") + assert exc_info.value.status_code == 400 + assert "missing file" in str(exc_info.value) + + +@respx.mock +def test_401_raises_auth_error_sync(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(401, json={"error": "unauthorized", "message": "invalid api key"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(AuthError) as exc_info: + client.extract(file=b"x") + assert exc_info.value.status_code == 401 + + +@respx.mock +def test_404_raises_not_found_error_sync(base_url: str, api_key: str) -> None: + job_id = "11111111-1111-1111-1111-111111111111" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(404, json={"error": "not_found", "message": "no such job"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, 
pytest.raises(NotFoundError): + client.get_job(job_id) + + +@respx.mock +def test_429_raises_rate_limit_error_with_retry_after(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response( + 429, + json={"error": "rate_limited", "message": "slow down"}, + headers={"Retry-After": "12"}, + ), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(RateLimitError) as exc_info: + client.extract(file=b"x") + assert exc_info.value.retry_after == 12.0 + + +@respx.mock +def test_429_without_retry_after_header_has_none(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(429, json={"message": "rate limited"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(RateLimitError) as exc_info: + client.extract(file=b"x") + assert exc_info.value.retry_after is None + + +@respx.mock +def test_500_raises_server_error_sync(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(500, json={"error": "internal_error", "message": "boom"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(ServerError) as exc_info: + client.extract(file=b"x") + assert exc_info.value.status_code == 500 + + +@respx.mock +def test_503_raises_server_error_sync(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(503, json={"message": "unavailable"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(ServerError): + client.extract(file=b"x") + + +@respx.mock +def test_unknown_4xx_raises_base_error(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(418, json={"message": "teapot"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, 
pytest.raises(KreuzbergCloudError) as exc_info: + client.extract(file=b"x") + # 418 falls through to the base class — not a 5xx, not a known 4xx. + assert exc_info.value.status_code == 418 + assert type(exc_info.value) is KreuzbergCloudError + + +@respx.mock +def test_non_json_error_body_falls_back_to_default_message(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(500, content=b"oops"), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(ServerError) as exc_info: + client.extract(file=b"x") + assert "HTTP 500" in str(exc_info.value) + assert exc_info.value.payload is None + + +@pytest.mark.asyncio +@respx.mock +async def test_401_raises_auth_error_async(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(401, json={"message": "no creds"}), + ) + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + with pytest.raises(AuthError): + await client.extract(file=b"x") + + +@pytest.mark.asyncio +@respx.mock +async def test_429_raises_rate_limit_error_async(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response( + 429, + json={"message": "rate limit hit"}, + headers={"Retry-After": "5"}, + ), + ) + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + with pytest.raises(RateLimitError) as exc_info: + await client.extract(file=b"x") + assert exc_info.value.retry_after == 5.0 + + +@pytest.mark.asyncio +@respx.mock +async def test_500_raises_server_error_async(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(502, json={"message": "bad gateway"}), + ) + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + with pytest.raises(ServerError): + await client.extract(file=b"x") + + +@respx.mock +def 
test_400_on_get_job_raises_validation_error(base_url: str, api_key: str) -> None: + job_id = "22222222-2222-2222-2222-222222222222" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(400, json={"message": "bad job id format"}), + ) + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(ValidationError): + client.get_job(job_id) diff --git a/packages/python/tests/test_extract.py b/packages/python/tests/test_extract.py new file mode 100644 index 0000000..dfc592e --- /dev/null +++ b/packages/python/tests/test_extract.py @@ -0,0 +1,252 @@ +"""Tests for ``KreuzbergCloud.extract`` / ``AsyncKreuzbergCloud.extract`` and batch variants.""" + +from __future__ import annotations + +import io +import json + +import httpx +import pytest +import respx + +from kreuzberg_cloud import AsyncKreuzbergCloud, ExtractionOptions, KreuzbergCloud +from tests.conftest import TEST_API_KEY, make_extract_response, make_job_payload + + +@respx.mock +def test_extract_sync_happy_path_with_bytes(base_url: str, api_key: str) -> None: + job_id = "11111111-1111-1111-1111-111111111111" + extract_route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + job_route = respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.extract(file=b"%PDF-1.4 fake") + + assert extract_route.called + assert job_route.called + assert str(job.id) == job_id + assert job.status == "pending" + + +@respx.mock +def test_extract_sync_sends_multipart_with_files_field(base_url: str, api_key: str) -> None: + job_id = "22222222-2222-2222-2222-222222222222" + route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + 
respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.extract(file=b"data") + + assert route.called + request = route.calls.last.request + content_type = request.headers["content-type"] + assert content_type.startswith("multipart/form-data") + body = request.content + assert b'name="files"' in body + assert b"data" in body + + +@respx.mock +def test_extract_sync_serializes_options_as_json_part(base_url: str, api_key: str) -> None: + job_id = "33333333-3333-3333-3333-333333333333" + route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + options = {"extraction_config": {"chunk_content": True}} + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.extract(file=b"data", options=options) + + body = route.calls.last.request.content + assert b'name="options"' in body + # The "options" multipart part is the JSON-serialized dict. 
+ assert b'"chunk_content": true' in body + + +@respx.mock +def test_extract_sync_accepts_extraction_options_model(base_url: str, api_key: str) -> None: + job_id = "44444444-4444-4444-4444-444444444444" + route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + options = ExtractionOptions() + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.extract(file=b"data", options=options) + + body = route.calls.last.request.content + assert b'name="options"' in body + + +@respx.mock +def test_extract_sync_accepts_binaryio_input(base_url: str, api_key: str) -> None: + job_id = "55555555-5555-5555-5555-555555555555" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + stream = io.BytesIO(b"hello") + stream.name = "doc.pdf" + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.extract(file=stream) + + assert str(job.id) == job_id + + +@respx.mock +def test_extract_sync_accepts_path_input(tmp_path: object, base_url: str, api_key: str) -> None: + from pathlib import Path + + target = Path(str(tmp_path)) / "fixture.txt" + target.write_bytes(b"hello world") + + job_id = "66666666-6666-6666-6666-666666666666" + route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + 
client.extract(file=target) + + body = route.calls.last.request.content + assert b'filename="fixture.txt"' in body + + +@respx.mock +def test_extract_sends_authorization_header(base_url: str, api_key: str) -> None: + job_id = "77777777-7777-7777-7777-777777777777" + route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id)), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.extract(file=b"x") + + assert route.calls.last.request.headers["authorization"] == f"Bearer {TEST_API_KEY}" + + +@respx.mock +def test_extract_batch_sync_issues_one_request_per_file(base_url: str, api_key: str) -> None: + job_ids = [ + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", + "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", + ] + extract_route = respx.post(f"{base_url}/v1/extract").mock( + side_effect=[ + httpx.Response(202, json=make_extract_response(job_ids=[job_ids[0]])), + httpx.Response(202, json=make_extract_response(job_ids=[job_ids[1]])), + ], + ) + for job_id in job_ids: + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + jobs = client.extract_batch([b"a", b"b"]) + + assert extract_route.call_count == 2 + assert [str(j.id) for j in jobs] == job_ids + + +@pytest.mark.asyncio +@respx.mock +async def test_extract_async_happy_path(base_url: str, api_key: str) -> None: + job_id = "cccccccc-cccc-cccc-cccc-cccccccccccc" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + ) + + async with 
AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = await client.extract(file=b"data") + + assert str(job.id) == job_id + + +@pytest.mark.asyncio +@respx.mock +async def test_extract_batch_async_runs_in_parallel(base_url: str, api_key: str) -> None: + job_ids = [ + "dddddddd-dddd-dddd-dddd-dddddddddddd", + "eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee", + "ffffffff-ffff-ffff-ffff-ffffffffffff", + ] + extract_route = respx.post(f"{base_url}/v1/extract").mock( + side_effect=[httpx.Response(202, json=make_extract_response(job_ids=[jid])) for jid in job_ids], + ) + for jid in job_ids: + respx.get(f"{base_url}/v1/jobs/{jid}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=jid, status="pending")), + ) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + jobs = await client.extract_batch([b"a", b"b", b"c"]) + + assert extract_route.call_count == 3 + assert {str(j.id) for j in jobs} == set(job_ids) + + +@respx.mock +def test_extract_response_with_unexpected_shape_raises(base_url: str, api_key: str) -> None: + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json={"unexpected": "shape"}), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client, pytest.raises(ValueError, match="job_ids"): + client.extract(file=b"x") + + +@respx.mock +def test_extract_options_dict_round_trip_is_correct_json(base_url: str, api_key: str) -> None: + job_id = "abababab-abab-abab-abab-abababababab" + route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id)), + ) + + options = {"extraction_config": {"detect_languages": True, "force_ocr": False}} + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.extract(file=b"x", options=options) + + body = 
route.calls.last.request.content.decode("utf-8", errors="replace") + # Find the options JSON inside the multipart body + marker = 'name="options"' + assert marker in body + # The exact serialization is contractual: round-trip via json.loads is the + # cleanest assertion that nothing got mangled. + start = body.index("\r\n\r\n", body.index(marker)) + 4 + end = body.index("\r\n", start) + parsed = json.loads(body[start:end]) + assert parsed == options diff --git a/packages/python/tests/test_extract_and_wait.py b/packages/python/tests/test_extract_and_wait.py new file mode 100644 index 0000000..9e65cf3 --- /dev/null +++ b/packages/python/tests/test_extract_and_wait.py @@ -0,0 +1,113 @@ +"""End-to-end tests covering ``extract_and_wait`` for both sync and async clients.""" + +from __future__ import annotations + +import httpx +import pytest +import respx + +from kreuzberg_cloud import AsyncKreuzbergCloud, KreuzbergCloud +from kreuzberg_cloud import TimeoutError as ClientTimeoutError +from tests.conftest import make_extract_response, make_extraction_result, make_job_payload + + +@respx.mock +def test_extract_and_wait_sync_returns_extraction_result(base_url: str, api_key: str) -> None: + job_id = "11111111-1111-1111-1111-111111111111" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + side_effect=[ + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response( + 200, + json=make_job_payload( + job_id=job_id, + status="completed", + result=make_extraction_result(content="extracted text"), + ), + ), + ], + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.extract_and_wait(file=b"data", poll_interval=0.01, timeout=2.0, backoff="constant") + + assert job.status == "completed" + assert job.result is not None + assert job.result.content == "extracted text" # type: 
ignore[union-attr] + + +@pytest.mark.asyncio +@respx.mock +async def test_extract_and_wait_async_returns_extraction_result(base_url: str, api_key: str) -> None: + job_id = "22222222-2222-2222-2222-222222222222" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response( + 200, + json=make_job_payload(job_id=job_id, status="completed", result=make_extraction_result(content="async ok")), + ), + ) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = await client.extract_and_wait(file=b"data", poll_interval=0.01, timeout=2.0) + + assert job.status == "completed" + assert job.result.content == "async ok" # type: ignore[union-attr] + + +@pytest.mark.asyncio +@respx.mock +async def test_extract_and_wait_async_propagates_failed_status(base_url: str, api_key: str) -> None: + job_id = "33333333-3333-3333-3333-333333333333" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="failed")), + ) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = await client.extract_and_wait(file=b"data", poll_interval=0.01, timeout=2.0) + + # Failed jobs are returned (not raised) so callers can branch on status. 
+ assert job.status == "failed" + + +@pytest.mark.asyncio +@respx.mock +async def test_extract_and_wait_async_times_out(base_url: str, api_key: str) -> None: + job_id = "44444444-4444-4444-4444-444444444444" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="processing")), + ) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + with pytest.raises(ClientTimeoutError): + await client.extract_and_wait(file=b"data", poll_interval=0.02, timeout=0.1) + + +@respx.mock +def test_extract_and_wait_sync_passes_options_through(base_url: str, api_key: str) -> None: + job_id = "55555555-5555-5555-5555-555555555555" + extract_route = respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="completed")), + ) + + options = {"extraction_config": {"chunk_content": True}} + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.extract_and_wait(file=b"data", options=options, poll_interval=0.01, timeout=2.0) + + body = extract_route.calls.last.request.content + assert b'"chunk_content": true' in body diff --git a/packages/python/tests/test_jobs.py b/packages/python/tests/test_jobs.py new file mode 100644 index 0000000..bf345da --- /dev/null +++ b/packages/python/tests/test_jobs.py @@ -0,0 +1,236 @@ +"""Tests for ``get_job`` / ``wait_for_job`` / ``wait_for_jobs`` covering polling and timeouts.""" + +from __future__ import annotations + +import time + +import httpx +import pytest +import respx + +from kreuzberg_cloud import AsyncKreuzbergCloud, KreuzbergCloud +from kreuzberg_cloud import TimeoutError as ClientTimeoutError +from 
tests.conftest import make_extraction_result, make_job_payload + + +@respx.mock +def test_get_job_sync_returns_typed_job(base_url: str, api_key: str) -> None: + job_id = "11111111-1111-1111-1111-111111111111" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="processing")), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.get_job(job_id) + + assert str(job.id) == job_id + assert job.status == "processing" + assert job.filename == "invoice.pdf" + + +@respx.mock +def test_wait_for_job_sync_returns_immediately_when_completed(base_url: str, api_key: str) -> None: + job_id = "22222222-2222-2222-2222-222222222222" + route = respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response( + 200, + json=make_job_payload(job_id=job_id, status="completed", result=make_extraction_result()), + ), + ) + + start = time.monotonic() + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.wait_for_job(job_id, poll_interval=0.01, timeout=5.0) + elapsed = time.monotonic() - start + + assert job.status == "completed" + assert route.call_count == 1 + assert elapsed < 1.0 # no polling beyond the first hit + + +@respx.mock +def test_wait_for_job_sync_polls_until_terminal(base_url: str, api_key: str) -> None: + job_id = "33333333-3333-3333-3333-333333333333" + route = respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + side_effect=[ + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response(200, json=make_job_payload(job_id=job_id, status="processing")), + httpx.Response( + 200, + json=make_job_payload(job_id=job_id, status="completed", result=make_extraction_result()), + ), + ], + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.wait_for_job(job_id, poll_interval=0.01, timeout=5.0, backoff="constant") + + assert job.status == "completed" + assert 
route.call_count == 3 + + +@respx.mock +def test_wait_for_job_sync_returns_failed_status_without_raising(base_url: str, api_key: str) -> None: + job_id = "44444444-4444-4444-4444-444444444444" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="failed")), + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.wait_for_job(job_id, poll_interval=0.01, timeout=1.0) + + assert job.status == "failed" + + +@respx.mock +def test_wait_for_job_sync_times_out_when_never_terminal(base_url: str, api_key: str) -> None: + job_id = "55555555-5555-5555-5555-555555555555" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="processing")), + ) + + with ( + KreuzbergCloud(api_key=api_key, base_url=base_url) as client, + pytest.raises(ClientTimeoutError, match="terminal status"), + ): + client.wait_for_job(job_id, poll_interval=0.05, timeout=0.15) + + +@pytest.mark.asyncio +@respx.mock +async def test_wait_for_job_async_polls_until_terminal(base_url: str, api_key: str) -> None: + job_id = "66666666-6666-6666-6666-666666666666" + route = respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + side_effect=[ + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response(200, json=make_job_payload(job_id=job_id, status="completed")), + ], + ) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = await client.wait_for_job(job_id, poll_interval=0.01, timeout=2.0) + + assert job.status == "completed" + assert route.call_count == 2 + + +@pytest.mark.asyncio +@respx.mock +async def test_wait_for_job_async_times_out(base_url: str, api_key: str) -> None: + job_id = "77777777-7777-7777-7777-777777777777" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + return_value=httpx.Response(200, json=make_job_payload(job_id=job_id, status="processing")), + 
) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + with pytest.raises(ClientTimeoutError): + await client.wait_for_job(job_id, poll_interval=0.02, timeout=0.1) + + +@respx.mock +def test_wait_for_job_exponential_backoff_increases_interval( + monkeypatch: pytest.MonkeyPatch, base_url: str, api_key: str +) -> None: + """Verify the exponential backoff actually scales the sleep delay each iteration.""" + job_id = "88888888-8888-8888-8888-888888888888" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + side_effect=[ + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response(200, json=make_job_payload(job_id=job_id, status="processing")), + httpx.Response(200, json=make_job_payload(job_id=job_id, status="completed")), + ], + ) + + sleeps: list[float] = [] + + def _record_sleep(seconds: float) -> None: + sleeps.append(seconds) + + # Patch the time.sleep used by the sync client (imported as a module symbol). + from kreuzberg_cloud import client as client_module + + monkeypatch.setattr(client_module.time, "sleep", _record_sleep) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.wait_for_job(job_id, poll_interval=0.5, timeout=10.0, backoff="exponential") + + # Two sleeps before the third (terminal) response. + assert len(sleeps) == 2 + assert sleeps[0] == pytest.approx(0.5) + # Exponential backoff doubles the interval (capped at 30s). 
+ assert sleeps[1] == pytest.approx(1.0) + + +@respx.mock +def test_wait_for_job_constant_backoff_keeps_interval_steady( + monkeypatch: pytest.MonkeyPatch, base_url: str, api_key: str +) -> None: + job_id = "99999999-9999-9999-9999-999999999999" + respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + side_effect=[ + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response(200, json=make_job_payload(job_id=job_id, status="completed")), + ], + ) + + sleeps: list[float] = [] + from kreuzberg_cloud import client as client_module + + monkeypatch.setattr(client_module.time, "sleep", sleeps.append) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + client.wait_for_job(job_id, poll_interval=0.25, timeout=10.0, backoff="constant") + + assert sleeps == [pytest.approx(0.25), pytest.approx(0.25)] + + +@pytest.mark.asyncio +@respx.mock +async def test_wait_for_jobs_async_runs_in_parallel(base_url: str, api_key: str) -> None: + job_ids = [ + "aaaaaaaa-1111-1111-1111-aaaaaaaaaaaa", + "bbbbbbbb-2222-2222-2222-bbbbbbbbbbbb", + ] + for jid in job_ids: + respx.get(f"{base_url}/v1/jobs/{jid}").mock( + return_value=httpx.Response( + 200, + json=make_job_payload(job_id=jid, status="completed", result=make_extraction_result()), + ), + ) + + async with AsyncKreuzbergCloud(api_key=api_key, base_url=base_url) as client: + jobs = await client.wait_for_jobs(job_ids, poll_interval=0.01, timeout=2.0) + + assert {str(j.id) for j in jobs} == set(job_ids) + assert all(j.status == "completed" for j in jobs) + + +@respx.mock +def test_extract_and_wait_sync_returns_completed_job(base_url: str, api_key: str) -> None: + from tests.conftest import make_extract_response + + job_id = "cccccccc-3333-3333-3333-cccccccccccc" + respx.post(f"{base_url}/v1/extract").mock( + return_value=httpx.Response(202, json=make_extract_response(job_ids=[job_id])), + ) + 
respx.get(f"{base_url}/v1/jobs/{job_id}").mock( + side_effect=[ + # First call: surfaced by extract() itself (returns pending Job). + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + # Subsequent calls: poll loop inside wait_for_job. + httpx.Response(200, json=make_job_payload(job_id=job_id, status="pending")), + httpx.Response( + 200, + json=make_job_payload(job_id=job_id, status="completed", result=make_extraction_result(content="hi")), + ), + ], + ) + + with KreuzbergCloud(api_key=api_key, base_url=base_url) as client: + job = client.extract_and_wait(file=b"x", poll_interval=0.01, timeout=5.0, backoff="constant") + + assert job.status == "completed" + assert job.result is not None + assert job.result.content == "hi" # type: ignore[union-attr] diff --git a/packages/python/tests/test_sandbox.py b/packages/python/tests/test_sandbox.py new file mode 100644 index 0000000..1c07e16 --- /dev/null +++ b/packages/python/tests/test_sandbox.py @@ -0,0 +1,90 @@ +"""Tests for the sandbox-key onboarding flow (``create_sandbox_key`` / ``from_sandbox``).""" + +from __future__ import annotations + +import datetime as dt + +import httpx +import pytest +import respx + +from kreuzberg_cloud import AsyncKreuzbergCloud, KreuzbergCloud, RateLimitError, SandboxKey + +SANDBOX_KEY_PAYLOAD = { + "api_key": "sk_sandbox_01ABCDEFGHIJKLMNOPQRSTUVWX", + "expires_at": "2026-05-10T10:00:00Z", + "pages_remaining": 50, +} + + +@respx.mock +def test_create_sandbox_key_sync_returns_typed_value(base_url: str) -> None: + respx.post(f"{base_url}/v1/sandbox/key").mock(return_value=httpx.Response(200, json=SANDBOX_KEY_PAYLOAD)) + + with KreuzbergCloud(base_url=base_url) as client: + key = client.create_sandbox_key() + + assert isinstance(key, SandboxKey) + assert key.api_key == SANDBOX_KEY_PAYLOAD["api_key"] + assert key.pages_remaining == 50 + assert key.expires_at == dt.datetime(2026, 5, 10, 10, 0, 0, tzinfo=dt.timezone.utc) + + +@pytest.mark.asyncio +@respx.mock +async def 
test_create_sandbox_key_async_returns_typed_value(base_url: str) -> None: + respx.post(f"{base_url}/v1/sandbox/key").mock(return_value=httpx.Response(200, json=SANDBOX_KEY_PAYLOAD)) + + async with AsyncKreuzbergCloud(base_url=base_url) as client: + key = await client.create_sandbox_key() + + assert key.api_key == SANDBOX_KEY_PAYLOAD["api_key"] + + +@respx.mock +def test_from_sandbox_sync_returns_authenticated_client(base_url: str) -> None: + respx.post(f"{base_url}/v1/sandbox/key").mock(return_value=httpx.Response(200, json=SANDBOX_KEY_PAYLOAD)) + + client = KreuzbergCloud.from_sandbox(base_url=base_url) + try: + assert client._headers["Authorization"] == f"Bearer {SANDBOX_KEY_PAYLOAD['api_key']}" + finally: + client.close() + + +@pytest.mark.asyncio +@respx.mock +async def test_from_sandbox_async_returns_authenticated_client(base_url: str) -> None: + respx.post(f"{base_url}/v1/sandbox/key").mock(return_value=httpx.Response(200, json=SANDBOX_KEY_PAYLOAD)) + + client = await AsyncKreuzbergCloud.from_sandbox(base_url=base_url) + try: + assert client._headers["Authorization"] == f"Bearer {SANDBOX_KEY_PAYLOAD['api_key']}" + finally: + await client.aclose() + + +@respx.mock +def test_create_sandbox_key_sync_raises_rate_limit_with_retry_after(base_url: str) -> None: + respx.post(f"{base_url}/v1/sandbox/key").mock( + return_value=httpx.Response( + 429, + json={"error": "rate_limited", "message": "too many sandbox keys from this IP"}, + headers={"Retry-After": "60"}, + ), + ) + + with KreuzbergCloud(base_url=base_url) as client, pytest.raises(RateLimitError) as exc_info: + client.create_sandbox_key() + assert exc_info.value.retry_after == 60.0 + + +@pytest.mark.asyncio +@respx.mock +async def test_from_sandbox_async_raises_rate_limit(base_url: str) -> None: + respx.post(f"{base_url}/v1/sandbox/key").mock( + return_value=httpx.Response(429, json={"message": "throttled"}), + ) + + with pytest.raises(RateLimitError): + await AsyncKreuzbergCloud.from_sandbox(base_url=base_url) 
diff --git a/packages/typescript/LICENSE b/packages/typescript/LICENSE new file mode 100644 index 0000000..59f7bf8 --- /dev/null +++ b/packages/typescript/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Kreuzberg, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/packages/typescript/README.md b/packages/typescript/README.md index d6df461..97b5f9f 100644 --- a/packages/typescript/README.md +++ b/packages/typescript/README.md @@ -1,5 +1,28 @@ # @kreuzberg/cloud +
+ +Kreuzberg Cloud + +
+ +
+ +PyPI +npm +Go Reference +License +Documentation +CI + +
+ +
+ +Discord + +
+ Official TypeScript / Node.js client for the [Kreuzberg Cloud](https://kreuzberg.cloud) document-processing API. @@ -11,25 +34,79 @@ document-processing API. ```sh pnpm add @kreuzberg/cloud -# or -npm install @kreuzberg/cloud ``` -## Usage +## Quickstart — single file ```ts -import { createClient } from "@kreuzberg/cloud"; +import { KreuzbergCloud } from "@kreuzberg/cloud"; +import { readFile } from "node:fs/promises"; -const client = createClient({ - baseUrl: "https://api.kreuzberg.cloud", - apiKey: process.env.KREUZBERG_API_KEY!, +const client = new KreuzbergCloud({ apiKey: process.env.KREUZBERG_API_KEY! }); + +const data = await readFile("invoice.pdf"); +const result = await client.extractAndWait({ + file: { name: "invoice.pdf", data, mimeType: "application/pdf" }, }); +console.log(result.result?.content); +``` + +## Quickstart — batch + parallel wait + +```ts +import { KreuzbergCloud } from "@kreuzberg/cloud"; +import { readFile } from "node:fs/promises"; -const { data, error } = await client.GET("/healthz"); -if (error) throw error; -console.log(data.status); +const client = new KreuzbergCloud({ apiKey: process.env.KREUZBERG_API_KEY! }); + +const files = await Promise.all( + ["a.pdf", "b.pdf", "c.pdf"].map(async (name) => ({ + name, + data: await readFile(name), + })), +); +const jobs = await client.extractBatch({ files }); +const results = await client.waitForJobs(jobs.map((j) => j.id)); +for (const r of results) console.log(r.id, r.status); +``` + +## Quickstart — sandbox (no signup) + +```ts +import { KreuzbergCloud } from "@kreuzberg/cloud"; + +const client = await KreuzbergCloud.fromSandbox(); +const result = await client.extractAndWait({ + file: new Blob(["Hello world"], { type: "text/plain" }), +}); +console.log(result.result?.content); ``` +## API + +| Method | Returns | +|---|---| +| `extract({ file, options? })` | `Promise` | +| `extractBatch({ files, options? 
})` | `Promise` | +| `getJob(jobId)` | `Promise` | +| `waitForJob(jobId, opts?)` | `Promise` | +| `waitForJobs(jobIds, opts?)` | `Promise` | +| `extractAndWait({ file, options?, ...waitOpts })` | `Promise` | +| `createSandboxKey()` | `Promise` | +| `KreuzbergCloud.fromSandbox(opts?)` | `Promise` (static) | + +Errors throw subclasses of `KreuzbergError` (`AuthError`, `RateLimitError`, +`ValidationError`, `NotFoundError`, `ServerError`, `TimeoutError`). Each +carries `status: number` and `body: unknown`. `RateLimitError.retryAfter` +is parsed from the `Retry-After` response header. + +The low-level `createClient(...)` factory (a thin `openapi-fetch` wrapper) +is still exported for direct OpenAPI access. + +## Docs + +Full reference at . + ## License MIT — © Kreuzberg, Inc. diff --git a/packages/typescript/package.json b/packages/typescript/package.json index a15d75c..1d1d1b9 100644 --- a/packages/typescript/package.json +++ b/packages/typescript/package.json @@ -1,7 +1,7 @@ { "name": "@kreuzberg/cloud", "version": "0.0.1", - "description": "Official TypeScript client for the Kreuzberg Cloud document-processing API", + "description": "Official TypeScript client for the Kreuzberg Cloud document-processing API.", "license": "MIT", "author": "Kreuzberg, Inc. ", "homepage": "https://kreuzberg.cloud", @@ -38,7 +38,7 @@ "LICENSE" ], "engines": { - "node": ">=20" + "node": ">=22" }, "publishConfig": { "access": "public", diff --git a/packages/typescript/src/client.ts b/packages/typescript/src/client.ts index d01d6e3..9e7e538 100644 --- a/packages/typescript/src/client.ts +++ b/packages/typescript/src/client.ts @@ -1,22 +1,373 @@ +/** + * High-level Kreuzberg Cloud client. + * + * Wraps the generated `openapi-fetch` low-level client with ergonomic methods + * that mirror the Python SDK shape (`extract`, `getJob`, `waitForJob`, + * `extractAndWait`, `createSandboxKey`, etc.) and strongly-typed error + * mapping. See {@link KreuzbergCloud} for the public surface. 
+ */ + import createOpenApiClient, { type Client } from "openapi-fetch"; import type { paths } from "./_generated/schema.js"; +import { KreuzbergError, RateLimitError, TimeoutError, raiseForStatus } from "./errors.js"; +import type { + ExtractResponse, + ExtractionOptions, + Job, + JobResult, + SandboxKey, + WebhookConfig, +} from "./types.js"; +import { SUCCESS_JOB_STATUSES, TERMINAL_JOB_STATUSES } from "./types.js"; const DEFAULT_BASE_URL = "https://api.kreuzberg.cloud"; const USER_AGENT = "kreuzberg-cloud-typescript/0.0.1"; +const DEFAULT_TIMEOUT_MS = 30_000; +const DEFAULT_POLL_INTERVAL_MS = 1_000; +const DEFAULT_WAIT_TIMEOUT_MS = 5 * 60_000; +const DEFAULT_RETRY_STATUSES: readonly number[] = [429, 502, 503, 504]; +const DEFAULT_RETRY_BACKOFF_BASE_MS = 200; +const DEFAULT_RETRY_BACKOFF_CAP_MS = 30_000; +const DEFAULT_BACKOFF_FACTOR = 2; +const SANDBOX_KEY_PATH = "/v1/sandbox/key"; -export interface CreateClientOptions { - /** Base URL of the Kreuzberg Cloud API. Defaults to the production endpoint. */ - baseUrl?: string; - /** API key issued by Kreuzberg Cloud. Sent as `Authorization: Bearer `. */ +/** + * Backoff strategy for retries and `waitForJob` polling. + * + * - `exponential` — interval doubles after every attempt, capped at 30s. + * - `constant` — interval stays the same on every attempt. + */ +export type BackoffStrategy = "exponential" | "constant"; + +export interface KreuzbergCloudOptions { apiKey?: string; - /** Additional request headers merged into every call. */ + baseUrl?: string; + fetch?: typeof fetch; headers?: Record; - /** Custom fetch implementation. Defaults to the global `fetch`. */ + timeoutMs?: number; + retries?: number; + retryOn?: readonly number[]; + retryBackoff?: BackoffStrategy; + /** Sleep helper, swappable in tests. Defaults to `setTimeout`. 
*/ + sleep?: (ms: number) => Promise; +} + +export interface FromSandboxOptions { + baseUrl?: string; fetch?: typeof fetch; + headers?: Record; + timeoutMs?: number; + retries?: number; + retryOn?: readonly number[]; + retryBackoff?: BackoffStrategy; } +export type FileLike = + | File + | Blob + | Uint8Array + | { name?: string; data: Blob | Uint8Array; mimeType?: string }; + +export interface ExtractParams { + file: FileLike; + options?: ExtractionOptions; + webhook?: WebhookConfig; +} + +export interface ExtractBatchParams { + files: readonly FileLike[]; + options?: ExtractionOptions; + webhook?: WebhookConfig; +} + +export interface WaitOptions { + /** Total time to wait before throwing TimeoutError. Default 5 minutes. */ + timeoutMs?: number; + /** Initial poll interval. Default 1000ms. */ + pollIntervalMs?: number; + /** `exponential` doubles the interval each cycle (capped). Default exponential. */ + backoff?: BackoffStrategy; +} + +export interface ExtractAndWaitParams extends ExtractParams, WaitOptions {} + +/** Public client. */ export type KreuzbergCloudClient = Client; +/** + * High-level client. Construct with `new KreuzbergCloud({ apiKey })`, or use + * the {@link KreuzbergCloud.fromSandbox} factory for a temporary key. + */ +export class KreuzbergCloud { + private readonly baseUrl: string; + private readonly headers: Record; + private readonly fetchImpl: typeof fetch; + private readonly timeoutMs: number; + private readonly retries: number; + private readonly retryOn: readonly number[]; + private readonly retryBackoff: BackoffStrategy; + private readonly sleep: (ms: number) => Promise; + /** Underlying `openapi-fetch` client — exposed for advanced use. */ + public readonly raw: KreuzbergCloudClient; + + public constructor(options: KreuzbergCloudOptions = {}) { + this.baseUrl = (options.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, ""); + this.fetchImpl = options.fetch ?? fetch; + this.timeoutMs = options.timeoutMs ?? 
DEFAULT_TIMEOUT_MS; + this.retries = options.retries ?? 0; + this.retryOn = options.retryOn ?? DEFAULT_RETRY_STATUSES; + this.retryBackoff = options.retryBackoff ?? "exponential"; + this.sleep = options.sleep ?? defaultSleep; + + this.headers = { + "User-Agent": USER_AGENT, + ...options.headers, + }; + if (options.apiKey !== undefined) { + this.headers["Authorization"] = `Bearer ${options.apiKey}`; + } + + this.raw = createOpenApiClient({ + baseUrl: this.baseUrl, + headers: this.headers, + fetch: this.fetchImpl, + }); + } + + /** + * Submit a single document for extraction. Returns the initial {@link Job} + * record (still pending — call `waitForJob` to await completion). + */ + public async extract(params: ExtractParams): Promise { + const jobs = await this.extractBatch({ + files: [params.file], + ...(params.options !== undefined ? { options: params.options } : {}), + ...(params.webhook !== undefined ? { webhook: params.webhook } : {}), + }); + const first = jobs[0]; + if (first === undefined) { + throw new KreuzbergError("Server did not return any job IDs", { + status: 500, + body: { jobs }, + }); + } + return first; + } + + /** + * Submit multiple documents in a single multipart request. Returns one + * {@link Job} per file, in the same order. + */ + public async extractBatch(params: ExtractBatchParams): Promise { + if (params.files.length === 0) { + throw new KreuzbergError("extractBatch called with no files", { status: 400, body: null }); + } + + const form = new FormData(); + for (const file of params.files) { + const { blob, filename } = toBlob(file); + form.append("file", blob, filename); + } + if (params.options !== undefined) { + form.append("options", JSON.stringify(params.options)); + } + const webhookPayload: WebhookConfig = params.webhook ?? 
{ url: "" }; + form.append("webhook", JSON.stringify(webhookPayload)); + + const response = await this.requestWithRetry("POST", "/v1/extract", { body: form }); + const body = (await response.json()) as ExtractResponse; + const now = new Date().toISOString(); + return params.files.map((file, index) => { + const id = body.job_ids[index]; + if (id === undefined) { + throw new KreuzbergError( + `Server returned ${body.job_ids.length} job IDs for ${params.files.length} files`, + { status: 500, body }, + ); + } + const filename = describeFile(file); + return { + id, + filename, + status: "pending", + created_at: now, + }; + }); + } + + /** Fetch the current state of a job. */ + public async getJob(jobId: string): Promise { + const response = await this.requestWithRetry("GET", `/v1/jobs/${encodeURIComponent(jobId)}`); + return (await response.json()) as Job; + } + + /** + * Poll {@link getJob} until the job reaches a terminal status. Throws + * {@link TimeoutError} if the wait exceeds `timeoutMs`. Throws + * {@link KreuzbergError} if the terminal status is `failed` or `cancelled`. + */ + public async waitForJob(jobId: string, options: WaitOptions = {}): Promise { + const timeoutMs = options.timeoutMs ?? DEFAULT_WAIT_TIMEOUT_MS; + const initialInterval = options.pollIntervalMs ?? DEFAULT_POLL_INTERVAL_MS; + const backoff = options.backoff ?? 
"exponential"; + const start = Date.now(); + let interval = initialInterval; + + for (;;) { + const job = await this.getJob(jobId); + if (TERMINAL_JOB_STATUSES.includes(job.status)) { + if (SUCCESS_JOB_STATUSES.includes(job.status)) { + return job; + } + throw new KreuzbergError(`Job ${jobId} ended with status ${job.status}`, { + status: 200, + body: job, + }); + } + + const elapsed = Date.now() - start; + const remaining = timeoutMs - elapsed; + if (remaining <= 0) { + throw new TimeoutError(`Timed out waiting for job ${jobId} after ${timeoutMs}ms`, { + status: 408, + body: job, + }); + } + const delay = Math.min(interval, remaining); + await this.sleep(delay); + if (backoff === "exponential") { + interval = Math.min(interval * DEFAULT_BACKOFF_FACTOR, DEFAULT_RETRY_BACKOFF_CAP_MS); + } + } + } + + /** Wait for many jobs in parallel. */ + public async waitForJobs( + jobIds: readonly string[], + options: WaitOptions = {}, + ): Promise { + return Promise.all(jobIds.map((id) => this.waitForJob(id, options))); + } + + /** Convenience: extract one file and await its result. */ + public async extractAndWait(params: ExtractAndWaitParams): Promise { + const extractParams: ExtractParams = { + file: params.file, + ...(params.options !== undefined ? { options: params.options } : {}), + ...(params.webhook !== undefined ? { webhook: params.webhook } : {}), + }; + const job = await this.extract(extractParams); + const waitOptions: WaitOptions = { + ...(params.timeoutMs !== undefined ? { timeoutMs: params.timeoutMs } : {}), + ...(params.pollIntervalMs !== undefined ? { pollIntervalMs: params.pollIntervalMs } : {}), + ...(params.backoff !== undefined ? { backoff: params.backoff } : {}), + }; + return this.waitForJob(job.id, waitOptions); + } + + /** + * Mint a temporary sandbox API key. Calls `POST /v1/sandbox/key` (raw + * fetch, not in the generated paths yet). 
+ */ + public async createSandboxKey(): Promise { + const response = await this.requestWithRetry("POST", SANDBOX_KEY_PATH); + return (await response.json()) as SandboxKey; + } + + /** + * Construct a client authenticated with a fresh sandbox key. Issues an + * unauthenticated request to `POST /v1/sandbox/key`, then returns a new + * `KreuzbergCloud` configured with the returned key. + */ + public static async fromSandbox(options: FromSandboxOptions = {}): Promise { + const bootstrap = new KreuzbergCloud(options); + const key = await bootstrap.createSandboxKey(); + return new KreuzbergCloud({ ...options, apiKey: key.api_key }); + } + + /** + * Issue a raw HTTP request with auth, timeout, and retry handling. Used + * internally for endpoints that require multipart bodies or aren't yet in + * the generated schema. + */ + private async requestWithRetry( + method: string, + path: string, + init: { body?: FormData | string | Uint8Array; headers?: Record } = {}, + ): Promise { + const url = `${this.baseUrl}${path}`; + let attempt = 0; + let interval = DEFAULT_RETRY_BACKOFF_BASE_MS; + for (;;) { + const headers = { ...this.headers, ...init.headers }; + const requestInit: RequestInit = { + method, + headers, + signal: AbortSignal.timeout(this.timeoutMs), + }; + if (init.body !== undefined) { + requestInit.body = init.body; + } + + let response: Response; + try { + response = await this.fetchImpl(url, requestInit); + } catch (cause) { + if (attempt < this.retries) { + attempt += 1; + await this.sleep(interval); + interval = nextBackoffInterval(interval, this.retryBackoff); + continue; + } + throw new KreuzbergError(`Network error contacting ${url}`, { + status: 0, + body: null, + cause, + }); + } + + if (response.ok) { + return response; + } + + if (this.retryOn.includes(response.status) && attempt < this.retries) { + attempt += 1; + const retryAfter = parseRetryAfterHeader(response.headers.get("retry-after")); + const wait = retryAfter !== undefined ? 
retryAfter * 1000 : interval; + await this.sleep(wait); + interval = nextBackoffInterval(interval, this.retryBackoff); + continue; + } + + try { + await raiseForStatus(response); + } catch (error) { + if ( + error instanceof RateLimitError && + error.retryAfter !== undefined && + attempt < this.retries + ) { + attempt += 1; + await this.sleep(error.retryAfter * 1000); + interval = nextBackoffInterval(interval, this.retryBackoff); + continue; + } + throw error; + } + // raiseForStatus always throws; this line is unreachable but satisfies TS. + throw new KreuzbergError("Unreachable", { status: response.status, body: null }); + } + } +} + +/** Backwards-compatible factory matching the original low-level API. */ +export interface CreateClientOptions { + baseUrl?: string; + apiKey?: string; + headers?: Record; + fetch?: typeof fetch; +} + export function createClient(options: CreateClientOptions = {}): KreuzbergCloudClient { const headers: Record = { "User-Agent": USER_AGENT, @@ -25,10 +376,74 @@ export function createClient(options: CreateClientOptions = {}): KreuzbergCloudC if (options.apiKey !== undefined) { headers["Authorization"] = `Bearer ${options.apiKey}`; } - return createOpenApiClient({ baseUrl: options.baseUrl ?? DEFAULT_BASE_URL, headers, - ...(options.fetch ? { fetch: options.fetch } : {}), + ...(options.fetch !== undefined ? 
{ fetch: options.fetch } : {}), + }); +} + +function defaultSleep(ms: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); }); } + +function nextBackoffInterval(current: number, strategy: BackoffStrategy): number { + if (strategy === "constant") { + return current; + } + return Math.min(current * DEFAULT_BACKOFF_FACTOR, DEFAULT_RETRY_BACKOFF_CAP_MS); +} + +function parseRetryAfterHeader(value: string | null): number | undefined { + if (value === null) { + return undefined; + } + const seconds = Number(value); + if (Number.isFinite(seconds) && seconds >= 0) { + return seconds; + } + const parsed = Date.parse(value); + if (Number.isNaN(parsed)) { + return undefined; + } + return Math.max(0, Math.ceil((parsed - Date.now()) / 1000)); +} + +/** Convert a {@link FileLike} input into a `Blob` plus best-guess filename. */ +function toBlob(file: FileLike): { blob: Blob; filename: string } { + if (typeof File !== "undefined" && file instanceof File) { + return { blob: file, filename: file.name }; + } + if (file instanceof Blob) { + return { blob: file, filename: "upload.bin" }; + } + if (file instanceof Uint8Array) { + return { + blob: new Blob([new Uint8Array(file)], { type: "application/octet-stream" }), + filename: "upload.bin", + }; + } + const wrapper = file; + const name = wrapper.name ?? "upload.bin"; + const mimeType = wrapper.mimeType ?? "application/octet-stream"; + if (wrapper.data instanceof Blob) { + return { blob: wrapper.data, filename: name }; + } + return { + blob: new Blob([new Uint8Array(wrapper.data)], { type: mimeType }), + filename: name, + }; +} + +/** Guess a display name for a file (used to populate `Job.filename`). */ +function describeFile(file: FileLike): string { + if (typeof File !== "undefined" && file instanceof File) { + return file.name; + } + if (file instanceof Blob || file instanceof Uint8Array) { + return "upload.bin"; + } + return file.name ?? 
"upload.bin"; +} diff --git a/packages/typescript/src/errors.ts b/packages/typescript/src/errors.ts new file mode 100644 index 0000000..730b191 --- /dev/null +++ b/packages/typescript/src/errors.ts @@ -0,0 +1,161 @@ +/** + * Error class hierarchy for Kreuzberg Cloud client failures. + * + * Every non-2xx HTTP response is mapped to a subclass of {@link KreuzbergError}. + * `TimeoutError` is also raised when {@link KreuzbergCloud.waitForJob} exceeds + * its configured deadline (distinct from a network/fetch timeout). + */ + +export interface KreuzbergErrorOptions { + status: number; + body: unknown; + cause?: unknown; +} + +export class KreuzbergError extends Error { + public readonly status: number; + public readonly body: unknown; + + public constructor(message: string, options: KreuzbergErrorOptions) { + const errorInit: ErrorOptions = options.cause === undefined ? {} : { cause: options.cause }; + super(message, errorInit); + this.name = "KreuzbergError"; + this.status = options.status; + this.body = options.body; + } +} + +export class AuthError extends KreuzbergError { + public constructor(message: string, options: KreuzbergErrorOptions) { + super(message, options); + this.name = "AuthError"; + } +} + +export interface RateLimitErrorOptions extends KreuzbergErrorOptions { + retryAfter?: number; +} + +export class RateLimitError extends KreuzbergError { + public readonly retryAfter?: number; + + public constructor(message: string, options: RateLimitErrorOptions) { + super(message, options); + this.name = "RateLimitError"; + if (options.retryAfter !== undefined) { + this.retryAfter = options.retryAfter; + } + } +} + +export class ValidationError extends KreuzbergError { + public constructor(message: string, options: KreuzbergErrorOptions) { + super(message, options); + this.name = "ValidationError"; + } +} + +export class NotFoundError extends KreuzbergError { + public constructor(message: string, options: KreuzbergErrorOptions) { + super(message, options); + 
this.name = "NotFoundError"; + } +} + +export class ServerError extends KreuzbergError { + public constructor(message: string, options: KreuzbergErrorOptions) { + super(message, options); + this.name = "ServerError"; + } +} + +export class TimeoutError extends KreuzbergError { + public constructor(message: string, options: KreuzbergErrorOptions) { + super(message, options); + this.name = "TimeoutError"; + } +} + +/** + * Map an HTTP response status to the matching error subclass and throw it. + * + * Always throws — call as `throw await raiseForStatus(response)` if you want + * the type system to see the unreachable code, otherwise just `await`. + */ +export async function raiseForStatus(response: Response): Promise { + const body = await readBody(response); + const message = extractErrorMessage(body) ?? `Request failed with status ${response.status}`; + const options: KreuzbergErrorOptions = { status: response.status, body }; + + if (response.status === 401 || response.status === 403) { + throw new AuthError(message, options); + } + if (response.status === 404) { + throw new NotFoundError(message, options); + } + if (response.status === 429) { + const retryAfterRaw = response.headers.get("retry-after"); + const retryAfter = parseRetryAfter(retryAfterRaw); + const opts: RateLimitErrorOptions = + retryAfter === undefined ? options : { ...options, retryAfter }; + throw new RateLimitError(message, opts); + } + if (response.status >= 400 && response.status < 500) { + throw new ValidationError(message, options); + } + if (response.status >= 500) { + throw new ServerError(message, options); + } + throw new KreuzbergError(message, options); +} + +async function readBody(response: Response): Promise { + const contentType = response.headers.get("content-type") ?? ""; + if (contentType.includes("application/json")) { + try { + return await response.json(); + } catch { + return undefined; + } + } + try { + const text = await response.text(); + return text === "" ? 
undefined : text; + } catch { + return undefined; + } +} + +function extractErrorMessage(body: unknown): string | undefined { + if (typeof body === "string" && body.length > 0) { + return body; + } + if (body !== null && typeof body === "object") { + const record = body as Record; + const error = record["error"]; + if (typeof error === "string" && error.length > 0) { + return error; + } + const message = record["message"]; + if (typeof message === "string" && message.length > 0) { + return message; + } + } + return undefined; +} + +function parseRetryAfter(value: string | null): number | undefined { + if (value === null) { + return undefined; + } + const seconds = Number(value); + if (Number.isFinite(seconds) && seconds >= 0) { + return seconds; + } + const parsed = Date.parse(value); + if (Number.isNaN(parsed)) { + return undefined; + } + const delta = Math.max(0, Math.ceil((parsed - Date.now()) / 1000)); + return delta; +} diff --git a/packages/typescript/src/index.ts b/packages/typescript/src/index.ts index dd6a4d9..a22c214 100644 --- a/packages/typescript/src/index.ts +++ b/packages/typescript/src/index.ts @@ -1,2 +1,41 @@ -export { createClient, type CreateClientOptions, type KreuzbergCloudClient } from "./client.js"; -export type { paths, components } from "./_generated/schema.js"; +export { + KreuzbergCloud, + createClient, + type BackoffStrategy, + type CreateClientOptions, + type ExtractAndWaitParams, + type ExtractBatchParams, + type ExtractParams, + type FileLike, + type FromSandboxOptions, + type KreuzbergCloudClient, + type KreuzbergCloudOptions, + type WaitOptions, +} from "./client.js"; +export { + AuthError, + KreuzbergError, + NotFoundError, + RateLimitError, + ServerError, + TimeoutError, + ValidationError, + type KreuzbergErrorOptions, + type RateLimitErrorOptions, +} from "./errors.js"; +export { + SUCCESS_JOB_STATUSES, + TERMINAL_JOB_STATUSES, + type DocumentInput, + type ExtractResponse, + type ExtractionConfig, + type ExtractionOptions, + 
type ExtractionResult, + type Job, + type JobResult, + type JobStatus, + type SandboxKey, + type UsageResponse, + type WebhookConfig, +} from "./types.js"; +export type { components, paths } from "./_generated/schema.js"; diff --git a/packages/typescript/src/types.ts b/packages/typescript/src/types.ts new file mode 100644 index 0000000..d30cbd0 --- /dev/null +++ b/packages/typescript/src/types.ts @@ -0,0 +1,55 @@ +/** + * Friendly re-exports of generated schema types. + * + * The generated `components["schemas"][...]` indexed access is verbose; this + * module provides the named aliases used by the public client API. + */ + +import type { components } from "./_generated/schema.js"; + +export type ExtractionOptions = components["schemas"]["ExtractionOptions"]; +export type ExtractionConfig = components["schemas"]["ExtractionConfig"]; +export type ExtractionResult = components["schemas"]["ExtractionResult"]; +export type Job = components["schemas"]["JobResponse"]; +export type JobStatus = components["schemas"]["JobStatus"]; +export type WebhookConfig = components["schemas"]["WebhookConfig"]; +export type ExtractResponse = components["schemas"]["ExtractResponse"]; +export type DocumentInput = components["schemas"]["DocumentInput"]; +export type UsageResponse = components["schemas"]["UsageResponse"]; + +/** + * A completed (or failed) job, returned by `waitForJob` once the job reaches a + * terminal status. Adds a non-nullable `result` when the status is + * `completed`/`partial_success` — callers can rely on `.result` without a + * runtime null check in the happy path. + */ +export type JobResult = Job; + +/** + * Sandbox API key returned by `POST /v1/sandbox/key`. + * + * The sandbox endpoint is not yet part of the OpenAPI spec; this type mirrors + * the documented response shape. 
+ */ +export interface SandboxKey { + api_key: string; + expires_at?: string; + project_id?: string; +} + +/** + * Terminal job statuses — once observed, polling stops and the job result is + * returned (or thrown, for `failed`/`cancelled`). + */ +export const TERMINAL_JOB_STATUSES: readonly JobStatus[] = [ + "completed", + "partial_success", + "failed", + "cancelled", +] as const; + +/** + * Job statuses that indicate a successful extraction. Used to distinguish a + * "completed but check warnings" result from a hard failure. + */ +export const SUCCESS_JOB_STATUSES: readonly JobStatus[] = ["completed", "partial_success"] as const; diff --git a/packages/typescript/tests/_helpers.ts b/packages/typescript/tests/_helpers.ts new file mode 100644 index 0000000..e01f4be --- /dev/null +++ b/packages/typescript/tests/_helpers.ts @@ -0,0 +1,12 @@ +import { setupServer } from "msw/node"; +import type { SetupServer } from "msw/node"; + +export const TEST_BASE_URL = "https://api.test.kreuzberg.cloud"; + +export function createTestServer(): SetupServer { + return setupServer(); +} + +export function url(path: string): string { + return `${TEST_BASE_URL}${path}`; +} diff --git a/packages/typescript/tests/errors.test.ts b/packages/typescript/tests/errors.test.ts new file mode 100644 index 0000000..3ca99cf --- /dev/null +++ b/packages/typescript/tests/errors.test.ts @@ -0,0 +1,162 @@ +import { HttpResponse, http } from "msw"; +import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest"; +import { KreuzbergCloud } from "../src/client.js"; +import { + AuthError, + KreuzbergError, + NotFoundError, + RateLimitError, + ServerError, + TimeoutError, + ValidationError, +} from "../src/errors.js"; +import { TEST_BASE_URL, createTestServer, url } from "./_helpers.js"; + +const server = createTestServer(); + +beforeAll(() => server.listen({ onUnhandledRequest: "error" })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +function makeClient(): 
KreuzbergCloud { + return new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + sleep: async () => {}, + }); +} + +describe("error mapping", () => { + it("maps 401 to AuthError", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => + HttpResponse.json({ error: "unauthorized" }, { status: 401 }), + ), + ); + await expect(makeClient().getJob("x")).rejects.toBeInstanceOf(AuthError); + }); + + it("maps 403 to AuthError", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => HttpResponse.json({ error: "forbidden" }, { status: 403 })), + ); + await expect(makeClient().getJob("x")).rejects.toBeInstanceOf(AuthError); + }); + + it("maps 404 to NotFoundError", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => HttpResponse.json({ error: "not found" }, { status: 404 })), + ); + await expect(makeClient().getJob("x")).rejects.toBeInstanceOf(NotFoundError); + }); + + it("maps 422 to ValidationError", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => HttpResponse.json({ error: "bad input" }, { status: 422 })), + ); + await expect(makeClient().getJob("x")).rejects.toBeInstanceOf(ValidationError); + }); + + it("maps 400 to ValidationError", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => + HttpResponse.json({ error: "bad request" }, { status: 400 }), + ), + ); + await expect(makeClient().getJob("x")).rejects.toBeInstanceOf(ValidationError); + }); + + it("maps 429 to RateLimitError with retryAfter parsed from Retry-After", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => + HttpResponse.json( + { error: "slow down" }, + { status: 429, headers: { "retry-after": "12" } }, + ), + ), + ); + try { + await makeClient().getJob("x"); + throw new Error("expected throw"); + } catch (error) { + expect(error).toBeInstanceOf(RateLimitError); + expect((error as RateLimitError).retryAfter).toBe(12); + } + }); + + it("treats 429 without Retry-After as RateLimitError with no retryAfter", async () => { + 
server.use( + http.get(url("/v1/jobs/x"), () => HttpResponse.json({ error: "slow down" }, { status: 429 })), + ); + try { + await makeClient().getJob("x"); + throw new Error("expected throw"); + } catch (error) { + expect(error).toBeInstanceOf(RateLimitError); + expect((error as RateLimitError).retryAfter).toBeUndefined(); + } + }); + + it("maps 500 to ServerError and surfaces the body", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => HttpResponse.json({ error: "boom" }, { status: 500 })), + ); + try { + await makeClient().getJob("x"); + throw new Error("expected throw"); + } catch (error) { + expect(error).toBeInstanceOf(ServerError); + expect((error as ServerError).status).toBe(500); + expect((error as ServerError).body).toEqual({ error: "boom" }); + } + }); + + it("maps 503 to ServerError", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => HttpResponse.json({ error: "down" }, { status: 503 })), + ); + await expect(makeClient().getJob("x")).rejects.toBeInstanceOf(ServerError); + }); + + it("uses the body's `error` field as the message when present", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => + HttpResponse.json({ error: "specific reason" }, { status: 500 }), + ), + ); + try { + await makeClient().getJob("x"); + throw new Error("expected throw"); + } catch (error) { + expect((error as Error).message).toContain("specific reason"); + } + }); + + it("falls back to a generic message when the body has no error/message", async () => { + server.use( + http.get(url("/v1/jobs/x"), () => + HttpResponse.text("oops", { status: 500, headers: { "content-type": "text/plain" } }), + ), + ); + try { + await makeClient().getJob("x"); + throw new Error("expected throw"); + } catch (error) { + expect((error as Error).message).toContain("oops"); + } + }); + + it("KreuzbergError preserves status and body fields", () => { + const e = new KreuzbergError("boom", { status: 418, body: { hint: "teapot" } }); + expect(e.status).toBe(418); + 
expect(e.body).toEqual({ hint: "teapot" }); + expect(e).toBeInstanceOf(Error); + }); + + it("TimeoutError is its own subclass and instanceof KreuzbergError", () => { + const e = new TimeoutError("nope", { status: 408, body: null }); + expect(e).toBeInstanceOf(TimeoutError); + expect(e).toBeInstanceOf(KreuzbergError); + expect(e).toBeInstanceOf(Error); + }); +}); diff --git a/packages/typescript/tests/extract-and-wait.test.ts b/packages/typescript/tests/extract-and-wait.test.ts new file mode 100644 index 0000000..b87865e --- /dev/null +++ b/packages/typescript/tests/extract-and-wait.test.ts @@ -0,0 +1,142 @@ +import { HttpResponse, http } from "msw"; +import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest"; +import { KreuzbergCloud } from "../src/client.js"; +import { TimeoutError } from "../src/errors.js"; +import { TEST_BASE_URL, createTestServer, url } from "./_helpers.js"; + +const server = createTestServer(); + +beforeAll(() => server.listen({ onUnhandledRequest: "error" })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +function makeClient(): KreuzbergCloud { + return new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + sleep: async () => {}, + }); +} + +describe("extractAndWait", () => { + it("submits, polls, and returns the completed result", async () => { + let polls = 0; + server.use( + http.post(url("/v1/extract"), () => + HttpResponse.json({ job_ids: ["job-9"], status: "pending" }, { status: 202 }), + ), + http.get(url("/v1/jobs/job-9"), () => { + polls += 1; + const status = polls < 2 ? "processing" : "completed"; + return HttpResponse.json( + { + id: "job-9", + filename: "x.pdf", + status, + created_at: "2026-05-09T10:00:00Z", + ...(status === "completed" + ? 
{ result: { content: "Hello world", mime_type: "text/plain" } } + : {}), + }, + { status: 200 }, + ); + }), + ); + + const client = makeClient(); + const result = await client.extractAndWait({ + file: new Blob(["data"]), + pollIntervalMs: 1, + }); + expect(result.status).toBe("completed"); + expect(result.result?.content).toBe("Hello world"); + }); + + it("forwards extraction options through to the extract call", async () => { + let optionsField: string | null = null; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + const value = form.get("options"); + optionsField = typeof value === "string" ? value : null; + return HttpResponse.json({ job_ids: ["job-1"], status: "pending" }, { status: 202 }); + }), + http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json( + { + id: "job-1", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ), + ), + ); + const client = makeClient(); + await client.extractAndWait({ + file: new Blob(["data"]), + options: { extraction_config: { output_format: "markdown" } }, + }); + expect(optionsField).not.toBeNull(); + expect(JSON.parse(optionsField as unknown as string)).toEqual({ + extraction_config: { output_format: "markdown" }, + }); + }); + + it("throws KreuzbergError when the polled job ends as failed", async () => { + server.use( + http.post(url("/v1/extract"), () => + HttpResponse.json({ job_ids: ["job-f"], status: "pending" }, { status: 202 }), + ), + http.get(url("/v1/jobs/job-f"), () => + HttpResponse.json( + { + id: "job-f", + filename: "x.pdf", + status: "failed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ), + ), + ); + const client = makeClient(); + await expect(client.extractAndWait({ file: new Blob(["data"]) })).rejects.toThrow(/failed/); + }); + + it("propagates TimeoutError when waitForJob deadline is exceeded", async () => { + server.use( + http.post(url("/v1/extract"), () => + 
HttpResponse.json({ job_ids: ["job-t"], status: "pending" }, { status: 202 }), + ), + http.get(url("/v1/jobs/job-t"), () => + HttpResponse.json( + { + id: "job-t", + filename: "x.pdf", + status: "processing", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ), + ), + ); + const client = makeClient(); + await expect( + client.extractAndWait({ + file: new Blob(["data"]), + timeoutMs: 1, + pollIntervalMs: 1, + }), + ).rejects.toBeInstanceOf(TimeoutError); + }); + + it("propagates AuthError surfaced from the extract submission", async () => { + server.use( + http.post(url("/v1/extract"), () => HttpResponse.json({ error: "no auth" }, { status: 401 })), + ); + const client = makeClient(); + await expect(client.extractAndWait({ file: new Blob(["data"]) })).rejects.toThrow(/no auth/); + }); +}); diff --git a/packages/typescript/tests/extract.test.ts b/packages/typescript/tests/extract.test.ts new file mode 100644 index 0000000..493e8ba --- /dev/null +++ b/packages/typescript/tests/extract.test.ts @@ -0,0 +1,201 @@ +import { HttpResponse, http } from "msw"; +import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest"; +import { KreuzbergCloud } from "../src/client.js"; +import { TEST_BASE_URL, createTestServer, url } from "./_helpers.js"; + +const server = createTestServer(); + +beforeAll(() => server.listen({ onUnhandledRequest: "error" })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +function makeClient(): KreuzbergCloud { + return new KreuzbergCloud({ + apiKey: "test-key", + baseUrl: TEST_BASE_URL, + sleep: async () => {}, + }); +} + +describe("extract", () => { + it("posts a single file and returns a pending Job with the assigned id", async () => { + let receivedAuth: string | null = null; + let receivedContentType: string | null = null; + server.use( + http.post(url("/v1/extract"), ({ request }) => { + receivedAuth = request.headers.get("authorization"); + receivedContentType = 
request.headers.get("content-type"); + return HttpResponse.json({ job_ids: ["job-123"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + const job = await client.extract({ + file: new Blob([new Uint8Array([1, 2, 3])], { type: "application/pdf" }), + }); + + expect(job.id).toBe("job-123"); + expect(job.status).toBe("pending"); + expect(receivedAuth).toBe("Bearer test-key"); + expect(receivedContentType).toMatch(/^multipart\/form-data/); + }); + + it("accepts a File and preserves its filename in the multipart body", async () => { + let receivedFilenames: string[] = []; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + receivedFilenames = form + .getAll("file") + .filter((v): v is File => v instanceof File) + .map((f) => f.name); + return HttpResponse.json({ job_ids: ["job-A"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + const file = new File([new Uint8Array([10, 20])], "invoice.pdf", { type: "application/pdf" }); + await client.extract({ file }); + + expect(receivedFilenames).toEqual(["invoice.pdf"]); + }); + + it("accepts a Uint8Array with default filename upload.bin", async () => { + let filenames: string[] = []; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + filenames = form + .getAll("file") + .filter((v): v is File => v instanceof File) + .map((f) => f.name); + return HttpResponse.json({ job_ids: ["job-X"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + await client.extract({ file: new Uint8Array([1, 2, 3, 4]) }); + expect(filenames).toEqual(["upload.bin"]); + }); + + it("accepts a wrapper { name, data } and uses the provided name", async () => { + let filenames: string[] = []; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + filenames = form + 
.getAll("file") + .filter((v): v is File => v instanceof File) + .map((f) => f.name); + return HttpResponse.json({ job_ids: ["job-X"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + await client.extract({ file: { name: "report.pdf", data: new Uint8Array([1]) } }); + expect(filenames).toEqual(["report.pdf"]); + }); + + it("serializes options as a JSON multipart part", async () => { + let optionsField: string | null = null; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + const value = form.get("options"); + optionsField = typeof value === "string" ? value : null; + return HttpResponse.json({ job_ids: ["job-x"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + await client.extract({ + file: new Blob(["hi"]), + options: { extraction_config: { output_format: "markdown" } }, + }); + + expect(optionsField).not.toBeNull(); + expect(JSON.parse(optionsField as unknown as string)).toEqual({ + extraction_config: { output_format: "markdown" }, + }); + }); + + it("sends an empty webhook stub when none is provided", async () => { + let webhookField: string | null = null; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + const value = form.get("webhook"); + webhookField = typeof value === "string" ? value : null; + return HttpResponse.json({ job_ids: ["job-w"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + await client.extract({ file: new Blob(["hi"]) }); + expect(webhookField).toBe(JSON.stringify({ url: "" })); + }); + + it("forwards an explicit webhook config in the multipart body", async () => { + let webhookField: string | null = null; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + const value = form.get("webhook"); + webhookField = typeof value === "string" ? 
value : null; + return HttpResponse.json({ job_ids: ["job-w"], status: "pending" }, { status: 202 }); + }), + ); + + const client = makeClient(); + await client.extract({ + file: new Blob(["hi"]), + webhook: { url: "https://example.com/hook", secret: "shh" }, + }); + + expect(webhookField).not.toBeNull(); + expect(JSON.parse(webhookField as unknown as string)).toEqual({ + url: "https://example.com/hook", + secret: "shh", + }); + }); + + it("extractBatch posts multiple files in a single request", async () => { + let fileCount = 0; + server.use( + http.post(url("/v1/extract"), async ({ request }) => { + const form = await request.formData(); + fileCount = form.getAll("file").length; + return HttpResponse.json( + { job_ids: ["job-1", "job-2", "job-3"], status: "pending" }, + { status: 202 }, + ); + }), + ); + + const client = makeClient(); + const jobs = await client.extractBatch({ + files: [new Blob(["a"]), new Blob(["b"]), new Blob(["c"])], + }); + + expect(fileCount).toBe(3); + expect(jobs.map((j) => j.id)).toEqual(["job-1", "job-2", "job-3"]); + }); + + it("extractBatch throws when called with no files", async () => { + const client = makeClient(); + await expect(client.extractBatch({ files: [] })).rejects.toThrow(/no files/); + }); + + it("extractBatch throws when the server returns fewer job IDs than files", async () => { + server.use( + http.post(url("/v1/extract"), () => + HttpResponse.json({ job_ids: ["only-one"], status: "pending" }, { status: 202 }), + ), + ); + + const client = makeClient(); + await expect( + client.extractBatch({ files: [new Blob(["a"]), new Blob(["b"])] }), + ).rejects.toThrow(/job IDs/); + }); +}); diff --git a/packages/typescript/tests/jobs.test.ts b/packages/typescript/tests/jobs.test.ts new file mode 100644 index 0000000..3702a1d --- /dev/null +++ b/packages/typescript/tests/jobs.test.ts @@ -0,0 +1,193 @@ +import { HttpResponse, http } from "msw"; +import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest"; +import 
{ KreuzbergCloud } from "../src/client.js"; +import { TimeoutError } from "../src/errors.js"; +import type { Job } from "../src/types.js"; +import { TEST_BASE_URL, createTestServer, url } from "./_helpers.js"; + +const server = createTestServer(); + +beforeAll(() => server.listen({ onUnhandledRequest: "error" })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +const FIXED_NOW = "2026-05-09T10:00:00Z"; + +function makeJob(overrides: Partial & { id: string; status: Job["status"] }): Job { + return { + id: overrides.id, + filename: overrides.filename ?? "x.pdf", + status: overrides.status, + created_at: overrides.created_at ?? FIXED_NOW, + ...(overrides.result !== undefined ? { result: overrides.result } : {}), + }; +} + +function makeClient(sleeps: number[] = []): KreuzbergCloud { + const recorded = sleeps; + return new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + sleep: async (ms) => { + recorded.push(ms); + }, + }); +} + +describe("getJob", () => { + it("fetches and returns the job record", async () => { + server.use( + http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json(makeJob({ id: "job-1", status: "completed" }), { status: 200 }), + ), + ); + const client = makeClient(); + const job = await client.getJob("job-1"); + expect(job.id).toBe("job-1"); + expect(job.status).toBe("completed"); + }); + + it("URL-encodes the job ID path segment", async () => { + let receivedPath: string | null = null; + server.use( + http.get(url("/v1/jobs/:id"), ({ params, request }) => { + receivedPath = new URL(request.url).pathname; + return HttpResponse.json(makeJob({ id: String(params["id"]), status: "completed" }), { + status: 200, + }); + }), + ); + const client = makeClient(); + await client.getJob("a/b c"); + expect(receivedPath).toBe("/v1/jobs/a%2Fb%20c"); + }); +}); + +describe("waitForJob", () => { + it("returns immediately when the first poll already shows completed", async () => { + server.use( + 
http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json(makeJob({ id: "job-1", status: "completed" }), { status: 200 }), + ), + ); + const client = makeClient(); + const result = await client.waitForJob("job-1"); + expect(result.status).toBe("completed"); + }); + + it("polls until the job reaches a terminal status", async () => { + let calls = 0; + server.use( + http.get(url("/v1/jobs/job-1"), () => { + calls += 1; + const status: Job["status"] = calls < 3 ? "processing" : "completed"; + return HttpResponse.json(makeJob({ id: "job-1", status }), { status: 200 }); + }), + ); + const client = makeClient(); + const result = await client.waitForJob("job-1", { pollIntervalMs: 5 }); + expect(result.status).toBe("completed"); + expect(calls).toBe(3); + }); + + it("treats partial_success as a successful terminal status", async () => { + server.use( + http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json(makeJob({ id: "job-1", status: "partial_success" }), { status: 200 }), + ), + ); + const client = makeClient(); + const result = await client.waitForJob("job-1"); + expect(result.status).toBe("partial_success"); + }); + + it("throws KreuzbergError when the job ends as failed", async () => { + server.use( + http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json(makeJob({ id: "job-1", status: "failed" }), { status: 200 }), + ), + ); + const client = makeClient(); + await expect(client.waitForJob("job-1")).rejects.toThrow(/failed/); + }); + + it("throws KreuzbergError when the job ends as cancelled", async () => { + server.use( + http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json(makeJob({ id: "job-1", status: "cancelled" }), { status: 200 }), + ), + ); + const client = makeClient(); + await expect(client.waitForJob("job-1")).rejects.toThrow(/cancelled/); + }); + + it("throws TimeoutError when the wait deadline is exceeded", async () => { + server.use( + http.get(url("/v1/jobs/job-1"), () => + HttpResponse.json(makeJob({ id: "job-1", status: "processing" }), { 
status: 200 }), + ), + ); + const client = makeClient(); + await expect( + client.waitForJob("job-1", { timeoutMs: 1, pollIntervalMs: 1 }), + ).rejects.toBeInstanceOf(TimeoutError); + }); + + it("uses exponential backoff between polls by default", async () => { + const recorded: number[] = []; + let calls = 0; + server.use( + http.get(url("/v1/jobs/job-1"), () => { + calls += 1; + const status: Job["status"] = calls < 4 ? "processing" : "completed"; + return HttpResponse.json(makeJob({ id: "job-1", status }), { status: 200 }); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + sleep: async (ms) => { + recorded.push(ms); + }, + }); + await client.waitForJob("job-1", { pollIntervalMs: 10 }); + // 3 sleeps before the 4th poll returns completed; intervals 10, 20, 40 + expect(recorded).toEqual([10, 20, 40]); + }); + + it("uses constant intervals when backoff: 'constant' is set", async () => { + const recorded: number[] = []; + let calls = 0; + server.use( + http.get(url("/v1/jobs/job-1"), () => { + calls += 1; + const status: Job["status"] = calls < 4 ? 
"processing" : "completed"; + return HttpResponse.json(makeJob({ id: "job-1", status }), { status: 200 }); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + sleep: async (ms) => { + recorded.push(ms); + }, + }); + await client.waitForJob("job-1", { pollIntervalMs: 10, backoff: "constant" }); + expect(recorded).toEqual([10, 10, 10]); + }); + + it("waitForJobs resolves all jobs in parallel", async () => { + const completedFor = new Set(); + server.use( + http.get(url("/v1/jobs/:id"), ({ params }) => { + const id = String(params["id"]); + completedFor.add(id); + return HttpResponse.json(makeJob({ id, status: "completed" }), { status: 200 }); + }), + ); + const client = makeClient(); + const results = await client.waitForJobs(["a", "b", "c"]); + expect(results.map((r) => r.id).sort()).toEqual(["a", "b", "c"]); + expect([...completedFor].sort()).toEqual(["a", "b", "c"]); + }); +}); diff --git a/packages/typescript/tests/retry.test.ts b/packages/typescript/tests/retry.test.ts new file mode 100644 index 0000000..d83220b --- /dev/null +++ b/packages/typescript/tests/retry.test.ts @@ -0,0 +1,230 @@ +import { HttpResponse, http } from "msw"; +import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest"; +import { KreuzbergCloud } from "../src/client.js"; +import { RateLimitError, ServerError } from "../src/errors.js"; +import { TEST_BASE_URL, createTestServer, url } from "./_helpers.js"; + +const server = createTestServer(); + +beforeAll(() => server.listen({ onUnhandledRequest: "error" })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +describe("retry/backoff", () => { + it("retries 503 responses up to the configured limit and succeeds", async () => { + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + if (attempts < 3) { + return HttpResponse.json({ error: "down" }, { status: 503 }); + } + return HttpResponse.json( + { + id: "x", + filename: 
"x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 5, + sleep: async () => {}, + }); + const job = await client.getJob("x"); + expect(job.id).toBe("x"); + expect(attempts).toBe(3); + }); + + it("retries 429 responses honoring Retry-After (seconds)", async () => { + const sleeps: number[] = []; + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + if (attempts === 1) { + return HttpResponse.json( + { error: "wait" }, + { status: 429, headers: { "retry-after": "3" } }, + ); + } + return HttpResponse.json( + { + id: "x", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 1, + sleep: async (ms) => { + sleeps.push(ms); + }, + }); + await client.getJob("x"); + expect(sleeps).toEqual([3000]); + }); + + it("uses exponential backoff intervals when Retry-After is absent", async () => { + const sleeps: number[] = []; + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + if (attempts < 4) { + return HttpResponse.json({ error: "down" }, { status: 502 }); + } + return HttpResponse.json( + { + id: "x", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 5, + retryBackoff: "exponential", + sleep: async (ms) => { + sleeps.push(ms); + }, + }); + await client.getJob("x"); + // Three retries: 200, 400, 800 (base 200 doubling) + expect(sleeps).toEqual([200, 400, 800]); + }); + + it("throws when retries are exhausted", async () => { + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + return HttpResponse.json({ 
error: "still down" }, { status: 503 }); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 2, + sleep: async () => {}, + }); + await expect(client.getJob("x")).rejects.toBeInstanceOf(ServerError); + expect(attempts).toBe(3); // initial + 2 retries + }); + + it("does not retry on 401", async () => { + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + return HttpResponse.json({ error: "no auth" }, { status: 401 }); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 5, + sleep: async () => {}, + }); + await expect(client.getJob("x")).rejects.toThrow(); + expect(attempts).toBe(1); + }); + + it("retries on a custom retryOn list", async () => { + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + if (attempts === 1) { + return HttpResponse.json({ error: "teapot" }, { status: 418 }); + } + return HttpResponse.json( + { + id: "x", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 2, + retryOn: [418], + sleep: async () => {}, + }); + const job = await client.getJob("x"); + expect(job.id).toBe("x"); + expect(attempts).toBe(2); + }); + + it("constant backoff keeps the same delay across attempts", async () => { + const sleeps: number[] = []; + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + if (attempts < 4) { + return HttpResponse.json({ error: "down" }, { status: 502 }); + } + return HttpResponse.json( + { + id: "x", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 5, + retryBackoff: "constant", + sleep: async (ms) => { + sleeps.push(ms); + 
}, + }); + await client.getJob("x"); + expect(sleeps).toEqual([200, 200, 200]); + }); + + it("propagates RateLimitError after exhausting retries", async () => { + let attempts = 0; + server.use( + http.get(url("/v1/jobs/x"), () => { + attempts += 1; + return HttpResponse.json({ error: "wait" }, { status: 429 }); + }), + ); + const client = new KreuzbergCloud({ + apiKey: "k", + baseUrl: TEST_BASE_URL, + retries: 1, + sleep: async () => {}, + }); + await expect(client.getJob("x")).rejects.toBeInstanceOf(RateLimitError); + expect(attempts).toBe(2); + }); +}); diff --git a/packages/typescript/tests/sandbox.test.ts b/packages/typescript/tests/sandbox.test.ts new file mode 100644 index 0000000..f8756dc --- /dev/null +++ b/packages/typescript/tests/sandbox.test.ts @@ -0,0 +1,121 @@ +import { HttpResponse, http } from "msw"; +import { afterAll, afterEach, beforeAll, describe, expect, it } from "vitest"; +import { KreuzbergCloud } from "../src/client.js"; +import { RateLimitError } from "../src/errors.js"; +import { TEST_BASE_URL, createTestServer, url } from "./_helpers.js"; + +const server = createTestServer(); + +beforeAll(() => server.listen({ onUnhandledRequest: "error" })); +afterEach(() => server.resetHandlers()); +afterAll(() => server.close()); + +describe("createSandboxKey", () => { + it("calls POST /v1/sandbox/key and returns the key payload", async () => { + server.use( + http.post(url("/v1/sandbox/key"), () => + HttpResponse.json( + { api_key: "kz_sandbox_abc", expires_at: "2026-05-10T00:00:00Z" }, + { status: 200 }, + ), + ), + ); + const client = new KreuzbergCloud({ baseUrl: TEST_BASE_URL, sleep: async () => {} }); + const key = await client.createSandboxKey(); + expect(key.api_key).toBe("kz_sandbox_abc"); + expect(key.expires_at).toBe("2026-05-10T00:00:00Z"); + }); + + it("raises RateLimitError with retryAfter when the sandbox endpoint returns 429", async () => { + server.use( + http.post(url("/v1/sandbox/key"), () => + HttpResponse.json( + { error: "rate 
limit" }, + { status: 429, headers: { "retry-after": "5" } }, + ), + ), + ); + const client = new KreuzbergCloud({ baseUrl: TEST_BASE_URL, sleep: async () => {} }); + try { + await client.createSandboxKey(); + throw new Error("expected throw"); + } catch (error) { + expect(error).toBeInstanceOf(RateLimitError); + expect((error as RateLimitError).retryAfter).toBe(5); + } + }); +}); + +describe("KreuzbergCloud.fromSandbox", () => { + it("mints a sandbox key and returns a client configured with it", async () => { + const seenAuth: string[] = []; + server.use( + http.post(url("/v1/sandbox/key"), ({ request }) => { + seenAuth.push(request.headers.get("authorization") ?? ""); + return HttpResponse.json({ api_key: "kz_sandbox_minted" }, { status: 200 }); + }), + http.get(url("/v1/jobs/job-1"), ({ request }) => { + seenAuth.push(request.headers.get("authorization") ?? ""); + return HttpResponse.json( + { + id: "job-1", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + + const client = await KreuzbergCloud.fromSandbox({ + baseUrl: TEST_BASE_URL, + }); + await client.getJob("job-1"); + + expect(seenAuth[0]).toBe(""); // bootstrap call has no auth + expect(seenAuth[1]).toBe("Bearer kz_sandbox_minted"); + }); + + it("propagates baseUrl, headers, and timeout config to the returned client", async () => { + const seenHeaders: Record = {}; + server.use( + http.post(url("/v1/sandbox/key"), () => + HttpResponse.json({ api_key: "kz_minted" }, { status: 200 }), + ), + http.get(url("/v1/jobs/x"), ({ request }) => { + seenHeaders["x-custom"] = request.headers.get("x-custom"); + return HttpResponse.json( + { + id: "x", + filename: "x.pdf", + status: "completed", + created_at: "2026-05-09T10:00:00Z", + }, + { status: 200 }, + ); + }), + ); + + const client = await KreuzbergCloud.fromSandbox({ + baseUrl: TEST_BASE_URL, + headers: { "x-custom": "yes" }, + }); + await client.getJob("x"); + 
expect(seenHeaders["x-custom"]).toBe("yes"); + }); + + it("propagates a 401 from the sandbox endpoint as AuthError", async () => { + server.use( + http.post(url("/v1/sandbox/key"), () => + HttpResponse.json({ error: "denied" }, { status: 401 }), + ), + ); + await expect(KreuzbergCloud.fromSandbox({ baseUrl: TEST_BASE_URL })).rejects.toThrow(/denied/); + }); + + it("makeClient (default base URL) constructs without throwing", () => { + const client = new KreuzbergCloud({ apiKey: "k" }); + expect(client).toBeInstanceOf(KreuzbergCloud); + }); +}); diff --git a/pyproject.toml b/pyproject.toml index 11ca930..6b8d65d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,14 @@ name = "kreuzberg-cloud-sdk-workspace" version = "0.0.0" description = "Workspace root for the kreuzberg-cloud SDK monorepo" requires-python = ">=3.10" +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", +] [dependency-groups] dev = [ @@ -21,9 +29,9 @@ dev = [ ] [tool.uv] +sources.kreuzberg-cloud = { workspace = true } package = false workspace.members = [ "packages/python" ] -sources.kreuzberg-cloud = { workspace = true } cache-keys = [ { file = "pyproject.toml" }, { file = "packages/python/pyproject.toml" }, @@ -107,10 +115,7 @@ implicit_reexport = false show_error_codes = true strict = true namespace_packages = true - -[[tool.mypy.overrides]] -module = [ "httpx", "respx" ] -ignore_missing_imports = true +overrides = [ { module = [ "httpx", "respx" ], ignore_missing_imports = true } ] [tool.pytest] ini_options.testpaths = [ "packages/python/tests" ] @@ -121,8 +126,8 @@ ini_options.asyncio_default_fixture_loop_scope = "function" [tool.coverage] run.branch = true run.omit = [ - "packages/python/tests/*", "packages/python/src/kreuzberg_cloud/_generated/*", + 
"packages/python/tests/*", ] run.plugins = [ "covdefaults" ] run.source = [ "packages/python/src/kreuzberg_cloud" ] diff --git a/scripts/sync-versions.py b/scripts/sync-versions.py index 53732e5..211cffc 100755 --- a/scripts/sync-versions.py +++ b/scripts/sync-versions.py @@ -5,9 +5,11 @@ ``task version:sync``) after bumping it to propagate to every language package. Affected files: - - packages/python/pyproject.toml (``project.version``) - - packages/typescript/package.json (``version``) -Go module versions live in git tags only and are intentionally not touched. + - packages/python/pyproject.toml (``project.version``) + - packages/typescript/package.json (``version``) + - packages/python/src/kreuzberg_cloud/__init__.py (``__version__``) + - packages/go/v1/version.go (``const Version``) +Go module versions for the module path itself live in git tags only. """ from __future__ import annotations @@ -21,11 +23,14 @@ VERSION_FILE = REPO_ROOT / "VERSION" PYTHON_PYPROJECT = REPO_ROOT / "packages" / "python" / "pyproject.toml" TYPESCRIPT_PACKAGE = REPO_ROOT / "packages" / "typescript" / "package.json" +PYTHON_INIT = REPO_ROOT / "packages" / "python" / "src" / "kreuzberg_cloud" / "__init__.py" +GO_VERSION = REPO_ROOT / "packages" / "go" / "v1" / "version.go" VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+(?:[-+][\w.+-]+)?$") def read_version() -> str: + """Return the canonical version string from the repo-root VERSION file.""" raw = VERSION_FILE.read_text(encoding="utf-8").strip() if not VERSION_PATTERN.match(raw): sys.exit(f"VERSION file contains invalid semver: {raw!r}") @@ -33,6 +38,7 @@ def read_version() -> str: def update_pyproject(path: Path, version: str) -> bool: + """Rewrite the ``project.version`` line in a pyproject.toml; return True if changed.""" text = path.read_text(encoding="utf-8") new_text, count = re.subn( r'(?m)^(version\s*=\s*")[^"]+(")', @@ -49,6 +55,7 @@ def update_pyproject(path: Path, version: str) -> bool: def update_package_json(path: Path, 
version: str) -> bool: + """Rewrite the ``version`` field in a package.json; return True if changed.""" raw = path.read_text(encoding="utf-8") data = json.loads(raw) if data.get("version") == version: @@ -58,17 +65,56 @@ def update_package_json(path: Path, version: str) -> bool: return True +def update_python_init(path: Path, version: str) -> bool: + """Rewrite the ``__version__`` literal in the Python package __init__.py; return True if changed.""" + text = path.read_text(encoding="utf-8") + new_text, count = re.subn( + r'(?m)^(__version__\s*=\s*")[^"]+(")', + rf"\g<1>{version}\g<2>", + text, + count=1, + ) + if count == 0: + sys.exit(f"no __version__ line found in {path}") + if new_text == text: + return False + path.write_text(new_text, encoding="utf-8") + return True + + +def update_go_version(path: Path, version: str) -> bool: + """Rewrite the ``const Version`` literal in the Go version.go; return True if changed.""" + text = path.read_text(encoding="utf-8") + new_text, count = re.subn( + r'(?m)^(const Version\s*=\s*")[^"]+(")', + rf"\g<1>{version}\g<2>", + text, + count=1, + ) + if count == 0: + sys.exit(f"no `const Version` line found in {path}") + if new_text == text: + return False + path.write_text(new_text, encoding="utf-8") + return True + + def main() -> int: + """Propagate the root VERSION value to every per-package manifest.""" version = read_version() changed: list[str] = [] if update_pyproject(PYTHON_PYPROJECT, version): changed.append(str(PYTHON_PYPROJECT.relative_to(REPO_ROOT))) if update_package_json(TYPESCRIPT_PACKAGE, version): changed.append(str(TYPESCRIPT_PACKAGE.relative_to(REPO_ROOT))) + if update_python_init(PYTHON_INIT, version): + changed.append(str(PYTHON_INIT.relative_to(REPO_ROOT))) + if update_go_version(GO_VERSION, version): + changed.append(str(GO_VERSION.relative_to(REPO_ROOT))) if changed: - print(f"synced version {version} -> {', '.join(changed)}") + print(f"synced version {version} -> {', '.join(changed)}") # noqa: T201 else: 
- print(f"version {version} already in sync") + print(f"version {version} already in sync") # noqa: T201 return 0 diff --git a/spec/openapi.yaml b/spec/openapi.yaml index b97ff59..d839429 100644 --- a/spec/openapi.yaml +++ b/spec/openapi.yaml @@ -1,3246 +1,1752 @@ openapi: 3.1.0 info: - title: Kreuzberg Backend API - description: Backend API for Kreuzberg Cloud - document processing SaaS platform - contact: - name: Kreuzberg Cloud - url: https://kreuzberg.cloud - license: - name: BUSL-1.1 - identifier: BUSL-1.1 - version: 0.1.0 + title: Kreuzberg Cloud API + description: Cloud document extraction API powered by Kreuzberg. Supports asynchronous extraction with webhook delivery. + contact: + name: Kreuzberg Cloud + url: https://kreuzberg.dev + license: + name: BUSL-1.1 + version: 1.0.0 +servers: + - url: https://api.kreuzberg.cloud + description: Production API + - url: https://api.staging.kreuzberg.cloud + description: Staging API paths: - /auth/account: - delete: - tags: - - auth - summary: Delete the authenticated user's account and all associated data - description: "Soft-deletes all projects owned by the user, revokes their API - keys, - - soft-deletes their webhooks, and removes all project memberships." 
- operationId: delete_account - responses: - "204": - description: Account deleted successfully - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /auth/login: - post: - tags: - - auth - summary: Authenticate with an OIDC ID token and receive a backend JWT - operationId: login - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/LoginRequest" - required: true - responses: - "200": - description: Successful login - content: - application/json: - schema: - $ref: "#/components/schemas/LoginResponse" - "401": - description: Invalid OIDC ID token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Authentication is disabled - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "503": - description: Identity provider service unavailable - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - /healthz: - get: - tags: - - health - summary: Liveness probe - returns 200 if the process is running - operationId: healthz - responses: - "200": - description: Service is alive - content: - application/json: - schema: - $ref: "#/components/schemas/HealthResponse" - example: - status: ok - /readyz: - get: - tags: - - health - summary: Readiness probe - returns 200 if the service can handle traffic - operationId: readyz - responses: - "200": - description: Service is ready - content: - application/json: - schema: - $ref: "#/components/schemas/ReadinessResponse" - example: - checks: - database: ok - status: ready - "503": - description: Service is not ready - content: - application/json: - schema: - $ref: "#/components/schemas/ReadinessResponse" - /v1/invitations/accept: - post: - tags: - - members - summary: Accept an invitation using a token - operationId: accept_invitation - requestBody: - content: - 
application/json: - schema: - $ref: "#/components/schemas/AcceptInvitationRequest" - required: true - responses: - "200": - description: Invitation accepted, membership created - content: - application/json: - schema: - $ref: "#/components/schemas/MemberResponse" - "400": - description: Invalid or expired token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "409": - description: User is already a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects: - get: - tags: - - projects - summary: List all projects for the authenticated user - operationId: list_projects - parameters: - - name: limit - in: query - description: Number of items to return (default 100, max 1000) - required: false - schema: - type: integer - format: int32 - example: 100 - - name: offset - in: query - description: Number of items to skip (default 0) - required: false - schema: - type: integer - format: int32 - example: 0 - responses: - "200": - description: List of projects - content: - application/json: - schema: - $ref: "#/components/schemas/PaginatedResponse_ProjectResponse" - "400": - description: Invalid pagination parameters - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - post: - tags: - - projects - summary: Create a new project - operationId: create_project - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/CreateProjectRequest" - required: true - responses: - "201": - description: Project created - content: - application/json: - schema: - $ref: "#/components/schemas/ProjectResponse" - 
"400": - description: Validation failed - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "409": - description: Project slug already exists - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}: - get: - tags: - - projects - summary: Get project details by ID - operationId: get_project - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: Project details - content: - application/json: - schema: - $ref: "#/components/schemas/ProjectResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - delete: - tags: - - projects - summary: Delete a project - operationId: delete_project - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "204": - description: Project deleted - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - patch: - tags: - - projects - summary: Update 
project details - operationId: update_project - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/UpdateProjectRequest" - required: true - responses: - "200": - description: Project updated - content: - application/json: - schema: - $ref: "#/components/schemas/ProjectResponse" - "400": - description: Validation failed - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/analytics: - get: - tags: - - analytics - summary: Get analytics data for a project within a date range - description: "Returns daily usage metrics and processing time percentiles aggregated - - from extraction jobs. Includes job counts, pages/tables/images extracted, - - file type breakdowns, and p50/p95/p99 processing latencies." 
- operationId: get_analytics - parameters: - - name: start_date - in: query - description: Start date (ISO 8601 format, e.g., "2025-01-01") - required: true - schema: - type: string - example: "2025-01-01" - - name: end_date - in: query - description: End date (ISO 8601 format, e.g., "2025-01-31") - required: true - schema: - type: string - example: "2025-01-31" - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: Analytics data - content: - application/json: - schema: - $ref: "#/components/schemas/AnalyticsResponse" - "400": - description: Invalid date format or range - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/api-keys: - get: - tags: - - api_keys - summary: List all API keys for a project - operationId: list_api_keys - parameters: - - name: limit - in: query - description: Number of items to return (default 100, max 1000) - required: false - schema: - type: integer - format: int32 - example: 100 - - name: offset - in: query - description: Number of items to skip (default 0) - required: false - schema: - type: integer - format: int32 - example: 0 - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: List of API keys - content: - application/json: - schema: - type: array - items: - $ref: "#/components/schemas/ApiKeyResponse" - "400": - description: Invalid pagination parameters - content: - application/json: - schema: - $ref: 
"#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - post: - tags: - - api_keys - summary: Create a new API key for a project - operationId: create_api_key - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/CreateApiKeyRequest" - required: true - responses: - "201": - description: API key created - content: - application/json: - schema: - $ref: "#/components/schemas/CreateApiKeyResponse" - "400": - description: Validation failed - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/api-keys/{key_id}: - delete: - tags: - - api_keys - summary: Revoke an API key - operationId: revoke_api_key - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: key_id - in: path - description: API Key ID - required: true - schema: - type: string - format: uuid - responses: - "204": - description: API key revoked - "401": - description: Unauthorized - no auth 
token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or API key not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/api-keys/{key_id}/regenerate: - post: - tags: - - api_keys - summary: Regenerate an API key - operationId: regenerate_api_key - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: key_id - in: path - description: API Key ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: API key regenerated - content: - application/json: - schema: - $ref: "#/components/schemas/CreateApiKeyResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or API key not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/billing: - get: - tags: - - billing - summary: Get billing and quota information for a project - description: "Returns the project's plan limits, current usage for the billing - period, - - and remaining quota. Useful for rendering usage progress bars on the - - frontend dashboard." 
- operationId: get_billing - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: Billing and quota information - content: - application/json: - schema: - $ref: "#/components/schemas/BillingResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/billing/checkout: - post: - tags: - - billing - summary: Create a Stripe Checkout Session for a project - description: "Creates (or reuses) a Stripe customer for the project and opens - a - - Checkout Session in subscription mode so the user can add a payment - - method and subscribe to the standard plan." 
- operationId: create_checkout - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: Checkout Session URL - content: - application/json: - schema: - $ref: "#/components/schemas/CheckoutResponse" - "400": - description: Bad request - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "409": - description: Subscription already active - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/billing/portal: - post: - tags: - - billing - summary: Create a Stripe Customer Portal Session for a project - description: "Opens a Customer Portal session so the user can manage their subscription, - - update payment methods, or view billing history." 
- operationId: create_portal - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: Customer Portal URL - content: - application/json: - schema: - $ref: "#/components/schemas/PortalResponse" - "400": - description: No billing account - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/invitations: - get: - tags: - - members - summary: List pending invitations for a project - operationId: list_invitations - parameters: - - name: limit - in: query - description: Number of items to return (default 100, max 1000) - required: false - schema: - type: integer - format: int32 - example: 100 - - name: offset - in: query - description: Number of items to skip (default 0) - required: false - schema: - type: integer - format: int32 - example: 0 - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: List of pending invitations - content: - application/json: - schema: - $ref: "#/components/schemas/PaginatedResponse_InvitationResponse" - "400": - description: Invalid pagination parameters - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: 
"#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - post: - tags: - - members - summary: Create a project invitation (sends token for email delivery by frontend) - operationId: invite_user - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/CreateInvitationRequest" - required: true - responses: - "201": - description: Invitation created - content: - application/json: - schema: - $ref: "#/components/schemas/CreateInvitationResponse" - "400": - description: Validation failed - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "409": - description: Pending invitation already exists for this email, or user is already an active member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/invitations/{inv_id}: - delete: - tags: - - members - summary: Revoke a pending invitation - operationId: revoke_invitation - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: inv_id - in: path - description: Invitation ID - required: true - schema: - type: string - format: uuid - responses: - "204": - description: Invitation revoked - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - 
content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Invitation not found or not pending - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/leave: - post: - tags: - - members - summary: "Self-service: caller leaves a project they belong to" - operationId: leave_project - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "204": - description: Successfully left project - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - caller is not a member of this project - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "409": - description: Cannot leave - caller is sole owner with other members; transfer ownership first - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/members: - get: - tags: - - members - summary: List all members of a project - operationId: list_members - parameters: - - name: limit - in: query - description: Number of items to return (default 100, max 1000) - required: false - schema: - type: integer - format: int32 - example: 100 - - name: offset - in: query - description: Number of items to skip (default 0) - required: false - schema: - type: integer - format: int32 - example: 0 - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: List of members - content: - application/json: - schema: - $ref: "#/components/schemas/PaginatedResponse_MemberResponse" - "400": - description: Invalid pagination parameters - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - 
description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/members/{user_id}: - delete: - tags: - - members - summary: Remove a member from a project - operationId: remove_member - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: user_id - in: path - description: User ID of the member - required: true - schema: - type: string - responses: - "204": - description: Member removed - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or member not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - patch: - tags: - - members - summary: Update a member's role in a project - operationId: update_member_role - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: user_id - in: path - description: User ID of the member - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/UpdateMemberRoleRequest" - required: true - responses: - "200": - description: Member role updated - content: - application/json: - schema: - $ref: "#/components/schemas/MemberResponse" - "400": - description: Validation failed - content: - application/json: - schema: - $ref: 
"#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or member not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/sandbox/extract: - post: - tags: - - sandbox - summary: Extract a document in the sandbox (first page only) - description: "Accepts a single file upload via multipart form data, proxies - the extraction - - request through the REST API service, and returns the first page of results. - - This gives users a realistic preview of the extraction pipeline." - operationId: sandbox_extract - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - requestBody: - content: - multipart/form-data: {} - responses: - "200": - description: Extraction result (first page) - content: - application/json: - schema: - $ref: "#/components/schemas/SandboxExtractResponse" - "400": - description: Invalid request - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "408": - description: Extraction timed out - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "503": - description: Sandbox not configured - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/usage: - get: - tags: - - usage - summary: Get usage statistics for a project within a 
date range - operationId: get_usage - parameters: - - name: start_date - in: query - description: Start date for the usage period (ISO 8601 format, e.g., "2025-01-01") - required: true - schema: - type: string - example: "2025-01-01" - - name: end_date - in: query - description: End date for the usage period (ISO 8601 format, e.g., "2025-01-31") - required: true - schema: - type: string - example: "2025-01-31" - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: Usage statistics - content: - application/json: - schema: - $ref: "#/components/schemas/UsageResponse" - "400": - description: Invalid date format or range - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/webhooks: - get: - tags: - - webhooks - summary: List all webhooks for a project - operationId: list_webhooks - parameters: - - name: limit - in: query - description: Number of items to return (default 100, max 1000) - required: false - schema: - type: integer - format: int32 - example: 100 - - name: offset - in: query - description: Number of items to skip (default 0) - required: false - schema: - type: integer - format: int32 - example: 0 - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - responses: - "200": - description: List of webhooks - content: - application/json: - schema: - type: array - items: - $ref: "#/components/schemas/WebhookResponse" - "400": - description: 
Invalid pagination parameters - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not a member - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - post: - tags: - - webhooks - summary: Create a new webhook for a project - operationId: create_webhook - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/CreateWebhookRequest" - required: true - responses: - "201": - description: Webhook created - content: - application/json: - schema: - $ref: "#/components/schemas/WebhookResponse" - "400": - description: Validation failed - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/webhooks/{wh_id}: - delete: - tags: - - webhooks - summary: Delete a webhook - operationId: delete_webhook - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: wh_id - in: path - description: Webhook ID - required: true - schema: - type: string - format: uuid - responses: - "204": - 
description: Webhook deleted - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or webhook not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - patch: - tags: - - webhooks - summary: Update a webhook's configuration - operationId: update_webhook - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: wh_id - in: path - description: Webhook ID - required: true - schema: - type: string - format: uuid - requestBody: - content: - application/json: - schema: - $ref: "#/components/schemas/UpdateWebhookRequest" - required: true - responses: - "200": - description: Webhook updated - content: - application/json: - schema: - $ref: "#/components/schemas/WebhookResponse" - "400": - description: Validation failed - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or webhook not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/projects/{id}/webhooks/{wh_id}/test: - post: - tags: - - webhooks - summary: Send a test delivery to a webhook endpoint - operationId: test_webhook - parameters: - - name: id - in: path - description: Project ID - required: true - schema: - type: string - format: uuid - - name: wh_id - in: path - description: Webhook ID - required: true - 
schema: - type: string - format: uuid - responses: - "200": - description: Test delivery result - content: - application/json: - schema: - $ref: "#/components/schemas/WebhookTestResponse" - "401": - description: Unauthorized - no auth token - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "403": - description: Forbidden - user not owner - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "404": - description: Project or webhook not found - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - security: - - bearer_auth: [] - /v1/sandbox/public/extract: - post: - tags: - - sandbox - summary: Extract a document from the public landing-page sandbox. - description: "Public twin of [`sandbox_extract`], reachable without dashboard\ - \ auth. The\nroute is always mounted; the protective layers around it are\ - \ the\nnon-negotiable trust boundary:\n\n1. Per-IP rate limit (`tower_governor::SmartIpKeyExtractor`).\n\ - 2. 1 MB body cap (`axum::extract::DefaultBodyLimit`).\n3. Optional OIDC token\ - \ verification (`PublicSandboxAuthMode::Oidc`) — when\n configured, also\ - \ unlocks the per-UID rate limit layer.\n4. Per-page cap enforced server-side\ - \ via\n `extraction_config.security_limits.max_pages`.\n\nThe handler proxies\ - \ to the authenticated REST API service using a single\nlong-lived `SANDBOX_PUBLIC_API_KEY`,\ - \ which is created once at deployment\ntime for the synthetic public-sandbox\ - \ project. No per-request DB writes\nhappen here." 
- operationId: public_sandbox_extract - requestBody: - content: - multipart/form-data: {} - responses: - "200": - description: Extraction result (first page) - content: - application/json: - schema: - $ref: "#/components/schemas/SandboxExtractResponse" - "400": - description: Invalid request - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "401": - description: Sandbox token rejected - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "408": - description: Extraction timed out - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "413": - description: Request body too large - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "429": - description: Rate limit exceeded - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" - "503": - description: Public sandbox not configured - content: - application/json: - schema: - $ref: "#/components/schemas/ErrorResponse" -components: - schemas: - AcceptInvitationRequest: - type: object - description: Request to accept an invitation - required: - - token - properties: - token: - type: string - description: Invitation token received via email - example: a1b2c3d4e5f6... 
- AnalyticsQuery: - type: object - description: Query parameters for analytics endpoint - required: - - start_date - - end_date - properties: - end_date: - type: string - description: End date (ISO 8601 format, e.g., "2025-01-31") - start_date: - type: string - description: Start date (ISO 8601 format, e.g., "2025-01-01") - AnalyticsResponse: - type: object - description: Analytics response containing daily metrics - required: - - project_id - - start_date - - end_date - - days - properties: - days: - type: array - items: - $ref: "#/components/schemas/DailyAnalyticsEntry" - description: Daily analytics entries - end_date: - type: string - description: End date of the analytics period - example: "2025-01-31" - project_id: - type: string - description: The project identifier - example: 550e8400-e29b-41d4-a716-446655440000 - start_date: - type: string - description: Start date of the analytics period - example: "2025-01-01" - ApiKeyResponse: - type: object - description: "Response containing API key details (for list operations) - - - Never includes the full key, only the prefix for security." 
- required: - - id - - key_prefix - - created_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - expires_at: - type: - - string - - "null" - description: Optional expiration date in ISO 8601 format - example: "2026-12-31T23:59:59Z" - id: - type: string - description: Unique API key identifier - example: 770e8400-e29b-41d4-a716-446655440002 - key_prefix: - type: string - description: Key prefix for identification (e.g., "kz_abcd...") - example: kz_abcdefgh - last_used_at: - type: - - string - - "null" - description: Optional last used timestamp in ISO 8601 format - example: "2025-12-21T09:30:00Z" - name: - type: - - string - - "null" - description: User-provided name for the key - example: Production API Key - revoked_at: - type: - - string - - "null" - description: Optional revocation timestamp in ISO 8601 format - example: "2025-06-15T12:00:00Z" - BillingAccountInfo: - type: object - description: Stripe billing account information - required: - - free_pages_remaining - - free_pages_total - - has_payment_method - - price_tier - - cancel_at_period_end - properties: - cancel_at: - type: - - string - - "null" - description: "ISO 8601 datetime when the subscription is scheduled to cancel. - - Null when no cancellation is pending." - cancel_at_period_end: - type: boolean - description: "True when the user has scheduled a deferred cancellation via - the Stripe portal. - - The subscription stays active until cancel_at / period end." 
- discount_expires_at: - type: - - string - - "null" - description: When the current discount expires (null if no active discount) - free_pages_remaining: - type: integer - format: int64 - description: Free pages remaining (lifetime credit) - free_pages_total: - type: integer - format: int64 - description: Total free pages originally granted - has_payment_method: - type: boolean - description: Whether a Stripe payment method is on file - price_tier: - type: string - description: Current price tier (STANDARD or DISCOUNTED) - subscription_status: - type: - - string - - "null" - description: 'Stripe subscription status ("active", "past_due", "canceled", - etc.). - - Null when no subscription exists on the billing account.' - BillingResponse: - type: object - description: Billing and quota response for a project - required: - - project_id - - period_start - - period_end - - plan - - usage - properties: - billing_account: - oneOf: - - type: "null" - - $ref: "#/components/schemas/BillingAccountInfo" - description: Stripe billing account information (null if not provisioned) - period_end: - type: string - description: End of the current billing period (ISO 8601 date) - example: "2026-05-01" - period_start: - type: string - description: Start of the current billing period (ISO 8601 date) - example: "2026-04-01" - plan: - $ref: "#/components/schemas/PlanInfo" - description: Plan limits - project_id: - type: string - description: The project identifier - example: 550e8400-e29b-41d4-a716-446655440000 - usage: - $ref: "#/components/schemas/UsageInfo" - description: Current usage within the billing period - BoundingBox: - type: object - description: Bounding box coordinates. 
- required: - - x0 - - y0 - - x1 - - y1 - properties: - x0: - type: number - format: double - x1: - type: number - format: double - y0: - type: number - format: double - y1: - type: number - format: double - CheckoutResponse: - type: object - description: Response body for a Stripe Checkout Session - required: - - url - properties: - url: - type: string - description: Stripe Checkout Session URL to redirect the user to - Chunk: - type: object - description: Text chunk with optional embedding. - properties: - content: - type: string - description: Chunk text content - embedding: - type: array - items: - type: number - format: float - description: Embedding vector (when embedding is enabled) - metadata: - oneOf: - - type: "null" - - $ref: "#/components/schemas/ChunkMetadata" - description: Chunk position metadata - ChunkMetadata: - type: object - description: Chunk position and size metadata. - required: - - byte_start - - byte_end - - chunk_index - - total_chunks - properties: - byte_end: - type: integer - format: int64 - minimum: 0 - byte_start: - type: integer - format: int64 - minimum: 0 - chunk_index: - type: integer - format: int64 - minimum: 0 - first_page: - type: - - integer - - "null" - format: int64 - minimum: 0 - last_page: - type: - - integer - - "null" - format: int64 - minimum: 0 - token_count: - type: - - integer - - "null" - format: int64 - minimum: 0 - total_chunks: - type: integer - format: int64 - minimum: 0 - CreateApiKeyRequest: - type: object - description: Request to create a new API key - required: - - name - properties: - expires_at: - type: - - string - - "null" - description: Optional expiration date in ISO 8601 format - example: "2026-12-31T23:59:59Z" - name: - type: string - description: Descriptive name for the API key - example: Production API Key - CreateApiKeyResponse: - type: object - description: "Response containing API key with full key (create/regenerate only) - - - The `key` field contains the full API key and is only returned 
once." - required: - - id - - key - - key_prefix - - created_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - expires_at: - type: - - string - - "null" - description: Optional expiration date in ISO 8601 format - example: "2026-12-31T23:59:59Z" - id: - type: string - description: Unique API key identifier - example: 770e8400-e29b-41d4-a716-446655440002 - key: - type: string - description: Full API key (only returned once on creation/regeneration) - example: kz_abcdefghijklmnopqrstuvwxyz123 - key_prefix: - type: string - description: Key prefix for identification - example: kz_abcdefgh - name: - type: - - string - - "null" - description: User-provided name for the key - example: Production API Key - CreateInvitationRequest: - type: object - description: Request to create an invitation - required: - - email - - role - properties: - email: - type: string - description: Email address to invite - example: user@example.com - role: - type: string - description: 'Role to assign: "OWNER" or "MEMBER"' - example: MEMBER - CreateInvitationResponse: - type: object - description: Response when creating an invitation (includes token) - required: - - id - - email - - role - - token - - expires_at - properties: - email: - type: string - description: Invited email address - example: user@example.com - expires_at: - type: string - description: ISO 8601 expiration timestamp - example: "2025-12-24T10:00:00Z" - id: - type: string - description: Invitation ID - example: 770e8400-e29b-41d4-a716-446655440002 - role: - type: string - description: Assigned role - example: MEMBER - token: - type: string - description: Invitation token (include in the email link) - example: a1b2c3d4e5f67890... 
- CreateProjectRequest: - type: object - description: Request to create a new project - required: - - name - properties: - name: - type: string - description: Project name - example: My Project - CreateWebhookRequest: - type: object - description: Request to create a new webhook - required: - - name - - url - - events - properties: - events: - type: array - items: - type: string - description: Array of events to subscribe to - example: - - job.completed - - job.failed - name: - type: string - description: Webhook display name - example: Job Completion Webhook - secret: - type: - - string - - "null" - description: Optional HMAC secret (generated if not provided) - example: whsec_abc123def456 - url: - type: string - description: Webhook URL (must use HTTPS) - example: https://example.com/webhook - DailyAnalyticsEntry: - type: object - description: Daily analytics record - required: - - date - - jobs_total - - jobs_completed - - jobs_failed - - jobs_cancelled - - total_processing_time_ms - - total_file_size_bytes - - total_pages_extracted - - total_tables_extracted - - total_images_extracted - - file_types - properties: - date: - type: string - description: Date (ISO 8601 format) - example: "2025-01-15" - file_types: - description: File type breakdown (MIME type → count) - jobs_cancelled: - type: integer - format: int32 - description: Cancelled jobs - jobs_completed: - type: integer - format: int32 - description: Successfully completed jobs - jobs_failed: - type: integer - format: int32 - description: Failed jobs - jobs_total: - type: integer - format: int32 - description: Total jobs submitted - processing_stats: - oneOf: - - type: "null" - - $ref: "#/components/schemas/ProcessingStatsEntry" - description: Processing time percentiles (if available for this date) - total_file_size_bytes: - type: integer - format: int64 - description: Total file size processed in bytes - total_images_extracted: - type: integer - format: int32 - description: Total images extracted - 
total_pages_extracted: - type: integer - format: int32 - description: Total pages extracted - total_processing_time_ms: - type: integer - format: int64 - description: Total processing time across all jobs in milliseconds - total_tables_extracted: - type: integer - format: int32 - description: Total tables extracted - ErrorDetail: - type: object - description: Error detail for failed requests - required: - - timestamp - - method - - path - - status_code - properties: - error_message: - type: - - string - - "null" - description: Error message - example: Invalid file format - method: - type: string - description: HTTP method - example: POST - path: - type: string - description: Request path - example: /v1/projects/550e8400-e29b-41d4-a716-446655440000/jobs - status_code: - type: integer - format: int32 - description: HTTP status code - example: 400 - timestamp: - type: string - description: ISO 8601 timestamp of the error - example: "2025-01-15T14:30:00Z" - ErrorResponse: - type: object - description: Error response returned by the API - required: - - error - properties: - error: - type: string - description: Error message - example: Resource not found - ExtractedImage: - type: object - description: Extracted image with base64-encoded data. 
- properties: - bounding_box: - oneOf: - - type: "null" - - $ref: "#/components/schemas/BoundingBox" - description: Bounding box on the page - data: - type: string - description: Base64-encoded image data - description: - type: - - string - - "null" - description: Image description - format: - type: string - description: Image format (e.g., "PNG", "JPEG") - height: - type: - - integer - - "null" - format: int32 - description: Image height in pixels - minimum: 0 - image_index: - type: integer - format: int64 - description: Image index within the document - minimum: 0 - page_number: - type: - - integer - - "null" - format: int32 - description: Page number the image was found on - minimum: 0 - width: - type: - - integer - - "null" - format: int32 - description: Image width in pixels - minimum: 0 - ExtractionResult: - type: object - description: Extraction result — aligned with kreuzberg::ExtractionResult. - properties: - chunks: - type: array - items: - $ref: "#/components/schemas/Chunk" - description: Text chunks (when chunking is enabled) - content: - type: string - description: Full extracted text content - detected_languages: - type: array - items: - type: string - description: Detected document languages - images: - type: array - items: - $ref: "#/components/schemas/ExtractedImage" - description: Extracted images (base64 encoded) - metadata: - oneOf: - - type: "null" - - $ref: "#/components/schemas/Metadata" - description: Document metadata - mime_type: - type: string - description: Content MIME type (e.g., "text/plain", "text/markdown") - pages: - type: array - items: - $ref: "#/components/schemas/PageContent" - description: Per-page content (when page extraction is enabled) - processing_warnings: - type: array - items: - $ref: "#/components/schemas/ProcessingWarning" - description: Non-fatal processing warnings - quality_score: - type: - - number - - "null" - format: double - description: Document quality score (0.0-1.0) - tables: - type: array - items: - $ref: 
"#/components/schemas/Table" - description: Extracted tables - HealthResponse: - type: object - description: Health check response for liveness probes - required: - - status - properties: - status: - type: string - description: Service status - example: ok - InvitationResponse: - type: object - description: Response containing invitation details - required: - - id - - email - - role - - status - - expires_at - - created_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - email: - type: string - description: Invited email address - example: user@example.com - expires_at: - type: string - description: ISO 8601 expiration timestamp - example: "2025-12-24T10:00:00Z" - id: - type: string - description: Invitation ID - example: 770e8400-e29b-41d4-a716-446655440002 - role: - type: string - description: Assigned role - example: MEMBER - status: - type: string - description: Invitation status - example: PENDING - LoginRequest: - type: object - description: Login request with OIDC ID token - required: - - id_token - properties: - id_token: - type: string - description: ID token from OIDC identity provider - example: eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9... - LoginResponse: - type: object - description: Successful login response - required: - - token - - user - - project - properties: - project: - $ref: "#/components/schemas/ProjectInfo" - description: User's default project - token: - type: string - description: Backend JWT token (valid for 7 days) - example: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9... 
- user: - $ref: "#/components/schemas/UserInfo" - description: Authenticated user information - MemberResponse: - type: object - description: Response containing member details - required: - - id - - user_id - - role - - created_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - display_name: - type: - - string - - "null" - description: Member's display name - example: Jane Doe - email: - type: - - string - - "null" - description: Member's email address - example: user@example.com - id: - type: string - description: Unique member identifier - example: 660e8400-e29b-41d4-a716-446655440001 - role: - type: string - description: Member's role in the project - example: MEMBER - user_id: - type: string - description: User ID of the member - example: user-id-xyz789 - Metadata: - type: object - description: Document metadata — aligned with kreuzberg::Metadata. - properties: - abstract_text: - type: - - string - - "null" - additional: - type: object - additionalProperties: {} - propertyNames: - type: string - authors: - type: array - items: - type: string - category: - type: - - string - - "null" - created_at: - type: - - string - - "null" - created_by: - type: - - string - - "null" - document_version: - type: - - string - - "null" - extraction_duration_ms: - type: - - integer - - "null" - format: int64 - minimum: 0 - keywords: - type: array - items: - type: string - language: - type: - - string - - "null" - modified_at: - type: - - string - - "null" - modified_by: - type: - - string - - "null" - output_format: - type: - - string - - "null" - pages: - oneOf: - - type: "null" - - $ref: "#/components/schemas/PageStructure" - subject: - type: - - string - - "null" - tags: - type: array - items: - type: string - title: - type: - - string - - "null" - MimeTypeUsage: - type: object - description: Per-MIME-type usage breakdown - required: - - documents - - pages - - failed - properties: - documents: - type: 
integer - format: int32 - description: Number of documents processed - failed: - type: integer - format: int32 - description: Number of failed documents - pages: - type: integer - format: int64 - description: Total pages extracted - PageContent: - type: object - description: Per-page content. - required: - - page_number - properties: - content: - type: string - description: Extracted text for this page - is_blank: - type: - - boolean - - "null" - description: Whether the page is blank - page_number: - type: integer - format: int32 - description: Page number (0-indexed) - minimum: 0 - PageStructure: - type: object - description: Page structure metadata. - required: - - total_count - properties: - total_count: - type: integer - format: int32 - minimum: 0 - PaginatedResponse_InvitationResponse: - type: object - description: Paginated response wrapper with metadata - required: - - items - - total - - limit - - offset - properties: - items: - type: array - items: - type: object - description: Response containing invitation details - required: - - id - - email - - role - - status - - expires_at - - created_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - email: - type: string - description: Invited email address - example: user@example.com - expires_at: - type: string - description: ISO 8601 expiration timestamp - example: "2025-12-24T10:00:00Z" - id: - type: string - description: Invitation ID - example: 770e8400-e29b-41d4-a716-446655440002 - role: - type: string - description: Assigned role - example: MEMBER - status: - type: string - description: Invitation status - example: PENDING - description: The page of items - limit: - type: integer - format: int32 - description: Number of items per page (the limit used) - offset: - type: integer - format: int32 - description: Current offset - total: - type: integer - format: int32 - description: Total number of items across all pages - 
PaginatedResponse_MemberResponse: - type: object - description: Paginated response wrapper with metadata - required: - - items - - total - - limit - - offset - properties: - items: - type: array - items: - type: object - description: Response containing member details - required: - - id - - user_id - - role - - created_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - display_name: - type: - - string - - "null" - description: Member's display name - example: Jane Doe - email: - type: - - string - - "null" - description: Member's email address - example: user@example.com - id: - type: string - description: Unique member identifier - example: 660e8400-e29b-41d4-a716-446655440001 - role: - type: string - description: Member's role in the project - example: MEMBER - user_id: - type: string - description: User ID of the member - example: user-id-xyz789 - description: The page of items - limit: - type: integer - format: int32 - description: Number of items per page (the limit used) - offset: - type: integer - format: int32 - description: Current offset - total: - type: integer - format: int32 - description: Total number of items across all pages - PaginatedResponse_ProjectResponse: - type: object - description: Paginated response wrapper with metadata - required: - - items - - total - - limit - - offset - properties: - items: - type: array - items: - type: object - description: Response containing project details - required: - - id - - name - - slug - - status - - owner_user_id - - created_at - - updated_at - - api_key_count - - webhook_count - - total_pages_extracted - properties: - api_key_count: - type: integer - format: int32 - description: Number of active (non-revoked) API keys - example: 3 - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - id: - type: string - description: Unique project identifier - example: 
550e8400-e29b-41d4-a716-446655440000 - name: - type: string - description: Human-readable project name - example: My Project - owner_user_id: - type: string - description: User ID of the project owner (from identity provider) - example: user-id-abc123 - slug: - type: string - description: URL-safe identifier - example: my-project-a1b2c3d4 - status: - type: string - description: Project status (ACTIVE, SUSPENDED, INACTIVE) - example: ACTIVE - total_pages_extracted: - type: integer - format: int64 - description: Total billable pages extracted (excluding cancelled jobs) - example: 1500 - updated_at: - type: string - description: ISO 8601 last update timestamp - example: "2025-12-21T10:00:00Z" - webhook_count: - type: integer - format: int32 - description: Number of active (non-deleted) webhooks - example: 2 - description: The page of items - limit: - type: integer - format: int32 - description: Number of items per page (the limit used) - offset: - type: integer - format: int32 - description: Current offset - total: - type: integer - format: int32 - description: Total number of items across all pages - PlanInfo: - type: object - description: Plan limits for the project - properties: - max_file_size_mb: - type: - - integer - - "null" - format: int32 - description: Maximum file size in megabytes (null = unlimited) - max_monthly_jobs: - type: - - integer - - "null" - format: int64 - description: Maximum jobs allowed per month (null = unlimited) - max_monthly_pages: - type: - - integer - - "null" - format: int64 - description: Maximum pages allowed per month (null = unlimited) - PortalResponse: - type: object - description: Response body for a Stripe Customer Portal Session - required: - - url - properties: - url: - type: string - description: Stripe Customer Portal URL to redirect the user to - ProcessingStatsEntry: - type: object - description: Processing time percentiles for a day - required: - - sample_count - properties: - p50_ms: - type: - - integer - - "null" - 
format: int64 - description: Median processing time in milliseconds - p95_ms: - type: - - integer - - "null" - format: int64 - description: 95th percentile processing time in milliseconds - p99_ms: - type: - - integer - - "null" - format: int64 - description: 99th percentile processing time in milliseconds - sample_count: - type: integer - format: int32 - description: Number of samples used for calculation - ProcessingWarning: - type: object - description: Non-fatal processing warning. - properties: - message: - type: string - description: Warning message - source: - type: string - description: Warning source (e.g., "ocr", "pdf", "chunking") - ProjectInfo: - type: object - description: Project information in login response - required: - - id - - name - - role - properties: - id: - type: string - format: uuid - description: Project UUID - example: 550e8400-e29b-41d4-a716-446655440000 - name: - type: string - description: Project name - example: My Project - role: - type: string - description: User role in this project - example: OWNER - ProjectResponse: - type: object - description: Response containing project details - required: - - id - - name - - slug - - status - - owner_user_id - - created_at - - updated_at - - api_key_count - - webhook_count - - total_pages_extracted - properties: - api_key_count: - type: integer - format: int32 - description: Number of active (non-revoked) API keys - example: 3 - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-21T10:00:00Z" - id: - type: string - description: Unique project identifier - example: 550e8400-e29b-41d4-a716-446655440000 - name: - type: string - description: Human-readable project name - example: My Project - owner_user_id: - type: string - description: User ID of the project owner (from identity provider) - example: user-id-abc123 - slug: - type: string - description: URL-safe identifier - example: my-project-a1b2c3d4 - status: - type: string - description: Project status 
(ACTIVE, SUSPENDED, INACTIVE) - example: ACTIVE - total_pages_extracted: - type: integer - format: int64 - description: Total billable pages extracted (excluding cancelled jobs) - example: 1500 - updated_at: - type: string - description: ISO 8601 last update timestamp - example: "2025-12-21T10:00:00Z" - webhook_count: - type: integer - format: int32 - description: Number of active (non-deleted) webhooks - example: 2 - ReadinessChecks: - type: object - description: Individual dependency check results - required: - - database - properties: - database: - type: string - description: Database connectivity - example: ok - ReadinessResponse: - type: object - description: Readiness check response with dependency status - required: - - status - - checks - properties: + /healthz: + get: + tags: + - health + summary: Check service liveness + operationId: healthz + responses: + "200": + description: Service is alive + content: + application/json: + schema: + $ref: '#/components/schemas/HealthResponse' + example: + status: ok + /readyz: + get: + tags: + - health + summary: Check service readiness + operationId: readyz + responses: + "200": + description: Service is ready + content: + application/json: + schema: + $ref: '#/components/schemas/ReadinessResponse' + example: checks: - $ref: "#/components/schemas/ReadinessChecks" - description: Dependency check results - status: - type: string - description: Overall readiness status - example: ready - Row: - type: object - description: 'Table row — a list of cell string values. + database: ok + nats: ok + status: ready + "503": + description: Service is not ready + content: + application/json: + schema: + $ref: '#/components/schemas/ReadinessResponse' + /v1/extract: + post: + tags: + - extract + summary: Submit documents for extraction + description: |- + Accepts `application/json` or `multipart/form-data`. 
+ **JSON body**: `{"documents": [...], "options": {...}, "webhook": {"url": "...", "secret": "...", "metadata": {...}}}` - Kreuzberg serializes cells as a nested array (each row is `["A", "B"]`). + **Multipart**: file parts (binary) + `webhook` part (JSON string) + optional `options` part (JSON string) - This type accepts both array and object formats via a custom deserializer.' - required: - - values - properties: - values: - type: array - items: - type: string - SandboxExtractResponse: - type: object - description: Sandbox extraction response — mirrors the real API `GET /v1/jobs/{id}` - response - required: - - id - - filename - - status - - created_at - properties: - created_at: - type: string - description: Job creation timestamp (RFC3339) - example: "2025-12-21T10:00:00Z" - filename: - type: string - description: Original filename - example: invoice.pdf - id: - type: string - description: Unique job identifier (UUID) - example: 550e8400-e29b-41d4-a716-446655440000 - processing_time_ms: - type: - - integer - - "null" - format: int64 - description: Processing time in milliseconds - example: 1234 + Returns 202 Accepted with job IDs. Results are delivered via the configured webhook. 
+ operationId: extract + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ExtractJsonRequest' + example: + documents: + - data: SGVsbG8gV29ybGQ= + filename: invoice.pdf + mime_type: application/pdf + options: + extraction_config: + ocr: + backend: tesseract + language: eng + output_format: markdown + webhook: + metadata: + request_id: abc123 + secret: my-secret + url: https://example.com/webhook + required: true + responses: + "202": + description: Jobs accepted for processing + content: + application/json: + schema: + $ref: '#/components/schemas/ExtractResponse' + example: + job_ids: + - 550e8400-e29b-41d4-a716-446655440000 + status: pending + "400": + description: Invalid request + content: + application/json: + schema: {} + example: + error: At least one document is required + "401": + description: Unauthorized + content: + application/json: + schema: {} + example: + error: Missing project context + "429": + description: Free page credit exhausted — add a payment method to continue + content: + application/json: + schema: {} + example: + error: Free page credit exhausted. Add a payment method to continue extracting. 
+ security: + - bearer_auth: [] + /v1/jobs/{id}: + get: + tags: + - jobs + summary: Get job status and results + operationId: get_job + parameters: + - name: id + in: path + description: Job ID (UUID) + required: true + schema: + type: string + format: uuid + example: 550e8400-e29b-41d4-a716-446655440000 + responses: + "200": + description: Job retrieved successfully + content: + application/json: + schema: + $ref: '#/components/schemas/JobResponse' + example: + created_at: "2025-12-21T10:00:00Z" + filename: invoice.pdf + id: 550e8400-e29b-41d4-a716-446655440000 result: - oneOf: - - type: "null" - - $ref: "#/components/schemas/ExtractionResult" - description: Extraction result (only present when completed) - status: - type: string - description: Job status - example: completed - Table: - type: object - description: Extracted table. - properties: - bounding_box: - oneOf: - - type: "null" - - $ref: "#/components/schemas/BoundingBox" - description: Bounding box on the page - cells: - type: array - items: - $ref: "#/components/schemas/Row" - description: 2D grid of cells (rows of string values) - markdown: - type: string - description: Markdown representation of the table - page_number: - type: integer - format: int32 - description: Page number (0-indexed) - minimum: 0 - UpdateMemberRoleRequest: - type: object - description: Request to update a member's role - required: - - role - properties: - role: - type: string - description: 'New role: "OWNER" or "MEMBER"' - example: OWNER - UpdateProjectRequest: - type: object - description: Request to update a project - properties: - name: - type: - - string - - "null" - description: Updated project name - example: Renamed Project - UpdateWebhookRequest: - type: object - description: Request to update a webhook - properties: - events: - type: - - array - - "null" - items: - type: string - description: Updated events array (optional) - example: - - job.completed - is_active: - type: - - boolean - - "null" - description: Updated 
active status (optional) - example: true - name: - type: - - string - - "null" - description: Updated webhook name (optional) - example: Updated Webhook Name - url: - type: - - string - - "null" - description: Updated webhook URL (optional) - example: https://example.com/new-webhook - UsageInfo: - type: object - description: Current usage for the billing period - required: - - pages_used - - total_documents - - total_failed - - by_mime_type - properties: + content: + - confidence: 0.95 + page_number: 1 + text: 'Invoice total: $1,234.56' + images: [] + metadata: + author: John Doe + page_count: 1 + subject: Monthly Invoice + title: 'Invoice #12345' + tables: [] + status: completed + "400": + description: Invalid job ID format + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + example: + error: Invalid job ID format + "401": + description: Unauthorized - missing or invalid API key + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + example: + error: Missing project context + "404": + description: Job not found + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + example: + error: Job not found + "500": + description: Internal server error + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + example: + error: Database connection failed + "503": + description: Service unavailable - dependency failure + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + example: + error: Database connection unavailable + security: + - bearer_auth: [] + /v1/uploads/confirm: + post: + tags: + - uploads + summary: Confirm uploads and start processing + description: Verifies all files exist in storage, then dispatches jobs to the worker queue. 
+ operationId: confirm_upload + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ConfirmUploadRequest' + required: true + responses: + "202": + description: Uploads confirmed, processing started + content: + application/json: + schema: + $ref: '#/components/schemas/ConfirmUploadResponse' + example: + job_ids: + - 550e8400-e29b-41d4-a716-446655440000 + - 550e8400-e29b-41d4-a716-446655440001 + status: processing + "400": + description: Invalid request + content: + application/json: + schema: {} + "401": + description: Unauthorized + content: + application/json: + schema: {} + security: + - bearer_auth: [] + /v1/uploads/presign: + post: + tags: + - uploads + summary: Generate presigned upload URLs + description: |- + Returns upload URLs for each document. The client uploads files directly + to storage, then calls `/v1/uploads/confirm` to trigger processing. + operationId: presign_upload + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/PresignUploadRequest' + required: true + responses: + "200": + description: Presigned upload URLs generated + content: + application/json: + schema: + $ref: '#/components/schemas/PresignUploadResponse' + example: + batch_id: batch_550e8400-e29b-41d4-a716 + uploads: + - expires_in_secs: 3600 + job_id: 550e8400-e29b-41d4-a716-446655440000 + method: PUT + object_key: projects/abc123/uploads/550e8400-e29b-41d4-a716-446655440000 + upload_url: https://storage.googleapis.com/kreuzberg-dev-uploads/... + - expires_in_secs: 3600 + job_id: 550e8400-e29b-41d4-a716-446655440001 + method: PUT + object_key: projects/abc123/uploads/550e8400-e29b-41d4-a716-446655440001 + upload_url: https://storage.googleapis.com/kreuzberg-dev-uploads/... 
+ "400": + description: Invalid request + content: + application/json: + schema: {} + "401": + description: Unauthorized + content: + application/json: + schema: {} + security: + - bearer_auth: [] + /v1/usage: + get: + tags: + - usage + summary: Get usage statistics and quota + operationId: get_usage + parameters: + - name: start + in: query + description: Start date (ISO 8601, e.g. "2026-03-01"). Defaults to first day of current month. + required: false + schema: + type: + - string + - "null" + - name: end + in: query + description: End date (ISO 8601, e.g. "2026-04-01"). Defaults to first day of next month. + required: false + schema: + type: + - string + - "null" + responses: + "200": + description: Usage statistics + content: + application/json: + schema: + $ref: '#/components/schemas/UsageResponse' + example: by_mime_type: - type: object - description: Usage breakdown by MIME type - additionalProperties: - $ref: "#/components/schemas/MimeTypeUsage" - propertyNames: - type: string - pages_remaining: - type: - - integer - - "null" - format: int64 - description: Pages remaining before hitting the limit (null = unlimited) - pages_used: - type: integer - format: int64 - description: Total pages used in the current period - total_documents: - type: integer - format: int32 - description: Total documents processed - total_failed: - type: integer - format: int32 - description: Total failed documents - UsagePeriod: - type: object - description: Period details in usage response - required: - - start_date - - end_date - properties: - end_date: - type: string - description: End date (ISO 8601 format) - example: "2025-01-31" - start_date: - type: string - description: Start date (ISO 8601 format) - example: "2025-01-01" - UsageQuery: - type: object - description: Query parameters for usage statistics endpoint - required: - - start_date - - end_date - properties: - end_date: - type: string - description: End date for the usage period (ISO 8601 format, e.g., "2025-01-31") - 
start_date: - type: string - description: Start date for the usage period (ISO 8601 format, e.g., "2025-01-01") - UsageResponse: - type: object - description: Response containing usage statistics - required: - - project_id - - period - - summary - - by_method - - by_status_code - - by_path - - error_details - properties: - by_method: - type: object - description: Request count by HTTP method - additionalProperties: - type: integer - format: int64 - propertyNames: - type: string - by_path: - type: object - description: Request count by path (normalized) - additionalProperties: - type: integer - format: int64 - propertyNames: - type: string - by_status_code: - type: object - description: Request count by status code - additionalProperties: - type: integer - format: int64 - propertyNames: - type: integer - format: int32 - error_details: - type: array - items: - $ref: "#/components/schemas/ErrorDetail" - description: Recent error details (up to 100) - period: - $ref: "#/components/schemas/UsagePeriod" - description: Time period covered by these statistics - project_id: - type: string - description: The project identifier - example: 550e8400-e29b-41d4-a716-446655440000 - summary: - $ref: "#/components/schemas/UsageSummary" - description: Aggregated statistics for the period - UsageSummary: - type: object - description: Summary statistics for a usage period - required: - - total_requests - - successful_requests - - failed_requests - - average_response_time_ms - - total_request_bytes - - total_response_bytes - properties: - average_response_time_ms: - type: integer - format: int32 - description: Average response time in milliseconds - example: 125 - failed_requests: - type: integer - format: int64 - description: Number of failed requests (status >= 400) - example: 50 - successful_requests: - type: integer - format: int64 - description: Number of successful requests (status < 400) - example: 1450 - total_request_bytes: - type: integer - format: int64 - description: Total 
request size in bytes - example: 1048576 - total_requests: - type: integer - format: int64 - description: Total number of requests - example: 1500 - total_response_bytes: - type: integer - format: int64 - description: Total response size in bytes - example: 2097152 - UserInfo: - type: object - description: User information in login response - required: - - user_id - - default_project_id - properties: - default_project_id: - type: string - format: uuid - description: Default project ID (first OWNER project) - example: 550e8400-e29b-41d4-a716-446655440000 - email: - type: - - string - - "null" - description: User email address - example: user@example.com - name: - type: - - string - - "null" - description: User display name - example: Jane Doe - user_id: - type: string - description: User ID from identity provider - example: user-id-abc123 - WebhookResponse: - type: object - description: Response containing webhook details - required: - - id - - name - - url - - events - - is_active - - total_deliveries - - failed_deliveries - - created_at - - updated_at - properties: - created_at: - type: string - description: ISO 8601 creation timestamp - example: "2025-12-20T10:00:00Z" - events: - type: array - items: - type: string - description: Subscribed event types (e.g., "job.completed", "job.failed") - example: - - job.completed - - job.failed - failed_deliveries: - type: integer - format: int32 - description: Number of failed delivery attempts - example: 2 - id: - type: string - format: uuid - description: Unique webhook identifier - example: 770e8400-e29b-41d4-a716-446655440003 - is_active: - type: boolean - description: Whether the webhook is currently active - example: true - last_delivery_at: - type: - - string - - "null" - description: ISO 8601 timestamp of last delivery attempt - example: "2025-12-21T12:00:00Z" - last_delivery_status: - type: - - string - - "null" - description: Status of last delivery (e.g., "success", "failed") - example: success - name: - type: 
string - description: Webhook display name - example: Job Completion Webhook - total_deliveries: - type: integer - format: int32 - description: Total number of delivery attempts - example: 42 - updated_at: - type: string - description: ISO 8601 last update timestamp - example: "2025-12-21T12:00:00Z" - url: - type: string - description: Webhook delivery URL - example: https://example.com/webhook - WebhookTestResponse: - type: object - description: Response from a webhook test delivery - required: - - success - properties: - error: - type: - - string - - "null" - description: Error message if the delivery failed - example: Connection refused - status_code: - type: - - integer - - "null" - format: int32 - description: HTTP status code returned by the webhook endpoint - example: 200 - minimum: 0 - success: - type: boolean - description: Whether the test delivery was successful - example: true - securitySchemes: - bearer_auth: - type: http - scheme: bearer + application/pdf: + documents: 65 + failed: 1 + pages: 3200 + image/png: + documents: 15 + failed: 0 + pages: 1800 + text/plain: + documents: 7 + failed: 1 + pages: 432 + period_end: "2026-06-01" + period_start: "2026-05-01" + quota_limit: 100000 + quota_remaining: 94568 + total_documents: 87 + total_failed: 2 + total_pages: 5432 + "400": + description: Invalid date format + "401": + description: Unauthorized + security: + - bearer_auth: [] +components: + schemas: + BoundingBox: + type: object + description: Bounding box coordinates. + required: + - x0 + - y0 + - x1 + - y1 + properties: + x0: + type: number + format: double + x1: + type: number + format: double + y0: + type: number + format: double + y1: + type: number + format: double + Chunk: + type: object + description: Text chunk with optional embedding. 
+ properties: + content: + type: string + description: Chunk text content + embedding: + type: array + items: + type: number + format: float + description: Embedding vector (when embedding is enabled) + metadata: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ChunkMetadata' + description: Chunk position metadata + ChunkMetadata: + type: object + description: Chunk position and size metadata. + required: + - byte_start + - byte_end + - chunk_index + - total_chunks + properties: + byte_end: + type: integer + format: int64 + minimum: 0 + byte_start: + type: integer + format: int64 + minimum: 0 + chunk_index: + type: integer + format: int64 + minimum: 0 + first_page: + type: + - integer + - "null" + format: int64 + minimum: 0 + last_page: + type: + - integer + - "null" + format: int64 + minimum: 0 + token_count: + type: + - integer + - "null" + format: int64 + minimum: 0 + total_chunks: + type: integer + format: int64 + minimum: 0 + ChunkingConfig: + type: object + description: Text chunking configuration. 
+ properties: + chunker_type: + type: + - string + - "null" + description: 'Chunker type: "text", "markdown", "yaml"' + embedding: + oneOf: + - type: "null" + - $ref: '#/components/schemas/EmbeddingConfig' + description: Embedding generation configuration + max_characters: + type: + - integer + - "null" + format: int32 + description: Maximum chunk size in characters + minimum: 0 + overlap: + type: + - integer + - "null" + format: int32 + description: Overlap between adjacent chunks in characters + minimum: 0 + preset: + type: + - string + - "null" + description: Preset name (overrides other settings) + trim: + type: + - boolean + - "null" + description: Trim whitespace from chunk boundaries + ConfirmUploadRequest: + type: object + description: Request body for confirming uploads + required: + - batch_id + properties: + batch_id: + type: string + description: Batch ID from the presign response + ConfirmUploadResponse: + type: object + description: Response from confirm endpoint + required: + - job_ids + - status + properties: + job_ids: + type: array + items: + type: string + description: Job IDs that are now queued for processing + status: + type: string + description: Status of the jobs + ContentFilterConfig: + type: object + description: Content filtering configuration (headers, footers, watermarks). 
+ properties: + include_footers: + type: boolean + description: Include running footers + include_headers: + type: boolean + description: Include running headers + include_watermarks: + type: boolean + description: Include watermarks + strip_repeating_text: + type: boolean + description: Strip cross-page repeating text + DocumentInput: + type: object + description: Document input for JSON extraction requests + required: + - filename + - mime_type + - data + properties: + data: + type: string + description: Base64-encoded document data + filename: + type: string + description: Original filename + mime_type: + type: string + description: MIME type of the document + EmbeddingConfig: + type: object + description: Embedding generation configuration for chunks. + properties: + batch_size: + type: + - integer + - "null" + format: int32 + description: Batch size for embedding generation + minimum: 0 + model: + description: 'Model configuration (flexible JSON: {"type":"preset","name":"balanced"})' + normalize: + type: + - boolean + - "null" + description: Normalize embedding vectors + show_download_progress: + type: + - boolean + - "null" + description: Show model download progress + ErrorResponse: + type: object + description: Error response + required: + - error + properties: + error: + type: string + description: Error message + example: Invalid job ID format + ExtractJsonRequest: + type: object + description: JSON body for `POST /v1/extract` + required: + - documents + properties: + documents: + type: array + items: + $ref: '#/components/schemas/DocumentInput' + description: Documents to process + options: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ExtractionOptions' + description: Extraction options (optional) + webhook: + oneOf: + - type: "null" + - $ref: '#/components/schemas/WebhookConfig' + description: Webhook configuration for async result delivery (optional) + ExtractResponse: + type: object + description: Extract response (HTTP 202) + required: + 
- job_ids + - status + properties: + job_ids: + type: array + items: + type: string + description: Job IDs for tracking (one per document) + status: + type: string + description: Job status + ExtractedImage: + type: object + description: Extracted image with base64-encoded data. + properties: + bounding_box: + oneOf: + - type: "null" + - $ref: '#/components/schemas/BoundingBox' + description: Bounding box on the page + data: + type: string + description: Base64-encoded image data + description: + type: + - string + - "null" + description: Image description + format: + type: string + description: Image format (e.g., "PNG", "JPEG") + height: + type: + - integer + - "null" + format: int32 + description: Image height in pixels + minimum: 0 + image_index: + type: integer + format: int64 + description: Image index within the document + minimum: 0 + page_number: + type: + - integer + - "null" + format: int32 + description: Page number the image was found on + minimum: 0 + width: + type: + - integer + - "null" + format: int32 + description: Image width in pixels + minimum: 0 + ExtractionConfig: + type: object + description: Top-level extraction configuration — mirrors kreuzberg::ExtractionConfig. 
+ properties: + cache_namespace: + type: + - string + - "null" + description: Cache namespace for tenant isolation + cache_ttl_secs: + type: + - integer + - "null" + format: int64 + description: Per-request cache TTL override in seconds + minimum: 0 + chunking: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ChunkingConfig' + description: Text chunking configuration + content_filter: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ContentFilterConfig' + description: Content filtering (headers, footers, watermarks) + disable_ocr: + type: + - boolean + - "null" + description: Disable OCR entirely + enable_quality_processing: + type: + - boolean + - "null" + description: Enable quality post-processing + extraction_timeout_secs: + type: + - integer + - "null" + format: int64 + description: Per-file extraction timeout in seconds + minimum: 0 + force_ocr: + type: + - boolean + - "null" + description: Force OCR on all pages (bypass native text extraction) + force_ocr_pages: + type: + - array + - "null" + items: + type: integer + format: int32 + minimum: 0 + description: Force OCR on specific pages (1-indexed) + html_options: + description: HTML extraction options (flexible JSON) + images: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ImageExtractionConfig' + description: Image extraction configuration + include_document_structure: + type: + - boolean + - "null" + description: Include structured document tree in output + keywords: + description: Keyword extraction configuration (flexible JSON) + language_detection: + oneOf: + - type: "null" + - $ref: '#/components/schemas/LanguageDetectionConfig' + description: Language detection configuration + layout: + oneOf: + - type: "null" + - $ref: '#/components/schemas/LayoutDetectionConfig' + description: Layout detection configuration + max_archive_depth: + type: + - integer + - "null" + format: int32 + description: Maximum recursion depth for archive extraction + minimum: 0 + 
max_concurrent_extractions: + type: + - integer + - "null" + format: int32 + description: Maximum concurrent extractions + minimum: 0 + ocr: + oneOf: + - type: "null" + - $ref: '#/components/schemas/OcrConfig' + description: OCR configuration + output_format: + type: + - string + - "null" + description: 'Output text format: "plain", "markdown", "html", "djot", "structured", "json"' + pages: + oneOf: + - type: "null" + - $ref: '#/components/schemas/PageConfig' + description: Page extraction configuration + pdf_options: + oneOf: + - type: "null" + - $ref: '#/components/schemas/PdfConfig' + description: PDF-specific options + postprocessor: + oneOf: + - type: "null" + - $ref: '#/components/schemas/PostProcessorConfig' + description: Post-processor configuration + result_format: + type: + - string + - "null" + description: 'Result format: "unified" or "element_based"' + security_limits: + description: Security limits (flexible JSON) + token_reduction: + oneOf: + - type: "null" + - $ref: '#/components/schemas/TokenReductionConfig' + description: Token reduction configuration + use_cache: + type: + - boolean + - "null" + description: Enable extraction result caching + ExtractionOptions: + type: object + description: Extraction options (shared by JSON and multipart requests) + properties: + extraction_config: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ExtractionConfig' + description: Kreuzberg extraction configuration (optional, null = defaults). + ExtractionResult: + type: object + description: Extraction result — aligned with kreuzberg::ExtractionResult. 
+ properties: + chunks: + type: array + items: + $ref: '#/components/schemas/Chunk' + description: Text chunks (when chunking is enabled) + content: + type: string + description: Full extracted text content + detected_languages: + type: array + items: + type: string + description: Detected document languages + images: + type: array + items: + $ref: '#/components/schemas/ExtractedImage' + description: Extracted images (base64 encoded) + metadata: + oneOf: + - type: "null" + - $ref: '#/components/schemas/Metadata' + description: Document metadata + mime_type: + type: string + description: Content MIME type (e.g., "text/plain", "text/markdown") + pages: + type: array + items: + $ref: '#/components/schemas/PageContent' + description: Per-page content (when page extraction is enabled) + processing_warnings: + type: array + items: + $ref: '#/components/schemas/ProcessingWarning' + description: Non-fatal processing warnings + quality_score: + type: + - number + - "null" + format: double + description: Document quality score (0.0-1.0) + tables: + type: array + items: + $ref: '#/components/schemas/Table' + description: Extracted tables + FileExtractionConfig: + type: object + description: |- + Per-file extraction config override for batch/presign requests. + + All fields are optional — only provided fields override the batch-level config. 
+ properties: + chunking: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ChunkingConfig' + content_filter: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ContentFilterConfig' + disable_ocr: + type: + - boolean + - "null" + enable_quality_processing: + type: + - boolean + - "null" + extraction_timeout_secs: + type: + - integer + - "null" + format: int64 + minimum: 0 + force_ocr: + type: + - boolean + - "null" + force_ocr_pages: + type: + - array + - "null" + items: + type: integer + format: int32 + minimum: 0 + images: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ImageExtractionConfig' + include_document_structure: + type: + - boolean + - "null" + language_detection: + oneOf: + - type: "null" + - $ref: '#/components/schemas/LanguageDetectionConfig' + ocr: + oneOf: + - type: "null" + - $ref: '#/components/schemas/OcrConfig' + output_format: + type: + - string + - "null" + pages: + oneOf: + - type: "null" + - $ref: '#/components/schemas/PageConfig' + postprocessor: + oneOf: + - type: "null" + - $ref: '#/components/schemas/PostProcessorConfig' + result_format: + type: + - string + - "null" + token_reduction: + oneOf: + - type: "null" + - $ref: '#/components/schemas/TokenReductionConfig' + HealthResponse: + type: object + description: Health check response for liveness probes + required: + - status + properties: + status: + type: string + description: Service status + example: ok + HierarchyConfig: + type: object + description: Heading hierarchy detection configuration. 
+ properties: + enabled: + type: boolean + description: Enable hierarchy detection + include_bbox: + type: boolean + description: Include bounding boxes + k_clusters: + type: + - integer + - "null" + format: int32 + description: Number of font-size clusters for heading levels + minimum: 0 + ocr_coverage_threshold: + type: + - number + - "null" + format: float + description: OCR coverage threshold + ImageExtractionConfig: + type: object + description: Image extraction and processing configuration. + properties: + auto_adjust_dpi: + type: + - boolean + - "null" + description: Auto-adjust DPI based on content + extract_images: + type: + - boolean + - "null" + description: Extract images from documents + inject_placeholders: + type: + - boolean + - "null" + description: Inject image reference placeholders in markdown output + max_dpi: + type: + - integer + - "null" + format: int32 + description: Maximum DPI threshold + max_image_dimension: + type: + - integer + - "null" + format: int32 + description: Maximum image dimension (width or height) + min_dpi: + type: + - integer + - "null" + format: int32 + description: Minimum DPI threshold + target_dpi: + type: + - integer + - "null" + format: int32 + description: Target DPI for image normalization + JobResponse: + type: object + description: Response for job status query + required: + - id + - filename + - status + - created_at + properties: + created_at: + type: string + format: date-time + description: Job creation timestamp (RFC3339) + example: "2025-12-21T10:00:00Z" + filename: + type: string + description: Original filename + example: invoice.pdf + id: + type: string + format: uuid + description: Unique job identifier (UUID) + example: 550e8400-e29b-41d4-a716-446655440000 + processing_time_ms: + type: + - integer + - "null" + format: int64 + description: Server-side processing duration in milliseconds (only present when completed) + example: 1234 + result: + oneOf: + - type: "null" + - $ref: 
'#/components/schemas/ExtractionResult' + description: Extraction result (only present when status is completed/partial_success) + status: + $ref: '#/components/schemas/JobStatus' + description: Job status + JobStatus: + type: string + description: Job status enumeration (1:1 with domain). + enum: + - awaiting_upload + - pending + - processing + - chunking + - aggregating + - completed + - partial_success + - failed + - cancelled + LanguageDetectionConfig: + type: object + description: Language detection configuration. + properties: + detect_multiple: + type: + - boolean + - "null" + description: Detect multiple languages in document + enabled: + type: + - boolean + - "null" + description: Enable language detection + min_confidence: + type: + - number + - "null" + format: double + description: Minimum confidence threshold (0.0-1.0) + LayoutDetectionConfig: + type: object + description: Document layout detection configuration. + properties: + apply_heuristics: + type: + - boolean + - "null" + description: Apply postprocessing heuristics + confidence_threshold: + type: + - number + - "null" + format: float + description: Confidence threshold override + preset: + type: + - string + - "null" + description: 'Preset: "fast" or "accurate"' + Metadata: + type: object + description: Document metadata — aligned with kreuzberg::Metadata. 
+ properties: + abstract_text: + type: + - string + - "null" + additional: + type: object + additionalProperties: {} + propertyNames: + type: string + authors: + type: array + items: + type: string + category: + type: + - string + - "null" + created_at: + type: + - string + - "null" + created_by: + type: + - string + - "null" + document_version: + type: + - string + - "null" + extraction_duration_ms: + type: + - integer + - "null" + format: int64 + minimum: 0 + keywords: + type: array + items: + type: string + language: + type: + - string + - "null" + modified_at: + type: + - string + - "null" + modified_by: + type: + - string + - "null" + output_format: + type: + - string + - "null" + pages: + oneOf: + - type: "null" + - $ref: '#/components/schemas/PageStructure' + subject: + type: + - string + - "null" + tags: + type: array + items: + type: string + title: + type: + - string + - "null" + OcrConfig: + type: object + description: OCR backend and language configuration. + properties: + auto_rotate: + type: + - boolean + - "null" + description: Automatic page rotation detection + backend: + type: + - string + - "null" + description: |- + OCR backend: only "tesseract" is supported. Any other value + (`easyocr`, `paddleocr`, `vlm`, …) is rejected at the API. 
+ element_config: + oneOf: + - type: "null" + - $ref: '#/components/schemas/OcrElementConfig' + description: Structured OCR element extraction + language: + type: + - string + - "null" + description: Language code (e.g., "eng", "deu", "fra") + output_format: + type: + - string + - "null" + description: OCR output format override + pipeline: + oneOf: + - type: "null" + - $ref: '#/components/schemas/OcrPipelineConfig' + description: Multi-backend OCR pipeline with quality-based fallback + quality_thresholds: + oneOf: + - type: "null" + - $ref: '#/components/schemas/OcrQualityThresholds' + description: Quality thresholds for OCR fallback decisions + tesseract_config: + description: Tesseract-specific configuration (flexible JSON with 20+ fields) + vlm_config: + description: VLM (Vision Language Model) configuration + vlm_prompt: + type: + - string + - "null" + description: Custom Jinja2 prompt template for VLM OCR + OcrElementConfig: + type: object + description: Structured OCR element extraction configuration. + properties: + build_hierarchy: + type: boolean + description: Build parent-child relationships + include_elements: + type: boolean + description: Include OCR elements in result + min_confidence: + type: + - number + - "null" + format: double + description: Minimum recognition confidence (0.0-1.0) + min_level: + type: + - string + - "null" + description: 'Minimum hierarchical level: "word", "line", "block", "page"' + OcrPipelineConfig: + type: object + description: Multi-backend OCR pipeline with quality-based fallback. + properties: + quality_thresholds: + oneOf: + - type: "null" + - $ref: '#/components/schemas/OcrQualityThresholds' + description: Quality evaluation thresholds for fallback decisions + stages: + type: array + items: + $ref: '#/components/schemas/OcrPipelineStage' + description: Ordered list of backends to try (sorted by priority descending) + OcrPipelineStage: + type: object + description: Single backend stage in the OCR pipeline. 
+ required: + - backend + properties: + backend: + type: string + description: |- + Backend name. Only "tesseract" is supported; any other value is + rejected at the API boundary. + language: + type: + - string + - "null" + description: Language override for this stage + priority: + type: integer + format: int32 + description: Priority (higher = tried first, default 100) + minimum: 0 + tesseract_config: + description: Tesseract config override for this stage + vlm_config: + description: VLM config override for this stage + OcrQualityThresholds: + type: object + description: Quality thresholds for OCR fallback decisions. + properties: + alnum_ws_ratio_threshold: + type: + - number + - "null" + format: double + critical_fragmented_word_ratio: + type: + - number + - "null" + format: double + max_fragmented_word_ratio: + type: + - number + - "null" + format: double + min_alnum_ratio: + type: + - number + - "null" + format: double + min_avg_word_length: + type: + - number + - "null" + format: double + min_consecutive_repeat_ratio: + type: + - number + - "null" + format: double + min_garbage_chars: + type: + - integer + - "null" + format: int64 + minimum: 0 + min_meaningful_word_len: + type: + - integer + - "null" + format: int64 + minimum: 0 + min_meaningful_words: + type: + - integer + - "null" + format: int64 + minimum: 0 + min_non_whitespace_per_page: + type: + - number + - "null" + format: double + min_total_non_whitespace: + type: + - integer + - "null" + format: int64 + minimum: 0 + min_words_for_avg_length_check: + type: + - integer + - "null" + format: int64 + minimum: 0 + min_words_for_repeat_check: + type: + - integer + - "null" + format: int64 + minimum: 0 + non_text_min_chars: + type: + - integer + - "null" + format: int64 + minimum: 0 + pipeline_min_quality: + type: + - number + - "null" + format: double + substantive_min_chars: + type: + - integer + - "null" + format: int64 + minimum: 0 + PageConfig: + type: object + description: Page extraction 
configuration. + properties: + extract_pages: + type: boolean + description: Extract pages as separate array in result + insert_page_markers: + type: boolean + description: Insert page markers in content text + marker_format: + type: + - string + - "null" + description: Page marker format template (e.g., "\n\n\n\n") + PageContent: + type: object + description: Per-page content. + required: + - page_number + properties: + content: + type: string + description: Extracted text for this page + is_blank: + type: + - boolean + - "null" + description: Whether the page is blank + page_number: + type: integer + format: int32 + description: Page number (0-indexed) + minimum: 0 + PageStructure: + type: object + description: Page structure metadata. + required: + - total_count + properties: + total_count: + type: integer + format: int32 + minimum: 0 + PdfConfig: + type: object + description: PDF-specific extraction options. + properties: + bottom_margin_fraction: + type: + - number + - "null" + format: float + description: Bottom margin fraction to skip (0.0-1.0) + extract_annotations: + type: + - boolean + - "null" + description: Extract PDF annotations + extract_images: + type: + - boolean + - "null" + description: Extract images from PDF + extract_metadata: + type: + - boolean + - "null" + description: Extract PDF metadata + hierarchy: + oneOf: + - type: "null" + - $ref: '#/components/schemas/HierarchyConfig' + description: Heading hierarchy detection + passwords: + type: + - array + - "null" + items: + type: string + description: PDF passwords to try + top_margin_fraction: + type: + - number + - "null" + format: float + description: Top margin fraction to skip (0.0-1.0) + PostProcessorConfig: + type: object + description: Post-processing pipeline configuration. 
+ properties: + disabled_processors: + type: + - array + - "null" + items: + type: string + description: Blacklist of processors to disable (None = none) + enabled: + type: + - boolean + - "null" + description: Enable post-processors + enabled_processors: + type: + - array + - "null" + items: + type: string + description: Whitelist of processors to enable (None = all) + PresignDocumentInput: + type: object + description: Document metadata for presigned upload (no bytes) + required: + - filename + - mime_type + properties: + config: + oneOf: + - type: "null" + - $ref: '#/components/schemas/FileExtractionConfig' + description: |- + Per-file extraction config override. + Merged with batch-level `config` at confirm time. None = use batch default. + filename: + type: string + description: Original filename + mime_type: + type: string + description: MIME type of the document + PresignUploadRequest: + type: object + description: Request body for presigning upload URLs + required: + - documents + properties: + config: + oneOf: + - type: "null" + - $ref: '#/components/schemas/ExtractionConfig' + description: Batch-level extraction configuration (applied to all documents) + documents: + type: array + items: + $ref: '#/components/schemas/PresignDocumentInput' + description: Document metadata (no file data) + webhook: + oneOf: + - type: "null" + - $ref: '#/components/schemas/WebhookConfig' + description: Webhook configuration for async result delivery + PresignUploadResponse: + type: object + description: Response from presign endpoint + required: + - batch_id + - uploads + properties: + batch_id: + type: string + description: Batch ID — pass this to the confirm endpoint + uploads: + type: array + items: + $ref: '#/components/schemas/PresignedUploadInfo' + description: Per-document upload URLs + PresignedUploadInfo: + type: object + description: Info about a single presigned upload + required: + - job_id + - upload_url + - object_key + - method + - expires_in_secs + 
properties: + expires_in_secs: + type: integer + format: int64 + description: Seconds until the URL expires + minimum: 0 + job_id: + type: string + description: Job ID for this document + method: + type: string + description: HTTP method to use (PUT) + object_key: + type: string + description: Storage object key + upload_url: + type: string + description: Presigned URL to upload the document to + ProcessingWarning: + type: object + description: Non-fatal processing warning. + properties: + message: + type: string + description: Warning message + source: + type: string + description: Warning source (e.g., "ocr", "pdf", "chunking") + ReadinessChecks: + type: object + description: Individual dependency check results + required: + - database + - nats + properties: + database: + type: string + description: Database connectivity + example: ok + nats: + type: string + description: NATS connectivity + example: ok + ReadinessResponse: + type: object + description: Readiness check response with dependency status + required: + - status + - checks + properties: + checks: + $ref: '#/components/schemas/ReadinessChecks' + description: Dependency check results + status: + type: string + description: Overall readiness status + example: ready + Row: + type: object + description: |- + Table row — a list of cell string values. + + Kreuzberg serializes cells as a nested array (each row is `["A", "B"]`). + This type accepts both array and object formats via a custom deserializer. + required: + - values + properties: + values: + type: array + items: + type: string + Table: + type: object + description: Extracted table. 
+ properties: + bounding_box: + oneOf: + - type: "null" + - $ref: '#/components/schemas/BoundingBox' + description: Bounding box on the page + cells: + type: array + items: + $ref: '#/components/schemas/Row' + description: 2D grid of cells (rows of string values) + markdown: + type: string + description: Markdown representation of the table + page_number: + type: integer + format: int32 + description: Page number (0-indexed) + minimum: 0 + TokenReductionConfig: + type: object + description: Token reduction configuration for LLM consumption. + properties: + mode: + type: + - string + - "null" + description: 'Reduction mode: "off", "light", "moderate", "aggressive", "maximum"' + preserve_important_words: + type: + - boolean + - "null" + description: Preserve capitalized/technical terms + UsageByMimeType: + type: object + description: Usage statistics broken down by MIME type. + required: + - documents + - pages + - failed + properties: + documents: + type: integer + format: int32 + description: Number of documents of this MIME type processed + failed: + type: integer + format: int32 + description: Number of failed extractions for this MIME type + pages: + type: integer + format: int64 + description: Total pages extracted from documents of this MIME type + UsageResponse: + type: object + description: Current period usage and quota information. 
+      required:
+        - period_start
+        - period_end
+        - total_pages
+        - total_documents
+        - total_failed
+        - by_mime_type
+      properties:
+        by_mime_type:
+          type: object
+          description: Usage breakdown by MIME type
+          additionalProperties:
+            $ref: '#/components/schemas/UsageByMimeType'
+          propertyNames:
+            type: string
+        period_end:
+          type: string
+          description: End of the reporting period (ISO 8601 date)
+        period_start:
+          type: string
+          description: Start of the reporting period (ISO 8601 date)
+        quota_limit:
+          type:
+            - integer
+            - "null"
+          format: int64
+          description: Monthly page quota limit (null = unlimited)
+        quota_remaining:
+          type:
+            - integer
+            - "null"
+          format: int64
+          description: Remaining pages in monthly quota (null = unlimited)
+        total_documents:
+          type: integer
+          format: int32
+          description: Total documents processed in the period
+        total_failed:
+          type: integer
+          format: int32
+          description: Total failed extractions in the period
+        total_pages:
+          type: integer
+          format: int64
+          description: Total pages extracted in the period
+    WebhookConfig:
+      type: object
+      description: Webhook configuration for async delivery
+      required:
+        - url
+      properties:
+        metadata:
+          type:
+            - object
+            - "null"
+          description: Optional key-value metadata to include in the webhook payload
+          additionalProperties:
+            type: string
+          propertyNames:
+            type: string
+        secret:
+          type:
+            - string
+            - "null"
+          description: Optional HMAC secret for signing the webhook payload
+        url:
+          type: string
+          description: URL to deliver results to
   securitySchemes:
     bearer_auth:
       type: http
       scheme: bearer
+security:
+  - bearer_auth: []
 tags:
-  - name: health
-    description: Health check endpoints
-  - name: auth
-    description: Authentication endpoints
-  - name: projects
-    description: Project management
-  - name: members
-    description: Project members management
-  - name: api_keys
-    description: API key management
-  - name: webhooks
-    description: Webhook management
-  - name: billing
-    description: Billing and quota information
-  - name: usage
-    description: Usage statistics
-  - name: analytics
-    description: Rich analytics data
-  - name: sandbox
-    description: Sandbox extraction playground
+  - name: extract
+    description: Document extraction endpoints
+  - name: jobs
+    description: Job status endpoints
+  - name: uploads
+    description: Presigned upload endpoints
+  - name: usage
+    description: Usage and billing endpoints
+  - name: health
+    description: Health check endpoints
+externalDocs:
+  url: https://docs.kreuzberg.cloud
+  description: API reference and documentation
diff --git a/tasks/go.yml b/tasks/go.yml
index 8e778ec..2dc4841 100644
--- a/tasks/go.yml
+++ b/tasks/go.yml
@@ -7,19 +7,24 @@ vars:
 
 tasks:
   generate:
-    desc: "Regenerate Go client from spec/openapi.yaml (downconverts 3.1 → 3.0 first)"
+    desc: "Regenerate Go client from spec/openapi.yaml (BLOCKED: oapi-codegen 3.1 support — see Workstream E-go)"
     cmds:
-      # oapi-codegen does not yet support OpenAPI 3.1 (oapi-codegen/oapi-codegen#373).
-      # Downconvert the vendored 3.1 spec to 3.0 first, then generate.
+      # STATUS: Go codegen is blocked on upstream oapi-codegen support for
+      # OpenAPI 3.1 (oapi-codegen/oapi-codegen#373). The kreuzberg-cloud
+      # public API spec is 3.1 (utoipa default) and uses 3.1 nullable
+      # unions inside oneOfs (`oneOf: [{type: null}, $ref]`) which the
+      # downconverter strips lossily and oapi-codegen v2.7 still chokes
+      # on. Until 3.1 lands upstream, the Go SDK ships a hand-written
+      # interim client (Workstream E-go) instead of generated bindings.
       #
-      # NOTE: kreuzberg-cloud's spec uses 3.1 nullable unions inside oneOfs
-      # (`oneOf: [{type: null}, $ref]`). oapi-codegen v2.7 chokes on these.
-      # Generation is blocked until upstream emits 3.0 or oapi-codegen 3.1
-      # support lands. The hand-written client.go remains buildable in the
-      # meantime.
-      - npx --yes @apiture/openapi-down-convert -i spec/openapi.yaml -o /tmp/kreuzberg-cloud-openapi-3.0.yaml
-      - cd {{.PKG_DIR}} && go tool oapi-codegen -config oapi-codegen.yaml /tmp/kreuzberg-cloud-openapi-3.0.yaml
-      - cd {{.PKG_DIR}} && go mod tidy
+      # When unblocking, replace this block with the previous downconvert+
+      # codegen flow:
+      #   npx --yes @apiture/openapi-down-convert -i spec/openapi.yaml \
+      #     -o /tmp/kreuzberg-cloud-openapi-3.0.yaml
+      #   cd {{.PKG_DIR}} && go tool oapi-codegen -config oapi-codegen.yaml \
+      #     /tmp/kreuzberg-cloud-openapi-3.0.yaml
+      #   cd {{.PKG_DIR}} && go mod tidy
+      - echo "Go codegen is blocked on oapi-codegen 3.1 support (Workstream E-go); skipping."
 
   install:
     desc: "Download Go module dependencies"
diff --git a/tasks/spec.yml b/tasks/spec.yml
index 766e8e5..1a8aa0a 100644
--- a/tasks/spec.yml
+++ b/tasks/spec.yml
@@ -1,29 +1,35 @@
 version: "3"
 
 # Tasks for managing the vendored OpenAPI specification.
+#
+# Source of truth is the public extraction API spec emitted by
+# kreuzberg-cloud/services/api (utoipa-generated). The sibling cloud repo
+# commits the JSON snapshot at services/api/spec/openapi.json; we vendor
+# it here as YAML to match the codegen toolchain's preferred input format.
 
 vars:
   SPEC_PATH: spec/openapi.yaml
-  UPSTREAM_LOCAL: ../kreuzberg-cloud/frontend/openapi-backend.yaml
-  UPSTREAM_URL: '{{.KREUZBERG_CLOUD_URL | default "http://localhost:8080"}}/api-doc/openapi.json'
+  UPSTREAM_LOCAL: ../kreuzberg-cloud/services/api/spec/openapi.json
+  UPSTREAM_URL: '{{.KREUZBERG_CLOUD_URL | default "https://api.kreuzberg.cloud"}}/api-doc/openapi.json'
 
 tasks:
   fetch:
-    desc: "Refresh spec/openapi.yaml from sibling kreuzberg-cloud repo"
+    desc: "Refresh spec/openapi.yaml from sibling kreuzberg-cloud repo (JSON → YAML)"
     cmds:
-      - cp {{.UPSTREAM_LOCAL}} {{.SPEC_PATH}}
+      - yq -P -oy '.' {{.UPSTREAM_LOCAL}} > {{.SPEC_PATH}}
       - git diff --stat {{.SPEC_PATH}} || true
 
   fetch:remote:
     desc: "Refresh spec/openapi.yaml from a running kreuzberg-cloud API server"
     cmds:
-      - curl -fsSL {{.UPSTREAM_URL}} | python3 -c "import sys, yaml, json; yaml.safe_dump(json.load(sys.stdin), sys.stdout, sort_keys=False)" > {{.SPEC_PATH}}
+      # pipefail: a failed download must fail the task instead of relying on how yq treats empty input.
+      - set -o pipefail; curl -fsSL {{.UPSTREAM_URL}} | yq -P -oy '.' > {{.SPEC_PATH}}
       - git diff --stat {{.SPEC_PATH}} || true
 
   check:
     desc: "Fail if vendored spec drifted from sibling kreuzberg-cloud copy"
     cmds:
-      - diff -q {{.SPEC_PATH}} {{.UPSTREAM_LOCAL}}
+      - diff -q <(yq -P -oy '.' {{.UPSTREAM_LOCAL}}) {{.SPEC_PATH}}
 
   show:
     desc: "Print the current vendored spec path"
diff --git a/tasks/version.yml b/tasks/version.yml
index c195335..2c2b90b 100644
--- a/tasks/version.yml
+++ b/tasks/version.yml
@@ -26,3 +26,52 @@ tasks:
         fi
         echo "{{.CLI_ARGS}}" > {{.VERSION_FILE}}
       - task: sync
+
+  set:
+    desc: "Set an explicit version and propagate (usage: task version:set -- 0.1.0)"
+    requires:
+      vars: [CLI_ARGS]
+    cmds:
+      - |
+        if [ -z "{{.CLI_ARGS}}" ]; then
+          echo "usage: task version:set -- <version>" >&2
+          exit 1
+        fi
+        echo "{{.CLI_ARGS}}" > {{.VERSION_FILE}}
+      - task: sync
+
+  bump:patch:
+    desc: "Bump patch (X.Y.Z -> X.Y.(Z+1)) and propagate"
+    cmds:
+      # Hand the version to Python as argv instead of splicing it into the
+      # program text; a non X.Y.Z value aborts before the file is rewritten.
+      - |
+        current=$(cat {{.VERSION_FILE}})
+        next=$(python3 -c "import re,sys;m=re.match(r'^(\d+)\.(\d+)\.(\d+)',sys.argv[1]) or sys.exit('not an X.Y.Z version: '+sys.argv[1]);print(f'{m[1]}.{m[2]}.{int(m[3])+1}')" "$current") || exit 1
+        echo "$next" > {{.VERSION_FILE}}
+        echo "bumped $current -> $next"
+      - task: sync
+
+  bump:minor:
+    desc: "Bump minor (X.Y.Z -> X.(Y+1).0) and propagate"
+    cmds:
+      # Hand the version to Python as argv instead of splicing it into the
+      # program text; a non X.Y.Z value aborts before the file is rewritten.
+      - |
+        current=$(cat {{.VERSION_FILE}})
+        next=$(python3 -c "import re,sys;m=re.match(r'^(\d+)\.(\d+)\.(\d+)',sys.argv[1]) or sys.exit('not an X.Y.Z version: '+sys.argv[1]);print(f'{m[1]}.{int(m[2])+1}.0')" "$current") || exit 1
+        echo "$next" > {{.VERSION_FILE}}
+        echo "bumped $current -> $next"
+      - task: sync
+
+  bump:major:
+    desc: "Bump major (X.Y.Z -> (X+1).0.0) and propagate"
+    cmds:
+      # Hand the version to Python as argv instead of splicing it into the
+      # program text; a non X.Y.Z value aborts before the file is rewritten.
+      - |
+        current=$(cat {{.VERSION_FILE}})
+        next=$(python3 -c "import re,sys;m=re.match(r'^(\d+)\.(\d+)\.(\d+)',sys.argv[1]) or sys.exit('not an X.Y.Z version: '+sys.argv[1]);print(f'{int(m[1])+1}.0.0')" "$current") || exit 1
+        echo "$next" > {{.VERSION_FILE}}
+        echo "bumped $current -> $next"
+      - task: sync