Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .devcontainer/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copy to `.devcontainer/.env` for docker-compose variable substitution.
# Compose reads this file from the `.devcontainer/` directory (not repo-root `.env` for these keys).
#
# Host port mappings (optional):
# SSH_PORT=2220
# APP_PORT=1234
# DEV_PORT=8888
#
# DNS inside the container (optional; defaults are 1.1.1.1 + 8.8.8.8 in docker-compose.yml).
# Use your corporate resolvers if public DNS is blocked:
# DEVCONTAINER_DNS_1=10.0.0.1
# DEVCONTAINER_DNS_2=10.0.0.2
42 changes: 21 additions & 21 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
{
"name": "open-pulse-crawler",
"build": {
"dockerfile": "Dockerfile"
"name": "open-pulse-crawler-dev",
"dockerComposeFile": "docker-compose.yml",
"service": "devcontainer",
"workspaceFolder": "/workspaces/project",
"containerEnv": {
"UV_CACHE_DIR": "/workspaces/project/.uv-cache"
},
"overrideCommand": false,
"features": {
"ghcr.io/devcontainers/features/sshd:1": {
"version": "latest"
}
},
"runArgs": [
"--env-file",
"${localWorkspaceFolder}/.env",
"--network",
"dev"
],

// This is where your repo will be mounted inside the container
"remoteUser": "vscode",
"workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",

"customizations": {
"vscode": {
"settings": {
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
"python.envFile": "${workspaceFolder}/.env"
},
"settings": {
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
"python.envFile": "${workspaceFolder}/.env"
},
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance",
"tamasfe.even-better-toml"
"tamasfe.even-better-toml",
"github.copilot",
"github.copilot-chat"
]
}
},

// Install project in editable mode after the container is built
"postCreateCommand": "rm -rf .venv && uv venv && uv pip install -e .[viz,dev] && echo '. $PWD/.venv/bin/activate' >> /home/vscode/.bashrc"
"postCreateCommand": "mkdir -p .uv-cache && rm -rf .venv && uv venv && uv pip install -e .[dev] && echo '. $PWD/.venv/bin/activate' >> /home/vscode/.bashrc",
"postStartCommand": "bash .devcontainer/set-vscode-password.sh"
}
35 changes: 35 additions & 0 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Compose stack behind the dev container. Ports are published by Compose on the
# host (more reliable than devcontainer forwardPorts in some setups).
#
# Two different env files are involved:
#   * `.devcontainer/.env` (see `.devcontainer/.env.example`) supplies the
#     `${...}` interpolation variables used below: SSH_PORT, APP_PORT, DEV_PORT,
#     DEVCONTAINER_DNS_1, DEVCONTAINER_DNS_2.
#   * `../.env` (repo root) is injected into the container via `env_file`.
#
# SSH: the devcontainers `sshd` feature listens on 2222 inside the container,
# not 22, so the host port is mapped to container port 2222.
#
# DNS: containers on external networks (e.g. `dev`) sometimes get no working
# resolver, and `uv pip` then fails with "dns error" / "failed to lookup address
# information". Resolvers are therefore set explicitly and can be overridden
# with DEVCONTAINER_DNS_1 / DEVCONTAINER_DNS_2.
services:
  devcontainer:
    build:
      context: ..
      dockerfile: .devcontainer/Dockerfile
    command: ["sleep", "infinity"]
    networks:
      - dev
    dns:
      - '${DEVCONTAINER_DNS_1:-1.1.1.1}'
      - '${DEVCONTAINER_DNS_2:-8.8.8.8}'
    ports:
      - '${SSH_PORT:-2222}:2222'
      - '${APP_PORT:-1234}:1234'
      - '${DEV_PORT:-8888}:8888'
    env_file: ../.env
    environment:
      # Keep uv's cache out of ~/.cache/uv (often root-owned after
      # sshd/common-utils); the workspace is bind-mounted as vscode.
      - UV_CACHE_DIR=/workspaces/project/.uv-cache
    volumes:
      - ..:/workspaces/project:cached

networks:
  dev:
    external: true
8 changes: 8 additions & 0 deletions .devcontainer/set-vscode-password.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Set the login password of user `vscode` from $VSCODE_PASSWORD at container
# start, so the secret is never baked into the image.
#
# Define VSCODE_PASSWORD in the repo-root `.env`; the dev container service
# receives that file through `env_file: ../.env` in
# `.devcontainer/docker-compose.yml`.
#
# When VSCODE_PASSWORD is unset or empty the script exits 0 and changes nothing.
set -euo pipefail

password="${VSCODE_PASSWORD:-}"

# Nothing configured: leave the account as it is.
[[ -n "$password" ]] || exit 0

# chpasswd takes "user:password" records on stdin.
printf 'vscode:%s\n' "$password" | sudo chpasswd
75 changes: 74 additions & 1 deletion .env.dist
Original file line number Diff line number Diff line change
@@ -1,2 +1,75 @@
# Open Pulse Crawler — environment template
#
# Copy to `.env` at the repo root, then either:
# • API only: `docker compose -f infra/docker-compose.yml --env-file .env up -d`
# → FastAPI on http://localhost:${OPC_API_PORT:-8000}
# • API + GUI + Nginx: `docker compose -f infra/docker-compose.yml --env-file .env --profile gui up -d`
# → Browser GUI on http://localhost:${OPC_PORT:-80}
# • or run the API/CLI directly via `uv run` after exporting the same vars.
#
# All variables marked REQUIRED must be set; everything else has sensible
# defaults baked into either the compose file or the application code.

# ----- REQUIRED: API authentication ----------------------------------------
#
# Bearer token clients must send to authenticated endpoints.
# Generate one with:
# python -c 'import secrets; print(secrets.token_urlsafe(32))'
API_TOKEN=

# ----- REQUIRED: GitHub access ---------------------------------------------
#
# GitHub Personal Access Token used by the BFS crawler AND by the gimie
# hybrid path when enabled.
#
# Comma-separated for multi-token rotation:
# GITHUB_TOKEN=ghp_aaa…,ghp_bbb…,ghp_ccc…
#
# Required scopes:
# read:org — required by gimie's GraphQL query (org avatarUrl/name/etc).
# Without it the hybrid gimie path returns INSUFFICIENT_SCOPES.
# read:user — recommended; gimie reads contributor profiles.
# public_repo — required for crawling public repos (or `repo` for private).
#
# A classic PAT scoped read:org + read:user + public_repo is the simplest
# choice. Fine-grained PATs work too but need org-level approval.
GITHUB_TOKEN=
API_TOKEN=

# ----- Optional: data directory --------------------------------------------
#
# Where the API stores per-job artifacts (JSON-LD payloads, archives).
# Defaults to /tmp/open-pulse-crawler inside the container; change only if
# you mount a persistent volume.
# OPC_DATA_DIR=/tmp/open-pulse-crawler

# ----- Optional: GUI password (only relevant with --profile gui) -----------
#
# Used by the Streamlit GUI to gate the browser interface. If unset, the
# GUI shows a visible warning and stays open. Pick anything non-trivial.
# Docker Compose does not run `$(...)` command substitution in env files, so
# generate a value and paste it in literally:
# python -c 'import secrets; print(secrets.token_urlsafe(16))'
# GUI_PASSWORD=

# ----- Optional: gimie hybrid (deployment-time, server-side) ---------------
#
# When enabled, the crawler enriches repository nodes from a gimie JSON-LD
# service (users/orgs still come from GitHub via PyGithub). These are server
# concerns, not per-request — clients of /api/v1/crawl don't see or pass them.
#
# Truthy values: true, 1, yes, on (case-insensitive). Anything else: false.
# GIMIE_ENABLED=false
# GIMIE_API_BASE=http://host.docker.internal:1234
# GIMIE_STORE_JSONLD=false
# GIMIE_SKIP_EXISTING_JSONLD=false

# ----- Optional: compose-only knobs ----------------------------------------
#
# These are read by infra/docker-compose.yml, not by the Python code.
#
# Override the published image tag (defaults to :latest on GHCR).
# OPC_IMAGE=ghcr.io/sdsc-ordes/open-pulse-crawler:1.0.0
#
# Host port for the FastAPI service (default profile, defaults to 8000).
# OPC_API_PORT=8000
#
# Host port for the Nginx reverse proxy (only with --profile gui, defaults to 80).
# OPC_PORT=8080
72 changes: 72 additions & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
name: Docs

# Builds the MkDocs site for docs-related pushes and pull requests, and
# publishes it to GitHub Pages from `main` only.
on:
  push:
    branches: ["main", "develop"]
    paths:
      - "docs/**"
      - "mkdocs.yml"
      - "pyproject.toml"
      - ".github/workflows/docs.yaml"
  pull_request:
    branches: ["main", "develop"]
    paths:
      - "docs/**"
      - "mkdocs.yml"
      - "pyproject.toml"
      - ".github/workflows/docs.yaml"
  workflow_dispatch:

# Least privilege by default: the build job also runs for pull requests and
# only needs to read the repository. The deploy job opts in to the extra
# scopes that actions/deploy-pages requires.
permissions:
  contents: read

# Avoid stomping on each other if pushes land back-to-back. The group is
# per-ref so that pull-request and `develop` builds never share a queue with
# `main`: in a shared group a newly queued run cancels whichever run is
# already pending, which could drop a pending `main` deployment.
concurrency:
  group: pages-${{ github.ref }}
  cancel-in-progress: false

jobs:
  build:
    name: Build MkDocs site
    runs-on: ubuntu-latest
    env:
      # uv reads UV_PYTHON to pick the interpreter; set at job level so that
      # both `uv sync` and `uv run` below use the same version.
      # NOTE(review): `python-version` as a setup-uv input appears to have been
      # introduced after v4 — confirm before moving the pin back to `with:`.
      UV_PYTHON: "3.12"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # mkdocs-material uses git for "last updated" metadata

      - name: Set up uv
        uses: astral-sh/setup-uv@v4

      - name: Install docs dependencies
        run: uv sync --extra docs

      - name: Build site (strict)
        # --strict turns warnings (broken links, missing nav targets, …) into
        # errors so we fail fast in CI rather than ship a broken site.
        run: uv run mkdocs build --strict --site-dir site

      - name: Upload Pages artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: site

  deploy:
    name: Deploy to GitHub Pages
    # Only deploy on pushes to the default branch — PR builds validate but
    # do not publish. workflow_dispatch on main also publishes.
    if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
    needs: build
    runs-on: ubuntu-latest
    # Required by actions/deploy-pages; granted to this job only.
    permissions:
      pages: write
      id-token: write
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
26 changes: 26 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- `--max-contributors` skip rule (CLI flag, REST API field on `POST /api/v1/crawl`, Streamlit GUI input under "Performance & filtering", and constructor argument on `GitHubCrawler`). When set, repos with more than N contributors stay in the graph but their contributor users are not queued for BFS expansion — owner / fork / dependency / dependent edges are unaffected. Useful for avoiding mega-projects (linux kernel, etc.) that would otherwise dominate the frontier.
- `RepoModel.contributor_count` and `RepoModel.skipped_high_contributors` fields. The total contributor count is captured the first time a repo is fetched (one cheap `per_page=1` request via `repo.get_contributors().totalCount`) and cached alongside the existing `repo:{full_name}` entry, so repeat crawls re-apply the skip rule with zero additional API calls.
- `GitHubClient.get_contributor_count(repo_full_name)` helper — cache-first lookup with a single live fallback on miss; it writes the count through to the cache so the next call is a cache hit.
- Streamlit GUI brought to parity with the REST API: form now exposes `crawl_dependencies`, `crawl_dependents`, `min_stars`, `max_dependents`, `max_contributors`, and `batch_size` (under collapsible "Dependency crawling" and "Performance & filtering" expanders, with sensible defaults that don't surface the params in the request body unless changed). Results panel now shows live progress (current round, nodes processed, queue size, ETA) for active jobs, includes pause / resume / cancel / delete controls gated by current status, and offers a graph download button once the job is `completed`. API errors are surfaced with their `detail` message instead of a generic HTTP error string.
- MkDocs documentation site (Material theme) at `mkdocs.yml`, with a `docs` extra group in `pyproject.toml` (`mkdocs`, `mkdocs-material`, `pymdown-extensions`). Rendered Mermaid diagrams via `pymdownx.superfences`: an architecture diagram on the home page, a job-lifecycle state diagram and a request-flow sequence diagram in `docs/API.md`, and a Compose-stack flow diagram in `docs/DEPLOYMENT.md`. Build verified locally with `mkdocs build --strict`.
- `docs/index.md` landing page introducing the project and linking into the existing docs.
- GitHub Action `.github/workflows/docs.yaml` that builds the site on push / PR to `main` / `develop` (with `--strict`) and deploys to GitHub Pages from `main` via `actions/deploy-pages`. **Requires Pages source = "GitHub Actions"** in repo Settings → Pages.
- Job lifecycle endpoints in the REST API:
- `GET /api/v1/jobs` — list every job in the in-memory registry, newest first, with optional `status_filter`.
- `POST /api/v1/crawl/{job_id}/pause` — pause at the next BFS round boundary.
- `POST /api/v1/crawl/{job_id}/resume` — resume a paused job.
- `POST /api/v1/crawl/{job_id}/cancel` — cooperative cancel; partial graph is preserved.
- `DELETE /api/v1/crawl/{job_id}` — drop a terminal job from the registry.
- Live progress fields on `GET /api/v1/crawl/{job_id}` for running jobs: `started_at`, `completed_at`, `current_round`, `nodes_processed`, `nodes_in_queue`, and a best-effort `estimated_completion_at`.
- `paused` and `cancelled` job-status values.
- Server-side gimie configuration via env vars: `GIMIE_ENABLED`, `GIMIE_API_BASE`, `GIMIE_STORE_JSONLD`, `GIMIE_SKIP_EXISTING_JSONLD`. When enabled, each crawled repo is enriched via a sibling git-metadata-extractor service and (optionally) JSON-LD payloads are persisted under `${OPC_DATA_DIR}/<job_id>/jsonld/`.
- Nginx reverse-proxy container (`infra/nginx/Dockerfile` + `infra/nginx/nginx.conf`): routes `/api/*` to FastAPI (port 8000) and `/` to Streamlit (port 8501) with WebSocket upgrade support.
- Streamlit placeholder GUI (`src/open_pulse_crawler/gui.py`) with token input sidebar, crawl form (seeds + BFS rounds), and live job-status results area.
- `streamlit` and `httpx` added to project dependencies.
Expand All @@ -29,16 +45,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `fastapi` and `uvicorn[standard]` added to project dependencies; `httpx` added to dev dependencies.
- API test suite (`tests/test_api.py`).
- API documentation (`docs/API.md`).
- Optional gimie JSON-LD hybrid repository fetching (initially as a `gimie_repos` request flag; superseded by env-var configuration — see `Added` above).
- Docker Compose stack (`infra/docker-compose.yml`) for `api`, `gui`, and `nginx` services on a shared network with env-file configuration and per-service health checks.
- End-to-end Docker integration test script (`tests/test_integration.sh`) validating health, auth behavior, and GUI/API routing through Nginx.

### Changed

- Fixed `infra/docker-compose.yml` nginx healthcheck: replaced `wget http://localhost/api/v1/health` with `wget http://127.0.0.1/api/v1/health`. Inside the Alpine container, `localhost` resolves to `::1` (IPv6) but `infra/nginx/nginx.conf` only declares `listen 80;` (IPv4-only), so the probe was getting "Connection refused" while external access worked fine. Stack went from `unhealthy` to healthy in 7s after recreate.
- Crawl export filenames: timestamp first, then kind — e.g. `YYYYMMDDHHMMSS.graph.json`, `YYYYMMDDHHMMSS.edges.csv`, `YYYYMMDDHHMMSS.nodes.csv`, `YYYYMMDDHHMMSS.graph.png`, directory `YYYYMMDDHHMMSS.clusters/`. Incremental round folders are `YYYYMMDDHHMMSS.round_NN/` with the same inner naming.
- Gimie JSON-LD: on success (HTTP 2xx) or when using an existing payload file with skip-existing, remove matching `jsonld_errors/<repo>.*.json` files for that repository.
- Gimie JSON-LD: log a short preview of the HTTP error response body when the gimie endpoint returns non-2xx (in addition to optional `jsonld_errors/` files).
- Gimie JSON-LD: `force_refresh=true` is always sent on HTTP fetches (not a CLI/API flag). `--gimie-skip-existing-jsonld` only checks for existing files under the crawl `jsonld/` output directory.
- Renamed `jsonld/*.json` export/processing filenames to include timestamp as `name.timestamp.json` (timestamp derived from crawl export time or filesystem metadata by the provided rename script).
- Expanded deployment docs in `docs/DEPLOYMENT.md` with Docker Compose setup, environment configuration, health verification, and integration test usage.
- Updated `README.md` with dedicated REST API and Docker/GUI quick-start sections and links to deployment/API docs.
- Completed API docs (`docs/API.md`) with reverse-proxy base URL notes and practical `curl` examples for crawl/status/graph flows.
- Extended `POST /api/v1/crawl` to accept CLI-aligned crawl controls: `crawl_dependencies`, `crawl_dependents`, `min_stars`, `max_dependents`, `batch_size`, and inline `epfl_entities`.
- Moved Docker infrastructure files into `infra/` and updated commands/scripts to use `docker compose -f infra/docker-compose.yml ...`.
- Updated compose services to pull the app container from `ghcr.io/sdsc-ordes/open-pulse-crawler:latest` by default (`OPC_IMAGE` override supported).
- Gimie hybrid extraction is now configured via environment variables (`GIMIE_ENABLED`, `GIMIE_API_BASE`, `GIMIE_STORE_JSONLD`, `GIMIE_SKIP_EXISTING_JSONLD`), not the per-request `gimie_repos` flag. Operators decide whether the gimie path is on; clients submitting crawls don't need to know.
- Cleaned up `docs/`: removed completion-report markdown (`*_COMPLETE`, `*_FIX_SUMMARY`, `*_IMPLEMENTATION`, `IMPROVEMENTS_SUMMARY`, `PLAN_*`, `QUICK_REFERENCE`, etc.) and the duplicate copies of files that already live under `docs/dev/dependency-graph/`. The remaining doc set is `API.md`, `DEPLOYMENT.md`, `CONCURRENCY.md`, `PROGRESS_TRACKING.md`, `TIMESTAMPS.md`, `VISUALIZATION.md`, plus `docs/dev/`. README's broken `RATE_LIMITING.md` link now points at `docs/CONCURRENCY.md`.
- Refreshed `docs/API.md` to match the current API surface (job list / pause / resume / cancel / delete, live progress fields with ETA, env-driven gimie config) and corrected the Dockerfile path in `docs/DEPLOYMENT.md` (`tools/image/Dockerfile`).

[Unreleased]: https://github.com/sdsc-ordes/open-pulse-crawler/compare/v0.1.0...HEAD
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,13 +225,14 @@ open-pulse-crawler crawl DeepLabCut/DeepLabCut \
- `--crawl-dependents`: Crawl upstream dependents ("Used by")
- `--min-stars`: Minimum stars for filtering dependents/dependencies (default: 0)
- `--max-dependents`: Maximum number of dependents to fetch (default: all)
- `--max-contributors`: Skip contributor expansion for repos with more than N contributors. The repo node still lands in the graph (with owner / fork / deps); only its contributors are not queued. Useful for avoiding mega-projects (e.g. linux kernel) that would dominate the BFS frontier. The total count is cached, so this is roughly free on re-crawls. Default: unlimited.

#### Rate Limiting Options (New!)
- `--request-delay`: Minimum delay in seconds between API requests (default: 0.0)
- `--max-concurrent`: Maximum number of concurrent API requests (default: 5)
- `--rate-limit-buffer`: Buffer of requests to keep before waiting (default: 50)

See [RATE_LIMITING.md](./RATE_LIMITING.md) for detailed guide on rate limiting and API management.
See [docs/CONCURRENCY.md](./docs/CONCURRENCY.md) for the detailed guide on concurrency, rate limiting, and multi-token rotation.

## Output Formats

Expand Down
Loading
Loading