Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9167790
docs: development plan
caviri Nov 13, 2025
d9c7485
feat: remove bots from visualization
caviri Nov 19, 2025
f8a361e
feat: dependents & dependencies
caviri Nov 19, 2025
89bcf6b
feat: new nodes properties
caviri Nov 19, 2025
56be2a4
bug: cache dependency
caviri Nov 21, 2025
e824b8e
feat: add EPFL prioritization and JSON ID fixing scripts
caviri Feb 24, 2026
8a979ea
Merge pull request #4 from sdsc-ordes/feat--dependency_graph
caviri Feb 24, 2026
c100457
feat: add agent and contributor guidelines, REST API, and changelog
caviri Feb 24, 2026
1015321
feat: implement Bearer-token authentication with API_TOKEN support
caviri Feb 24, 2026
1764a03
feat: add multi-stage Dockerfile, .dockerignore, and deployment docum…
caviri Feb 24, 2026
5d8ef0e
feat: add Streamlit GUI for Open Pulse Crawler with token input and c…
caviri Feb 24, 2026
52af94b
feat: add Nginx reverse-proxy configuration for routing to FastAPI an…
caviri Feb 24, 2026
fd4cbff
feat: integrate Docker Compose for API, GUI, and Nginx with health ch…
caviri Mar 3, 2026
d22a5b8
feat: add tests for handling missing and invalid GitHub credentials i…
caviri Mar 3, 2026
9a0fbec
feat: add GitHub Actions workflow for building and publishing Docker …
caviri Mar 3, 2026
6e9d06e
feat: update project structure and enhance API with new crawl parameters
caviri Mar 3, 2026
d761774
feat: enhance testing setup and update dependencies
caviri Mar 3, 2026
a7bfd64
Merge pull request #5 from sdsc-ordes/feat-api
caviri Mar 3, 2026
e0099d5
refactor: update Dockerfile for multi-stage build and runtime optimiz…
caviri May 2, 2026
0511fa6
feat: implement optional gimie JSON-LD hybrid repository fetching and…
caviri Mar 25, 2026
d28186c
feat: configure development container with Docker Compose and environ…
caviri Mar 25, 2026
ea73b73
feat: enhance environment configuration and API functionality
caviri May 4, 2026
11f2afa
feat: introduce MkDocs documentation and enhance API job lifecycle ma…
caviri May 5, 2026
c69ee86
feat: implement `--max-contributors` skip rule for contributor expansion
caviri May 5, 2026
d8f34e0
Merge pull request #6 from sdsc-ordes/feat-gimie-driven-properties
caviri May 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .devcontainer/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copy to `.devcontainer/.env` for docker-compose variable substitution.
# Compose reads this file from the `.devcontainer/` directory (not repo-root `.env` for these keys).
#
# Host port mappings (optional):
# SSH_PORT=2220
# APP_PORT=1234
# DEV_PORT=8888
#
# DNS inside the container (optional; defaults are 1.1.1.1 + 8.8.8.8 in docker-compose.yml).
# Use your corporate resolvers if public DNS is blocked:
# DEVCONTAINER_DNS_1=10.0.0.1
# DEVCONTAINER_DNS_2=10.0.0.2
42 changes: 21 additions & 21 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
{
"name": "open-pulse-crawler",
"build": {
"dockerfile": "Dockerfile"
"name": "open-pulse-crawler-dev",
"dockerComposeFile": "docker-compose.yml",
"service": "devcontainer",
"workspaceFolder": "/workspaces/project",
"containerEnv": {
"UV_CACHE_DIR": "/workspaces/project/.uv-cache"
},
"overrideCommand": false,
"features": {
"ghcr.io/devcontainers/features/sshd:1": {
"version": "latest"
}
},
"runArgs": [
"--env-file",
"${localWorkspaceFolder}/.env",
"--network",
"dev"
],

// This is where your repo will be mounted inside the container
"remoteUser": "vscode",
"workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",

"customizations": {
"vscode": {
"settings": {
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
"python.envFile": "${workspaceFolder}/.env"
},
"settings": {
"python.defaultInterpreterPath": "${workspaceFolder}/.venv/bin/python",
"python.envFile": "${workspaceFolder}/.env"
},
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance",
"tamasfe.even-better-toml"
"tamasfe.even-better-toml",
"github.copilot",
"github.copilot-chat"
]
}
},

// Install project in editable mode after the container is built
"postCreateCommand": "rm -rf .venv && uv venv && uv pip install -e .[viz,dev] && echo '. $PWD/.venv/bin/activate' >> /home/vscode/.bashrc"
"postCreateCommand": "mkdir -p .uv-cache && rm -rf .venv && uv venv && uv pip install -e .[dev] && echo '. $PWD/.venv/bin/activate' >> /home/vscode/.bashrc",
"postStartCommand": "bash .devcontainer/set-vscode-password.sh"
}
35 changes: 35 additions & 0 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Compose stack backing the VS Code dev container.
#
# Ports are published by Compose itself rather than through devcontainer
# `forwardPorts`, which is less reliable in some setups. The host-side port
# numbers and the DNS servers are interpolated from `.devcontainer/.env`
# (template: `.devcontainer/.env.example`).
services:
  devcontainer:
    build:
      context: ..
      dockerfile: .devcontainer/Dockerfile
    # Keep the container alive so the editor can attach to it.
    command: sleep infinity
    volumes:
      - "..:/workspaces/project:cached"
    env_file:
      - ../.env
    environment:
      # ~/.cache/uv is often root-owned after the sshd/common-utils features
      # run; the workspace bind mount is writable by `vscode`, so cache there.
      UV_CACHE_DIR: /workspaces/project/.uv-cache
    ports:
      # The devcontainers `sshd` feature listens on 2222 inside the container
      # (not 22), hence the fixed container-side port.
      - "${SSH_PORT:-2222}:2222"
      - "${APP_PORT:-1234}:1234"
      - "${DEV_PORT:-8888}:8888"
    # Containers attached to an external network such as `dev` sometimes get no
    # working resolver, which makes `uv pip` fail with "dns error" /
    # "failed to lookup address information". Pin resolvers explicitly;
    # override with DEVCONTAINER_DNS_1 / DEVCONTAINER_DNS_2.
    dns:
      - "${DEVCONTAINER_DNS_1:-1.1.1.1}"
      - "${DEVCONTAINER_DNS_2:-8.8.8.8}"
    networks:
      - dev

networks:
  # Pre-existing network; Compose attaches to it instead of creating it.
  dev:
    external: true
8 changes: 8 additions & 0 deletions .devcontainer/set-vscode-password.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Set the password of the `vscode` user from VSCODE_PASSWORD at container start,
# so the secret is never baked into the image.
# Define VSCODE_PASSWORD in the repo-root `.env`; the dev container receives it
# through the `env_file: ../.env` entry in `.devcontainer/docker-compose.yml`.
set -euo pipefail

# Nothing to do when the variable is unset or empty.
[[ -n "${VSCODE_PASSWORD:-}" ]] || exit 0

# chpasswd reads `user:password` lines from stdin.
printf 'vscode:%s\n' "$VSCODE_PASSWORD" | sudo chpasswd
30 changes: 30 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Build context denylist: exclude everything the multi-stage build does not need.
# The build is expected to use only src/, pyproject.toml, uv.lock and README.md;
# note that any path NOT matched by a pattern below is still sent to the daemon.

**/.git
**/.gitignore
**/.gitattributes
**/__pycache__/
**/*.pyc
**/*.pyo
**/.pytest_cache/
**/.mypy_cache/
**/.ruff_cache/
**/.venv/
**/.env
**/.env.*
!**/.env.dist
.devcontainer/
.github/
.idea/
.vscode/
docs/
tests/
infra/
tools/scripts/
*.md
!README.md
*.log
*.sh
test_incremental.sh
Dockerfile.local
76 changes: 75 additions & 1 deletion .env.dist
Original file line number Diff line number Diff line change
@@ -1 +1,75 @@
GITHUB_TOKEN=
# Open Pulse Crawler — environment template
#
# Copy to `.env` at the repo root, then, from the repo root, either
# (`--env-file` is resolved relative to the current working directory):
# • API only: `docker compose -f infra/docker-compose.yml --env-file .env up -d`
# → FastAPI on http://localhost:${OPC_API_PORT:-8000}
# • API + GUI + Nginx: `docker compose -f infra/docker-compose.yml --env-file .env --profile gui up -d`
# → Browser GUI on http://localhost:${OPC_PORT:-80}
# • or run the API/CLI directly via `uv run` after exporting the same vars.
#
# All variables marked REQUIRED must be set; everything else has sensible
# defaults baked into either the compose file or the application code.

# ----- REQUIRED: API authentication ----------------------------------------
#
# Bearer token clients must send to authenticated endpoints.
# Generate one with:
# python -c 'import secrets; print(secrets.token_urlsafe(32))'
API_TOKEN=

# ----- REQUIRED: GitHub access ---------------------------------------------
#
# GitHub Personal Access Token used by the BFS crawler AND by the gimie
# hybrid path when enabled.
#
# Comma-separated for multi-token rotation:
# GITHUB_TOKEN=ghp_aaa…,ghp_bbb…,ghp_ccc…
#
# Required scopes:
# read:org — required by gimie's GraphQL query (org avatarUrl/name/etc).
# Without it the hybrid gimie path returns INSUFFICIENT_SCOPES.
# read:user — recommended; gimie reads contributor profiles.
# public_repo — required for crawling public repos (or `repo` for private).
#
# A classic PAT scoped read:org + read:user + public_repo is the simplest
# choice. Fine-grained PATs work too but need org-level approval.
GITHUB_TOKEN=

# ----- Optional: data directory --------------------------------------------
#
# Where the API stores per-job artifacts (JSON-LD payloads, archives).
# Defaults to /tmp/open-pulse-crawler inside the container; change only if
# you mount a persistent volume.
# OPC_DATA_DIR=/tmp/open-pulse-crawler

# ----- Optional: GUI password (only relevant with --profile gui) -----------
#
# Used by the Streamlit GUI to gate the browser interface. If unset, the
# GUI shows a visible warning and stays open. Pick anything non-trivial;
# generate one with (paste the output, dotenv files do not evaluate `$(...)`):
# python -c 'import secrets; print(secrets.token_urlsafe(16))'
# GUI_PASSWORD=

# ----- Optional: gimie hybrid (deployment-time, server-side) ---------------
#
# When enabled, the crawler enriches repository nodes from a gimie JSON-LD
# service (users/orgs still come from GitHub via PyGithub). These are server
# concerns, not per-request — clients of /api/v1/crawl don't see or pass them.
#
# Truthy values: true, 1, yes, on (case-insensitive). Anything else: false.
# GIMIE_ENABLED=false
# GIMIE_API_BASE=http://host.docker.internal:1234
# GIMIE_STORE_JSONLD=false
# GIMIE_SKIP_EXISTING_JSONLD=false

# ----- Optional: compose-only knobs ----------------------------------------
#
# These are read by infra/docker-compose.yml, not by the Python code.
#
# Override the published image tag (defaults to :latest on GHCR).
# OPC_IMAGE=ghcr.io/sdsc-ordes/open-pulse-crawler:1.0.0
#
# Host port for the FastAPI service (default profile, defaults to 8000).
# OPC_API_PORT=8000
#
# Host port for the Nginx reverse proxy (only with --profile gui, defaults to 80).
# OPC_PORT=8080
72 changes: 72 additions & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Builds the MkDocs site for docs-related pushes and pull requests, and
# publishes it to GitHub Pages for non-PR runs on `main`.
name: Docs

on:
  push:
    branches: ["main", "develop"]
    paths:
      - "docs/**"
      - "mkdocs.yml"
      - "pyproject.toml"
      - ".github/workflows/docs.yaml"
  pull_request:
    branches: ["main", "develop"]
    paths:
      - "docs/**"
      - "mkdocs.yml"
      - "pyproject.toml"
      - ".github/workflows/docs.yaml"
  workflow_dispatch:

# Required by actions/deploy-pages.
permissions:
  contents: read
  pages: write
  id-token: write

# Avoid stomping on each other if pushes land back-to-back on the same ref.
# The ref is part of the group name on purpose: GitHub keeps at most one
# *pending* run per concurrency group and cancels the older pending one, so a
# single shared group would let a PR or `develop` build evict a queued `main`
# deploy. Runs for `main` still share one group and stay serialized.
concurrency:
  group: "pages-${{ github.ref }}"
  cancel-in-progress: false

jobs:
  build:
    name: Build MkDocs site
    runs-on: ubuntu-latest
    env:
      # Pin the interpreter uv uses independently of the setup action.
      # NOTE(review): `python-version` may not be a recognised input on the v4
      # major of astral-sh/setup-uv (unknown inputs are ignored with a
      # warning) - confirm, then drop whichever of the two pins is redundant.
      UV_PYTHON: "3.12"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # mkdocs-material uses git for "last updated" metadata

      - name: Set up uv
        uses: astral-sh/setup-uv@v4
        with:
          python-version: "3.12"

      - name: Install docs dependencies
        run: uv sync --extra docs

      - name: Build site (strict)
        # --strict turns warnings (broken links, missing nav targets, …) into
        # errors so we fail fast in CI rather than ship a broken site.
        run: uv run mkdocs build --strict --site-dir site

      - name: Upload Pages artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: site

  deploy:
    name: Deploy to GitHub Pages
    # Only deploy on pushes to the default branch — PR builds validate but
    # do not publish. workflow_dispatch on main also publishes.
    if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
    needs: build
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
Loading
Loading