From e396e06186ef8bb670422d90ecab43afeb42f1db Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Wed, 4 Mar 2026 09:12:58 -0800 Subject: [PATCH 01/11] =?UTF-8?q?docs:=20rename=20multitenancy=20=E2=86=92?= =?UTF-8?q?=20multiproject,=20update=20project.yml=20schema?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename design docs to multiproject.md / multiproject_implementation.md - Deprecate scope concept in favor of project - Add PocEnv project parameter alongside ProdEnv - New project.yml v4 schema: sites/admins/projects separation - admins section explicitly optional for SSO migration path - Add Phase 1 implementation section --- docs/design/multiproject.md | 476 ++++++++++++++++++++ docs/design/multiproject_implementation.md | 499 +++++++++++++++++++++ 2 files changed, 975 insertions(+) create mode 100644 docs/design/multiproject.md create mode 100644 docs/design/multiproject_implementation.md diff --git a/docs/design/multiproject.md b/docs/design/multiproject.md new file mode 100644 index 0000000000..d7de82bd93 --- /dev/null +++ b/docs/design/multiproject.md @@ -0,0 +1,476 @@ +# Multi-Project Support in Flare + +## Introduction + +Flare currently operates as a single-tenant system. All server and client processes run under the same Linux user, all jobs share a flat store (`jobs//`), and every authorized admin can see and act on every job. There is no data segregation between different collaborations running on the same infrastructure. + +To achieve genuine multi-tenancy, we introduce a **project** concept as the primary tenant boundary. A project encapsulates a private dataset, a set of participants (users and sites), an authorization policy, and runtime isolation. This document specifies the required changes across the full Flare stack. + +### Design Principles + +1. **Least privilege by default** — users see nothing outside their project(s) +2. 
**Defense in depth** — logical access control (authz) + physical isolation (containers/PVs) +3. **Backward compatible** — a `default` project preserves current single-tenant behavior +4. **`scope` deprecated** — the existing `scope` data-governance concept is superseded by `project`; `scope` will be removed in a future release +5. **Feature-gated** — all multitenancy behavior gated on `api_version: 4` in `project.yml`; single-tenant deployments see zero behavior change + + +--- + +## Project Model + +A project is a named, immutable tenant boundary with these properties: + +| Property | Description | +|----------|-------------| +| `name` | Unique identifier (e.g., `cancer-research`) | +| `clients` | Set of FL client sites enrolled in this project | +| `users` | Set of admin users with per-project roles | +| `authorization` | Per-project authorization policy | + +- Users are associated with one or more projects, each with an independent role. +- **Clients participate in all projects they are enrolled in simultaneously.** Data isolation on shared clients is achieved through the runtime environment: K8s jobs mount project-specific PVs, Docker jobs mount project-specific host directories. The Flare parent process on the client does not access project data directly. +- Jobs belong to exactly one project (immutable after submission). +- A `default` project exists for backward compatibility. + +--- + +## User Experience + +### Data Scientist (Recipe API) + +The recipe is unchanged. 
The project is specified via `ProdEnv` or `PocEnv`: + +```python +recipe = FedAvgRecipe( + name="hello-pt", + min_clients=n_clients, + num_rounds=num_rounds, + initial_model=SimpleNetwork(), + train_script=args.train_script, +) + +env = ProdEnv( + startup_kit_location=args.startup_kit_location, + project="cancer-research", +) +run = recipe.execute(env) +``` + +`PocEnv` supports the same parameter: + +```python +env = PocEnv( + poc_workspace=args.poc_workspace, + project="cancer-research", +) +run = recipe.execute(env) +``` + +If `project` is omitted in either env, the `default` project is used. + +### Admin (FLARE API / Admin Console) + +The `Session` gains a project context: + +```python +sess = new_secure_session( + username="admin@org_a.com", + startup_kit_location="./startup", + project="cancer-research", # new +) +# All subsequent operations scoped to this project +jobs = sess.list_jobs() # only cancer-research jobs +sess.submit_job("./my_job") # tagged to cancer-research +``` + +Admin console equivalent: + +``` +> set_project cancer-research +Project set to: cancer-research + +> list_jobs +... only shows cancer-research jobs ... +``` + +A user with roles in multiple projects can switch context: + +``` +> set_project multiple-sclerosis +Project set to: multiple-sclerosis +``` + +### Platform Administrator + +A new **platform admin** role (distinct from per-project `project_admin`) manages cross-project concerns: + +- Create/archive projects +- Assign clients to projects +- Assign project admins +- View system-wide health (without seeing job data) + +--- + +## Data Model Changes + +### Job Metadata + +`project` becomes a first-class, immutable field on every job. Set at submission time from the user's active project context. Cannot be changed after creation. + +### Job Store Partitioning + +New multitenant jobs are stored at `jobs///` (vs. current `jobs//`). No migration of existing jobs — they remain at `jobs//` and implicitly belong to the `default` project. 
+ +Physical partitioning enables: +- Filesystem-level isolation (different mount points per project in K8s) +- Simpler backup/restore per project +- Prevents cross-project data access via path traversal + +### Project Registry + +The server loads `project.yml` directly at startup for project/role lookup. No separate registry format or database needed. + +--- + +## Access Control Changes + +### Role Model + +Roles are **per-project**, not global. A user can be `lead` in one project and `member` in another. + +Today, the role is baked into the X.509 certificate (`UNSTRUCTURED_NAME` field). A single cert cannot encode multiple per-project roles. + +**Layered resolution (no breaking change):** +1. If `ProjectRegistry` exists AND user has a mapping for the active project → use registry role +2. Otherwise → fall back to cert-embedded role (existing behavior) + +The cert format is unchanged. Existing deployments with `api_version: 3` certs keep working. The cert role field is not removed or made vestigial in this version — it remains the primary source for single-tenant deployments. + +### Admin Role Hierarchy + +| Role | Scope | Capabilities | +|------|-------|-------------| +| `platform_admin` | Global | Create/delete projects, assign clients, system shutdown, view all sessions | +| `project_admin` | Per-project | All job ops within project, view project's clients (no client lifecycle control) | +| `org_admin` | Per-project | Manage own-org jobs, view own-org clients within project | +| `lead` | Per-project | Submit/manage own jobs, view own-org clients within project | +| `member` | Per-project | View-only within project | + +### Command Authorization Matrix + +Every command is scoped to the user's active project. Operations on resources outside the active project are denied. 
+ +#### Job Operations + +| Command | project_admin | org_admin | lead | member | +|---------|:---:|:---:|:---:|:---:| +| `submit_job` | yes | no | yes | no | +| `list_jobs` | all in project | all in project | all in project | all in project | +| `get_job_meta` | all in project | own-org jobs | own jobs | all in project | +| `download_job` | all in project | own-org jobs | own jobs | no | +| `download_job_components` | all in project | own-org jobs | own jobs | no | +| `clone_job` | all in project | no | own jobs | no | +| `abort_job` | all in project | own-org jobs | own jobs | no | +| `delete_job` | all in project | own-org jobs | own jobs | no | +| `show_stats` | all in project | all in project | all in project | all in project | +| `show_errors` | all in project | all in project | all in project | all in project | +| `app_command` | all in project | own-org jobs | own jobs | no | +| `configure_job_log` | all in project | own-org jobs | own jobs | no | + +**"all in project"** = any job within the active project. +**"own-org jobs"** = jobs submitted by a user in the same org, within the active project. +**"own jobs"** = jobs submitted by this user, within the active project. + +#### Infrastructure Operations + +Since clients are shared across projects, **only `platform_admin` can perform client lifecycle operations** (restart, shutdown, remove). Disrupting a client affects all projects running on it. 
+ +| Command | platform_admin | project_admin | org_admin | lead | member | +|---------|:---:|:---:|:---:|:---:|:---:| +| `check_status` | all clients | project's clients (view) | own-org + project (view) | own-org + project (view) | project's clients (view) | +| `restart` | all | no | no | no | no | +| `shutdown` | all | no | no | no | no | +| `shutdown_system` | yes | no | no | no | no | +| `remove_client` | all | no | no | no | no | +| `sys_info` | all | project's clients | own-org + project | own-org + project | no | +| `report_resources` | all | project's clients | own-org + project | own-org + project | no | +| `report_env` | all | project's clients | own-org + project | own-org + project | no | + +#### Shell Commands + +| Command | platform_admin | project_admin | org_admin | lead | member | +|---------|:---:|:---:|:---:|:---:|:---:| +| `pwd`, `ls`, `cat`, `head`, `tail`, `grep` | all | project's clients | own-org + project | own-org + project | no | + +Shell commands must be **restricted to the project's workspace path** on the target site. See Unresolved Questions. + +#### Session / Platform Commands + +| Command | platform_admin | project_admin | org_admin | lead | member | +|---------|:---:|:---:|:---:|:---:|:---:| +| `list_sessions` | all | project's sessions | no | no | no | +| `set_project` | any project | assigned projects | assigned projects | assigned projects | assigned projects | +| `list_projects` | all | assigned only | assigned only | assigned only | assigned only | +| `dead` | yes | no | no | no | no | + +--- + +## Authorization Enforcement + +Two layers, evaluated in order: + +1. **Project filter** (new): Is this resource in the user's active project? If no, invisible. +2. **RBAC policy** (existing): Does the user's project-role permit this operation on this resource? + +The existing `authorization.json` policy format is largely unchanged — project scoping happens above it. 
+ +--- + +## Provisioning Changes + +### project.yml + +The v4 schema uses three top-level sections with a deliberate separation of concerns: + +- **`sites`** — infrastructure participants (server, clients). Always present. Identity and trust are cert-based; these entries never go away. +- **`admins`** — human participants with per-platform and per-project roles. **Optional.** Omit entirely when using SSO (see [Future: SSO](#future-sso-for-human-users)); roles are then provided by IdP claims. +- **`projects`** — tenant definitions: which clients are enrolled, and (optionally) which admins have which roles. The `admins:` block inside each project is also omitted under SSO. + +This separation is intentional: `sites` and `projects.clients` form the **permanent skeleton** of the file. The `admins` sections are an **optional overlay** that exists today but disappears when SSO is introduced — with no restructuring of the rest of the file. + +```yaml +api_version: 4 + +# Infrastructure — always present, cert-based mTLS +sites: + server1.example.com: { type: server, org: nvidia } + hospital-a: { type: client, org: org_a } + hospital-b: { type: client, org: org_a } + hospital-c: { type: client, org: org_b } + +# Human admins — omit entirely when using SSO +admins: + platform-admin@nvidia.com: { org: nvidia, role: platform_admin } + trainer@org_a.com: { org: org_a } + viewer@org_b.com: { org: org_b } + +projects: + cancer-research: + clients: [hospital-a, hospital-b] + # Omit when using SSO (roles come from IdP claims) + admins: + trainer@org_a.com: lead + + multiple-sclerosis: + clients: [hospital-a, hospital-c] + admins: + trainer@org_a.com: member + viewer@org_b.com: lead +``` + +**SSO migration**: drop the top-level `admins:` block and the `admins:` entries inside each project. 
The rest of the file is unchanged: + +```yaml +api_version: 4 + +sites: + server1.example.com: { type: server, org: nvidia } + hospital-a: { type: client, org: org_a } + hospital-b: { type: client, org: org_a } + hospital-c: { type: client, org: org_b } + +projects: + cancer-research: + clients: [hospital-a, hospital-b] + multiple-sclerosis: + clients: [hospital-a, hospital-c] +``` + +### Certificate Changes + +Certs continue to encode identity (name, org) and role. **No change to cert format.** The `UNSTRUCTURED_NAME` role field remains populated and serves as the fallback for single-tenant mode. + +In multitenant mode (`api_version: 4`), per-project roles are resolved from the `ProjectRegistry` loaded from `project.yml` at server startup. The cert role is only used when no registry mapping exists (backward compat). + +### Startup Kit Changes + +- **Server startup kit** includes `project.yml` — the authoritative source for project definitions, client enrollment, and user roles +- **Admin startup kits** are unchanged (cert for identity; project membership is server-side knowledge) + +--- + +## Job Scheduler Changes + +The scheduler becomes project-aware: + +1. **Candidate filtering**: Only schedule jobs to clients enrolled in the job's project +2. **Validation**: `deploy_map` sites must be a subset of the project's enrolled clients +3. **Quota/priority**: Deferred. K8s-level resource quotas per namespace may suffice initially. Future option: route different projects to different K8s scheduling queues via pod labels/nodeSelectors. + +--- + +## Runtime Isolation (ProdEnv) + +The project becomes a property of the job, and ProdEnv prepares the corresponding isolated environment. 
+ +### Subprocess (Default — Single-Tenant Only) + +- Job workspace isolated to `///` (logical separation only) +- **No physical isolation**: same Linux user, shared `/tmp`, shared filesystem, shared GPU memory +- **Not suitable for multi-tenant deployments** — use K8s, Docker, or Slurm for cross-project isolation +- Retained for single-tenant and trusted environments (e.g., single org, development, POC) + +### Docker + +- Per-project volume mounts: each project's jobs mount a **different host directory** (e.g., `/data//`) as the workspace +- Per-container `/tmp`: each container gets its own tmpfs or bind mount — no shared host `/tmp` +- Per-project Docker network (no cross-project container communication) +- Container name includes project: `--` + +### Kubernetes (Primary Target) + +Clients participate in all their enrolled projects. **Data isolation is achieved by mounting different PersistentVolumes per project in each job pod.** The Flare client parent process runs in its own pod (or on the node) and does not mount project data PVs — it only orchestrates job pod creation. + +| Concern | Mechanism | +|---------|-----------| +| Namespace isolation | One K8s namespace per project | +| Storage isolation | PersistentVolumeClaim per project per client (not hostPath) | +| Temp directory isolation | Each pod gets its own `/tmp` via `emptyDir` — no shared host `/tmp` | +| Network isolation | NetworkPolicy per namespace | +| Resource limits | ResourceQuota per namespace (deferred, see Scheduler) | +| Pod security | PodSecurityPolicy/Standards per namespace | + +Job pods are created in the project's K8s namespace, mounting a pre-provisioned PVC (`-workspace`) per project per client. Each pod also gets its own `/tmp` via `emptyDir` to prevent cross-project leakage via temporary files. This applies to both server and client job pods. 
+ +### Slurm + +- Per-project Slurm accounts/partitions +- Per-project storage paths +- Job submission includes `--account=` + +--- + +## FLARE API Changes + +- `Session` gains a `project` parameter (defaults to `"default"`) and `set_project()`/`list_projects()` methods +- `list_jobs` is automatically filtered to the active project (replaces the `-u` user-only filter) +- `get_system_info` returns only clients enrolled in the active project +- All job operations validate that the target job belongs to the active project + +--- + +## Audit Trail + +Every audit log entry gains a `project` field: + +``` +[2026-02-18 10:30:00] user=trainer@org_a.com project=cancer-research action=submit_job job_id=abc123 +[2026-02-18 10:31:00] user=trainer@org_a.com project=cancer-research action=list_jobs +``` + +Audit logs should be queryable per project for compliance. + +--- + +## Migration / Backward Compatibility + +1. **Feature gate**: all multitenancy behavior gated on `project.yml` having `api_version: 4` with a `projects:` section. Without it, the system behaves identically to today. +2. **Default project**: all existing jobs, clients, and users are in the `default` project +3. **Cert role fallback**: if no project registry exists (or user has no registry mapping), fall back to cert-embedded role +4. **API compatibility**: `project` parameter defaults to `"default"` everywhere +5. **Config version**: `api_version: 4` in `project.yml` signals multi-project support; version 3 continues to work as single-tenant + +--- + +## Design Decisions + +| # | Question | Decision | +|---|----------|----------| +| D1 | Can clients participate in multiple projects? | **Yes.** Clients participate in all enrolled projects simultaneously. Data isolation is physical: K8s mounts different PVs per project; Docker mounts different host directories. The Flare parent process does not access project data. | +| D2 | Project lifecycle management? 
| **Deferred.** Projects are defined at provisioning time in `project.yml`. Runtime project CRUD is not in scope for v1. | +| D3 | Per-project quota management? | **Deferred.** Rely on K8s ResourceQuota per namespace for now. Future: route projects to different K8s scheduling queues via pod labels. | +| D4 | `check_status` information leakage? | **Server has global knowledge, filtering the response is sufficient.** The server parent process knows about all clients and jobs; it filters responses to only include resources in the user's active project. No architectural change needed. | +| D5 | Server-side job store isolation? | **Server job pods must only access their project's data.** The server job process (running in K8s/Docker) must not mount the entire job store — only the project-partitioned slice. Current `FilesystemStorage` will be replaced by a database or object store in the future, which will enforce project-scoped access natively. For v1 with filesystem: mount only `jobs//` into the server job pod. No migration of existing jobs — they stay at `jobs//` and belong to `default`. | +| D6 | Role storage: certs vs. server-side registry? | **Layered: registry overrides cert.** `project.yml` defines per-project roles; the server loads it at startup via `ProjectRegistry`. Certs continue to authenticate identity (name, org) and carry a role as fallback. No cert format change required. | +| D7 | How do shared clients know which project PV to mount? | **The launcher passes the project name to the client.** Job metadata carries the project; the server includes it when dispatching to clients. The client-side `K8sJobLauncher`/`DockerJobLauncher` uses the project name to select the correct PV/volume mount. | +| D8 | Cross-project isolation in subprocess mode? | **Subprocess mode is single-tenant/trusted only.** Only K8s, Docker, and Slurm launchers provide secure multi-tenant isolation (separate namespaces, volumes, `/tmp`). 
The default subprocess launcher offers no physical isolation and is only suitable for single-tenant or trusted environments. | + +--- + +## Unresolved Questions + +1. **Cross-project visibility**: Can a platform admin see job metadata across all projects (for debugging)? Should `list_jobs` have a `--all-projects` flag for platform admins? + +2. **Existing `scope` concept**: The `scope` concept will be deprecated in favor of `project`. The `project` boundary subsumes data-governance scoping; existing `scope` usage will be migrated to `project`. + +3. **External IdP integration**: SSO is a follow-on (see Future: SSO section), but should the `ProjectRegistry` interface be designed now to accommodate an IdP backend later? What claims/attributes should the IdP provide (project membership, role, org)? + +4. **Shell commands (pwd, ls, cat, head, tail, grep)**: These allow direct filesystem access on server/client sites. In a multi-tenant environment: + - How do we restrict file access to the active project's workspace? Current implementation does basic path validation (no `..`, no absolute paths) but has no project awareness. + - In K8s, project data lives on per-project PVs that are only mounted into job pods — the client parent process does not have them mounted. Shell commands executed on the parent process have **no access** to project data at all. + - Options: (a) disable shell commands in multi-tenant mode, (b) replace with a project-scoped log/artifact download API that retrieves data from the job store, (c) route shell commands to a running job pod (requires the job to be active), (d) launch an ephemeral "debug pod" in the project's namespace with the project PV mounted. + - The current `cat log.txt` pattern assumes a single workspace. With per-project workspaces, the working directory concept needs redefinition. + - **This is a significant UX change** — today admins rely heavily on shell commands for debugging. Need a clear alternative. + +5. 
**Provisioning at scale**: With N projects and M users, the current "one provision run per project" model means M*N startup kits in the worst case. Is a shared-CA model with a single startup kit per user viable? + +--- + +## Future: SSO for Human Users + +The current design separates two kinds of participants that today are both managed via X.509 certs: + +- **Sites** (server, clients, relays) — infrastructure with stable identity, long-lived +- **Humans** (admins) — users who change roles, join/leave projects, need MFA + +In a future version, human authentication moves to a standard SSO system (OIDC/SAML) with short-lived tokens, while sites continue using mutual TLS with provisioned certs. + +| | Sites (v1 and future) | Humans (v1) | Humans (future) | +|---|---|---|---| +| **Authentication** | mTLS certs | mTLS certs | SSO (OIDC/SAML) tokens | +| **Identity source** | Cert CN + org | Cert CN + org | IdP claims | +| **Role source** | N/A | `project.yml` registry (cert fallback) | IdP claims or `project.yml` | +| **Lifecycle** | Provisioned, long-lived | Provisioned, long-lived | IdP-managed, dynamic | +| **Startup kit** | Yes (certs, config) | Yes (certs, config) | No — just a login URL | + +**Why this matters for v1 design decisions:** + +The server-side `ProjectRegistry` (loaded from `project.yml`) is the right abstraction because it decouples role resolution from the cert. Today the registry overrides the cert role; in the future, the registry (or IdP) replaces the cert entirely for humans. The same `ProjectRegistry` interface can be backed by `project.yml` now and by an IdP adapter later. + +This also means per-project startup kits for humans (alternative approach considered) would be a dead end — SSO eliminates admin certs entirely, so building around per-project certs for humans would be throwaway work. + +The v4 `project.yml` schema is designed with this migration in mind: the `admins:` section (top-level and per-project) is explicitly optional. 
A deployment using SSO simply omits it; the `sites:` and `projects:` skeleton is identical in both modes. No schema version bump or file restructuring is needed when migrating to SSO. + +--- + +## Implementation + +See [multiproject_implementation.md](multiproject_implementation.md) for the full implementation plan. + +--- + +## Phase 1: Minimal Project Plumbing + +Phase 1 delivers no access control, no job store partitioning, and no cert/registry changes. The sole goal is to thread the `project` name from user-facing APIs into the runtime launchers so K8s and Docker can mount the correct volume/directory. + +### Scope + +1. Add `project: str = "default"` parameter to `ProdEnv` and `PocEnv`. +2. Pass `project` through to the job metadata at submission time. +3. `K8sJobLauncher` reads `project` from job metadata and selects the corresponding PVC (`-workspace`). +4. `DockerJobLauncher` reads `project` from job metadata and mounts `/data//` as the workspace volume. +5. No changes to authorization, job store paths, `project.yml`, scheduler, or any other component. + +### What this enables + +- Data scientists can tag jobs with a project and get physical data isolation on K8s/Docker immediately. +- Lays the plumbing for all subsequent phases without requiring a full multitenancy deployment. + +### What this does NOT do + +- No access control — any user can submit to any project name. +- No job store partitioning (`jobs//` path unchanged). +- No `project.yml` parsing or `ProjectRegistry`. +- No `set_project` / `list_projects` admin commands. +- Subprocess launcher unchanged (single-tenant/trusted only). diff --git a/docs/design/multiproject_implementation.md b/docs/design/multiproject_implementation.md new file mode 100644 index 0000000000..03f1768511 --- /dev/null +++ b/docs/design/multiproject_implementation.md @@ -0,0 +1,499 @@ +# Multi-Project Implementation Plan + +Companion to [multiproject.md](multiproject.md).
This document specifies *how* to implement the design with minimal risk. + +## Guiding Principles + +1. **Feature-gated** — all multitenancy behavior gated on `project.yml` having a `projects:` section (`api_version: 4`). Single-tenant deployments unchanged. +2. **Additive, not migratory** — no job store path migration, no role renames. New code paths only. +3. **Layered role resolution** — registry overrides cert, cert remains fallback. No breaking change to cert format. +4. **Incremental delivery** — three shippable milestones, each independently testable. + +--- + +## Codebase Map + +Key files and their roles (discovered via exploration): + +| Component | File | Key Classes/Functions | +|-----------|------|----------------------| +| Job metadata | `nvflare/apis/job_def.py:48-82` | `JobMetaKey` enum | +| Job store | `nvflare/app_common/storages/filesystem_storage.py` | `FilesystemStorage` | +| Job manager | `nvflare/apis/impl/job_def_manager.py` | `SimpleJobDefManager` | +| Job submission | `nvflare/private/fed/server/job_cmds.py:564-618` | `submit_job()` handler | +| Job listing | `nvflare/private/fed/server/job_cmds.py:316-367` | `list_jobs()` handler | +| Job scheduling | `nvflare/app_common/job_schedulers/job_scheduler.py:101-230` | `DefaultJobScheduler._try_job()` | +| Authz policy | `nvflare/fuel/sec/authz.py` | `AuthorizationService`, `Policy`, `Authorizer` | +| Authz filter | `nvflare/fuel/hci/server/authz.py:44-94` | `AuthzFilter.pre_command()` | +| Login/session | `nvflare/fuel/hci/server/login.py:69-119` | `handle_cert_login()` | +| Server session | `nvflare/fuel/hci/server/sess.py:33-86` | `Session` (user_name, user_org, user_role) | +| Conn properties | `nvflare/fuel/hci/server/constants.py:16-45` | `ConnProps` | +| Role from cert | `nvflare/fuel/hci/security.py:74-98` | `get_identity_info()` → `UNSTRUCTURED_NAME` | +| Admin server | `nvflare/private/fed/server/admin.py:95-174` | `FedAdminServer` (filter chain setup) | +| Cmd authz utils | 
`nvflare/private/fed/server/cmd_utils.py:41-148` | `authorize_job()`, `must_be_project_admin()` | +| FLARE API Session | `nvflare/fuel/flare_api/flare_api.py:65-108` | `Session` (client-side) | +| `new_secure_session` | `nvflare/fuel/flare_api/flare_api.py:944-956` | Session factory | +| ProdEnv | `nvflare/recipe/prod_env.py:45-108` | `ProdEnv` (recipe execution env) | +| SessionManager | `nvflare/recipe/session_mgr.py:40-106` | `SessionManager` | +| Provisioner | `nvflare/lighter/provision.py:132-170` | `prepare_project()`, loads `project.yml` | +| Project entity | `nvflare/lighter/entity.py:370-573` | `Project` class | +| Cert generation | `nvflare/lighter/impl/cert.py:296-347` | `get_pri_key_cert()` | +| x509 role field | `nvflare/lighter/utils.py:129-135` | `x509_name()` → `UNSTRUCTURED_NAME` | +| Admin roles | `nvflare/lighter/constants.py:109-116` | `AdminRole`, `DEFINED_ROLES` | +| Audit | `nvflare/fuel/sec/audit.py:90-125` | `AuditService` singleton | +| Audit filter | `nvflare/fuel/hci/server/audit.py:23-46` | `CommandAudit.pre_command()` | +| Security init | `nvflare/private/fed/utils/fed_utils.py:98-147` | `security_init()` | +| FLAuthorizer | `nvflare/security/security.py:21-74` | `FLAuthorizer`, `COMMAND_CATEGORIES` | + +--- + +## Risk Mitigations + +### Problem: Job store path migration +**Mitigation**: no migration. New multitenant jobs go to `jobs///`. Existing jobs stay at `jobs//` and implicitly belong to `default`. `SimpleJobDefManager` checks both paths when listing `default` project jobs. + +### Problem: `project_admin` role rename +**Mitigation**: no rename. `project_admin` already fits as the per-project admin concept. Add `platform_admin` as a new global role. Existing `must_be_project_admin()` checks become "is user `project_admin` in any project OR `platform_admin`" — backward compatible. + +### Problem: Cert role becomes vestigial +**Mitigation**: layered resolution. If `ProjectRegistry` has a mapping for this user+project, use it. 
Otherwise fall back to cert role. Cert format unchanged; no re-provisioning required for existing deployments. + +### Problem: Cross-cutting feature requires all-or-nothing +**Mitigation**: three milestones. Milestone 1 adds plumbing with zero behavior change. Milestone 2 adds the registry (gated). Milestone 3 enforces scoping. + +--- + +## Milestone 1: Project-Aware Plumbing + +**Goal**: thread `project` through the stack, always `"default"`. All existing tests pass, zero behavior change. + +### 1.1 Add `JobMetaKey.PROJECT` + +**File**: `nvflare/apis/job_def.py` + +```python +class JobMetaKey(str, Enum): + ... + PROJECT = "project" +``` + +### 1.2 Add `ConnProps.PROJECT` + +**File**: `nvflare/fuel/hci/server/constants.py` + +```python +class ConnProps(object): + ... + PROJECT = "_project" +``` + +### 1.3 Stamp project on job submission + +**File**: `nvflare/private/fed/server/job_cmds.py` + +In `submit_job()` (~line 604), after setting submitter info: +```python +meta[JobMetaKey.PROJECT.value] = conn.get_prop(ConnProps.PROJECT, "default") +``` + +Same for `clone_job()` (~line 504). + +### 1.4 Add `project` to server-side `Session` + +**File**: `nvflare/fuel/hci/server/sess.py` + +Add `project="default"` to `Session.__init__()`. Include `"p"` key in token encoding. Set `ConnProps.PROJECT` from session in `LoginModule.pre_command()`. + +### 1.5 Add `project` to client-side `Session` + +**File**: `nvflare/fuel/flare_api/flare_api.py` + +- Add `project="default"` param to `Session.__init__()` and `new_secure_session()` +- Store as `self._project`, pass to server on connect + +### 1.6 Add `project` to `ProdEnv` + +**File**: `nvflare/recipe/prod_env.py` + +Add `project="default"` param, pass through to `SessionManager`. + +### 1.7 Add `project` to audit events + +**File**: `nvflare/fuel/sec/audit.py` + +Add `project` param to `add_event()`. Emit `[P:project]` in log line. 
+ +**File**: `nvflare/fuel/hci/server/audit.py` + +Pass `conn.get_prop(ConnProps.PROJECT, "default")` to `add_event()`. + +### 1.8 Filter `list_jobs` by project + +**File**: `nvflare/private/fed/server/job_cmds.py` + +Add project predicate to `_job_match()`: +```python +and ((not project) or job_meta.get("project", "default") == project) +``` + +Extract shared helper `_is_job_in_project(job_meta, project)` for reuse across all job command handlers. + +### Milestone 1 Summary + +| File | Change | +|------|--------| +| `nvflare/apis/job_def.py` | +1 enum value | +| `nvflare/fuel/hci/server/constants.py` | +1 constant | +| `nvflare/private/fed/server/job_cmds.py` | stamp project on submit/clone, filter list_jobs | +| `nvflare/fuel/hci/server/sess.py` | project field on Session | +| `nvflare/fuel/hci/server/login.py` | set ConnProps.PROJECT from session | +| `nvflare/fuel/flare_api/flare_api.py` | project param on Session + factory | +| `nvflare/recipe/prod_env.py` | project param | +| `nvflare/recipe/session_mgr.py` | pass project through | +| `nvflare/fuel/sec/audit.py` | project field in events | +| `nvflare/fuel/hci/server/audit.py` | pass project from conn | + +**~10 files, ~150 lines. Zero behavior change.** + +--- + +## Milestone 2: Project Registry + Role Resolution + +**Goal**: add `ProjectRegistry`, per-project role resolution, `platform_admin` role, `set_project`/`list_projects` commands. Gated on `api_version: 4`. + +### 2.1 Create `ProjectRegistry` + +**New file**: `nvflare/security/project_registry.py` (~150 lines) + +```python +class ProjectRegistry: + """Resolves project membership, client enrollment, and per-project roles. + + Loaded from project.yml at server startup. When absent or api_version < 4, + operates in single-tenant mode (all users/clients in 'default' project, + roles from certs). 
+ """ + + def __init__(self): + self._projects = {} # name -> {clients: set, users: dict} + self._multitenant = False + + def load_from_config(self, project_dict: dict): + """Load from parsed project.yml. Detects api_version >= 4.""" + ... + + def is_multitenant(self) -> bool: + """True if projects section exists (api_version 4+).""" + return self._multitenant + + def get_projects(self) -> List[str]: + """All project names.""" + ... + + def get_project_clients(self, project: str) -> Set[str]: + """Client names enrolled in project.""" + ... + + def get_user_projects(self, username: str) -> List[str]: + """Projects this user belongs to.""" + ... + + def get_user_role(self, username: str, project: str) -> Optional[str]: + """User's role in project, or None if not a member.""" + ... + + def is_platform_admin(self, username: str) -> bool: + """True if user has global platform_admin role.""" + ... + + def is_user_in_project(self, username: str, project: str) -> bool: + ... + + def is_client_in_project(self, client_name: str, project: str) -> bool: + ... +``` + +Singleton access via `ProjectRegistryService` (follows existing `AuthorizationService` pattern). 
+ +### 2.2 Add `platform_admin` role + +**File**: `nvflare/lighter/constants.py` + +```python +class AdminRole: + PLATFORM_ADMIN = "platform_admin" # new + PROJECT_ADMIN = "project_admin" + ORG_ADMIN = "org_admin" + LEAD = "lead" + MEMBER = "member" + +DEFINED_ROLES = [ + AdminRole.PLATFORM_ADMIN, + AdminRole.PROJECT_ADMIN, + AdminRole.ORG_ADMIN, + AdminRole.LEAD, + AdminRole.MEMBER, +] +``` + +### 2.3 Load registry at server startup + +**File**: `nvflare/private/fed/utils/fed_utils.py` in `security_init()` + +After loading authorization policy, load `project.yml` into `ProjectRegistryService`: +```python +project_config = load_yaml(workspace.get_project_config_path()) +ProjectRegistryService.initialize(project_config) +``` + +### 2.4 Layered role resolution in login + +**File**: `nvflare/fuel/hci/server/login.py` + +In `handle_cert_login()`, after extracting identity from cert: +```python +# Existing: role from cert +role = identity_info.get(IdentityKey.ROLE, "") + +# New: override with registry role if multitenant +registry = ProjectRegistryService.get_registry() +if registry and registry.is_multitenant(): + project = ... # from login request or default + registry_role = registry.get_user_role(username, project) + if registry_role: + role = registry_role +``` + +### 2.5 Project filter in authz chain + +**File**: `nvflare/fuel/hci/server/authz.py` + +New `ProjectFilter(CommandFilter)` registered in filter chain between `LoginModule` and `AuthzFilter`: + +```python +class ProjectFilter(CommandFilter): + """Validates user has access to their active project. + + Registered AFTER LoginModule (needs identity) and BEFORE AuthzFilter + (sets project-scoped role for downstream authz). 
+ """ + def pre_command(self, conn, args): + registry = ProjectRegistryService.get_registry() + if not registry or not registry.is_multitenant(): + return True # single-tenant, no filtering + + project = conn.get_prop(ConnProps.PROJECT, "default") + username = conn.get_prop(ConnProps.USER_NAME, "") + + if registry.is_platform_admin(username): + return True # platform admin bypasses project check + + if not registry.is_user_in_project(username, project): + conn.append_error("Access denied: not a member of this project") + return False + + # Override role to project-specific role + role = registry.get_user_role(username, project) + conn.set_prop(ConnProps.USER_ROLE, role) + return True +``` + +**File**: `nvflare/private/fed/server/admin.py` + +Register `ProjectFilter` after `LoginModule`, before `AuthzFilter`: +```python +login_module = LoginModule(sess_mgr) +cmd_reg.add_filter(login_module) + +project_filter = ProjectFilter() # new +cmd_reg.add_filter(project_filter) # new + +authz_filter = AuthzFilter() +cmd_reg.add_filter(authz_filter) +``` + +### 2.6 `set_project` and `list_projects` commands + +**New command handlers** in `nvflare/private/fed/server/job_cmds.py` (or a new `ProjectCommandModule`): + +- `set_project `: validates user membership, updates session's project, re-resolves role +- `list_projects`: returns projects the user belongs to (or all, for `platform_admin`) + +**Client-side**: `Session.set_project()` and `Session.list_projects()` in `nvflare/fuel/flare_api/flare_api.py`. + +### 2.7 Provisioner: parse `api_version: 4` + +**File**: `nvflare/lighter/provision.py` + +Extend `prepare_project()` to accept `api_version` 3 or 4. When 4: +- Parse `projects:` section +- Parse per-admin `projects:` mapping +- Validate referenced clients exist in participants + +**File**: `nvflare/lighter/entity.py` + +Add `projects` property to `Project` class. Add `project_roles` dict to admin `Participant`. 
+ +### 2.8 Server startup kit includes `project.yml` + +**File**: `nvflare/lighter/impl/static_file.py` + +Copy `project.yml` into server startup kit directory. + +### Milestone 2 Summary + +| File | Change | +|------|--------| +| `nvflare/security/project_registry.py` | **new** — ProjectRegistry + service | +| `nvflare/lighter/constants.py` | +PLATFORM_ADMIN role | +| `nvflare/private/fed/utils/fed_utils.py` | load registry at startup | +| `nvflare/fuel/hci/server/login.py` | layered role resolution | +| `nvflare/fuel/hci/server/authz.py` | +ProjectFilter class | +| `nvflare/private/fed/server/admin.py` | register ProjectFilter | +| `nvflare/private/fed/server/job_cmds.py` | set_project, list_projects handlers | +| `nvflare/fuel/flare_api/flare_api.py` | set_project, list_projects client methods | +| `nvflare/lighter/provision.py` | api_version 4 parsing | +| `nvflare/lighter/entity.py` | projects on Project/Participant | +| `nvflare/lighter/impl/static_file.py` | include project.yml in server kit | + +**~11 files (1 new), ~500 lines. Gated on api_version 4.** + +--- + +## Milestone 3: Enforcement + Scheduler + +**Goal**: all commands scoped to active project. Scheduler validates project client enrollment. + +### 3.1 All job commands: project gate + +**File**: `nvflare/private/fed/server/job_cmds.py` + +For every job-specific handler (`abort_job`, `delete_job`, `download_job`, `clone_job`, `app_command`, `configure_job_log`), add project validation in `authorize_job_id()`: + +```python +def authorize_job_id(self, conn, args): + ... 
# existing: load job, set submitter props + + # New: verify job belongs to active project + job_project = job.meta.get(JobMetaKey.PROJECT.value, "default") + active_project = conn.get_prop(ConnProps.PROJECT, "default") + if job_project != active_project: + conn.append_error("Job not found in current project") + return PreAuthzReturnCode.ERROR + + return PreAuthzReturnCode.REQUIRE_AUTHZ +``` + +This is a single change point since all job commands route through `authorize_job_id()`. + +### 3.2 Infrastructure commands: filter to project clients + +**File**: `nvflare/private/fed/server/training_cmds.py` (and `cmd_utils.py`) + +In `validate_command_targets()`, filter target list to clients enrolled in the active project: + +```python +registry = ProjectRegistryService.get_registry() +if registry and registry.is_multitenant(): + project = conn.get_prop(ConnProps.PROJECT, "default") + for target in targets: + if not registry.is_client_in_project(target, project): + conn.append_error(f"Client '{target}' not in project '{project}'") + return PreAuthzReturnCode.ERROR +``` + +### 3.3 `check_status`: filter response + +**File**: `nvflare/private/fed/server/training_cmds.py` + +Filter the client list in `check_status` response to only include clients enrolled in the user's active project. 
+
+### 3.4 Scheduler: validate deploy_map against project
+
+**File**: `nvflare/app_common/job_schedulers/job_scheduler.py`
+
+In `_try_job()`, after extracting applicable sites (~line 126):
+
+```python
+registry = ProjectRegistryService.get_registry()
+if registry and registry.is_multitenant():
+    project = job_meta.get(JobMetaKey.PROJECT.value, "default")
+    project_clients = registry.get_project_clients(project)
+    for site in applicable_sites:
+        if site != SERVER_SITE_NAME and site not in project_clients:
+            return (SCHEDULE_RESULT_BLOCK, None, f"Site {site} not in project {project}")
+```
+
+### 3.5 Job store partitioning (new jobs only)
+
+**File**: `nvflare/apis/impl/job_def_manager.py`
+
+Change `job_uri()` to include project for non-default projects:
+
+```python
+def job_uri(self, jid, project=None):
+    if project and project != "default":
+        return os.path.join(self._uri_root, project, jid)
+    return os.path.join(self._uri_root, jid)  # backward compat
+```
+
+`get_all_jobs()` scans both `<uri_root>/<job_id>/` and `<uri_root>/<project>/<job_id>/` paths.
+ +### 3.6 Tests + +| Test | File | ~Lines | +|------|------|--------| +| ProjectRegistry unit tests | `tests/unit_test/security/project_registry_test.py` | ~200 | +| Project-scoped job filtering | `tests/unit_test/private/fed/server/job_cmds_project_test.py` | ~150 | +| Per-project role resolution | `tests/unit_test/fuel/hci/server/project_filter_test.py` | ~100 | +| Provisioner v4 parsing | `tests/unit_test/lighter/provision_v4_test.py` | ~100 | +| Scheduler project validation | `tests/unit_test/app_common/job_schedulers/scheduler_project_test.py` | ~80 | + +### Milestone 3 Summary + +| File | Change | +|------|--------| +| `nvflare/private/fed/server/job_cmds.py` | project gate in authorize_job_id | +| `nvflare/private/fed/server/training_cmds.py` | filter infra commands to project clients | +| `nvflare/private/fed/server/cmd_utils.py` | project-aware target validation | +| `nvflare/app_common/job_schedulers/job_scheduler.py` | deploy_map vs project clients | +| `nvflare/apis/impl/job_def_manager.py` | partitioned job_uri for new jobs | +| `tests/unit_test/...` (5 new files) | ~630 lines of tests | + +**~5 files + 5 test files, ~550 lines.** + +--- + +## Total Estimates + +| Milestone | Files Changed | Files Created | Lines | +|-----------|:---:|:---:|:---:| +| M1: Plumbing | 10 | 0 | ~150 | +| M2: Registry + Authz | 10 | 1 | ~500 | +| M3: Enforcement + Tests | 5 | 5 | ~550 | +| **Total** | **~22** | **~6** | **~1,200** | + +--- + +## Out of Scope (This Plan) + +- K8s `K8sJobLauncher` changes (namespace per project, PVC selection) +- Docker `DockerJobLauncher` changes (per-project volume mounts) +- Slurm launcher changes +- Runtime project CRUD (D2 — projects defined at provision time only) +- Per-project quota management (D3 — rely on K8s ResourceQuota) +- External IdP integration (OIDC/SAML) +- Singleton refactoring (document as constraint, defer refactor) +- Shell command restrictions (see Unresolved Questions in design doc) + +--- + +## Unresolved Questions 
+ +1. **`set_project` protocol**: does `set_project` require a new server round-trip, or can the client just switch locally and send the new project on the next command? Server round-trip is safer (validates membership) but adds latency. + +2. **Token encoding**: should the project be part of the session token, or sent as a command header? Token means re-auth on project switch; header is simpler but requires server-side validation on every command. + +3. **`list_jobs --all-projects`**: should `platform_admin` have a flag to see all projects' jobs? Useful for debugging but increases blast radius. + +4. **Provisioner backward compat**: when `api_version: 4` project.yml is used, should the provisioner still bake a role into the cert? Options: (a) bake the first project's role as a fallback, (b) leave `UNSTRUCTURED_NAME` empty, (c) bake a sentinel value like `"multitenant"`. From a5dabe6f56f553676c2248ae9b51cd83ded96040 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Wed, 4 Mar 2026 12:50:19 -0800 Subject: [PATCH 02/11] docs: refine multiproject naming and rollout guidance --- docs/design/multiproject.md | 105 +++++++++++---------- docs/design/multiproject_implementation.md | 69 ++++++++------ 2 files changed, 95 insertions(+), 79 deletions(-) diff --git a/docs/design/multiproject.md b/docs/design/multiproject.md index d7de82bd93..8bc9407d5c 100644 --- a/docs/design/multiproject.md +++ b/docs/design/multiproject.md @@ -12,7 +12,7 @@ To achieve genuine multi-tenancy, we introduce a **project** concept as the prim 2. **Defense in depth** — logical access control (authz) + physical isolation (containers/PVs) 3. **Backward compatible** — a `default` project preserves current single-tenant behavior 4. **`scope` deprecated** — the existing `scope` data-governance concept is superseded by `project`; `scope` will be removed in a future release -5. 
**Feature-gated** — all multitenancy behavior gated on `api_version: 4` in `project.yml`; single-tenant deployments see zero behavior change +5. **Phased rollout** — Phase 1 project plumbing is available without `api_version: 4`; full multitenancy enforcement is gated on `api_version: 4` in `project.yml` --- @@ -24,7 +24,7 @@ A project is a named, immutable tenant boundary with these properties: | Property | Description | |----------|-------------| | `name` | Unique identifier (e.g., `cancer-research`) | -| `clients` | Set of FL client sites enrolled in this project | +| `sites` | Set of FL sites enrolled in this project (must reference client-type site entries) | | `users` | Set of admin users with per-project roles | | `authorization` | Per-project authorization policy | @@ -67,7 +67,7 @@ env = PocEnv( run = recipe.execute(env) ``` -If `project` is omitted in either env, the `default` project is used. +If `project` is omitted in either env, it remains `None` (no API default change). ### Admin (FLARE API / Admin Console) @@ -80,7 +80,7 @@ sess = new_secure_session( project="cancer-research", # new ) # All subsequent operations scoped to this project -jobs = sess.list_jobs() # only cancer-research jobs +jobs = sess.list_jobs() # only caller-visible jobs in cancer-research sess.submit_job("./my_job") # tagged to cancer-research ``` @@ -91,7 +91,7 @@ Admin console equivalent: Project set to: cancer-research > list_jobs -... only shows cancer-research jobs ... +... only shows caller-visible jobs in cancer-research ... ``` A user with roles in multiple projects can switch context: @@ -105,11 +105,12 @@ Project set to: multiple-sclerosis A new **platform admin** role (distinct from per-project `project_admin`) manages cross-project concerns: -- Create/archive projects - Assign clients to projects - Assign project admins - View system-wide health (without seeing job data) +Project create/archive is deferred for v1 (projects are provisioning-time config in `project.yml`). 
+ --- ## Data Model Changes @@ -122,6 +123,8 @@ A new **platform admin** role (distinct from per-project `project_admin`) manage New multitenant jobs are stored at `jobs///` (vs. current `jobs//`). No migration of existing jobs — they remain at `jobs//` and implicitly belong to the `default` project. +Legacy `default` jobs continue to be served by the main server process for compatibility. New server job pods mount only the project-partitioned slice needed for the active job. + Physical partitioning enables: - Filesystem-level isolation (different mount points per project in K8s) - Simpler backup/restore per project @@ -143,15 +146,16 @@ Today, the role is baked into the X.509 certificate (`UNSTRUCTURED_NAME` field). **Layered resolution (no breaking change):** 1. If `ProjectRegistry` exists AND user has a mapping for the active project → use registry role -2. Otherwise → fall back to cert-embedded role (existing behavior) +2. Else if active project is `default` → fall back to cert-embedded role (legacy compatibility) +3. Otherwise → deny (`user not assigned to active project`) -The cert format is unchanged. Existing deployments with `api_version: 3` certs keep working. The cert role field is not removed or made vestigial in this version — it remains the primary source for single-tenant deployments. +The cert format is unchanged. Existing deployments with `api_version: 3` certs keep working. The cert role field is not removed or made vestigial in this version — it remains the primary source for single-tenant deployments and fallback for the `default` project. 
### Admin Role Hierarchy | Role | Scope | Capabilities | |------|-------|-------------| -| `platform_admin` | Global | Create/delete projects, assign clients, system shutdown, view all sessions | +| `platform_admin` | Global | Assign clients/admins to provisioned projects, system shutdown, view all sessions | | `project_admin` | Per-project | All job ops within project, view project's clients (no client lifecycle control) | | `org_admin` | Per-project | Manage own-org jobs, view own-org clients within project | | `lead` | Per-project | Submit/manage own jobs, view own-org clients within project | @@ -161,12 +165,17 @@ The cert format is unchanged. Existing deployments with `api_version: 3` certs k Every command is scoped to the user's active project. Operations on resources outside the active project are denied. +If the same human has multiple roles (for example `platform_admin` globally and `project_admin` in some projects), no explicit role-switch is required: +- Project-scoped job commands are authorized by the user's role in the active project +- Platform/global commands are authorized by `platform_admin` +- `platform_admin` alone does not imply project job-data permissions + #### Job Operations | Command | project_admin | org_admin | lead | member | |---------|:---:|:---:|:---:|:---:| | `submit_job` | yes | no | yes | no | -| `list_jobs` | all in project | all in project | all in project | all in project | +| `list_jobs` | all in project | own-org jobs | own jobs | all in project | | `get_job_meta` | all in project | own-org jobs | own jobs | all in project | | `download_job` | all in project | own-org jobs | own jobs | no | | `download_job_components` | all in project | own-org jobs | own jobs | no | @@ -203,7 +212,7 @@ Since clients are shared across projects, **only `platform_admin` can perform cl |---------|:---:|:---:|:---:|:---:|:---:| | `pwd`, `ls`, `cat`, `head`, `tail`, `grep` | all | project's clients | own-org + project | own-org + project | no | 
-Shell commands must be **restricted to the project's workspace path** on the target site. See Unresolved Questions. +Shell command behavior needs deeper design discussion because parent-process and job-pod filesystems can diverge (including standard K8s setups). See Unresolved Questions. #### Session / Platform Commands @@ -235,9 +244,9 @@ The v4 schema uses three top-level sections with a deliberate separation of conc - **`sites`** — infrastructure participants (server, clients). Always present. Identity and trust are cert-based; these entries never go away. - **`admins`** — human participants with per-platform and per-project roles. **Optional.** Omit entirely when using SSO (see [Future: SSO](#future-sso-for-human-users)); roles are then provided by IdP claims. -- **`projects`** — tenant definitions: which clients are enrolled, and (optionally) which admins have which roles. The `admins:` block inside each project is also omitted under SSO. +- **`projects`** — tenant definitions: which sites are enrolled (client-type entries), and (optionally) which admins have which roles. The `admins:` block inside each project is also omitted under SSO. -This separation is intentional: `sites` and `projects.clients` form the **permanent skeleton** of the file. The `admins` sections are an **optional overlay** that exists today but disappears when SSO is introduced — with no restructuring of the rest of the file. +This separation is intentional: `sites` and `projects.sites` form the **permanent skeleton** of the file. The `admins` sections are an **optional overlay** that exists today but disappears when SSO is introduced — with no restructuring of the rest of the file. 
```yaml api_version: 4 @@ -257,13 +266,13 @@ admins: projects: cancer-research: - clients: [hospital-a, hospital-b] + sites: [hospital-a, hospital-b] # Omit when using SSO (roles come from IdP claims) admins: trainer@org_a.com: lead multiple-sclerosis: - clients: [hospital-a, hospital-c] + sites: [hospital-a, hospital-c] admins: trainer@org_a.com: member viewer@org_b.com: lead @@ -282,9 +291,9 @@ sites: projects: cancer-research: - clients: [hospital-a, hospital-b] + sites: [hospital-a, hospital-b] multiple-sclerosis: - clients: [hospital-a, hospital-c] + sites: [hospital-a, hospital-c] ``` ### Certificate Changes @@ -304,8 +313,8 @@ In multitenant mode (`api_version: 4`), per-project roles are resolved from the The scheduler becomes project-aware: -1. **Candidate filtering**: Only schedule jobs to clients enrolled in the job's project -2. **Validation**: `deploy_map` sites must be a subset of the project's enrolled clients +1. **Candidate filtering**: Only schedule jobs to sites enrolled in the job's project (client-type sites) +2. **Validation**: `deploy_map` sites must be a subset of the project's enrolled sites 3. **Quota/priority**: Deferred. K8s-level resource quotas per namespace may suffice initially. Future option: route different projects to different K8s scheduling queues via pod labels/nodeSelectors. --- @@ -330,18 +339,18 @@ The project becomes a property of the job, and ProdEnv prepares the correspondin ### Kubernetes (Primary Target) -Clients participate in all their enrolled projects. **Data isolation is achieved by mounting different PersistentVolumes per project in each job pod.** The Flare client parent process runs in its own pod (or on the node) and does not mount project data PVs — it only orchestrates job pod creation. +Clients participate in all their enrolled projects. 
**Data isolation is achieved by mounting project-scoped workspace volumes in each job pod.** The Flare client parent process runs in its own pod (or on the node) and does not mount project data volumes — it only orchestrates job pod creation.
 
 | Concern | Mechanism |
 |---------|-----------|
-| Namespace isolation | One K8s namespace per project |
-| Storage isolation | PersistentVolumeClaim per project per client (not hostPath) |
+| Namespace isolation | Deployment-defined strategy (recommended: one namespace per project; supported: shared namespace or per-job namespace) |
+| Storage isolation | Workspace volume resolved by `(project, client, job pod namespace)` (not hostPath) |
 | Temp directory isolation | Each pod gets its own `/tmp` via `emptyDir` — no shared host `/tmp` |
-| Network isolation | NetworkPolicy per namespace |
-| Resource limits | ResourceQuota per namespace (deferred, see Scheduler) |
+| Network isolation | NetworkPolicy scoped by project name |
+| Resource limits | ResourceQuota policy per deployment strategy (deferred, see Scheduler) |
 | Pod security | PodSecurityPolicy/Standards per namespace |
 
-Job pods are created in the project's K8s namespace, mounting a pre-provisioned PVC (`<project>-workspace`) per project per client. Each pod also gets its own `/tmp` via `emptyDir` to prevent cross-project leakage via temporary files. This applies to both server and client job pods.
+Workspace volume naming/provisioning must remain project-aware and work with either shared namespaces or per-job namespaces.
### Slurm @@ -353,8 +362,8 @@ Job pods are created in the project's K8s namespace, mounting a pre-provisioned ## FLARE API Changes -- `Session` gains a `project` parameter (defaults to `"default"`) and `set_project()`/`list_projects()` methods -- `list_jobs` is automatically filtered to the active project (replaces the `-u` user-only filter) +- `Session` gains an optional `project` parameter (defaults to `None`) and `set_project()`/`list_projects()` methods +- `list_jobs` is filtered by active project and caller role (`project_admin`: all in project, `org_admin`: own-org, `lead`: own jobs, `member`: all in project) - `get_system_info` returns only clients enrolled in the active project - All job operations validate that the target job belongs to the active project @@ -375,11 +384,20 @@ Audit logs should be queryable per project for compliance. ## Migration / Backward Compatibility -1. **Feature gate**: all multitenancy behavior gated on `project.yml` having `api_version: 4` with a `projects:` section. Without it, the system behaves identically to today. -2. **Default project**: all existing jobs, clients, and users are in the `default` project -3. **Cert role fallback**: if no project registry exists (or user has no registry mapping), fall back to cert-embedded role -4. **API compatibility**: `project` parameter defaults to `"default"` everywhere -5. **Config version**: `api_version: 4` in `project.yml` signals multi-project support; version 3 continues to work as single-tenant +1. **Phase 1 is ungated**: project plumbing (`project` argument + metadata propagation to launchers) is available independent of `api_version`. +2. **Feature gate for full multitenancy**: project registry, project-scoped RBAC, scheduler constraints, and job-store partitioning are enabled only when `project.yml` has `api_version: 4` with a `projects:` section. +3. **Default project**: all existing jobs, clients, and users are in the `default` project +4. 
**Cert role fallback**: if no project registry exists, fall back to cert-embedded role; if registry exists but user has no mapping, fallback applies only when active project is `default` +5. **API compatibility**: omitted `project` remains `None` (no default change) across phases +6. **Config version**: `api_version: 4` in `project.yml` signals full multi-project enforcement; version 3 continues to work as single-tenant + +### Release Transition Strategy (2.8 -> 2.9) + +1. **Upgrade to 2.8 (Phase 1 only)**: optional project tagging/plumbing is available, but no multitenant access-control or scheduler behavior changes are enabled. +2. **Upgrade to 2.9 with existing v3 deployments**: keep current `project.yml` (`api_version: 3`) and startup kits; system remains single-tenant/compatibility mode. +3. **Existing jobs continue to work**: legacy jobs remain at `jobs//` as `default`; no data migration is required. +4. **Activate full multi-project mode when ready**: deploy a v4 `project.yml` (`api_version: 4` + `projects:`) to server startup artifacts and restart server to load registry-backed project scoping. +5. **Provisioning impact**: no full reprovision is required; keep dynamic provisioning behavior by updating server-side artifacts and generating startup kits only for newly added or changed participants. --- @@ -391,29 +409,18 @@ Audit logs should be queryable per project for compliance. | D2 | Project lifecycle management? | **Deferred.** Projects are defined at provisioning time in `project.yml`. Runtime project CRUD is not in scope for v1. | | D3 | Per-project quota management? | **Deferred.** Rely on K8s ResourceQuota per namespace for now. Future: route projects to different K8s scheduling queues via pod labels. | | D4 | `check_status` information leakage? 
| **Server has global knowledge, filtering the response is sufficient.** The server parent process knows about all clients and jobs; it filters responses to only include resources in the user's active project. No architectural change needed. |
-| D5 | Server-side job store isolation? | **Server job pods must only access their project's data.** The server job process (running in K8s/Docker) must not mount the entire job store — only the project-partitioned slice. Current `FilesystemStorage` will be replaced by a database or object store in the future, which will enforce project-scoped access natively. For v1 with filesystem: mount only `jobs/<project>/` into the server job pod. No migration of existing jobs — they stay at `jobs/<job_id>/` and belong to `default`. |
+| D5 | Server-side job store isolation? | **Server job pods must only access their project's data.** The server job process (running in K8s/Docker) must not mount the entire job store — only the project-partitioned slice for new-layout jobs (`jobs/<project>/...`). Legacy jobs remain at `jobs/<job_id>/` under `default`; they are served by the main server process for compatibility and are not mounted into new server job pods. Current `FilesystemStorage` will be replaced by a database or object store in the future, which will enforce project-scoped access natively. |
 | D6 | Role storage: certs vs. server-side registry? | **Layered: registry overrides cert.** `project.yml` defines per-project roles; the server loads it at startup via `ProjectRegistry`. Certs continue to authenticate identity (name, org) and carry a role as fallback. No cert format change required. |
 | D7 | How do shared clients know which project PV to mount? | **The launcher passes the project name to the client.** Job metadata carries the project; the server includes it when dispatching to clients. The client-side `K8sJobLauncher`/`DockerJobLauncher` uses the project name to select the correct PV/volume mount. |
 | D8 | Cross-project isolation in subprocess mode?
| **Subprocess mode is single-tenant/trusted only.** Only K8s, Docker, and Slurm launchers provide secure multi-tenant isolation (separate namespaces, volumes, `/tmp`). The default subprocess launcher offers no physical isolation and is only suitable for single-tenant or trusted environments. | +| D9 | Cross-project visibility for `platform_admin` job data? | **No.** `platform_admin` does not get cross-project job metadata/data visibility and there is no `list_jobs --all-projects` behavior in v1. If the same human also has a project-scoped role in the active project, only that project-scoped role grants job access. | +| D10 | Provisioning model at scale? | **Keep dynamic provisioning behavior.** Adding sites/users should not require reprovisioning all existing sites; update server-side config/startup artifacts and generate kits only for newly added or changed participants. | --- ## Unresolved Questions -1. **Cross-project visibility**: Can a platform admin see job metadata across all projects (for debugging)? Should `list_jobs` have a `--all-projects` flag for platform admins? - -2. **Existing `scope` concept**: The `scope` concept will be deprecated in favor of `project`. The `project` boundary subsumes data-governance scoping; existing `scope` usage will be migrated to `project`. - -3. **External IdP integration**: SSO is a follow-on (see Future: SSO section), but should the `ProjectRegistry` interface be designed now to accommodate an IdP backend later? What claims/attributes should the IdP provide (project membership, role, org)? - -4. **Shell commands (pwd, ls, cat, head, tail, grep)**: These allow direct filesystem access on server/client sites. In a multi-tenant environment: - - How do we restrict file access to the active project's workspace? Current implementation does basic path validation (no `..`, no absolute paths) but has no project awareness. 
- - In K8s, project data lives on per-project PVs that are only mounted into job pods — the client parent process does not have them mounted. Shell commands executed on the parent process have **no access** to project data at all. - - Options: (a) disable shell commands in multi-tenant mode, (b) replace with a project-scoped log/artifact download API that retrieves data from the job store, (c) route shell commands to a running job pod (requires the job to be active), (d) launch an ephemeral "debug pod" in the project's namespace with the project PV mounted. - - The current `cat log.txt` pattern assumes a single workspace. With per-project workspaces, the working directory concept needs redefinition. - - **This is a significant UX change** — today admins rely heavily on shell commands for debugging. Need a clear alternative. - -5. **Provisioning at scale**: With N projects and M users, the current "one provision run per project" model means M*N startup kits in the worst case. Is a shared-CA model with a single startup kit per user viable? +1. **Shell-command replacement UX**: Parent-process shell commands are backend-dependent and cannot be relied on for job workspace access (notably in K8s, but this can happen in single-project setups too). The right policy and UX replacement need further study (for example log/artifact APIs vs pod-targeted debug workflows). --- @@ -456,9 +463,9 @@ Phase 1 delivers no access control, no job store partitioning, and no cert/regis ### Scope -1. Add `project: str = "default"` parameter to `ProdEnv` and `PocEnv`. +1. Add `project: Optional[str] = None` parameter to `ProdEnv` and `PocEnv`. 2. Pass `project` through to the job metadata at submission time. -3. `K8sJobLauncher` reads `project` from job metadata and selects the corresponding PVC (`-workspace`). +3. `K8sJobLauncher` reads `project` from job metadata and selects the corresponding project workspace volume. 4. 
`DockerJobLauncher` reads `project` from job metadata and mounts `/data//` as the workspace volume. 5. No changes to authorization, job store paths, `project.yml`, scheduler, or any other component. diff --git a/docs/design/multiproject_implementation.md b/docs/design/multiproject_implementation.md index 03f1768511..62b0773b97 100644 --- a/docs/design/multiproject_implementation.md +++ b/docs/design/multiproject_implementation.md @@ -1,12 +1,12 @@ # Multitenancy Implementation Plan -Companion to [multitenancy.md](multitenancy.md). This document specifies *how* to implement the design with minimal risk. +Companion to [multiproject.md](multiproject.md). This document specifies *how* to implement the design with minimal risk. ## Guiding Principles -1. **Feature-gated** — all multitenancy behavior gated on `project.yml` having a `projects:` section (`api_version: 4`). Single-tenant deployments unchanged. +1. **Phased rollout** — Phase 1 project plumbing is ungated; full multitenancy enforcement is gated on `project.yml` having a `projects:` section (`api_version: 4`). 2. **Additive, not migratory** — no job store path migration, no role renames. New code paths only. -3. **Layered role resolution** — registry overrides cert, cert remains fallback. No breaking change to cert format. +3. **Layered role resolution** — registry role for active project when available; cert remains fallback for `default` only. No breaking change to cert format. 4. **Incremental delivery** — three shippable milestones, each independently testable. --- @@ -53,10 +53,17 @@ Key files and their roles (discovered via exploration): **Mitigation**: no migration. New multitenant jobs go to `jobs///`. Existing jobs stay at `jobs//` and implicitly belong to `default`. `SimpleJobDefManager` checks both paths when listing `default` project jobs. ### Problem: `project_admin` role rename -**Mitigation**: no rename. `project_admin` already fits as the per-project admin concept. 
Add `platform_admin` as a new global role. Existing `must_be_project_admin()` checks become "is user `project_admin` in any project OR `platform_admin`" — backward compatible. +**Mitigation**: no rename. `project_admin` already fits as the per-project admin concept. Add `platform_admin` as a new global role. `must_be_project_admin()`-style checks for project job operations must remain project-scoped (no implicit `platform_admin` job-data access). ### Problem: Cert role becomes vestigial -**Mitigation**: layered resolution. If `ProjectRegistry` has a mapping for this user+project, use it. Otherwise fall back to cert role. Cert format unchanged; no re-provisioning required for existing deployments. +**Mitigation**: layered resolution. If `ProjectRegistry` has a mapping for this user+project, use it. If not, cert fallback applies only when active project is `default`; otherwise deny. Cert format unchanged; no re-provisioning required for existing deployments. + +### Problem: Project name injection/path hazards +**Mitigation**: centralized `validate_project_name()` used by provisioning and job submission. Rules: +- Must be non-empty string, length `1..63` +- Must match `^[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?$` +- Invalid names are rejected (no auto-truncation) +- `default` reserved as system project name ### Problem: Cross-cutting feature requires all-or-nothing **Mitigation**: three milestones. Milestone 1 adds plumbing with zero behavior change. Milestone 2 adds the registry (gated). Milestone 3 enforces scoping. @@ -65,7 +72,7 @@ Key files and their roles (discovered via exploration): ## Milestone 1: Project-Aware Plumbing -**Goal**: thread `project` through the stack, always `"default"`. All existing tests pass, zero behavior change. +**Goal**: thread `project` through the stack while preserving current API defaults (`None`). All existing tests pass, zero behavior change. 
### 1.1 Add `JobMetaKey.PROJECT` @@ -93,7 +100,10 @@ class ConnProps(object): In `submit_job()` (~line 604), after setting submitter info: ```python -meta[JobMetaKey.PROJECT.value] = conn.get_prop(ConnProps.PROJECT, "default") +project = conn.get_prop(ConnProps.PROJECT, None) +if project is not None: + validate_project_name(project) +meta[JobMetaKey.PROJECT.value] = project ``` Same for `clone_job()` (~line 504). @@ -102,20 +112,20 @@ Same for `clone_job()` (~line 504). **File**: `nvflare/fuel/hci/server/sess.py` -Add `project="default"` to `Session.__init__()`. Include `"p"` key in token encoding. Set `ConnProps.PROJECT` from session in `LoginModule.pre_command()`. +Add `project: Optional[str] = None` to `Session.__init__()`. Include `"p"` key in token encoding. Set `ConnProps.PROJECT` from session in `LoginModule.pre_command()`. ### 1.5 Add `project` to client-side `Session` **File**: `nvflare/fuel/flare_api/flare_api.py` -- Add `project="default"` param to `Session.__init__()` and `new_secure_session()` +- Add `project: Optional[str] = None` param to `Session.__init__()` and `new_secure_session()` - Store as `self._project`, pass to server on connect ### 1.6 Add `project` to `ProdEnv` **File**: `nvflare/recipe/prod_env.py` -Add `project="default"` param, pass through to `SessionManager`. +Add `project: Optional[str] = None` param, pass through to `SessionManager`. ### 1.7 Add `project` to audit events @@ -125,7 +135,7 @@ Add `project` param to `add_event()`. Emit `[P:project]` in log line. **File**: `nvflare/fuel/hci/server/audit.py` -Pass `conn.get_prop(ConnProps.PROJECT, "default")` to `add_event()`. +Pass `conn.get_prop(ConnProps.PROJECT, None)` to `add_event()`. ### 1.8 Filter `list_jobs` by project @@ -167,15 +177,15 @@ Extract shared helper `_is_job_in_project(job_meta, project)` for reuse across a ```python class ProjectRegistry: - """Resolves project membership, client enrollment, and per-project roles. 
+ """Resolves project membership, site enrollment, and per-project roles. Loaded from project.yml at server startup. When absent or api_version < 4, - operates in single-tenant mode (all users/clients in 'default' project, + operates in single-tenant mode (all users/sites in 'default' project, roles from certs). """ def __init__(self): - self._projects = {} # name -> {clients: set, users: dict} + self._projects = {} # name -> {sites: set, users: dict} self._multitenant = False def load_from_config(self, project_dict: dict): @@ -190,8 +200,8 @@ class ProjectRegistry: """All project names.""" ... - def get_project_clients(self, project: str) -> Set[str]: - """Client names enrolled in project.""" + def get_project_sites(self, project: str) -> Set[str]: + """Site names enrolled in project (client-type sites).""" ... def get_user_projects(self, username: str) -> List[str]: @@ -209,7 +219,7 @@ class ProjectRegistry: def is_user_in_project(self, username: str, project: str) -> bool: ... - def is_client_in_project(self, client_name: str, project: str) -> bool: + def is_site_in_project(self, site_name: str, project: str) -> bool: ... ``` @@ -328,7 +338,8 @@ cmd_reg.add_filter(authz_filter) Extend `prepare_project()` to accept `api_version` 3 or 4. When 4: - Parse `projects:` section - Parse per-admin `projects:` mapping -- Validate referenced clients exist in participants +- Validate referenced project sites exist in participants and are client-type sites +- Preserve dynamic provisioning behavior: adding users/sites should not require reprovisioning all existing sites; regenerate/update startup artifacts only for the server and newly added/changed participants. **File**: `nvflare/lighter/entity.py` @@ -362,7 +373,7 @@ Copy `project.yml` into server startup kit directory. ## Milestone 3: Enforcement + Scheduler -**Goal**: all commands scoped to active project. Scheduler validates project client enrollment. +**Goal**: all commands scoped to active project. 
Scheduler validates project site enrollment (client-type sites). ### 3.1 All job commands: project gate @@ -386,19 +397,19 @@ def authorize_job_id(self, conn, args): This is a single change point since all job commands route through `authorize_job_id()`. -### 3.2 Infrastructure commands: filter to project clients +### 3.2 Infrastructure commands: filter to project sites **File**: `nvflare/private/fed/server/training_cmds.py` (and `cmd_utils.py`) -In `validate_command_targets()`, filter target list to clients enrolled in the active project: +In `validate_command_targets()`, filter target list to sites enrolled in the active project: ```python registry = ProjectRegistryService.get_registry() if registry and registry.is_multitenant(): project = conn.get_prop(ConnProps.PROJECT, "default") for target in targets: - if not registry.is_client_in_project(target, project): - conn.append_error(f"Client '{target}' not in project '{project}'") + if not registry.is_site_in_project(target, project): + conn.append_error(f"Site '{target}' not in project '{project}'") return PreAuthzReturnCode.ERROR ``` @@ -418,9 +429,9 @@ In `_try_job()`, after extracting applicable sites (~line 126): registry = ProjectRegistryService.get_registry() if registry and registry.is_multitenant(): project = job_meta.get(JobMetaKey.PROJECT.value, "default") - project_clients = registry.get_project_clients(project) + project_sites = registry.get_project_sites(project) for site in applicable_sites: - if site != SERVER_SITE_NAME and site not in project_clients: + if site != SERVER_SITE_NAME and site not in project_sites: return (SCHEDULE_RESULT_BLOCK, None, f"Site {site} not in project {project}") ``` @@ -454,9 +465,9 @@ def job_uri(self, jid, project=None): | File | Change | |------|--------| | `nvflare/private/fed/server/job_cmds.py` | project gate in authorize_job_id | -| `nvflare/private/fed/server/training_cmds.py` | filter infra commands to project clients | +| 
`nvflare/private/fed/server/training_cmds.py` | filter infra commands to project sites | | `nvflare/private/fed/server/cmd_utils.py` | project-aware target validation | -| `nvflare/app_common/job_schedulers/job_scheduler.py` | deploy_map vs project clients | +| `nvflare/app_common/job_schedulers/job_scheduler.py` | deploy_map vs project sites | | `nvflare/apis/impl/job_def_manager.py` | partitioned job_uri for new jobs | | `tests/unit_test/...` (5 new files) | ~630 lines of tests | @@ -494,6 +505,4 @@ def job_uri(self, jid, project=None): 2. **Token encoding**: should the project be part of the session token, or sent as a command header? Token means re-auth on project switch; header is simpler but requires server-side validation on every command. -3. **`list_jobs --all-projects`**: should `platform_admin` have a flag to see all projects' jobs? Useful for debugging but increases blast radius. - -4. **Provisioner backward compat**: when `api_version: 4` project.yml is used, should the provisioner still bake a role into the cert? Options: (a) bake the first project's role as a fallback, (b) leave `UNSTRUCTURED_NAME` empty, (c) bake a sentinel value like `"multitenant"`. +3. **Provisioner backward compat**: when `api_version: 4` project.yml is used, should the provisioner still bake a role into the cert? Options: (a) bake the first project's role as a fallback, (b) leave `UNSTRUCTURED_NAME` empty, (c) bake a sentinel value like `"multitenant"`. 
From 7480dd4eb6c0d85d598199163817a54fc108eb96 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Wed, 4 Mar 2026 13:52:51 -0800 Subject: [PATCH 03/11] docs: add revision history to multiproject design --- docs/design/multiproject.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/design/multiproject.md b/docs/design/multiproject.md index 8bc9407d5c..4c1a399a95 100644 --- a/docs/design/multiproject.md +++ b/docs/design/multiproject.md @@ -1,5 +1,12 @@ # Multi-Project Support in Flare +## Revision History + +| Version | Notes | +|---------|-------| +| 1 | Initial version | +| 2 | Incorporate feedback and Mayo discussion | + ## Introduction Flare currently operates as a single-tenant system. All server and client processes run under the same Linux user, all jobs share a flat store (`jobs//`), and every authorized admin can see and act on every job. There is no data segregation between different collaborations running on the same infrastructure. From c69831a1ce89b688e514b10c7265336ad4f28113 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 13:24:25 -0800 Subject: [PATCH 04/11] feat: add phase 1 project plumbing --- docs/design/multiproject.md | 6 +- nvflare/apis/job_def.py | 1 + nvflare/apis/utils/format_check.py | 1 + .../app_opt/job_launcher/docker_launcher.py | 8 +- nvflare/app_opt/job_launcher/k8s_launcher.py | 7 ++ nvflare/fuel/flare_api/flare_api.py | 50 ++++++++++-- nvflare/private/fed/server/job_cmds.py | 34 ++++++++ nvflare/recipe/poc_env.py | 11 +++ nvflare/recipe/prod_env.py | 11 +++ .../unit_test/apis/utils/format_check_test.py | 20 +++++ .../fuel/flare_api/flare_api_project_test.py | 78 +++++++++++++++++++ .../private/fed/server/job_cmds_test.py | 36 ++++++++- tests/unit_test/recipe/poc_env_test.py | 13 ++++ tests/unit_test/recipe/prod_env_test.py | 35 +++++++++ 14 files changed, 299 insertions(+), 12 deletions(-) create mode 100644 tests/unit_test/fuel/flare_api/flare_api_project_test.py create mode 100644 
tests/unit_test/recipe/prod_env_test.py diff --git a/docs/design/multiproject.md b/docs/design/multiproject.md index 4c1a399a95..f76050f17f 100644 --- a/docs/design/multiproject.md +++ b/docs/design/multiproject.md @@ -126,6 +126,8 @@ Project create/archive is deferred for v1 (projects are provisioning-time config `project` becomes a first-class, immutable field on every job. Set at submission time from the user's active project context. Cannot be changed after creation. +The project value is syntactically validated at the user-facing API layer and again on the server before it is persisted into job metadata. This prevents invalid or path-like values from reaching runtime launchers. + ### Job Store Partitioning New multitenant jobs are stored at `jobs///` (vs. current `jobs//`). No migration of existing jobs — they remain at `jobs//` and implicitly belong to the `default` project. @@ -471,7 +473,7 @@ Phase 1 delivers no access control, no job store partitioning, and no cert/regis ### Scope 1. Add `project: Optional[str] = None` parameter to `ProdEnv` and `PocEnv`. -2. Pass `project` through to the job metadata at submission time. +2. Pass `project` through to the job metadata at submission/clone time, with syntax validation before persistence. 3. `K8sJobLauncher` reads `project` from job metadata and selects the corresponding project workspace volume. 4. `DockerJobLauncher` reads `project` from job metadata and mounts `/data//` as the workspace volume. 5. No changes to authorization, job store paths, `project.yml`, scheduler, or any other component. @@ -483,7 +485,7 @@ Phase 1 delivers no access control, no job store partitioning, and no cert/regis ### What this does NOT do -- No access control — any user can submit to any project name. +- No access control — any user can submit to any valid project name. - No job store partitioning (`jobs//` path unchanged). - No `project.yml` parsing or `ProjectRegistry`. - No `set_project` / `list_projects` admin commands. 
diff --git a/nvflare/apis/job_def.py b/nvflare/apis/job_def.py index 56faae4fcb..be9f9b3062 100644 --- a/nvflare/apis/job_def.py +++ b/nvflare/apis/job_def.py @@ -76,6 +76,7 @@ class JobMetaKey(str, Enum): CUSTOM_PROPS = "custom_props" EDGE_METHOD = "edge_method" JOB_CLIENTS = "job_clients" # clients that participated the job + PROJECT = "project" def __repr__(self): return self.value diff --git a/nvflare/apis/utils/format_check.py b/nvflare/apis/utils/format_check.py index 41c11ea600..5f91584ac1 100644 --- a/nvflare/apis/utils/format_check.py +++ b/nvflare/apis/utils/format_check.py @@ -27,6 +27,7 @@ "email": r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}$", "org": r"^[A-Za-z0-9_]+$", "simple_name": r"^[A-Za-z0-9_]+$", + "project": r"^[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?$", } diff --git a/nvflare/app_opt/job_launcher/docker_launcher.py b/nvflare/app_opt/job_launcher/docker_launcher.py index 8779e4711d..ec3b73d02b 100644 --- a/nvflare/app_opt/job_launcher/docker_launcher.py +++ b/nvflare/app_opt/job_launcher/docker_launcher.py @@ -20,6 +20,7 @@ from nvflare.apis.event_type import EventType from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext +from nvflare.apis.job_def import JobMetaKey from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec, JobReturnCode, add_launcher from nvflare.apis.workspace import Workspace from nvflare.utils.job_launcher_utils import extract_job_image, generate_client_command, generate_server_command @@ -117,8 +118,13 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: command = f' /bin/bash -c "export PYTHONPATH={python_path};{cmd}"' self.logger.info(f"Launch image:{job_image}, run command: {command}") + project = job_meta.get(JobMetaKey.PROJECT.value, "") docker_workspace = os.environ.get("NVFL_DOCKER_WORKSPACE") - self.logger.info(f"launch_job {job_id} in docker_workspace: {docker_workspace}") + if docker_workspace and 
isinstance(project, str) and project and project != "default": + docker_workspace = os.path.join(docker_workspace, project) + + self.logger.info(f"launch_job {job_id} in docker_workspace: {docker_workspace} (project={project})") + docker_client = docker.from_env() try: container = docker_client.containers.run( diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 19e6716bc2..9410cab4ec 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -228,6 +228,13 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: raise RuntimeError(f"missing {FLContextKey.JOB_PROCESS_ARGS} in FLContext") _, job_cmd = job_args[JobProcessArgs.EXE_MODULE] + # TODO: Make the K8s launcher project-aware with minimal code churn. + # The intended change is only to read the optional job_meta["project"] + # and use it to resolve project-specific Kubernetes settings before pod + # launch. That settings lookup may include workspace volume/path plus + # any other K8s deployment settings required for the selected project. + # Keep the existing launch flow unchanged; only the settings resolution + # should become project-aware. 
job_config = { "name": job_id, "image": job_image, diff --git a/nvflare/fuel/flare_api/flare_api.py b/nvflare/fuel/flare_api/flare_api.py index 14b5624228..5d9ddb6cc6 100644 --- a/nvflare/fuel/flare_api/flare_api.py +++ b/nvflare/fuel/flare_api/flare_api.py @@ -19,6 +19,7 @@ from nvflare.apis.fl_constant import AdminCommandNames from nvflare.apis.job_def import JobMetaKey +from nvflare.apis.utils.format_check import name_check from nvflare.apis.workspace import Workspace from nvflare.fuel.common.excepts import ConfigError from nvflare.fuel.hci.client.api import AdminAPI, APIStatus, ResultKey @@ -69,6 +70,7 @@ def __init__( startup_path: str, secure_mode: bool = True, debug: bool = False, + project: Optional[str] = None, ): """Initializes a session with the NVFLARE system. @@ -77,11 +79,11 @@ def __init__( startup_path (str): path to the provisioned startup kit, which contains endpoint of the system secure_mode (bool): whether to log in with secure mode debug (bool): turn on debug or not + project (Optional[str]): project name to tag submitted/cloned jobs; None keeps existing behavior """ assert isinstance(username, str), "username must be str" assert isinstance(startup_path, str), "startup_path must be str" assert os.path.isdir(startup_path), f"startup kit does not exist at {startup_path}" - workspace = Workspace(root_dir=startup_path) conf = secure_load_admin_config(workspace) admin_config = conf.get_admin_config() @@ -105,6 +107,11 @@ def __init__( ) self.upload_dir = upload_dir self.download_dir = download_dir + if project is not None: + err, reason = name_check(project, "project") + if err: + raise ValueError(reason) + self._project = project def close(self): """Close the session.""" @@ -209,7 +216,8 @@ def clone_job(self, job_id: str) -> str: """ self._validate_job_id(job_id) - result = self._do_command(AdminCommandNames.CLONE_JOB + " " + job_id) + props = {JobMetaKey.PROJECT.value: self._project} if self._project else None + result = 
self._do_command(AdminCommandNames.CLONE_JOB + " " + job_id, props=props) meta = result[ResultKey.META] job_id = meta.get(MetaKey.JOB_ID, None) info = meta.get(MetaKey.INFO, "") @@ -241,7 +249,8 @@ def submit_job(self, job_definition_path: str) -> str: else: raise InvalidJobDefinition(f"job_definition_path '{job_definition_path}' is not a valid folder") - result = self._do_command(AdminCommandNames.SUBMIT_JOB + " " + job_definition_path) + props = {JobMetaKey.PROJECT.value: self._project} if self._project else None + result = self._do_command(AdminCommandNames.SUBMIT_JOB + " " + job_definition_path, props=props) meta = result[ResultKey.META] job_id = meta.get(MetaKey.JOB_ID, None) if not job_id: @@ -935,13 +944,26 @@ def new_session( secure_mode: bool = True, debug: bool = False, timeout: float = 10.0, + project: Optional[str] = None, ) -> Session: - session = Session(username=username, startup_path=startup_kit_location, debug=debug, secure_mode=secure_mode) + session = Session( + username=username, + startup_path=startup_kit_location, + debug=debug, + secure_mode=secure_mode, + project=project, + ) session.try_connect(timeout) return session -def new_secure_session(username: str, startup_kit_location: str, debug: bool = False, timeout: float = 10.0) -> Session: +def new_secure_session( + username: str, + startup_kit_location: str, + debug: bool = False, + timeout: float = 10.0, + project: Optional[str] = None, +) -> Session: """Create a new secure FLARE API session with the NVFLARE system. 
Args: @@ -949,20 +971,27 @@ def new_secure_session(username: str, startup_kit_location: str, debug: bool = F startup_kit_location (str): path to the provisioned startup folder, the root admin dir containing the startup folder debug (bool): enable debug mode timeout (float): how long to try to establish the session, in seconds + project (Optional[str]): project name to tag submitted/cloned jobs Returns: a Session object """ - return new_session(username, startup_kit_location, True, debug, timeout) + return new_session(username, startup_kit_location, True, debug, timeout, project=project) -def new_insecure_session(startup_kit_location: str, debug: bool = False, timeout: float = 10.0) -> Session: +def new_insecure_session( + startup_kit_location: str, + debug: bool = False, + timeout: float = 10.0, + project: Optional[str] = None, +) -> Session: """Create a new insecure FLARE API session with the NVFLARE system. Args: startup_kit_location (str): path to the provisioned startup folder debug (bool): enable debug mode timeout (float): how long to try to establish the session, in seconds + project (Optional[str]): project name to tag submitted/cloned jobs Returns: a Session object @@ -970,5 +999,10 @@ def new_insecure_session(startup_kit_location: str, debug: bool = False, timeout """ return new_session( - username="", startup_kit_location=startup_kit_location, secure_mode=False, debug=debug, timeout=timeout + username="", + startup_kit_location=startup_kit_location, + secure_mode=False, + debug=debug, + timeout=timeout, + project=project, ) diff --git a/nvflare/private/fed/server/job_cmds.py b/nvflare/private/fed/server/job_cmds.py index 36058c5818..9489a2cef9 100644 --- a/nvflare/private/fed/server/job_cmds.py +++ b/nvflare/private/fed/server/job_cmds.py @@ -25,6 +25,7 @@ from nvflare.apis.job_def_manager_spec import JobDefManagerSpec, RunStatus from nvflare.apis.shareable import Shareable from nvflare.apis.storage import DATA, JOB_ZIP, META, META_JSON, WORKSPACE, 
WORKSPACE_ZIP, StorageSpec +from nvflare.apis.utils.format_check import name_check from nvflare.fuel.hci.conn import Connection from nvflare.fuel.hci.proto import ConfirmMethod, MetaKey, MetaStatusValue, make_meta from nvflare.fuel.hci.reg import CommandModule, CommandModuleSpec, CommandSpec @@ -53,8 +54,11 @@ JobMetaKey.MIN_CLIENTS.value, JobMetaKey.MANDATORY_CLIENTS.value, JobMetaKey.DATA_STORAGE_FORMAT.value, + JobMetaKey.PROJECT.value, } +PROJECT_CMD_PROP_KEY = JobMetaKey.PROJECT.value + def _create_list_job_cmd_parser(): parser = SafeArgumentParser(prog=AdminCommandNames.LIST_JOBS) @@ -78,6 +82,32 @@ def __init__(self): super().__init__() self.logger = get_obj_logger(self) + @staticmethod + def _get_project_from_cmd_props(conn: Connection) -> str: + cmd_props = conn.get_prop(ConnProps.CMD_PROPS) + if not isinstance(cmd_props, dict): + return "" + project = cmd_props.get(PROJECT_CMD_PROP_KEY) + if not project: + return "" + err, reason = name_check(project, "project") + if err: + raise ValueError(reason) + return project + + @staticmethod + def _add_project_to_meta(meta: dict, conn: Connection) -> bool: + try: + project = JobCommandModule._get_project_from_cmd_props(conn) + except (TypeError, ValueError) as e: + err = str(e) + conn.append_error(err, meta=make_meta(MetaStatusValue.INVALID_JOB_DEFINITION, err)) + return False + + if project: + meta[JobMetaKey.PROJECT.value] = project + return True + def get_spec(self): return CommandModuleSpec( name="job_mgmt", @@ -502,6 +532,8 @@ def clone_job(self, conn: Connection, args: List[str]): job_meta[JobMetaKey.SUBMITTER_ORG.value] = conn.get_prop(ConnProps.USER_ORG) job_meta[JobMetaKey.SUBMITTER_ROLE.value] = conn.get_prop(ConnProps.USER_ROLE) job_meta[JobMetaKey.CLONED_FROM.value] = job_id + if not self._add_project_to_meta(job_meta, conn): + return meta = job_def_manager.clone(from_jid=job_id, meta=job_meta, fl_ctx=fl_ctx) new_job_id = meta.get(JobMetaKey.JOB_ID) @@ -599,6 +631,8 @@ def submit_job(self, conn: 
Connection, args: List[str]): meta[JobMetaKey.SUBMITTER_ORG.value] = conn.get_prop(ConnProps.USER_ORG, "") meta[JobMetaKey.SUBMITTER_ROLE.value] = conn.get_prop(ConnProps.USER_ROLE, "") meta[JobMetaKey.JOB_FOLDER_NAME.value] = folder_name + if not self._add_project_to_meta(meta, conn): + return custom_props = conn.get_prop(ConnProps.CUSTOM_PROPS) if custom_props: meta[JobMetaKey.CUSTOM_PROPS.value] = custom_props diff --git a/nvflare/recipe/poc_env.py b/nvflare/recipe/poc_env.py index ac6a67fadf..e8cd458c07 100644 --- a/nvflare/recipe/poc_env.py +++ b/nvflare/recipe/poc_env.py @@ -19,6 +19,7 @@ from pydantic import BaseModel, conint, model_validator +from nvflare.apis.utils.format_check import name_check from nvflare.job_config.api import FedJob from nvflare.recipe.spec import ExecEnv from nvflare.recipe.utils import _collect_non_local_scripts @@ -50,6 +51,7 @@ class _PocEnvValidator(BaseModel): docker_image: Optional[str] = None project_conf_path: str = "" username: str = DEFAULT_ADMIN_USER + project: Optional[str] = None @model_validator(mode="after") def check_client_configuration(self): @@ -67,6 +69,10 @@ def check_client_configuration(self): if self.clients is None and self.num_clients <= 0: raise ValueError("num_clients must be greater than 0") + if self.project is not None: + err, reason = name_check(self.project, "project") + if err: + raise ValueError(reason) return self @@ -87,6 +93,7 @@ def __init__( docker_image: Optional[str] = None, project_conf_path: str = "", username: str = DEFAULT_ADMIN_USER, + project: Optional[str] = None, extra: Optional[dict] = None, ): """Initialize POC execution environment. @@ -101,6 +108,7 @@ def __init__( project_conf_path (str, optional): Path to the project configuration file. Defaults to "". If specified, 'number_of_clients','clients' and 'docker' specific options will be ignored. username (str, optional): Admin user. Defaults to "admin@nvidia.com". + project (Optional[str]): Project name to tag submitted/cloned jobs. 
extra: extra env info. """ super().__init__(extra) @@ -113,6 +121,7 @@ def __init__( docker_image=docker_image, project_conf_path=project_conf_path, username=username, + project=project, ) self.clients = v.clients @@ -123,6 +132,7 @@ def __init__( self.project_conf_path = v.project_conf_path self.docker_image = v.docker_image self.username = v.username + self.project = v.project self._session_manager = None # Lazy initialization def deploy(self, job: FedJob): @@ -257,6 +267,7 @@ def _get_session_manager(self): "username": self.username, "startup_kit_location": self._get_admin_startup_kit_path(), "timeout": self.get_extra_prop("login_timeout", 10), + "project": self.project, } self._session_manager = SessionManager(session_params) return self._session_manager diff --git a/nvflare/recipe/prod_env.py b/nvflare/recipe/prod_env.py index de7dc6f9f9..030a7f6c8a 100644 --- a/nvflare/recipe/prod_env.py +++ b/nvflare/recipe/prod_env.py @@ -18,6 +18,7 @@ from pydantic import BaseModel, PositiveFloat, model_validator +from nvflare.apis.utils.format_check import name_check from nvflare.job_config.api import FedJob from nvflare.recipe.spec import ExecEnv from nvflare.recipe.utils import _collect_non_local_scripts @@ -34,11 +35,16 @@ class _ProdEnvValidator(BaseModel): startup_kit_location: str login_timeout: PositiveFloat = 5.0 username: str = DEFAULT_ADMIN_USER + project: Optional[str] = None @model_validator(mode="after") def check_startup_kit_location_exists(self) -> "_ProdEnvValidator": if not os.path.exists(self.startup_kit_location): raise ValueError(f"startup_kit_location path does not exist: {self.startup_kit_location}") + if self.project is not None: + err, reason = name_check(self.project, "project") + if err: + raise ValueError(reason) return self @@ -48,6 +54,7 @@ def __init__( startup_kit_location: str, login_timeout: float = 5.0, username: str = DEFAULT_ADMIN_USER, + project: Optional[str] = None, extra: Optional[dict] = None, ): """Production execution environment 
for submitting and monitoring NVFlare jobs. @@ -58,6 +65,7 @@ def __init__( startup_kit_location (str): Path to the admin's startup kit directory. login_timeout (float): Timeout (in seconds) for logging into the Flare API session. Must be > 0. username (str): Username to log in with. + project (Optional[str]): Project name to tag submitted/cloned jobs. extra: extra env info. """ super().__init__(extra) @@ -66,11 +74,13 @@ def __init__( startup_kit_location=startup_kit_location, login_timeout=login_timeout, username=username, + project=project, ) self.startup_kit_location = v.startup_kit_location self.login_timeout = v.login_timeout self.username = v.username + self.project = v.project self._session_manager = None # Lazy initialization def get_job_status(self, job_id: str) -> Optional[str]: @@ -103,6 +113,7 @@ def _get_session_manager(self): "username": self.username, "startup_kit_location": self.startup_kit_location, "timeout": self.login_timeout, + "project": self.project, } self._session_manager = SessionManager(session_params) return self._session_manager diff --git a/tests/unit_test/apis/utils/format_check_test.py b/tests/unit_test/apis/utils/format_check_test.py index e90b565503..824d37c010 100644 --- a/tests/unit_test/apis/utils/format_check_test.py +++ b/tests/unit_test/apis/utils/format_check_test.py @@ -55,3 +55,23 @@ def test_admin(self, name, err_value): assert err == err_value err, reason = name_check(name, "email") assert err == err_value + + @pytest.mark.parametrize( + "name, err_value", + [ + ["default", False], + ["cancer-research", False], + ["a", False], + ["a" * 63, False], + ["", True], + ["A", True], + ["abc_", True], + ["-abc", True], + ["abc-", True], + ["a" * 64, True], + ["with space", True], + ], + ) + def test_project(self, name, err_value): + err, reason = name_check(name, "project") + assert err == err_value diff --git a/tests/unit_test/fuel/flare_api/flare_api_project_test.py b/tests/unit_test/fuel/flare_api/flare_api_project_test.py 
new file mode 100644 index 0000000000..23fcebcb93 --- /dev/null +++ b/tests/unit_test/fuel/flare_api/flare_api_project_test.py @@ -0,0 +1,78 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import patch + +from nvflare.fuel.hci.client.api import ResultKey +from nvflare.fuel.hci.proto import MetaKey +from nvflare.fuel.flare_api.flare_api import Session, new_secure_session + + +def _make_session_for_project(project): + session = Session.__new__(Session) + session.upload_dir = "/tmp" + session._project = project + return session + + +def test_submit_job_sends_project_cmd_props(): + session = _make_session_for_project("cancer-research") + captured = {} + + def _fake_do_command(command, enforce_meta=True, props=None): + captured["props"] = props + return {ResultKey.META: {MetaKey.JOB_ID: "job-1"}} + + session._do_command = _fake_do_command + with patch("os.path.isdir", return_value=True): + session.submit_job("/tmp/job") + + assert captured["props"] == {"project": "cancer-research"} + + +def test_clone_job_sends_project_cmd_props(): + session = _make_session_for_project("multiple-sclerosis") + captured = {} + + def _fake_do_command(command, enforce_meta=True, props=None): + captured["props"] = props + return {ResultKey.META: {MetaKey.JOB_ID: "job-2"}} + + session._do_command = _fake_do_command + session.clone_job("source-job") + + assert captured["props"] == {"project": 
"multiple-sclerosis"} + + +def test_submit_job_without_project_keeps_cmd_props_empty(): + session = _make_session_for_project(None) + captured = {} + + def _fake_do_command(command, enforce_meta=True, props=None): + captured["props"] = props + return {ResultKey.META: {MetaKey.JOB_ID: "job-3"}} + + session._do_command = _fake_do_command + with patch("os.path.isdir", return_value=True): + session.submit_job("/tmp/job") + + assert captured["props"] is None + + +def test_new_secure_session_forwards_project(): + with patch("nvflare.fuel.flare_api.flare_api.new_session") as mock_new_session: + new_secure_session("admin@nvidia.com", "/tmp/kit", project="cancer-research") + _, kwargs = mock_new_session.call_args + assert kwargs["project"] == "cancer-research" + diff --git a/tests/unit_test/private/fed/server/job_cmds_test.py b/tests/unit_test/private/fed/server/job_cmds_test.py index a5b4290495..50edfbdf22 100644 --- a/tests/unit_test/private/fed/server/job_cmds_test.py +++ b/tests/unit_test/private/fed/server/job_cmds_test.py @@ -16,7 +16,8 @@ import pytest -from nvflare.private.fed.server.job_cmds import _create_list_job_cmd_parser +from nvflare.fuel.hci.server.constants import ConnProps +from nvflare.private.fed.server.job_cmds import JobCommandModule, _create_list_job_cmd_parser TEST_CASES = [ ( @@ -41,3 +42,36 @@ def test_parse_args(self, args: list[str], expected_args): parser = _create_list_job_cmd_parser() parsed_args = parser.parse_args(args) assert parsed_args == expected_args + + +class _MockConnection: + def __init__(self, cmd_props=None): + self._cmd_props = cmd_props + + def get_prop(self, key): + if key == ConnProps.CMD_PROPS: + return self._cmd_props + return None + + +class TestProjectCmdProps: + @pytest.mark.parametrize( + "cmd_props, expected", + [ + (None, ""), + ("not-a-dict", ""), + ({}, ""), + ({"project": ""}, ""), + ({"project": "cancer-research"}, "cancer-research"), + ({"project": "default"}, "default"), + ], + ) + def 
test_get_project_from_cmd_props(self, cmd_props, expected): + conn = _MockConnection(cmd_props=cmd_props) + assert JobCommandModule._get_project_from_cmd_props(conn) == expected + + @pytest.mark.parametrize("project", [123, "Bad Project", " cancer-research ", "../escape"]) + def test_get_project_from_cmd_props_rejects_invalid_values(self, project): + conn = _MockConnection(cmd_props={"project": project}) + with pytest.raises((TypeError, ValueError)): + JobCommandModule._get_project_from_cmd_props(conn) diff --git a/tests/unit_test/recipe/poc_env_test.py b/tests/unit_test/recipe/poc_env_test.py index b710a7e1e4..c0000042d8 100644 --- a/tests/unit_test/recipe/poc_env_test.py +++ b/tests/unit_test/recipe/poc_env_test.py @@ -61,6 +61,10 @@ def test_poc_env_validation(): with pytest.raises(ValueError, match="Inconsistent"): PocEnv(num_clients=3, clients=["site1", "site2"]) + # Test invalid project name + with pytest.raises(ValueError): + PocEnv(project="Bad Project") + def test_poc_env_client_names(): """Test PocEnv client name generation and validation.""" @@ -137,3 +141,12 @@ def test_stop_poc(mock_rmtree, mock_is_running, mock_clean_poc, mock_stop_poc, m ) mock_clean_poc.assert_called_once_with(env.poc_workspace) mock_rmtree.assert_called_once_with(env.poc_workspace, ignore_errors=True) + + +def test_poc_env_session_manager_passes_project(): + env = PocEnv(project="multiple-sclerosis") + with patch.object(env, "_get_admin_startup_kit_path", return_value="/tmp/admin@nvidia.com"): + with patch("nvflare.recipe.poc_env.SessionManager") as mock_session_manager: + env._get_session_manager() + session_params = mock_session_manager.call_args[0][0] + assert session_params["project"] == "multiple-sclerosis" diff --git a/tests/unit_test/recipe/prod_env_test.py b/tests/unit_test/recipe/prod_env_test.py new file mode 100644 index 0000000000..64ac03397e --- /dev/null +++ b/tests/unit_test/recipe/prod_env_test.py @@ -0,0 +1,35 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +from unittest.mock import patch + +import pytest + +from nvflare.recipe.prod_env import ProdEnv + + +def test_prod_env_session_manager_passes_project(): + with tempfile.TemporaryDirectory() as startup_kit_location: + env = ProdEnv(startup_kit_location=startup_kit_location, project="cancer-research") + with patch("nvflare.recipe.prod_env.SessionManager") as mock_session_manager: + env._get_session_manager() + session_params = mock_session_manager.call_args[0][0] + assert session_params["project"] == "cancer-research" + + +def test_prod_env_rejects_invalid_project_name(): + with tempfile.TemporaryDirectory() as startup_kit_location: + with pytest.raises(ValueError): + ProdEnv(startup_kit_location=startup_kit_location, project="Bad Project") From 289822354b59d4702d113d2a2496effa4411cdb6 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 13:45:04 -0800 Subject: [PATCH 05/11] refactor: simplify project metadata validation --- nvflare/private/fed/server/job_cmds.py | 36 ++++++++++--------- .../private/fed/server/job_cmds_test.py | 35 +++++++++++------- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/nvflare/private/fed/server/job_cmds.py b/nvflare/private/fed/server/job_cmds.py index 9489a2cef9..5d377f84bc 100644 --- a/nvflare/private/fed/server/job_cmds.py +++ b/nvflare/private/fed/server/job_cmds.py @@ -83,25 +83,27 @@ def __init__(self): self.logger = 
get_obj_logger(self) @staticmethod - def _get_project_from_cmd_props(conn: Connection) -> str: + def _add_project_to_meta(meta: dict, conn: Connection) -> bool: + """Validate optional project from command props and persist it into job metadata.""" + cmd_props = conn.get_prop(ConnProps.CMD_PROPS) - if not isinstance(cmd_props, dict): - return "" - project = cmd_props.get(PROJECT_CMD_PROP_KEY) - if not project: - return "" - err, reason = name_check(project, "project") - if err: - raise ValueError(reason) - return project + project = "" + error = "" + + if isinstance(cmd_props, dict): + candidate = cmd_props.get(PROJECT_CMD_PROP_KEY) + if candidate: + if not isinstance(candidate, str): + error = f"project must be str but got {type(candidate)}" + else: + invalid, reason = name_check(candidate, "project") + if invalid: + error = reason + else: + project = candidate - @staticmethod - def _add_project_to_meta(meta: dict, conn: Connection) -> bool: - try: - project = JobCommandModule._get_project_from_cmd_props(conn) - except (TypeError, ValueError) as e: - err = str(e) - conn.append_error(err, meta=make_meta(MetaStatusValue.INVALID_JOB_DEFINITION, err)) + if error: + conn.append_error(error, meta=make_meta(MetaStatusValue.INVALID_JOB_DEFINITION, error)) return False if project: diff --git a/tests/unit_test/private/fed/server/job_cmds_test.py b/tests/unit_test/private/fed/server/job_cmds_test.py index 50edfbdf22..a7acc0a3b2 100644 --- a/tests/unit_test/private/fed/server/job_cmds_test.py +++ b/tests/unit_test/private/fed/server/job_cmds_test.py @@ -47,31 +47,42 @@ def test_parse_args(self, args: list[str], expected_args): class _MockConnection: def __init__(self, cmd_props=None): self._cmd_props = cmd_props + self.errors = [] def get_prop(self, key): if key == ConnProps.CMD_PROPS: return self._cmd_props return None + def append_error(self, msg, meta=None): + self.errors.append((msg, meta)) + class TestProjectCmdProps: @pytest.mark.parametrize( - "cmd_props, expected", + 
"cmd_props, expected_meta", [ - (None, ""), - ("not-a-dict", ""), - ({}, ""), - ({"project": ""}, ""), - ({"project": "cancer-research"}, "cancer-research"), - ({"project": "default"}, "default"), + (None, {}), + ("not-a-dict", {}), + ({}, {}), + ({"project": ""}, {}), + ({"project": "cancer-research"}, {"project": "cancer-research"}), + ({"project": "default"}, {"project": "default"}), ], ) - def test_get_project_from_cmd_props(self, cmd_props, expected): + def test_add_project_to_meta(self, cmd_props, expected_meta): conn = _MockConnection(cmd_props=cmd_props) - assert JobCommandModule._get_project_from_cmd_props(conn) == expected + meta = {} + + assert JobCommandModule._add_project_to_meta(meta, conn) is True + assert meta == expected_meta + assert conn.errors == [] @pytest.mark.parametrize("project", [123, "Bad Project", " cancer-research ", "../escape"]) - def test_get_project_from_cmd_props_rejects_invalid_values(self, project): + def test_add_project_to_meta_rejects_invalid_values(self, project): conn = _MockConnection(cmd_props={"project": project}) - with pytest.raises((TypeError, ValueError)): - JobCommandModule._get_project_from_cmd_props(conn) + meta = {} + + assert JobCommandModule._add_project_to_meta(meta, conn) is False + assert meta == {} + assert len(conn.errors) == 1 From 6e8d98eac8ace2f7af909c912ebacf747011c699 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 15:10:06 -0800 Subject: [PATCH 06/11] docs: clarify docker project workspace behavior --- nvflare/app_opt/job_launcher/docker_launcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nvflare/app_opt/job_launcher/docker_launcher.py b/nvflare/app_opt/job_launcher/docker_launcher.py index ec3b73d02b..0d52c80807 100644 --- a/nvflare/app_opt/job_launcher/docker_launcher.py +++ b/nvflare/app_opt/job_launcher/docker_launcher.py @@ -120,6 +120,8 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: project = job_meta.get(JobMetaKey.PROJECT.value, "") 
docker_workspace = os.environ.get("NVFL_DOCKER_WORKSPACE") + # Keep legacy jobs on the existing workspace root; only non-default projects + # get a project-specific subdirectory under the configured Docker workspace. if docker_workspace and isinstance(project, str) and project and project != "default": docker_workspace = os.path.join(docker_workspace, project) From 2ee816b21ba378641d8445807b0b1112072d0d13 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 15:13:31 -0800 Subject: [PATCH 07/11] docs: keep multiproject implementation notes local --- docs/design/multiproject_implementation.md | 508 --------------------- 1 file changed, 508 deletions(-) delete mode 100644 docs/design/multiproject_implementation.md diff --git a/docs/design/multiproject_implementation.md b/docs/design/multiproject_implementation.md deleted file mode 100644 index 62b0773b97..0000000000 --- a/docs/design/multiproject_implementation.md +++ /dev/null @@ -1,508 +0,0 @@ -# Multitenancy Implementation Plan - -Companion to [multiproject.md](multiproject.md). This document specifies *how* to implement the design with minimal risk. - -## Guiding Principles - -1. **Phased rollout** — Phase 1 project plumbing is ungated; full multitenancy enforcement is gated on `project.yml` having a `projects:` section (`api_version: 4`). -2. **Additive, not migratory** — no job store path migration, no role renames. New code paths only. -3. **Layered role resolution** — registry role for active project when available; cert remains fallback for `default` only. No breaking change to cert format. -4. **Incremental delivery** — three shippable milestones, each independently testable. 
- ---- - -## Codebase Map - -Key files and their roles (discovered via exploration): - -| Component | File | Key Classes/Functions | -|-----------|------|----------------------| -| Job metadata | `nvflare/apis/job_def.py:48-82` | `JobMetaKey` enum | -| Job store | `nvflare/app_common/storages/filesystem_storage.py` | `FilesystemStorage` | -| Job manager | `nvflare/apis/impl/job_def_manager.py` | `SimpleJobDefManager` | -| Job submission | `nvflare/private/fed/server/job_cmds.py:564-618` | `submit_job()` handler | -| Job listing | `nvflare/private/fed/server/job_cmds.py:316-367` | `list_jobs()` handler | -| Job scheduling | `nvflare/app_common/job_schedulers/job_scheduler.py:101-230` | `DefaultJobScheduler._try_job()` | -| Authz policy | `nvflare/fuel/sec/authz.py` | `AuthorizationService`, `Policy`, `Authorizer` | -| Authz filter | `nvflare/fuel/hci/server/authz.py:44-94` | `AuthzFilter.pre_command()` | -| Login/session | `nvflare/fuel/hci/server/login.py:69-119` | `handle_cert_login()` | -| Server session | `nvflare/fuel/hci/server/sess.py:33-86` | `Session` (user_name, user_org, user_role) | -| Conn properties | `nvflare/fuel/hci/server/constants.py:16-45` | `ConnProps` | -| Role from cert | `nvflare/fuel/hci/security.py:74-98` | `get_identity_info()` → `UNSTRUCTURED_NAME` | -| Admin server | `nvflare/private/fed/server/admin.py:95-174` | `FedAdminServer` (filter chain setup) | -| Cmd authz utils | `nvflare/private/fed/server/cmd_utils.py:41-148` | `authorize_job()`, `must_be_project_admin()` | -| FLARE API Session | `nvflare/fuel/flare_api/flare_api.py:65-108` | `Session` (client-side) | -| `new_secure_session` | `nvflare/fuel/flare_api/flare_api.py:944-956` | Session factory | -| ProdEnv | `nvflare/recipe/prod_env.py:45-108` | `ProdEnv` (recipe execution env) | -| SessionManager | `nvflare/recipe/session_mgr.py:40-106` | `SessionManager` | -| Provisioner | `nvflare/lighter/provision.py:132-170` | `prepare_project()`, loads `project.yml` | -| Project entity | 
`nvflare/lighter/entity.py:370-573` | `Project` class | -| Cert generation | `nvflare/lighter/impl/cert.py:296-347` | `get_pri_key_cert()` | -| x509 role field | `nvflare/lighter/utils.py:129-135` | `x509_name()` → `UNSTRUCTURED_NAME` | -| Admin roles | `nvflare/lighter/constants.py:109-116` | `AdminRole`, `DEFINED_ROLES` | -| Audit | `nvflare/fuel/sec/audit.py:90-125` | `AuditService` singleton | -| Audit filter | `nvflare/fuel/hci/server/audit.py:23-46` | `CommandAudit.pre_command()` | -| Security init | `nvflare/private/fed/utils/fed_utils.py:98-147` | `security_init()` | -| FLAuthorizer | `nvflare/security/security.py:21-74` | `FLAuthorizer`, `COMMAND_CATEGORIES` | - ---- - -## Risk Mitigations - -### Problem: Job store path migration -**Mitigation**: no migration. New multitenant jobs go to `jobs///`. Existing jobs stay at `jobs//` and implicitly belong to `default`. `SimpleJobDefManager` checks both paths when listing `default` project jobs. - -### Problem: `project_admin` role rename -**Mitigation**: no rename. `project_admin` already fits as the per-project admin concept. Add `platform_admin` as a new global role. `must_be_project_admin()`-style checks for project job operations must remain project-scoped (no implicit `platform_admin` job-data access). - -### Problem: Cert role becomes vestigial -**Mitigation**: layered resolution. If `ProjectRegistry` has a mapping for this user+project, use it. If not, cert fallback applies only when active project is `default`; otherwise deny. Cert format unchanged; no re-provisioning required for existing deployments. - -### Problem: Project name injection/path hazards -**Mitigation**: centralized `validate_project_name()` used by provisioning and job submission. 
Rules: -- Must be non-empty string, length `1..63` -- Must match `^[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?$` -- Invalid names are rejected (no auto-truncation) -- `default` reserved as system project name - -### Problem: Cross-cutting feature requires all-or-nothing -**Mitigation**: three milestones. Milestone 1 adds plumbing with zero behavior change. Milestone 2 adds the registry (gated). Milestone 3 enforces scoping. - ---- - -## Milestone 1: Project-Aware Plumbing - -**Goal**: thread `project` through the stack while preserving current API defaults (`None`). All existing tests pass, zero behavior change. - -### 1.1 Add `JobMetaKey.PROJECT` - -**File**: `nvflare/apis/job_def.py` - -```python -class JobMetaKey(str, Enum): - ... - PROJECT = "project" -``` - -### 1.2 Add `ConnProps.PROJECT` - -**File**: `nvflare/fuel/hci/server/constants.py` - -```python -class ConnProps(object): - ... - PROJECT = "_project" -``` - -### 1.3 Stamp project on job submission - -**File**: `nvflare/private/fed/server/job_cmds.py` - -In `submit_job()` (~line 604), after setting submitter info: -```python -project = conn.get_prop(ConnProps.PROJECT, None) -if project is not None: - validate_project_name(project) -meta[JobMetaKey.PROJECT.value] = project -``` - -Same for `clone_job()` (~line 504). - -### 1.4 Add `project` to server-side `Session` - -**File**: `nvflare/fuel/hci/server/sess.py` - -Add `project: Optional[str] = None` to `Session.__init__()`. Include `"p"` key in token encoding. Set `ConnProps.PROJECT` from session in `LoginModule.pre_command()`. - -### 1.5 Add `project` to client-side `Session` - -**File**: `nvflare/fuel/flare_api/flare_api.py` - -- Add `project: Optional[str] = None` param to `Session.__init__()` and `new_secure_session()` -- Store as `self._project`, pass to server on connect - -### 1.6 Add `project` to `ProdEnv` - -**File**: `nvflare/recipe/prod_env.py` - -Add `project: Optional[str] = None` param, pass through to `SessionManager`. 
- -### 1.7 Add `project` to audit events - -**File**: `nvflare/fuel/sec/audit.py` - -Add `project` param to `add_event()`. Emit `[P:project]` in log line. - -**File**: `nvflare/fuel/hci/server/audit.py` - -Pass `conn.get_prop(ConnProps.PROJECT, None)` to `add_event()`. - -### 1.8 Filter `list_jobs` by project - -**File**: `nvflare/private/fed/server/job_cmds.py` - -Add project predicate to `_job_match()`: -```python -and ((not project) or job_meta.get("project", "default") == project) -``` - -Extract shared helper `_is_job_in_project(job_meta, project)` for reuse across all job command handlers. - -### Milestone 1 Summary - -| File | Change | -|------|--------| -| `nvflare/apis/job_def.py` | +1 enum value | -| `nvflare/fuel/hci/server/constants.py` | +1 constant | -| `nvflare/private/fed/server/job_cmds.py` | stamp project on submit/clone, filter list_jobs | -| `nvflare/fuel/hci/server/sess.py` | project field on Session | -| `nvflare/fuel/hci/server/login.py` | set ConnProps.PROJECT from session | -| `nvflare/fuel/flare_api/flare_api.py` | project param on Session + factory | -| `nvflare/recipe/prod_env.py` | project param | -| `nvflare/recipe/session_mgr.py` | pass project through | -| `nvflare/fuel/sec/audit.py` | project field in events | -| `nvflare/fuel/hci/server/audit.py` | pass project from conn | - -**~10 files, ~150 lines. Zero behavior change.** - ---- - -## Milestone 2: Project Registry + Role Resolution - -**Goal**: add `ProjectRegistry`, per-project role resolution, `platform_admin` role, `set_project`/`list_projects` commands. Gated on `api_version: 4`. - -### 2.1 Create `ProjectRegistry` - -**New file**: `nvflare/security/project_registry.py` (~150 lines) - -```python -class ProjectRegistry: - """Resolves project membership, site enrollment, and per-project roles. - - Loaded from project.yml at server startup. When absent or api_version < 4, - operates in single-tenant mode (all users/sites in 'default' project, - roles from certs). 
- """ - - def __init__(self): - self._projects = {} # name -> {sites: set, users: dict} - self._multitenant = False - - def load_from_config(self, project_dict: dict): - """Load from parsed project.yml. Detects api_version >= 4.""" - ... - - def is_multitenant(self) -> bool: - """True if projects section exists (api_version 4+).""" - return self._multitenant - - def get_projects(self) -> List[str]: - """All project names.""" - ... - - def get_project_sites(self, project: str) -> Set[str]: - """Site names enrolled in project (client-type sites).""" - ... - - def get_user_projects(self, username: str) -> List[str]: - """Projects this user belongs to.""" - ... - - def get_user_role(self, username: str, project: str) -> Optional[str]: - """User's role in project, or None if not a member.""" - ... - - def is_platform_admin(self, username: str) -> bool: - """True if user has global platform_admin role.""" - ... - - def is_user_in_project(self, username: str, project: str) -> bool: - ... - - def is_site_in_project(self, site_name: str, project: str) -> bool: - ... -``` - -Singleton access via `ProjectRegistryService` (follows existing `AuthorizationService` pattern). 
- -### 2.2 Add `platform_admin` role - -**File**: `nvflare/lighter/constants.py` - -```python -class AdminRole: - PLATFORM_ADMIN = "platform_admin" # new - PROJECT_ADMIN = "project_admin" - ORG_ADMIN = "org_admin" - LEAD = "lead" - MEMBER = "member" - -DEFINED_ROLES = [ - AdminRole.PLATFORM_ADMIN, - AdminRole.PROJECT_ADMIN, - AdminRole.ORG_ADMIN, - AdminRole.LEAD, - AdminRole.MEMBER, -] -``` - -### 2.3 Load registry at server startup - -**File**: `nvflare/private/fed/utils/fed_utils.py` in `security_init()` - -After loading authorization policy, load `project.yml` into `ProjectRegistryService`: -```python -project_config = load_yaml(workspace.get_project_config_path()) -ProjectRegistryService.initialize(project_config) -``` - -### 2.4 Layered role resolution in login - -**File**: `nvflare/fuel/hci/server/login.py` - -In `handle_cert_login()`, after extracting identity from cert: -```python -# Existing: role from cert -role = identity_info.get(IdentityKey.ROLE, "") - -# New: override with registry role if multitenant -registry = ProjectRegistryService.get_registry() -if registry and registry.is_multitenant(): - project = ... # from login request or default - registry_role = registry.get_user_role(username, project) - if registry_role: - role = registry_role -``` - -### 2.5 Project filter in authz chain - -**File**: `nvflare/fuel/hci/server/authz.py` - -New `ProjectFilter(CommandFilter)` registered in filter chain between `LoginModule` and `AuthzFilter`: - -```python -class ProjectFilter(CommandFilter): - """Validates user has access to their active project. - - Registered AFTER LoginModule (needs identity) and BEFORE AuthzFilter - (sets project-scoped role for downstream authz). 
- """ - def pre_command(self, conn, args): - registry = ProjectRegistryService.get_registry() - if not registry or not registry.is_multitenant(): - return True # single-tenant, no filtering - - project = conn.get_prop(ConnProps.PROJECT, "default") - username = conn.get_prop(ConnProps.USER_NAME, "") - - if registry.is_platform_admin(username): - return True # platform admin bypasses project check - - if not registry.is_user_in_project(username, project): - conn.append_error("Access denied: not a member of this project") - return False - - # Override role to project-specific role - role = registry.get_user_role(username, project) - conn.set_prop(ConnProps.USER_ROLE, role) - return True -``` - -**File**: `nvflare/private/fed/server/admin.py` - -Register `ProjectFilter` after `LoginModule`, before `AuthzFilter`: -```python -login_module = LoginModule(sess_mgr) -cmd_reg.add_filter(login_module) - -project_filter = ProjectFilter() # new -cmd_reg.add_filter(project_filter) # new - -authz_filter = AuthzFilter() -cmd_reg.add_filter(authz_filter) -``` - -### 2.6 `set_project` and `list_projects` commands - -**New command handlers** in `nvflare/private/fed/server/job_cmds.py` (or a new `ProjectCommandModule`): - -- `set_project `: validates user membership, updates session's project, re-resolves role -- `list_projects`: returns projects the user belongs to (or all, for `platform_admin`) - -**Client-side**: `Session.set_project()` and `Session.list_projects()` in `nvflare/fuel/flare_api/flare_api.py`. - -### 2.7 Provisioner: parse `api_version: 4` - -**File**: `nvflare/lighter/provision.py` - -Extend `prepare_project()` to accept `api_version` 3 or 4. 
When 4: -- Parse `projects:` section -- Parse per-admin `projects:` mapping -- Validate referenced project sites exist in participants and are client-type sites -- Preserve dynamic provisioning behavior: adding users/sites should not require reprovisioning all existing sites; regenerate/update startup artifacts only for the server and newly added/changed participants. - -**File**: `nvflare/lighter/entity.py` - -Add `projects` property to `Project` class. Add `project_roles` dict to admin `Participant`. - -### 2.8 Server startup kit includes `project.yml` - -**File**: `nvflare/lighter/impl/static_file.py` - -Copy `project.yml` into server startup kit directory. - -### Milestone 2 Summary - -| File | Change | -|------|--------| -| `nvflare/security/project_registry.py` | **new** — ProjectRegistry + service | -| `nvflare/lighter/constants.py` | +PLATFORM_ADMIN role | -| `nvflare/private/fed/utils/fed_utils.py` | load registry at startup | -| `nvflare/fuel/hci/server/login.py` | layered role resolution | -| `nvflare/fuel/hci/server/authz.py` | +ProjectFilter class | -| `nvflare/private/fed/server/admin.py` | register ProjectFilter | -| `nvflare/private/fed/server/job_cmds.py` | set_project, list_projects handlers | -| `nvflare/fuel/flare_api/flare_api.py` | set_project, list_projects client methods | -| `nvflare/lighter/provision.py` | api_version 4 parsing | -| `nvflare/lighter/entity.py` | projects on Project/Participant | -| `nvflare/lighter/impl/static_file.py` | include project.yml in server kit | - -**~11 files (1 new), ~500 lines. Gated on api_version 4.** - ---- - -## Milestone 3: Enforcement + Scheduler - -**Goal**: all commands scoped to active project. Scheduler validates project site enrollment (client-type sites). 
- -### 3.1 All job commands: project gate - -**File**: `nvflare/private/fed/server/job_cmds.py` - -For every job-specific handler (`abort_job`, `delete_job`, `download_job`, `clone_job`, `app_command`, `configure_job_log`), add project validation in `authorize_job_id()`: - -```python -def authorize_job_id(self, conn, args): - ... # existing: load job, set submitter props - - # New: verify job belongs to active project - job_project = job.meta.get(JobMetaKey.PROJECT.value, "default") - active_project = conn.get_prop(ConnProps.PROJECT, "default") - if job_project != active_project: - conn.append_error("Job not found in current project") - return PreAuthzReturnCode.ERROR - - return PreAuthzReturnCode.REQUIRE_AUTHZ -``` - -This is a single change point since all job commands route through `authorize_job_id()`. - -### 3.2 Infrastructure commands: filter to project sites - -**File**: `nvflare/private/fed/server/training_cmds.py` (and `cmd_utils.py`) - -In `validate_command_targets()`, filter target list to sites enrolled in the active project: - -```python -registry = ProjectRegistryService.get_registry() -if registry and registry.is_multitenant(): - project = conn.get_prop(ConnProps.PROJECT, "default") - for target in targets: - if not registry.is_site_in_project(target, project): - conn.append_error(f"Site '{target}' not in project '{project}'") - return PreAuthzReturnCode.ERROR -``` - -### 3.3 `check_status`: filter response - -**File**: `nvflare/private/fed/server/training_cmds.py` - -Filter the client list in `check_status` response to only include clients enrolled in the user's active project. 
- -### 3.4 Scheduler: validate deploy_map against project - -**File**: `nvflare/app_common/job_schedulers/job_scheduler.py` - -In `_try_job()`, after extracting applicable sites (~line 126): - -```python -registry = ProjectRegistryService.get_registry() -if registry and registry.is_multitenant(): - project = job_meta.get(JobMetaKey.PROJECT.value, "default") - project_sites = registry.get_project_sites(project) - for site in applicable_sites: - if site != SERVER_SITE_NAME and site not in project_sites: - return (SCHEDULE_RESULT_BLOCK, None, f"Site {site} not in project {project}") -``` - -### 3.5 Job store partitioning (new jobs only) - -**File**: `nvflare/apis/impl/job_def_manager.py` - -Change `job_uri()` to include project for non-default projects: - -```python -def job_uri(self, jid, project=None): - if project and project != "default": - return os.path.join(self._uri_root, project, jid) - return os.path.join(self._uri_root, jid) # backward compat -``` - -`get_all_jobs()` scans both `//` and `///` paths. 
- -### 3.6 Tests - -| Test | File | ~Lines | -|------|------|--------| -| ProjectRegistry unit tests | `tests/unit_test/security/project_registry_test.py` | ~200 | -| Project-scoped job filtering | `tests/unit_test/private/fed/server/job_cmds_project_test.py` | ~150 | -| Per-project role resolution | `tests/unit_test/fuel/hci/server/project_filter_test.py` | ~100 | -| Provisioner v4 parsing | `tests/unit_test/lighter/provision_v4_test.py` | ~100 | -| Scheduler project validation | `tests/unit_test/app_common/job_schedulers/scheduler_project_test.py` | ~80 | - -### Milestone 3 Summary - -| File | Change | -|------|--------| -| `nvflare/private/fed/server/job_cmds.py` | project gate in authorize_job_id | -| `nvflare/private/fed/server/training_cmds.py` | filter infra commands to project sites | -| `nvflare/private/fed/server/cmd_utils.py` | project-aware target validation | -| `nvflare/app_common/job_schedulers/job_scheduler.py` | deploy_map vs project sites | -| `nvflare/apis/impl/job_def_manager.py` | partitioned job_uri for new jobs | -| `tests/unit_test/...` (5 new files) | ~630 lines of tests | - -**~5 files + 5 test files, ~550 lines.** - ---- - -## Total Estimates - -| Milestone | Files Changed | Files Created | Lines | -|-----------|:---:|:---:|:---:| -| M1: Plumbing | 10 | 0 | ~150 | -| M2: Registry + Authz | 10 | 1 | ~500 | -| M3: Enforcement + Tests | 5 | 5 | ~550 | -| **Total** | **~22** | **~6** | **~1,200** | - ---- - -## Out of Scope (This Plan) - -- K8s `K8sJobLauncher` changes (namespace per project, PVC selection) -- Docker `DockerJobLauncher` changes (per-project volume mounts) -- Slurm launcher changes -- Runtime project CRUD (D2 — projects defined at provision time only) -- Per-project quota management (D3 — rely on K8s ResourceQuota) -- External IdP integration (OIDC/SAML) -- Singleton refactoring (document as constraint, defer refactor) -- Shell command restrictions (see Unresolved Questions in design doc) - ---- - -## Unresolved Questions - 
-1. **`set_project` protocol**: does `set_project` require a new server round-trip, or can the client just switch locally and send the new project on the next command? Server round-trip is safer (validates membership) but adds latency. - -2. **Token encoding**: should the project be part of the session token, or sent as a command header? Token means re-auth on project switch; header is simpler but requires server-side validation on every command. - -3. **Provisioner backward compat**: when `api_version: 4` project.yml is used, should the provisioner still bake a role into the cert? Options: (a) bake the first project's role as a fallback, (b) leave `UNSTRUCTURED_NAME` empty, (c) bake a sentinel value like `"multitenant"`. From b2c24baf1cad6a14da9d99c56b3382b5378f44b7 Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 15:36:55 -0800 Subject: [PATCH 08/11] style: format flare api project test --- tests/unit_test/fuel/flare_api/flare_api_project_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit_test/fuel/flare_api/flare_api_project_test.py b/tests/unit_test/fuel/flare_api/flare_api_project_test.py index 23fcebcb93..446b37de24 100644 --- a/tests/unit_test/fuel/flare_api/flare_api_project_test.py +++ b/tests/unit_test/fuel/flare_api/flare_api_project_test.py @@ -75,4 +75,3 @@ def test_new_secure_session_forwards_project(): new_secure_session("admin@nvidia.com", "/tmp/kit", project="cancer-research") _, kwargs = mock_new_session.call_args assert kwargs["project"] == "cancer-research" - From 73788b33414134dbf3499157a055cda7235795ee Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 15:44:30 -0800 Subject: [PATCH 09/11] style: sort flare api project test imports --- tests/unit_test/fuel/flare_api/flare_api_project_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_test/fuel/flare_api/flare_api_project_test.py b/tests/unit_test/fuel/flare_api/flare_api_project_test.py index 
446b37de24..f2c9aa4714 100644 --- a/tests/unit_test/fuel/flare_api/flare_api_project_test.py +++ b/tests/unit_test/fuel/flare_api/flare_api_project_test.py @@ -14,9 +14,9 @@ from unittest.mock import patch +from nvflare.fuel.flare_api.flare_api import Session, new_secure_session from nvflare.fuel.hci.client.api import ResultKey from nvflare.fuel.hci.proto import MetaKey -from nvflare.fuel.flare_api.flare_api import Session, new_secure_session def _make_session_for_project(project): From f4c17a8bd7d873a2a7f8984be0b3f222b09ac51e Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Fri, 6 Mar 2026 15:57:00 -0800 Subject: [PATCH 10/11] fix: validate docker workspace and submit project meta --- .../app_opt/job_launcher/docker_launcher.py | 25 +++- nvflare/private/fed/server/job_cmds.py | 5 +- .../unit_test/app_opt/docker_launcher_test.py | 116 ++++++++++++++++++ .../private/fed/server/job_cmds_test.py | 81 +++++++++++- 4 files changed, 214 insertions(+), 13 deletions(-) create mode 100644 tests/unit_test/app_opt/docker_launcher_test.py diff --git a/nvflare/app_opt/job_launcher/docker_launcher.py b/nvflare/app_opt/job_launcher/docker_launcher.py index 0d52c80807..649431f9ea 100644 --- a/nvflare/app_opt/job_launcher/docker_launcher.py +++ b/nvflare/app_opt/job_launcher/docker_launcher.py @@ -105,6 +105,23 @@ def __init__(self, mount_path: str = "/workspace", network: str = "nvflare-netwo self.network = network self.timeout = timeout + def _resolve_docker_workspace(self, job_id: str, project) -> str: + docker_workspace = os.environ.get("NVFL_DOCKER_WORKSPACE") + if not docker_workspace: + self.logger.error(f"Failed to launch job {job_id}: NVFL_DOCKER_WORKSPACE is not set.") + return "" + + # Keep legacy jobs on the existing workspace root; only non-default projects + # get a project-specific subdirectory under the configured Docker workspace. 
+ if isinstance(project, str) and project and project != "default": + docker_workspace = os.path.join(docker_workspace, project) + + if not os.path.isdir(docker_workspace): + self.logger.error(f"Failed to launch job {job_id}: Docker workspace does not exist: {docker_workspace}") + return "" + + return docker_workspace + def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: self.logger.debug("DockerJobLauncher start to launch job") job_image = extract_job_image(job_meta, fl_ctx.get_identity_name()) @@ -119,11 +136,9 @@ def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: self.logger.info(f"Launch image:{job_image}, run command: {command}") project = job_meta.get(JobMetaKey.PROJECT.value, "") - docker_workspace = os.environ.get("NVFL_DOCKER_WORKSPACE") - # Keep legacy jobs on the existing workspace root; only non-default projects - # get a project-specific subdirectory under the configured Docker workspace. - if docker_workspace and isinstance(project, str) and project and project != "default": - docker_workspace = os.path.join(docker_workspace, project) + docker_workspace = self._resolve_docker_workspace(job_id, project) + if not docker_workspace: + return None self.logger.info(f"launch_job {job_id} in docker_workspace: {docker_workspace} (project={project})") diff --git a/nvflare/private/fed/server/job_cmds.py b/nvflare/private/fed/server/job_cmds.py index 5d377f84bc..ab799f84d7 100644 --- a/nvflare/private/fed/server/job_cmds.py +++ b/nvflare/private/fed/server/job_cmds.py @@ -617,6 +617,9 @@ def submit_job(self, conn: Connection, args: List[str]): f"job_def_manager in engine is not of type JobDefManagerSpec, but got {type(job_def_manager)}" ) + if not self._add_project_to_meta(meta, conn): + return + fl_ctx.set_prop(FLContextKey.JOB_META, meta, private=True, sticky=False) engine.fire_event(EventType.SUBMIT_JOB, fl_ctx) block_reason = fl_ctx.get_prop(FLContextKey.JOB_BLOCK_REASON) @@ -633,8 +636,6 @@ def submit_job(self, 
conn: Connection, args: List[str]): meta[JobMetaKey.SUBMITTER_ORG.value] = conn.get_prop(ConnProps.USER_ORG, "") meta[JobMetaKey.SUBMITTER_ROLE.value] = conn.get_prop(ConnProps.USER_ROLE, "") meta[JobMetaKey.JOB_FOLDER_NAME.value] = folder_name - if not self._add_project_to_meta(meta, conn): - return custom_props = conn.get_prop(ConnProps.CUSTOM_PROPS) if custom_props: meta[JobMetaKey.CUSTOM_PROPS.value] = custom_props diff --git a/tests/unit_test/app_opt/docker_launcher_test.py b/tests/unit_test/app_opt/docker_launcher_test.py new file mode 100644 index 0000000000..cd63793d24 --- /dev/null +++ b/tests/unit_test/app_opt/docker_launcher_test.py @@ -0,0 +1,116 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest.mock import Mock + +from nvflare.apis.fl_constant import FLContextKey, JobConstants, ReservedKey +from nvflare.apis.fl_context import FLContext +from nvflare.apis.job_def import JobMetaKey +from nvflare.app_opt.job_launcher.docker_launcher import DockerJobHandle, DockerJobLauncher + + +class _DummyWorkspace: + def get_app_custom_dir(self, job_id): + return "" + + +class _DummyDockerLauncher(DockerJobLauncher): + def get_command(self, job_meta, fl_ctx) -> (str, str): + return "test-container", "python worker.py" + + +def _make_fl_ctx(): + fl_ctx = FLContext() + fl_ctx.set_prop(FLContextKey.WORKSPACE_OBJECT, _DummyWorkspace(), private=True, sticky=False) + fl_ctx.set_prop(ReservedKey.IDENTITY_NAME, "server", private=True, sticky=False) + return fl_ctx + + +def _make_job_meta(project=""): + job_meta = { + JobConstants.JOB_ID: "job-1", + JobMetaKey.DEPLOY_MAP.value: { + "app": [ + { + JobConstants.SITES: ["server"], + JobConstants.JOB_IMAGE: "nvflare:test", + } + ] + }, + } + if project: + job_meta[JobMetaKey.PROJECT.value] = project + return job_meta + + +def test_launch_job_returns_none_when_workspace_env_missing(monkeypatch): + launcher = _DummyDockerLauncher() + fl_ctx = _make_fl_ctx() + job_meta = _make_job_meta(project="cancer-research") + docker_from_env = Mock() + + monkeypatch.delenv("NVFL_DOCKER_WORKSPACE", raising=False) + monkeypatch.setattr("nvflare.app_opt.job_launcher.docker_launcher.docker.from_env", docker_from_env) + + handle = launcher.launch_job(job_meta, fl_ctx) + + assert handle is None + docker_from_env.assert_not_called() + + +def test_launch_job_returns_none_when_project_workspace_missing(monkeypatch, tmp_path): + launcher = _DummyDockerLauncher() + fl_ctx = _make_fl_ctx() + job_meta = _make_job_meta(project="cancer-research") + docker_from_env = Mock() + + workspace_root = tmp_path / "workspace" + workspace_root.mkdir() + monkeypatch.setenv("NVFL_DOCKER_WORKSPACE", str(workspace_root)) + 
monkeypatch.setattr("nvflare.app_opt.job_launcher.docker_launcher.docker.from_env", docker_from_env) + + handle = launcher.launch_job(job_meta, fl_ctx) + + assert handle is None + docker_from_env.assert_not_called() + + +def test_launch_job_uses_project_workspace_when_present(monkeypatch, tmp_path): + launcher = _DummyDockerLauncher() + fl_ctx = _make_fl_ctx() + job_meta = _make_job_meta(project="cancer-research") + + workspace_root = tmp_path / "workspace" + project_workspace = workspace_root / "cancer-research" + project_workspace.mkdir(parents=True) + + fake_container = Mock() + fake_container.id = "container-id" + fake_client = Mock() + fake_client.containers.run.return_value = fake_container + + monkeypatch.setenv("NVFL_DOCKER_WORKSPACE", str(workspace_root)) + monkeypatch.setattr("nvflare.app_opt.job_launcher.docker_launcher.docker.from_env", Mock(return_value=fake_client)) + monkeypatch.setattr(DockerJobHandle, "enter_states", Mock(return_value=True)) + + handle = launcher.launch_job(job_meta, fl_ctx) + + assert isinstance(handle, DockerJobHandle) + fake_client.containers.run.assert_called_once() + assert fake_client.containers.run.call_args.kwargs["volumes"] == { + str(project_workspace): { + "bind": launcher.mount_path, + "mode": "rw", + } + } diff --git a/tests/unit_test/private/fed/server/job_cmds_test.py b/tests/unit_test/private/fed/server/job_cmds_test.py index a7acc0a3b2..46ab57fce7 100644 --- a/tests/unit_test/private/fed/server/job_cmds_test.py +++ b/tests/unit_test/private/fed/server/job_cmds_test.py @@ -16,7 +16,11 @@ import pytest +from nvflare.apis.event_type import EventType +from nvflare.apis.fl_constant import FLContextKey +from nvflare.apis.job_def import JobMetaKey from nvflare.fuel.hci.server.constants import ConnProps +from nvflare.private.fed.server import job_cmds as job_cmds_module from nvflare.private.fed.server.job_cmds import JobCommandModule, _create_list_job_cmd_parser TEST_CASES = [ @@ -45,18 +49,26 @@ def test_parse_args(self, 
args: list[str], expected_args): class _MockConnection: - def __init__(self, cmd_props=None): - self._cmd_props = cmd_props + def __init__(self, cmd_props=None, app_ctx=None, props=None): + self._props = dict(props or {}) + self._props.setdefault(ConnProps.CMD_PROPS, cmd_props) + self.app_ctx = app_ctx self.errors = [] + self.strings = [] + self.successes = [] - def get_prop(self, key): - if key == ConnProps.CMD_PROPS: - return self._cmd_props - return None + def get_prop(self, key, default=None): + return self._props.get(key, default) def append_error(self, msg, meta=None): self.errors.append((msg, meta)) + def append_string(self, msg, meta=None): + self.strings.append((msg, meta)) + + def append_success(self, msg, meta=None): + self.successes.append((msg, meta)) + class TestProjectCmdProps: @pytest.mark.parametrize( @@ -86,3 +98,60 @@ def test_add_project_to_meta_rejects_invalid_values(self, project): assert JobCommandModule._add_project_to_meta(meta, conn) is False assert meta == {} assert len(conn.errors) == 1 + + +class _FakeJobMetaValidator: + def validate(self, folder_name, zip_file_name): + assert folder_name == "job_folder" + assert zip_file_name == "job.zip" + return True, "", {} + + +class _FakeJobDefManager: + def __init__(self): + self.created_meta = None + + def create(self, meta, uploaded_content, fl_ctx): + self.created_meta = dict(meta) + result = dict(meta) + result[JobMetaKey.JOB_ID.value] = "new-job-id" + return result + + +class _FakeEngine: + def __init__(self): + self.job_def_manager = _FakeJobDefManager() + self.submit_event_meta = None + + def new_context(self): + from nvflare.apis.fl_context import FLContext + + return FLContext() + + def fire_event(self, event_type, fl_ctx): + assert event_type == EventType.SUBMIT_JOB + self.submit_event_meta = dict(fl_ctx.get_prop(FLContextKey.JOB_META, {})) + + +def test_submit_job_exposes_project_in_submit_event(monkeypatch): + monkeypatch.setattr(job_cmds_module, "JobMetaValidator", 
_FakeJobMetaValidator) + monkeypatch.setattr(job_cmds_module, "JobDefManagerSpec", object) + + engine = _FakeEngine() + conn = _MockConnection( + app_ctx=engine, + props={ + ConnProps.FILE_LOCATION: "job.zip", + ConnProps.CMD_PROPS: {"project": "cancer-research"}, + ConnProps.USER_NAME: "submitter", + ConnProps.USER_ORG: "org", + ConnProps.USER_ROLE: "role", + }, + ) + + JobCommandModule().submit_job(conn, ["submit_job", "job_folder"]) + + assert conn.errors == [] + assert len(conn.successes) == 1 + assert engine.submit_event_meta == {JobMetaKey.PROJECT.value: "cancer-research"} + assert engine.job_def_manager.created_meta[JobMetaKey.PROJECT.value] == "cancer-research" From 08d5bf898710f6f1f68790294fa87c08703b0bba Mon Sep 17 00:00:00 2001 From: Peter Cnudde Date: Sun, 15 Mar 2026 15:22:06 -0700 Subject: [PATCH 11/11] refactor: remove project from poc env --- docs/design/multiproject.md | 16 +++------------- nvflare/recipe/poc_env.py | 11 ----------- tests/unit_test/recipe/poc_env_test.py | 13 ------------- 3 files changed, 3 insertions(+), 37 deletions(-) diff --git a/docs/design/multiproject.md b/docs/design/multiproject.md index f76050f17f..9f26bb3c85 100644 --- a/docs/design/multiproject.md +++ b/docs/design/multiproject.md @@ -46,7 +46,7 @@ A project is a named, immutable tenant boundary with these properties: ### Data Scientist (Recipe API) -The recipe is unchanged. The project is specified via `ProdEnv` or `PocEnv`: +The recipe is unchanged. The project is specified via `ProdEnv`: ```python recipe = FedAvgRecipe( @@ -64,17 +64,7 @@ env = ProdEnv( run = recipe.execute(env) ``` -`PocEnv` supports the same parameter: - -```python -env = PocEnv( - poc_workspace=args.poc_workspace, - project="cancer-research", -) -run = recipe.execute(env) -``` - -If `project` is omitted in either env, it remains `None` (no API default change). +If `project` is omitted in `ProdEnv`, it remains `None` (no API default change). 
### Admin (FLARE API / Admin Console) @@ -472,7 +462,7 @@ Phase 1 delivers no access control, no job store partitioning, and no cert/regis ### Scope -1. Add `project: Optional[str] = None` parameter to `ProdEnv` and `PocEnv`. +1. Add `project: Optional[str] = None` parameter to `ProdEnv`. 2. Pass `project` through to the job metadata at submission/clone time, with syntax validation before persistence. 3. `K8sJobLauncher` reads `project` from job metadata and selects the corresponding project workspace volume. 4. `DockerJobLauncher` reads `project` from job metadata and mounts `/data//` as the workspace volume. diff --git a/nvflare/recipe/poc_env.py b/nvflare/recipe/poc_env.py index e8cd458c07..ac6a67fadf 100644 --- a/nvflare/recipe/poc_env.py +++ b/nvflare/recipe/poc_env.py @@ -19,7 +19,6 @@ from pydantic import BaseModel, conint, model_validator -from nvflare.apis.utils.format_check import name_check from nvflare.job_config.api import FedJob from nvflare.recipe.spec import ExecEnv from nvflare.recipe.utils import _collect_non_local_scripts @@ -51,7 +50,6 @@ class _PocEnvValidator(BaseModel): docker_image: Optional[str] = None project_conf_path: str = "" username: str = DEFAULT_ADMIN_USER - project: Optional[str] = None @model_validator(mode="after") def check_client_configuration(self): @@ -69,10 +67,6 @@ def check_client_configuration(self): if self.clients is None and self.num_clients <= 0: raise ValueError("num_clients must be greater than 0") - if self.project is not None: - err, reason = name_check(self.project, "project") - if err: - raise ValueError(reason) return self @@ -93,7 +87,6 @@ def __init__( docker_image: Optional[str] = None, project_conf_path: str = "", username: str = DEFAULT_ADMIN_USER, - project: Optional[str] = None, extra: Optional[dict] = None, ): """Initialize POC execution environment. @@ -108,7 +101,6 @@ def __init__( project_conf_path (str, optional): Path to the project configuration file. Defaults to "". 
If specified, 'number_of_clients','clients' and 'docker' specific options will be ignored. username (str, optional): Admin user. Defaults to "admin@nvidia.com". - project (Optional[str]): Project name to tag submitted/cloned jobs. extra: extra env info. """ super().__init__(extra) @@ -121,7 +113,6 @@ def __init__( docker_image=docker_image, project_conf_path=project_conf_path, username=username, - project=project, ) self.clients = v.clients @@ -132,7 +123,6 @@ def __init__( self.project_conf_path = v.project_conf_path self.docker_image = v.docker_image self.username = v.username - self.project = v.project self._session_manager = None # Lazy initialization def deploy(self, job: FedJob): @@ -267,7 +257,6 @@ def _get_session_manager(self): "username": self.username, "startup_kit_location": self._get_admin_startup_kit_path(), "timeout": self.get_extra_prop("login_timeout", 10), - "project": self.project, } self._session_manager = SessionManager(session_params) return self._session_manager diff --git a/tests/unit_test/recipe/poc_env_test.py b/tests/unit_test/recipe/poc_env_test.py index c0000042d8..b710a7e1e4 100644 --- a/tests/unit_test/recipe/poc_env_test.py +++ b/tests/unit_test/recipe/poc_env_test.py @@ -61,10 +61,6 @@ def test_poc_env_validation(): with pytest.raises(ValueError, match="Inconsistent"): PocEnv(num_clients=3, clients=["site1", "site2"]) - # Test invalid project name - with pytest.raises(ValueError): - PocEnv(project="Bad Project") - def test_poc_env_client_names(): """Test PocEnv client name generation and validation.""" @@ -141,12 +137,3 @@ def test_stop_poc(mock_rmtree, mock_is_running, mock_clean_poc, mock_stop_poc, m ) mock_clean_poc.assert_called_once_with(env.poc_workspace) mock_rmtree.assert_called_once_with(env.poc_workspace, ignore_errors=True) - - -def test_poc_env_session_manager_passes_project(): - env = PocEnv(project="multiple-sclerosis") - with patch.object(env, "_get_admin_startup_kit_path", return_value="/tmp/admin@nvidia.com"): - 
with patch("nvflare.recipe.poc_env.SessionManager") as mock_session_manager: - env._get_session_manager() - session_params = mock_session_manager.call_args[0][0] - assert session_params["project"] == "multiple-sclerosis"