finos · 1597463007 · Aug 25, 2025 · Aug 25, 2025 · Aug 29, 2025 · Sep 12, 2025
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
@@ -0,0 +1,57 @@
+name: Publish documentation to GitHub Pages
+
+# Build the Sphinx docs and deploy them to GitHub Pages when a release is
+# published, or on demand via the "Run workflow" button.
+on:
+  release:
+    # `created` is not triggered for releases that start out as drafts, so
+    # listen for `published`, which fires once the release actually goes live.
+    types: [ published ]
+  workflow_dispatch:
+
+# `pages: write` and `id-token: write` are required by actions/deploy-pages.
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    # actions/deploy-pages needs the job to target a deployment environment;
+    # the URL shown on the run summary comes from the `deployment` step output.
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
+        with:
+          egress-policy: audit
+
+      - name: Checkout code
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install uv
+          uv pip install --system -r docs/requirements_docs.txt
+
+      - name: Build documentation
+        run: cd docs && make html
+
+      # Must match BUILDDIR in docs/Makefile plus Sphinx's `html` output subdirectory.
+      - name: Upload Pages artifact
+        uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3.0.1
+        with:
+          path: './docs/build/html'
+
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4.0.5
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.idea/
+
+docs/build/
diff --git a/docs/Makefile b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/requirements_docs.txt b/docs/requirements_docs.txt
@@ -0,0 +1,3 @@
+sphinx
+sphinx_rtd_theme
+sphinxcontrib-openapi
diff --git a/docs/source/_static/schema.yaml b/docs/source/_static/schema.yaml
@@ -0,0 +1,89 @@
+# Swagger 2.0 description of the OpenGRIS adapter webhook. Rendered into the
+# docs by the `openapi` directive in docs/source/adapter/webhook.rst.
+swagger: "2.0"
+info:
+  title: OpenGRIS Adapter Webhook OpenAPI Schema
+  version: "1.0.0"
+paths:
+  /:
+    post:
+      summary: Webhook endpoint for OpenGRIS Adapter
+      parameters:
+        - in: body
+          name: body
+          required: true
+          schema:
+            type: object
+            properties:
+              action:
+                description: Action to perform. Valid values are 'start_worker' and 'shutdown_worker'.
+                type: string
+                enum:
+                  - start_worker
+                  - shutdown_worker
+              worker_id:
+                description: |
+                  Unique identifier for the worker instance. **Note**: This ID might be ignored when performing the
+                  'start_worker' action, as the server may generate its own ID instead.
+                type: string
+              metadata:
+                description: |
+                  Optional metadata provided by the scheduler's scaling policy. May include details like instance type,
+                  region, image tags, etc.
+                type: object
+              auth_token:
+                description: |
+                  Optional authentication token for securing the webhook from unauthorized requests. Must match the
+                  server's expected token if provided.
+                type: string
+            required:
+              - action
+              - worker_id
+      responses:
+        # Status codes are quoted so they stay string keys, as JSON-based tooling expects.
+        "200":
+          description: Successful response
+          schema:
+            type: object
+            properties:
+              status:
+                description: Status of the requested action.
+                type: string
+              worker_id:
+                description: |
+                  Unique identifier for the worker instance. **Note**: This ID may not match the one provided in the
+                  request if the server generates its own ID when starting a new worker.
+                type: string
+              metadata:
+                description: |
+                  Optional metadata about the worker instance. May include details like instance type,
+                  region, image tags, etc.
+                type: object
+                properties: { }
+            required:
+              - status
+              - worker_id
+        "400":
+          description: Bad Request, e.g., missing required fields or invalid action
+          schema:
+            $ref: '#/definitions/Error'
+        "401":
+          description: Unauthorized, e.g., invalid or missing auth_token
+          schema:
+            $ref: '#/definitions/Error'
+        "429":
+          description: Too Many Requests, e.g., rate limiting or no available resources
+          schema:
+            $ref: '#/definitions/Error'
+        "500":
+          description: Internal Server Error, e.g., server-side or infrastructure related issues
+          schema:
+            $ref: '#/definitions/Error'
+definitions:
+  Error:
+    type: object
+    properties:
+      error:
+        type: string
+    required:
+      - error
diff --git a/docs/source/adapter/scaling.rst b/docs/source/adapter/scaling.rst
@@ -0,0 +1,56 @@
+Scaling Controller
+==================
+
+The scaling controller determines when to start and stop workers based on task and other state updates emitted by 
+the scheduler. The scaling controller implements a scaling policy that can suit specific needs such as:
+
+- Cost optimized: Start and stop workers based on task queue length and worker idle time.
+- Performance optimized: Start workers as soon as tasks are available.
+- Time constrained: Start workers based on a schedule.
+- Resource constrained: Start and stop workers based on available budget or resource quotas.
+
+To perform a scaling action, the scaling controller sends a webhook request to the adapter. The adapter then starts or
+stops workers as requested. For more details on the adapter, see the :doc:`Adapter Webhook API <webhook>`.
+
+Scaling Controller Interface
+----------------------------
+
+.. code-block:: python
+
+    import abc
+
+    from scaler.protocol.python.message import (
+        StateBalanceAdvice,
+        StateClient,
+        StateGraphTask,
+        StateObject,
+        StateScheduler,
+        StateTask,
+        StateWorker
+    )
+    from scaler.utility.mixins import Reporter
+
+    class ScalingController(Reporter, abc.ABC):
+        async def on_state_client(self, state_client: StateClient):
+            pass
+
+        async def on_state_object(self, state_object: StateObject):
+            pass
+
+        async def on_state_balance_advice(self, state_balance_advice: StateBalanceAdvice):
+            pass
+
+        async def on_state_scheduler(self, state_scheduler: StateScheduler):
+            pass
+
+        async def on_state_worker(self, state_worker: StateWorker):
+            pass
+
+        async def on_state_task(self, state_task: StateTask):
+            pass
+
+        async def on_state_graph_task(self, state_graph_task: StateGraphTask):
+            pass
+
+In most cases, the scaling controller will only need to implement the ``on_state_task`` and ``on_state_worker``
+handlers. The other handlers are provided for completeness and future use cases.
diff --git a/docs/source/adapter/webhook.rst b/docs/source/adapter/webhook.rst
@@ -0,0 +1,40 @@
+Adapter Webhook API
+===================
+
+The Adapter Webhook API provides a single unified endpoint for the scheduler to request the start and stop of workers.
+
+Here is a high-level diagram illustrating how the Adapter Webhook API fits into the overall architecture:
+
+.. code-block::
+
+                                                Start  ┌────────┐
+                                            ┌──────────► Worker │
+                                            │          └────────┘
+                                            │                    
+    ┌───────────┐ Webhook Requests  ┌───────┴─┐ Start  ┌────────┐
+    │  Scaling  ├───────────────────► Adapter ├────────► Worker │
+    │  Policy   │                   └───────┬─┘        └────────┘
+    └─────▲─────┘                           │                    
+          │                                 │   Start  ┌────────┐
+          │ Task Updates                    └──────────► Worker │
+          │                                            └────────┘
+    ┌─────┴─────┐                                                
+    │ Scheduler │                                                
+    └───────────┘                                                
+
+Webhook Specification
+---------------------
+
+.. openapi:: ../_static/schema.yaml
+
+Scaling Controller Interaction
+------------------------------
+
+Interactions with the scaling controller can be complex given the possibility of dependency failures, network issues,
+delays, etc. Here are some guidelines to ensure the scaling controller can communicate with the adapter reliably:
+
+- **Return the correct error code**: The scaling controller relies on the adapter to return the correct HTTP status code
+  to indicate success or failure of the request and whether the request should be retried.
+- **Return when the action is finished**: The scaling controller relies on the adapter to return a response only when
+  the requested action is finished. This ensures that the scaling controller has an accurate view of the current
+  state of workers.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -0,0 +1,26 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = "OpenGRIS"
+copyright = "2025, Citi"
+author = "Citi"
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = ["sphinxcontrib.openapi"]
+
+templates_path = ["_templates"]
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "sphinx_rtd_theme"
+html_static_path = ["_static"]
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -0,0 +1,35 @@
+OpenGRIS: Open Standard for Grid Resource Scheduling
+====================================================
+
+OpenGRIS is an open standard for grid resource scheduling. It provides a standardized way to prepare, distribute, and
+execute tasks across elastic and heterogeneous computing environments.
+
+OpenGRIS contains the following components:
+
+- Scheduler: Manages tasks using a stable and language-agnostic communication protocol.
+    - Reference scheduler implementation: `OpenGRIS Scaler <https://github.com/finos/opengris-scaler>`_.
+- Worker: Executes tasks assigned by the scheduler.
+    - Reference worker implementation is part of OpenGRIS Scaler.
+    - IBM Spectrum Symphony worker implementation:
+      `Worker Code <https://github.com/finos/opengris-scaler/tree/main/scaler/worker/symphony>`_.
+- Adapter: Interfaces with the scheduler to start and stop workers.
+    - Reference adapter implementation for VMs:
+      `VM Adapter Pull Request <https://github.com/finos/opengris-scaler/pull/190>`_.
+    - Documentation: :doc:`Adapter Webhook API <adapter/webhook>`.
+- Client Libraries: Helper libraries for implicit parallelization.
+    - Python client library is part of OpenGRIS Scaler.
+    - Graph parallelization library: `OpenGRIS Pargraph <https://github.com/finos/opengris-pargraph>`_.
+    - Map-reduce parallelization library: `OpenGRIS Parfun <https://github.com/finos/opengris-parfun>`_.
+- Object Storage: For sharing data objects between clients, schedulers, and workers.
+    - Reference object storage implementation:
+      `OpenGRIS Object Storage <https://github.com/finos/opengris-scaler/tree/main/scaler/object_storage>`_.
+
+Contents
+========
+
+.. toctree::
+   :maxdepth: 2
+
+   protocol/scheduler
+   adapter/scaling
+   adapter/webhook
diff --git a/docs/source/protocol/scheduler.rst b/docs/source/protocol/scheduler.rst
@@ -0,0 +1,4 @@
+Scheduler Communication Protocol
+================================
+
+TODO