{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
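    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Load a GitHub repository's text files at startup: fetch the repo tarball via the GitHub API, cache it on disk and revalidate with an ETag, filter out binaries, build artifacts, and secret-looking paths, and fall back to the last known good cache when the network or rate limit fails."
      ]
    },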
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0JmAs6KCA70Z",
        "outputId": "2d33e3e3-fc23-472e-b860-cfb1d7ff1381"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Loaded 10 files: ['.ruska/agents/command-builder.md', '.ruska/agents/skill-builder.md', '.ruska/skills/rlm/SKILL.md', '.ruska/skills/rlm/references/prompt-templates.md', 'AGENTS.md', 'IDENTITY.md', 'MEMORY.md', 'SOUL.md', 'TOOLS.md', 'USER.md']\n"
          ]
        }
      ],
      "source": [
        "from __future__ import annotations\n",
        "\n",
        "import fnmatch\n",
        "import hashlib\n",
        "import io\n",
        "import json\n",
        "import os\n",
        "import re\n",
        "import tarfile\n",
        "import time\n",
        "from dataclasses import dataclass\n",
        "from pathlib import Path\n",
        "from typing import Dict, Iterable, Optional, Tuple\n",
        "\n",
        "import requests\n",
        "\n",
        "\n",
        "@dataclass\n",
        "class RepoLoadConfig:\n",
        "    owner: str\n",
        "    repo: str\n",
        "    ref: str = \"main\"  # branch/tag/sha\n",
        "    token: Optional[str] = None  # recommended (higher limits)\n",
        "    timeout_s: int = 20\n",
        "\n",
        "    # caching\n",
        "    cache_dir: Path = Path(\".cache/repo_loader\")\n",
        "    ttl_s: int = 6 * 60 * 60  # 6 hours\n",
        "    allow_stale_on_error: bool = True\n",
        "\n",
        "    # filtering\n",
        "    max_file_bytes: int = 512_000\n",
        "    include_extensions: Optional[Tuple[str, ...]] = None  # e.g. (\".md\", \".py\", \".toml\")\n",
        "    deny_globs: Tuple[str, ...] = (\n",
        "        \"**/.git/**\",\n",
        "        \"**/.github/**\",\n",
        "        \"**/__pycache__/**\",\n",
        "        \"**/.venv/**\",\n",
        "        \"**/venv/**\",\n",
        "        \"**/node_modules/**\",\n",
        "        \"**/dist/**\",\n",
        "        \"**/build/**\",\n",
        "        \"**/*.png\",\n",
        "        \"**/*.jpg\",\n",
        "        \"**/*.jpeg\",\n",
        "        \"**/*.gif\",\n",
        "        \"**/*.pdf\",\n",
        "        \"**/*.zip\",\n",
        "        \"**/*.tar\",\n",
        "        \"**/*.gz\",\n",
        "        \"**/*.7z\",\n",
        "        \"**/*.woff\",\n",
        "        \"**/*.woff2\",\n",
        "        \"**/*.mp4\",\n",
        "        \"**/*.mov\",\n",
        "        \"**/*.mp3\",\n",
        "        \"**/*.lock\",\n",
        "    )\n",
        "    deny_regex: Tuple[str, ...] = (\n",
        "        r\".*\\\\.env(\\\\..*)?$\",\n",
        "        r\".*\\\\.pem$\",\n",
        "        r\".*id_rsa.*\",\n",
        "        r\".*\\\\.key$\",\n",
        "        r\".*\\\\.p12$\",\n",
        "    )\n",
        "\n",
        "\n",
        "def _headers(token: Optional[str]) -> Dict[str, str]:\n",
        "    h = {\"Accept\": \"application/vnd.github+json\"}\n",
        "    if token:\n",
        "        h[\"Authorization\"] = f\"Bearer {token}\"\n",
        "    return h\n",
        "\n",
        "\n",
        "def _matches_any_glob(path: str, globs: Iterable[str]) -> bool:\n",
        "    # fnmatch gives \"**\" no special meaning, so \"**/dist/**\" alone would not\n",
        "    # match a top-level \"dist/...\" path; also try each \"**/\"-prefixed pattern\n",
        "    # with that prefix stripped to cover paths at the repo root.\n",
        "    p = path.replace(os.sep, \"/\")\n",
        "    for g in globs:\n",
        "        gg = g.replace(os.sep, \"/\")\n",
        "        if fnmatch.fnmatch(p, gg) or (gg.startswith(\"**/\") and fnmatch.fnmatch(p, gg[3:])):\n",
        "            return True\n",
        "    return False\n",
        "\n",
        "\n",
        "def _matches_any_regex(path: str, patterns: Iterable[str]) -> bool:\n",
        "    return any(re.match(pat, path) for pat in patterns)\n",
        "\n",
        "\n",
        "def _is_probably_binary(data: bytes) -> bool:\n",
        "    # Heuristic: any NUL byte means binary; otherwise require that at least\n",
        "    # 80% of the first 4 KiB is printable ASCII or common whitespace.\n",
        "    if b\"\\\\x00\" in data:\n",
        "        return True\n",
        "    sample = data[:4096]\n",
        "    if not sample:\n",
        "        return False\n",
        "    textish = sum(1 for b in sample if 9 <= b <= 13 or 32 <= b <= 126)\n",
        "    return textish < (len(sample) * 0.80)\n",
        "\n",
        "\n",
        "def _cache_key(cfg: RepoLoadConfig) -> str:\n",
        "    # Key on owner/repo@ref so different branches/tags cache separately.\n",
        "    raw = f\"{cfg.owner}/{cfg.repo}@{cfg.ref}\"\n",
        "    return hashlib.sha256(raw.encode(\"utf-8\")).hexdigest()[:16]\n",
        "\n",
        "\n",
        "def _cache_paths(cfg: RepoLoadConfig) -> Tuple[Path, Path]:\n",
        "    cfg.cache_dir.mkdir(parents=True, exist_ok=True)\n",
        "    key = _cache_key(cfg)\n",
        "    return cfg.cache_dir / f\"{key}.tar.gz\", cfg.cache_dir / f\"{key}.meta.json\"\n",
        "\n",
        "\n",
        "def _load_meta(meta_path: Path) -> dict:\n",
        "    if not meta_path.exists():\n",
        "        return {}\n",
        "    try:\n",
        "        return json.loads(meta_path.read_text(\"utf-8\"))\n",
        "    except Exception:\n",
        "        return {}\n",
        "\n",
        "\n",
        "def _save_meta(meta_path: Path, meta: dict) -> None:\n",
        "    meta_path.write_text(json.dumps(meta, indent=2, sort_keys=True), \"utf-8\")\n",
        "\n",
        "\n",
        "def _extract_tarball_to_memory(blob: bytes, cfg: RepoLoadConfig) -> Dict[str, str]:\n",
        "    out: Dict[str, str] = {}\n",
        "    # Close the tarfile deterministically via a context manager.\n",
        "    with tarfile.open(fileobj=io.BytesIO(blob), mode=\"r:gz\") as tf:\n",
        "        for member in tf.getmembers():\n",
        "            if not member.isfile():\n",
        "                continue\n",
        "            if member.size > cfg.max_file_bytes:\n",
        "                continue\n",
        "\n",
        "            # GitHub tarballs nest everything under an \"owner-repo-sha/\" prefix.\n",
        "            raw_path = member.name.replace(\"\\\\\\\\\", \"/\")\n",
        "            parts = raw_path.split(\"/\", 1)\n",
        "            if len(parts) != 2:\n",
        "                continue\n",
        "            path = parts[1]  # strip tar prefix\n",
        "\n",
        "            if cfg.include_extensions is not None:\n",
        "                if not any(path.lower().endswith(ext.lower()) for ext in cfg.include_extensions):\n",
        "                    continue\n",
        "\n",
        "            if _matches_any_glob(path, cfg.deny_globs):\n",
        "                continue\n",
        "            if _matches_any_regex(path, cfg.deny_regex):\n",
        "                continue\n",
        "\n",
        "            f = tf.extractfile(member)\n",
        "            if f is None:\n",
        "                continue\n",
        "            data = f.read()\n",
        "\n",
        "            if _is_probably_binary(data):\n",
        "                continue\n",
        "\n",
        "            out[path] = data.decode(\"utf-8\", errors=\"replace\")\n",
        "\n",
        "    return out\n",
        "\n",
        "\n",
        "def load_repo_files_on_start(cfg: RepoLoadConfig) -> Dict[str, str]:\n",
        "    \"\"\"\n",
        "    Startup-safe:\n",
        "    - uses If-None-Match with the cached ETag when available\n",
        "    - caches tarball + meta on disk\n",
        "    - falls back to the cached tarball if the network/rate limit fails (optional)\n",
        "    \"\"\"\n",
        "    tar_path, meta_path = _cache_paths(cfg)\n",
        "    meta = _load_meta(meta_path)\n",
        "\n",
        "    now = time.time()\n",
        "    fresh_enough = tar_path.exists() and (now - tar_path.stat().st_mtime) < cfg.ttl_s\n",
        "\n",
        "    # If the cache is fresh, avoid the network entirely.\n",
        "    if fresh_enough:\n",
        "        return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)\n",
        "\n",
        "    url = f\"https://api.github.com/repos/{cfg.owner}/{cfg.repo}/tarball/{cfg.ref}\"\n",
        "    headers = _headers(cfg.token)\n",
        "\n",
        "    # Conditional request if we have an ETag from a previous fetch.\n",
        "    if meta.get(\"etag\"):\n",
        "        headers[\"If-None-Match\"] = meta[\"etag\"]\n",
        "\n",
        "    try:\n",
        "        resp = requests.get(url, headers=headers, timeout=cfg.timeout_s)\n",
        "        if resp.status_code == 304 and tar_path.exists():\n",
        "            # Not modified: reuse the cached tarball, and refresh its mtime so\n",
        "            # the TTL check above skips the network on the next startup.\n",
        "            tar_path.touch()\n",
        "            return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)\n",
        "\n",
        "        resp.raise_for_status()\n",
        "\n",
        "        # Save tarball + meta\n",
        "        tar_path.write_bytes(resp.content)\n",
        "        meta = {\n",
        "            \"etag\": resp.headers.get(\"ETag\"),\n",
        "            \"fetched_at\": int(time.time()),\n",
        "            \"owner\": cfg.owner,\n",
        "            \"repo\": cfg.repo,\n",
        "            \"ref\": cfg.ref,\n",
        "        }\n",
        "        _save_meta(meta_path, meta)\n",
        "\n",
        "        return _extract_tarball_to_memory(resp.content, cfg)\n",
        "\n",
        "    except Exception:\n",
        "        if cfg.allow_stale_on_error and tar_path.exists():\n",
        "            # Rate-limited/offline? Still boot with the last known good cache.\n",
        "            return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)\n",
        "        raise\n",
        "\n",
        "\n",
        "# Example for your repo\n",
        "if __name__ == \"__main__\":\n",
        "    cfg = RepoLoadConfig(\n",
        "        owner=\"ruska-ai\",\n",
        "        repo=\"workspace\",\n",
        "        ref=\"main\",\n",
        "        token=os.getenv(\"GITHUB_TOKEN\"),  # strongly recommended for startup stability\n",
        "        deny_globs=(  # note: this replaces the defaults above, not extends them\n",
        "            \"**/README.md\",\n",
        "            \"**/.github/**\",\n",
        "            \"**/*.png\",\n",
        "            \"**/*.pdf\",\n",
        "        ),\n",
        "        include_extensions=(\".md\",),  # likely what you want for system prompt context\n",
        "        ttl_s=60 * 60,  # check at most hourly\n",
        "    )\n",
        "\n",
        "    files = load_repo_files_on_start(cfg)\n",
        "    print(\"Loaded\", len(files), \"files:\", list(files)[:10])\n"
      ]
    },
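    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A minimal usage sketch, assuming the loader cell above has run so `files` is in scope: pack the loaded files into one delimited string, e.g. for a system prompt. The `build_context` helper, the `<file>` wrapper, and the `max_chars` budget are illustration choices, not part of the loader."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hypothetical helper (not part of the loader): join loaded files into one\n",
        "# context block. A character budget is a crude stand-in for token counting.\n",
        "def build_context(files: Dict[str, str], max_chars: int = 200_000) -> str:\n",
        "    parts = []\n",
        "    total = 0\n",
        "    for path in sorted(files):\n",
        "        block = f'<file path=\"{path}\">\\n{files[path]}\\n</file>'\n",
        "        if total + len(block) > max_chars:\n",
        "            break  # stop before exceeding the budget\n",
        "        parts.append(block)\n",
        "        total += len(block)\n",
        "    return \"\\n\\n\".join(parts)\n",
        "\n",
        "context = build_context(files)\n",
        "print(f\"context: {len(context):,} chars from {len(files)} files\")\n"
      ]
    }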
  ]
}