Skip to content

Commit eb2a4bd

Browse files
authored
Add files via upload
1 parent 17fd9f3 commit eb2a4bd

1 file changed

Lines changed: 270 additions & 0 deletions

File tree

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "code",
19+
"execution_count": 12,
20+
"metadata": {
21+
"colab": {
22+
"base_uri": "https://localhost:8080/"
23+
},
24+
"id": "0JmAs6KCA70Z",
25+
"outputId": "2d33e3e3-fc23-472e-b860-cfb1d7ff1381"
26+
},
27+
"outputs": [
28+
{
29+
"output_type": "stream",
30+
"name": "stdout",
31+
"text": [
32+
"Loaded 10 files: ['.ruska/agents/command-builder.md', '.ruska/agents/skill-builder.md', '.ruska/skills/rlm/SKILL.md', '.ruska/skills/rlm/references/prompt-templates.md', 'AGENTS.md', 'IDENTITY.md', 'MEMORY.md', 'SOUL.md', 'TOOLS.md', 'USER.md']\n"
33+
]
34+
}
35+
],
36+
"source": [
37+
"from __future__ import annotations\n",
38+
"\n",
39+
"import hashlib\n",
40+
"import io\n",
41+
"import json\n",
42+
"import os\n",
43+
"import tarfile\n",
44+
"import time\n",
45+
"import fnmatch\n",
46+
"import re\n",
47+
"from dataclasses import dataclass\n",
48+
"from pathlib import Path\n",
49+
"from typing import Dict, Iterable, Optional, Tuple\n",
50+
"\n",
51+
"import requests\n",
52+
"\n",
53+
"\n",
54+
@dataclass
class RepoLoadConfig:
    """Configuration for fetching and caching one GitHub repo's text files.

    Identifies the repo (``owner/repo@ref``), and controls HTTP behavior,
    on-disk caching, and which files survive tarball extraction.
    """

    owner: str
    repo: str
    ref: str = "main"  # branch/tag/sha
    token: Optional[str] = None  # recommended (higher limits)
    timeout_s: int = 20  # per-request timeout passed to requests.get, seconds

    # caching
    cache_dir: Path = Path(".cache/repo_loader")  # tarball + meta live here
    ttl_s: int = 6 * 60 * 60  # 6 hours: cache younger than this skips the network
    allow_stale_on_error: bool = True  # serve last cached tarball if fetch fails

    # filtering
    max_file_bytes: int = 512_000  # files larger than this are skipped entirely
    include_extensions: Optional[Tuple[str, ...]] = None  # e.g. (".md", ".py", ".toml")
    # NOTE(review): passing deny_globs/deny_regex at construction REPLACES the
    # defaults entirely (normal dataclass semantics) — it does not merge.
    deny_globs: Tuple[str, ...] = (
        "**/.git/**",
        "**/.github/**",
        "**/__pycache__/**",
        "**/.venv/**",
        "**/venv/**",
        "**/node_modules/**",
        "**/dist/**",
        "**/build/**",
        "**/*.png",
        "**/*.jpg",
        "**/*.jpeg",
        "**/*.gif",
        "**/*.pdf",
        "**/*.zip",
        "**/*.tar",
        "**/*.gz",
        "**/*.7z",
        "**/*.woff",
        "**/*.woff2",
        "**/*.mp4",
        "**/*.mov",
        "**/*.mp3",
        "**/*.lock",
    )
    # Regexes are tried with re.match (anchored at the start) against the
    # repo-relative path; these target secret-looking files.
    deny_regex: Tuple[str, ...] = (
        r".*\.env(\..*)?$",
        r".*\.pem$",
        r".*id_rsa.*",
        r".*\.key$",
        r".*\.p12$",
    )
102+
"\n",
103+
"\n",
104+
"def _headers(token: Optional[str]) -> Dict[str, str]:\n",
105+
" h = {\"Accept\": \"application/vnd.github+json\"}\n",
106+
" if token:\n",
107+
" h[\"Authorization\"] = f\"Bearer {token}\"\n",
108+
" return h\n",
109+
"\n",
110+
"\n",
111+
"def _matches_any_glob(path: str, globs: Iterable[str]) -> bool:\n",
112+
" p = path.replace(os.sep, \"/\")\n",
113+
" for g in globs:\n",
114+
" gg = g.replace(os.sep, \"/\")\n",
115+
" if fnmatch.fnmatch(p, gg) or (gg.startswith(\"**/\") and fnmatch.fnmatch(p, gg[3:])):\n",
116+
" return True\n",
117+
" return False\n",
118+
"\n",
119+
"\n",
120+
"def _matches_any_regex(path: str, patterns: Iterable[str]) -> bool:\n",
121+
" return any(re.match(pat, path) for pat in patterns)\n",
122+
"\n",
123+
"\n",
124+
"def _is_probably_binary(data: bytes) -> bool:\n",
125+
" if b\"\\x00\" in data:\n",
126+
" return True\n",
127+
" sample = data[:4096]\n",
128+
" if not sample:\n",
129+
" return False\n",
130+
" textish = sum(1 for b in sample if 9 <= b <= 13 or 32 <= b <= 126)\n",
131+
" return textish < (len(sample) * 0.80)\n",
132+
"\n",
133+
"\n",
134+
"def _cache_key(cfg: RepoLoadConfig) -> str:\n",
135+
" raw = f\"{cfg.owner}/{cfg.repo}@{cfg.ref}\"\n",
136+
" return hashlib.sha256(raw.encode(\"utf-8\")).hexdigest()[:16]\n",
137+
"\n",
138+
"\n",
139+
def _cache_paths(cfg: RepoLoadConfig) -> tuple[Path, Path]:
    """Return (tarball_path, meta_path) for *cfg*, creating the cache dir if needed."""
    cfg.cache_dir.mkdir(parents=True, exist_ok=True)
    stem = _cache_key(cfg)
    tarball = cfg.cache_dir / f"{stem}.tar.gz"
    meta = cfg.cache_dir / f"{stem}.meta.json"
    return tarball, meta
143+
"\n",
144+
"\n",
145+
"def _load_meta(meta_path: Path) -> dict:\n",
146+
" if not meta_path.exists():\n",
147+
" return {}\n",
148+
" try:\n",
149+
" return json.loads(meta_path.read_text(\"utf-8\"))\n",
150+
" except Exception:\n",
151+
" return {}\n",
152+
"\n",
153+
"\n",
154+
"def _save_meta(meta_path: Path, meta: dict) -> None:\n",
155+
" meta_path.write_text(json.dumps(meta, indent=2, sort_keys=True), \"utf-8\")\n",
156+
"\n",
157+
"\n",
158+
def _extract_tarball_to_memory(blob: bytes, cfg: RepoLoadConfig) -> Dict[str, str]:
    """Decode a GitHub tarball into ``{repo_relative_path: text}``.

    Applies cfg's filters: skips directories, files over ``max_file_bytes``,
    paths outside ``include_extensions`` (when set), deny-listed globs/regexes,
    and binary-looking content. GitHub tarballs wrap everything in a single
    ``owner-repo-sha/`` top-level directory, which is stripped here.
    """
    out: Dict[str, str] = {}
    # Context manager ensures the archive handle is closed even if a member
    # read raises (the original leaked the open tarfile).
    with tarfile.open(fileobj=io.BytesIO(blob), mode="r:gz") as tf:
        for member in tf.getmembers():
            if not member.isfile():
                continue
            if member.size > cfg.max_file_bytes:
                continue

            raw_path = member.name.replace("\\", "/")
            parts = raw_path.split("/", 1)
            if len(parts) != 2:
                # The top-level prefix entry itself has no repo-relative part.
                continue
            path = parts[1]  # strip tar prefix

            if cfg.include_extensions is not None:
                if not any(path.lower().endswith(ext.lower()) for ext in cfg.include_extensions):
                    continue

            if _matches_any_glob(path, cfg.deny_globs):
                continue
            if _matches_any_regex(path, cfg.deny_regex):
                continue

            f = tf.extractfile(member)
            if f is None:
                continue
            data = f.read()

            if _is_probably_binary(data):
                continue

            # Tolerate non-UTF-8 bytes rather than failing the whole load.
            out[path] = data.decode("utf-8", errors="replace")

    return out
194+
"\n",
195+
"\n",
196+
def load_repo_files_on_start(cfg: RepoLoadConfig) -> Dict[str, str]:
    """Fetch a repo's text files as ``{path: text}``, with TTL + ETag caching.

    Startup-safe behavior:
      - a cache younger than ``cfg.ttl_s`` is served with no network at all
      - otherwise revalidates with ``If-None-Match`` when an ETag AND the
        cached tarball are both available
      - caches tarball + meta after a successful fetch
      - falls back to the stale cached tarball on any fetch error when
        ``cfg.allow_stale_on_error`` is set

    Raises the underlying exception when fetching fails and no cache exists.
    """
    tar_path, meta_path = _cache_paths(cfg)
    meta = _load_meta(meta_path)

    now = time.time()
    fresh_enough = tar_path.exists() and (now - tar_path.stat().st_mtime) < cfg.ttl_s

    # If cache is fresh, avoid network entirely
    if fresh_enough:
        return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)

    url = f"https://api.github.com/repos/{cfg.owner}/{cfg.repo}/tarball/{cfg.ref}"
    headers = _headers(cfg.token)

    # BUG FIX: only send a conditional request when we can actually serve the
    # cached copy. A 304 response has no body, so with the tarball missing the
    # original code fell past raise_for_status() (304 does not raise) and
    # cached the empty body as a "fresh" — corrupt — tarball.
    if meta.get("etag") and tar_path.exists():
        headers["If-None-Match"] = meta["etag"]

    try:
        resp = requests.get(url, headers=headers, timeout=cfg.timeout_s)
        if resp.status_code == 304:
            # Not modified; restart the TTL window so we don't revalidate on
            # every startup, then serve the cached tarball.
            tar_path.touch()
            return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)

        resp.raise_for_status()

        # Save tarball + meta for the next startup.
        tar_path.write_bytes(resp.content)
        _save_meta(
            meta_path,
            {
                "etag": resp.headers.get("ETag"),
                "fetched_at": int(now),
                "owner": cfg.owner,
                "repo": cfg.repo,
                "ref": cfg.ref,
            },
        )

        return _extract_tarball_to_memory(resp.content, cfg)

    except Exception:
        if cfg.allow_stale_on_error and tar_path.exists():
            # Rate-limited/offline? Still boot with last known good cache.
            return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)
        raise
246+
"\n",
247+
"\n",
248+
# Example for your repo
if __name__ == "__main__":
    example_cfg = RepoLoadConfig(
        owner="ruska-ai",
        repo="workspace",
        ref="main",
        # A token is strongly recommended for startup stability (rate limits).
        token=os.getenv("GITHUB_TOKEN"),
        # NOTE: supplying deny_globs replaces the dataclass defaults entirely.
        deny_globs=("**/README.md", "**/.github/**", "**/*.png", "**/*.pdf"),
        include_extensions=(".md",),  # markdown only — system-prompt context
        ttl_s=60 * 60,  # re-check at most hourly
    )

    loaded = load_repo_files_on_start(example_cfg)
    print("Loaded", len(loaded), "files:", list(loaded)[:10])
267+
]
268+
}
269+
]
270+
}

0 commit comments

Comments
 (0)