{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
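    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Load a GitHub repository's text files at startup: fetch the repo tarball via the GitHub API, cache it on disk and revalidate with an ETag, filter out binaries, build artifacts, and secret-looking paths, and fall back to the last known good cache when the network or rate limit fails."
      ]
    },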
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "0JmAs6KCA70Z",
        "outputId": "2d33e3e3-fc23-472e-b860-cfb1d7ff1381"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Loaded 10 files: ['.ruska/agents/command-builder.md', '.ruska/agents/skill-builder.md', '.ruska/skills/rlm/SKILL.md', '.ruska/skills/rlm/references/prompt-templates.md', 'AGENTS.md', 'IDENTITY.md', 'MEMORY.md', 'SOUL.md', 'TOOLS.md', 'USER.md']\n"
          ]
        }
      ],
      "source": [
        "from __future__ import annotations\n",
        "\n",
        "import fnmatch\n",
        "import hashlib\n",
        "import io\n",
        "import json\n",
        "import os\n",
        "import re\n",
        "import tarfile\n",
        "import time\n",
        "from dataclasses import dataclass\n",
        "from pathlib import Path\n",
        "from typing import Dict, Iterable, Optional, Tuple\n",
        "\n",
        "import requests\n",
        "\n",
        "\n",
        "@dataclass\n",
        "class RepoLoadConfig:\n",
        "    owner: str\n",
        "    repo: str\n",
        "    ref: str = \"main\"  # branch/tag/sha\n",
        "    token: Optional[str] = None  # recommended (higher limits)\n",
        "    timeout_s: int = 20\n",
        "\n",
        "    # caching\n",
        "    cache_dir: Path = Path(\".cache/repo_loader\")\n",
        "    ttl_s: int = 6 * 60 * 60  # 6 hours\n",
        "    allow_stale_on_error: bool = True\n",
        "\n",
        "    # filtering\n",
        "    max_file_bytes: int = 512_000\n",
        "    include_extensions: Optional[Tuple[str, ...]] = None  # e.g. (\".md\", \".py\", \".toml\")\n",
        "    deny_globs: Tuple[str, ...] = (\n",
        "        \"**/.git/**\",\n",
        "        \"**/.github/**\",\n",
        "        \"**/__pycache__/**\",\n",
        "        \"**/.venv/**\",\n",
        "        \"**/venv/**\",\n",
        "        \"**/node_modules/**\",\n",
        "        \"**/dist/**\",\n",
        "        \"**/build/**\",\n",
        "        \"**/*.png\",\n",
        "        \"**/*.jpg\",\n",
        "        \"**/*.jpeg\",\n",
        "        \"**/*.gif\",\n",
        "        \"**/*.pdf\",\n",
        "        \"**/*.zip\",\n",
        "        \"**/*.tar\",\n",
        "        \"**/*.gz\",\n",
        "        \"**/*.7z\",\n",
        "        \"**/*.woff\",\n",
        "        \"**/*.woff2\",\n",
        "        \"**/*.mp4\",\n",
        "        \"**/*.mov\",\n",
        "        \"**/*.mp3\",\n",
        "        \"**/*.lock\",\n",
        "    )\n",
        "    deny_regex: Tuple[str, ...] = (\n",
        "        r\".*\\\\.env(\\\\..*)?$\",\n",
        "        r\".*\\\\.pem$\",\n",
        "        r\".*id_rsa.*\",\n",
        "        r\".*\\\\.key$\",\n",
        "        r\".*\\\\.p12$\",\n",
        "    )\n",
        "\n",
        "\n",
        "def _headers(token: Optional[str]) -> Dict[str, str]:\n",
        "    h = {\"Accept\": \"application/vnd.github+json\"}\n",
        "    if token:\n",
        "        h[\"Authorization\"] = f\"Bearer {token}\"\n",
        "    return h\n",
        "\n",
        "\n",
        "def _matches_any_glob(path: str, globs: Iterable[str]) -> bool:\n",
        "    # fnmatch gives \"**\" no special meaning, so \"**/dist/**\" alone would not\n",
        "    # match a top-level \"dist/...\" path; also try each \"**/\"-prefixed pattern\n",
        "    # with that prefix stripped to cover paths at the repo root.\n",
        "    p = path.replace(os.sep, \"/\")\n",
        "    for g in globs:\n",
        "        gg = g.replace(os.sep, \"/\")\n",
        "        if fnmatch.fnmatch(p, gg) or (gg.startswith(\"**/\") and fnmatch.fnmatch(p, gg[3:])):\n",
        "            return True\n",
        "    return False\n",
        "\n",
        "\n",
        "def _matches_any_regex(path: str, patterns: Iterable[str]) -> bool:\n",
        "    return any(re.match(pat, path) for pat in patterns)\n",
        "\n",
        "\n",
        "def _is_probably_binary(data: bytes) -> bool:\n",
        "    # Heuristic: any NUL byte means binary; otherwise require that at least\n",
        "    # 80% of the first 4 KiB is printable ASCII or common whitespace.\n",
        "    if b\"\\\\x00\" in data:\n",
        "        return True\n",
        "    sample = data[:4096]\n",
        "    if not sample:\n",
        "        return False\n",
        "    textish = sum(1 for b in sample if 9 <= b <= 13 or 32 <= b <= 126)\n",
        "    return textish < (len(sample) * 0.80)\n",
        "\n",
        "\n",
        "def _cache_key(cfg: RepoLoadConfig) -> str:\n",
        "    # Key on owner/repo@ref so different branches/tags cache separately.\n",
        "    raw = f\"{cfg.owner}/{cfg.repo}@{cfg.ref}\"\n",
        "    return hashlib.sha256(raw.encode(\"utf-8\")).hexdigest()[:16]\n",
        "\n",
        "\n",
        "def _cache_paths(cfg: RepoLoadConfig) -> Tuple[Path, Path]:\n",
        "    cfg.cache_dir.mkdir(parents=True, exist_ok=True)\n",
        "    key = _cache_key(cfg)\n",
        "    return cfg.cache_dir / f\"{key}.tar.gz\", cfg.cache_dir / f\"{key}.meta.json\"\n",
        "\n",
        "\n",
        "def _load_meta(meta_path: Path) -> dict:\n",
        "    if not meta_path.exists():\n",
        "        return {}\n",
        "    try:\n",
        "        return json.loads(meta_path.read_text(\"utf-8\"))\n",
        "    except Exception:\n",
        "        return {}\n",
        "\n",
        "\n",
        "def _save_meta(meta_path: Path, meta: dict) -> None:\n",
        "    meta_path.write_text(json.dumps(meta, indent=2, sort_keys=True), \"utf-8\")\n",
        "\n",
        "\n",
        "def _extract_tarball_to_memory(blob: bytes, cfg: RepoLoadConfig) -> Dict[str, str]:\n",
        "    out: Dict[str, str] = {}\n",
        "    # Close the tarfile deterministically via a context manager.\n",
        "    with tarfile.open(fileobj=io.BytesIO(blob), mode=\"r:gz\") as tf:\n",
        "        for member in tf.getmembers():\n",
        "            if not member.isfile():\n",
        "                continue\n",
        "            if member.size > cfg.max_file_bytes:\n",
        "                continue\n",
        "\n",
        "            # GitHub tarballs nest everything under an \"owner-repo-sha/\" prefix.\n",
        "            raw_path = member.name.replace(\"\\\\\\\\\", \"/\")\n",
        "            parts = raw_path.split(\"/\", 1)\n",
        "            if len(parts) != 2:\n",
        "                continue\n",
        "            path = parts[1]  # strip tar prefix\n",
        "\n",
        "            if cfg.include_extensions is not None:\n",
        "                if not any(path.lower().endswith(ext.lower()) for ext in cfg.include_extensions):\n",
        "                    continue\n",
        "\n",
        "            if _matches_any_glob(path, cfg.deny_globs):\n",
        "                continue\n",
        "            if _matches_any_regex(path, cfg.deny_regex):\n",
        "                continue\n",
        "\n",
        "            f = tf.extractfile(member)\n",
        "            if f is None:\n",
        "                continue\n",
        "            data = f.read()\n",
        "\n",
        "            if _is_probably_binary(data):\n",
        "                continue\n",
        "\n",
        "            out[path] = data.decode(\"utf-8\", errors=\"replace\")\n",
        "\n",
        "    return out\n",
        "\n",
        "\n",
        "def load_repo_files_on_start(cfg: RepoLoadConfig) -> Dict[str, str]:\n",
        "    \"\"\"\n",
        "    Startup-safe:\n",
        "    - uses If-None-Match with the cached ETag when available\n",
        "    - caches tarball + meta on disk\n",
        "    - falls back to the cached tarball if the network/rate limit fails (optional)\n",
        "    \"\"\"\n",
        "    tar_path, meta_path = _cache_paths(cfg)\n",
        "    meta = _load_meta(meta_path)\n",
        "\n",
        "    now = time.time()\n",
        "    fresh_enough = tar_path.exists() and (now - tar_path.stat().st_mtime) < cfg.ttl_s\n",
        "\n",
        "    # If the cache is fresh, avoid the network entirely.\n",
        "    if fresh_enough:\n",
        "        return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)\n",
        "\n",
        "    url = f\"https://api.github.com/repos/{cfg.owner}/{cfg.repo}/tarball/{cfg.ref}\"\n",
        "    headers = _headers(cfg.token)\n",
        "\n",
        "    # Conditional request if we have an ETag from a previous fetch.\n",
        "    if meta.get(\"etag\"):\n",
        "        headers[\"If-None-Match\"] = meta[\"etag\"]\n",
        "\n",
        "    try:\n",
        "        resp = requests.get(url, headers=headers, timeout=cfg.timeout_s)\n",
        "        if resp.status_code == 304 and tar_path.exists():\n",
        "            # Not modified: reuse the cached tarball, and refresh its mtime so\n",
        "            # the TTL check above skips the network on the next startup.\n",
        "            tar_path.touch()\n",
        "            return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)\n",
        "\n",
        "        resp.raise_for_status()\n",
        "\n",
        "        # Save tarball + meta\n",
        "        tar_path.write_bytes(resp.content)\n",
        "        meta = {\n",
        "            \"etag\": resp.headers.get(\"ETag\"),\n",
        "            \"fetched_at\": int(time.time()),\n",
        "            \"owner\": cfg.owner,\n",
        "            \"repo\": cfg.repo,\n",
        "            \"ref\": cfg.ref,\n",
        "        }\n",
        "        _save_meta(meta_path, meta)\n",
        "\n",
        "        return _extract_tarball_to_memory(resp.content, cfg)\n",
        "\n",
        "    except Exception:\n",
        "        if cfg.allow_stale_on_error and tar_path.exists():\n",
        "            # Rate-limited/offline? Still boot with the last known good cache.\n",
        "            return _extract_tarball_to_memory(tar_path.read_bytes(), cfg)\n",
        "        raise\n",
        "\n",
        "\n",
        "# Example for your repo\n",
        "if __name__ == \"__main__\":\n",
        "    cfg = RepoLoadConfig(\n",
        "        owner=\"ruska-ai\",\n",
        "        repo=\"workspace\",\n",
        "        ref=\"main\",\n",
        "        token=os.getenv(\"GITHUB_TOKEN\"),  # strongly recommended for startup stability\n",
        "        deny_globs=(  # note: this replaces the defaults above, not extends them\n",
        "            \"**/README.md\",\n",
        "            \"**/.github/**\",\n",
        "            \"**/*.png\",\n",
        "            \"**/*.pdf\",\n",
        "        ),\n",
        "        include_extensions=(\".md\",),  # likely what you want for system prompt context\n",
        "        ttl_s=60 * 60,  # check at most hourly\n",
        "    )\n",
        "\n",
        "    files = load_repo_files_on_start(cfg)\n",
        "    print(\"Loaded\", len(files), \"files:\", list(files)[:10])\n"
      ]
    },
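    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A minimal usage sketch, assuming the loader cell above has run so `files` is in scope: pack the loaded files into one delimited string, e.g. for a system prompt. The `build_context` helper, the `<file>` wrapper, and the `max_chars` budget are illustration choices, not part of the loader."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hypothetical helper (not part of the loader): join loaded files into one\n",
        "# context block. A character budget is a crude stand-in for token counting.\n",
        "def build_context(files: Dict[str, str], max_chars: int = 200_000) -> str:\n",
        "    parts = []\n",
        "    total = 0\n",
        "    for path in sorted(files):\n",
        "        block = f'<file path=\"{path}\">\\n{files[path]}\\n</file>'\n",
        "        if total + len(block) > max_chars:\n",
        "            break  # stop before exceeding the budget\n",
        "        parts.append(block)\n",
        "        total += len(block)\n",
        "    return \"\\n\\n\".join(parts)\n",
        "\n",
        "context = build_context(files)\n",
        "print(f\"context: {len(context):,} chars from {len(files)} files\")\n"
      ]
    }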
  ]
}