33
44"""Workspace bundle creation for job submission."""
55
6+ import atexit
67import logging
8+ import os
9+ import re
10+ import shutil
711import subprocess
812import tempfile
913import zipfile
1418# Maximum bundle size in bytes (25 MB)
1519MAX_BUNDLE_SIZE_BYTES = 25 * 1024 * 1024
1620
17- EXCLUDE_EXTENSIONS = {".mov" , ".pyc" }
18-
19- EXCLUDE_DIRS = {
20- "__pycache__" ,
21- ".git" ,
22- ".mypy_cache" ,
23- ".pytest_cache" ,
24- ".ruff_cache" ,
25- ".venv" ,
26- "node_modules" ,
27- "venv" ,
28- }
29-
30- EXCLUDE_SUBPATHS = {
31- "docs/figures" ,
32- "docs/images" ,
33- "docs/reports" ,
34- "docs/static" ,
35- "tests/snapshot" ,
36- }
37-
38-
39- def _should_exclude (relative : Path ) -> bool :
40- """Check whether a relative path should be excluded from the bundle."""
41- if relative .suffix in EXCLUDE_EXTENSIONS :
42- return True
43- # e.g. foo.egg-info
44- if any (part .endswith (".egg-info" ) for part in relative .parts ):
45- return True
46- if any (part in EXCLUDE_DIRS for part in relative .parts ):
47- return True
48- rel_str = str (relative )
49- return any (subpath in rel_str for subpath in EXCLUDE_SUBPATHS )
50-
51-
52- def _get_git_non_ignored_files (workspace : Path ) -> set [Path ] | None :
21+ # Default exclude pattern applied to all bundles. Matches against the
22+ # *relative* path (forward-slash separated, no leading slash).
23+ DEFAULT_EXCLUDE = re .compile (
24+ r"""
25+ \.mov$ # video files
26+ | \.pyc$ # bytecode
27+ | \.egg-info(/|$) # egg metadata
28+ | (^|/)__pycache__(/|$) # pycache at any depth
29+ | (^|/)\.git(/|$) # .git at any depth
30+ | (^|/)\.mypy_cache(/|$)
31+ | (^|/)\.pytest_cache(/|$)
32+ | (^|/)\.ruff_cache(/|$)
33+ | (^|/)\.venv(/|$)
34+ | (^|/)node_modules(/|$)
35+ | (^|/)venv(/|$)
36+ | ^docs/figures(/|$)
37+ | ^docs/images(/|$)
38+ | ^docs/reports(/|$)
39+ | ^docs/static(/|$)
40+ | ^tests/snapshot(/|$)
41+ """ ,
42+ re .VERBOSE ,
43+ )
44+
45+ # Glob patterns for generated files that are gitignored but required at runtime.
46+ # These are produced by build hooks (e.g. hatch_build.py protobuf generation)
47+ # and must be included in task bundles so that `uv sync` inside containers can
48+ # skip regeneration.
49+ GENERATED_ARTIFACT_GLOBS = [
50+ "src/iris/rpc/*_pb2.py" ,
51+ "src/iris/rpc/*_pb2.pyi" ,
52+ "src/iris/rpc/*_connect.py" ,
53+ "lib/iris/src/iris/rpc/*_pb2.py" ,
54+ "lib/iris/src/iris/rpc/*_pb2.pyi" ,
55+ "lib/iris/src/iris/rpc/*_connect.py" ,
56+ ]
57+
58+
59+ def _should_exclude (relative : str , exclude : re .Pattern [str ]) -> bool :
60+ return bool (exclude .search (relative ))
61+
62+
63+ def _merge_exclude (extra : re .Pattern [str ] | None ) -> re .Pattern [str ]:
64+ if extra is None :
65+ return DEFAULT_EXCLUDE
66+ return re .compile (f"(?:{ DEFAULT_EXCLUDE .pattern } )|(?:{ extra .pattern } )" , re .VERBOSE )
67+
68+
69+ def collect_workspace_files (
70+ workspace : str | Path ,
71+ * ,
72+ exclude : re .Pattern [str ] | None = None ,
73+ ) -> list [Path ]:
74+ """Collect the list of files to include in a workspace bundle.
75+
76+ Uses git ls-files when available (respecting .gitignore), then adds back
77+ generated protobuf artifacts that are gitignored but needed at runtime.
78+ Falls back to pattern-based exclusion when git is unavailable.
79+
80+ Args:
81+ workspace: Root directory to bundle.
82+ exclude: Extra regex to exclude (merged with DEFAULT_EXCLUDE).
83+
84+ Returns:
85+ Sorted list of absolute paths.
86+ """
87+ workspace = Path (workspace )
88+ merged = _merge_exclude (exclude )
89+
90+ git_files = _get_git_non_ignored_files (workspace , merged )
91+ if git_files is not None :
92+ _include_generated_build_artifacts (workspace , git_files , merged )
93+ return sorted (f for f in git_files if f .is_file ())
94+
95+ return sorted (
96+ f for f in workspace .rglob ("*" ) if f .is_file () and not _should_exclude (str (f .relative_to (workspace )), merged )
97+ )
98+
99+
100+ def create_workspace_zip (
101+ workspace : str | Path ,
102+ * ,
103+ exclude : re .Pattern [str ] | None = None ,
104+ max_size_bytes : int | None = MAX_BUNDLE_SIZE_BYTES ,
105+ ) -> bytes :
106+ """Create a zip of the workspace and return the raw bytes.
107+
108+ Suitable for Iris bundle uploads where the caller sends bytes directly.
109+ """
110+ workspace = Path (workspace )
111+ files = collect_workspace_files (workspace , exclude = exclude )
112+
113+ fd , tmp_path = tempfile .mkstemp (suffix = ".zip" , prefix = "workspace_" )
114+ os .close (fd )
115+ try :
116+ with zipfile .ZipFile (tmp_path , "w" , zipfile .ZIP_DEFLATED ) as zf :
117+ for file in files :
118+ zf .write (file , file .relative_to (workspace ))
119+ buf = Path (tmp_path ).read_bytes ()
120+ finally :
121+ os .unlink (tmp_path )
122+
123+ if max_size_bytes is not None and len (buf ) > max_size_bytes :
124+ size_mb = len (buf ) / (1024 * 1024 )
125+ max_mb = max_size_bytes / (1024 * 1024 )
126+ raise ValueError (
127+ f"Bundle size { size_mb :.1f} MB exceeds maximum { max_mb :.0f} MB. "
128+ "Consider excluding large files or using .gitignore."
129+ )
130+
131+ return buf
132+
133+
134+ def create_workspace_dir (
135+ workspace : str | Path ,
136+ * ,
137+ exclude : re .Pattern [str ] | None = None ,
138+ ) -> str :
139+ """Copy workspace files into a temporary directory for Ray's working_dir.
140+
141+ Ray's JobSubmissionClient expects a directory path: it zips and uploads it
142+ internally. The temp directory is cleaned up at process exit via atexit.
143+ """
144+ workspace = Path (workspace )
145+ files = collect_workspace_files (workspace , exclude = exclude )
146+
147+ tmp_dir = tempfile .mkdtemp (prefix = "workspace_" )
148+ atexit .register (lambda d = tmp_dir : shutil .rmtree (d , ignore_errors = True ))
149+
150+ for file in files :
151+ rel = file .relative_to (workspace )
152+ dest = Path (tmp_dir ) / rel
153+ dest .parent .mkdir (parents = True , exist_ok = True )
154+ shutil .copy2 (file , dest )
155+
156+ return tmp_dir
157+
158+
159+ def _get_git_non_ignored_files (
160+ workspace : Path ,
161+ exclude : re .Pattern [str ],
162+ ) -> set [Path ] | None :
53163 """Get files that are not ignored by git.
54164
55165 Returns None if git is not available or this isn't a git repo.
@@ -62,49 +172,31 @@ def _get_git_non_ignored_files(workspace: Path) -> set[Path] | None:
62172 text = True ,
63173 check = True ,
64174 )
65- files = [Path (f ) for f in result .stdout .splitlines () if f ]
66- files = [f for f in files if not _should_exclude (f )]
175+ files = [f for f in result .stdout .splitlines () if f and not _should_exclude (f , exclude )]
67176 return {workspace / f for f in files }
68177 except (subprocess .CalledProcessError , FileNotFoundError ) as e :
69178 logger .debug ("Git not available, using pattern-based exclusion: %s" , e )
70179 return None
71180
72181
73- # Glob patterns for generated files that are gitignored but required at runtime.
74- # These are produced by build hooks (e.g. hatch_build.py protobuf generation)
75- # and must be included in task bundles so that `uv sync` inside containers can
76- # skip regeneration.
77- _GENERATED_ARTIFACT_GLOBS = [
78- "src/iris/rpc/*_pb2.py" ,
79- "src/iris/rpc/*_pb2.pyi" ,
80- "src/iris/rpc/*_connect.py" ,
81- "lib/iris/src/iris/rpc/*_pb2.py" ,
82- "lib/iris/src/iris/rpc/*_pb2.pyi" ,
83- "lib/iris/src/iris/rpc/*_connect.py" ,
84- ]
85-
86-
87- def _include_generated_build_artifacts (workspace : Path , files : set [Path ]) -> None :
182+ def _include_generated_build_artifacts (
183+ workspace : Path ,
184+ files : set [Path ],
185+ exclude : re .Pattern [str ],
186+ ) -> None :
88187 """Add generated build artifacts that exist on disk but are gitignored."""
89188 added = 0
90- for pattern in _GENERATED_ARTIFACT_GLOBS :
189+ for pattern in GENERATED_ARTIFACT_GLOBS :
91190 for path in workspace .glob (pattern ):
92- if path .is_file () and path not in files and not _should_exclude (path .relative_to (workspace )):
191+ if path .is_file () and path not in files and not _should_exclude (str ( path .relative_to (workspace )), exclude ):
93192 files .add (path )
94193 added += 1
95194 if added :
96195 logger .debug ("Included %d generated build artifact(s) in bundle" , added )
97196
98197
99198class BundleCreator :
100- """Helper for creating workspace bundles.
101-
102- Bundles a user's workspace directory (containing pyproject.toml, uv.lock,
103- and source code) into a zip file for job execution.
104-
105- The workspace must already have iris as a dependency in pyproject.toml.
106- If uv.lock doesn't exist, it will be generated.
107- """
199+ """Helper for creating workspace bundles for Iris job submission."""
108200
109201 def __init__ (self , workspace : Path ):
110202 self ._workspace = workspace
@@ -118,31 +210,4 @@ def create_bundle(self) -> bytes:
118210 Raises:
119211 ValueError: If bundle size exceeds MAX_BUNDLE_SIZE_BYTES
120212 """
121- git_files = _get_git_non_ignored_files (self ._workspace )
122- if git_files is not None :
123- _include_generated_build_artifacts (self ._workspace , git_files )
124-
125- with tempfile .TemporaryDirectory (prefix = "bundle_" ) as td :
126- bundle_path = Path (td ) / "bundle.zip"
127- with zipfile .ZipFile (bundle_path , "w" , zipfile .ZIP_DEFLATED ) as zf :
128- if git_files is not None :
129- for file in git_files :
130- if file .is_file ():
131- zf .write (file , file .relative_to (self ._workspace ))
132- else :
133- for file in self ._workspace .rglob ("*" ):
134- rel = file .relative_to (self ._workspace )
135- if file .is_file () and not _should_exclude (rel ):
136- zf .write (file , rel )
137-
138- bundle_bytes = bundle_path .read_bytes ()
139- bundle_size_mb = len (bundle_bytes ) / (1024 * 1024 )
140- max_size_mb = MAX_BUNDLE_SIZE_BYTES / (1024 * 1024 )
141-
142- if len (bundle_bytes ) > MAX_BUNDLE_SIZE_BYTES :
143- raise ValueError (
144- f"Bundle size { bundle_size_mb :.1f} MB exceeds maximum { max_size_mb :.0f} MB. "
145- "Consider excluding large files or using .gitignore."
146- )
147-
148- return bundle_bytes
213+ return create_workspace_zip (self ._workspace , max_size_bytes = MAX_BUNDLE_SIZE_BYTES )
0 commit comments