From a4e7169a1b286211ed9425d16f4ac9b9176f82f5 Mon Sep 17 00:00:00 2001 From: aturker1 Date: Mon, 16 Jun 2025 13:33:35 +0300 Subject: [PATCH 1/8] refactor clone repository --- .../src/fal/toolkit/utils/download_utils.py | 169 +++++++++++------- projects/fal/tests/integration_test.py | 6 +- 2 files changed, 108 insertions(+), 67 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index dd4977da..18516613 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -422,14 +422,22 @@ def clone_repository( Returns: A Path object representing the full path to the cloned Git repository. """ - target_dir = target_dir or FAL_REPOSITORY_DIR # type: ignore[assignment] + + target_dir = Path(target_dir or "/tmp") # type: ignore[assignment] + temp_dir = Path("/tmp") + base_repo_dir = Path(FAL_REPOSITORY_DIR) + if repo_name is None: repo_name = Path(https_url).stem if commit_hash: repo_name += f"-{commit_hash[:8]}" - local_repo_path = Path(target_dir) / repo_name # type: ignore[arg-type] + commit_hash = commit_hash or "main" + repo_url_hash = _hash_url(https_url) + repo_hash = f"{repo_url_hash}-{commit_hash}" + archive_path = base_repo_dir / (repo_hash+".zip") + local_repo_path = target_dir / repo_name if local_repo_path.exists(): local_repo_commit_hash = _get_git_revision_hash(local_repo_path) @@ -437,71 +445,104 @@ def clone_repository( if include_to_path: __add_local_path_to_sys_path(local_repo_path) return local_repo_path - else: - if local_repo_commit_hash != commit_hash: - print( - f"Local repository '{local_repo_path}' has a different commit hash " - f"({local_repo_commit_hash}) than the one provided ({commit_hash})." - ) - elif force: - print( - f"Local repository '{local_repo_path}' already exists. " - f"Forcing re-download." - ) + elif local_repo_commit_hash != commit_hash: + print( + f"Local repository '{local_repo_path}' has a different commit hash " + f"({local_repo_commit_hash}) than the one provided ({commit_hash})." + ) + elif force: + print( + f"Local repository '{local_repo_path}' already exists. " + f"Forcing re-download." + ) print(f"Removing the existing repository: {local_repo_path} ") - with TemporaryDirectory( - dir=target_dir, suffix=f"{local_repo_path.name}.tmp.old" - ) as tmp_dir: - with suppress(FileNotFoundError): - # repository might be already deleted by another worker - os.rename(local_repo_path, tmp_dir) - # sometimes seeing FileNotFoundError even here on juicefs - shutil.rmtree(tmp_dir, ignore_errors=True) - - # NOTE: using the target_dir to be able to avoid potential copies across temp fs - # and target fs, and also to be able to atomically rename repo_name dir into place - # when we are done setting it up. - os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] - with TemporaryDirectory( - dir=target_dir, - suffix=f"{local_repo_path.name}.tmp", - ) as temp_dir: - try: - print(f"Cloning the repository '{https_url}' .") - - # Clone with disabling the logs and advices for detached HEAD state. - clone_command = [ - "git", - "clone", - "--recursive", - https_url, - temp_dir, - ] - subprocess.check_call(clone_command) - - if commit_hash: - checkout_command = ["git", "checkout", commit_hash] - subprocess.check_call(checkout_command, cwd=temp_dir) - subprocess.check_call( - ["git", "submodule", "update", "--init", "--recursive"], - cwd=temp_dir, - ) - # NOTE: Atomically renaming the repository directory into place when the - # clone and checkout are done. + with TemporaryDirectory( + dir=target_dir, suffix=f"{local_repo_path.name}.tmp.old" + ) as tmp_dir: + with suppress(FileNotFoundError): + # repository might be already deleted by another worker + os.rename(local_repo_path, tmp_dir) + # sometimes seeing FileNotFoundError even here on juicefs + shutil.rmtree(tmp_dir, ignore_errors=True) + + + if archive_path.exists(): + print("Cached repository found, unpacking...") + + # Copy the archive to the temp directory + file_path = shutil.copyfile(archive_path, temp_dir / (repo_name+".zip")) + + # Unpack and clean + shutil.unpack_archive(file_path, temp_dir / repo_name) + os.remove(file_path) + + if temp_dir.absolute() != target_dir.absolute(): + shutil.move(temp_dir / repo_name, target_dir / repo_name) + + local_repo_path = target_dir / repo_name + + else: + # NOTE: using the target_dir to be able to avoid potential copies across temp fs + # and target fs, and also to be able to atomically rename repo_name dir into place + # when we are done setting it up. + # os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] + with TemporaryDirectory( + dir="/tmp", + suffix=f"{repo_name}.tmp", + ) as temp_repo_dir: try: - os.rename(temp_dir, local_repo_path) - except OSError as error: - shutil.rmtree(temp_dir, ignore_errors=True) - - # someone beat us to it, assume it's good - if error.errno != errno.ENOTEMPTY: - raise - print(f"{local_repo_path} already exists, skipping rename") - - except Exception as error: - print(f"{error}\nFailed to clone repository '{https_url}' .") - raise error + print(f"Cloning the repository '{https_url}'.") + + # Clone with disabling the logs and advices for detached HEAD state. + clone_command = [ + "git", + "clone", + "--recursive", + https_url, + temp_repo_dir, + ] + subprocess.check_call(clone_command) + + if commit_hash: + checkout_command = ["git", "checkout", commit_hash] + subprocess.check_call(checkout_command, cwd=temp_repo_dir) + subprocess.check_call( + ["git", "submodule", "update", "--init", "--recursive"], + cwd=temp_repo_dir, + ) + + repo_zip_name = repo_hash+".zip" + + file_name = shutil.make_archive(repo_name, "zip", root_dir=temp_repo_dir) + os.rename(file_name, temp_dir/repo_zip_name) + + + # We know that file_path is empty + os.makedirs(archive_path.parent, exist_ok=True) + + shutil.copy(temp_dir/repo_zip_name, archive_path) + os.remove(temp_dir/repo_zip_name) + + print(f"Repository is cached in {archive_path}") + + + # NOTE: Atomically renaming the repository directory into place when the + # clone and checkout are done. + try: + shutil.copytree(temp_repo_dir, target_dir/repo_name) + shutil.rmtree(temp_repo_dir, ignore_errors=True) + except OSError as error: + shutil.rmtree(temp_dir, ignore_errors=True) + + # someone beat us to it, assume it's good + if error.errno != errno.ENOTEMPTY: + raise + print(f"{target_dir/repo_name} already exists, skipping rename") + + except Exception as error: + print(f"{error}\nFailed to clone repository '{https_url}' .") + raise error if include_to_path: __add_local_path_to_sys_path(local_repo_path) diff --git a/projects/fal/tests/integration_test.py b/projects/fal/tests/integration_test.py index f418c9f4..ffae7574 100644 --- a/projects/fal/tests/integration_test.py +++ b/projects/fal/tests/integration_test.py @@ -354,12 +354,12 @@ def test_clone_repository(isolated_client, mock_fal_persistent_dirs): EXAMPLE_REPO_URL = "https://github.com/fal-ai/isolate.git" EXAMPLE_REPO_FIRST_COMMIT = "64b0a89c8391bd2cb3ca23cdeae01779e11aee05" EXAMPLE_REPO_SECOND_COMMIT = "34ecbca8cc7b64719d2a5c40dd3272f8d13bc1d2" - expected_path = FAL_REPOSITORY_DIR / "isolate" + expected_path = f"/tmp/isolate" first_expected_path = ( - FAL_REPOSITORY_DIR / f"isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}" + f"/tmp/isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}" ) second_expected_path = ( - FAL_REPOSITORY_DIR / f"isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}" + f"/tmp/isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}" ) @isolated_client() From 01498cb83d3bb270f712fe2cccc7b37d9b1f9c38 Mon Sep 17 00:00:00 2001 From: aturker1 Date: Mon, 16 Jun 2025 13:38:47 +0300 Subject: [PATCH 2/8] format --- .../src/fal/toolkit/utils/download_utils.py | 31 +++++++++---------- projects/fal/tests/integration_test.py | 12 ++----- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index 18516613..e5ca4f21 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -427,16 +427,14 @@ def clone_repository( temp_dir = Path("/tmp") base_repo_dir = Path(FAL_REPOSITORY_DIR) - if repo_name is None: repo_name = Path(https_url).stem if commit_hash: repo_name += f"-{commit_hash[:8]}" commit_hash = commit_hash or "main" - repo_url_hash = _hash_url(https_url) - repo_hash = f"{repo_url_hash}-{commit_hash}" - archive_path = base_repo_dir / (repo_hash+".zip") + repo_hash = f"{_hash_url(https_url)}-{commit_hash}" + archive_path = base_repo_dir / (repo_hash + ".zip") local_repo_path = target_dir / repo_name if local_repo_path.exists(): @@ -466,13 +464,12 @@ def clone_repository( # sometimes seeing FileNotFoundError even here on juicefs shutil.rmtree(tmp_dir, ignore_errors=True) - if archive_path.exists(): print("Cached repository found, unpacking...") # Copy the archive to the temp directory - file_path = shutil.copyfile(archive_path, temp_dir / (repo_name+".zip")) - + file_path = shutil.copyfile(archive_path, temp_dir / (repo_name + ".zip")) + # Unpack and clean shutil.unpack_archive(file_path, temp_dir / repo_name) os.remove(file_path) @@ -484,8 +481,8 @@ def clone_repository( else: # NOTE: using the target_dir to be able to avoid potential copies across temp fs - # and target fs, and also to be able to atomically rename repo_name dir into place - # when we are done setting it up. + # and target fs, and also to be able to atomically rename repo_name dir into + # place when we are done setting it up. # os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] with TemporaryDirectory( dir="/tmp", @@ -512,25 +509,25 @@ def clone_repository( cwd=temp_repo_dir, ) - repo_zip_name = repo_hash+".zip" - - file_name = shutil.make_archive(repo_name, "zip", root_dir=temp_repo_dir) - os.rename(file_name, temp_dir/repo_zip_name) + repo_zip_name = repo_hash + ".zip" + file_name = shutil.make_archive( + repo_name, "zip", root_dir=temp_repo_dir + ) + os.rename(file_name, temp_dir / repo_zip_name) # We know that file_path is empty os.makedirs(archive_path.parent, exist_ok=True) - shutil.copy(temp_dir/repo_zip_name, archive_path) - os.remove(temp_dir/repo_zip_name) + shutil.copy(temp_dir / repo_zip_name, archive_path) + os.remove(temp_dir / repo_zip_name) print(f"Repository is cached in {archive_path}") - # NOTE: Atomically renaming the repository directory into place when the # clone and checkout are done. try: - shutil.copytree(temp_repo_dir, target_dir/repo_name) + shutil.copytree(temp_repo_dir, target_dir / repo_name) shutil.rmtree(temp_repo_dir, ignore_errors=True) except OSError as error: shutil.rmtree(temp_dir, ignore_errors=True) diff --git a/projects/fal/tests/integration_test.py b/projects/fal/tests/integration_test.py index ffae7574..99653cb5 100644 --- a/projects/fal/tests/integration_test.py +++ b/projects/fal/tests/integration_test.py @@ -348,19 +348,13 @@ def download_weights(): def test_clone_repository(isolated_client, mock_fal_persistent_dirs): - from fal.toolkit.utils.download_utils import FAL_REPOSITORY_DIR - # https://github.com/fal-ai/isolate/tree/64b0a89c8391bd2cb3ca23cdeae01779e11aee05 EXAMPLE_REPO_URL = "https://github.com/fal-ai/isolate.git" EXAMPLE_REPO_FIRST_COMMIT = "64b0a89c8391bd2cb3ca23cdeae01779e11aee05" EXAMPLE_REPO_SECOND_COMMIT = "34ecbca8cc7b64719d2a5c40dd3272f8d13bc1d2" - expected_path = f"/tmp/isolate" - first_expected_path = ( - f"/tmp/isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}" - ) - second_expected_path = ( - f"/tmp/isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}" - ) + expected_path = "/tmp/isolate" + first_expected_path = f"/tmp/isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}" + second_expected_path = f"/tmp/isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}" @isolated_client() def clone_without_commit_hash(): From 4c1a8d97e6ca145f130565580ac1904af7b165de Mon Sep 17 00:00:00 2001 From: aturker1 Date: Mon, 16 Jun 2025 14:28:04 +0300 Subject: [PATCH 3/8] random temp dir --- projects/fal/src/fal/toolkit/utils/download_utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index e5ca4f21..7a3b6bfa 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -3,6 +3,7 @@ import errno import hashlib import os +import random import shutil import subprocess import sys @@ -486,7 +487,7 @@ def clone_repository( # os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] with TemporaryDirectory( dir="/tmp", - suffix=f"{repo_name}.tmp", + suffix=f"{repo_name}.tmp{random.randint(0, 1000000)}", ) as temp_repo_dir: try: print(f"Cloning the repository '{https_url}'.") @@ -518,17 +519,14 @@ def clone_repository( # We know that file_path is empty os.makedirs(archive_path.parent, exist_ok=True) - - shutil.copy(temp_dir / repo_zip_name, archive_path) - os.remove(temp_dir / repo_zip_name) + shutil.move(temp_dir / repo_zip_name, archive_path) print(f"Repository is cached in {archive_path}") # NOTE: Atomically renaming the repository directory into place when the # clone and checkout are done. try: - shutil.copytree(temp_repo_dir, target_dir / repo_name) - shutil.rmtree(temp_repo_dir, ignore_errors=True) + shutil.move(temp_repo_dir, target_dir / repo_name) except OSError as error: shutil.rmtree(temp_dir, ignore_errors=True) From f075233807cc0d438b213371028e5bf8a01d3763 Mon Sep 17 00:00:00 2001 From: aturker1 Date: Mon, 16 Jun 2025 17:01:35 +0300 Subject: [PATCH 4/8] clone_repositry_cached --- projects/fal/src/fal/toolkit/__init__.py | 2 + .../src/fal/toolkit/utils/download_utils.py | 156 ++++++++++++++---- projects/fal/tests/integration_test.py | 95 ++++++----- 3 files changed, 181 insertions(+), 72 deletions(-) diff --git a/projects/fal/src/fal/toolkit/__init__.py b/projects/fal/src/fal/toolkit/__init__.py index 51f8bad4..3e603a96 100644 --- a/projects/fal/src/fal/toolkit/__init__.py +++ b/projects/fal/src/fal/toolkit/__init__.py @@ -9,6 +9,7 @@ FAL_PERSISTENT_DIR, FAL_REPOSITORY_DIR, clone_repository, + clone_repository_cached, download_file, download_model_weights, ) @@ -31,6 +32,7 @@ "FAL_PERSISTENT_DIR", "FAL_REPOSITORY_DIR", "clone_repository", + "clone_repository_cached", "download_file", "download_model_weights", ] diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index 7a3b6bfa..de3c9869 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -401,13 +401,10 @@ def clone_repository( ) -> Path: """Clones a Git repository from the specified HTTPS URL into a local directory. - This function clones a Git repository from the specified HTTPS URL into a local directory. It can also checkout a specific commit if the `commit_hash` is provided. - If a custom `target_dir` or `repo_name` is not specified, a predefined directory is used for the target directory, and the repository name is determined from the URL. - Args: https_url: The HTTPS URL of the Git repository to be cloned. commit_hash: The commit hash to checkout after cloning. @@ -419,24 +416,17 @@ def clone_repository( and its commit hash matches the provided commit hash. Defaults to `False`. include_to_path: If `True`, the cloned repository is added to the `sys.path`. Defaults to `False`. - Returns: A Path object representing the full path to the cloned Git repository. """ - - target_dir = Path(target_dir or "/tmp") # type: ignore[assignment] - temp_dir = Path("/tmp") - base_repo_dir = Path(FAL_REPOSITORY_DIR) + target_dir = target_dir or FAL_REPOSITORY_DIR # type: ignore[assignment] if repo_name is None: repo_name = Path(https_url).stem if commit_hash: repo_name += f"-{commit_hash[:8]}" - commit_hash = commit_hash or "main" - repo_hash = f"{_hash_url(https_url)}-{commit_hash}" - archive_path = base_repo_dir / (repo_hash + ".zip") - local_repo_path = target_dir / repo_name + local_repo_path = Path(target_dir) / repo_name # type: ignore[arg-type] if local_repo_path.exists(): local_repo_commit_hash = _get_git_revision_hash(local_repo_path) @@ -444,24 +434,128 @@ def clone_repository( if include_to_path: __add_local_path_to_sys_path(local_repo_path) return local_repo_path - elif local_repo_commit_hash != commit_hash: - print( - f"Local repository '{local_repo_path}' has a different commit hash " - f"({local_repo_commit_hash}) than the one provided ({commit_hash})." - ) - elif force: - print( - f"Local repository '{local_repo_path}' already exists. " - f"Forcing re-download." - ) + else: + if local_repo_commit_hash != commit_hash: + print( + f"Local repository '{local_repo_path}' has a different commit hash " + f"({local_repo_commit_hash}) than the one provided ({commit_hash})." + ) + elif force: + print( + f"Local repository '{local_repo_path}' already exists. " + f"Forcing re-download." + ) print(f"Removing the existing repository: {local_repo_path} ") + with TemporaryDirectory( + dir=target_dir, suffix=f"{local_repo_path.name}.tmp.old" + ) as tmp_dir: + with suppress(FileNotFoundError): + # repository might be already deleted by another worker + os.rename(local_repo_path, tmp_dir) + # sometimes seeing FileNotFoundError even here on juicefs + shutil.rmtree(tmp_dir, ignore_errors=True) + + # NOTE: using the target_dir to be able to avoid potential copies across temp fs + # and target fs, and also to be able to atomically rename repo_name dir into place + # when we are done setting it up. + os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] + with TemporaryDirectory( + dir=target_dir, + suffix=f"{local_repo_path.name}.tmp", + ) as temp_dir: + try: + print(f"Cloning the repository '{https_url}' .") + + # Clone with disabling the logs and advices for detached HEAD state. + clone_command = [ + "git", + "clone", + "--recursive", + https_url, + temp_dir, + ] + subprocess.check_call(clone_command) + + if commit_hash: + checkout_command = ["git", "checkout", commit_hash] + subprocess.check_call(checkout_command, cwd=temp_dir) + subprocess.check_call( + ["git", "submodule", "update", "--init", "--recursive"], + cwd=temp_dir, + ) + + # NOTE: Atomically renaming the repository directory into place when the + # clone and checkout are done. + try: + os.rename(temp_dir, local_repo_path) + except OSError as error: + shutil.rmtree(temp_dir, ignore_errors=True) + + # someone beat us to it, assume it's good + if error.errno != errno.ENOTEMPTY: + raise + print(f"{local_repo_path} already exists, skipping rename") + + except Exception as error: + print(f"{error}\nFailed to clone repository '{https_url}' .") + raise error + + if include_to_path: + __add_local_path_to_sys_path(local_repo_path) + return local_repo_path + + +# TODO: rename +def clone_repository_cached( + https_url: str, + *, + commit_hash: str | None = None, + target_path: str | Path | None = None, + include_to_path: bool = False, +) -> Path: + """Clones a Git repository from the specified HTTPS URL into a local + directory. + + This function clones a Git repository from the specified HTTPS URL into a local + directory. It can also checkout a specific commit if the `commit_hash` is provided. + + If a custom `target_path` is not specified, a predefined directory is + used for the target directory, and the repository name is determined from the URL. + + Args: + https_url: The HTTPS URL of the Git repository to be cloned. + commit_hash: The commit hash to checkout after cloning. + target_path: The path where the repository will be saved. + If not provided, a predefined directory is used. + include_to_path: If `True`, the cloned repository is added to the `sys.path`. + Defaults to `False`. + + Returns: + A Path object representing the full path to the cloned Git repository. + """ + + temp_dir = Path("/tmp") + base_repo_dir = Path(FAL_REPOSITORY_DIR) + if isinstance(target_path, str): + target_path = Path(target_path) + repo_name = target_path.stem if target_path else Path(https_url).stem + if commit_hash: + repo_name += f"-{commit_hash[:8]}" + + commit_hash = commit_hash or "main" + repo_hash = f"{_hash_url(https_url)}-{commit_hash}" + archive_path = base_repo_dir / (repo_hash + ".zip") + target_path = Path(target_path or temp_dir / repo_name) + + # Clean up the existing repository if it exists + if target_path.exists(): with TemporaryDirectory( - dir=target_dir, suffix=f"{local_repo_path.name}.tmp.old" + dir=target_path.parent, suffix=f"{target_path.name}.tmp.old" ) as tmp_dir: with suppress(FileNotFoundError): # repository might be already deleted by another worker - os.rename(local_repo_path, tmp_dir) + os.rename(target_path, tmp_dir) # sometimes seeing FileNotFoundError even here on juicefs shutil.rmtree(tmp_dir, ignore_errors=True) @@ -475,10 +569,8 @@ def clone_repository( shutil.unpack_archive(file_path, temp_dir / repo_name) os.remove(file_path) - if temp_dir.absolute() != target_dir.absolute(): - shutil.move(temp_dir / repo_name, target_dir / repo_name) - - local_repo_path = target_dir / repo_name + if temp_dir.absolute() != target_path.parent.absolute(): + shutil.move(temp_dir / repo_name, target_path) else: # NOTE: using the target_dir to be able to avoid potential copies across temp fs @@ -526,23 +618,23 @@ def clone_repository( # NOTE: Atomically renaming the repository directory into place when the # clone and checkout are done. try: - shutil.move(temp_repo_dir, target_dir / repo_name) + shutil.move(temp_repo_dir, target_path) except OSError as error: shutil.rmtree(temp_dir, ignore_errors=True) # someone beat us to it, assume it's good if error.errno != errno.ENOTEMPTY: raise - print(f"{target_dir/repo_name} already exists, skipping rename") + print(f"{target_path} already exists, skipping rename") except Exception as error: print(f"{error}\nFailed to clone repository '{https_url}' .") raise error if include_to_path: - __add_local_path_to_sys_path(local_repo_path) + __add_local_path_to_sys_path(target_path) - return local_repo_path + return target_path def __add_local_path_to_sys_path(local_path: Path | str): diff --git a/projects/fal/tests/integration_test.py b/projects/fal/tests/integration_test.py index 99653cb5..ac4bdc23 100644 --- a/projects/fal/tests/integration_test.py +++ b/projects/fal/tests/integration_test.py @@ -16,6 +16,7 @@ from fal.toolkit import ( File, clone_repository, + clone_repository_cached, download_file, download_model_weights, ) @@ -347,18 +348,34 @@ def download_weights(): ), "The model weights should be redownloaded with force=True" -def test_clone_repository(isolated_client, mock_fal_persistent_dirs): +@pytest.mark.parametrize( + "clone_fn", + [ + clone_repository, + clone_repository_cached, + ], +) +def test_clone_repository(isolated_client, mock_fal_persistent_dirs, clone_fn): # https://github.com/fal-ai/isolate/tree/64b0a89c8391bd2cb3ca23cdeae01779e11aee05 EXAMPLE_REPO_URL = "https://github.com/fal-ai/isolate.git" EXAMPLE_REPO_FIRST_COMMIT = "64b0a89c8391bd2cb3ca23cdeae01779e11aee05" EXAMPLE_REPO_SECOND_COMMIT = "34ecbca8cc7b64719d2a5c40dd3272f8d13bc1d2" - expected_path = "/tmp/isolate" - first_expected_path = f"/tmp/isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}" - second_expected_path = f"/tmp/isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}" + + # clone_repository uses FAL_REPOSITORY_DIR, clone_repository_cached uses /tmp + if clone_fn == clone_repository: + from fal.toolkit.utils.download_utils import FAL_REPOSITORY_DIR + + base_dir = FAL_REPOSITORY_DIR + else: + base_dir = Path("/tmp") + + expected_path = str(base_dir / "isolate") + first_expected_path = str(base_dir / f"isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}") + second_expected_path = str(base_dir / f"isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}") @isolated_client() def clone_without_commit_hash(): - repo_path = clone_repository(EXAMPLE_REPO_URL) + repo_path = clone_fn(EXAMPLE_REPO_URL) return repo_path @@ -367,14 +384,10 @@ def clone_without_commit_hash(): @isolated_client() def clone_with_commit_hash(): - first_path = clone_repository( - EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT - ) + first_path = clone_fn(EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT) first_repo_hash = _get_git_revision_hash(first_path) - second_path = clone_repository( - EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_SECOND_COMMIT - ) + second_path = clone_fn(EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_SECOND_COMMIT) second_repo_hash = _get_git_revision_hash(second_path) @@ -403,17 +416,17 @@ def clone_with_commit_hash(): @isolated_client() def clone_with_force(): - first_path = clone_repository( + first_path = clone_fn( EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT, force=False ) first_repo_stat = first_path.stat() - second_path = clone_repository( + second_path = clone_fn( EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT, force=False ) second_repo_stat = second_path.stat() - third_path = clone_repository( + third_path = clone_fn( EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT, force=True ) third_repo_stat = third_path.stat() @@ -427,32 +440,34 @@ def clone_with_force(): third_repo_stat, ) - ( - first_path, - first_repo_stat, - second_path, - second_repo_stat, - third_path, - third_repo_stat, - ) = clone_with_force() - - assert str(first_expected_path) == str( - first_path - ), "Path should be the target location" - assert str(first_expected_path) == str( - second_path - ), "Path should be the target location" - assert str(first_expected_path) == str( - third_path - ), "Path should be the target location" - - assert ( - first_repo_stat.st_mtime == second_repo_stat.st_mtime - ), "The repository should not be cloned again" - - assert ( - first_repo_stat.st_mtime < third_repo_stat.st_mtime - ), "The repository should be cloned again with force=True" + # Only test force functionality for clone_repository + if clone_fn == clone_repository: + ( + first_path, + first_repo_stat, + second_path, + second_repo_stat, + third_path, + third_repo_stat, + ) = clone_with_force() + + assert str(first_expected_path) == str( + first_path + ), "Path should be the target location" + assert str(first_expected_path) == str( + second_path + ), "Path should be the target location" + assert str(first_expected_path) == str( + third_path + ), "Path should be the target location" + + assert ( + first_repo_stat.st_mtime == second_repo_stat.st_mtime + ), "The repository should not be cloned again" + + assert ( + first_repo_stat.st_mtime < third_repo_stat.st_mtime + ), "The repository should be cloned again with force=True" def fal_file_downloaded(file: File): From 992c546cc0eba1aa5b542b66bace9147b4854e58 Mon Sep 17 00:00:00 2001 From: aturker1 Date: Tue, 17 Jun 2025 18:19:05 +0300 Subject: [PATCH 5/8] test archive stats --- .../src/fal/toolkit/utils/download_utils.py | 2 +- projects/fal/tests/integration_test.py | 51 ++++++++++++++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index de3c9869..d0645f60 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -620,7 +620,7 @@ def clone_repository_cached( try: shutil.move(temp_repo_dir, target_path) except OSError as error: - shutil.rmtree(temp_dir, ignore_errors=True) + shutil.rmtree(temp_repo_dir, ignore_errors=True) # someone beat us to it, assume it's good if error.errno != errno.ENOTEMPTY: diff --git a/projects/fal/tests/integration_test.py b/projects/fal/tests/integration_test.py index ac4bdc23..13262139 100644 --- a/projects/fal/tests/integration_test.py +++ b/projects/fal/tests/integration_test.py @@ -360,11 +360,10 @@ def test_clone_repository(isolated_client, mock_fal_persistent_dirs, clone_fn): EXAMPLE_REPO_URL = "https://github.com/fal-ai/isolate.git" EXAMPLE_REPO_FIRST_COMMIT = "64b0a89c8391bd2cb3ca23cdeae01779e11aee05" EXAMPLE_REPO_SECOND_COMMIT = "34ecbca8cc7b64719d2a5c40dd3272f8d13bc1d2" + from fal.toolkit.utils.download_utils import FAL_REPOSITORY_DIR # clone_repository uses FAL_REPOSITORY_DIR, clone_repository_cached uses /tmp if clone_fn == clone_repository: - from fal.toolkit.utils.download_utils import FAL_REPOSITORY_DIR - base_dir = FAL_REPOSITORY_DIR else: base_dir = Path("/tmp") @@ -372,6 +371,7 @@ def test_clone_repository(isolated_client, mock_fal_persistent_dirs, clone_fn): expected_path = str(base_dir / "isolate") first_expected_path = str(base_dir / f"isolate-{EXAMPLE_REPO_FIRST_COMMIT[:8]}") second_expected_path = str(base_dir / f"isolate-{EXAMPLE_REPO_SECOND_COMMIT[:8]}") + import os @isolated_client() def clone_without_commit_hash(): @@ -469,6 +469,53 @@ def clone_with_force(): first_repo_stat.st_mtime < third_repo_stat.st_mtime ), "The repository should be cloned again with force=True" + @isolated_client() + def clone_without_commit_hash_multiple_times(): + import shutil + + # Clean FAL_REPOSITORY_DIR + shutil.rmtree(FAL_REPOSITORY_DIR, ignore_errors=True) + + repo_path = clone_fn(EXAMPLE_REPO_URL) + + repo_dir_contents = os.listdir(FAL_REPOSITORY_DIR) + + first_n_archives = len(repo_dir_contents) + first_archive_stat = Path(FAL_REPOSITORY_DIR / repo_dir_contents[0]).stat() + + repo_path_2 = clone_fn(EXAMPLE_REPO_URL) + + repo_dir_contents_2 = os.listdir(FAL_REPOSITORY_DIR) + + second_n_archives = len(repo_dir_contents_2) + second_archive_stat = Path(FAL_REPOSITORY_DIR / repo_dir_contents_2[0]).stat() + + return ( + repo_path, + repo_path_2, + first_n_archives, + second_n_archives, + first_archive_stat, + second_archive_stat, + ) + + ( + repo_path, + repo_path_2, + first_n_archives, + second_n_archives, + first_archive_stat, + second_archive_stat, + ) = clone_without_commit_hash_multiple_times() + + if clone_fn == clone_repository_cached: + assert first_n_archives == 1, "Only one archive should be present" + assert second_n_archives == 1, "Only one archive should be present" + assert ( + first_archive_stat.st_mtime == second_archive_stat.st_mtime + ), "The archive should be the same" + assert repo_path == repo_path_2, "The repository should be the same" + def fal_file_downloaded(file: File): return file.file_size is not None From ff55309753b7ecdfaf8f2873e108c3a41e6c9838 Mon Sep 17 00:00:00 2001 From: aturker1 Date: Wed, 18 Jun 2025 01:12:41 +0300 Subject: [PATCH 6/8] fix race cond --- projects/fal/src/fal/toolkit/utils/download_utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index d0645f60..214a5037 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -577,9 +577,10 @@ def clone_repository_cached( # and target fs, and also to be able to atomically rename repo_name dir into # place when we are done setting it up. # os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] + random_idx = random.randint(0, 9999999) with TemporaryDirectory( dir="/tmp", - suffix=f"{repo_name}.tmp{random.randint(0, 1000000)}", + suffix=f"{repo_name}.tmp{random_idx}", ) as temp_repo_dir: try: print(f"Cloning the repository '{https_url}'.") @@ -618,7 +619,14 @@ def clone_repository_cached( # NOTE: Atomically renaming the repository directory into place when the # clone and checkout are done. try: - shutil.move(temp_repo_dir, target_path) + shutil.move( + temp_repo_dir, + target_path.with_name(f"tmp_{random_idx}_" + target_path.name), + ) + os.rename( + target_path.with_name(f"tmp_{random_idx}_" + target_path.name), + target_path, + ) except OSError as error: shutil.rmtree(temp_repo_dir, ignore_errors=True) From 039b62019aed26e64c949153d8277648ed63fcdb Mon Sep 17 00:00:00 2001 From: aturker1 Date: Wed, 18 Jun 2025 20:54:34 +0300 Subject: [PATCH 7/8] update hash strategy --- .../src/fal/toolkit/utils/download_utils.py | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index 214a5037..df92ad79 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -540,10 +540,39 @@ def clone_repository_cached( target_path = Path(target_path) repo_name = target_path.stem if target_path else Path(https_url).stem - if commit_hash: + + if commit_hash is None: + print( + "Log warning: No commit hash provided. Attempting to fetch the latest" + " version of the repository from GitHub. This process may take time and" + " could result in unexpected changes. Please specify a commit hash to" + " ensure stability." + ) + + # Get the commit hash from the remote repository + commit_hash = subprocess.check_output( + ["git", "ls-remote", https_url, "HEAD"], + text=True, + stderr=subprocess.STDOUT, + ).split()[0] + if not commit_hash: + raise ValueError( + "Failed to get the commit hash from the remote repository." + ) + else: + # Convert mutable hash to immutable hash + + result = subprocess.check_output( + ["git", "ls-remote", https_url, commit_hash], + text=True, + stderr=subprocess.STDOUT, + ) + + if result: + commit_hash = result.split()[0] + repo_name += f"-{commit_hash[:8]}" - commit_hash = commit_hash or "main" repo_hash = f"{_hash_url(https_url)}-{commit_hash}" archive_path = base_repo_dir / (repo_hash + ".zip") target_path = Path(target_path or temp_dir / repo_name) From 419ea05a1e3cd2e161d87fac2ad55eace7c32623 Mon Sep 17 00:00:00 2001 From: aturker1 Date: Wed, 18 Jun 2025 23:45:13 +0300 Subject: [PATCH 8/8] format --- .../src/fal/toolkit/utils/download_utils.py | 19 ++++++++++++------- projects/fal/tests/integration_test.py | 4 +--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/projects/fal/src/fal/toolkit/utils/download_utils.py b/projects/fal/src/fal/toolkit/utils/download_utils.py index bf729fc8..612cee1f 100644 --- a/projects/fal/src/fal/toolkit/utils/download_utils.py +++ b/projects/fal/src/fal/toolkit/utils/download_utils.py @@ -401,10 +401,13 @@ def clone_repository( ) -> Path: """Clones a Git repository from the specified HTTPS URL into a local directory. + This function clones a Git repository from the specified HTTPS URL into a local directory. It can also checkout a specific commit if the `commit_hash` is provided. + If a custom `target_dir` or `repo_name` is not specified, a predefined directory is used for the target directory, and the repository name is determined from the URL. + Args: https_url: The HTTPS URL of the Git repository to be cloned. commit_hash: The commit hash to checkout after cloning. @@ -416,6 +419,7 @@ def clone_repository( and its commit hash matches the provided commit hash. Defaults to `False`. include_to_path: If `True`, the cloned repository is added to the `sys.path`. Defaults to `False`. + Returns: A Path object representing the full path to the cloned Git repository. """ @@ -511,6 +515,7 @@ def clone_repository( if include_to_path: __add_local_path_to_sys_path(local_repo_path) + return local_repo_path @@ -552,7 +557,7 @@ def clone_repository_cached( if commit_hash is None: print( - "Log warning: No commit hash provided. Attempting to fetch the latest" + "Warning: No commit hash provided. Attempting to fetch the latest" " version of the repository from GitHub. This process may take time and" " could result in unexpected changes. Please specify a commit hash to" " ensure stability." @@ -570,7 +575,6 @@ def clone_repository_cached( ) else: # Convert mutable hash to immutable hash - result = subprocess.check_output( ["git", "ls-remote", https_url, commit_hash], text=True, @@ -578,6 +582,11 @@ def clone_repository_cached( ) if result: + # This is mutable hash case + print( + "Warning: The provided Git reference is mutable (e.g., a branch or " + "tag). Please use an immutable commit hash to ensure reproducibility." + ) commit_hash = result.split()[0] repo_name += f"-{commit_hash[:8]}" @@ -598,7 +607,7 @@ def clone_repository_cached( shutil.rmtree(tmp_dir, ignore_errors=True) if archive_path.exists(): - print("Cached repository found, unpacking...") + print("Repository cache found, unpacking...") # Copy the archive to the temp directory file_path = shutil.copyfile(archive_path, temp_dir / (repo_name + ".zip")) @@ -611,10 +620,6 @@ def clone_repository_cached( shutil.move(temp_dir / repo_name, target_path) else: - # NOTE: using the target_dir to be able to avoid potential copies across temp fs - # and target fs, and also to be able to atomically rename repo_name dir into - # place when we are done setting it up. - # os.makedirs(target_dir, exist_ok=True) # type: ignore[arg-type] random_idx = random.randint(0, 9999999) with TemporaryDirectory( dir="/tmp", diff --git a/projects/fal/tests/integration_test.py b/projects/fal/tests/integration_test.py index dd1b325e..d68ac10a 100644 --- a/projects/fal/tests/integration_test.py +++ b/projects/fal/tests/integration_test.py @@ -384,9 +384,7 @@ def clone_without_commit_hash(): @isolated_client() def clone_with_commit_hash(): - first_path = clone_fn( - EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT - ) + first_path = clone_fn(EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_FIRST_COMMIT) first_repo_hash = _git_rev_parse(first_path, "HEAD") second_path = clone_fn(EXAMPLE_REPO_URL, commit_hash=EXAMPLE_REPO_SECOND_COMMIT)