From f2f73a302c7123f36ce811dfea3c64c2e1cb6777 Mon Sep 17 00:00:00 2001 From: Ryan Clancy Date: Thu, 1 May 2025 15:21:47 -0700 Subject: [PATCH 1/3] Avoid global umask for setting file mode. --- src/datasets/utils/file_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index bbd19859b65..23c8d7011be 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -13,6 +13,7 @@ import posixpath import re import shutil +import stat import sys import tarfile import time @@ -411,11 +412,14 @@ def temp_file_manager(mode="w+b"): # GET file object fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc, disable_tqdm=disable_tqdm) + # get the permissions of the temp file + temp_file_mode = stat.S_IMODE(os.stat(temp_file.name).st_mode) + logger.info(f"storing {url} in cache at {cache_path}") shutil.move(temp_file.name, cache_path) - umask = os.umask(0o666) - os.umask(umask) - os.chmod(cache_path, 0o666 & ~umask) + + # make sure make sure permissions on cache_path are the same as temp_file, shutil.move may not preserve them + os.chmod(cache_path, temp_file_mode) logger.info(f"creating metadata file for {cache_path}") meta = {"url": url, "etag": etag} From dd819f7b399271ad4020410d85f8c4588e3b87bc Mon Sep 17 00:00:00 2001 From: Ryan Clancy Date: Mon, 5 May 2025 13:11:12 -0400 Subject: [PATCH 2/3] Remove old code and simplify. --- src/datasets/utils/file_utils.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 23c8d7011be..86da8a49594 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -400,27 +400,16 @@ def get_from_cache( incomplete_path = cache_path + ".incomplete" - @contextmanager - def temp_file_manager(mode="w+b"): - with open(incomplete_path, mode) as f: - yield f - # Download to temporary file, then copy to cache path once finished. # Otherwise, you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: + with open(incomplete_path, "w+b") as temp_file: logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}") # GET file object fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc, disable_tqdm=disable_tqdm) - # get the permissions of the temp file - temp_file_mode = stat.S_IMODE(os.stat(temp_file.name).st_mode) - logger.info(f"storing {url} in cache at {cache_path}") shutil.move(temp_file.name, cache_path) - # make sure make sure permissions on cache_path are the same as temp_file, shutil.move may not preserve them - os.chmod(cache_path, temp_file_mode) - logger.info(f"creating metadata file for {cache_path}") meta = {"url": url, "etag": etag} meta_path = cache_path + ".json" @@ -1385,3 +1374,8 @@ def _iter_from_urlpaths( @classmethod def from_urlpaths(cls, urlpaths, download_config: Optional[DownloadConfig] = None) -> "FilesIterable": return cls(cls._iter_from_urlpaths, urlpaths, download_config) + +# if __name__ == "__main__": + +# with open("/tmp/test.txt", "w+b") as f: +# f.write(b"test") From ba43c8223c381f80d6763669beb049196a35c053 Mon Sep 17 00:00:00 2001 From: Ryan Clancy Date: Mon, 5 May 2025 13:12:15 -0400 Subject: [PATCH 3/3] make style --- src/datasets/utils/file_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 86da8a49594..dba97acda03 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -13,14 +13,12 @@ import posixpath import re import shutil -import stat import sys import tarfile import time import xml.dom.minidom import zipfile from collections.abc import Generator -from contextlib import contextmanager from io import BytesIO from itertools import chain from pathlib import Path, PurePosixPath @@ -1374,8 +1372,3 @@ def _iter_from_urlpaths( @classmethod def from_urlpaths(cls, urlpaths, download_config: Optional[DownloadConfig] = None) -> "FilesIterable": return cls(cls._iter_from_urlpaths, urlpaths, download_config) - -# if __name__ == "__main__": - -# with open("/tmp/test.txt", "w+b") as f: -# f.write(b"test")