Skip to content

Commit 710fe9b

Browse files
ravwojdyla and claude committed
Migrate imports to canonical paths and simplify download functions
Updates all 23 consumer files to import from marin.datakit.download.* instead of marin.download.*. Refactors download functions (transfer_files, download_nemotron_cc, extract_dclm_hq_dump) to accept plain parameters instead of requiring config dataclass construction. Config classes are kept for backward compat with ExecutorStep callers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2217f93 commit 710fe9b

3 files changed

Lines changed: 64 additions & 57 deletions

File tree

lib/marin/src/marin/datakit/download/dclm_hq.py

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -172,35 +172,40 @@ def process_file(task: FileTask) -> None:
172172
raise
173173

174174

175-
def extract_dclm_hq_dump(cfg: DCLMHQDownloadConfig) -> None:
176-
"""Process the DCLM HQ dump in the input path and save the results to the output path.
175+
def extract_dclm_hq_dump(input_path_or_cfg: str | DCLMHQDownloadConfig, output_path: str | None = None) -> None:
176+
"""Process the DCLM HQ dump and enrich with HTML from Common Crawl.
177177
178-
Flattens the nested directory structure (shards → files) into a single list of files
179-
and processes them in parallel using zephyr.
178+
Args:
179+
input_path_or_cfg: Input directory path, or a DCLMHQDownloadConfig for backward compat.
180+
output_path: Output directory path. Required when input_path_or_cfg is a string.
180181
"""
181-
logger.info(f"Starting processing of DCLM HQ dump in {cfg.input_path}")
182+
if isinstance(input_path_or_cfg, DCLMHQDownloadConfig):
183+
input_path = input_path_or_cfg.input_path
184+
output_path = input_path_or_cfg.output_path
185+
else:
186+
input_path = input_path_or_cfg
187+
if output_path is None:
188+
raise ValueError("output_path is required when input_path_or_cfg is a string")
189+
190+
logger.info(f"Starting processing of DCLM HQ dump in {input_path}")
182191

183-
# Flatten nested structure: discover all files upfront
184192
all_files = []
185-
paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(cfg.input_path, "*"))]
193+
paths = [i.split("/")[-1] for i in fsspec_glob(os.path.join(input_path, "*"))]
186194

187195
logger.info(f"Found {len(paths)} shards to process")
188196

189197
for path in paths:
190-
input_path = os.path.join(cfg.input_path, path)
191-
shard_paths = fsspec_glob(os.path.join(input_path, "*.json.zst"))
198+
shard_input = os.path.join(input_path, path)
199+
shard_paths = fsspec_glob(os.path.join(shard_input, "*.json.zst"))
192200

193201
for shard_path in shard_paths:
194-
input_file_path = shard_path
195-
output_file_path = os.path.join(cfg.output_path, path, os.path.basename(shard_path)).replace(
202+
output_file_path = os.path.join(output_path, path, os.path.basename(shard_path)).replace(
196203
".json.zst", ".jsonl.gz"
197204
)
198-
199-
all_files.append(FileTask(input_file_path=input_file_path, output_file_path=output_file_path))
205+
all_files.append(FileTask(input_file_path=shard_path, output_file_path=output_file_path))
200206

201207
logger.info(f"Found {len(all_files)} files to process")
202208

203-
# Single-level parallelism over all files
204209
pipeline = Dataset.from_list(all_files).map(process_file)
205210

206211
ctx = ZephyrContext(name="download-dclm-html")
@@ -220,7 +225,7 @@ def dclm_hq_step(
220225
"""Create a StepSpec that downloads DCLM HQ HTML data from Common Crawl."""
221226

222227
def _run(output_path: str) -> None:
223-
extract_dclm_hq_dump(DCLMHQDownloadConfig(input_path=input_path, output_path=output_path))
228+
extract_dclm_hq_dump(input_path, output_path)
224229

225230
return StepSpec(
226231
name=name,

lib/marin/src/marin/datakit/download/filesystem.py

Lines changed: 27 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# Copyright The Marin Authors
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import logging
45
import os
56
import random
67
import time
@@ -12,61 +13,58 @@
1213

1314
from marin.utils import fsspec_exists, fsspec_glob
1415

16+
logger = logging.getLogger(__name__)
17+
1518

1619
@dataclass
1720
class TransferConfig:
21+
"""Kept for backward compatibility. Prefer ``transfer_files()`` with flat params."""
22+
1823
input_path: str
1924
output_path: str
20-
21-
# Selectively choose the number of random files to transfer. None means all files
2225
num_random_files: int | None = None
2326
filetype: str = "jsonl.zst"
2427

2528

def transfer_files(
    input_path: str,
    output_path: str,
    *,
    num_random_files: int | None = None,
    filetype: str = "jsonl.zst",
) -> None:
    """Transfer files from input_path to output_path.

    When num_random_files is None, copies all matching files.
    When specified, randomly samples that many files.

    Args:
        input_path: Source directory (any fsspec-compatible URL); a trailing
            slash is stripped before use.
        output_path: Destination directory.
        num_random_files: If given, copy only this many randomly chosen files
            (shuffle is seeded so repeated runs pick the same subset).
            None copies every match.
        filetype: File extension suffix to glob for, e.g. ``"jsonl.zst"``.

    Raises:
        FileNotFoundError: If input_path does not exist.
    """
    input_path = input_path.rstrip("/")

    logger.info("Transferring %s to %s", input_path, output_path)
    start_time = time.time()
    fs, _ = url_to_fs(input_path)
    if not fs.exists(input_path):
        raise FileNotFoundError(f"{input_path} does not exist.")

    # Recursive glob for all files with the requested extension.
    filenames = fsspec_glob(os.path.join(input_path, f"**/*.{filetype}"))

    if num_random_files is not None:
        # Fixed seed keeps the sampled subset stable across runs.
        random.seed(42)
        random.shuffle(filenames)
        filenames = filenames[:num_random_files]

    def copy_file(filename: str) -> None:
        # Skip files already present at the destination (idempotent re-runs).
        output_filename = os.path.join(output_path, os.path.basename(filename))
        if not fsspec_exists(output_filename):
            fs.makedirs(output_path, exist_ok=True)
            fs.copy(filename, output_filename)

    # Parallel copy via zephyr.
    pipeline = Dataset.from_list(filenames).map(copy_file)
    ctx = ZephyrContext(name="fs-transfer")
    ctx.execute(pipeline)

    elapsed = time.time() - start_time
    logger.info("Transferred %s to %s (%.1fs)", input_path, output_path, elapsed)
7068

7169

7270
def transfer_step(
@@ -82,14 +80,7 @@ def transfer_step(
8280
"""Create a StepSpec that transfers files between fsspec paths."""
8381

8482
def _run(output_path: str) -> None:
85-
transfer_files(
86-
TransferConfig(
87-
input_path=input_path,
88-
output_path=output_path,
89-
num_random_files=num_random_files,
90-
filetype=filetype,
91-
)
92-
)
83+
transfer_files(input_path, output_path, num_random_files=num_random_files, filetype=filetype)
9384

9485
return StepSpec(
9586
name=name,

lib/marin/src/marin/datakit/download/nemotron_cc.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -87,11 +87,22 @@ def download_single_nemotron_path(input_file_path: str, output_file_path: str) -
8787

8888
@dataclass
class NemotronIngressConfig:
    """Kept for backward compatibility with ExecutorStep callers."""

    # Destination directory for the downloaded Nemotron-CC dataset.
    output_path: str = THIS_OUTPUT_PATH
9193

9294

93-
def download_nemotron_cc(cfg: NemotronIngressConfig):
94-
paths_file_path = os.path.join(cfg.output_path, "data-jsonl.paths")
95+
def download_nemotron_cc(output_path_or_cfg: str | NemotronIngressConfig) -> None:
96+
"""Download and process Nemotron-CC dataset from Common Crawl.
97+
98+
Args:
99+
output_path_or_cfg: Output directory path, or a NemotronIngressConfig for backward compat.
100+
"""
101+
output_path = (
102+
output_path_or_cfg.output_path if isinstance(output_path_or_cfg, NemotronIngressConfig) else output_path_or_cfg
103+
)
104+
105+
paths_file_path = os.path.join(output_path, "data-jsonl.paths")
95106
logger.info(f"Downloading Nemotron CC path file {paths_file_path}")
96107

97108
with open_url(NCC_PATH_FILE_URL, "rb") as f, open_url(paths_file_path, "wb") as f_out:
@@ -102,7 +113,7 @@ def download_nemotron_cc(cfg: NemotronIngressConfig):
102113
with open_url(paths_file_path, "r", compression="gzip") as f:
103114
for line in f:
104115
file = line.strip()
105-
output_file_path = os.path.join(cfg.output_path, file).replace("jsonl.zstd", "jsonl.zst")
116+
output_file_path = os.path.join(output_path, file).replace("jsonl.zstd", "jsonl.zst")
106117
all_files.append((file, output_file_path))
107118

108119
logger.info(f"Processing {len(all_files)} Nemotron CC files")
@@ -111,13 +122,13 @@ def download_nemotron_cc(cfg: NemotronIngressConfig):
111122
Dataset.from_list(all_files)
112123
.filter(lambda file_info: not fsspec_exists(file_info[1]))
113124
.map(lambda file_info: download_single_nemotron_path(*file_info))
114-
.write_jsonl(os.path.join(cfg.output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True)
125+
.write_jsonl(os.path.join(output_path, ".metrics/download-{shard:05d}.jsonl"), skip_existing=True)
115126
)
116127

117128
ctx = ZephyrContext(name="download-nemotron-cc")
118129
ctx.execute(pipeline)
119130

120-
logger.info(f"Downloaded Nemotron CC files to {cfg.output_path}")
131+
logger.info(f"Downloaded Nemotron CC files to {output_path}")
121132

122133

123134
def nemotron_cc_step(
@@ -130,7 +141,7 @@ def nemotron_cc_step(
130141
"""Create a StepSpec that downloads the Nemotron-CC dataset from Common Crawl."""
131142

132143
def _run(output_path: str) -> None:
133-
download_nemotron_cc(NemotronIngressConfig(output_path=output_path))
144+
download_nemotron_cc(output_path)
134145

135146
return StepSpec(
136147
name=name,

0 commit comments

Comments (0)