Skip to content

Commit f34d28a

Browse files
committed
distinguish between update of one module and general container generation to speed things up
1 parent a218846 commit f34d28a

3 files changed

Lines changed: 182 additions & 62 deletions

File tree

Lines changed: 124 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import contextlib
22
import logging
33
import re
4+
from concurrent.futures import ThreadPoolExecutor, as_completed
45
from pathlib import Path
56

67
import yaml
78

8-
log = logging.getLogger(__name__)
9+
from nf_core.utils import read_module_name
910

10-
_PROCESS_NAME_RE = re.compile(r"^\s*process\s+(\w+)\s*\{", re.MULTILINE)
11+
log = logging.getLogger(__name__)
1112

1213
PLATFORMS: dict[str, list[str]] = {
1314
"docker_amd64": ["docker", "linux/amd64", "name"],
@@ -20,6 +21,69 @@
2021
"conda_lock_files_arm64": ["conda", "linux/arm64", "lock_file"],
2122
}
2223

24+
_CONFIG_LINE_RE = re.compile(r"withName:\s*'(\w+)'\s*\{\s*(?:container|conda)\s*=\s*'([^']+)'")
25+
26+
27+
def _container_key(platform: str) -> str:
28+
return "conda" if platform.startswith("conda_lock_") else "container"
29+
30+
31+
def _parse_config_file(config_path: Path) -> dict[str, str]:
32+
"""Return {module_name: container} from an existing platform config file."""
33+
result: dict[str, str] = {}
34+
with contextlib.suppress(OSError):
35+
for line in config_path.read_text().splitlines():
36+
m = _CONFIG_LINE_RE.search(line)
37+
if m:
38+
result[m.group(1)] = m.group(2)
39+
return result
40+
41+
42+
def _write_platform_config(config_path: Path, entries: dict[str, str], key: str) -> bool:
43+
"""Write *entries* to *config_path*, or delete it when empty.
44+
45+
Skips the write when content is unchanged. Returns True if the file now exists.
46+
"""
47+
if not entries:
48+
config_path.unlink(missing_ok=True)
49+
return False
50+
new_content = "".join(
51+
f"process {{ withName: '{name}' {{ {key} = '{container}' }} }}\n" for name, container in sorted(entries.items())
52+
)
53+
if not config_path.exists() or config_path.read_text() != new_content:
54+
config_path.write_text(new_content)
55+
return True
56+
57+
58+
def _process_meta(meta_path: Path) -> tuple[str, dict[str, str]] | None:
59+
"""Read one meta.yml + sibling main.nf and return (module_name, {platform: container}).
60+
61+
Returns None when the file should be skipped.
62+
"""
63+
try:
64+
raw = meta_path.read_bytes()
65+
except OSError as e:
66+
log.debug(f"Could not read {meta_path}: {e}")
67+
return None
68+
69+
# TODO: remove this early-exit once containers are present in the majority of modules
70+
if b"containers:" not in raw:
71+
return None
72+
73+
meta = yaml.safe_load(raw)
74+
75+
module_name = read_module_name(meta_path.parent / "main.nf")
76+
if not module_name:
77+
log.debug(f"No process definition found next to {meta_path}, skipping")
78+
return None
79+
80+
platform_containers: dict[str, str] = {}
81+
for platform_name, (runtime, arch, protocol) in PLATFORMS.items():
82+
with contextlib.suppress(KeyError, TypeError):
83+
platform_containers[platform_name] = meta["containers"][runtime][arch][protocol]
84+
85+
return module_name, platform_containers
86+
2387

2488
class ContainerConfigs:
2589
"""Generates the container configuration files for a pipeline.
@@ -28,78 +92,82 @@ class ContainerConfigs:
2892
workflow_directory (Path): The directory containing the workflow files.
2993
"""
3094

31-
def __init__(
32-
self,
33-
workflow_directory: Path = Path(),
34-
):
95+
def __init__(self, workflow_directory: Path = Path()) -> None:
3596
self.workflow_directory = workflow_directory
3697

37-
def generate_container_configs(
38-
self, new_module_path: Path | None = None, new_module_name: str | None = None
39-
) -> set[str]:
98+
def update_module_container_config(self, module_path: Path) -> None:
99+
"""Targeted update for a single module.
100+
101+
Reads the current config files, splices in (or removes) the entry for
102+
the module at *module_path*, and writes back only what changed.
40103
"""
41-
Generate the container configuration files for a pipeline by scanning
42-
all ``main.nf`` files under the ``modules/`` directory and reading the
43-
accompanying ``meta.yml`` for container information.
104+
result = _process_meta(module_path / "meta.yml")
105+
106+
if result is None:
107+
module_name = read_module_name(module_path / "main.nf")
108+
if not module_name:
109+
log.debug(f"Could not determine process name for {module_path}, skipping")
110+
return
111+
platform_containers: dict[str, str] = {}
112+
else:
113+
module_name, platform_containers = result
114+
115+
conf_dir = self.workflow_directory / "conf"
116+
for platform in PLATFORMS:
117+
config_path = conf_dir / f"containers_{platform}.config"
118+
entries = _parse_config_file(config_path)
119+
if platform in platform_containers:
120+
entries[module_name] = platform_containers[platform]
121+
_write_platform_config(config_path, entries, _container_key(platform))
122+
123+
def generate_container_configs(self) -> set[str]:
124+
"""Full scan of all ``meta.yml`` files under ``modules/``.
125+
126+
Used by lint and bulk operations (update, remove, patch) where multiple
127+
modules may have changed.
44128
45129
Returns:
46130
set[str]: Names of config files written (e.g. ``{'containers_docker_amd64.config'}``).
47131
"""
132+
modules_dir = self.workflow_directory / "modules"
133+
if not modules_dir.is_dir():
134+
log.debug(f"No modules directory found at {modules_dir}, skipping")
135+
return set()
136+
48137
containers: dict[str, dict[str, str]] = {platform: {} for platform in PLATFORMS}
49138

50-
for meta_path in (self.workflow_directory / "modules").rglob("meta.yml"):
51-
try:
52-
content = meta_path.read_text()
53-
except OSError as e:
54-
log.debug(f"Could not read {meta_path}: {e}")
55-
continue
56-
57-
# TODO: remove this early-exit once containers are present in the majority of modules
58-
if "containers:" not in content:
59-
continue
60-
61-
meta = yaml.safe_load(content)
62-
63-
main_nf = meta_path.parent / "main.nf"
64-
try:
65-
match = _PROCESS_NAME_RE.search(main_nf.read_text())
66-
except OSError:
67-
log.debug(f"No main.nf next to {meta_path}, skipping")
68-
continue
69-
if not match:
70-
log.debug(f"No process definition found in {main_nf}, skipping")
71-
continue
72-
process_name = match.group(1)
73-
74-
for platform_name, (runtime, arch, protocol) in PLATFORMS.items():
75-
with contextlib.suppress(KeyError, TypeError):
76-
containers[platform_name][process_name] = meta["containers"][runtime][arch][protocol]
139+
with ThreadPoolExecutor() as pool:
140+
futures = {pool.submit(_process_meta, p): p for p in modules_dir.rglob("meta.yml")}
141+
for future in as_completed(futures):
142+
result = future.result()
143+
if result is None:
144+
continue
145+
module_name, platform_containers = result
146+
for platform_name, container in platform_containers.items():
147+
containers[platform_name][module_name] = container
77148

78149
log.info("Generated container configs for the pipeline.")
79150

80-
# remove all generated config files, to handle removed modules
81-
for platform in PLATFORMS:
82-
(self.workflow_directory / "conf" / f"containers_{platform}.config").unlink(missing_ok=True)
83-
# write config files
84151
written: set[str] = set()
85152
for platform, module_containers in containers.items():
86-
if not module_containers:
87-
continue
88-
container_key = "conda" if platform.startswith("conda_lock_") else "container"
89-
lines = [
90-
f"process {{ withName: '{module_name}' {{ {container_key} = '{container}' }} }}\n"
91-
for module_name, container in sorted(module_containers.items())
92-
]
93153
config_path = self.workflow_directory / "conf" / f"containers_{platform}.config"
94-
config_path.write_text("".join(lines))
95-
written.add(config_path.name)
154+
if _write_platform_config(config_path, module_containers, _container_key(platform)):
155+
written.add(config_path.name)
96156
return written
97157

98158

99-
def try_generate_container_configs(
100-
directory: Path, new_module_path: Path | None = None, new_module_name: str | None = None
101-
) -> None:
159+
def try_generate_container_configs(directory: Path, module_path: Path | None = None) -> None:
160+
"""Regenerate container configs for *directory*.
161+
162+
If *module_path* is given, only that module's entries are updated (fast
163+
path for single-module installs). Otherwise a full scan of ``modules/``
164+
is performed.
165+
"""
102166
try:
103-
ContainerConfigs(directory).generate_container_configs(new_module_path, new_module_name)
167+
configs = ContainerConfigs(directory)
168+
if module_path is not None:
169+
configs.update_module_container_config(module_path)
170+
else:
171+
configs.generate_container_configs()
104172
except UserWarning as e:
105173
log.warning(f"Could not regenerate container configuration files: {e}")

nf_core/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,18 @@ def check_nextflow_version(minimal_nf_version: tuple[int, int, int, bool], silen
369369
return nf_version >= minimal_nf_version
370370

371371

372+
_NF_PROCESS_NAME_RE = re.compile(r"^\s*process\s+(\w+)\s*\{", re.MULTILINE)
373+
374+
375+
def read_module_name(main_nf: Path) -> str | None:
376+
"""Return the process name declared in a Nextflow ``main.nf`` file, or ``None``."""
377+
try:
378+
match = _NF_PROCESS_NAME_RE.search(main_nf.read_text())
379+
return match.group(1) if match else None
380+
except OSError:
381+
return None
382+
383+
372384
def fetch_wf_config(wf_path: Path, cache_config: bool = True) -> dict:
373385
"""Uses Nextflow to retrieve the the configuration variables
374386
from a Nextflow workflow.

tests/pipelines/test_container_configs.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Tests for the ContainerConfigs helper used by pipelines."""
22

3-
from pathlib import Path
3+
import shutil
44

55
import ruamel.yaml
66

@@ -53,11 +53,7 @@ def test_generate_container_configs_newly_installed_module(self) -> None:
5353
with open(self.pipeline_dir / "modules" / "nf-core" / "fastqc" / "meta.yml") as fh:
5454
fastqc_meta_yml = yaml.load(fh)
5555

56-
# new_module_path/name are kept for backward compat but the scan finds the module regardless
57-
self.container_configs.generate_container_configs(
58-
new_module_path=Path("modules/nf-core/fastqc"),
59-
new_module_name="fastqc",
60-
)
56+
self.container_configs.generate_container_configs()
6157

6258
conf_dir = self.pipeline_dir / "conf"
6359
for p_name, (runtime, arch, protocol) in PLATFORMS.items():
@@ -82,3 +78,47 @@ def test_generate_container_configs_removes_stale_entries(self) -> None:
8278
assert stale_line not in cfg_path.read_text(), (
8379
f"{cfg_path.name} still contains stale entry after regeneration"
8480
)
81+
82+
def test_generate_container_configs_no_modules_dir(self) -> None:
83+
"""Returns an empty set immediately when there is no modules/ directory."""
84+
shutil.rmtree(self.pipeline_dir / "modules", ignore_errors=True)
85+
result = self.container_configs.generate_container_configs()
86+
assert result == set()
87+
88+
def test_generate_container_configs_skips_meta_without_containers(self) -> None:
89+
"""meta.yml files without a containers key are silently ignored."""
90+
module_dir = self.pipeline_dir / "modules" / "local" / "fake"
91+
module_dir.mkdir(parents=True, exist_ok=True)
92+
(module_dir / "meta.yml").write_text("name: fake\ndescription: no containers here\n")
93+
(module_dir / "main.nf").write_text('process FAKE {\n script:\n """\n echo hello\n """\n}\n')
94+
95+
self.container_configs.generate_container_configs()
96+
97+
conf_dir = self.pipeline_dir / "conf"
98+
for p_name in PLATFORMS:
99+
cfg_path = conf_dir / f"containers_{p_name}.config"
100+
if cfg_path.exists():
101+
assert "FAKE" not in cfg_path.read_text(), f"{cfg_path.name} contains entry for FAKE module"
102+
103+
def test_generate_container_configs_skips_unchanged_write(self) -> None:
104+
"""Config files are not rewritten when content has not changed."""
105+
mods_install = ModuleInstall(
106+
self.pipeline_dir, prompt=False, force=False, sha="79b36b51048048374b642289bfe9e591ef56fe05"
107+
)
108+
mods_install.install("fastqc")
109+
110+
self.container_configs.generate_container_configs()
111+
112+
conf_dir = self.pipeline_dir / "conf"
113+
mtimes = {
114+
p: (conf_dir / f"containers_{p}.config").stat().st_mtime_ns
115+
for p in PLATFORMS
116+
if (conf_dir / f"containers_{p}.config").exists()
117+
}
118+
assert mtimes, "Expected at least one config file to be generated on the first run"
119+
120+
self.container_configs.generate_container_configs()
121+
122+
for p_name, mtime_before in mtimes.items():
123+
mtime_after = (conf_dir / f"containers_{p_name}.config").stat().st_mtime_ns
124+
assert mtime_after == mtime_before, f"conf/{p_name}.config was rewritten unnecessarily"

0 commit comments

Comments
 (0)