Skip to content

Add ruff to pre-commit checks #464

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ trim_trailing_whitespace = true
indent_size = 4
indent_style = space

[*.{md,yml,yaml,html,css,scss,js}]
[*.{md,yml,yaml,html,css,scss,js,toml}]
indent_size = 2

# These files are edited and tested upstream in nf-core/modules
Expand Down
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,12 @@ repos:
hooks:
- id: editorconfig-checker
alias: ec

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.6
hooks:
- id: ruff
types_or: [python, pyi, jupyter]
args: [--fix, --exit-non-zero-on-fix]
- id: ruff-format
types_or: [python, pyi, jupyter]
53 changes: 53 additions & 0 deletions .ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
line-length = 120
extend-include = [ "*.ipynb" ]
exclude = [
"modules/nf-core/*",
"subworkflows/nf-core/*",
]

format.docstring-code-format = true
lint.select = [
"B", # flake8-bugbear
"BLE", # flake8-blind-except
"C4", # flake8-comprehensions
"D", # pydocstyle
"E", # Error detected by Pycodestyle
"F", # Errors detected by Pyflakes
"I", # isort
"RUF100", # Report unused noqa directives
"TID", # flake8-tidy-imports
"UP", # pyupgrade
"W", # Warning detected by Pycodestyle
]
lint.ignore = [
# Errors from function calls in argument defaults. These are fine when the result is immutable.
"B008",
# Missing docstring in public module
"D100",
# Missing docstring in public package
"D104",
# __magic__ methods are often self-explanatory, allow missing docstrings
"D105",
# Missing docstring in __init__
"D107",
## Disable one in each pair of mutually incompatible rules
# We don’t want a blank line before a class docstring
"D203",
# We want docstrings to start immediately after the opening triple quote
"D213",
# first line should end with a period [Bug: doesn't work with single-line docstrings]
"D400",
# First line should be in imperative mood; try rephrasing
"D401",
# line too long -> we accept long comment lines; formatter gets rid of long code lines
"E501",
# Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
"E731",
# allow I, O, l as variable names -> I is the identity matrix
"E741",
# We can live without docstrings in public functions in scripts
"D103"
]
lint.per-file-ignores."*/templates/*" = [
"F821" # Undefined name (due to nextflow variables)
]
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix simpleaf protocol name for 10xv4 ([#452](https://github.com/nf-core/scrnaseq/pull/452))
- Fix the workflow for cellranger-arc alignment and add new test with 10x multiome dataset ([#441](https://github.com/nf-core/scrnaseq/pull/441))

### Chore

- Add ruff pre-commit check as linter/autoformatter for python scripts ([#464](https://github.com/nf-core/scrnaseq/pull/464))

## v4.0.0 - 2025-03-10

- Move `txp2gene` to `reference_genome_options` in schema as it is required by `kb_python` and `alevin` ([434](https://github.com/nf-core/scrnaseq/pull/434))
Expand Down
8 changes: 5 additions & 3 deletions bin/check_cellrangermulti.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import sys


def parse_samplesheet(samplesheet_path):
# Define required headers
required_headers = ["sample", "multiplexed_sample_id", "description"]
Expand All @@ -14,7 +15,7 @@ def parse_samplesheet(samplesheet_path):
os.makedirs(cmo_output_dir, exist_ok=True)
os.makedirs(frna_output_dir, exist_ok=True)

with open(samplesheet_path, 'r') as csvfile:
with open(samplesheet_path) as csvfile:
reader = csv.DictReader(csvfile)
headers = reader.fieldnames

Expand All @@ -33,7 +34,7 @@ def parse_samplesheet(samplesheet_path):
# Process CMOs
if "cmo_ids" in headers and row["cmo_ids"]:
cmo_filename = os.path.join(cmo_output_dir, f"{sample}_cmo.csv")
with open(cmo_filename, 'a', newline='') as cmo_file:
with open(cmo_filename, "a", newline="") as cmo_file:
cmo_writer = csv.writer(cmo_file)
if not os.path.exists(cmo_filename) or os.stat(cmo_filename).st_size == 0:
cmo_writer.writerow(["sample_id", "cmo_ids", "description"])
Expand All @@ -42,7 +43,7 @@ def parse_samplesheet(samplesheet_path):
# Process FRNAs
if "probe_barcode_ids" in headers and row["probe_barcode_ids"]:
frna_filename = os.path.join(frna_output_dir, f"{sample}_frna.csv")
with open(frna_filename, 'a', newline='') as frna_file:
with open(frna_filename, "a", newline="") as frna_file:
frna_writer = csv.writer(frna_file)
if not os.path.exists(frna_filename) or os.stat(frna_filename).st_size == 0:
frna_writer.writerow(["sample_id", "probe_barcode_ids", "description"])
Expand All @@ -51,6 +52,7 @@ def parse_samplesheet(samplesheet_path):
print("Parsing completed successfully.")
return True


if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python script.py <path_to_samplesheet>")
Expand Down
22 changes: 10 additions & 12 deletions bin/filter_gtf_for_genes_in_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
# Script originally written by Pranathi Vemuri (github.com/pranathivemuri)
# modified by Harshil Patel (github.com/drpatelh)

from __future__ import print_function
import argparse
import logging
from itertools import groupby
import argparse

# Create a logger
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
Expand All @@ -18,11 +17,12 @@ def is_header(line):


def extract_fasta_seq_names(fasta_name):
"""
modified from Brent Pedersen
"""Get sequence names from FASTA files.

Modified from Brent Pedersen
Correct Way To Parse A Fasta File In Python
given a fasta file. yield tuples of header, sequence
from https://www.biostars.org/p/710/
from https://www.biostars.org/p/710/.
"""
# first open the file outside
fh = open(fasta_name)
Expand All @@ -31,7 +31,7 @@ def extract_fasta_seq_names(fasta_name):
# we know they alternate.
faiter = (x[1] for x in groupby(fh, is_header))

for i, header in enumerate(faiter):
for _i, header in enumerate(faiter):
line = next(header)
if is_header(line):
# drop the ">"
Expand All @@ -41,9 +41,9 @@ def extract_fasta_seq_names(fasta_name):

def extract_genes_in_genome(fasta, gtf_in, gtf_out):
seq_names_in_genome = set(extract_fasta_seq_names(fasta))
logger.info("Extracted chromosome sequence names from : %s" % fasta)
logger.info(f"Extracted chromosome sequence names from : {fasta}")
logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome)))
seq_names_in_gtf = set([])
seq_names_in_gtf = set()

n_total_lines = 0
n_lines_in_genome = 0
Expand All @@ -56,12 +56,10 @@ def extract_genes_in_genome(fasta, gtf_in, gtf_out):
if seq_name_gtf in seq_names_in_genome:
n_lines_in_genome += 1
f.write(line)
logger.info(
"Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta)
)
logger.info(f"Extracted {n_lines_in_genome} / {n_total_lines} lines from {gtf_in} matching sequences in {fasta}")
logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf)))

logger.info("Wrote matching lines to %s" % gtf_out)
logger.info(f"Wrote matching lines to {gtf_out}")


if __name__ == "__main__":
Expand Down
1 change: 0 additions & 1 deletion bin/generate_lib_csv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python
import argparse
import os

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate the lib.csv for cellranger-arc.")
Expand Down
7 changes: 4 additions & 3 deletions bin/t2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys, argparse
import argparse
import sys


def create_transcript_list(input, use_name=True, use_version=False):
Expand Down Expand Up @@ -77,9 +78,9 @@ def create_transcript_list(input, use_name=True, use_version=False):
def print_output(output, r, use_name=True):
for tid in r:
if use_name:
output.write("%s\t%s\t%s\n" % (tid, r[tid][0], r[tid][1]))
output.write(f"{tid}\t{r[tid][0]}\t{r[tid][1]}\n")
else:
output.write("%s\t%s\n" % (tid, r[tid][0]))
output.write(f"{tid}\t{r[tid][0]}\n")


if __name__ == "__main__":
Expand Down
14 changes: 11 additions & 3 deletions modules/local/templates/concat_h5ad.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@

os.environ["NUMBA_CACHE_DIR"] = "."

import scanpy as sc, anndata as ad, pandas as pd
from pathlib import Path
import platform
from pathlib import Path

import anndata as ad
import pandas as pd
import scanpy as sc


def read_samplesheet(samplesheet):
Expand All @@ -21,13 +24,17 @@ def read_samplesheet(samplesheet):

return df


def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.

Args:
data (dict): The dictionary to format.
indent (int): The current indentation level.

Returns:
str: A string formatted as YAML.

"""
yaml_str = ""
for key, value in data.items():
Expand All @@ -38,6 +45,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str


def dump_versions():
versions = {
"${task.process}": {
Expand All @@ -49,8 +57,8 @@ def dump_versions():
with open("versions.yml", "w") as f:
f.write(format_yaml_like(versions))

if __name__ == "__main__":

if __name__ == "__main__":
# Open samplesheet as dataframe
df_samplesheet = read_samplesheet("${samplesheet}")

Expand Down
27 changes: 16 additions & 11 deletions modules/local/templates/mtx_to_h5ad_cellranger.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,18 @@

os.environ["NUMBA_CACHE_DIR"] = "."

import scanpy as sc
import pandas as pd
import argparse
import anndata
from anndata import AnnData
import platform
import glob
import platform

import anndata
import pandas as pd
import scanpy as sc


def _mtx_to_adata(
input: str,
sample: str,
):

adata = sc.read_10x_h5(input)
adata.var["gene_symbols"] = adata.var_names
adata.var.set_index("gene_ids", inplace=True)
Expand All @@ -31,11 +30,14 @@ def _mtx_to_adata(

def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.

Args:
data (dict): The dictionary to format.
indent (int): The current indentation level.

Returns:
str: A string formatted as YAML.

"""
yaml_str = ""
for key, value in data.items():
Expand All @@ -46,6 +48,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str


def dump_versions():
versions = {
"${task.process}": {
Expand All @@ -59,6 +62,7 @@ def dump_versions():
with open("versions.yml", "w") as f:
f.write(format_yaml_like(versions))


def input_to_adata(
input_data: str,
output: str,
Expand All @@ -71,8 +75,8 @@ def input_to_adata(

# standard format
# index are gene IDs and symbols are a column
adata.var['gene_versions'] = adata.var.index
adata.var.index = adata.var['gene_versions'].str.split('.').str[0].values
adata.var["gene_versions"] = adata.var.index
adata.var.index = adata.var["gene_versions"].str.split(".").str[0].values
adata.var_names_make_unique()

# write results
Expand All @@ -84,6 +88,7 @@ def input_to_adata(

return adata


#
# Run main script
#
Expand All @@ -93,7 +98,7 @@ def input_to_adata(

# input_type comes from NF module
adata = input_to_adata(
input_data=glob.glob("*${meta.input_type}_feature_bc_matrix.h5")[0], # cellrangermulti has 'sample_' as prefix
input_data=glob.glob("*${meta.input_type}_feature_bc_matrix.h5")[0], # cellrangermulti has 'sample_' as prefix
output="${meta.id}_${meta.input_type}_matrix.h5ad",
sample="${meta.id}"
sample="${meta.id}",
)
Loading
Loading