Merge branch 'master' into plus-sign-labels

tsalo · web-flow · commit 75924cf4757a · 2025-02-24T09:30:04.000-05:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -47,7 +47,7 @@ repos:
         files: tools/schemacode
         args: ["--settings-file", "tools/schemacode/pyproject.toml"]
   - repo: https://github.com/pyCQA/flake8
-    rev: 7.1.1
+    rev: 7.1.2
     hooks:
       - id: flake8
         args: [--config=tools/schemacode/.flake8]
@@ -74,7 +74,7 @@ repos:
       - id: codespell
         args: ["--config=.codespellrc", "--dictionary=-", "--dictionary=.codespell_dict"]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.14.1
+    rev: v1.15.0
     hooks:
       - id: mypy
         # Sync with project.optional-dependencies.typing
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -9,7 +9,7 @@ If you have any questions that aren't discussed below, please let us know
 by [opening an issue](https://github.com/bids-standard/bids-specification/issues/new).
 
 If you are not familiar with Git and GitHub,
-check our [generic contributing guidelines](https://bids-website.readthedocs.io/en/latest/collaboration/bids_github/CONTRIBUTING.html).
+check our [generic contributing guidelines](https://bids.neuroimaging.io/collaboration/bids_github/CONTRIBUTING.html).
 
 If you want to contribute to the BIDS specification,
 make sure you also read the instructions below.
diff --git a/src/schema/README.md b/src/schema/README.md
@@ -259,20 +259,20 @@ The following operators should be defined by an interpreter:
 
 The following functions should be defined by an interpreter:
 
-| Function                                        | Definition                                                                                                                                | Example                                                | Note                                                                           |
-| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------------------------------ |
-| `count(arg: array, val: any) -> int`            | Number of elements in an array equal to `val`                                                                                             | `count(columns.type, "EEG")`                           | The number of times "EEG" appears in the column "type" of the current TSV file |
-| `exists(arg: str \| array, rule: str) -> int`   | Count of files in an array that exist in the dataset. String is array with length 1. See following section for the meanings of rules.     | `exists(sidecar.IntendedFor, "subject")`               | True if all files in `IntendedFor` exist, relative to the subject directory.   |
-| `index(arg: array, val: any) -> int`            | Index of first element in an array equal to `val`, `null` if not found                                                                    | `index(["i", "j", "k"], axis)`                         | The number, from 0-2 corresponding to the string `axis`                        |
-| `intersects(a: array, b: array) -> bool`        | `true` if arguments contain any shared elements                                                                                           | `intersects(dataset.modalities, ["pet", "mri"])`       | True if either PET or MRI data is found in dataset                             |
-| `allequal(a: array, b: array) -> bool`          | `true` if arrays have the same length and paired elements are equal                                                                       | `intersects(dataset.modalities, ["pet", "mri"])`       | True if either PET or MRI data is found in dataset                             |
-| `length(arg: array) -> int`                     | Number of elements in an array                                                                                                            | `length(columns.onset) > 0`                            | True if there is at least one value in the onset column                        |
-| `match(arg: str, pattern: str) -> bool`         | `true` if `arg` matches the regular expression `pattern` (anywhere in string)                                                             | `match(extension, ".gz$")`                             | True if the file extension ends with `.gz`                                     |
-| `max(arg: array) -> number`                     | The largest non-`n/a` value in an array                                                                                                   | `max(columns.onset)`                                   | The time of the last onset in an events.tsv file                               |
-| `min(arg: array) -> number`                     | The smallest non-`n/a` value in an array                                                                                                  | `min(sidecar.SliceTiming) == 0`                        | A check that the onset of the first slice is 0s                                |
-| `sorted(arg: array, method: str) -> array`      | The sorted values of the input array; defaults to type-determined sort. If method is "lexical", or "numeric" use lexical or numeric sort. | `sorted(sidecar.VolumeTiming) == sidecar.VolumeTiming` | True if `sidecar.VolumeTiming` is sorted                                       |
-| `substr(arg: str, start: int, end: int) -> str` | The portion of the input string spanning from start position to end position                                                              | `substr(path, 0, length(path) - 3)`                    | `path` with the last three characters dropped                                  |
-| `type(arg: Any) -> str`                         | The name of the type, including `"array"`, `"object"`, `"null"`                                                                           | `type(datatypes)`                                      | Returns `"array"`                                                              |
+| Function                                          | Definition                                                                                                                                | Example                                                | Note                                                                           |
+| ------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------------------------------ |
+| `count(arg: array, val: any) -> int`              | Number of elements in an array equal to `val`                                                                                             | `count(columns.type, "EEG")`                           | The number of times "EEG" appears in the column "type" of the current TSV file |
+| `exists(arg: str \| array, rule: str) -> int`     | Count of files in an array that exist in the dataset. String is array with length 1. See following section for the meanings of rules.     | `exists(sidecar.IntendedFor, "subject")`               | Number of files in `IntendedFor` that exist, relative to the subject directory |
+| `index(arg: array, val: any) -> int`              | Index of first element in an array equal to `val`, `null` if not found                                                                    | `index(["i", "j", "k"], axis)`                         | The number, from 0-2 corresponding to the string `axis`                        |
+| `intersects(a: array, b: array) -> array \| bool` | The intersection of arrays `a` and `b`, or `false` if there are no shared values.                                                         | `intersects(dataset.modalities, ["pet", "mri"])`       | Non-empty array if either PET or MRI data is found in dataset, otherwise false |
+| `allequal(a: array, b: array) -> bool`            | `true` if arrays have the same length and paired elements are equal                                                                       | `allequal(columns.onset, sorted(columns.onset))`       | True if the entries of the onset column are sorted                             |
+| `length(arg: array) -> int`                       | Number of elements in an array                                                                                                            | `length(columns.onset) > 0`                            | True if there is at least one value in the onset column                        |
+| `match(arg: str, pattern: str) -> bool`           | `true` if `arg` matches the regular expression `pattern` (anywhere in string)                                                             | `match(extension, ".gz$")`                             | True if the file extension ends with `.gz`                                     |
+| `max(arg: array) -> number`                       | The largest non-`n/a` value in an array                                                                                                   | `max(columns.onset)`                                   | The time of the last onset in an events.tsv file                               |
+| `min(arg: array) -> number`                       | The smallest non-`n/a` value in an array                                                                                                  | `min(sidecar.SliceTiming) == 0`                        | A check that the onset of the first slice is 0s                                |
+| `sorted(arg: array, method: str) -> array`        | The sorted values of the input array; defaults to type-determined sort. If method is "lexical" or "numeric", use lexical or numeric sort. | `sorted(sidecar.VolumeTiming) == sidecar.VolumeTiming` | True if `sidecar.VolumeTiming` is sorted                                       |
+| `substr(arg: str, start: int, end: int) -> str`   | The portion of the input string spanning from start position to end position                                                              | `substr(path, 0, length(path) - 3)`                    | `path` with the last three characters dropped                                  |
+| `type(arg: Any) -> str`                           | The name of the type, including `"array"`, `"object"`, `"null"`                                                                           | `type(datatypes)`                                      | Returns `"array"`                                                              |
 
 #### The `exists()` function
 
diff --git a/src/schema/meta/expression_tests.yaml b/src/schema/meta/expression_tests.yaml
@@ -93,7 +93,7 @@
 - expression: type(true)
   result: 'boolean'
 - expression: intersects([1], [1, 2])
-  result: true
+  result: [1]
 - expression: intersects([1], [])
   result: false
 - expression: length([1, 2, 3])
diff --git a/src/schema/objects/files.yaml b/src/schema/objects/files.yaml
@@ -75,6 +75,10 @@ participants:
     followed by a list of optional columns describing participants.
     Each participant MUST be described by one and only one row.
 
+    The `participant_id` entries MUST be a superset of all subject directories
+    and all `participant_id` entries found among phenotypic and assessment data
+    in the `phenotype/` directory.
+
     Commonly used *optional* columns in `participants.tsv` files are `age`, `sex`,
     `handedness`, `strain`, and `strain_rrid`.
 
diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml
@@ -3245,8 +3245,8 @@ ScreenResolution:
     - type: array
       items:
         type: integer
-        minItems: 2
-        maxItems: 2
+      minItems: 2
+      maxItems: 2
     - type: string
       enum:
         - n/a
@@ -3262,8 +3262,8 @@ ScreenSize:
       items:
         type: number
         unit: m
-        minItems: 2
-        maxItems: 2
+      minItems: 2
+      maxItems: 2
     - type: string
       enum:
         - n/a
diff --git a/src/schema/rules/checks/dataset.yaml b/src/schema/rules/checks/dataset.yaml
@@ -18,26 +18,35 @@ ParticipantIDMismatch:
   issue:
     code: PARTICIPANT_ID_MISMATCH
     message: |
-      Participant labels found in this dataset did not match the values in participant_id column
-      found in the participants.tsv file.
+      Subject directories found in this dataset did not match the values in
+      the participant_id column found in the participants.tsv file.
     level: error
   selectors:
     - path == '/participants.tsv'
   checks:
-    - allequal(sorted(columns.participant_id), sorted(dataset.subjects.sub_dirs))
+    - |
+      allequal(
+        sorted(intersects(columns.participant_id, dataset.subjects.sub_dirs)),
+        sorted(dataset.subjects.sub_dirs)
+      )
 
 # 51
 PhenotypeSubjectsMissing:
   issue:
     code: PHENOTYPE_SUBJECTS_MISSING
     message: |
-      A phenotype/ .tsv file lists subjects that were not found in the dataset.
+      A phenotype/ .tsv file lists subjects that were not found in
+      the participant_id column found in the participants.tsv file.
     level: error
   selectors:
-    - path == '/dataset_description.json'
+    - path == '/participants.tsv'
     - type(dataset.subjects.phenotype) != 'null'
   checks:
-    - allequal(sorted(dataset.subjects.phenotype), sorted(dataset.subjects.sub_dirs))
+    - |
+      allequal(
+        sorted(intersects(columns.participant_id, dataset.subjects.phenotype)),
+        sorted(dataset.subjects.phenotype)
+      )
 
 # 214
 SamplesTSVMissing:
diff --git a/tools/schemacode/pyproject.toml b/tools/schemacode/pyproject.toml
@@ -103,6 +103,8 @@ markers = [
 minversion = "6.0"
 xfail_strict = true
 
+[tool.coverage]
+
 [tool.coverage.paths]
 source = [
     "src/bidsschematools",
@@ -112,10 +114,32 @@ source = [
 [tool.coverage.run]
 parallel = true
 
-[tool.bumpver]
+# Release process:
+# cd tools/schemacode
+# uvx bump-my-version bump pre_label --tag
+# [inspect result]
+# git push upstream <current-branch> --tags
+# uvx bump-my-version bump <patch|minor|major>
+# git push upstream <current-branch>
+[tool.bumpversion]
 current_version = "1.1.0-dev"
-version_pattern = "MAJOR.MINOR.PATCH[-TAG]"
-commit = false
+parse = """(?x)
+    (?P<major>[0-9]+)
+    \\.(?P<minor>[0-9]+)
+    \\.(?P<patch>[0-9]+)
+    (?:-(?P<pre_label>dev))?
+"""
+serialize = ["{major}.{minor}.{patch}-{pre_label}", "{major}.{minor}.{patch}"]
+commit = true
+message = "chore: Bump schema package to {new_version}"
+# Use --tag on releases
+tag = false
+tag_name = "schema-{new_version}"
+tag_message = "Schema release {new_version}"
+
+[tool.bumpversion.parts.pre_label]
+values = ["dev", "final"]
+optional_value = "final"
 
-[tool.bumpver.file_patterns]
-"src/bidsschematools/data/schema/SCHEMA_VERSION" = ['{version}']
+[[tool.bumpversion.files]]
+filename = "../../src/schema/SCHEMA_VERSION"
diff --git a/tools/schemacode/src/bidsschematools/__main__.py b/tools/schemacode/src/bidsschematools/__main__.py
@@ -3,15 +3,11 @@
 import os
 import re
 import sys
+from importlib.resources import files
 from itertools import chain
 
 import click
 
-if sys.version_info < (3, 9):
-    from importlib_resources import files
-else:
-    from importlib.resources import files
-
 from .rules import regexify_filename_rules
 from .schema import export_schema, load_schema
 from .validator import _bidsignore_check
diff --git a/tools/schemacode/src/bidsschematools/render/text.py b/tools/schemacode/src/bidsschematools/render/text.py
@@ -60,7 +60,7 @@ def _make_entity_definition(entity, entity_info):
     """Describe an entity."""
     entity_shorthand = entity_info["name"]
     text = ""
-    text += "## {}".format(entity_shorthand)
+    text += f"## {entity_shorthand}"
     text += "\n\n"
     text += f"**Full name**: {entity_info['display_name']}"
     text += "\n\n"
diff --git a/tools/schemacode/src/bidsschematools/rules.py b/tools/schemacode/src/bidsschematools/rules.py
@@ -64,7 +64,7 @@ def _optional_regex(regex, optional):
     return f"(?:{regex})?" if optional else regex
 
 
-@lru_cache()
+@lru_cache
 def _format_entity(entity, name, pattern, level, directory=False):
     if directory and entity not in DIR_ENTITIES:
         return ""
@@ -237,7 +237,7 @@ def regexify_filename_rules(
     return regex_schema
 
 
-@lru_cache()
+@lru_cache
 def regexify_all(schema_dir=None):
     """
     Create full path regexes for all BIDS specification files.
diff --git a/tools/schemacode/src/bidsschematools/schema.py b/tools/schemacode/src/bidsschematools/schema.py
@@ -3,19 +3,14 @@
 import json
 import os
 import re
-import sys
 import tempfile
 from collections.abc import Iterable, Mapping
 from copy import deepcopy
 from functools import lru_cache
+from importlib.resources import files
 
 from jsonschema import ValidationError, validate
 
-if sys.version_info < (3, 9):
-    from importlib_resources import files
-else:
-    from importlib.resources import files
-
 from . import __bids_version__, __version__, utils
 from .types import Namespace
 
@@ -183,7 +178,7 @@ def flatten_enums(namespace, inplace=True):
     return namespace
 
 
-@lru_cache()
+@lru_cache
 def load_schema(schema_path=None):
     """Load the schema into a dictionary.
 
diff --git a/tools/schemacode/src/bidsschematools/validator.py b/tools/schemacode/src/bidsschematools/validator.py
@@ -43,7 +43,7 @@ def _bids_schema_versioncheck(schema_dir, compatibility=VALIDATOR_SCHEMA_COMPATI
 
     schema_version_file = os.path.join(schema_dir, "SCHEMA_VERSION")
     try:
-        with open(schema_version_file, "r") as f:
+        with open(schema_version_file) as f:
             schema_version = f.readlines()[0].strip()
     except FileNotFoundError:
         lgr.warning(