Merged
@@ -26,7 +26,7 @@ jobs:

- name: Install packages
run: |
pip install .[dev]
pip install -e .[dev,tests]

#----------------------------------------------
# run test suite
92 changes: 25 additions & 67 deletions .pre-commit-config.yaml
@@ -1,7 +1,5 @@
default_language_version:
python: python3

exclude: "to_organize"
python: python3.12

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
@@ -20,32 +18,15 @@ repos:
- id: check-added-large-files
args: [--maxkb, "800"]

# python code formatting
- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
args: [--line-length, "110"]

# python import sorting
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black", "--filter-files", "-o", "wandb"]

- repo: https://github.com/PyCQA/autoflake
rev: v2.2.0
hooks:
- id: autoflake
args: [--in-place, --remove-all-unused-imports]

# python upgrading syntax to newer version
- repo: https://github.com/asottile/pyupgrade
rev: v3.10.1
# python code formatting, linting, and import sorting using ruff
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: pyupgrade
args: [--py310-plus]
# Run the formatter
- id: ruff-format
# Run the linter
- id: ruff
args: ["--fix", "--exit-non-zero-on-fix"]

# python docstring formatting
- repo: https://github.com/myint/docformatter
@@ -54,78 +35,55 @@
- id: docformatter
args: [--in-place, --wrap-summaries=110, --wrap-descriptions=110]

# python check (PEP8), programming errors and code complexity
- repo: https://github.com/PyCQA/flake8
rev: 6.1.0
hooks:
- id: flake8
args:
[
"--max-complexity=10",
"--extend-ignore",
"E402,E701,E251,E226,E302,W504,E704,E402,E401,C901,E203",
"--max-line-length=110",
"--exclude",
"logs/*,data/*",
"--per-file-ignores",
"__init__.py:F401",
]

# yaml formatting
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.3
rev: v4.0.0-alpha.8
hooks:
- id: prettier
types: [yaml]
exclude: "environment.yaml"

# shell scripts linter
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: v0.9.0.5
rev: v0.10.0.1
hooks:
- id: shellcheck

# md formatting
- repo: https://github.com/executablebooks/mdformat
rev: 0.7.17
rev: 0.7.22
hooks:
- id: mdformat
args: ["--number"]
additional_dependencies:
- mdformat-ruff
- mdformat-gfm
- mdformat-gfm-alerts
- mdformat-tables
- mdformat_frontmatter
- mdformat-myst
- mdformat-black
- mdformat-config
- mdformat-shfmt
- mdformat-toc

# word spelling linter
- repo: https://github.com/codespell-project/codespell
rev: v2.2.5
rev: v2.4.1
hooks:
- id: codespell
args:
- --skip=logs/**,data/**,*.ipynb,*.bib,env.yml,env_cpu.yml,*.svg,poetry.lock
- --ignore-words-list=ehr,nd
- --skip=*.ipynb,*.bib,*.svg,pyproject.toml,docs/source/usage.md
- --ignore-words-list=ehr,crate

# jupyter notebook cell output clearing
- repo: https://github.com/kynan/nbstripout
rev: 0.6.1
rev: 0.8.1
hooks:
- id: nbstripout

# jupyter notebook linting
# jupyter notebook linting with ruff
- repo: https://github.com/nbQA-dev/nbQA
rev: 1.7.0
rev: 1.9.1
hooks:
- id: nbqa-black
- id: nbqa-ruff
args: ["--fix"]
- id: nbqa-ruff-format
args: ["--line-length=110"]
- id: nbqa-isort
args: ["--profile=black"]
- id: nbqa-flake8
args:
[
"--extend-ignore=E203,E402,E501,F401,F841",
"--exclude=logs/*,data/*",
]
29 changes: 18 additions & 11 deletions README.md
@@ -7,7 +7,7 @@
<a href="https://pypi.org/project/es-aces/"><img alt="PyPI" src="https://img.shields.io/pypi/v/es-aces"></a>
<a href="https://hydra.cc/"><img alt="Hydra" src="https://img.shields.io/badge/Config-Hydra_1.3-89b8cd"></a>
<a href="https://codecov.io/gh/justin13601/ACES"><img alt="Codecov" src="https://codecov.io/gh/justin13601/ACES/graph/badge.svg?token=6EA84VFXOV"></a>
<a href="https://github.com/justin13601/ACES/actions/workflows/tests.yml"><img alt="Tests" src="https://github.com/justin13601/ACES/actions/workflows/tests.yml/badge.svg"></a>
<a href="https://github.com/justin13601/ACES/actions/workflows/tests.yaml"><img alt="Tests" src="https://github.com/justin13601/ACES/actions/workflows/tests.yaml/badge.svg"></a>
<a href="https://github.com/justin13601/ACES/actions/workflows/code-quality-main.yaml"><img alt="Code Quality" src="https://github.com/justin13601/ACES/actions/workflows/code-quality-main.yaml/badge.svg"></a>
<a href="https://eventstreamaces.readthedocs.io/en/latest/?badge=latest"><img alt="Documentation" src="https://readthedocs.org/projects/eventstreamaces/badge/?version=latest"/></a>
<a href="https://github.com/justin13601/ACES/graphs/contributors"><img alt="Contributors" src="https://img.shields.io/github/contributors/justin13601/ACES.svg"></a>
@@ -19,14 +19,14 @@

**Updates**

- **\[2025-01-22\]** ACES accepted to ICLR'25!
- **\[2024-12-10\]** Latest `polars` version (`1.17.1`) is now supported.
- **\[2024-10-28\]** Nested derived predicates and derived predicates between static variables and plain predicates can now be defined.
- **\[2024-09-01\]** Predicates can now be defined in a configuration file separate to task criteria files.
- **\[2024-08-29\]** Latest `MEDS` version (`0.3.3`) is now supported.
- **\[2024-08-10\]** Expanded predicates configuration language to support regular expressions, multi-column constraints, and multi-value constraints.
- **\[2024-07-30\]** Added ability to place constraints on static variables, such as patient demographics.
- **\[2024-06-28\]** Paper available at [arXiv:2406.19653](https://arxiv.org/abs/2406.19653).
- **[2025-01-22]** ACES accepted to ICLR'25!
- **[2024-12-10]** Latest `polars` version (`1.17.1`) is now supported.
- **[2024-10-28]** Nested derived predicates and derived predicates between static variables and plain predicates can now be defined.
- **[2024-09-01]** Predicates can now be defined in a configuration file separate to task criteria files.
- **[2024-08-29]** Latest `MEDS` version (`0.3.3`) is now supported.
- **[2024-08-10]** Expanded predicates configuration language to support regular expressions, multi-column constraints, and multi-value constraints.
- **[2024-07-30]** Added ability to place constraints on static variables, such as patient demographics.
- **[2024-06-28]** Paper available at [arXiv:2406.19653](https://arxiv.org/abs/2406.19653).

Automatic Cohort Extraction System (ACES) is a library that streamlines the extraction of task-specific cohorts from time series datasets formatted as event-streams, such as Electronic Health Records (EHR). ACES is designed to query these EHR datasets for valid subjects, guided by various constraints and requirements defined in a YAML task configuration file. This offers a powerful and user-friendly solution to researchers and developers. The use of a human-readable YAML configuration file also eliminates the need for users to be proficient in complex dataframe querying, making the extraction process accessible to a broader audience.

@@ -60,7 +60,9 @@ Install with dependencies from the root directory of the cloned repo:
pip install -e .
```

**Note**: To avoid potential dependency conflicts, please install ESGPT first before installing ACES. This ensures compatibility with the `polars` version required by ACES.
> [!NOTE]
> To avoid potential dependency conflicts, please install ESGPT first before installing ACES. This ensures
> compatibility with the `polars` version required by ACES.

## Instructions for Use

@@ -229,7 +231,12 @@ Fields for a "plain" predicate:
- `value_max_inclusive` (optional): Must be a boolean specifying whether `value_max` is inclusive or not.
- `other_cols` (optional): Must be a 1-to-1 dictionary of column name and column value, which places additional constraints on further columns.

**Note**: For memory optimization, we strongly recommend using either the List of Values or Regular Expression formats whenever possible, especially when needing to match multiple values. Defining each code as an individual string will increase memory usage significantly, as each code generates a separate predicate column. Using a list or regex consolidates multiple matching codes under a single column, reducing the overall memory footprint.
> [!NOTE]
> For memory optimization, we strongly recommend using either the List of Values or Regular Expression formats
> whenever possible, especially when needing to match multiple values. Defining each code as an individual
> string will increase memory usage significantly, as each code generates a separate predicate column. Using a
> list or regex consolidates multiple matching codes under a single column, reducing the overall memory
> footprint.
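
To make the recommendation above concrete, here is a sketch of a predicates block using the Regular Expression and List of Values formats. The predicate names and codes below are hypothetical, not taken from this repository:

```yaml
predicates:
  # Regular Expression format: one predicate column covers every matching code
  diabetes_dx:
    code:
      regex: "^ICD10CM:E11.*"
  # List of Values format: multiple codes consolidated under a single column
  high_glucose:
    code:
      any:
        - LAB//GLUCOSE//HIGH
        - LAB//GLUCOSE//CRITICAL
```

Defining each of these codes as its own plain predicate would instead generate one predicate column per code, with the memory cost described above.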

#### Derived Predicates

32 changes: 32 additions & 0 deletions conftest.py
@@ -0,0 +1,32 @@
"""Test set-up and fixtures code."""

import json
import sys
import tempfile
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch

import polars as pl
import pytest
import yaml


@pytest.fixture(autouse=True)
def _setup_doctest_namespace(doctest_namespace: dict[str, Any], caplog: pytest.LogCaptureFixture) -> None:
doctest_namespace.update(
{
"caplog": caplog,
"MagicMock": MagicMock,
"sys": sys,
"Path": Path,
"patch": patch,
"json": json,
"pl": pl,
"datetime": datetime,
"timedelta": timedelta,
"tempfile": tempfile,
"yaml": yaml,
}
)
4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -4,6 +4,8 @@
import sys
from pathlib import Path

from sphinx.ext import apidoc

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
@@ -55,8 +57,6 @@ def ensure_pandoc_installed(_):

# TODO: use https://github.com/sphinx-extensions2/sphinx-autodoc2

from sphinx.ext import apidoc

output_dir = __location__ / "api"
module_dir = __src__ / "src/aces"
if output_dir.is_dir():
54 changes: 30 additions & 24 deletions docs/source/configuration.md
@@ -50,9 +50,10 @@ These configs consist of the following four fields:
expression (satisfied if the regular expression evaluates to True), or a `any` key and the value being a
list of strings (satisfied if there is an occurrence for any code in the list).

**Note**: Each individual definition of `PlainPredicateConfig` and `code` will generate a separate predicate
column. Thus, for memory optimization, it is strongly recommended to match multiple values using either the
List of Values or Regular Expression formats whenever possible.
> [!NOTE]
> Each individual definition of `PlainPredicateConfig` and `code` will generate a separate predicate
> column. Thus, for memory optimization, it is strongly recommended to match multiple values using either
> the List of Values or Regular Expression formats whenever possible.

- `value_min`: If specified, an observation will only satisfy this predicate if the occurrence of the
underlying `code` with a reported numerical value that is either greater than or greater than or equal to
@@ -82,10 +83,11 @@ on its source format.
(recommended), then the `code` will be checked directly against MEDS' `code` field and the `value_min`
and `value_max` constraints will be compared against MEDS' `numeric_value` field.

**Note**: This syntax does not currently support defining predicates that also rely on matching other,
optional fields in the MEDS syntax; if this is a desired feature for you, please let us know by filing a
GitHub issue or pull request or upvoting any existing issue/PR that requests/implements this feature,
and we will add support for this capability.
> [!NOTE]
> This syntax does not currently support defining predicates that also rely on matching other, optional
> fields in the MEDS syntax; if this is a desired feature for you, please let us know by filing a GitHub
> issue or pull request or upvoting any existing issue/PR that requests/implements this feature, and we
> will add support for this capability.

2. If the source data is in [ESGPT](https://eventstreamml.readthedocs.io/en/latest/) format, then the
`code` will be interpreted in the following manner:
@@ -109,8 +111,9 @@ accepted operations that can be applied to other predicates, containing precisely
- `and(pred_1_name, pred_2_name, ...)`: Asserts that all of the specified predicates must be true.
- `or(pred_1_name, pred_2_name, ...)`: Asserts that any of the specified predicates must be true.

**Note**: Currently, `and`'s and `or`'s cannot be nested. Upon user request, we may support further advanced
analytic operations over predicates.
> [!NOTE]
> Currently, `and`'s and `or`'s cannot be nested. Upon user request, we may support further advanced
> analytic operations over predicates.
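
A sketch of how these operations might look in a configuration file, assuming hypothetical component predicates named `diabetes` and `insulin_rx`, and assuming the derived expression lives under an `expr` key as in the ACES examples:

```yaml
predicates:
  diabetes:
    code:
      regex: "^ICD10CM:E11.*"   # hypothetical code pattern
  insulin_rx:
    code:
      regex: "^RX:insulin.*"    # hypothetical code pattern
  # derived predicate: true only when both component predicates are true
  diabetes_on_insulin:
    expr: and(diabetes, insulin_rx)
```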

______________________________________________________________________

@@ -153,20 +156,22 @@ following rules:
exactly `$TIME_DELTA` either after or before the event being referenced (either the external event or the
end or start of the window).

**Note**: If `$REFERENCED` is the `start` field, then `$TIME_DELTA` must be positive, and if
`$REFERENCED` is the `end` field, then `$TIME_DELTA` must be negative to preserve the time ordering of
the window fields.
> [!NOTE]
> If `$REFERENCED` is the `start` field, then `$TIME_DELTA` must be positive, and if
> `$REFERENCED` is the `end` field, then `$TIME_DELTA` must be negative to preserve the time ordering of
> the window fields.

2. `$REFERENCING = $REFERENCED -> $PREDICATE`, `$REFERENCING = $REFERENCED <- $PREDICATE`
In this case, the referencing event will be defined as the next or previous event satisfying the
predicate, `$PREDICATE`.

**Note**: If the `$REFERENCED` is the `start` field, then the "next predicate
ordering" (`$REFERENCED -> $PREDICATE`) must be used, and if the `$REFERENCED` is the `end` field, then the
"previous predicate ordering" (`$REFERENCED <- $PREDICATE`) must be used to preserve the time ordering of
the window fields. These forms can lead to windows being defined as single point events, if the
`$REFERENCED` event itself satisfies `$PREDICATE` and the appropriate constraints are satisfied and
inclusive values are set.
> [!NOTE]
> If the `$REFERENCED` is the `start` field, then the "next predicate
> ordering" (`$REFERENCED -> $PREDICATE`) must be used, and if the `$REFERENCED` is the `end` field, then
> the "previous predicate ordering" (`$REFERENCED <- $PREDICATE`) must be used to preserve the time
> ordering of the window fields. These forms can lead to windows being defined as single point events, if
> the `$REFERENCED` event itself satisfies `$PREDICATE` and the appropriate constraints are satisfied and
> inclusive values are set.

3. `$REFERENCING = $REFERENCED`
In this case, the referencing event will be defined as the same event as the referenced event.
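
The three reference forms above might appear together in a windows block as follows; the window and predicate names are hypothetical, and the cross-window `input.end` reference is assumed from the ACES examples:

```yaml
windows:
  input:
    start: trigger           # rule 3: same event as the referenced (trigger) event
    end: start + 24h         # rule 1: positive time delta relative to start
  target:
    start: input.end
    end: start -> discharge  # rule 2: next event satisfying the predicate
```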
@@ -196,9 +201,10 @@ that define the valid range the count of observations of the named predicate tha
for it to be considered valid. Either `min_valid` or `max_valid` constraints can be `None`, in which case
those endpoints are left unconstrained. Likewise, unreferenced predicates are also left unconstrained.

**Note**: As predicate counts are always integral, this specification does not need an additional
inclusive/exclusive endpoint field, as one can simply increment the bound by one in the appropriate direction
to achieve the result. Instead, this bound is always interpreted to be inclusive, so a window would satisfy
the constraint for predicate `name` with constraint `name: (1, 2)` if the count of observations of predicate
`name` in a window was either 1 or 2. All constraints in the dictionary must be satisfied on a window for it
to be included.
> [!NOTE]
> As predicate counts are always integral, this specification does not need an additional
> inclusive/exclusive endpoint field, as one can simply increment the bound by one in the appropriate direction
> to achieve the result. Instead, this bound is always interpreted to be inclusive, so a window would satisfy
> the constraint for predicate `name` with constraint `name: (1, 2)` if the count of observations of predicate
> `name` in a window was either 1 or 2. All constraints in the dictionary must be satisfied on a window for it
> to be included.
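
A sketch of such a constraint dictionary, assuming it is given under a `has` key as in the ACES task examples (window and predicate names hypothetical):

```yaml
windows:
  observation:
    start: trigger
    end: start + 48h
    has:
      lab: (1, None)    # at least one lab observation; upper bound unconstrained
      death: (None, 0)  # inclusive max of 0, i.e. no death events in the window
```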
3 changes: 0 additions & 3 deletions docs/source/notebooks/examples.ipynb
@@ -33,9 +33,6 @@
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"import yaml\n",
"from bigtree import print_tree\n",
"\n",
"from aces import config"
2 changes: 1 addition & 1 deletion docs/source/notebooks/tutorial_esgpt.ipynb
@@ -86,7 +86,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open(config_path, \"r\") as stream:\n",
"with open(config_path) as stream:\n",
" data_loaded = yaml.safe_load(stream)\n",
" print(json.dumps(data_loaded, indent=4))"
]
3 changes: 1 addition & 2 deletions docs/source/notebooks/tutorial_meds.ipynb
@@ -30,7 +30,6 @@
"outputs": [],
"source": [
"import json\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"import yaml\n",
@@ -88,7 +87,7 @@
"metadata": {},
"outputs": [],
"source": [
"with open(config_path, \"r\") as stream:\n",
"with open(config_path) as stream:\n",
" data_loaded = yaml.safe_load(stream)\n",
" print(json.dumps(data_loaded, indent=4))"
]