NatLabRockies · iankchristie · Jun 1, 2026 · May 28, 2026 · May 29, 2026
@@ -10,6 +10,8 @@ Command Line Interfaces (CLIs)
    reVeal characterize
    reVeal normalize
    reVeal score-weighted
+   reVeal learn-weights
+   reVeal analyze-features
    reVeal pipeline
    reVeal batch
    reVeal script

@@ -0,0 +1,3 @@
+.. click:: reVeal.cli.analyze_features:main
+   :prog: reVeal analyze-features
+   :nested: full
@@ -0,0 +1,3 @@
+.. click:: reVeal.cli.learn_weights:main
+   :prog: reVeal learn-weights
+   :nested: full
@@ -16,6 +16,7 @@ dependencies:
     - scipy>=1.14.0,<2
     - tqdm>=4.67.1,<5
     - exactextract>=0.2.2,<1
+    - matplotlib>=3.8.0,<4
     - optuna>=4.0.0,<5
     - pydantic>=2.12.5,<3
     - libpysal>=4.13.0,<5

@@ -35,6 +35,7 @@ dependencies = [
   "gdal>=3.10.3,<4",
   "joblib>=1.4.0,<2",
   "libpysal>=4.13.0,<5",
+  "matplotlib>=3.8.0,<4",
   "NLR-GAPs>=0.9.1,<1",
   "NLR-rex>=0.5.0,<1",
   "optuna>=4.0.0,<5",

@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+"""
+cli.analyze_features module - Sets up analyze-features command for use with
+NLR-GAPs CLI.
+"""
+import json
+import logging
+from pathlib import Path
+from pathlib import Path
+
+import geopandas as gpd
+from pydantic import ValidationError
+from gaps.cli import as_click_command, CLICommandFromFunction
+
+from reVeal.config.analyze_features import AnalyzeFeaturesConfig
+from reVeal.feature_analysis import (
+    compute_correlation_matrix,
+    compute_feature_clusters,
+    save_analysis_outputs,
+    suggest_exclusions,
+)
+from reVeal.log import get_logger, remove_streamhandlers
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _log_inputs(config):
+    """
+    Log the full user configuration at INFO level.
+
+    Parameters
+    ----------
+    config : dict
+        Configuration dictionary. It is dumped as indented JSON; values that
+        JSON cannot serialize natively are rendered with ``str``.
+    """
+    serialized = json.dumps(config, indent=4, default=str)
+    LOGGER.info(f"Inputs config: {serialized}")
+
+
+def _preprocessor(config, job_name, log_directory, verbose):
+    """
+    Set up logging, validate the user configuration, and flag local runs.
+
+    Parameters
+    ----------
+    config : dict
+        User configuration file input as (nested) dict.
+    job_name : str
+        Name of job being run. Used as the stem of the log file name.
+    log_directory : Path
+        Path to log directory.
+    verbose : bool
+        If True, log at DEBUG level; otherwise log at INFO level.
+
+    Returns
+    -------
+    dict
+        The input dictionary, updated in place with a ``_local`` key that is
+        True when ``execution_control.option`` is "local" or is not given.
+
+    Raises
+    ------
+    pydantic.ValidationError
+        If the configuration fails ``AnalyzeFeaturesConfig`` validation.
+    """
+    log_file = log_directory / f"{job_name}.log"
+    get_logger(
+        __name__, log_level="DEBUG" if verbose else "INFO", out_path=log_file
+    )
+
+    LOGGER.info("Validating input configuration file")
+    try:
+        # Only keys declared on AnalyzeFeaturesConfig are validated; any
+        # other keys present in the config are not passed to the model.
+        model_inputs = {}
+        for field_name in AnalyzeFeaturesConfig.model_fields:
+            if field_name in config:
+                model_inputs[field_name] = config[field_name]
+        AnalyzeFeaturesConfig(**model_inputs)
+    except ValidationError as err:
+        LOGGER.error(
+            "Configuration did not pass validation. "
+            f"The following issues were identified:\n{err}"
+        )
+        raise err
+    LOGGER.info("Input configuration file is valid.")
+
+    # A missing execution_control section or option counts as a local run.
+    execution_control = config.get("execution_control", {})
+    config["_local"] = execution_control.get("option", "local") == "local"
+    _log_inputs(config)
+
+    return config
+
+
+def _resolve_features(columns, attributes=None, exclude_attributes=None):
+    """
+    Select the grid columns to analyze.
+
+    Parameters
+    ----------
+    columns : iterable of str
+        Column names available in the grid.
+    attributes : list of str, optional
+        Explicit feature columns. When given, they are returned as-is.
+    exclude_attributes : list of str, optional
+        Columns to drop from the auto-detected ``*_score`` columns.
+
+    Returns
+    -------
+    list of str
+        Names of the columns to use as features. May be empty.
+    """
+    if attributes is not None:
+        return list(attributes)
+
+    score_cols = [c for c in columns if c.endswith("_score")]
+    if exclude_attributes is None:
+        LOGGER.info(f"Auto-detected {len(score_cols)} score attributes.")
+        return score_cols
+
+    exclude_set = set(exclude_attributes)
+    features = [c for c in score_cols if c not in exclude_set]
+
+    # Names that match no score column have no effect; surface them so a
+    # typo in the config does not go unnoticed.
+    unmatched = sorted(exclude_set.difference(score_cols))
+    if unmatched:
+        LOGGER.warning(
+            "The following exclude_attributes do not match any score column "
+            f"and were ignored: {unmatched}"
+        )
+
+    # Report the number of columns actually removed, not the number of
+    # names requested.
+    LOGGER.info(
+        f"Excluded {len(score_cols) - len(features)} attributes, "
+        f"using {len(features)} of {len(score_cols)} score columns."
+    )
+    return features
+
+
+def run(
+    grid,
+    out_dir,
+    attributes=None,
+    exclude_attributes=None,
+    correlation_method="spearman",
+    cluster_threshold=0.7,
+    _local=True,
+):
+    """
+    Analyze feature correlations and clusters in a normalized grid.
+
+    Computes a correlation matrix, performs hierarchical clustering, generates
+    dendrogram and heatmap plots, and suggests redundant features for exclusion.
+
+    Parameters
+    ----------
+    grid : str
+        Path to the normalized grid (output of ``reVeal normalize``). Must be a
+        vector dataset readable by pyogrio with numeric ``*_score`` columns.
+    out_dir : str
+        Output directory. Analysis artifacts will be saved to an ``analysis/``
+        subdirectory.
+    attributes : list of str, optional
+        List of column names from the grid to use as features. If not specified,
+        all columns ending with ``_score`` are used automatically.
+    exclude_attributes : list of str, optional
+        Score columns to exclude from auto-detected features. All ``*_score``
+        columns except those listed will be used. Mutually exclusive with
+        ``attributes``.
+    correlation_method : str, optional
+        Correlation method: 'spearman' or 'pearson'. Default is 'spearman'.
+    cluster_threshold : float, optional
+        Distance threshold for hierarchical clustering. Lower values produce
+        more clusters (features must be more similar to cluster together).
+        Default is 0.7.
+    _local : bool
+        Flag indicating local vs HPC execution. Not user-provided.
+
+    Raises
+    ------
+    ValueError
+        If no feature columns can be resolved from the inputs and the grid.
+    """
+    if _local:
+        remove_streamhandlers(LOGGER.parent)
+
+    config = AnalyzeFeaturesConfig(
+        grid=grid,
+        attributes=attributes,
+        exclude_attributes=exclude_attributes,
+        correlation_method=correlation_method,
+        cluster_threshold=cluster_threshold,
+    )
+
+    # Read grid
+    LOGGER.info(f"Reading grid from {config.grid}...")
+    grid_df = gpd.read_file(config.grid, engine="pyogrio", use_arrow=True)
+    LOGGER.info(
+        f"Grid loaded: {len(grid_df):,} cells, {len(grid_df.columns)} columns."
+    )
+
+    # Resolve attributes
+    features = _resolve_features(
+        grid_df.columns, config.attributes, config.exclude_attributes
+    )
+    if not features:
+        raise ValueError(
+            "No attributes found. Provide 'attributes' in the config or ensure "
+            "the grid has columns ending with '_score'."
+        )
+
+    LOGGER.info(
+        f"Starting feature analysis ({config.correlation_method} correlation, "
+        f"{len(features)} features, {len(grid_df):,} grid cells)..."
+    )
+
+    # Compute correlation matrix. Missing values are replaced with 0 only in
+    # the analyzed feature columns, leaving geometry and any other columns of
+    # the grid untouched.
+    X = grid_df[features].fillna(0).to_numpy()
+    LOGGER.info(
+        f"Computing {config.correlation_method} correlation matrix "
+        f"({len(grid_df):,} samples x {len(features)} features)... "
+        f"this may take a few minutes for large grids."
+    )
+    corr_matrix = compute_correlation_matrix(
+        X, features, method=config.correlation_method
+    )
+    LOGGER.info("Correlation matrix computed. Computing feature clusters...")
+
+    # Compute clusters
+    cluster_result = compute_feature_clusters(
+        corr_matrix, threshold=config.cluster_threshold
+    )
+    n_clusters = len(cluster_result["clusters"])
+    LOGGER.info(f"Feature analysis complete. Found {n_clusters} clusters.")
+
+    # Suggest exclusions
+    suggested = suggest_exclusions(
+        clusters=cluster_result["clusters"],
+        corr_matrix=corr_matrix,
+    )
+
+    # Save outputs
+    analysis_dir = Path(out_dir) / "analysis"
+    LOGGER.info(f"Saving analysis outputs to {analysis_dir}...")
+
+    save_analysis_outputs(
+        corr_matrix=corr_matrix,
+        cluster_result=cluster_result,
+        out_dir=analysis_dir,
+    )
+
+    # Do not rely on save_analysis_outputs having created the directory.
+    analysis_dir.mkdir(parents=True, exist_ok=True)
+    exclusions_out = analysis_dir / "suggested_exclusions.json"
+    with open(exclusions_out, "w", encoding="utf-8") as f:
+        json.dump(suggested, f, indent=2)
+
+    LOGGER.info(f"Analysis complete. Outputs saved to {analysis_dir}")
+
+
+# GAPs command built from ``run``, with ``_preprocessor`` registered as its
+# config preprocessor and no collect step.
+analyze_features_cmd = CLICommandFromFunction(
+    name="analyze-features",
+    function=run,
+    config_preprocessor=_preprocessor,
+    add_collect=False,
+)
+
+# Click entry point for the command.
+main = as_click_command(analyze_features_cmd)
+
+
+if __name__ == "__main__":
+    try:
+        main(obj={})
+    except Exception:
+        # Record the traceback in the log before propagating the failure.
+        LOGGER.exception("Error running reVeal analyze-features command.")
+        raise
@@ -10,11 +10,12 @@
 from reVeal.cli.score_weighted import score_weighted_cmd
 from reVeal.cli.downscale import downscale_cmd
 from reVeal.cli.learn_weights import learn_weights_cmd
+from reVeal.cli.analyze_features import analyze_features_cmd
 
 logger = logging.getLogger(__name__)
 
 commands = [characterize_cmd, normalize_cmd, score_weighted_cmd, downscale_cmd,
-            learn_weights_cmd]
+            learn_weights_cmd, analyze_features_cmd]
 main = make_cli(commands, info={"name": "reVeal", "version": __version__})
 
 # export GAPs commands to namespace for documentation

@@ -83,6 +83,7 @@ def run(
     labels,
     out_dir,
     attributes=None,
+    exclude_attributes=None,
     n_estimators=500,
     class_prior=None,
     background_samples=10000,
@@ -120,6 +121,10 @@ def run(
     attributes : list of str, optional
         List of column names from the grid to use as features. If not specified,
         all columns ending with ``_score`` are used automatically.
+    exclude_attributes : list of str, optional
+        Score columns to exclude from auto-detected features. All ``*_score``
+        columns except those listed will be used. Mutually exclusive with
+        ``attributes``.
     n_estimators : int, optional
         Number of trees in the PUExtraTrees forest. Default is 500.
     class_prior : float, optional
@@ -161,6 +166,7 @@ def run(
         grid=grid,
         labels=labels,
         attributes=attributes,
+        exclude_attributes=exclude_attributes,
         n_estimators=n_estimators,
         class_prior=class_prior,
         background_samples=background_samples,
@@ -183,7 +189,7 @@ def run(
 
     # Save score-weighted config
     config_out = out_path / "config_score_weighted.json"
-    LOGGER.info(f"Saving score-weighted config to {config_out}...")
+    LOGGER.info(f"Saving score-weighted config to {config_out}")
     with open(config_out, "w") as f:
         json.dump(results["config"], f, indent=2)
 

@@ -0,0 +1,33 @@
+"""
+config.analyze_features module - Configuration for analyze-features command.
+"""
+from typing import List, Literal, Optional
+
+from pydantic import Field, model_validator
+from typing_extensions import Annotated
+
+from reVeal.config.config import BaseGridConfig
+
+
+class AnalyzeFeaturesConfig(BaseGridConfig):
+    """
+    Validated inputs for the analyze-features command.
+
+    Extends ``BaseGridConfig`` with the options that control feature
+    selection, the correlation method, and the clustering threshold.
+    ``attributes`` and ``exclude_attributes`` are mutually exclusive.
+    """
+
+    # Explicit feature columns to use.
+    attributes: Optional[List[str]] = None
+    # Columns to leave out of the feature set.
+    exclude_attributes: Optional[List[str]] = None
+    # Correlation method; only these two values are accepted.
+    correlation_method: Literal["spearman", "pearson"] = "spearman"
+    # Clustering threshold; must satisfy 0 < value <= 2.
+    cluster_threshold: Annotated[float, Field(gt=0, le=2)] = 0.7
+
+    @model_validator(mode="after")
+    def _validate_attribute_options(self):
+        """Reject configurations that set both attribute selection options."""
+        if self.attributes is None or self.exclude_attributes is None:
+            return self
+        raise ValueError(
+            "Only one of 'attributes' or "
+            "'exclude_attributes' can be specified."
+        )