Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/_cli/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Command Line Interfaces (CLIs)
reVeal characterize
reVeal normalize
reVeal score-weighted
reVeal learn-weights
reVeal analyze-features
reVeal pipeline
reVeal batch
reVeal script
Expand Down
3 changes: 3 additions & 0 deletions docs/source/_cli/reVeal analyze-features.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.. click:: reVeal.cli.analyze_features:main
:prog: reVeal analyze-features
:nested: full
3 changes: 3 additions & 0 deletions docs/source/_cli/reVeal learn-weights.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.. click:: reVeal.cli.learn_weights:main
:prog: reVeal learn-weights
:nested: full
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- scipy>=1.14.0,<2
- tqdm>=4.67.1,<5
- exactextract>=0.2.2,<1
- matplotlib>=3.8.0,<4
- optuna>=4.0.0,<5
- pydantic>=2.12.5,<3
- libpysal>=4.13.0,<5
Expand Down
5 changes: 3 additions & 2 deletions pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"gdal>=3.10.3,<4",
"joblib>=1.4.0,<2",
"libpysal>=4.13.0,<5",
"matplotlib>=3.8.0,<4",
"NLR-GAPs>=0.9.1,<1",
"NLR-rex>=0.5.0,<1",
"optuna>=4.0.0,<5",
Expand Down
233 changes: 233 additions & 0 deletions reVeal/cli/analyze_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
"""
cli.analyze_features module - Sets up analyze-features command for use with
NLR-GAPs CLI.
"""
import json
import logging
from pathlib import Path
from pathlib import Path

import geopandas as gpd
from pydantic import ValidationError
from gaps.cli import as_click_command, CLICommandFromFunction

from reVeal.config.analyze_features import AnalyzeFeaturesConfig
from reVeal.feature_analysis import (
compute_correlation_matrix,
compute_feature_clusters,
save_analysis_outputs,
suggest_exclusions,
)
from reVeal.log import get_logger, remove_streamhandlers

LOGGER = logging.getLogger(__name__)


def _log_inputs(config):
    """
    Log the user-supplied configuration as pretty-printed JSON.

    Parameters
    ----------
    config : dict
        Configuration dictionary. Values that are not natively JSON
        serializable are rendered with ``str``.
    """
    rendered = json.dumps(config, indent=4, default=str)
    LOGGER.info(f"Inputs config: {rendered}")


def _preprocessor(config, job_name, log_directory, verbose):
    """
    Configure logging, validate the user configuration, and flag local runs.

    Parameters
    ----------
    config : dict
        User configuration file input as (nested) dict.
    job_name : str
        Name of job being run. Used to name the log file.
    log_directory : Path
        Path to log directory.
    verbose : bool
        If True, log at DEBUG level; otherwise log at INFO level.

    Returns
    -------
    dict
        The input configuration, updated in place with a ``_local`` key
        that is True when ``execution_control.option`` is ``"local"`` (or
        is not specified).

    Raises
    ------
    pydantic.ValidationError
        If the configuration keys recognized by ``AnalyzeFeaturesConfig``
        do not pass validation. The error is logged before being re-raised.
    """
    log_file = log_directory / f"{job_name}.log"
    get_logger(
        __name__,
        log_level="DEBUG" if verbose else "INFO",
        out_path=log_file,
    )

    LOGGER.info("Validating input configuration file")
    try:
        # Only pass along the keys the config model knows about; other keys
        # present in the user config are not validated here.
        known_fields = AnalyzeFeaturesConfig.model_fields
        AnalyzeFeaturesConfig(
            **{key: config.get(key) for key in known_fields if key in config}
        )
    except ValidationError as e:
        LOGGER.error(
            "Configuration did not pass validation. "
            f"The following issues were identified:\n{e}"
        )
        raise e
    LOGGER.info("Input configuration file is valid.")

    execution_option = config.get("execution_control", {}).get("option", "local")
    config["_local"] = execution_option == "local"
    _log_inputs(config)

    return config


def run(
    grid,
    out_dir,
    attributes=None,
    exclude_attributes=None,
    correlation_method="spearman",
    cluster_threshold=0.7,
    _local=True,
):
    """
    Analyze feature correlations and clusters in a normalized grid.

    Computes a correlation matrix, performs hierarchical clustering, generates
    dendrogram and heatmap plots, and suggests redundant features for exclusion.

    Parameters
    ----------
    grid : str
        Path to the normalized grid (output of ``reVeal normalize``). Must be a
        vector dataset readable by pyogrio with numeric ``*_score`` columns.
    out_dir : str
        Output directory. Analysis artifacts will be saved to an ``analysis/``
        subdirectory.
    attributes : list of str, optional
        List of column names from the grid to use as features. If not specified,
        all columns ending with ``_score`` are used automatically.
    exclude_attributes : list of str, optional
        Score columns to exclude from auto-detected features. All ``*_score``
        columns except those listed will be used. Mutually exclusive with
        ``attributes``.
    correlation_method : str, optional
        Correlation method: 'spearman' or 'pearson'. Default is 'spearman'.
    cluster_threshold : float, optional
        Distance threshold for hierarchical clustering. Lower values produce
        more clusters (features must be more similar to cluster together).
        Default is 0.7.
    _local : bool
        Flag indicating local vs HPC execution. Not user-provided.

    Raises
    ------
    ValueError
        If no feature columns can be resolved from the inputs.
    KeyError
        If any resolved feature column is not present in the grid.
    """
    if _local:
        remove_streamhandlers(LOGGER.parent)

    config = AnalyzeFeaturesConfig(
        grid=grid,
        attributes=attributes,
        exclude_attributes=exclude_attributes,
        correlation_method=correlation_method,
        cluster_threshold=cluster_threshold,
    )

    # Read grid
    LOGGER.info(f"Reading grid from {config.grid}...")
    grid_df = gpd.read_file(config.grid, engine="pyogrio", use_arrow=True)
    LOGGER.info(f"Grid loaded: {len(grid_df):,} cells, {len(grid_df.columns)} columns.")

    # Resolve attributes
    if config.attributes is not None:
        features = config.attributes
    elif config.exclude_attributes is not None:
        all_score_cols = [c for c in grid_df.columns if c.endswith("_score")]
        exclude_set = set(config.exclude_attributes)
        features = [c for c in all_score_cols if c not in exclude_set]
        LOGGER.info(
            f"Excluded {len(exclude_set)} attributes, "
            f"using {len(features)} of {len(all_score_cols)} score columns."
        )
    else:
        features = [c for c in grid_df.columns if c.endswith("_score")]
        LOGGER.info(f"Auto-detected {len(features)} score attributes.")

    if not features:
        raise ValueError(
            "No attributes found. Provide 'attributes' in the config or ensure "
            "the grid has columns ending with '_score'."
        )

    # Fail early, with the offending names, rather than relying on the less
    # descriptive KeyError raised by the column lookup further down.
    grid_columns = set(grid_df.columns)
    missing = [c for c in features if c not in grid_columns]
    if missing:
        raise KeyError(
            f"Attributes not found in grid {config.grid}: {missing}"
        )

    LOGGER.info(
        f"Starting feature analysis ({config.correlation_method} correlation, "
        f"{len(features)} features, {len(grid_df):,} grid cells)..."
    )

    # Compute correlation matrix. Missing values are replaced with 0 only in
    # the selected feature columns, so non-feature columns (e.g., geometry)
    # are never asked to hold a numeric fill value.
    X = grid_df[features].fillna(0).to_numpy()
    LOGGER.info(
        f"Computing {config.correlation_method} correlation matrix "
        f"({len(grid_df):,} samples x {len(features)} features)... "
        f"this may take a few minutes for large grids."
    )
    corr_matrix = compute_correlation_matrix(
        X, features, method=config.correlation_method
    )
    LOGGER.info("Correlation matrix computed. Computing feature clusters...")

    # Compute clusters
    cluster_result = compute_feature_clusters(
        corr_matrix, threshold=config.cluster_threshold
    )
    n_clusters = len(cluster_result["clusters"])
    LOGGER.info(f"Feature analysis complete. Found {n_clusters} clusters.")

    # Suggest exclusions
    suggested = suggest_exclusions(
        clusters=cluster_result["clusters"],
        corr_matrix=corr_matrix,
    )

    # Save outputs
    out_path = Path(out_dir)
    analysis_dir = out_path / "analysis"
    LOGGER.info(f"Saving analysis outputs to {analysis_dir}...")

    save_analysis_outputs(
        corr_matrix=corr_matrix,
        cluster_result=cluster_result,
        out_dir=analysis_dir,
    )

    # Do not depend on save_analysis_outputs having created the directory;
    # this is a no-op when it already exists.
    analysis_dir.mkdir(parents=True, exist_ok=True)
    exclusions_out = analysis_dir / "suggested_exclusions.json"
    with open(exclusions_out, "w", encoding="utf-8") as f:
        json.dump(suggested, f, indent=2)

    LOGGER.info(f"Analysis complete. Outputs saved to {analysis_dir}")


# GAPs command definition wrapping ``run``; the user config is passed through
# ``_preprocessor`` first and no collect step is added.
analyze_features_cmd = CLICommandFromFunction(
    name="analyze-features",
    function=run,
    config_preprocessor=_preprocessor,
    add_collect=False,
)

# Click entry point built from the GAPs command definition.
main = as_click_command(analyze_features_cmd)


if __name__ == "__main__":
    try:
        main(obj={})
    except Exception:
        # Record the traceback in the log, then let the error propagate.
        LOGGER.exception("Error running reVeal analyze-features command.")
        raise
3 changes: 2 additions & 1 deletion reVeal/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
from reVeal.cli.score_weighted import score_weighted_cmd
from reVeal.cli.downscale import downscale_cmd
from reVeal.cli.learn_weights import learn_weights_cmd
from reVeal.cli.analyze_features import analyze_features_cmd

logger = logging.getLogger(__name__)

commands = [characterize_cmd, normalize_cmd, score_weighted_cmd, downscale_cmd,
learn_weights_cmd]
learn_weights_cmd, analyze_features_cmd]
main = make_cli(commands, info={"name": "reVeal", "version": __version__})

# export GAPs commands to namespace for documentation
Expand Down
8 changes: 7 additions & 1 deletion reVeal/cli/learn_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def run(
labels,
out_dir,
attributes=None,
exclude_attributes=None,
n_estimators=500,
class_prior=None,
background_samples=10000,
Expand Down Expand Up @@ -120,6 +121,10 @@ def run(
attributes : list of str, optional
List of column names from the grid to use as features. If not specified,
all columns ending with ``_score`` are used automatically.
exclude_attributes : list of str, optional
Score columns to exclude from auto-detected features. All ``*_score``
columns except those listed will be used. Mutually exclusive with
``attributes``.
n_estimators : int, optional
Number of trees in the PUExtraTrees forest. Default is 500.
class_prior : float, optional
Expand Down Expand Up @@ -161,6 +166,7 @@ def run(
grid=grid,
labels=labels,
attributes=attributes,
exclude_attributes=exclude_attributes,
n_estimators=n_estimators,
class_prior=class_prior,
background_samples=background_samples,
Expand All @@ -183,7 +189,7 @@ def run(

# Save score-weighted config
config_out = out_path / "config_score_weighted.json"
LOGGER.info(f"Saving score-weighted config to {config_out}...")
LOGGER.info(f"Saving score-weighted config to {config_out}")
with open(config_out, "w") as f:
json.dump(results["config"], f, indent=2)

Expand Down
33 changes: 33 additions & 0 deletions reVeal/config/analyze_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
config.analyze_features module - Configuration for analyze-features command.
"""
from typing import List, Literal, Optional

from pydantic import Field, model_validator
from typing_extensions import Annotated

from reVeal.config.config import BaseGridConfig


class AnalyzeFeaturesConfig(BaseGridConfig):
    """
    Validated inputs for the analyze-features command.

    Extends ``BaseGridConfig`` with the options that control feature
    selection, the correlation method, and the clustering threshold used
    for feature correlation analysis on a normalized grid.
    """

    # Explicit list of feature columns; None means none were specified.
    attributes: Optional[List[str]] = None
    # Columns to leave out; cannot be combined with ``attributes``.
    exclude_attributes: Optional[List[str]] = None
    # Only the two literal values below are accepted.
    correlation_method: Literal["spearman", "pearson"] = "spearman"
    # Must be greater than 0 and no larger than 2.
    cluster_threshold: Annotated[float, Field(gt=0, le=2)] = 0.7

    @model_validator(mode="after")
    def _validate_attribute_options(self):
        """Reject configs that set both 'attributes' and 'exclude_attributes'."""
        if self.attributes is None or self.exclude_attributes is None:
            return self
        raise ValueError(
            "Only one of 'attributes' or 'exclude_attributes' "
            "can be specified."
        )
Loading
Loading