Skip to content

Commit bee7dfa

Browse files
authored
Add unit tests for python.comps module (#363)
We found in #362 that the comps pipeline was not behaving the way we expected. This problem was made worse by the fact that we didn't have any tests for the comps pipeline, so we didn't have any assurances that our assumptions about the behavior of the pipeline were true, or that they would remain unchanged as we change the comps code in subsequent PRs. This PR adds a simple unit test suite for the `python.comps` module as a first step towards locking down the behavior of the comps pipeline. These tests ensure that the Python side of the pipeline conforms to our expectations. In a future PR, we'll want to do the same for the R side of the pipeline.
1 parent 32ebc23 commit bee7dfa

File tree

8 files changed

+413
-19
lines changed

8 files changed

+413
-19
lines changed

.github/workflows/test.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
on:
2+
pull_request:
3+
push:
4+
branches: [main, master]
5+
6+
name: test
7+
8+
env:
9+
PYTHONUNBUFFERED: "1"
10+
UV_SYSTEM_PYTHON: 1
11+
12+
jobs:
13+
test:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- name: Checkout code
17+
uses: actions/checkout@v4
18+
19+
- name: Install uv
20+
uses: astral-sh/setup-uv@v4
21+
with:
22+
enable-cache: true
23+
cache-dependency-glob: requirements.txt
24+
cache-suffix: pytest
25+
26+
- name: Setup Python
27+
uses: actions/setup-python@v5
28+
with:
29+
python-version: 3.12
30+
31+
- name: Install dependencies
32+
shell: bash
33+
run: |
34+
uv pip install -r requirements.txt
35+
uv pip install pytest~=8.3.5
36+
37+
- name: Run Python tests
38+
shell: bash
39+
working-directory: python
40+
run: pytest

.pre-commit-config.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,15 @@ repos:
3434
language: r
3535
additional_dependencies:
3636
- yaml
37+
- repo: https://github.com/astral-sh/ruff-pre-commit
38+
rev: v0.11.2
39+
hooks:
40+
# Python linter. Ruff recommends running this before the formatter to
41+
# avoid conflicts when using the --fix flag
42+
- id: ruff
43+
args:
44+
- --fix
45+
files: ^python/
46+
# Formatter
47+
- id: ruff-format
48+
files: ^python/

python/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Ignore uv lockfile because we use requirements.txt for this project, in order
2+
# to make it compatible with reticulate
3+
uv.lock

python/comps.py

Lines changed: 71 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,83 @@
1+
import typing
2+
13
import numba as nb
24
import numpy as np
35
import pandas as pd
46

57

68
def get_comps(
7-
observation_df,
8-
comparison_df,
9-
weights,
10-
num_comps=5,
11-
):
12-
"""Fast algorithm to get the top `num_comps` comps from a dataframe of lightgbm
13-
leaf node assignments (`observation_df`) compared to a second dataframe of
14-
assignments (`comparison_df`). Leaf nodes are weighted according to a tree
15-
importance matrix `weights` and used to generate a similarity score and
16-
return two dataframes, one a set of indices and the other a set of scores
17-
for the `n` most similar comparables. More details on the underlying
18-
algorithm here: https://ccao-data.github.io/lightsnip/articles/finding-comps.html
9+
observation_df: pd.DataFrame,
10+
comparison_df: pd.DataFrame,
11+
weights: np.ndarray,
12+
num_comps: int = 5,
13+
num_chunks: int = 10,
14+
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
15+
"""
16+
Fast algorithm to get the top `num_comps` comps from a dataframe of
17+
lightgbm leaf node assignments (`observation_df`) compared to a second
18+
dataframe of leaf node assignments (`comparison_df`).
19+
20+
Leaf nodes are weighted according to a tree importance matrix `weights`
21+
and used to generate a similarity score. The function returns two
22+
dataframes: One containing the indices of the most similar comparables
23+
and the other containing their corresponding similarity scores.
24+
25+
More details on the underlying algorithm can be found here:
26+
https://ccao-data.github.io/lightsnip/articles/finding-comps.html
27+
28+
Args:
29+
observation_df (pandas.DataFrame):
30+
DataFrame containing leaf node assignments for observations.
31+
comparison_df (pandas.DataFrame):
32+
DataFrame containing leaf node assignments for potential
33+
comparables.
34+
weights (numpy.ndarray):
35+
Importance weights for leaf nodes, used to compute similarity
36+
scores.
37+
num_comps (int, optional):
38+
Number of top comparables to return for each observation.
39+
Default is 5.
40+
num_chunks (int, optional):
41+
Number of chunks to split observations for progress reporting.
42+
Default is 10.
43+
44+
Returns:
45+
tuple:
46+
- pd.DataFrame:
47+
DataFrame containing the indices of the `num_comps`
48+
most similar comparables in `comparison_df`. The order of
49+
rows will match the order of rows in `observation_df`.
50+
- pd.DataFrame:
51+
DataFrame containing similarity scores for the `num_comps`
52+
most similar comparables. The order of rows will match the
53+
order of rows in `observation_df`.
1954
"""
55+
# Check to make sure the shape of the input matrices is correct
56+
if observation_df.shape[1] != comparison_df.shape[1]:
57+
raise ValueError(
58+
"Number of columns in `observation_df` "
59+
f"({observation_df.shape[1]}) "
60+
f"must match `comparison_df` ({comparison_df.shape[1]})"
61+
)
62+
if comparison_df.shape != weights.shape:
63+
raise ValueError(
64+
f"`comparison_df.shape` {comparison_df.shape} must match "
65+
f"`weights.shape` {weights.shape}"
66+
)
67+
2068
# Convert the weights to a numpy array so that we can take advantage of
2169
# numba acceleration later on
2270
weights_matrix = np.asarray(weights, dtype=np.float32)
2371

2472
# Chunk the observations so that the script can periodically report progress
25-
num_chunks = 10
2673
observation_df["chunk"] = pd.cut(
2774
observation_df.index, bins=num_chunks, labels=False
2875
)
2976

3077
total_num_observations = len(observation_df)
3178
total_num_possible_comps = len(comparison_df)
3279
chunked_ids, chunked_scores = [], []
33-
for chunk_num in set(observation_df["chunk"]):
80+
for chunk_num in observation_df["chunk"].unique():
3481
observations = observation_df[observation_df["chunk"] == chunk_num]
3582
# Drop chunk column to produce a matrix that we can accelerate
3683
# with numba
@@ -81,8 +128,11 @@ def get_comps(
81128

82129
@nb.njit(fastmath=True, parallel=True)
83130
def _get_top_n_comps(
84-
leaf_node_matrix, comparison_leaf_node_matrix, weights_matrix, num_comps
85-
):
131+
leaf_node_matrix: np.ndarray,
132+
comparison_leaf_node_matrix: np.ndarray,
133+
weights_matrix: np.ndarray,
134+
num_comps: int,
135+
) -> typing.Tuple[np.ndarray, np.ndarray]:
86136
"""Helper function that takes matrices of leaf node assignments for
87137
observations in a tree model, a matrix of weights for each obs/tree, and an
88138
integer `num_comps`, and returns a matrix where each observation is scored
@@ -120,8 +170,8 @@ def _get_top_n_comps(
120170
if similarity_score > all_top_n_scores[x_i][-1]:
121171
for idx, score in enumerate(all_top_n_scores[x_i]):
122172
if similarity_score > score:
123-
_insert_at_idx_and_shift(all_top_n_idxs[x_i], y_i, idx)
124-
_insert_at_idx_and_shift(
173+
insert_at_idx_and_shift(all_top_n_idxs[x_i], y_i, idx)
174+
insert_at_idx_and_shift(
125175
all_top_n_scores[x_i], similarity_score, idx
126176
)
127177
break
@@ -130,7 +180,9 @@ def _get_top_n_comps(
130180

131181

132182
@nb.njit(fastmath=True)
133-
def _insert_at_idx_and_shift(arr, elem, idx):
183+
def insert_at_idx_and_shift(
184+
arr: np.ndarray, elem: typing.Union[int, float], idx: int
185+
) -> np.ndarray:
134186
"""Helper function to insert an element `elem` into a sorted numpy array `arr`
135187
at a given index `idx` and shift the subsequent elements down one index."""
136188
arr[idx + 1 :] = arr[idx:-1]

python/pyproject.toml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[project]
2+
name = "model-res-avm-python"
3+
version = "0.0.1"
4+
description = "Python code for the CCAO residential model AVM"
5+
requires-python = ">=3.10"
6+
7+
[tool.pytest.ini_options]
8+
minversion = "7.0.0"
9+
addopts = "-v --cache-clear -rf"
10+
# Make sure the python/ subdir is correctly loaded into the PATH during
11+
# test execution
12+
pythonpath = ["."]

python/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../requirements.txt

python/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)