1+ import typing
2+
13import numba as nb
24import numpy as np
35import pandas as pd
46
57
68def get_comps (
7- observation_df ,
8- comparison_df ,
9- weights ,
10- num_comps = 5 ,
11- ):
12- """Fast algorithm to get the top `num_comps` comps from a dataframe of lightgbm
13- leaf node assignments (`observation_df`) compared to a second dataframe of
14- assignments (`comparison_df`). Leaf nodes are weighted according to a tree
15- importance matrix `weights` and used to generate a similarity score and
16- return two dataframes, one a set of indices and the other a set of scores
17- for the `n` most similar comparables. More details on the underlying
18- algorithm here: https://ccao-data.github.io/lightsnip/articles/finding-comps.html
9+ observation_df : pd .DataFrame ,
10+ comparison_df : pd .DataFrame ,
11+ weights : np .ndarray ,
12+ num_comps : int = 5 ,
13+ num_chunks : int = 10 ,
14+ ) -> typing .Tuple [pd .DataFrame , pd .DataFrame ]:
15+ """
16+ Fast algorithm to get the top `num_comps` comps from a dataframe of
17+ lightgbm leaf node assignments (`observation_df`) compared to a second
18+ dataframe of leaf node assignments (`comparison_df`).
19+
20+ Leaf nodes are weighted according to a tree importance matrix `weights`
21+ and used to generate a similarity score. The function returns two
22+ dataframes: One containing the indices of the most similar comparables
23+ and the other containing their corresponding similarity scores.
24+
25+ More details on the underlying algorithm can be found here:
26+ https://ccao-data.github.io/lightsnip/articles/finding-comps.html
27+
28+ Args:
29+ observation_df (pandas.DataFrame):
30+ DataFrame containing leaf node assignments for observations.
31+ comparison_df (pandas.DataFrame):
32+ DataFrame containing leaf node assignments for potential
33+ comparables.
34+ weights (numpy.ndarray):
35+ Importance weights for leaf nodes, used to compute similarity
36+ scores.
37+ num_comps (int, optional):
38+ Number of top comparables to return for each observation.
39+ Default is 5.
40+ num_chunks (int, optional):
41+ Number of chunks to split observations for progress reporting.
42+ Default is 10.
43+
44+ Returns:
45+ tuple:
46+ - pd.DataFrame:
47+ DataFrame containing the indices of the `num_comps`
48+ most similar comparables in `comparison_df`. The order of
49+ rows will match the order of rows in `observation_df`.
50+ - pd.DataFrame:
51+ DataFrame containing similarity scores for the `num_comps`
52+ most similar comparables. The order of rows will match the
53+ order of rows in `observation_df`.
1954 """
55+ # Check to make sure the shape of the input matrices is correct
56+ if observation_df .shape [1 ] != comparison_df .shape [1 ]:
57+ raise ValueError (
58+ "Number of columns in `observation_df` "
59+ f"({ observation_df .shape [1 ]} ) "
60+ f"must match `comparison_df` ({ comparison_df .shape [1 ]} )"
61+ )
62+ if comparison_df .shape != weights .shape :
63+ raise ValueError (
64+ f"`comparison_df.shape` { comparison_df .shape } must match "
65+ f"`weights.shape` { weights .shape } "
66+ )
67+
2068 # Convert the weights to a numpy array so that we can take advantage of
2169 # numba acceleration later on
2270 weights_matrix = np .asarray (weights , dtype = np .float32 )
2371
2472 # Chunk the observations so that the script can periodically report progress
25- num_chunks = 10
2673 observation_df ["chunk" ] = pd .cut (
2774 observation_df .index , bins = num_chunks , labels = False
2875 )
2976
3077 total_num_observations = len (observation_df )
3178 total_num_possible_comps = len (comparison_df )
3279 chunked_ids , chunked_scores = [], []
33- for chunk_num in set ( observation_df ["chunk" ]):
80+ for chunk_num in observation_df ["chunk" ]. unique ( ):
3481 observations = observation_df [observation_df ["chunk" ] == chunk_num ]
3582 # Drop chunk column to produce a matrix that we can accelerate
3683 # with numba
@@ -81,8 +128,11 @@ def get_comps(
81128
82129@nb .njit (fastmath = True , parallel = True )
83130def _get_top_n_comps (
84- leaf_node_matrix , comparison_leaf_node_matrix , weights_matrix , num_comps
85- ):
131+ leaf_node_matrix : np .ndarray ,
132+ comparison_leaf_node_matrix : np .ndarray ,
133+ weights_matrix : np .ndarray ,
134+ num_comps : int ,
135+ ) -> typing .Tuple [np .ndarray , np .ndarray ]:
86136 """Helper function that takes matrices of leaf node assignments for
87137 observations in a tree model, a matrix of weights for each obs/tree, and an
88138 integer `num_comps`, and returns a matrix where each observation is scored
@@ -120,8 +170,8 @@ def _get_top_n_comps(
120170 if similarity_score > all_top_n_scores [x_i ][- 1 ]:
121171 for idx , score in enumerate (all_top_n_scores [x_i ]):
122172 if similarity_score > score :
123- _insert_at_idx_and_shift (all_top_n_idxs [x_i ], y_i , idx )
124- _insert_at_idx_and_shift (
173+ insert_at_idx_and_shift (all_top_n_idxs [x_i ], y_i , idx )
174+ insert_at_idx_and_shift (
125175 all_top_n_scores [x_i ], similarity_score , idx
126176 )
127177 break
@@ -130,7 +180,9 @@ def _get_top_n_comps(
130180
131181
132182@nb .njit (fastmath = True )
133- def _insert_at_idx_and_shift (arr , elem , idx ):
183+ def insert_at_idx_and_shift (
184+ arr : np .ndarray , elem : typing .Union [int , float ], idx : int
185+ ) -> np .ndarray :
134186 """Helper function to insert an element `elem` into a sorted numpy array `arr`
135187 at a given index `idx` and shift the subsequent elements down one index."""
136188 arr [idx + 1 :] = arr [idx :- 1 ]
0 commit comments