 import logging
 import warnings
 from itertools import chain, count
-from math import isnan
+from math import isnan, prod

 import numpy as np

@@ -47,15 +47,15 @@ def _harmonize_tensor(
     # greater than the old model's lowest cut.
     # eg: new: | | | | |
     # old: | |
-    # other1: | | proprotion |
+    # other1: | | proportion |
     # other2: | proportion |
     # One wrinkle is that for pairs, we'll be using the pair cuts and we need to
-    # one-dimensionalize any existing pair weights onto their respective 1D axies
-    # before proportionating them. Annother issue is that we might not even have
+    # one-dimensionalize any existing pair weights onto their respective 1D axis
+    # before proportioning them. Another issue is that we might not even have
     # another term_feature that uses some particular feature that we use in our model
     # so we don't have any weights. We can solve that issue by dropping any feature's
     # bins for terms that we have no information for. After we do this we'll have
-    # guaranteed that we only have new bin cuts for feature axies that we have inside
+    # guaranteed that we only have new bin cuts for feature axes that we have inside
     # the bin level that we're handling!

     old_feature_idxs = list(old_feature_idxs)
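
To make the proportioning idea in the comment above concrete, here is a minimal hypothetical sketch (not the library's code): one old bin's weight is split across the finer new bins it covers, using weights from another model only as proportions. The helper name and shapes are made up for illustration.

```python
import numpy as np

def split_old_bin_weight(old_weight, reference_weights):
    """Distribute one old bin's weight over the new sub-bins it covers.

    reference_weights holds one weight per new sub-bin, taken from a model that
    observed those finer bins; they only supply the proportions, not the mass.
    """
    reference_weights = np.asarray(reference_weights, dtype=np.float64)
    total = reference_weights.sum()
    if total <= 0.0:
        # no guidance available from other models: fall back to an even split
        return np.full(len(reference_weights), old_weight / len(reference_weights))
    return old_weight * (reference_weights / total)

# an old bin with weight 10.0 covered by three new bins whose reference
# weights are 1, 1, 2 becomes [2.5, 2.5, 5.0]
print(split_old_bin_weight(10.0, [1.0, 1.0, 2.0]))
```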
@@ -241,7 +241,7 @@ def _harmonize_tensor(
             map_bins[bin_idx]
             for map_bins, bin_idx in zip(mapping, old_reversed_bin_idxs)
         ]
-        n_cells2 = np.prod([len(x) for x in cell_map])
+        n_cells2 = prod(map(len, cell_map))
         val = 0 if n_multiclasses == 1 else np.zeros(n_multiclasses, np.float64)
         total_weight = 0.0
         for cell2_idx in range(n_cells2):
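
The only functional change in this hunk swaps `np.prod` for `math.prod`. A tiny check of the equivalence with a made-up `cell_map` (each entry lists the new bins one old bin maps to); `math.prod` returns an exact Python int rather than a NumPy scalar.

```python
from math import prod

import numpy as np

cell_map = [[0, 1], [3, 4, 5]]  # hypothetical mapping: 2 sub-bins on one axis, 3 on the other
assert prod(map(len, cell_map)) == 6             # new spelling
assert np.prod([len(x) for x in cell_map]) == 6  # old spelling
```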
@@ -416,7 +416,7 @@ def merge_ebms(models):

     # TODO: every time we merge models we fragment the bins more and more and this is undesirable
     # especially for pairs. When we build models, we store the feature bin cuts for pairs even
-    # if we have no pairs that use that paritcular feature as a pair. We can eliminate these useless
+    # if we have no pairs that use that particular feature as a pair. We can eliminate these useless
     # pair feature cuts before merging the bins and that'll give us less resulting cuts. Having less
     # cuts reduces the number of estimates that we need to make and reduces the complexity of the
     # tensors, so it's good to have this reduction.
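
A hedged sketch of what that TODO could look like, assuming the convention that `bins_[feature_idx]` is a list ordered main-effect bins first, then pair bins; the helper below is hypothetical, not an existing function.

```python
def drop_unused_pair_cuts(term_features, bins):
    """Drop pair-resolution bin definitions for features no interaction term uses."""
    used_in_pairs = {f for term in term_features if len(term) >= 2 for f in term}
    for feature_idx, feature_bins in enumerate(bins):
        if feature_idx not in used_in_pairs and len(feature_bins) > 1:
            del feature_bins[1:]  # keep only the main-effect bins
```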
@@ -470,7 +470,7 @@ def merge_ebms(models):
             # order and also handling merged categories (where two categories map to a single score)
             # We should first try to progress in order along each set of keys and see if we can
             # establish the perfect order which might work if there are isolated missing categories
-            # and if we can't get a unique guaranteed sorted order that way then examime all the
+            # and if we can't get a unique guaranteed sorted order that way then examine all the
             # different known sort order and figure out if any of the possible orderings match
             merged_bins = dict(zip(merged_keys, count(1)))
         else:
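
For the `dict(zip(merged_keys, count(1)))` idiom on the context line above, a tiny illustration with made-up category keys; indexing starts at 1 because bin index 0 is reserved for missing values.

```python
from itertools import count

merged_keys = ["low", "medium", "high"]  # hypothetical agreed-upon ordering
merged_bins = dict(zip(merged_keys, count(1)))
assert merged_bins == {"low": 1, "medium": 2, "high": 3}
```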
@@ -550,7 +550,7 @@ def merge_ebms(models):
     ):
         if hasattr(ebm, "feature_bounds_"):
             # TODO: estimate the histogram bin counts by taking the min of the mins and the max of the maxes
-            # and re-apportioning the counts based on the distributions of the previous histograms. Proprotion
+            # and re-apportioning the counts based on the distributions of the previous histograms. Proportion
             # them to the floor of their counts and then assign any remaining integers based on how much
             # they reduce the RMSE of the integer counts from the ideal floating point counts.
             pass
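
The TODO describes floor-then-assign-remainders apportioning. A small sketch of that arithmetic, assuming the ideal floating point counts are already known and sum to an integer total; handing the leftover units to the largest fractional remainders is what minimizes the RMSE against the ideal counts.

```python
import numpy as np

def apportion_counts(ideal_counts):
    ideal_counts = np.asarray(ideal_counts, dtype=np.float64)
    floors = np.floor(ideal_counts).astype(np.int64)
    remaining = int(round(ideal_counts.sum())) - int(floors.sum())
    order = np.argsort(ideal_counts - floors)[::-1]  # largest remainders first
    floors[order[:remaining]] += 1
    return floors

print(apportion_counts([2.7, 3.6, 1.7]))  # -> [3 3 2], total of 8 preserved
```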
@@ -623,7 +623,7 @@ def merge_ebms(models):

     # TODO: in the future we might at this point try and figure out the most
     # common feature ordering within the terms. Take the mode first
-    # and amonst the orderings that tie, choose the one that's best sorted by
+    # and amongst the orderings that tie, choose the one that's best sorted by
     # feature indexes
     ebm.term_features_ = sorted_fgs

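
One possible reading of that TODO, as a hypothetical sketch: take the mode of the orderings observed for the same feature set across models, and break ties in favor of the ordering that matches ascending feature indexes.

```python
from collections import Counter

def pick_feature_order(orderings):
    # orderings: tuples of feature indexes, e.g. [(2, 0), (0, 2), (2, 0)]
    counts = Counter(orderings)
    return max(counts.items(), key=lambda kv: (kv[1], kv[0] == tuple(sorted(kv[0]))))[0]

assert pick_feature_order([(2, 0), (0, 2), (2, 0)]) == (2, 0)  # mode wins
assert pick_feature_order([(2, 0), (0, 2)]) == (0, 2)          # tie: sorted order wins
```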
@@ -634,26 +634,26 @@ def merge_ebms(models):
     # interaction mismatches where an interaction will be in one model, but not the other.
     # We need to estimate the bin_weight_ tensors that would have existed in this case.
     # We'll use the interaction terms that we do have in other models to estimate the
-    # distribution in the essense of the data, which should be roughly consistent or you
+    # distribution in the essence of the data, which should be roughly consistent or you
     # shouldn't be attempting to merge the models in the first place. We'll then scale
-    # the percentage distribution by the total weight of the model that we're fillin in the
+    # the percentage distribution by the total weight of the model that we're filling in the
     # details for.

     # TODO: this algorithm has some problems. The estimated tensor that we get by taking the
     # model weight and distributing it by a per-cell percentage measure means that we get
-    # inconsistent weight distibutions along the axis. We can take our resulting weight tensor
+    # inconsistent weight distributions along the axis. We can take our resulting weight tensor
     # and sum the columns/rows to get the weights on each individual feature axis. Our model
     # however comes with a known set of weights on each feature, and the result of our operation
     # will not match the existing distribution in almost all cases. I think there might be
     # some algorithm where we start with the per-feature weights and use the distribution hints
     # from the other models to inform where we place our exact weights that we know about in our
-    # model from each axis. The problem is that the sums in both axies need to agree, and each
+    # model from each axis. The problem is that the sums in both axes need to agree, and each
     # change we make influences both. I'm not sure we can even guarantee that there is an answer
     # and if there was one I'm not sure how we'd go about generating it. I'm going to leave
     # this problem for YOU: a future person who is smarter than me and has more time to solve this.
     # One hint: I think a possible place to start would be an iterative algorithm that's similar
     # to purification where you randomly select a row/column and try to get closer at each step
-    # to the rigth answer. Good luck!
+    # to the right answer. Good luck!
     #
     # Oh, there's also another deeper problem.. let's say you had a crazy 5 way interaction in the
     # model eg: (0,1,2,3,4) and you had 2 and 3 way interactions that either overlap or not.
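
The "iterative algorithm similar to purification" hinted at above resembles iterative proportional fitting (raking): alternately rescale rows and columns of the estimated tensor so each axis sums to the known per-feature weights, which is consistent because both sets of marginals sum to the same total model weight. A 2D sketch of that idea, not something this commit implements; it assumes the row and column sums stay positive.

```python
import numpy as np

def fit_to_marginals(estimate, row_totals, col_totals, n_iter=100):
    """Iterative proportional fitting toward known per-feature (marginal) weights."""
    tensor = np.array(estimate, dtype=np.float64)
    row_totals = np.asarray(row_totals, dtype=np.float64).reshape(-1, 1)
    col_totals = np.asarray(col_totals, dtype=np.float64).reshape(1, -1)
    for _ in range(n_iter):
        tensor *= row_totals / tensor.sum(axis=1, keepdims=True)  # match row sums
        tensor *= col_totals / tensor.sum(axis=0, keepdims=True)  # match column sums
    return tensor

hint = np.array([[1.0, 1.0], [1.0, 3.0]])  # per-cell distribution hint from other models
result = fit_to_marginals(hint, [4.0, 6.0], [5.0, 5.0])
print(result, result.sum(axis=1), result.sum(axis=0))
```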
@@ -731,7 +731,7 @@ def merge_ebms(models):
             model.bagged_scores_[term_idx][bag_idx],
             model.bin_weights_[
                 term_idx
-            ],  # we use these to weigh distribution of scores for mulple bins
+            ],  # we use these to weigh distribution of scores for multiple bins
         )
         new_bagged_scores.append(harmonized_bagged_scores)
     ebm.bin_weights_.append(np.sum(new_bin_weights, axis=0))
@@ -768,7 +768,7 @@ def merge_ebms(models):
     # TODO: we might be able to do these operations earlier
     remove_extra_bins(ebm.term_features_, ebm.bins_)

-    # dependent attributes (can be re-derrived after serialization)
+    # dependent attributes (can be re-derived after serialization)
     ebm.n_features_in_ = len(ebm.bins_)  # scikit-learn specified name
     ebm.term_names_ = generate_term_names(ebm.feature_names_in_, ebm.term_features_)
