BiomedSciAI
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎.travis.yml
Lines changed: 1 addition & 0 deletions b/‎.travis.yml
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md
Lines changed: 1 addition & 0 deletions b/‎README.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎causallib/__init__.py
Lines changed: 1 addition & 1 deletion b/‎causallib/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎causallib/contrib/README.md
Lines changed: 9 additions & 0 deletions b/‎causallib/contrib/README.md
Lines changed: 9 additions & 0 deletions
diff --git a/‎causallib/contrib/adversarial_balancing/adversarial_balancing.py
Lines changed: 3 additions & 2 deletions b/‎causallib/contrib/adversarial_balancing/adversarial_balancing.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎causallib/contrib/adversarial_balancing/classifier_selection.py
Lines changed: 1 addition & 1 deletion b/‎causallib/contrib/adversarial_balancing/classifier_selection.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎causallib/contrib/faissknn.py
Lines changed: 123 additions & 0 deletions b/‎causallib/contrib/faissknn.py
Lines changed: 123 additions & 0 deletions
diff --git a/‎causallib/contrib/requirements.txt
Lines changed: 2 additions & 1 deletion b/‎causallib/contrib/requirements.txt
Lines changed: 2 additions & 1 deletion
diff --git a/‎causallib/datasets/data_loader.py
Lines changed: 17 additions & 6 deletions b/‎causallib/datasets/data_loader.py
Lines changed: 17 additions & 6 deletions
diff --git a/‎causallib/estimation/README.md
Lines changed: 1 addition & 1 deletion b/‎causallib/estimation/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎causallib/estimation/__init__.py
Lines changed: 2 additions & 0 deletions b/‎causallib/estimation/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎causallib/estimation/base_estimator.py
Lines changed: 3 additions & 3 deletions b/‎causallib/estimation/base_estimator.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎causallib/estimation/base_weight.py
Lines changed: 2 additions & 1 deletion b/‎causallib/estimation/base_weight.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎causallib/estimation/ipw.py
Lines changed: 1 addition & 1 deletion b/‎causallib/estimation/ipw.py
Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,7 @@
 .DS_store
 .project
 .pydevproject
+.vscode
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
 
@@ -3,6 +3,7 @@ python:
   - "3.6"
   - "3.7"
   - "3.8"
+  - "3.9"
 cache: pip
 before_script:
   - curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
 
@@ -3,6 +3,7 @@
 [![PyPI version](https://badge.fury.io/py/causallib.svg)](https://badge.fury.io/py/causallib)
 [![Documentation Status](https://readthedocs.org/projects/causallib/badge/?version=latest)](https://causallib.readthedocs.io/en/latest/)
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/IBM/causallib/HEAD)
+[![Slack channel](https://img.shields.io/badge/join-slack-slack.svg?logo=slack)](https://causallib.slack.com/)
 [![Slack channel](https://img.shields.io/badge/support-slack-slack.svg?logo=slack)](https://causallib.slack.com/)
 # Causal Inference 360
 A Python package for inferring causal effects from observational data.
 
@@ -1 +1 @@
-__version__ = "0.6.0"
+__version__ = "0.7.0"
@@ -24,6 +24,15 @@ Currently contributed methods are:
    ```python
    from causallib.contrib.hemm import HEMM
    ```
+1. Matching Estimation/Transform using `faiss`.
+
+   Implements a nearest-neighbors search with an API that matches `sklearn.neighbors.NearestNeighbors`
+   but is powered by [faiss](https://github.com/facebookresearch/faiss), which adds GPU
+   support and makes the search much faster on CPU as well.
+   
+   ```python
+   from causallib.contrib.faissknn import FaissNearestNeighbors
+   ```
 
 ## Dependencies
 Each model might have slightly different requirements.  
 
@@ -84,13 +84,14 @@ def __init__(self, learner, iterations=20, lr=0.5, decay=1, loss_type='01', use_
         self.verbose = verbose
         self.use_stabilized = use_stabilized
 
-    def fit(self, X, a, w_init=None, **select_kwargs):
+    def fit(self, X, a, y=None, w_init=None, **select_kwargs):
         """
         Trains an Adversarial Balancing model.
 
         Args:
             X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
             a (pd.Series): Treatment assignment of size (num_subjects,).
+            y: IGNORED.
             w_init (pd.Series): Initial sample weights. If not provided, assumes uniform.
             select_kwargs: keyword arguments to pass into select_classifier.
                            relevant only if model was initialized with list of classifiers in `learner`.
@@ -154,7 +155,7 @@ def _run(self, X, A, w_init=None, is_train=True, use_stabilized=None, **select_k
             # To simplify the task (learning weights) we ensure both target and source populations have the same
             # importance by reweighting classes by their frequency
             y_0_1 = LabelEncoder().fit_transform(y)  # Encode -1 ==> 0  and  1 ==>1
-            class_weight = compute_class_weight('balanced', np.unique(y), y)[y_0_1]
+            class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)[y_0_1]
 
             sample_weight = np.ones((X_augm.shape[0]))
             sample_weight[target_pop_mask] = w[A == a]  # Weights from initialization
 
@@ -85,7 +85,7 @@ def _select_classifier_from_grid(estimator, X, A, param_grid, n_splits=5, seed=1
 def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
     accuracies = np.zeros(len(candidates))
 
-    class_weight = compute_class_weight('balanced', np.unique(A), A)[LabelEncoder().fit_transform(A)]
+    class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(A), y=A)[LabelEncoder().fit_transform(A)]
 
     if n_splits >= 2:
         cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
 
@@ -0,0 +1,123 @@
+# (C) Copyright 2021 IBM Corp.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import faiss
+
+
+class FaissNearestNeighbors:
+    """Nearest-neighbors search backed by a `faiss` index.
+
+    Exposes `fit`, `kneighbors`, `get_params` and `set_params` so it can be
+    dropped in where a `sklearn.neighbors.NearestNeighbors`-like object is
+    expected.
+    """
+
+    def __init__(self,
+                 metric="mahalanobis",
+                 index_type="flatl2", n_cells=100, n_probes=10):
+        """NearestNeighbors object utilizing the faiss library for speed
+
+        Implements the same API as sklearn but runs 5-10x faster. Utilizes the
+        `faiss` library https://github.com/facebookresearch/faiss . Tested with
+        version 1.7.0. If `faiss-gpu` is installed from pypi, GPU acceleration
+        will be used if available.
+
+        Args:
+            metric (str) :  Distance metric for finding nearest neighbors
+                (default: "mahalanobis"). With "mahalanobis" the inverse
+                covariance must be supplied before `fit`, e.g. with
+                `set_params(metric_params={"VI": VI})`. Any other value
+                leaves the data untransformed, so the index's plain L2
+                distance is used.
+            index_type (str) : Index type within faiss to use
+                (supported: "flatl2" and "ivfflat")
+            n_cells (int) : Number of voronoi cells (only used for "ivfflat",
+                default: 100)
+            n_probes (int) : Number of voronoi cells to search in
+                (only used for "ivfflat", default: 10)
+        Attributes (after running `fit`):
+            index_ : the faiss index fit from the data. For details about
+            faiss indices, see the faiss documentation at
+            https://github.com/facebookresearch/faiss/wiki/Faiss-indexes .
+        """
+        self.metric = metric
+        self.n_cells = n_cells
+        self.n_probes = n_probes
+        self.index_type = index_type
+
+    def fit(self, X):
+        """Create faiss index and train with data.
+
+        Args:
+            X (np.array): Array of N samples of shape (NxM)
+
+        Returns:
+            self: Fitted object
+
+        Raises:
+            AttributeError: If `metric` is "mahalanobis" and `VI` was not set.
+            NotImplementedError: If `index_type` is not "flatl2" or "ivfflat".
+        """
+        X = self._transform_covariates(X)
+        n_samples, n_features = X.shape[0], X.shape[1]
+        if self.index_type == "flatl2":
+            self.index_ = faiss.IndexFlatL2(n_features)
+            self.index_.add(X)
+        elif self.index_type == "ivfflat":
+            quantizer = faiss.IndexFlatL2(n_features)
+            # Cap the number of cells at one per 200 samples (and at least
+            # one), and never probe more cells than exist.
+            n_cells = max(1, min(self.n_cells, n_samples // 200))
+            n_probes = min(self.n_probes, n_cells)
+            self.index_ = faiss.IndexIVFFlat(
+                quantizer, n_features, n_cells)
+            self.index_.train(X)
+            self.index_.nprobe = n_probes
+            self.index_.add(X)
+        else:
+            # Note the trailing space: adjacent literals are concatenated
+            # verbatim, so without it the message reads "selectone".
+            raise NotImplementedError(
+                "Index type {} not implemented. Please select "
+                "one of [\"flatl2\", \"ivfflat\"]".format(self.index_type))
+        return self
+
+    def kneighbors(self, X, n_neighbors=1):
+        """Find the k nearest neighbors of each sample in X
+
+        Args:
+            X (np.array):  Array of shape (N,M) of samples to search
+                for neighbors of. M must be the same as the fit data.
+            n_neighbors (int, optional): Number of neighbors to find.
+                Defaults to 1.
+
+        Returns:
+            (distances, indices): Two np.array objects of shape (N,n_neighbors)
+                containing the distances and indices of the closest neighbors.
+        """
+        X = self._transform_covariates(X)
+        distances, indices = self.index_.search(X, n_neighbors)
+        # faiss returns euclidean distance squared
+        return np.sqrt(distances), indices
+
+    def _transform_covariates(self, X):
+        """Prepare `X` for faiss: optional whitening, then float32 C-order.
+
+        For the "mahalanobis" metric every row is multiplied by the matrix
+        stored in `self.VI` (see `_setattr`), so that plain L2 distance on the
+        result equals the Mahalanobis distance on the input.
+
+        Raises:
+            AttributeError: If `metric` is "mahalanobis" and `VI` is not set.
+        """
+        if self.metric == "mahalanobis":
+            if not hasattr(self, "VI"):
+                raise AttributeError("Set inverse covariance VI first.")
+            X = np.dot(X, self.VI.T)
+        return np.ascontiguousarray(X).astype("float32")
+
+    def set_params(self, **parameters):
+        """Set parameters on this object (sklearn-style).
+
+        A `metric_params` dict is flattened, i.e. each of its items is set as
+        if it had been passed directly (this is how `VI` is supplied).
+        `metric_params=None` is accepted and ignored.
+
+        Returns:
+            self
+        """
+        for parameter, value in parameters.items():
+            if parameter == "metric_params":
+                # `None` means "no metric parameters"; unpacking it would fail.
+                if value is not None:
+                    self.set_params(**value)
+            else:
+                self._setattr(parameter, value)
+        return self
+
+    def get_params(self, deep=True):
+        """Return the constructor parameters as a dict (sklearn-style)."""
+        # `deep` plays no role because there are no sublearners
+        params_to_return = ["metric", "n_cells", "n_probes", "index_type"]
+        return {name: getattr(self, name) for name in params_to_return}
+
+    def _setattr(self, parameter, value):
+        """Set one attribute, converting `VI` into a whitening matrix.
+
+        When `parameter` is "VI", `value` is taken to be the inverse
+        covariance matrix. It is inverted back to the covariance C, factored
+        as C = L L^T (Cholesky), and the attribute `VI` is set to inv(L),
+        not to the matrix that was passed in. Mapping each sample x to
+        inv(L) x makes squared L2 distance equal (x - y)^T VI (x - y).
+        """
+        # based on faiss docs https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances
+        if parameter == "VI":
+            covariance = np.linalg.inv(value)
+            chol = np.linalg.cholesky(covariance)
+            value = np.linalg.inv(chol)
+        setattr(self, parameter, value)
@@ -1,2 +1,3 @@
 -f https://download.pytorch.org/whl/cpu/  # To support cpu torch installation
-torch>=1.2.0
+torch>=1.2.0
+faiss-gpu~=1.7.0
@@ -14,6 +14,7 @@
 #
 # Created on Oct 24, 2019
 
+from causallib.utils.stat_utils import robust_lookup
 import os
 import pandas as pd
 from sklearn.utils import Bunch
@@ -28,7 +29,7 @@ def load_data_file(file_name, data_dir_name, sep=","):
     return data
 
 
-def load_nhefs(raw=False, restrict=True):
+def load_nhefs(raw=False, restrict=True, augment=True, onehot=True):
     """Loads the NHEFS smoking-cessation and weight-loss dataset.
 
     Data was gathered during an observational study conducted by the NHANS
@@ -48,6 +49,14 @@ def load_nhefs(raw=False, restrict=True):
                     If True, returns a (pd.DataFrame, pd.Series) tuple (data and description).
         restrict (bool): Whether to apply exclusion criteria on missing data or not.
                          Note: if False - data will have censored (NaN) outcomes.
+        augment (bool): Whether to add augmented (squared) features
+                    If False, only original data returned.
+                    If True, squares continuous valued columns ['age', 'wt71', 'smokeintensity', 'smokeyrs']
+                    and joins to data frame with suffix '^2'
+        onehot (bool): Whether to one-hot encode categorical data.
+                    If False, the categorical columns ["active", "education", "exercise"] are returned
+                    as individual columns holding their categorical values.
+                    If True, each categorical column is replaced by one-hot encoded columns
+                    (the first level of each is dropped).
 
     Returns:
         Bunch: dictionary-like object
@@ -75,9 +84,12 @@ def load_nhefs(raw=False, restrict=True):
     y = data.pop("wt82_71")
     X = data[confounders]
     descriptors = descriptors[confounders + ["qsmk", "wt82_71"]]
-
-    X = pd.get_dummies(X, columns=["active", "education", "exercise"], drop_first=True)
-    X = X.join(X[['age', 'wt71', 'smokeintensity', 'smokeyrs']] ** 2, rsuffix="^2")
+    if onehot:
+        X = pd.get_dummies(
+            X, columns=["active", "education", "exercise"], drop_first=True)
+    if augment:
+        X = X.join(X[['age', 'wt71', 'smokeintensity', 'smokeyrs']]
+                   ** 2, rsuffix="^2")
 
     data = Bunch(X=X, a=a, y=y, descriptors=descriptors)
     return data
@@ -132,8 +144,7 @@ def load_acic16(instance=1, raw=False):
     # # Extract observed outcome:
     y = zymu[["y0", "y1"]]
     y = y.rename(columns=lambda x: int(x.strip("y")))  # remove 'y' prefix to allow lookup
-    y = y.lookup(y.index, a)  # Choose the outcome based on the treatment assignment
-    y = pd.Series(y, index=a.index)  # `lookup` return ndarray, convert back to Series
+    y = robust_lookup(y, a)
     # # Potential outcomes:
     po = zymu[["mu0", "mu1"]]
     po = po.rename(columns=lambda x: x.strip("mu"))
 
@@ -29,7 +29,7 @@ The methods that are currently available are:
     `causallib.estimation.DoublyRobustVanilla` 
 
 
-### Example: Inverse Probabilty Weighting (IPW)
+### Example: Inverse Probability Weighting (IPW)
 An IPW model can be run, for example, using
 ```Python
 from sklearn.linear_model import LogisticRegression
 
@@ -1,4 +1,6 @@
 from .doubly_robust import DoublyRobustIpFeature, DoublyRobustJoffe, DoublyRobustVanilla
 from .ipw import IPW
+from .overlap_weights import OverlapWeights
 from .standardization import Standardization, StratifiedStandardization
 from .marginal_outcome import MarginalOutcomeEstimator
+from .matching import Matching, PropensityMatching
@@ -19,9 +19,9 @@
 A module defining the various hierarchy of causal models interface.
 Causal models have two main tasks - predicting counterfactual outcomes and predicting effect based on these estimated
 outcomes.
-On top if it there are two resolutions we can work on: the individual level (i.e. outcome and effect for each individual
+On top of it there are two resolutions we can work on: the individual level (i.e. outcome and effect for each individual
 in the dataset) and population level (i.e. some aggregation on the sample level).
-This module defines it it all with:
+This module defines it all with:
 * EffectEstimator - can estimate both individual and population level effect
 * PopulationOutcomeEstimator - estimates aggregated outcomes on different sub-groups in the dataset.
 * IndividualOutcomeEstimator - estimates individual level outcomes.
@@ -175,7 +175,7 @@ def estimate_effect(self, outcome1, outcome2, agg="population", effect_types="di
                                       and values are the corresponding computed effect.
                                       A DataFrame if individual effect (input is a vector) where columns are effects
                                       types and rows are effect in each individual.
-                                      Always: Value type is same is outcome_1 and outcome_2 type.
+                                      Always: Value type is the same as outcome_1 and outcome_2 type.
         """
         if agg == "population":
             outcome1 = self._aggregate_population_outcome(outcome1)
 
@@ -42,13 +42,14 @@ def __init__(self, learner, use_stabilized=False, *args, **kwargs):
         self.use_stabilized = use_stabilized
 
     @abc.abstractmethod
-    def fit(self, X, a):
+    def fit(self, X, a, y=None):
         """
         Trains a model to predict treatment assignment given the covariates: Pr[A|X].
 
         Args:
             X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
             a (pd.Series): Treatment assignment of size (num_subjects,).
+            y: IGNORED.
 
         Returns:
             WeightEstimator: A causal weight model with an inner learner fitted.
 
@@ -49,7 +49,7 @@ def __init__(self, learner, truncate_eps=None, use_stabilized=False):
         self.__check_truncation_value_is_legal(truncate_eps)
         self.truncate_eps = truncate_eps
 
-    def fit(self, X, a):
+    def fit(self, X, a, y=None):
         if self.use_stabilized:
             self.treatment_prevalence_ = a.value_counts(normalize=True, sort=False)
         self.learner.fit(X, a)
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.0"`
	`1`	`+__version__ = "0.7.0"`