Skip to content

Commit 6d39929

Browse files
ehudkrmmdanzigerTalKozlovskiyishaishimoni
authored
Merge pull request #15 from IBM/version-0.7.0
Version 0.7.0 Co-authored-by: mmdanziger <[email protected]> Co-authored-by: TalKozlovski <[email protected]> Co-authored-by: Yishai Shimoni <[email protected]>
2 parents e16f667 + b93f3e9 commit 6d39929

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+7248
-110
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
.DS_store
33
.project
44
.pydevproject
5+
.vscode
56

67
# Byte-compiled / optimized / DLL files
78
__pycache__/

.travis.yml

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ python:
33
- "3.6"
44
- "3.7"
55
- "3.8"
6+
- "3.9"
67
cache: pip
78
before_script:
89
- curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
[![PyPI version](https://badge.fury.io/py/causallib.svg)](https://badge.fury.io/py/causallib)
44
[![Documentation Status](https://readthedocs.org/projects/causallib/badge/?version=latest)](https://causallib.readthedocs.io/en/latest/)
55
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/IBM/causallib/HEAD)
6+
[![Slack channel](https://img.shields.io/badge/join-slack-slack.svg?logo=slack)](https://causallib.slack.com/)
67
[![Slack channel](https://img.shields.io/badge/support-slack-slack.svg?logo=slack)](https://causallib.slack.com/)
78
# Causal Inference 360
89
A Python package for inferring causal effects from observational data.

causallib/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.0"
1+
__version__ = "0.7.0"

causallib/contrib/README.md

+9
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ Currently contributed methods are:
2424
```python
2525
from causallib.contrib.hemm import HEMM
2626
```
27+
1. Matching Estimation/Transform using `faiss`.
28+
29+
Implemented a nearest neighbors search with an API that matches `sklearn.neighbors.NearestNeighbors`
30+
but is powered by [faiss](https://github.com/facebookresearch/faiss) for GPU
31+
support and much faster search on CPU as well.
32+
33+
```python
34+
from causallib.contrib.faissknn import FaissNearestNeighbors
35+
```
2736

2837
## Dependencies
2938
Each model might have slightly different requirements.

causallib/contrib/adversarial_balancing/adversarial_balancing.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,14 @@ def __init__(self, learner, iterations=20, lr=0.5, decay=1, loss_type='01', use_
8484
self.verbose = verbose
8585
self.use_stabilized = use_stabilized
8686

87-
def fit(self, X, a, w_init=None, **select_kwargs):
87+
def fit(self, X, a, y=None, w_init=None, **select_kwargs):
8888
"""
8989
Trains an Adversarial Balancing model.
9090
9191
Args:
9292
X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
9393
a (pd.Series): Treatment assignment of size (num_subjects,).
94+
y: IGNORED.
9495
w_init (pd.Series): Initial sample weights. If not provided, assumes uniform.
9596
select_kwargs: keyword arguments to pass into select_classifier.
9697
relevant only if model was initialized with list of classifiers in `learner`.
@@ -154,7 +155,7 @@ def _run(self, X, A, w_init=None, is_train=True, use_stabilized=None, **select_k
154155
# To simplify the task (learning weights) we ensure both target and source populations have the same
155156
# importance by reweighting classes by their frequency
156157
y_0_1 = LabelEncoder().fit_transform(y) # Encode -1 ==> 0 and 1 ==>1
157-
class_weight = compute_class_weight('balanced', np.unique(y), y)[y_0_1]
158+
class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)[y_0_1]
158159

159160
sample_weight = np.ones((X_augm.shape[0]))
160161
sample_weight[target_pop_mask] = w[A == a] # Weights from initialization

causallib/contrib/adversarial_balancing/classifier_selection.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def _select_classifier_from_grid(estimator, X, A, param_grid, n_splits=5, seed=1
8585
def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
8686
accuracies = np.zeros(len(candidates))
8787

88-
class_weight = compute_class_weight('balanced', np.unique(A), A)[LabelEncoder().fit_transform(A)]
88+
class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(A), y=A)[LabelEncoder().fit_transform(A)]
8989

9090
if n_splits >= 2:
9191
cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

causallib/contrib/faissknn.py

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# (C) Copyright 2021 IBM Corp.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import numpy as np
16+
import faiss
17+
18+
19+
class FaissNearestNeighbors:
    """Nearest-neighbors search backed by a `faiss` index.

    Exposes `fit`, `kneighbors`, `set_params` and `get_params` so it can be
    used where an sklearn-style nearest-neighbors object is expected.
    """

    def __init__(self,
                 metric="mahalanobis",
                 index_type="flatl2", n_cells=100, n_probes=10):
        """NearestNeighbors object utilizing the faiss library for speed

        Implements the same API as sklearn but runs 5-10x faster. Utilizes the
        `faiss` library https://github.com/facebookresearch/faiss . Tested with
        version 1.7.0. If `faiss-gpu` is installed from pypi, GPU acceleration
        will be used if available.

        Args:
            metric (str) : Distance metric for finding nearest neighbors
                (default: "mahalanobis")
            index_type (str) : Index type within faiss to use
                (supported: "flatl2" and "ivfflat")
            n_cells (int) : Number of voronoi cells (only used for "ivfflat",
                default: 100)
            n_probes (int) : Number of voronoi cells to search in
                (only used for "ivfflat", default: 10)
        Attributes (after running `fit`):
            index_ : the faiss index fit from the data. For details about
                faiss indices, see the faiss documentation at
                https://github.com/facebookresearch/faiss/wiki/Faiss-indexes .
        """
        self.metric = metric
        self.n_cells = n_cells
        self.n_probes = n_probes
        self.index_type = index_type

    def fit(self, X):
        """Create faiss index and train with data.

        Args:
            X (np.array): Array of N samples of shape (NxM)

        Returns:
            self: Fitted object

        Raises:
            AttributeError: If `metric` is "mahalanobis" and `VI` was not set.
            NotImplementedError: If `index_type` is not "flatl2" or "ivfflat".
        """
        X = self._transform_covariates(X)
        n_samples, n_features = X.shape[0], X.shape[1]
        if self.index_type == "flatl2":
            self.index_ = faiss.IndexFlatL2(n_features)
            self.index_.add(X)
        elif self.index_type == "ivfflat":
            quantizer = faiss.IndexFlatL2(n_features)
            # Cap the number of cells at one per 200 samples (at least one),
            # and never probe more cells than exist.
            n_cells = max(1, min(self.n_cells, n_samples // 200))
            n_probes = min(self.n_probes, n_cells)
            self.index_ = faiss.IndexIVFFlat(quantizer, n_features, n_cells)
            self.index_.train(X)
            self.index_.nprobe = n_probes
            self.index_.add(X)
        else:
            # The trailing space after "select" keeps the two adjacent
            # literals from being glued together as "selectone".
            raise NotImplementedError(
                "Index type {} not implemented. Please select "
                "one of [\"flatl2\", \"ivfflat\"]".format(self.index_type))
        return self

    def kneighbors(self, X, n_neighbors=1):
        """Find the k nearest neighbors of each sample in X

        Args:
            X (np.array): Array of shape (N,M) of samples to search
                for neighbors of. M must be the same as the fit data.
            n_neighbors (int, optional): Number of neighbors to find.
                Defaults to 1.

        Returns:
            (distances, indices): Two np.array objects of shape (N,n_neighbors)
                containing the distances and indices of the closest neighbors.
        """
        X = self._transform_covariates(X)
        distances, indices = self.index_.search(X, n_neighbors)
        # faiss returns euclidean distance squared
        return np.sqrt(distances), indices

    def _transform_covariates(self, X):
        """Prepare samples for faiss: C-contiguous float32, whitened if needed.

        For the "mahalanobis" metric the samples are multiplied by the
        transpose of the stored `VI` attribute. `_setattr` stores a whitening
        matrix there, so plain L2 distance on the result is the Mahalanobis
        distance.

        Raises:
            AttributeError: If `metric` is "mahalanobis" and `VI` was not set.
        """
        if self.metric == "mahalanobis":
            if not hasattr(self, "VI"):
                raise AttributeError("Set inverse covariance VI first.")
            X = np.dot(X, self.VI.T)
        return np.ascontiguousarray(X).astype("float32")

    def set_params(self, **parameters):
        """Set parameters on this object, sklearn style.

        A `metric_params` entry is expected to be a dict (or None) and is
        flattened: each of its items is set as if it had been passed directly
        (e.g. `metric_params={"VI": inverse_covariance}`).

        Returns:
            self
        """
        for parameter, value in parameters.items():
            if parameter == "metric_params":
                # `None` means "no extra metric parameters"; unpacking it
                # directly would raise a TypeError.
                self.set_params(**(value or {}))
            else:
                self._setattr(parameter, value)
        return self

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict.

        `deep` plays no role because there are no sublearners.
        """
        params_to_return = ["metric", "n_cells", "n_probes", "index_type"]
        return {name: getattr(self, name) for name in params_to_return}

    def _setattr(self, parameter, value):
        """Set an attribute, converting `VI` into a whitening matrix.

        based on faiss docs https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances

        When `parameter` is "VI", `value` is taken to be the inverse
        covariance matrix. It is inverted back to the covariance C, factored
        as C = L L^T (Cholesky), and inv(L) is stored. Multiplying samples by
        inv(L)^T makes their euclidean distance equal the Mahalanobis distance.
        """
        if parameter == "VI":
            covariance = np.linalg.inv(value)
            chol = np.linalg.cholesky(covariance)
            value = np.linalg.inv(chol)
        setattr(self, parameter, value)

causallib/contrib/requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
-f https://download.pytorch.org/whl/cpu/ # To support cpu torch installation
2-
torch>=1.2.0
2+
torch>=1.2.0
3+
faiss-gpu~=1.7.0

causallib/datasets/data_loader.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#
1515
# Created on Oct 24, 2019
1616

17+
from causallib.utils.stat_utils import robust_lookup
1718
import os
1819
import pandas as pd
1920
from sklearn.utils import Bunch
@@ -28,7 +29,7 @@ def load_data_file(file_name, data_dir_name, sep=","):
2829
return data
2930

3031

31-
def load_nhefs(raw=False, restrict=True):
32+
def load_nhefs(raw=False, restrict=True, augment=True, onehot=True):
3233
"""Loads the NHEFS smoking-cessation and weight-loss dataset.
3334
3435
Data was gathered during an observational study conducted by the NHANS
@@ -48,6 +49,14 @@ def load_nhefs(raw=False, restrict=True):
4849
If True, returns a (pd.DataFrame, pd.Series) tuple (data and description).
4950
restrict (bool): Whether to apply exclusion criteria on missing data or not.
5051
Note: if False - data will have censored (NaN) outcomes.
52+
augment (bool): Whether to add augmented (squared) features
53+
If False, only original data returned.
54+
If True, squares continuous valued columns ['age', 'wt71', 'smokeintensity', 'smokeyrs']
55+
and joins to data frame with suffix '^2'
56+
onehot (bool): Whether to one-hot encode categorical data.
57+
If False, categorical data ["active", "education", "exercise"], will be returned
58+
in individual columns with categorical values.
59+
If True, these columns are replaced by one-hot encoded (dummy) columns.
5160
5261
Returns:
5362
Bunch: dictionary-like object
@@ -75,9 +84,12 @@ def load_nhefs(raw=False, restrict=True):
7584
y = data.pop("wt82_71")
7685
X = data[confounders]
7786
descriptors = descriptors[confounders + ["qsmk", "wt82_71"]]
78-
79-
X = pd.get_dummies(X, columns=["active", "education", "exercise"], drop_first=True)
80-
X = X.join(X[['age', 'wt71', 'smokeintensity', 'smokeyrs']] ** 2, rsuffix="^2")
87+
if onehot:
88+
X = pd.get_dummies(
89+
X, columns=["active", "education", "exercise"], drop_first=True)
90+
if augment:
91+
X = X.join(X[['age', 'wt71', 'smokeintensity', 'smokeyrs']]
92+
** 2, rsuffix="^2")
8193

8294
data = Bunch(X=X, a=a, y=y, descriptors=descriptors)
8395
return data
@@ -132,8 +144,7 @@ def load_acic16(instance=1, raw=False):
132144
# # Extract observed outcome:
133145
y = zymu[["y0", "y1"]]
134146
y = y.rename(columns=lambda x: int(x.strip("y"))) # remove 'y' prefix to allow lookup
135-
y = y.lookup(y.index, a) # Choose the outcome based on the treatment assignment
136-
y = pd.Series(y, index=a.index) # `lookup` return ndarray, convert back to Series
147+
y = robust_lookup(y, a)
137148
# # Potential outcomes:
138149
po = zymu[["mu0", "mu1"]]
139150
po = po.rename(columns=lambda x: x.strip("mu"))

causallib/estimation/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ The methods that are currently available are:
2929
`causallib.estimation.DoublyRobustVanilla`
3030

3131

32-
### Example: Inverse Probabilty Weighting (IPW)
32+
### Example: Inverse Probability Weighting (IPW)
3333
An IPW model can be run, for example, using
3434
```Python
3535
from sklearn.linear_model import LogisticRegression

causallib/estimation/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from .doubly_robust import DoublyRobustIpFeature, DoublyRobustJoffe, DoublyRobustVanilla
22
from .ipw import IPW
3+
from .overlap_weights import OverlapWeights
34
from .standardization import Standardization, StratifiedStandardization
45
from .marginal_outcome import MarginalOutcomeEstimator
6+
from .matching import Matching, PropensityMatching

causallib/estimation/base_estimator.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
A module defining the various hierarchy of causal models interface.
2020
Causal models have two main tasks - predicting counterfactual outcomes and predicting effect based on these estimated
2121
outcomes.
22-
On top if it there are two resolutions we can work on: the individual level (i.e. outcome and effect for each individual
22+
On top of it there are two resolutions we can work on: the individual level (i.e. outcome and effect for each individual
2323
in the dataset) and population level (i.e. some aggregation on the sample level).
24-
This module defines it it all with:
24+
This module defines it all with:
2525
* EffectEstimator - can estimate both individual and population level effect
2626
* PopulationOutcomeEstimator - estimates aggregated outcomes on different sub-groups in the dataset.
2727
* IndividualOutcomeEstimator - estimates individual level outcomes.
@@ -175,7 +175,7 @@ def estimate_effect(self, outcome1, outcome2, agg="population", effect_types="di
175175
and values are the corresponding computed effect.
176176
A DataFrame if individual effect (input is a vector) where columns are effects
177177
types and rows are effect in each individual.
178-
Always: Value type is same is outcome_1 and outcome_2 type.
178+
Always: Value type is the same as outcome_1 and outcome_2 type.
179179
"""
180180
if agg == "population":
181181
outcome1 = self._aggregate_population_outcome(outcome1)

causallib/estimation/base_weight.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,14 @@ def __init__(self, learner, use_stabilized=False, *args, **kwargs):
4242
self.use_stabilized = use_stabilized
4343

4444
@abc.abstractmethod
45-
def fit(self, X, a):
45+
def fit(self, X, a, y=None):
4646
"""
4747
Trains a model to predict treatment assignment given the covariates: Pr[A|X].
4848
4949
Args:
5050
X (pd.DataFrame): Covariate matrix of size (num_subjects, num_features).
5151
a (pd.Series): Treatment assignment of size (num_subjects,).
52+
y: IGNORED.
5253
5354
Returns:
5455
WeightEstimator: A causal weight model with an inner learner fitted.

causallib/estimation/ipw.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def __init__(self, learner, truncate_eps=None, use_stabilized=False):
4949
self.__check_truncation_value_is_legal(truncate_eps)
5050
self.truncate_eps = truncate_eps
5151

52-
def fit(self, X, a):
52+
def fit(self, X, a, y=None):
5353
if self.use_stabilized:
5454
self.treatment_prevalence_ = a.value_counts(normalize=True, sort=False)
5555
self.learner.fit(X, a)

0 commit comments

Comments
 (0)