Merge pull request #77 from CamDavidsonPilon/adding-conditional-time-to

CamDavidsonPilon · CamDavidsonPilon · commit d22c2861a8d1 · 2014-07-23T20:19:40.000-04:00
Adding conditional time to
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 ### Changelogs
 
-
+####0.4.3
+ - refactoring of `qth_survival_times`: it can now accept an iterable (or a scalar still) of probabilities in the q argument, and will return a DataFrame with these as columns. If len(q)==1 and a single survival function is given, will return a scalar, not a DataFrame. Also some good speed improvements.
+ - KaplanMeierFitter and NelsonAalenFitter now have a `_label` property that is passed in during the fit.
+ - KaplanMeierFitter/NelsonAalenFitter's initial `alpha` value is overwritten if a new `alpha` value is passed
+ in during the `fit`.
+ - New method for KaplanMeierFitter: `conditional_time_to`. This returns a DataFrame of the estimate:
+    med(S(t | T>s)) - s, human readable: the estimated time left of living, given an individual is aged s.
+ - Adds option `include_likelihood` to CoxPHFitter fit method to save the final log-likelihood value.
 
 ####0.4.2
 
diff --git a/lifelines/estimation.py b/lifelines/estimation.py
@@ -80,12 +80,13 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
                                                                 self._additive_f, self._variance_f, False)
 
         # esimates
-        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[label])
+        self._label = label
+        self.cumulative_hazard_ = pd.DataFrame(cumulative_hazard_, columns=[self._label])
         self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
         self._cumulative_sq = cumulative_sq_
 
         # estimation functions
-        self.predict = _predict(self, "cumulative_hazard_", label)
+        self.predict = _predict(self, "cumulative_hazard_", self._label)
         self.subtract = _subtract(self, "cumulative_hazard_")
         self.divide = _divide(self, "cumulative_hazard_")
 
@@ -99,10 +100,9 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None,
     def _bounds(self, cumulative_sq_, alpha, ci_labels):
         alpha2 = inv_normal_cdf(1 - (1 - alpha) / 2)
         df = pd.DataFrame(index=self.timeline)
-        name = self.cumulative_hazard_.columns[0]
 
         if ci_labels is None:
-            ci_labels = ["%s_upper_%.2f" % (name, self.alpha), "%s_lower_%.2f" % (name, self.alpha)]
+            ci_labels = ["%s_upper_%.2f" % (self._label, self.alpha), "%s_lower_%.2f" % (self._label, self.alpha)]
         assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
         self.ci_labels = ci_labels
 
@@ -206,7 +206,8 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
 
         v = preprocess_inputs(durations, event_observed, timeline, entry)
         self.durations, self.event_observed, self.timeline, self.entry, self.event_table = v
-
+        self._label = label
+        self.alpha = alpha if alpha else self.alpha
         log_survival_function, cumulative_sq_ = _additive_estimate(self.event_table, self.timeline,
                                                                    self._additive_f, self._additive_var,
                                                                    left_censorship)
@@ -219,12 +220,12 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
             net_population = (self.event_table['entrance'] - self.event_table['removed']).cumsum()
             if net_population.iloc[:int(n / 2)].min() == 0:
                 ix = net_population.iloc[:int(n / 2)].argmin()
-                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BFH estimator.""" % ix)
+                raise StatError("""There are too few early truncation times and too many events. S(t)==0 for all t>%.1f. Recommend BreslowFlemingHarringtonFitter.""" % ix)
 
         # estimation
-        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[label]))
+        setattr(self, estimate_name, pd.DataFrame(np.exp(log_survival_function), columns=[self._label]))
         self.__estimate = getattr(self, estimate_name)
-        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], alpha if alpha else self.alpha, ci_labels)
+        self.confidence_interval_ = self._bounds(cumulative_sq_[:, None], ci_labels)
         self.median_ = median_survival_times(self.__estimate)
 
         # estimation methods
@@ -237,15 +238,14 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         setattr(self, "plot_" + estimate_name, self.plot)
         return self
 
-    def _bounds(self, cumulative_sq_, alpha, ci_labels):
+    def _bounds(self, cumulative_sq_, ci_labels):
         # See http://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes2.pdfg
-        alpha2 = inv_normal_cdf((1. + alpha) / 2.)
+        alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
         df = pd.DataFrame(index=self.timeline)
-        name = self.__estimate.columns[0]
         v = np.log(self.__estimate.values)
 
         if ci_labels is None:
-            ci_labels = ["%s_upper_%.2f" % (name, self.alpha), "%s_lower_%.2f" % (name, self.alpha)]
+            ci_labels = ["%s_upper_%.2f" % (self._label, self.alpha), "%s_lower_%.2f" % (self._label, self.alpha)]
         assert len(ci_labels) == 2, "ci_labels should be a length 2 array."
 
         df[ci_labels[0]] = np.exp(-np.exp(np.log(-v) + alpha2 * np.sqrt(cumulative_sq_) / v))
@@ -260,6 +260,23 @@ def _additive_var(self, population, deaths):
         np.seterr(divide='ignore')
         return (1. * deaths / (population * (population - deaths))).replace([np.inf], 0)
 
+    def conditional_time_to(self):
+        """
+        Return a DataFrame, with index equal to survival_function_, that estimates the median
+        duration remaining until the death event, given survival up until time t. For example, if an
+        individual exists until age 1, their expected life remaining *given they lived to time 1*
+        might be 9 years.
+
+        Returns:
+            conditional_time_to_: DataFrame, with index equal to survival_function_
+
+        """
+        age = self.survival_function_.index.values[:, None]
+        columns = ['%s - Conditional time remaining to event' % self._label]
+        return pd.DataFrame(qth_survival_times(self.survival_function_[self._label] * 0.5, self.survival_function_).T.sort(ascending=False).values,
+                            index=self.survival_function_.index,
+                            columns=columns) - age
+
 
 class BreslowFlemingHarringtonFitter(BaseFitter):
 
@@ -868,7 +885,7 @@ def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
             return hessian, gradient
 
     def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=1.,
-                         epsilon=10e-5, show_progress=True):
+                         epsilon=10e-5, show_progress=True, include_likelihood=False):
         """
         Newton Rhaphson algorithm for fitting CPH model.
 
@@ -883,6 +900,7 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=1.,
             step_size: 0 < float <= 1 to determine a step size in NR algorithm.
             epsilon: the convergence halts if the norm of delta between
                      successive positions is less than epsilon.
+            include_likelihood: saves the final log-likelihood to the CoxPHFitter under _log_likelihood.
 
         Returns:
             beta: (1,d) numpy array.
@@ -914,7 +932,8 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=1.,
         i = 1
         converging = True
         while converging:
-            hessian, gradient = get_gradients(X, beta, T, E)
+            output = get_gradients(X, beta, T, E, include_likelihood=include_likelihood)
+            hessian, gradient = output[:2]
             delta = solve(-hessian, step_size * gradient.T)
             beta = delta + beta
             if pd.isnull(delta).sum() > 1:
@@ -928,12 +947,14 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=1.,
 
         self._hessian_ = hessian
         self._score_ = gradient
+        if include_likelihood:
+            self._log_likelihood = output[2]
         if show_progress:
             print("Convergence completed after %d iterations." % (i))
         return beta
 
     def fit(self, df, duration_col='T', event_col='E',
-            show_progress=False, initial_beta=None):
+            show_progress=False, initial_beta=None, include_likelihood=False):
         """
         Fit the Cox Propertional Hazard model to a dataset. Tied survival times
         are handled using Efron's tie-method.
@@ -951,6 +972,9 @@ def fit(self, df, duration_col='T', event_col='E',
              diagnostics.
           initial_beta: initialize the starting point of the iterative
              algorithm. Default is the zero vector.
+          include_likelihood: saves the final log-likelihood to the CoxPHFitter under
+             the property _log_likelihood.
+
 
         Returns:
             self, with additional properties: hazards_
@@ -969,7 +993,8 @@ def fit(self, df, duration_col='T', event_col='E',
         self._check_values(df)
 
         hazards_ = self._newton_rhaphson(df, T, E, initial_beta=initial_beta,
-                                         show_progress=show_progress)
+                                         show_progress=show_progress, 
+                                         include_likelihood=include_likelihood)
 
         self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns,
                                      index=['coef'])
@@ -1047,7 +1072,7 @@ def predict_percentile(self, X, p=0.5):
         Returns the median lifetimes for the individuals.
         http://stats.stackexchange.com/questions/102986/percentile-loss-functions
         """
-        return qth_survival_times(0.5, self.predict_survival_function(X))
+        return qth_survival_times(p, self.predict_survival_function(X))[p]
 
     def predict_median(self, X):
         """
@@ -1201,17 +1226,25 @@ def qth_survival_times(q, survival_functions):
         If numpy array, will return indices.
 
     Returns:
-      v: an array containing the first times the value was crossed.
-        np.inf if infinity.
+      v: if d==1, returns a float, np.inf if infinity.
+         if d > 1, a DataFrame containing the first times the value was crossed.
+
+    """
+    q = pd.Series(q)
+    survival_functions = pd.DataFrame(survival_functions)
+    if survival_functions.shape[1] == 1 and q.shape == (1,):
+        return survival_functions.apply(lambda s: qth_survival_time(q[0], s)).ix[0]
+    else:
+        return pd.DataFrame({_q: survival_functions.apply(lambda s: qth_survival_time(_q, s)) for _q in q})
+
+
+def qth_survival_time(q, survival_function):
+    """
+    Expects a Pandas series, returns the time when the qth probability is reached.
     """
-    assert 0. <= q <= 1., "q must be between 0. and 1."
-    sv_b = (1.0 * (survival_functions < q)).cumsum() > 0
-    try:
-        v = sv_b.idxmax(0)
-        v[sv_b.iloc[-1,:] == 0] = np.inf
-    except:
-        v = sv_b.argmax(0)
-        v[sv_b[-1,:] == 0] = np.inf
+    if survival_function.iloc[-1] > q:
+        return np.inf
+    v = (survival_function <= q).idxmax(0)
     return v
 
 
diff --git a/lifelines/tests/test_suite.py b/lifelines/tests/test_suite.py
@@ -17,10 +17,11 @@
 from collections import Counter
 import matplotlib.pyplot as plt
 import pandas as pd
+from pandas.util.testing import assert_frame_equal
 
 from ..estimation import KaplanMeierFitter, NelsonAalenFitter, AalenAdditiveFitter, \
     median_survival_times, BreslowFlemingHarringtonFitter, BayesianFitter, \
-    CoxPHFitter
+    CoxPHFitter, qth_survival_times, qth_survival_time
 
 from ..statistics import (logrank_test, multivariate_logrank_test,
                           pairwise_logrank_test, concordance_index)
@@ -33,6 +34,44 @@
 
 class MiscTests(unittest.TestCase):
 
+    def test_qth_survival_times_with_varying_datatype_inputs(self):
+        sf_list = [1.0, 0.75, 0.5, 0.25, 0.0]
+        sf_array = np.array([1.0, 0.75, 0.5, 0.25, 0.0])
+        sf_df_no_index = pd.DataFrame([1.0, 0.75, 0.5, 0.25, 0.0])
+        sf_df_index = pd.DataFrame([1.0, 0.75, 0.5, 0.25, 0.0], index=[10, 20, 30, 40, 50])
+        sf_series_index = pd.Series([1.0, 0.75, 0.5, 0.25, 0.0], index=[10, 20, 30, 40, 50])
+        sf_series_no_index = pd.Series([1.0, 0.75, 0.5, 0.25, 0.0])
+
+        q = 0.5
+
+        assert qth_survival_times(q, sf_list) == 2
+        assert qth_survival_times(q, sf_array) == 2
+        assert qth_survival_times(q, sf_df_no_index) == 2
+        assert qth_survival_times(q, sf_df_index) == 30
+        assert qth_survival_times(q, sf_series_index) == 30
+        assert qth_survival_times(q, sf_series_no_index) == 2
+
+    def test_qth_survival_times_multi_dim_input(self):
+        sf = np.linspace(1, 0, 50)
+        sf_multi_df = pd.DataFrame({'sf': sf, 'sf**2': sf ** 2})
+
+        medians = qth_survival_times(0.5, sf_multi_df)
+        assert medians.ix['sf'][0.5] == 25
+        assert medians.ix['sf**2'][0.5] == 15
+
+    def test_qth_survival_time_returns_inf(self):
+        sf = pd.Series([1., 0.7, 0.6])
+        assert qth_survival_time(0.5, sf) == np.inf
+
+    def test_qth_survival_times_with_multivariate_q(self):
+        sf = np.linspace(1, 0, 50)
+        sf_multi_df = pd.DataFrame({'sf': sf, 'sf**2': sf ** 2})
+
+        assert_frame_equal(qth_survival_times([0.2, 0.5], sf_multi_df), pd.DataFrame([[40, 25], [28, 15]], columns=[0.2, 0.5], index=['sf', 'sf**2']))
+        assert_frame_equal(qth_survival_times([0.2, 0.5], sf_multi_df['sf']), pd.DataFrame([[40, 25]], columns=[0.2, 0.5], index=['sf']))
+        assert_frame_equal(qth_survival_times(0.5, sf_multi_df), pd.DataFrame([[25], [15]], columns=[0.5], index=['sf', 'sf**2']))
+        assert qth_survival_times(0.5, sf_multi_df['sf']) == 25
+
     def test_datetimes_to_durations_days(self):
         start_date = ['2013-10-10 0:00:00', '2013-10-09', '2012-10-10']
         end_date = ['2013-10-13', '2013-10-10 0:00:00', '2013-10-15']
@@ -108,6 +147,19 @@ def test_cross_validator_with_predictor_and_kwargs(self):
                                 duration_col='T', event_col='E', k=3,
                                 predictor="predict_percentile", predictor_kwargs={'p': 0.6})
 
+    def test_label_is_a_property(self):
+        kmf = KaplanMeierFitter()
+        kmf.fit(LIFETIMES, label='Test Name')
+        assert kmf._label == 'Test Name'
+        assert kmf.confidence_interval_.columns[0] == 'Test Name_upper_0.95'
+        assert kmf.confidence_interval_.columns[1] == 'Test Name_lower_0.95'
+
+        naf = NelsonAalenFitter()
+        naf.fit(LIFETIMES, label='Test Name')
+        assert naf._label == 'Test Name'
+        assert naf.confidence_interval_.columns[0] == 'Test Name_upper_0.95'
+        assert naf.confidence_interval_.columns[1] == 'Test Name_lower_0.95'
+
 
 class StatisticalTests(unittest.TestCase):
 
@@ -140,7 +192,7 @@ def test_censor_kaplan_meier(self):
 
     def test_median(self):
         sv = pd.DataFrame(1 - np.linspace(0, 1, 1000))
-        self.assertTrue(median_survival_times(sv).ix[0] == 500)
+        self.assertTrue(median_survival_times(sv) == 500)
 
     def test_not_to_break(self):
         try:
@@ -731,6 +783,11 @@ def test_flat_style_no_censor(self):
 
 class CoxRegressionTests(unittest.TestCase):
 
+    def test_log_likelihood_is_available_in_output(self):
+        cox = CoxPHFitter()
+        cox.fit(data_nus, duration_col='t', event_col='E', include_likelihood=True)
+        assert abs( cox._log_likelihood - -12.7601409152 ) < 0.001
+
     def test_efron_computed_by_hand_examples(self):
         cox = CoxPHFitter()
 
diff --git a/lifelines/utils.py b/lifelines/utils.py
@@ -4,7 +4,6 @@
 from datetime import datetime
 
 import numpy as np
-from numpy.random import permutation
 import pandas as pd
 from pandas import to_datetime
 
@@ -217,7 +216,7 @@ def datetimes_to_durations(start_times, end_times, fill_date=datetime.today(), f
     end_times = pd.Series(end_times).copy()
     start_times_ = to_datetime(start_times, dayfirst=dayfirst)
 
-    C = ~(pd.isnull(end_times).values + (end_times == "") + (end_times == na_values))
+    C = ~(pd.isnull(end_times).values | (end_times == "") | (end_times == na_values))
     end_times[~C] = fill_date
     """
     c =  (to_datetime(end_times, dayfirst=dayfirst, coerce=True) > fill_date)
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@ def read(fname):
 
 setup(
     name="lifelines",
-    version="0.4.2",
+    version="0.4.3",
     author="Cameron Davidson-Pilon",
     author_email="cam.davidson.pilon@gmail.com",
     description="Survival analysis in Python, including Kaplan Meier, Nelson Aalen and regression",