Skip to content

Commit 3de8677

Browse files
committed
fix: more progress on load_all_df, added derived.sleep, finished Whoop sleep loading, fixed location loading, and many other fixes
1 parent ebe720b commit 3de8677

11 files changed

+380
-95
lines changed

config.example.toml

+6-7
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,17 @@ name = "john"
88
date_offset_hours = 5
99

1010
[data]
11-
categories= "categories.example.toml"
12-
habitbull = "~/Downloads/HabitBullData.csv"
13-
location = "~/location"
14-
oura = "~/Downloads/oura_2020-02-27T09-07-47.json"
11+
categories = "~/work/quantifiedme/quantifiedme/categories.example.toml"
12+
#habitbull = "~/Downloads/HabitBullData.csv"
13+
#location = "~/location"
14+
#oura = "~/Downloads/oura_2020-02-27T09-07-47.json"
1515

1616
[data.activitywatch]
1717
port = 5666
1818
hostnames = ["fakedata"]
1919

20-
[data.smartertime_buckets]
21-
example-hostname = '~/data/smartertime/smartertime_export_example-hostname_2020-01-01_bb7f26aa.awbucket.json'
22-
20+
#[data.smartertime_buckets]
21+
#example-hostname = '~/data/smartertime/smartertime_export_example-hostname_2020-01-01_bb7f26aa.awbucket.json'
2322

2423
[locations]
2524
[locations.gym]

src/quantifiedme/derived/all_df.py

+130-7
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,136 @@
1+
import os
2+
import logging
3+
from typing import Literal, TypeAlias
4+
from datetime import date, datetime, timedelta, timezone
5+
6+
import pandas as pd
7+
18
from aw_core import Event
2-
from typing import Literal
39

4-
from .heartrate import load_heartrate_daily_df
5-
from .screentime import load_category_df
10+
from ..load.location import load_daily_df as load_location_daily_df
11+
from ..load.qslang import load_daily_df as load_drugs_df
12+
13+
from .heartrate import load_heartrate_summary_df
14+
from .screentime import load_screentime_cached, load_category_df
15+
from .sleep import load_sleep_df
16+
17+
Sources = Literal["screentime", "heartrate", "drugs", "location", "sleep"]
618

7-
Sources = Literal["activitywatch", "heartrate"]
819

9-
def load_all_df(
    fast=True,
    screentime_events: list[Event] | None = None,
    ignore: list[Sources] | None = None,
) -> pd.DataFrame:
    """
    Loads a bunch of data into a single dataframe with one row per day.
    Serves as a useful starting point for further analysis.

    :param fast: if True, only load the last 30 days of data (else ~2 years).
    :param screentime_events: pre-loaded screentime events, to skip loading here.
    :param ignore: source names to skip entirely (see ``Sources``).
    """
    # NOTE(fix): a mutable default ([]) is shared across calls; use a None sentinel
    ignore = ignore if ignore is not None else []
    df = pd.DataFrame()
    since = datetime.now(tz=timezone.utc) - timedelta(days=30 if fast else 2 * 365)

    if "screentime" not in ignore:
        print("Adding screentime")
        if screentime_events is None:
            screentime_events = load_screentime_cached(fast=fast, since=since)
        df_time = load_category_df(screentime_events)
        df_time = df_time[["Work", "Media", "ActivityWatch"]]
        df = join(df, df_time.add_prefix("time:"))

    if "heartrate" not in ignore:
        print("Adding heartrate")
        df_hr = load_heartrate_summary_df(freq="D")
        # translate daily datetime index to a date index so joins line up
        df_hr.index = df_hr.index.date  # type: ignore
        df = join(df, df_hr)

    if "drugs" not in ignore:
        print("Adding drugs")
        # keep only columns starting with "tag"
        df_drugs = load_drugs_df()
        df_drugs = df_drugs[df_drugs.columns[df_drugs.columns.str.startswith("tag")]]
        df = join(df, df_drugs)

    if "location" not in ignore:
        print("Adding location")
        # TODO: add boolean for if sleeping together
        df_location = load_location_daily_df()
        df_location.index = df_location.index.date  # type: ignore
        df = join(df, df_location.add_prefix("loc:"))

    if "sleep" not in ignore:
        df_sleep = load_sleep_df()
        df = join(df, df_sleep.add_prefix("sleep:"))

    # look for all-na columns, emit a warning, and drop them
    na_cols = df.columns[df.isna().all()]
    if len(na_cols) > 0:
        print(f"Warning: dropping all-NA columns: {str(list(na_cols))}")
        df = df.drop(columns=na_cols)

    return df
70+
71+
72+
def join(df_target: pd.DataFrame, df_source: pd.DataFrame) -> pd.DataFrame:
    """Join ``df_source`` onto ``df_target``, warning when coverage is partial.

    An empty target is simply replaced by the source.
    """
    if df_target.empty:
        # nothing to join onto yet; the source becomes the starting frame
        return df_source
    check_new_data_in_range(df_source, df_target)
    new_cols = df_source.columns.difference(df_target.columns)
    print(f"Adding new columns: {str(list(new_cols))}")
    return df_target.join(df_source)
79+
80+
81+
DateLike: TypeAlias = datetime | date | pd.Timestamp
82+
83+
84+
def datelike_to_date(d: DateLike) -> date:
85+
if isinstance(d, datetime) or isinstance(d, pd.Timestamp):
86+
return d.date()
87+
elif isinstance(d, date):
88+
return d
89+
else:
90+
raise ValueError(f"Invalid type for datelike: {type(d)}")
91+
92+
93+
def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> None:
    """Warn (via print) when df_source's index range doesn't cover df_target's."""
    src_start = datelike_to_date(df_source.index.min())
    src_end = datelike_to_date(df_source.index.max())
    tgt_start = datelike_to_date(df_target.index.min())
    tgt_end = datelike_to_date(df_target.index.max())

    # worst case first: the ranges don't overlap at all
    if src_start > tgt_end or src_end < tgt_start:
        print(
            f"Warning: source data does not cover ANY of target data: ({src_start}/{src_end}) not in ({tgt_start}/{tgt_end})"
        )
    elif src_start > tgt_start:
        print(
            f"Warning: source data starts after target data (partial): {src_start} > {tgt_start}"
        )
    elif src_end < tgt_end:
        print(
            f"Warning: source data ends before target data (partial): {src_end} < {tgt_end}"
        )
113+
114+
115+
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # print a summary of all data; FAST=0 loads the long history
    fast_mode = os.environ.get("FAST", "1") == "1"
    df = load_all_df(fast=fast_mode)
    print(df)
    print(df.describe())

    # check for missing data
    missing_per_col = df.isna().sum()
    missing_per_col = missing_per_col[missing_per_col > 0]
    if len(missing_per_col) > 0:
        print(f"Missing data for {len(missing_per_col)} out of {len(df.columns)} columns")
        print(missing_per_col)
    print("Total days: ", len(df))

    # keep days with full coverage
    df = df.dropna()
    print("Total days with full coverage: ", len(df))

    print("Final dataframe:")
    print(df)

src/quantifiedme/derived/heartrate.py

+24-14
Original file line numberDiff line numberDiff line change
@@ -26,27 +26,37 @@ def load_heartrate_df() -> pd.DataFrame:
2626
return df
2727

2828

29-
def load_heartrate_daily_df(
30-
zones={"low": 100, "med": 140, "high": 160}, freq="D"
29+
def load_heartrate_minutes_df():
    """We consider using minute-resolution a decent starting point for summary heartrate data.

    NOTE: ignores source, combines all sources into a single point per freq.
    """
    raw = load_heartrate_df()
    return raw.drop(columns=["source"]).resample("1min").mean()
37+
38+
39+
def load_heartrate_summary_df(
    zones: dict[str, int] | None = None, freq="D"
) -> pd.DataFrame:
    """
    Load heartrates, group into freq, bin by zone, and return a dataframe.

    :param zones: mapping of zone name -> lower bound in bpm; defaults to
        resting/low/med/high at 0/100/140/160.
    :param freq: pandas offset alias to group by (default "D", daily).
    """
    # NOTE(fix): a mutable default dict is shared across calls; use a None sentinel
    if zones is None:
        zones = {"resting": 0, "low": 100, "med": 140, "high": 160}
    source_df = load_heartrate_minutes_df()
    df = pd.DataFrame()
    df["hr_mean"] = source_df["hr"].groupby(pd.Grouper(freq=freq)).mean()

    # compute time spent in each zone (300 bpm as the implicit upper bound)
    df_zones = pd.cut(
        source_df["hr"], bins=[*zones.values(), 300], labels=[*zones.keys()]
    )
    for zone in zones.keys():
        # each minute-resolution sample in the zone counts as one minute
        df[f"hr_duration_{zone}"] = df_zones[df_zones == zone].groupby(
            pd.Grouper(freq=freq)
        ).count() * pd.Timedelta(minutes=1)
    return df
4458

4559

4660
if __name__ == "__main__":
    # smoke test: print the daily heartrate summary
    summary = load_heartrate_summary_df()
    print(summary)

src/quantifiedme/derived/screentime.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pickle
12
import logging
23
from datetime import datetime, timezone, timedelta
34
from pathlib import Path
@@ -37,10 +38,10 @@ def _get_aw_client(testing: bool) -> ActivityWatchClient:
3738

3839

3940
def load_screentime(
40-
since: datetime | None,
41-
datasources: list[DatasourceType] | None,
42-
hostnames: list[str] | None,
43-
personal: bool,
41+
since: datetime | None = None,
42+
datasources: list[DatasourceType] | None = None,
43+
hostnames: list[str] | None = None,
44+
personal: bool = True,
4445
cache: bool = True,
4546
awc: ActivityWatchClient | None = None,
4647
) -> list[Event]:
@@ -122,6 +123,24 @@ def load_screentime(
122123

123124
return events
124125

126+
def load_screentime_cached(
    *args, since: datetime | None = None, fast: bool = False, **kwargs
) -> list[Event]:
    """
    Return screentime events from the pickled cache produced by Dashboard.ipynb
    (or by ``load_screentime`` here), falling back to a fresh load if no cache
    file exists.

    :param since: drop events with a timestamp earlier than this.
    :param fast: prefer the smaller "fast" cache file.
    """
    path = (
        Path(__file__).parent.parent.parent.parent
        / "notebooks"
        / ("events_fast.pickle" if fast else "events.pickle")
    )
    if not path.exists():
        # NOTE(fix): forward `since` to the fallback loader (it was dropped before)
        return load_screentime(*args, since=since, **kwargs)

    print(f"Loading from cache: {path}")
    with open(path, "rb") as f:
        events = pickle.load(f)
    # if the fast cache didn't reach back far enough, retry with the full cache
    # NOTE(fix): guard against an empty cache (events[-1] raised IndexError),
    # and forward *args/since on the retry instead of silently dropping them
    if fast and since and (not events or events[-1].timestamp < since):
        print("Fast couldn't satisfy since, trying again without fast")
        events = load_screentime_cached(*args, since=since, fast=False, **kwargs)
    # trim according to since
    if since:
        events = [e for e in events if e.timestamp >= since]
    return events
143+
125144

126145
def _join_events(
127146
old_events: list[Event], new_events: list[Event], source: str

src/quantifiedme/derived/sleep.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
"""
2+
Aggregates sleep data from Fitbit, Oura, and Whoop into a single dataframe.
3+
"""
4+
5+
from datetime import datetime, timedelta, timezone
6+
7+
import pandas as pd
8+
9+
from ..load.fitbit import load_sleep_df as load_fitbit_sleep_df
10+
from ..load.oura import load_sleep_df as load_oura_sleep_df
11+
from ..load.whoop import load_sleep_df as load_whoop_sleep_df
12+
13+
14+
def load_sleep_df(ignore: list[str] | None = None) -> pd.DataFrame:
    """
    Loads sleep data from Fitbit, Oura, and Whoop into a single dataframe.

    :param ignore: source names ("oura", "whoop") to skip.
    """
    # NOTE(fix): a mutable default ([]) is shared across calls; use a None sentinel
    ignore = ignore if ignore is not None else []
    df = pd.DataFrame()

    # Fitbit
    #df = join(df, load_fitbit_sleep_df(), rsuffix="_fitbit")

    # Oura
    if "oura" not in ignore:
        df_oura = load_oura_sleep_df()
        df = join(df, df_oura.add_suffix("_oura"))

    # Whoop
    if "whoop" not in ignore:
        df_whoop = load_whoop_sleep_df()
        df = join(df, df_whoop.add_suffix("_whoop"))

    # aggregate per-source columns into one mean column per metric
    keys = list(set(col.split("_")[0] for col in df.columns) & {"duration", "score"})
    for key in keys:
        subkeys = df.columns[df.columns.str.startswith(key)]
        df[key] = df[subkeys].mean(axis=1)
    df = df[keys]

    return df
41+
42+
43+
def join(df_target, df_source, **kwargs) -> pd.DataFrame:
    """Join ``df_source`` onto ``df_target``; an empty target is simply replaced."""
    return df_source if df_target.empty else df_target.join(df_source, **kwargs)
48+
49+
50+
if __name__ == "__main__":
    # smoke test: load and display the combined sleep dataframe
    sleep_df = load_sleep_df()
    print(sleep_df)
    # disabled plotting snippet (kept for reference):
    # sleep_df["duration_whoop"].plot()
    # import matplotlib.pyplot as plt
    # plt.show()

src/quantifiedme/load/fitbit.py

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
import pandas as pd
77

88

9+
def load_sleep_df() -> pd.DataFrame:
    """Load Fitbit sleep data as a dataframe.

    Not yet implemented; placeholder so callers (e.g. ``derived.sleep``, which
    imports it) see a uniform per-source ``load_sleep_df`` interface.
    """
    raise NotImplementedError
11+
12+
913
def _load_heartrate_file(filepath):
1014
# print(f"Loading {filepath}...")
1115
# json format is {"dateTime": "2020-01-01", "value": {"bpm": 60, "confidence": 0}}

0 commit comments

Comments
 (0)