1
1
from aw_core import Event
2
- from typing import Literal
2
+ from typing import Literal , TypeAlias
3
+ from datetime import date , datetime , timedelta , timezone
3
4
4
- from .heartrate import load_heartrate_daily_df
5
- from .screentime import load_category_df
5
+ import pandas as pd
6
6
7
- Sources = Literal ["activitywatch" , "heartrate" ]
7
+ from ..load .location import load_daily_df as load_location_daily_df
8
+ from ..load .qslang import load_daily_df as load_drugs_df
9
+
10
+ from .heartrate import load_heartrate_summary_df
11
+ from .screentime import load_screentime_cached , load_category_df
12
+ from .sleep import load_sleep_df
13
+
# Names of the data sources that load_all_df can pull in; pass any of these
# in `ignore=[...]` to skip that source.
# NOTE(review): "sleep" was missing from the Literal even though load_all_df
# checks `if "sleep" not in ignore:` — added so `ignore=["sleep"]` type-checks.
Sources = Literal["screentime", "heartrate", "drugs", "location", "sleep"]
17
def load_all_df(
    fast=True,
    screentime_events: list[Event] | None = None,
    ignore: list[Sources] | None = None,
) -> pd.DataFrame:
    """
    Loads a bunch of data into a single dataframe with one row per day.
    Serves as a useful starting point for further analysis.

    :param fast: if True, only load ~30 days of history instead of ~2 years.
    :param screentime_events: pre-loaded screentime events; loaded from cache when None.
    :param ignore: data sources to skip entirely (see ``Sources``).
    :return: dataframe indexed by day; columns are prefixed per source
             (``time:``, ``loc:``, ``sleep:``; heartrate/drugs keep their own names).
    """
    # NOTE(review): was `ignore: list[Sources] = []` — a mutable default argument
    # is shared across calls (Python gotcha); use a None sentinel instead.
    if ignore is None:
        ignore = []

    df = pd.DataFrame()
    # History window: 30 days in fast mode, ~2 years otherwise (tz-aware UTC).
    since = datetime.now(tz=timezone.utc) - timedelta(days=30 if fast else 2 * 365)

    if "screentime" not in ignore:
        print("Adding screentime")
        if screentime_events is None:
            screentime_events = load_screentime_cached(fast=fast, since=since)
        df_time = load_category_df(screentime_events)
        # keep only the categories of interest
        df_time = df_time[["Work", "Media", "ActivityWatch"]]
        df = join(df, df_time.add_prefix("time:"))

    if "heartrate" not in ignore:
        print("Adding heartrate")
        df_hr = load_heartrate_summary_df(freq="D")
        # translate daily datetime column to a date column
        df_hr.index = df_hr.index.date  # type: ignore
        df = join(df, df_hr)

    if "drugs" not in ignore:
        print("Adding drugs")
        # keep only columns starting with "tag"
        df_drugs = load_drugs_df()
        df_drugs = df_drugs[df_drugs.columns[df_drugs.columns.str.startswith("tag")]]
        df = join(df, df_drugs)

    if "location" not in ignore:
        print("Adding location")
        # TODO: add boolean for if sleeping together
        df_location = load_location_daily_df()
        df_location.index = df_location.index.date  # type: ignore
        df = join(df, df_location.add_prefix("loc:"))

    if "sleep" not in ignore:
        # NOTE(review): added the progress print for consistency with the other sources
        print("Adding sleep")
        df_sleep = load_sleep_df()
        df = join(df, df_sleep.add_prefix("sleep:"))

    # look for all-na columns, emit a warning, and drop them
    na_cols = df.columns[df.isna().all()]
    if len(na_cols) > 0:
        print(f"Warning: dropping all-NA columns: {str(list(na_cols))}")
        df = df.drop(columns=na_cols)

    return df
67
+
68
+
69
+
70
def join(df_target: pd.DataFrame, df_source: pd.DataFrame) -> pd.DataFrame:
    """Left-join `df_source` onto `df_target` by index.

    An empty target is simply replaced by the source; otherwise we warn if the
    source doesn't cover the target's date range and report the new columns.
    """
    if df_target.empty:
        return df_source
    check_new_data_in_range(df_source, df_target)
    added = list(df_source.columns.difference(df_target.columns))
    print(f"Adding new columns: {str(added)}")
    return df_target.join(df_source)
75
+
76
+
77
# Union of "date-like" index values seen in the loaded frames; normalized to a
# plain `date` by datelike_to_date. (pd.Timestamp subclasses datetime, but is
# kept explicit for readability.)
DateLike: TypeAlias = datetime | date | pd.Timestamp
78
+
79
+
80
def datelike_to_date(d: "DateLike") -> date:
    """Normalize a datetime/Timestamp/date to a plain ``date``.

    :raises ValueError: if *d* is not one of the supported types.
    """
    # Single tuple isinstance instead of an `or` chain. pd.Timestamp subclasses
    # datetime, but is kept explicit for readers. The datetime check must come
    # before the date check, since datetime is itself a subclass of date.
    if isinstance(d, (datetime, pd.Timestamp)):
        return d.date()
    if isinstance(d, date):
        return d
    raise ValueError(f"Invalid type for datelike: {type(d)}")
87
+
88
+
89
def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> None:
    """Warn (via print) when the source index does not cover the target's date range."""

    def as_date(value):
        # Inlined date-normalization: Timestamp/datetime -> .date(); date passes through.
        if isinstance(value, (datetime, pd.Timestamp)):
            return value.date()
        if isinstance(value, date):
            return value
        raise ValueError(f"Invalid type for datelike: {type(value)}")

    source_start, source_end = as_date(df_source.index.min()), as_date(df_source.index.max())
    target_start, target_end = as_date(df_target.index.min()), as_date(df_target.index.max())

    # Worst case first: the two ranges don't overlap at all.
    if source_start > target_end or source_end < target_start:
        print(
            f"Warning: source data does not cover ANY of target data: ({source_start}/{source_end}) not in ({target_start}/{target_end})"
        )
    elif source_start > target_start:
        print(
            f"Warning: source data starts after target data (partial): {source_start} > {target_start}"
        )
    elif source_end < target_end:
        print(
            f"Warning: source data ends before target data (partial): {source_end} < {target_end}"
        )
109
+
110
+
111
if __name__ == "__main__":
    import logging
    import os

    logging.basicConfig(level=logging.INFO)

    # Load everything and print a summary (FAST=0 to load the full history).
    df = load_all_df(fast=os.environ.get("FAST", "1") == "1")
    print(df)
    print(df.describe())

    # Report columns with missing days, if any.
    missing_per_col = df.isna().sum()
    missing_per_col = missing_per_col[missing_per_col > 0]
    if len(missing_per_col) > 0:
        print(f"Missing data for {len(missing_per_col)} out of {len(df.columns)} columns")
        print(missing_per_col)
    print("Total days: ", len(df))

    # Restrict to days where every source has data.
    df = df.dropna()
    print("Total days with full coverage: ", len(df))

    print("Final dataframe:")
    print(df)
0 commit comments