lmi_rct_pilot_analysis/analysis.py at main · joecdsit/lmi_rct_pilot_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
import pandas as pd

from rct_analysis import linear_regression as lr
from rct_analysis import plotting as pl
from utilities import config_reader, data_read_in

# Silence every warning for the whole process so warnings do not clutter the analysis output
# (the filter is installed as soon as this module is imported or run, and has no category restriction)
import warnings
warnings.filterwarnings('ignore')


def _pct_change(new, base):

	"""
	Percentage change helper.

	Args:
		new (float): adjusted value.
		base (float): baseline value.

	Returns:
		float: 100 * (new/base - 1), or NaN if base is zero.
	"""

	if base == 0:
		return float("nan")
	return 100.0 * (new / base - 1.0)

def analysis_pipeline(
	data,
	outcome,
	assigned,
	uptake=None,
	covariates=None,
	fixed_effects=None,
	clusters=None,
	robust_SEs=False,
	simple=False,
	two_stage=False,
	multiple=False,
	wald_test=False,
	interaction_variables=None
	):

	"""
	Fit one regression model for a single outcome and optionally run a Wald test on it.

	Intended to be looped over with aggregate data and with data split by task.
	One fitting route is taken per call; `simple` takes precedence over `multiple` when both are True.

	Args:
		data (DataFrame): contains columns of later specified variables; a copy is handed to the fitting helpers.
		outcome (str): column name of the outcome variable.
		assigned (str): column name of assigned treatment column.
		uptake (str): column name of uptake, for use in two stage least squares.
		covariates (str): column name(s) of covariate (exogenous) variable(s) that are not considered fixed effects (used by multiple regression only).
		fixed_effects (str): column name(s) of covariate (exogenous) variable(s) that are considered fixed effects (used by multiple regression only).
		clusters (str): column name(s) of covariates to cluster standard errors (used by multiple regression only).
		robust_SEs (bool): if True, use HC3 robust standard errors; only applied to simple regression without two stages.
		simple (bool): if True, perform linear regression with one independent variable.
		two_stage (bool): if True, perform two stage least squares analysis for simple regression to account for compliance (always on for multiple regression).
		multiple (bool): if True, perform linear regression with multiple independent variables.
		wald_test (bool): if True, interact uptake with covariates (multiple regression) and perform a Wald test on the two-stage results.
		interaction_variables: forwarded to lr.two_stage_least_squares for multiple regression.

	Returns:
		tuple: (results, gap_scores, wald_stats) where
			results: fitted model object returned by the selected lr helper.
			gap_scores: for multiple regression, the output of lr.two_stage_component_decomposition_log (scores showing how much outcome difference is due to change in uptake); otherwise None.
			wald_stats (tuple): (Wald statistic, p-value), or None when the test was not requested, not applicable, or skipped.

	Raises:
		ValueError: if neither simple nor multiple is True.
	"""

	if not (simple or multiple):
		# Fail with a clear error instead of falling through to an unbound `results` at the return statement
		raise ValueError("No method selected, ensure one of simple or multiple is True.")

	data_copy = data.copy()

	# Defaults so that every returned name is bound whichever branch runs
	gap_scores = None
	wald_results = None
	wald_stats = None

	if simple:
		if two_stage:
			results, wald_results = lr.two_stage_least_squares(data_copy, uptake, assigned, outcome, covariates=None, fixed_effects=None) # Simple linear regression with two stages to account for non-compliance
		else:
			results = lr.simple_linear_regression(data_copy, assigned, outcome)
			if robust_SEs:
				results = results.get_robustcov_results(cov_type="HC3") # Use robust standard errors as residuals don't have constant variance
	else:
		results, wald_results = lr.two_stage_least_squares(data_copy, uptake, assigned, outcome, covariates, fixed_effects, clusters, interact_with_covariates=wald_test, interaction_variables=interaction_variables) # Multiple linear regression with fixed effects and clustered standard errors
		gap_scores = lr.two_stage_component_decomposition_log(data_copy, outcome, assigned, uptake, results) # Determine how much uptake impacts difference in outcomes

	if wald_test:
		if wald_results is None:
			# One-stage simple regression produces nothing to run the Wald test on
			print("Wald test skipped: no two-stage results available for the selected method.")
		else:
			try:
				wtest = lr.wald_test_uptake_interactions(wald_results, uptake_name=uptake, verbose=False)
				wald_stats = (float(wtest.stat), float(wtest.pval))
			except ValueError as e:
				# No interactions found; report it and leave wald_stats as None
				print(f"Wald test skipped: {e}")

	return results, gap_scores, wald_stats


def compute_adjusted_effects(
	data,
	outcome,
	assigned,
	uptake,
	variables,
	role,
	simple_coeff,
):

	"""
	Compute adjusted uptake effects and percentage changes, one variable at a time.

	For each variable a separate multiple-regression pipeline run is made with only that
	variable added, and the resulting uptake coefficient is compared with `simple_coeff`.

	Args:
		data (DataFrame): input data containing model variables.
		outcome (str): column name of outcome variable.
		assigned (str): column name of assigned treatment.
		uptake (str): column name of uptake (instrumented endogenous regressor).
		variables (list): variable names to include individually. None is treated as an empty list and a single string as a one-element list.
		role (str): 'covariate' to enter as covariates; any other value (normally 'fixed') enters the variable as fixed effects.
		simple_coeff (float): baseline coefficient from simple two-stage model.

	Returns:
		tuple: (flat_values, headings) where
			flat_values (list): [coeff_1, pct_1, coeff_2, pct_2, ...].
			headings (list): ["x adjusted coefficient", "x percentage change", ...].
	"""

	# Normalise the input: nothing configured means nothing to adjust for, and a bare
	# string is one column name rather than a sequence of characters
	if variables is None:
		variables = []
	elif isinstance(variables, str):
		variables = [variables]

	# The role decides through which pipeline keyword the variable is passed
	role_keyword = "covariates" if role == "covariate" else "fixed_effects"

	headings = []
	flat_values = []
	for var in variables:
		results, _, _ = analysis_pipeline(
			data=data,
			outcome=outcome,
			assigned=assigned,
			uptake=uptake,
			multiple=True,
			**{role_keyword: var},
		)
		adj_coeff = float(results.params[uptake])
		change = _pct_change(adj_coeff, simple_coeff)
		headings += [f"{var} adjusted coefficient", f"{var} percentage change"]
		flat_values += [adj_coeff, change]
	return flat_values, headings


def run_scope(
	data,
	outcome_vars,
	label,
	*,
	assigned,
	uptake,
	covariates,
	fixed_effects,
	clusters,
	robust_SEs,
	wald_test,
	interaction_variables,
):

	"""
	Fit every model for one scope (the aggregate data or a single task) and collect table rows.

	For each outcome three things are run in order: a simple two-stage baseline, the
	one-variable-at-a-time adjusted models (covariates first, then fixed effects), and
	the full multivariate model.

	Args:
		data (DataFrame): input data for the scope.
		outcome_vars (list): outcome variable names to loop over.
		label (str): label written into the first column of every row (e.g., "All Tasks" or a task value).
		assigned (str): column name of assigned treatment.
		uptake (str): column name of uptake (instrumented endogenous regressor).
		covariates (list): covariates (exogenous) not considered fixed effects.
		fixed_effects (list): covariates (exogenous) considered fixed effects.
		clusters (list): cluster columns for clustered standard errors.
		robust_SEs (bool): forwarded to the full-model pipeline call.
		wald_test (bool): if True, compute Wald statistics for interactions.
		interaction_variables (list): variables to interact with uptake.

	Returns:
		tuple: (model_rows, var_rows, var_headings, wald_rows, overall) where
			model_rows (list): one (label, outcome, coef, se, p, total gap, gap via uptake, residual gap) tuple per outcome.
			var_rows (list): one (label, outcome, simple coeff, adjusted values...) tuple per outcome.
			var_headings (list): headings for the adjusted values, taken from the first outcome.
			wald_rows (list): one (label, outcome, Wald statistic, Wald p-value) tuple per outcome.
			overall (dict): {outcome: [coef, se, p]} summary for plotting.
	"""

	overall = {}
	var_headings = []
	model_rows, var_rows, wald_rows = [], [], []

	for outcome in outcome_vars:
		# Keyword arguments common to every model fitted for this outcome
		shared = dict(data=data, outcome=outcome, assigned=assigned, uptake=uptake)

		# Reference point: uptake coefficient of the simple two-stage model
		baseline_fit, _, _ = analysis_pipeline(simple=True, two_stage=True, **shared)
		simple_coeff = float(baseline_fit.params[uptake])

		# Adjusted coefficients, adding one covariate / one fixed effect at a time
		cov_vals, cov_heads = compute_adjusted_effects(
			variables=covariates,
			role="covariate",
			simple_coeff=simple_coeff,
			**shared,
		)
		eff_vals, eff_heads = compute_adjusted_effects(
			variables=fixed_effects,
			role="fixed",
			simple_coeff=simple_coeff,
			**shared,
		)
		if not var_rows:
			# Headings are captured once, while processing the first outcome
			var_headings = [*cov_heads, *eff_heads]
		var_rows.append((label, outcome, simple_coeff) + tuple(cov_vals) + tuple(eff_vals))

		# Full model with all covariates, fixed effects and clusters together
		full_fit, gap_scores, wald_stats = analysis_pipeline(
			covariates=covariates,
			fixed_effects=fixed_effects,
			clusters=clusters,
			robust_SEs=robust_SEs,
			multiple=True,
			wald_test=wald_test,
			interaction_variables=interaction_variables,
			**shared,
		)
		summary = [
			float(full_fit.params[uptake]),
			float(full_fit.std_errors[uptake]),
			float(full_fit.pvalues[uptake]),
		]
		overall[outcome] = summary

		# Missing gap scores / Wald statistics become None placeholders in the rows
		total_gap, via_uptake, residual_gap = gap_scores or (None, None, None)
		wald_pair = wald_stats or (None, None)
		model_rows.append((label, outcome, *summary, total_gap, via_uptake, residual_gap))
		wald_rows.append((label, outcome, *wald_pair))

	return model_rows, var_rows, var_headings, wald_rows, overall


if __name__=="__main__":

	# Column layouts reused by the aggregate and the per-task tables
	index_columns = ["Task", "Outcome"]
	model_columns = [*index_columns, "Uptake parameter", "Standard error", "p-value", "Total gap", "Gap via uptake", "Residual gap"]
	wald_columns = [*index_columns, "Wald statistic", "Wald p-value"]

	# Data section of the config: input location and output file names
	config = config_reader.open_config()
	data_settings = config["data"]
	drive = data_settings["drive"]
	filepath = data_settings["filepath"]
	input_filename = data_settings["input_filename"]
	model_output_filename = data_settings["model_output_filename"]
	variable_effects_output_filename = data_settings["variable_effects_output_filename"]

	# Build the path to the data directory, then load only the configured csv columns
	data_filepath = data_read_in.drive_filepath(drive,filepath)
	data = data_read_in.adaptive_reader(data_filepath, input_filename, columns=data_settings["columns"])

	# Modelling section of the config
	modelling = config["modelling"]
	assigned = modelling["assigned"]
	c_compliance = modelling["c_complicance"] # key spelling must match the config file
	t_compliance = modelling["t_complicance"] # key spelling must match the config file
	uptake = modelling["uptake"]
	outcomes = modelling["outcomes"]
	# Each entry of outcomes is read as (variable name, label)
	outcome_variables = [entry[0] for entry in outcomes]
	outcome_labels = [entry[1] for entry in outcomes]

	# Options passed unchanged to every run_scope call
	scope_options = {
		"assigned": assigned,
		"uptake": uptake,
		"covariates": modelling["covariates"],
		"fixed_effects": modelling["fixed_effects"],
		"clusters": modelling["clusters"],
		"robust_SEs": modelling["robust_SEs"],
		"wald_test": modelling["wald_test"],
		"interaction_variables": modelling["interaction_variables"],
	}
	wald_test = scope_options["wald_test"]

	# Determine uptake (helper in rct_analysis.linear_regression), then keep only rows without NaNs
	data = lr.determine_uptake(data, assigned, c_compliance, t_compliance)
	clean_data = data.dropna()

	# ---- Aggregate results over all tasks ----
	aggregate_model_rows, aggregate_effect_rows, variable_effects_headings, aggregate_wald_rows, overall_results = run_scope(
		clean_data,
		outcome_variables,
		label="All Tasks",
		**scope_options,
	)
	effects_columns = [*index_columns, "Simple model coeff", *variable_effects_headings]

	model_results = pd.DataFrame(aggregate_model_rows, columns=model_columns)
	aggregate_percentage_changes = pl.plot_regression_results(outcomes, overall_results, data_filepath, mode="aggregate")
	# NOTE(review): the join key is exact float equality of p-values; rows sharing a p-value would be
	# cross-matched and unmatched rows dropped - confirm whether the plotting helper can return an explicit key
	model_results = (
		model_results
		.merge(aggregate_percentage_changes, left_on="p-value", right_on="p")
		.drop("p", axis=1)
		.set_index(index_columns)
	)

	variable_effects = pd.DataFrame(aggregate_effect_rows, columns=effects_columns).set_index(index_columns)
	wald_stats = pd.DataFrame(aggregate_wald_rows, columns=wald_columns).set_index(index_columns)

	# ---- Per task results ----
	task_model_rows = []
	task_effect_rows = []
	task_wald_rows = []
	overall_task_results = {}
	for task in sorted(clean_data["task"].unique()):
		task_data = clean_data[clean_data["task"] == task]
		m_rows, v_rows, _unused_headings, w_rows, task_overall = run_scope(
			data=task_data,
			outcome_vars=outcome_variables,
			label=task,
			**scope_options,
		)
		task_model_rows += m_rows
		task_effect_rows += v_rows
		task_wald_rows += w_rows
		overall_task_results[task] = task_overall

	# Model table: per-task rows are appended below the aggregate rows and written out
	model_tasks_results = pd.DataFrame(task_model_rows, columns=model_columns)
	task_percentage_changes = pl.plot_regression_results(outcomes, overall_task_results, data_filepath, mode="by_task")
	model_tasks_results = (
		model_tasks_results
		.merge(task_percentage_changes, left_on="p-value", right_on="p")
		.drop("p", axis=1)
		.set_index(index_columns)
	)
	model_results = pd.concat([model_results, model_tasks_results])
	model_results.to_csv(f"{data_filepath}/{model_output_filename}")

	# Variable-effects table, same layout as the aggregate one
	task_variable_effects = pd.DataFrame(task_effect_rows, columns=effects_columns).set_index(index_columns)
	variable_effects = pd.concat([variable_effects, task_variable_effects])
	variable_effects.to_csv(f"{data_filepath}/{variable_effects_output_filename}")

	# Wald table is always assembled but only written when the test was requested
	wald_tasks_results = pd.DataFrame(task_wald_rows, columns=wald_columns).set_index(index_columns)
	wald_stats = pd.concat([wald_stats, wald_tasks_results])
	if wald_test:
		wald_stats.to_csv(f"{data_filepath}/wald_stats.csv")