Skip to content

Commit 0289e08

Browse files
committed
added r code
1 parent 246604a commit 0289e08

File tree

1 file changed

+37
-281
lines changed

1 file changed

+37
-281
lines changed

src/idd_forecast_mbp/modeling/model_malaria_phase_2.r

Lines changed: 37 additions & 281 deletions
Original file line numberDiff line numberDiff line change
@@ -12,148 +12,57 @@ require(data.table)
1212

1313

1414
data_path <- "/mnt/team/idd/pub/forecast-mbp/03-modeling_data"
15-
file_path <- glue("{data_path}/malaria_df.parquet")
15+
file_path <- glue("{data_path}/malaria_stage_2_modeling_df.parquet")
1616
lsae_hierarchy_path <- "/mnt/team/rapidresponse/pub/population-model/admin-inputs/raking/gbd-inputs/hierarchy_lsae_1209.parquet"
1717
gbd_hierarchy_path <- "/mnt/team/rapidresponse/pub/population-model/admin-inputs/raking/gbd-inputs/hierarchy_gbd_2023.parquet"
1818

1919

2020
source("/mnt/share/homes/bcreiner/repos/idd-forecast-mbp/src/idd_forecast_mbp/modeling/helper_functions.r")
2121
# Read in a parquet file
2222
# Read in a parquet file
23-
24-
malaria_mortality_threshold <- 1e-5
25-
2623
malaria_df <- arrow::read_parquet(file_path)
27-
lsae_hierarchy_df <- arrow::read_parquet(lsae_hierarchy_path)
28-
gbd_hierarchy_df <- arrow::read_parquet(gbd_hierarchy_path)
29-
30-
malaria_df$malaria_pfpr <- malaria_df$malaria_pfpr / malaria_df$population
31-
malaria_df$malaria_pf_mort_rate <- malaria_df$malaria_pf_mort_rate / malaria_df$population
32-
malaria_df$malaria_pf_inc_rate <- malaria_df$malaria_pf_inc_rate / malaria_df$population
33-
34-
35-
36-
A0_rows <- which(malaria_df$location_id == malaria_df$A0_location_id)
37-
38-
malaria_A0_df <- malaria_df[A0_rows,]
39-
malaria_A0_df <- malaria_A0_df[which(malaria_A0_df$malaria_pf_mort_rate > malaria_mortality_threshold),]
40-
keep_A0 <- unique(malaria_A0_df$location_id)
41-
42-
malaria_df <- malaria_df[which(malaria_df$A0_location_id %in% keep_A0),]
43-
44-
malaria_df <- malaria_df[which(malaria_df$level == 3),]
4524

4625
malaria_df$A0_af <- as.factor(as.character(paste("A0", malaria_df$A0_location_id, sep = "_")))
4726

48-
49-
toss <- which(is.na(malaria_df$malaria_pfpr))
50-
malaria_df <- malaria_df[-toss,]
51-
52-
malaria_df$malaria_pfpr <- 0.99 * malaria_df$malaria_pfpr
53-
54-
malaria_df$log_gdppc_mean <- log(malaria_df$gdppc_mean)
55-
malaria_df$log_ldipc_mean <- log(malaria_df$ldipc_mean)
56-
57-
malaria_df$mal_DAH_23_per_capita <- malaria_df$mal_DAH_23 / malaria_df$population
58-
malaria_df$log_mal_DAH_23_per_capita <- log(malaria_df$mal_DAH_23_per_capita + 1e-5)
59-
malaria_df$log_mal_DAH_23 <- log(malaria_df$mal_DAH_23 + 1e-5)
60-
61-
62-
malaria_df$elevation <- malaria_df$elevation / malaria_df$population
63-
64-
malaria_df <- center_by_location_df(malaria_df, "log_mal_DAH_23_per_capita")
65-
malaria_df <- center_by_location_df(malaria_df, "log_mal_DAH_23")
66-
malaria_df <- center_by_location_df(malaria_df, "malaria_suitability")
67-
malaria_df <- center_by_location_df(malaria_df, "log_gdppc_mean")
68-
malaria_df <- center_by_location_df(malaria_df, "log_ldipc_mean")
69-
malaria_df <- center_by_location_df(malaria_df, "log_ldipc_mean_A0_centered")
70-
malaria_df <- center_by_location_df(malaria_df, "elevation")
71-
malaria_df <- center_by_location_df(malaria_df, "total_precipitation")
72-
malaria_df <- center_by_location_df(malaria_df, "people_flood_days_per_capita")
73-
malaria_df <- center_by_location_df(malaria_df, "relative_humidity")
74-
malaria_df <- center_by_location_df(malaria_df, "precipitation_days")
75-
76-
malaria_pfpr_df <- malaria_df[which(malaria_df$malaria_pfpr > 0),]
77-
78-
malaria_pfpr_df$logit_malaria_pfpr <- log(malaria_pfpr_df$malaria_pfpr / (1 - malaria_pfpr_df$malaria_pfpr))
79-
8027
###
8128
#
8229
# Without A0_af, with A0_af, centered with A0_af
8330
#
8431
###
8532

86-
mod1 <- lm(logit_malaria_pfpr ~ malaria_suitability_A0_centered*log_ldipc_mean_A0_centered +log_mal_DAH_23_per_capita_A0_centered + total_precipitation_A0_centered +
87-
people_flood_days_per_capita_A0_centered + A0_af, data = malaria_pfpr_df)
33+
mod1 <- lm(logit_malaria_pfpr ~ malaria_suitability_A0_centered*log_ldipc_mean_A0_centered + log_mal_DAH_total_per_capita + total_precipitation_A0_centered +
34+
people_flood_days_per_capita_A0_centered + A0_af, data = malaria_df)
8835

89-
out <- summary(mod1)[[4]]
90-
cbind(out[which(rownames(out) %nlike% "A0_af"), 1:3], round(out[which(rownames(out) %nlike% "A0_af"), 4], 3))
91-
summary(mod1)$r.squared
9236

9337

9438

95-
# Run the three models
96-
results <- compare_three_models(
97-
df = malaria_pfpr_df,
98-
y_var = "logit_malaria_pfpr",
99-
covariate_names = Covariate_options[[10]],
100-
interaction_pairs = list(Covariate_options[[10]][1:2])
101-
)
102-
103-
results$summary_table[,c("Covariate", "Model2_Centered_Coef")]
104-
105-
# # Create the grid plot
106-
# plot <- plot_models_grid_base(
107-
# model_results = results,
108-
# interaction_pair = c("log_ldipc_mean_A0_centered", "malaria_suitability"),
109-
# df = malaria_pfpr_df,
110-
# y_var = "logit_malaria_pfpr",
111-
# gbd_hierarchy_path = gbd_hierarchy_path
112-
# )
113-
114-
# Create the grid plot
115-
plot <- plot_models_grid_base(
116-
model_results = results,
117-
interaction_pair = Covariate_options[[4]][1:2],
118-
df = malaria_pfpr_df,
119-
y_var = "logit_malaria_pfpr",
120-
transform_type = "invlogit",
121-
pretty_labels = list(
122-
original = "Logit pfpr",
123-
transformed = "pfpr"
124-
),
125-
color_ranges = list(
126-
original = c(-20, 5),
127-
transformed = c(0, 0.1)
128-
),
129-
gbd_hierarchy_path = gbd_hierarchy_path
130-
)
131-
13239

40+
out <- summary(mod1)[[4]]
41+
cbind(out[which(rownames(out) %nlike% "A0_af"), 1:3], round(out[which(rownames(out) %nlike% "A0_af"), 4], 3))
42+
summary(mod1)$r.squared
13343

134-
#
13544

13645

13746
Covariate_options <- list(
138-
c("malaria_suitability","log_gdppc_mean","elevation","log_mal_DAH_23_per_capita", "total_precipitation", "people_flood_days_per_capita"),
139-
c("malaria_suitability","log_gdppc_mean","elevation","log_mal_DAH_23_per_capita", "relative_humidity", "people_flood_days_per_capita"),
140-
c("malaria_suitability","log_gdppc_mean","elevation","log_mal_DAH_23_per_capita", "precipitation_days", "people_flood_days_per_capita"),
141-
c("malaria_suitability","log_ldipc_mean","elevation","log_mal_DAH_23_per_capita", "total_precipitation", "people_flood_days_per_capita"),
142-
c("malaria_suitability","log_ldipc_mean","elevation","log_mal_DAH_23_per_capita", "relative_humidity", "people_flood_days_per_capita"),
143-
c("malaria_suitability","log_ldipc_mean","elevation","log_mal_DAH_23_per_capita", "precipitation_days", "people_flood_days_per_capita"),
144-
c("malaria_suitability","log_gdppc_mean","log_mal_DAH_23_per_capita", "total_precipitation", "people_flood_days_per_capita"),
145-
c("malaria_suitability","log_gdppc_mean","log_mal_DAH_23_per_capita", "relative_humidity", "people_flood_days_per_capita"),
146-
c("malaria_suitability","log_gdppc_mean","log_mal_DAH_23_per_capita", "precipitation_days", "people_flood_days_per_capita"),
147-
c("malaria_suitability","log_ldipc_mean","log_mal_DAH_23_per_capita", "total_precipitation", "people_flood_days_per_capita"),
148-
c("malaria_suitability","log_ldipc_mean","log_mal_DAH_23_per_capita", "relative_humidity", "people_flood_days_per_capita"),
149-
c("malaria_suitability","log_ldipc_mean","log_mal_DAH_23_per_capita", "precipitation_days", "people_flood_days_per_capita")
47+
c("malaria_suitability","log_gdppc_mean","elevation","log_mal_DAH_total_per_capita", "total_precipitation", "people_flood_days_per_capita"),
48+
c("malaria_suitability","log_gdppc_mean","elevation","log_mal_DAH_total_per_capita", "relative_humidity", "people_flood_days_per_capita"),
49+
c("malaria_suitability","log_gdppc_mean","elevation","log_mal_DAH_total_per_capita", "precipitation_days", "people_flood_days_per_capita"),
50+
c("malaria_suitability","log_ldipc_mean","elevation","log_mal_DAH_total_per_capita", "total_precipitation", "people_flood_days_per_capita"),
51+
c("malaria_suitability","log_ldipc_mean","elevation","log_mal_DAH_total_per_capita", "relative_humidity", "people_flood_days_per_capita"),
52+
c("malaria_suitability","log_ldipc_mean","elevation","log_mal_DAH_total_per_capita", "precipitation_days", "people_flood_days_per_capita"),
53+
c("malaria_suitability","log_gdppc_mean","log_mal_DAH_total_per_capita", "total_precipitation", "people_flood_days_per_capita"),
54+
c("malaria_suitability","log_gdppc_mean","log_mal_DAH_total_per_capita", "relative_humidity", "people_flood_days_per_capita"),
55+
c("malaria_suitability","log_gdppc_mean","log_mal_DAH_total_per_capita", "precipitation_days", "people_flood_days_per_capita"),
56+
c("malaria_suitability","log_ldipc_mean","log_mal_DAH_total_per_capita", "total_precipitation", "people_flood_days_per_capita"),
57+
c("malaria_suitability","log_ldipc_mean","log_mal_DAH_total_per_capita", "relative_humidity", "people_flood_days_per_capita"),
58+
c("malaria_suitability","log_ldipc_mean","log_mal_DAH_total_per_capita", "precipitation_days", "people_flood_days_per_capita")
15059
)
15160

15261
All_Results <- vector("list", length = length(Covariate_options))
15362
for (c_num in seq_along(Covariate_options)){
15463
message("Running model ", c_num, " of ", length(Covariate_options))
15564
results <- compare_three_models(
156-
df = malaria_pfpr_df,
65+
df = malaria_df,
15766
y_var = "logit_malaria_pfpr",
15867
covariate_names = Covariate_options[[c_num]],
15968
interaction_pairs = list(Covariate_options[[c_num]][1:2])
@@ -176,7 +85,7 @@ Expectation_table <- data.frame(variable = c("malaria_suitability",
17685
"log_gdppc_mean",
17786
"log_ldipc_mean",
17887
"elevation",
179-
"log_mal_DAH_23_per_capita",
88+
"log_mal_DAH_total_per_capita",
18089
"total_precipitation",
18190
"relative_humidity",
18291
"precipitation_days",
@@ -219,6 +128,13 @@ parse_model <- function(results_table){
219128
tmp_direction <- summary_df$value_3 / abs(summary_df$value_3)
220129
summary_df$issues_3 <- ifelse(tmp_direction* summary_df$expectation < 0, "ISSUE", "")
221130
return(summary_df)
131+
# # if (any(summary_df$issues_3 == "ISSUE")){
132+
# #
133+
# # } else {
134+
# return(summary_df[,c("covariate", "value_3", "p_value_3", "issues_3")])
135+
# # }
136+
137+
222138
}
223139

224140
for (c_num in seq_along(All_Results)){
@@ -236,7 +152,7 @@ get_pchs <- function(vec){
236152
UCov <- setdiff(Expectation_table$variable, "R-squared")
237153
require(RColorBrewer)
238154
require(beeswarm, lib = "~/packages")
239-
COLS <- brewer.pal(6, "Set1")
155+
COLS <- brewer.pal(length(All_Results), "Paired")
240156
# COLS <- rep(1,12)
241157
par(mfrow=c(3,4))
242158
for (c_num in seq_along(UCov)){
@@ -271,7 +187,7 @@ for (c_num in seq_along(UCov)){
271187
}
272188
YLIM <- c(min(tmp_df$Model1_Coef, tmp_df$Model2_Coef, tmp_df$Model2_Centered_Coef), max(tmp_df$Model1_Coef, tmp_df$Model2_Coef, tmp_df$Model2_Centered_Coef))
273189

274-
# Pad YLIM a little
190+
275191
if (YLIM[1] < 0){
276192
YLIM[1] <- YLIM[1] - 0.05 * diff(YLIM)
277193
} else {
@@ -283,6 +199,14 @@ for (c_num in seq_along(UCov)){
283199
YLIM[2] <- YLIM[2] + 0.05 * diff(YLIM)
284200
}
285201

202+
if (diff(sign(YLIM)) == 0){
203+
if (sign(YLIM[1]) == -1){
204+
YLIM[2] <- abs(YLIM[1])*0.1
205+
} else {
206+
YLIM[1] <- -abs(YLIM[2])*0.1
207+
}
208+
}
209+
286210
plot(1, type = "n", xlim = c(0.5, 3.5),xaxs = "i", yaxs = "i", ylim = YLIM, ann = FALSE, axes = FALSE)
287211
box()
288212
axis(2, pretty(YLIM))
@@ -342,175 +266,7 @@ Centered", line = 1.1, tick = FALSE)
342266

343267

344268

345-
mod <- lm(logit_malaria_pfpr ~ log_mal_DAH_23_per_capita + total_precipitation +
346-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
347-
348-
out <- summary(mod)[[4]]
349-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
350-
summary(mod)$r.squared
351-
352-
353-
354-
mod <- lm(logit_malaria_pfpr ~ log_mal_DAH_23_per_capita + total_precipitation +
355-
log_gdppc_mean + malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
356-
357-
358-
out <- summary(mod)[[4]]
359-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
360-
summary(mod)$r.squared
361-
362-
363-
364-
mod <- lm(logit_malaria_pfpr ~ log_mal_DAH_23_per_capita + relative_humidity +
365-
log_ldipc_mean + malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
366-
367-
368-
out <- summary(mod)[[4]]
369-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
370-
summary(mod)$r.squared
371-
372-
373-
require(scam)
374-
375-
376-
mod <- scam(logit_malaria_pfpr ~ s(log_mal_DAH_23_per_capita, k = 4, bs = "mpi") + relative_humidity +
377-
log_ldipc_mean * malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
378-
summary(mod)
379-
plot(mod)
380-
381-
382-
383-
384-
385-
386-
387-
388-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + total_precipitation +
389-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
390-
391-
out <- summary(mod)[[4]]
392-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
393-
summary(mod)$r.squared
394-
395-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + precipitation_days +
396-
log_gdppc_mean + malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
397-
398-
out <- summary(mod)[[4]]
399-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
400-
summary(mod)$r.squared
401-
402-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + total_precipitation +
403-
log_gdppc_mean + malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
404-
405-
out <- summary(mod)[[4]]
406-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
407-
summary(mod)$r.squared
408-
409-
410-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + precipitation_days +
411-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_df)
412-
413-
out <- summary(mod)[[4]]
414-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
415-
summary(mod)$r.squared
416-
417-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + total_precipitation +
418-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_df)
419-
420-
out <- summary(mod)[[4]]
421-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
422-
summary(mod)$r.squared
423-
424-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + precipitation_days +
425-
log_gdppc_mean + malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_df)
426-
427-
out <- summary(mod)[[4]]
428-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
429-
summary(mod)$r.squared
430-
431-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + total_precipitation +
432-
log_gdppc_mean + malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_df)
433-
434-
out <- summary(mod)[[4]]
435-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
436-
summary(mod)$r.squared
437-
438-
439-
440-
441-
442-
443-
444-
445-
446-
447-
448-
449-
450-
451-
452-
mod <- gam(logit_malaria_pfpr ~ elevation + s(mal_DAH_23, k = 4) + precipitation_days +
453-
log_ldipc_mean + malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
454-
summary(mod)
455-
plot(mod)
456-
457-
458-
459-
460-
461-
462-
463-
464-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 +malaria_itn+
465-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita+ A0_af, data = malaria_pfpr_int_df)
466-
out <- summary(mod)[[4]]
467-
cbind(out[which(rownames(out) %nlike% "A0"), 1:3], round(out[which(rownames(out) %nlike% "A0"), 4], 3))
468-
469-
470-
471-
mod <- gam(logit_malaria_pfpr ~ elevation + s(mal_DAH_23, k = 4) +
472-
log_ldipc_mean * malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_int_df)
473-
summary(mod)
474-
plot(mod)
475-
476-
477-
478-
479-
mod <- gam(logit_malaria_pfpr ~ s(malaria_suitability), data = malaria_pfpr_df)
480-
481-
malaria_pfpr_int_df <- malaria_pfpr_df[which(malaria_pfpr_df$malaria_itn > 0 | malaria_pfpr_df$malaria_irs),]
482-
483-
summary(mod)
484-
485-
mod <- lm(logit_malaria_pfpr ~ elevation + malaria_itn + malaria_irs + mal_DAH_23 +
486-
log_gdppc_mean * malaria_suitability + relative_humidity + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
487-
488-
summary(mod)
489-
490-
mod <- lm(logit_malaria_pfpr ~ elevation + malaria_irs + mal_DAH_23 + total_precipitation +
491-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_int_df)
492-
summary(mod)
493-
494-
495-
496-
497-
mod <- gam(logit_malaria_pfpr ~ elevation + s(mal_DAH_23, k = 4) + total_precipitation +
498-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_int_df)
499-
summary(mod)
500-
501-
502-
mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + total_precipitation
503-
log_gdppc_mean * malaria_suitability + people_flood_days_per_capita + A0_af, data = malaria_pfpr_df)
504-
plot(mod)
505-
506-
# Summarize the model except for the A0 location
507-
# mod <- lm(logit_malaria_pfpr ~ elevation + mal_DAH_23 + total_precipitation
508-
# log_gdppc_mean * malaria_suitability + people_flood_days_per_capita, data = malaria_pfpr_df)
509-
# summary(mod)
510-
511269

512-
summary(mod)
513270

514-
cor(malaria_pfpr_df$total_precipitation, malaria_pfpr_df$people_flood_days_per_capita)
515271

516272

0 commit comments

Comments
 (0)