Commit d8dc9a7
Add xarray utility functions for NetCDF handling and DataFrame conversion
- Implemented functions to read and write NetCDF files with integer ID validation and retry logic.
- Added utility functions for sorting ID coordinates and ensuring they are integers.
- Created a function to convert pandas DataFrames to xarray Datasets with optimized dtypes and validation.
- Introduced filtering functions for xarray Datasets based on multiple coordinates and ranges.
- Added YAML loading functions to parse covariate configurations from a YAML file.
- Included example usage and preset configurations for common dimension setups.
- Created a test file for future parquet fixes.
1 parent d599b0d commit d8dc9a7
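
The xarray utility module itself is not among the hunks shown below, so as a rough sketch of the retry-plus-validation behavior the message describes (the function name comes from the diff; the retry count, backoff, and ID-coordinate list are assumptions, not the committed implementation):

    import time
    import xarray as xr

    # Hypothetical sketch -- not the committed implementation. The commit
    # message promises integer ID validation and retry logic, and the
    # STALE_FILE_HANDLE_FIX.md file below suggests the retries target
    # transient filesystem errors.
    ID_COORDS = ("location_id", "year_id", "age_group_id", "sex_id")

    def read_netcdf_with_integer_ids(path, retries=3, delay=5.0):
        for attempt in range(retries):
            try:
                ds = xr.open_dataset(path)
                break
            except OSError:  # e.g. a stale NFS file handle
                if attempt == retries - 1:
                    raise
                time.sleep(delay)
        # Cast ID coordinates to int64 so downstream joins stay integer-typed
        for coord in ID_COORDS:
            if coord in ds.coords:
                ds = ds.assign_coords({coord: ds[coord].astype("int64")})
        return ds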

File tree: 49 files changed, +6345 -762 lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -77,4 +77,5 @@ venv.bak/
 
 # IDEs
 .idea/
-notebooks/
+notebooks/
+.Rproj.user

STALE_FILE_HANDLE_FIX.md

Whitespace-only changes.

idd-forecast-mbp.Rproj

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX

src/idd_forecast_mbp/02_data_prep/02_as_fhs_and_full_population.py

Lines changed: 26 additions & 8 deletions
@@ -14,7 +14,8 @@
 import sys
 import xarray as xr
 from idd_forecast_mbp import constants as rfc
-from idd_forecast_mbp.helper_functions import read_parquet_with_integer_ids, write_parquet
+from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet
+from idd_forecast_mbp.xarray_functions import read_netcdf_with_integer_ids, write_netcdf, convert_with_preset
 
 age_type_map = {
     "all_age": {
@@ -39,10 +40,15 @@
 fhs_hierarchy_df_path = f"{GBD_DATA_PATH}/fhs_2023_modeling_hierarchy.parquet"
 
 gbd_population_path = f"{GBD_DATA_PATH}/gbd_2023_population.parquet"
-aa_full_population_df_path = f"{PROCESSED_DATA_PATH}/aa_2023_full_population.parquet"
-as_full_population_df_path = f"{PROCESSED_DATA_PATH}/as_2023_full_population.parquet"
-aa_fhs_population_path = f"{PROCESSED_DATA_PATH}/aa_2023_fhs_population.parquet"
-as_fhs_population_path = f"{PROCESSED_DATA_PATH}/as_2023_fhs_population.parquet"
+aa_full_population_df_path = f"{PROCESSED_DATA_PATH}/aa_2023_full_population_df.parquet"
+as_full_population_df_path = f"{PROCESSED_DATA_PATH}/as_2023_full_population_df.parquet"
+aa_full_population_ds_path = f"{PROCESSED_DATA_PATH}/aa_2023_full_population_ds.nc"
+as_full_population_ds_path = f"{PROCESSED_DATA_PATH}/as_2023_full_population_ds.nc"
+
+aa_fhs_population_df_path = f"{PROCESSED_DATA_PATH}/aa_2023_fhs_population_df.parquet"
+as_fhs_population_df_path = f"{PROCESSED_DATA_PATH}/as_2023_fhs_population_df.parquet"
+aa_fhs_population_ds_path = f"{PROCESSED_DATA_PATH}/aa_2023_fhs_population_ds.nc"
+as_fhs_population_ds_path = f"{PROCESSED_DATA_PATH}/as_2023_fhs_population_ds.nc"
 
 missing_level_4_location_path = f"{PROCESSED_DATA_PATH}/missing_level_4_location_ids.parquet"
 missing_level_5_location_path = f"{PROCESSED_DATA_PATH}/missing_level5_location_ids.parquet"
@@ -155,8 +161,14 @@
 
 
 # Write to parquet
-write_parquet(aa_fhs_population_df, aa_fhs_population_path)
-write_parquet(as_fhs_population_df, as_fhs_population_path)
+write_parquet(aa_fhs_population_df, aa_fhs_population_df_path)
+write_parquet(as_fhs_population_df, as_fhs_population_df_path)
+
+aa_fhs_population_ds = convert_with_preset(aa_fhs_population_df, preset='aa_variables')
+write_netcdf(aa_fhs_population_ds, aa_fhs_population_ds_path)
+as_fhs_population_ds = convert_with_preset(as_fhs_population_df, preset='as_variables')
+write_netcdf(as_fhs_population_ds, as_fhs_population_ds_path)
+
 
 ###----------------------------------------------------------###
 ### 3. Base Population Data Loading
@@ -423,6 +435,9 @@
 
 # Step 5: Finalize the aa_full_population_df
 write_parquet(aa_full_population_df, aa_full_population_df_path)
+# Convert to xarray dataset and write to netCDF
+aa_full_population_ds = convert_with_preset(aa_full_population_df, preset='aa_variables')
+write_netcdf(aa_full_population_ds, aa_full_population_ds_path)
 
 ###----------------------------------------------------------###
 ### 5. Age Metadata Processing
@@ -539,4 +554,7 @@
 # As a last step, replace all age-sex population
 
 # Write the final DataFrame to a parquet file
-write_parquet(as_full_population_df, as_full_population_df_path)
+write_parquet(as_full_population_df, as_full_population_df_path)
+# Convert to xarray dataset and write to netCDF
+as_full_population_ds = convert_with_preset(as_full_population_df, preset='as_variables')
+write_netcdf(as_full_population_ds, as_full_population_ds_path)
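
convert_with_preset is called above with the presets 'aa_variables' and 'as_variables' but its body is not visible in this commit view. A minimal sketch of the likely shape, assuming the presets map onto the all-age and age-sex merge variables defined elsewhere in the repo:

    import pandas as pd

    # Hypothetical sketch of convert_with_preset; the preset names come from
    # the calls above, the dimension lists from aa/as_merge_variables.
    PRESET_DIMS = {
        "aa_variables": ["location_id", "year_id"],
        "as_variables": ["location_id", "year_id", "age_group_id", "sex_id"],
    }

    def convert_with_preset(df, preset):
        dims = PRESET_DIMS[preset]
        out = df.copy()
        for col in dims:
            out[col] = out[col].astype("int64")  # keep ID columns integer
        # The index dimensions become coordinates of the resulting Dataset
        return out.set_index(dims).to_xarray()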

src/idd_forecast_mbp/02_data_prep/03_rake_aa_A2_to_GBD.py

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,8 @@
 from datetime import datetime
 from rra_tools.shell_tools import mkdir # type: ignore
 from idd_forecast_mbp import constants as rfc
-from idd_forecast_mbp.helper_functions import read_parquet_with_integer_ids, write_parquet, check_column_for_problematic_values
+from idd_forecast_mbp.helper_functions import check_column_for_problematic_values
+from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet
 from idd_forecast_mbp.cause_processing_functions import format_aa_gbd_df, process_lsae_df
 from idd_forecast_mbp.rake_and_aggregate_functions import rake_aa_count_lsae_to_gbd, make_aa_full_rate_df_from_aa_count_df, check_concordance, aggregate_aa_rate_lsae_to_gbd

src/idd_forecast_mbp/02_data_prep/04_rake_as_A2_to_GBD.py

Lines changed: 12 additions & 5 deletions
@@ -3,8 +3,8 @@
 from pathlib import Path
 from rra_tools.shell_tools import mkdir # type: ignore
 from idd_forecast_mbp import constants as rfc
-from idd_forecast_mbp.helper_functions import read_parquet_with_integer_ids, write_parquet, level_filter
-
+from idd_forecast_mbp.helper_functions import level_filter
+from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet
 
 PROCESSED_DATA_PATH = rfc.PROCESSED_DATA_PATH
 FORECASTING_DATA_PATH = rfc.FORECASTING_DATA_PATH
@@ -68,13 +68,16 @@
 gbd_columns_to_read = ["location_id", "year_id", "age_group_id", "sex_id", 'population', "val"]
 full_df_columns_to_read = ["location_id", "year_id", 'population', outcome_count]
 # Get the most detailed gbd data
+print("Reading most-detailed GBD data")
+print(f"Reading {as_gbd_cause_df_path}")
 as_md_gbd_df = read_parquet_with_integer_ids(as_gbd_cause_df_path,
     columns=gbd_columns_to_read,
     filters=[year_filter, level_filter(hierarchy_df, start_level = 3, end_level = 5), measure_filter,
              metric_filter, age_filter, sex_filter]).rename(columns={'val': outcome_count})
 
 gbd_location_ids = as_md_gbd_df['location_id'].unique().tolist()
 gbd_location_filter = ('location_id', 'in', gbd_location_ids)
+print(f"Reading {aa_full_cause_df_path}")
 aa_md_gbd_df = read_parquet_with_integer_ids(aa_full_cause_df_path,
     columns=full_df_columns_to_read,
     filters=[year_filter, gbd_location_filter]).rename(columns={
@@ -83,6 +86,7 @@
 
 as_md_gbd_df = as_md_gbd_df.merge(aa_md_gbd_df, on=['location_id', 'year_id'], how='left').copy()
 
+print("Calculating most-detailed GBD rates and rate ratios")
 as_md_gbd_df['aa_' + outcome_rate] = as_md_gbd_df['aa_' + outcome_count] / as_md_gbd_df['aa_population']
 as_md_gbd_df[outcome_rate] = as_md_gbd_df[outcome_count] / as_md_gbd_df['population']
 as_md_gbd_df['rate_ratio'] = as_md_gbd_df[outcome_rate] / as_md_gbd_df['aa_' + outcome_rate]
@@ -106,21 +110,24 @@
 )
 
 # Rename the gbd dataframe columns
-gbd_outcome_columns = [col for col in as_md_gbd_df.columns if measure_short in col or 'ratio' in col] + ['location_id']
+gbd_outcome_columns = [col for col in as_md_gbd_df.columns if measure_short in col or 'ratio' in col] + ['location_id','aa_population', 'population']
 rename_dict = {col: 'gbd_' + col for col in gbd_outcome_columns}
-as_md_gbd_df = as_md_gbd_df.rename(columns=rename_dict).drop(columns=['aa_population', 'population'])
+as_md_gbd_df = as_md_gbd_df.rename(columns=rename_dict)
 
+print("Starting subnational merge")
 # Merge
 as_subnat_df = as_subnat_df.merge(
     as_md_gbd_df,
     how='left',
     on=['gbd_location_id', 'year_id', 'age_group_id', 'sex_id']
 )
-
+
+
 as_subnat_df['aa_' + outcome_rate] = as_subnat_df['aa_' + outcome_count] / as_subnat_df['aa_population']
 as_subnat_df[outcome_rate] = as_subnat_df['gbd_rate_ratio'] * as_subnat_df['aa_' + outcome_rate]
 as_subnat_df[outcome_count] = as_subnat_df[outcome_rate] * as_subnat_df['population']
 #
+print(as_subnat_df[as_subnat_df[outcome_rate] == as_subnat_df[outcome_rate].max()])
 drop_cols = [col for col in as_subnat_df.columns if 'gbd_' in col or 'rate_ratio' in col]
 as_subnat_df = as_subnat_df.drop(columns=drop_cols)
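
The raking arithmetic above is compact; a toy numeric walk-through of the rate-ratio step (all values invented) may help:

    # Toy illustration of the rate-ratio raking above; numbers are invented.
    gbd_as_rate = 0.004          # GBD age-sex-specific rate at a location
    gbd_aa_rate = 0.002          # GBD all-age rate at the same location
    rate_ratio = gbd_as_rate / gbd_aa_rate          # 2.0

    subnat_aa_rate = 0.0015      # subnational all-age rate
    subnat_population = 10_000
    subnat_as_rate = rate_ratio * subnat_aa_rate          # 0.003
    subnat_as_count = subnat_as_rate * subnat_population  # 30.0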

src/idd_forecast_mbp/02_data_prep/05_malaria_modeling_dataframe.py

Lines changed: 17 additions & 7 deletions
@@ -12,7 +12,8 @@
 import os
 import sys
 from idd_forecast_mbp import constants as rfc
-from idd_forecast_mbp.helper_functions import read_parquet_with_integer_ids, merge_dataframes, read_income_paths, read_urban_paths, write_parquet, level_filter
+from idd_forecast_mbp.helper_functions import merge_dataframes, read_income_paths, read_urban_paths, level_filter
+from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet
 import glob
 
 malaria_mortality_threshold = 1
@@ -97,8 +98,8 @@
 age_group_ids = age_sex_df['age_group_id'].unique().tolist()
 age_filter = ('age_group_id', 'in', age_group_ids)
 
-aa_merge_variables = ["location_id", "year_id"]
-as_merge_variables = ["location_id", "year_id", "age_group_id", "sex_id"]
+aa_merge_variables = rfc.aa_merge_variables
+as_merge_variables = rfc.as_merge_variables
 
 ###----------------------------------------------------------###
 ### 4. Data Loading and Integration
@@ -113,7 +114,9 @@
     filters=[year_filter, level_filter(hierarchy_df, start_level = 3, end_level = 5)])
 # Read and merge development assistance data
 dah_df = read_parquet_with_integer_ids(dah_df_path)
-malaria_df = pd.merge(malaria_df, dah_df, on=["location_id", "year_id"], how="left")
+malaria_df = pd.merge(malaria_df,
+                      dah_df[aa_merge_variables + ['mal_DAH_total','mal_DAH_total_per_capita']],
+                      on = aa_merge_variables, how="left")
 
 # Load and merge urbanization metrics
 urban_dfs = read_urban_paths(urban_paths, VARIABLE_DATA_PATH)
@@ -173,12 +176,18 @@
 malaria_stage_2_df = malaria_stage_2_df.copy()
 
 # Select countries with significant malaria burden (mortality > threshold)
-A0_malaria_stage_2_df = malaria_stage_2_df[(malaria_stage_2_df["location_id"] == malaria_stage_2_df["A0_location_id"]) & (malaria_stage_2_df["year_id"] == 2022)]
-A0_malaria_stage_2_df = A0_malaria_stage_2_df[A0_malaria_stage_2_df["malaria_mort_count"] >= malaria_mortality_threshold]
-phase_2_A0_location_ids = A0_malaria_stage_2_df["location_id"].unique()
+A0_malaria_stage_2_df = malaria_stage_2_df[(malaria_stage_2_df["location_id"] == malaria_stage_2_df["A0_location_id"]) & (malaria_stage_2_df["year_id"] == 2022)].copy()
+A0_malaria_stage_2_df = A0_malaria_stage_2_df.rename(columns={
+    "malaria_pfpr": "A0_malaria_pfpr",
+    "malaria_mort_count": "A0_malaria_mort_count",
+    "malaria_inc_count": "A0_malaria_inc_count"})
+
+A0_malaria_stage_2_df = A0_malaria_stage_2_df[A0_malaria_stage_2_df["A0_malaria_mort_count"] >= malaria_mortality_threshold]
+phase_2_A0_location_ids = A0_malaria_stage_2_df["A0_location_id"].unique()
 
 # Subset to high-burden countries and most detailed geographic units
 malaria_stage_2_df = malaria_stage_2_df[malaria_stage_2_df["A0_location_id"].isin(phase_2_A0_location_ids)]
+malaria_stage_2_df = malaria_stage_2_df.merge(A0_malaria_stage_2_df[["A0_location_id", "A0_malaria_pfpr", "A0_malaria_mort_count", "A0_malaria_inc_count"]], on=["A0_location_id"], how="left")
 
 
 ###----------------------------------------------------------###
 ### 7. Feature Engineering and Transformations
@@ -227,6 +236,7 @@
     'urban_100m_threshold_300', 'urban_1km_threshold_1500', 'urban_100m_threshold_1500', 'gdppc_mean',
     'total_precipitation', 'relative_humidity', 'mean_temperature',
     'mean_high_temperature', 'malaria_suitability', 'people_flood_days_per_capita', 'A0_location_id',
+    "A0_malaria_pfpr", "A0_malaria_mort_count", "A0_malaria_inc_count",
    'A0_af', 'log_mal_DAH_total_per_capita', 'log_gdppc_mean', 'logit_urban_1km_threshold_300',
    'logit_urban_100m_threshold_300', 'logit_urban_1km_threshold_1500', 'logit_urban_100m_threshold_1500', 'logit_malaria_pfpr']
 malaria_stage_3_df = malaria_stage_2_df[stage_2_df_columns_to_keep].copy()
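
The new merge broadcasts 2022 country-level (A0) burden values onto every subnational row so they can serve as covariates; a self-contained toy version of the pattern (data invented):

    import pandas as pd

    # Toy version of the A0 broadcast above; the data are invented.
    df = pd.DataFrame({
        "location_id":    [10, 101, 102],   # 10 is the country (A0) row
        "A0_location_id": [10, 10, 10],
        "malaria_pfpr":   [0.20, 0.25, 0.15],
    })
    a0 = (df[df["location_id"] == df["A0_location_id"]]
            .rename(columns={"malaria_pfpr": "A0_malaria_pfpr"}))
    df = df.merge(a0[["A0_location_id", "A0_malaria_pfpr"]],
                  on="A0_location_id", how="left")
    # Every row, including subnationals 101 and 102, now carries A0_malaria_pfpr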

src/idd_forecast_mbp/02_data_prep/06_dengue_modeling_dataframe.py

Lines changed: 16 additions & 6 deletions
@@ -12,7 +12,8 @@
 import os
 import sys
 from idd_forecast_mbp import constants as rfc
-from idd_forecast_mbp.helper_functions import read_parquet_with_integer_ids, merge_dataframes, read_income_paths, read_urban_paths, write_parquet, level_filter
+from idd_forecast_mbp.helper_functions import merge_dataframes, read_income_paths, read_urban_paths, level_filter
+from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet
 import glob
 
 dengue_mortality_theshold = 1
@@ -101,7 +102,7 @@
 ### including urbanization metrics, income data,
 ### and climate variables from different sources.
 ###----------------------------------------------------------###
-# Load core malaria data
+# Load core dengue data
 dengue_df = read_parquet_with_integer_ids(aa_full_cause_df_path_template,
     filters=[year_filter, level_filter(hierarchy_df, start_level = 3, end_level = 5)])
 
@@ -170,20 +171,28 @@
     clipped_values = dengue_df[col].clip(lower=0.001, upper=0.999)
     dengue_df[f"logit_{col}"] = np.log(clipped_values / (1 - clipped_values))
 
+aa_A0_dengue_df = dengue_df[(dengue_df["location_id"] == dengue_df["A0_location_id"]) & (dengue_df["year_id"] == 2022)].copy()
+aa_A0_dengue_df = aa_A0_dengue_df[aa_A0_dengue_df['aa_dengue_mort_count'] > 1].copy()
+A0_dengue_ids = aa_A0_dengue_df['A0_location_id'].unique()
+
+
 # Make the yn variable. This will be used as the response in the phase 1 model and to trim the data in the phase 2 model
 # dengue_df$yn[which(dengue_df$dengue_mort_rate > dengue_mortality_rate_theshold & dengue_df$dengue_mort_count > dengue_mortality_theshold & dengue_df$dengue_suitability > 0)] <- 1
 dengue_df["yn"] = 0
 dengue_df.loc[
-    (dengue_df["aa_dengue_mort_rate"] > dengue_mortality_rate_theshold) &
-    (dengue_df["aa_dengue_mort_count"] > dengue_mortality_theshold) &
+    (dengue_df["aa_dengue_mort_rate"] > 1/100000) &
+    (dengue_df["aa_dengue_mort_count"] > 0) &
     (dengue_df["aa_dengue_inc_count"] > 0) &
    (dengue_df["dengue_suitability"] > 0),
    "yn"
 ] = 1
 
-
 write_parquet(dengue_df, aa_ge3_dengue_stage_1_modeling_df_path)
 
+dengue_df = dengue_df[dengue_df['A0_location_id'].isin(A0_dengue_ids)].copy()
+
+
+
 ###----------------------------------------------------------###
 ### 8. Final Modeling Dataset Preparation
 ### Prepares the final dataset for modeling by selecting relevant columns,
@@ -198,7 +207,7 @@
 dengue_stage_2_df["A0_location_id"] = dengue_stage_2_df["A0_location_id"].astype(int)
 dengue_stage_2_df['A0_af'] = 'A0_' + dengue_stage_2_df['A0_location_id'].astype(str)
 dengue_stage_2_df['A0_af'] = dengue_stage_2_df['A0_af'].astype('category')
-dengue_stage_2_df = dengue_stage_2_df.drop(columns=['aa_dengue_inc_count', 'aa_dengue_inc_rate', 'aa_dengue_mort_count', 'aa_dengue_mort_rate', 'aa_dengue_cfr'])
+# dengue_stage_2_df = dengue_stage_2_df.drop(columns=['aa_dengue_inc_count', 'aa_dengue_inc_rate', 'aa_dengue_mort_count', 'aa_dengue_mort_rate', 'aa_dengue_cfr'])
 # Get the as data
 md_location_ids = dengue_stage_2_df["location_id"].unique().tolist()
 md_location_filter = ('location_id', 'in', md_location_ids)
@@ -207,6 +216,7 @@
     columns=as_merge_variables + ["dengue_mort_rate","dengue_inc_rate","dengue_mort_count","dengue_inc_count","population","aa_population"],
     filters=[year_filter, md_location_filter, age_filter, sex_filter])
 
+
 as_md_df["dengue_cfr"] = as_md_df["dengue_mort_rate"] / as_md_df["dengue_inc_rate"]
 as_md_df.loc[as_md_df["dengue_inc_rate"] == 0, "dengue_cfr"] = 0
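
The clip-then-logit transform retained in this file (clip to [0.001, 0.999], then log(p / (1 - p))) is what keeps logits finite at proportions of exactly 0 or 1; isolated here for clarity, with the bounds taken from the diff:

    import numpy as np

    def safe_logit(p, eps=0.001):
        # Clip away from 0 and 1 so the logit stays finite (bounds per the diff)
        p = np.clip(p, eps, 1 - eps)
        return np.log(p / (1 - p))

    safe_logit(np.array([0.0, 0.5, 1.0]))  # array([-6.9068, 0., 6.9068])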

src/idd_forecast_mbp/02_data_prep/07_forecasted_dataframes_non_draw_part.py

Lines changed: 7 additions & 4 deletions
@@ -11,7 +11,8 @@
 import numpy as np
 
 from idd_forecast_mbp import constants as rfc
-from idd_forecast_mbp.helper_functions import ensure_id_columns_are_integers, read_parquet_with_integer_ids, read_income_paths, merge_dataframes, write_parquet, read_urban_paths
+from idd_forecast_mbp.helper_functions import read_income_paths, merge_dataframes, read_urban_paths
+from idd_forecast_mbp.parquet_functions import read_parquet_with_integer_ids, write_parquet, ensure_id_columns_are_integers, sort_id_columns
 
 
 hierarchy = "lsae_1209"
@@ -31,6 +32,9 @@
 hierarchy_df_path = f'{PROCESSED_DATA_PATH}/full_hierarchy_lsae_1209.parquet'
 hierarchy_df = read_parquet_with_integer_ids(hierarchy_df_path)
 
+md_location_ids = hierarchy_df[hierarchy_df['level'] == 5]['location_id'].unique().tolist()
+md_location_filter = ('location_id', 'in', md_location_ids)
+
 aa_full_population_df_path = f"{PROCESSED_DATA_PATH}/aa_2023_full_population.parquet"
 aa_full_population_df = read_parquet_with_integer_ids(aa_full_population_df_path)
 aa_merge_variables = rfc.aa_merge_variables
@@ -61,7 +65,7 @@
 flooding_df_path = flooding_df_path_template.format(ssp_scenario=ssp_scenario)
 
 forecast_df = read_parquet_with_integer_ids(flooding_df_path,
-    filters = [year_filter])
+    filters = [year_filter, md_location_filter])
 forecast_df = forecast_df.drop(
     columns=["model", "variant", 'population']
 )
@@ -87,15 +91,14 @@
 
 # Merge in the hierarchy_df
 forecast_df = forecast_df.merge(
-    hierarchy_df[['location_id', 'A0_location_id', 'most_detailed_lsae']],
+    hierarchy_df[['location_id', 'A0_location_id']],
     how="left",
     left_on="location_id",
     right_on="location_id"
 )
 
 # Drop rows where A0_location_id is NaN
 forecast_df = forecast_df.dropna(subset=["A0_location_id"])
-forecast_df = forecast_df[forecast_df["most_detailed_lsae"] == 1]
 
 forecast_df = ensure_id_columns_are_integers(forecast_df)
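
The md_location_filter tuple added here follows the pandas/pyarrow predicate-pushdown format, so most-detailed locations are selected at read time instead of after loading the full file. A standalone example of the same pattern (path and ID values assumed):

    import pandas as pd

    # Standalone example of tuple-style parquet filters; path and IDs assumed.
    year_filter = ("year_id", ">=", 2022)
    md_location_filter = ("location_id", "in", [101, 102, 103])

    df = pd.read_parquet(
        "flooding_forecast.parquet",
        engine="pyarrow",
        filters=[year_filter, md_location_filter],  # pushed down to pyarrow
    )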
