Merge pull request #6 from ihmeuw/feature/rmbarber/v7-height-update

rmbarber · web-flow · commit 4c32f0acb6eb · 2025-06-24T14:22:53.000-05:00
Feature/rmbarber/v7 height update
diff --git a/.gitignore b/.gitignore
@@ -29,6 +29,9 @@ MANIFEST
 pip-log.txt
 pip-delete-this-directory.txt
 
+# Lightning model logs
+*lightning_logs/
+
 # Unit test / coverage reports
 htmlcov/
 .tox/
diff --git a/specifications/model_specification.yaml b/specifications/model_specification.yaml
@@ -1,14 +1,13 @@
 resolution: "100"
 split: 0
-denominator: "ghsl_residential_volume"
+denominator: "microsoft_v7_residential_volume"
 features:
-  - "log_nighttime_lights"
-  - "ghsl_residential_volume_250m"
-  - "ghsl_residential_volume_500m"
-  - "ghsl_residential_volume_1000m"
-  - "ghsl_residential_volume_2500m"
-  - "ghsl_residential_volume_5000m"
-  - "ghsl_residential_volume_10000m"
+  # - "microsoft_v7_residential_volume_250m"
+  # - "microsoft_v7_residential_volume_500m"
+  # - "microsoft_v7_residential_volume_1000m"
+  # - "microsoft_v7_residential_volume_2500m"
+  # - "microsoft_v7_residential_volume_5000m"
+  # - "microsoft_v7_residential_volume_10000m"
 training_target: "admin_log_occupancy_rate"
 loss_target: "admin_log_occupancy_rate"
 loss_metric: "mse"
diff --git a/src/rra_population_model/constants.py b/src/rra_population_model/constants.py
@@ -27,7 +27,7 @@ def to_list(cls) -> list[str]:
 
 class BuiltVersion(BaseModel):
     provider: Literal["ghsl", "microsoft"]
-    version: Literal["v6", "r2023a"]
+    version: Literal["v6", "v7", "r2023a"]
     time_points: list[str]
     measures: list[str]
 
@@ -69,6 +69,20 @@ def time_points_float(self) -> list[float]:
         ][1:],
         measures=["density"],
     ),
+    "microsoft_v7": BuiltVersion(
+        provider="microsoft",
+        version="v7",
+        # time_points=[
+        #     f"{y}q{q}" for y, q in itertools.product(range(2020, 2025), range(1, 5))
+        # ][1:-2],
+        time_points=[
+            f"{y}q{q}" for y, q in itertools.product(range(2020, 2024), range(1, 5))
+        ][1:],
+        measures=[
+            "density",
+            "height",
+        ],
+    ),
 }
 
 DENOMINATORS = []
@@ -93,7 +107,7 @@ def time_points_float(self) -> list[float]:
 
 ALL_TIME_POINTS = sorted(
     set.union(*[set(v.time_points) for v in BUILT_VERSIONS.values()])
-    | {f"{y}q1" for y in range(1975, 2025)}
+    | {f"{y}q1" for y in range(1975, 2026)}
 )
 
 
diff --git a/src/rra_population_model/data.py b/src/rra_population_model/data.py
@@ -153,22 +153,24 @@ def get_shapefile_dir(
         iso3: str,
         year: str | int,
         state: str | None = None,
+        purpose: str = 'training',
     ) -> Path:
-        shapefile_dir = self.shapefiles / iso3 / str(year)
+        shapefile_dir = self.shapefiles / purpose / iso3 / str(year)
         if state:
             return shapefile_dir / state
         return shapefile_dir
 
-    def list_shapefile_years(self) -> list[tuple[str, str]]:
+    def list_shapefile_years(self, purpose: str = 'training') -> list[tuple[str, str]]:
         """List all available shapefile years by country."""
-        return self._list_years(self.shapefiles)
+        return self._list_years(self.shapefiles / purpose)
 
     def load_shapefile(
         self,
         admin_level: int,
         iso3: str,
         year: str | int,
         state: str | None = None,
+        purpose: str = 'training',
     ) -> gpd.GeoDataFrame:
         """Load administrative boundary data from a shapefile.
 
@@ -182,20 +184,22 @@ def load_shapefile(
             The year represented by the shapefile boundaries.
         state
             State or province name. Optional.
+        purpose
+            Shapefile subdirectory to read: 'training' (default) or 'raking'.
 
         Returns
         -------
         gpd.GeoDataFrame
             Administrative boundary data.
         """
-        shape_root = self.get_shapefile_dir(iso3, year, state)
+        shape_root = self.get_shapefile_dir(iso3, year, state, purpose=purpose)
         path = shape_root / f"admin{admin_level}.parquet"
         gdf = gpd.read_parquet(path)
         return gdf
 
-    def list_admin_levels(self, iso3: str, year: str | int) -> list[int]:
+    def list_admin_levels(self, iso3: str, year: str | int, purpose: str = 'training') -> list[int]:
         """List all available administrative levels for a country and year."""
-        shapefile_dir = self.get_shapefile_dir(iso3, year)
+        shapefile_dir = self.get_shapefile_dir(iso3, year, purpose=purpose)
         admin_levels = [
             admin_level
             for admin_level in range(10)
diff --git a/src/rra_population_model/model/inference/runner.py b/src/rra_population_model/model/inference/runner.py
@@ -136,11 +136,12 @@ def inference(
         runner="pmtask model",
         task_name="inference",
         node_args={
+            # "version": [f"2025_06_21.00{x}" for x in range(1, 5)],
             "time-point": time_points,
         },
         task_args={
-            "resolution": resolution,
             "version": version,
+            "resolution": resolution,
             "output-dir": output_dir,
         },
         task_resources={
diff --git a/src/rra_population_model/model_prep/features/built.py b/src/rra_population_model/model_prep/features/built.py
@@ -132,8 +132,8 @@ def generate_derived_measures(
         if self.built_version.name == "ghsl_r2023a":
             # No derived measures for GHSL
             return {}
-        elif self.built_version.name == "microsoft_v6":
-            return _generate_microsoft_derived_measures(pm_data, self.feature_metadata)
+        elif self.built_version.name in ["microsoft_v6", "microsoft_v7"]:
+            return _generate_microsoft_derived_measures(pm_data, self.feature_metadata, self.built_version.name)
         else:
             msg = f"Unknown built version: {self.built_version.name}"
             raise ValueError(msg)
@@ -256,18 +256,31 @@ def link_features(
 def _generate_microsoft_derived_measures(
     pm_data: PopulationModelData,
     feature_metadata: FeatureMetadata,
+    built_version_name: str,
 ) -> dict[str, Path]:
+    feature_dict = {
+        "microsoft_v6": {
+            "density": "microsoft_v6_density",
+            "height": "ghsl_r2023a_height",
+            "p_residential": "ghsl_r2023a_proportion_residential",
+        },
+        "microsoft_v7": {
+            "density": "microsoft_v7_density",
+            "height": "microsoft_v7_height",
+            "p_residential": "ghsl_r2023a_proportion_residential",
+        },
+    }[built_version_name]
     density = pm_data.load_feature(
-        feature_name="microsoft_v6_density",
+        feature_name=feature_dict["density"],
         **feature_metadata.shared_kwargs,
     )
     density_arr = density._ndarray  # noqa: SLF001
     height_arr = pm_data.load_feature(  # noqa: SLF001
-        feature_name="ghsl_r2023a_height",
+        feature_name=feature_dict["height"],
         **feature_metadata.shared_kwargs,
     )._ndarray
     p_residential_arr = pm_data.load_feature(  # noqa: SLF001
-        feature_name="ghsl_r2023a_proportion_residential",
+        feature_name=feature_dict["p_residential"],
         **feature_metadata.shared_kwargs,
     )._ndarray
 
@@ -298,13 +311,13 @@ def _generate_microsoft_derived_measures(
         )
         pm_data.save_feature(
             out,
-            feature_name=f"microsoft_v6_{measure}",
+            feature_name=f"{built_version_name}_{measure}",
             **feature_metadata.shared_kwargs,
         )
 
     out_paths = {
-        f"microsoft_v6_{m}": pm_data.feature_path(
-            feature_name=f"microsoft_v6_{m}",
+        f"{built_version_name}_{m}": pm_data.feature_path(
+            feature_name=f"{built_version_name}_{m}",
             **feature_metadata.shared_kwargs,
         )
         for m in out_ops
diff --git a/src/rra_population_model/model_prep/features/runner.py b/src/rra_population_model/model_prep/features/runner.py
@@ -17,8 +17,8 @@
 
 # GHSL first, as we need the height and residential mask for msft
 BUILT_VERSIONS = [
-    pmc.BUILT_VERSIONS["ghsl_r2023a"],
-    pmc.BUILT_VERSIONS["microsoft_v6"],
+    # pmc.BUILT_VERSIONS["ghsl_r2023a"],
+    pmc.BUILT_VERSIONS["microsoft_v7"],
 ]
 
 
diff --git a/src/rra_population_model/postprocess/rake/runner.py b/src/rra_population_model/postprocess/rake/runner.py
@@ -105,6 +105,7 @@ def rake(
     rf_time_points = pm_data.list_raking_factor_time_points(resolution, version)
 
     time_points = clio.convert_choice(time_point, rf_time_points)
+    # time_points = sorted([time_point for time_point in time_points if time_point.startswith('202')])
 
     model_frame = pm_data.load_modeling_frame(resolution)
     block_keys = model_frame.block_key.unique().tolist()
@@ -121,11 +122,12 @@ def rake(
             "project": "proj_rapidresponse",
         },
         node_args={
-            "version": [f"2025_04_24.00{x}" for x in range(1, 9)],
+            # "version": [f"2025_06_21.00{x}" for x in range(1, 5)],
             "block-key": block_keys,
             "time-point": time_points,
         },
         task_args={
+            "version": version,
             "resolution": resolution,
             "output-dir": output_dir,
         },
diff --git a/src/rra_population_model/postprocess/raking_factors/runner.py b/src/rra_population_model/postprocess/raking_factors/runner.py
@@ -224,6 +224,7 @@ def raking_factors(
             "project": "proj_rapidresponse",
         },
         node_args={
+            # "version": [f"2025_06_21.00{x}" for x in range(1, 5)],
             "time-point": time_points,
         },
         task_args={
diff --git a/src/rra_population_model/postprocess/upsample/runner.py b/src/rra_population_model/postprocess/upsample/runner.py
@@ -20,9 +20,11 @@
     "world_cylindrical_2000f": (pmc.CRSES["world_cylindrical"], 2000, "average"),
     "world_cylindrical_4000f": (pmc.CRSES["world_cylindrical"], 4000, "average"),
     "world_cylindrical_8000f": (pmc.CRSES["world_cylindrical"], 8000, "average"),
+    "world_cylindrical_10000f": (pmc.CRSES["world_cylindrical"], 10000, "average"),
     "world_cylindrical_16000f": (pmc.CRSES["world_cylindrical"], 16000, "average"),
-    "world_cylindrical_5000": (pmc.CRSES["world_cylindrical"], 5000, "sum"),
     "world_cylindrical_1000": (pmc.CRSES["world_cylindrical"], 1000, "sum"),
+    "world_cylindrical_5000": (pmc.CRSES["world_cylindrical"], 5000, "sum"),
+    "world_cylindrical_10000": (pmc.CRSES["world_cylindrical"], 10000, "sum"),
     "wgs84_0p1": (pmc.CRSES["wgs84"], 0.1, "sum"),
     "wgs84_0p01": (pmc.CRSES["wgs84"], 0.01, "sum"),
 }
@@ -160,8 +162,9 @@ def upsample(
     compiled_time_points = pm_data.list_compiled_prediction_time_points(
         resolution, version
     )
-    compiled_time_points = [f"{y}q1" for y in range(1950, 1976)]
+    # compiled_time_points = [f"{y}q1" for y in range(1950, 1976)]
     time_points = clio.convert_choice(time_point, compiled_time_points)
+    # time_points = [time_point for time_point in time_points if time_point.startswith('202')]
 
     print("Upsampling")
 
diff --git a/src/rra_population_model/validate/metrics/runner.py b/src/rra_population_model/validate/metrics/runner.py
@@ -64,7 +64,7 @@ def pixel_metrics_main(
     )
     block_poly = block_frame.geometry.iloc[0]
 
-    print("Loading and raked population predictions")
+    print("Loading raked population predictions")
     model_spec = pm_data.load_model_specification(resolution, version)
     pop_raster = pm_data.load_raked_prediction(block_key, time_point, model_spec)
     pop_arr = pop_raster._ndarray  # noqa: SLF001
@@ -156,7 +156,7 @@ def metrics(
 ) -> None:
     pm_data = PopulationModelData(output_dir)
 
-    time_points = pm_data.list_raking_factor_time_points(resolution, version)
+    time_points = pm_data.list_raked_prediction_time_points(resolution, version)
     if time_point not in time_points:
         msg = (
             f"Time point {time_point} not found in {resolution} {version}.\n"
@@ -179,7 +179,7 @@ def metrics(
         },
         node_args={
             "block-key": block_keys,
-            "version": [f"2025_04_24.00{i}" for i in range(1, 9)],
+            "version": [f"2025_06_21.00{i}" for i in range(1, 5)],
         },
         task_args={
             "resolution": resolution,

Original file line number	Diff line number	Diff line change
`@@ -17,8 +17,8 @@`
`17`	`17`
`18`	`18`	`# GHSL first, as we need the height and residential mask for msft`
`19`	`19`	`BUILT_VERSIONS = [`
`20`		`- pmc.BUILT_VERSIONS["ghsl_r2023a"],`
`21`		`- pmc.BUILT_VERSIONS["microsoft_v6"],`
	`20`	`+ # pmc.BUILT_VERSIONS["ghsl_r2023a"],`
	`21`	`+ pmc.BUILT_VERSIONS["microsoft_v7"],`
`22`	`22`	`]`
`23`	`23`
`24`	`24`