CDCgov · erik-rosenstrom · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/Makefile b/Makefile
@@ -19,15 +19,15 @@ synthetic-population:
 
 # Run synthetic population generator tests
 test-syn-pop:
-	uv run pytest scripts/test_create_synthetic_population.py
+	uv run pytest packages/create_synthetic_population/tests/test_create_synthetic_population.py
 
 # Remove generated synthetic population CSVs. Optionally narrow with STATE and/or SIZE.
 clean-synthetic-population:
 	rm -f input/synth_pop_people_$(CLEAN_STATE_PATTERN)_$(CLEAN_SIZE_PATTERN).csv
 	rm -f input/synth_pop_region_$(CLEAN_STATE_PATTERN)_$(CLEAN_SIZE_PATTERN).csv
 
 input/synth_pop_people_%.csv:
-	uv run scripts/create_synthetic_population.py --state $(shell echo "$*" | sed 's/_.*//')  --size $(shell echo "$*" | sed 's/^[A-Z]*_//')
+	stem='$*'; uv run packages/create_synthetic_population/src/create_synthetic_population/run.py --state "$${stem%%_*}" --size "$${stem#*_}"
 
 # Run the model with a synthetic population (e.g., make run SIZE=1_000_000)
 # Generates the population file if it doesn't exist.

diff --git a/packages/create_synthetic_population/src/create_synthetic_population/run.py b/packages/create_synthetic_population/src/create_synthetic_population/run.py
@@ -307,10 +307,21 @@ def sample_population(
     synth_pop_df["workplace_id"] = pd.array(
         [pd.NA] * len(synth_pop_df), dtype="object"
     )
+
+    # Assign some workers as teachers to schools instead of workplaces
     if n_workers > 0:
-        synth_pop_df.loc[wrk_mask, "workplace_id"] = rng.choice(
-            workplace_ids, size=n_workers
+        teacher_fraction = 0.0256  # ~2.56% of workers are teachers in schools
+        n_teachers = max(1, int(n_workers * teacher_fraction))
+        # np.where gives row positions; map them to index labels for .loc
+        worker_indices = synth_pop_df.index[np.where(wrk_mask)[0]].to_numpy()
+        teacher_indices = rng.choice(
+            worker_indices, size=n_teachers, replace=False
         )
+        regular_worker_indices = np.setdiff1d(worker_indices, teacher_indices)
+        if len(regular_worker_indices) > 0:
+            synth_pop_df.loc[regular_worker_indices, "workplace_id"] = (
+                rng.choice(workplace_ids, size=len(regular_worker_indices))
+            )
 
     # Vectorized school assignment
     sch_mask = synth_pop_df["SCH"].astype(str).isin(["2", "3"])
@@ -323,6 +334,16 @@ def sample_population(
             school_ids, size=n_students
         )
 
+    # Assign teachers to schools.
+    # Reuse the teacher_indices drawn during workplace assignment instead of
+    # sampling workers a second time: a fresh draw selects a different set,
+    # which would leave the original teachers with neither a workplace nor a
+    # school while giving some regular workers both.
+    if n_workers > 0:
+        synth_pop_df.loc[teacher_indices, "school_id"] = rng.choice(
+            school_ids, size=len(teacher_indices)
+        )
+
     elapsed = time.time() - start_time
     print(f"Population sampling took {elapsed:.2f}s")
     return synth_pop_df

diff --git a/packages/create_synthetic_population/tests/test_create_synthetic_population.py b/packages/create_synthetic_population/tests/test_create_synthetic_population.py
@@ -272,7 +272,10 @@ def test_produces_at_least_target(self, household_pums, sample_pums, rng):
 
     def test_workers_get_workplaces(self, pop_df):
         workers = pop_df[pop_df["WRK"].astype(str) == "1"]
-        assert workers["workplace_id"].notna().all()
+        # Teachers are placed in a school rather than a workplace.
+        has_workplace = workers["workplace_id"].notna()
+        has_school = workers["school_id"].notna()
+        assert (has_workplace | has_school).all()
 
     def test_students_get_schools(self, pop_df):
         students = pop_df[pop_df["SCH"].astype(str).isin(["2", "3"])]