Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ synthetic-population:

# Run the synthetic population generator tests.
# Invokes pytest (through uv) on the generator's test module at its packaged
# location only; the former scripts/ path is not invoked any more, so the
# suite runs exactly once.
test-syn-pop:
	uv run pytest packages/create_synthetic_population/tests/test_create_synthetic_population.py

# Remove generated synthetic population CSVs. Optionally narrow with STATE and/or SIZE.
# Deletes both kinds of generated file under input/:
#   synth_pop_people_<state>_<size>.csv and synth_pop_region_<state>_<size>.csv
# where <state>/<size> come from CLEAN_STATE_PATTERN and CLEAN_SIZE_PATTERN.
# `rm -f` means a pattern that matches nothing is not an error.
# NOTE(review): the CLEAN_*_PATTERN variables are defined outside this excerpt;
# presumably they fall back to a wildcard when STATE/SIZE are unset — confirm.
clean-synthetic-population:
rm -f input/synth_pop_people_$(CLEAN_STATE_PATTERN)_$(CLEAN_SIZE_PATTERN).csv
rm -f input/synth_pop_region_$(CLEAN_STATE_PATTERN)_$(CLEAN_SIZE_PATTERN).csv

# Generate a people CSV on demand by running the packaged generator once.
# The pattern stem ($*) has the form <STATE>_<SIZE>, e.g. WY_1_000_000:
#   --state  text before the first underscore            -> WY
#   --size   stem with its leading "<UPPERCASE>_" removed -> 1_000_000
# The former scripts/ entry point is not invoked any more, so the generator
# no longer runs twice per target.
# NOTE(review): this rule relies on run.py writing $@ itself — confirm.
input/synth_pop_people_%.csv:
	uv run packages/create_synthetic_population/src/create_synthetic_population/run.py --state $(shell echo "$*" | sed 's/_.*//') --size $(shell echo "$*" | sed 's/^[A-Z]*_//')

# Run the model with a synthetic population (e.g., make run SIZE=1_000_000)
# Generates the population file if it doesn't exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,21 @@ def sample_population(
synth_pop_df["workplace_id"] = pd.array(
[pd.NA] * len(synth_pop_df), dtype="object"
)

# Assign some workers as teachers to schools instead of workplaces
if n_workers > 0:
synth_pop_df.loc[wrk_mask, "workplace_id"] = rng.choice(
workplace_ids, size=n_workers
teacher_fraction = 0.0256 # ~2.56% of workers are teachers in schools
n_teachers = max(1, int(n_workers * teacher_fraction))
worker_indices = np.where(wrk_mask)[0]
teacher_indices = rng.choice(
worker_indices, size=n_teachers, replace=False
)
regular_worker_indices = np.setdiff1d(worker_indices, teacher_indices)

if len(regular_worker_indices) > 0:
synth_pop_df.loc[regular_worker_indices, "workplace_id"] = (
rng.choice(workplace_ids, size=len(regular_worker_indices))
)

# Vectorized school assignment
sch_mask = synth_pop_df["SCH"].astype(str).isin(["2", "3"])
Expand All @@ -323,6 +334,17 @@ def sample_population(
school_ids, size=n_students
)

# Assign teachers to schools.
# Reuse the teacher_indices drawn earlier, i.e. exactly the workers that were
# held back from workplace assignment. Drawing a fresh random sample here
# would select a different set of workers: some would end up with neither a
# workplace nor a school, and others with both.
# NOTE(review): teacher_indices are positions from np.where, while .loc
# treats them as labels; this matches the workplace assignment above and
# assumes synth_pop_df has a default RangeIndex — confirm.
if n_workers > 0:
    synth_pop_df.loc[teacher_indices, "school_id"] = rng.choice(
        school_ids, size=len(teacher_indices)
    )

elapsed = time.time() - start_time
print(f"Population sampling took {elapsed:.2f}s")
return synth_pop_df
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,10 @@ def test_produces_at_least_target(self, household_pums, sample_pums, rng):

def test_workers_get_workplaces(self, pop_df):
workers = pop_df[pop_df["WRK"].astype(str) == "1"]
assert workers["workplace_id"].notna().all()

assert (
workers["workplace_id"].notna() | workers["school_id"].notna()
).all()

def test_students_get_schools(self, pop_df):
students = pop_df[pop_df["SCH"].astype(str).isin(["2", "3"])]
Expand Down
Loading