
Commit f156350

Merge pull request #18 from dime-worldbank/liberia
Bringing in Liberia Indicators data and updating the pipeline to include Liberia
2 parents 33bc6f0 + 826ba7d commit f156350

4 files changed: +45 -9 lines

population/COD/cod_subnational_population.py (-4 lines)
@@ -27,10 +27,6 @@
 ddf = df[df.ISO_Code=='COD'][['Country', 'Region', 'year', 'population_millions']]
 
 
-# COMMAND ----------
-
-pop.sample(5)
-
 # COMMAND ----------
 
 # Write to indicator_intermediate
@@ -0,0 +1,43 @@
+# Databricks notebook source
+import pandas as pd
+
+# COMMAND ----------
+
+def process_to_indicator_intermediate(country_name:str, country_code:str, adm1_drop:list=[]):
+
+    spark_df = spark.table(f'prd_mega.indicator.global_data_lab_subnational_population')
+    df = spark_df.toPandas()
+
+    ddf = df[df.ISO_Code==country_code.upper()][['Country', 'Region', 'year', 'population_millions']]
+    ddf.columns = ['country_name', 'adm1_name', 'year', 'population']
+    ddf['population'] = ddf.population.map(lambda x: x*1_000_000)
+    ddf['adm1_name'] = ddf['adm1_name'].str.lower()
+    ddf = ddf[ddf.adm1_name!='total']
+    ddf['adm1_name'] = ddf['adm1_name'].str.strip().str.title()
+    ddf = ddf[~ddf['adm1_name'].isin(adm1_drop)]
+
+    pop = ddf.sort_values(['year', 'adm1_name'])
+    pop.country_name = country_name
+    pop['data_source'] = 'Global Data Lab'
+
+    return pop
+
+def write_to_indicator_intermediate(pop:pd.DataFrame, country_code:str):
+
+    database_name = "prd_mega.indicator_intermediate"
+    if not spark.catalog.databaseExists(database_name):
+        print(f"Database '{database_name}' does not exist. Creating the database.")
+        spark.sql(f"CREATE DATABASE {database_name}")
+
+    sdf = spark.createDataFrame(pop)
+    sdf.write.mode("overwrite").saveAsTable(f"{database_name}.{country_code.lower()}_subnational_population")
+
+    return
+
+# COMMAND ----------
+
+country_code = 'LBR'
+country_name = 'Liberia'
+adm1_drop = ['North Central','North Western','Monrovia','South Eastern A','South Eastern B','South Central']
+pop = process_to_indicator_intermediate(country_name, country_code, adm1_drop)
+write_to_indicator_intermediate(pop, country_code)
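
For reference, here is a minimal standalone sketch of the same cleaning steps on synthetic Global Data Lab rows. The sample values and running without a Spark session are assumptions for illustration only; the committed notebook reads the real data from prd_mega.indicator.global_data_lab_subnational_population.

import pandas as pd

# Synthetic rows shaped like the Global Data Lab export (illustrative values only)
df = pd.DataFrame({
    'Country': ['Liberia', 'Liberia', 'Liberia'],
    'ISO_Code': ['LBR', 'LBR', 'LBR'],
    'Region': ['Bomi', 'Total', ' monrovia'],
    'year': [2021, 2021, 2021],
    'population_millions': [0.13, 5.19, 1.57],
})
adm1_drop = ['Monrovia']  # regions to exclude, mirroring the notebook's adm1_drop list

ddf = df[df.ISO_Code == 'LBR'][['Country', 'Region', 'year', 'population_millions']]
ddf.columns = ['country_name', 'adm1_name', 'year', 'population']
ddf['population'] = ddf['population'] * 1_000_000            # millions -> persons
ddf['adm1_name'] = ddf['adm1_name'].str.lower()
ddf = ddf[ddf.adm1_name != 'total']                          # drop the national total row
ddf['adm1_name'] = ddf['adm1_name'].str.strip().str.title()  # normalise region names
ddf = ddf[~ddf['adm1_name'].isin(adm1_drop)]                 # drop excluded regions

print(ddf)  # a single row remains: Bomi, 2021, 130000.0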

population/global_data_lab_subnational_population.r (+1 -4 lines)
@@ -1,8 +1,5 @@
 # Databricks notebook source
 install.packages("gdldata")
-
-# COMMAND ----------
-
 library(gdldata)
 library(magrittr)
 

@@ -20,7 +17,7 @@ sess <- gdl_session(api_token)
 # COMMAND ----------
 
 sess <- sess %>%
-  set_dataset('areadata') %>%
+  set_dataset('demographics') %>%
   set_countries_all() %>%
   set_indicators(c('regpopm'))
 # by default linear extrapolation for 3 years
population/subnational_population_official_dlt.py (+1 -1 lines)
@@ -3,7 +3,7 @@
 from pyspark.sql import functions as F
 
 # Adding a new country requires adding the country here
-country_codes = ['moz', 'pry', 'ken', 'pak', 'bfa', 'col', 'cod', 'tun', 'btn', 'chl', 'nga', 'bgd', 'alb', "zaf", 'chl', 'gha']
+country_codes = ['moz', 'pry', 'ken', 'pak', 'bfa', 'col', 'cod', 'tun', 'btn', 'chl', 'nga', 'bgd', 'alb', "zaf", 'chl', 'gha', 'lbr']
 
 @dlt.table(name=f'subnational_population')
 def subnational_population():
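
The body of subnational_population() is not shown in this diff. As a rough illustration, here is a minimal sketch of how a DLT table could union the per-country intermediate tables written above; the table naming, the added country_code column, and the union logic are assumptions, not the repository's actual implementation.

import dlt
from functools import reduce
from pyspark.sql import functions as F

# Deduplicated copy of the pipeline's country list (assumption for this sketch)
country_codes = ['moz', 'pry', 'ken', 'pak', 'bfa', 'col', 'cod', 'tun', 'btn',
                 'chl', 'nga', 'bgd', 'alb', 'zaf', 'gha', 'lbr']

@dlt.table(name='subnational_population')
def subnational_population():
    # Assumption: each country has a <code>_subnational_population table written by
    # write_to_indicator_intermediate; tag each with its ISO3 code and stack them.
    # `spark` is provided by the Databricks runtime.
    dfs = [
        spark.table(f'prd_mega.indicator_intermediate.{code}_subnational_population')
             .withColumn('country_code', F.lit(code.upper()))
        for code in country_codes
    ]
    return reduce(lambda a, b: a.unionByName(b), dfs)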
