-
Notifications
You must be signed in to change notification settings - Fork 2
Refactor simd locality #1334
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development
Are you sure you want to change the base?
Refactor simd locality #1334
Changes from all commits
3c2fbce
a924b32
559409f
3520c0d
b81fc70
59269ea
2ebe66f
6e3f1e5
762b09e
bf2060c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,23 +6,28 @@ | |
| #' | ||
| #' @return A data frame with keep_population flags | ||
| #' @family individual_file | ||
| add_keep_population_flag <- function(individual_file, year) { | ||
| calendar_year <- paste0("20", substr(year, 1, 2)) %>% as.integer() | ||
| add_keep_population_flag <- function(individual_file, | ||
| year, | ||
| pop_estimates = get_datazone_pop_data( | ||
| denodo_connect = get_denodo_connection(BYOC_MODE = BYOC_MODE), | ||
| file_path = get_pop_path(type = "datazone"), | ||
| BYOC_MODE | ||
| ), | ||
| locality_data = get_locality_data( | ||
| denodo_connect = get_denodo_connection(BYOC_MODE = BYOC_MODE), | ||
| file_path = get_locality_path(), | ||
| BYOC_MODE | ||
| ), | ||
| BYOC_MODE) { | ||
| # TODO: Check arguments - do get_datazone_pop_data and get_locality_data just need BYOC_MODE? | ||
|
|
||
| calendar_year <- paste0("20", substr(year, 1, 2)) %>% | ||
| as.integer() | ||
|
|
||
| if (!check_year_valid(year, "nsu")) { | ||
| individual_file <- individual_file %>% | ||
| dplyr::mutate(keep_population = 1L) | ||
| } else { | ||
| ## Obtain the population estimates for Locality AgeGroup and Gender. | ||
| pop_estimates <- | ||
| readr::read_rds(get_pop_path(type = "datazone")) %>% | ||
| dplyr::select( | ||
| .data$year, | ||
| .data$datazone2011, | ||
| .data$sex, | ||
| .data$age0:.data$age90plus | ||
| ) | ||
|
|
||
| # Step 1: Obtain the population estimates for Locality, AgeGroup, and Gender | ||
| # Select out the estimates for the year of interest. | ||
| # if we don't have estimates for this year (and so have to use previous year). | ||
|
|
@@ -55,8 +60,7 @@ add_keep_population_flag <- function(individual_file, year) { | |
| dplyr::mutate(age = as.integer(.data$age)) %>% | ||
| add_age_group(.data$age) %>% | ||
| dplyr::left_join( | ||
| readr::read_rds(get_locality_path()) %>% | ||
| dplyr::select("locality" = "hscp_locality", .data$datazone2011), | ||
| locality_data, | ||
| by = "datazone2011" | ||
| ) %>% | ||
| dplyr::group_by(.data$locality, .data$age_group, .data$gender) %>% | ||
|
|
@@ -139,7 +143,7 @@ add_keep_population_flag <- function(individual_file, year) { | |
| ) | ||
| } | ||
|
|
||
| cli::cli_alert_info("Add keep population function finished at {Sys.time()}") | ||
| cli::cli_alert_info("Add keep population function finished at {Sys.time()}") # TODO: Is this being kept or changed with a logger_utils function? | ||
|
|
||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is log message being kept or changed with a
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Happy for us to change it to a logger message. When I made the change of our logs to logger, I changed some of the cli:: alerts to logger since some of them sometimes don't even print in the console so I'm happy for us to change it to logger::log_info instead. |
||
| return(individual_file) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,11 +11,14 @@ | |
| #' Postcode and GP Practice details. | ||
| fill_geographies <- function( | ||
| data, | ||
| slf_pc_lookup = read_file(get_slf_postcode_path()), | ||
| slf_pc_lookup = read_file( | ||
| get_slf_postcode_path(BYOC_MODE = BYOC_MODE) | ||
| ), | ||
| slf_gpprac_lookup = read_file( | ||
| get_slf_gpprac_path(), | ||
| get_slf_gpprac_path(BYOC_MODE = BYOC_MODE), | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks Lucy! No need for me to do this on my branch then.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Merge order: spd -> simd_locality -> gpprac_lookup |
||
| col_select = c("gpprac", "cluster", "hbpraccode") | ||
| ) | ||
| ), | ||
| BYOC_MODE = BYOC_MODE | ||
| ) { | ||
| check_variables_exist(data, c( | ||
| "anon_chi", | ||
|
|
@@ -29,17 +32,10 @@ fill_geographies <- function( | |
| )) | ||
|
|
||
| data <- data %>% | ||
| fill_postcode_geogs( | ||
| slf_pc_lookup = read_file(get_slf_postcode_path()) | ||
| ) %>% | ||
| fill_gpprac_geographies( | ||
| slf_gpprac_lookup = read_file( | ||
| get_slf_gpprac_path(), | ||
| col_select = c("gpprac", "cluster", "hbpraccode") | ||
| ) | ||
| ) | ||
| fill_postcode_geogs(slf_pc_lookup) %>% | ||
| fill_gpprac_geographies(slf_gpprac_lookup) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks Lucy! No need for me to do this on my branch then. |
||
|
|
||
| cli::cli_alert_info("Fill geographies function finished at {Sys.time()}") | ||
| cli::cli_alert_info("Fill geographies function finished at {Sys.time()}") # TODO: Is this being kept or changed with a logger_utils function? | ||
|
|
||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as above.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Happy for us to change it to a logger message. When I made the change of our logs to logger, I changed some of the cli:: alerts to logger since some of them sometimes don't even print in the console so I'm happy for us to change it to logger::log_info instead. |
||
| return(data) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,7 @@ get_lookups_dir <- function() { | |
| #' | ||
| #' @inheritParams get_file_path | ||
| #' | ||
| #' @return An [fs::path()] to the Scottish Postcode Directory | ||
| #' @return An [fs::path()] to the HSCP Localities file | ||
| #' @export | ||
| #' | ||
| #' @family lookup file paths | ||
|
|
@@ -37,6 +37,50 @@ get_locality_path <- function(file_name = NULL, ext = "rds") { | |
| } | ||
|
|
||
|
|
||
| #' Locality data | ||
| #' | ||
| #' @description Return the data for centrally held HSCP Localities file. | ||
| #' The result has `hscp_locality` renamed to `locality` plus every column | ||
| #' whose name ends in `datazone` followed by four digits. | ||
| #' | ||
| #' @param denodo_connect Connection to denodo. Only used when `BYOC_MODE` is | ||
| #' `TRUE`; in that case it is disconnected when the function exits. | ||
| #' @param file_path Path to local HSCP Localities file. Only read when | ||
| #' `BYOC_MODE` is not `TRUE`. | ||
| #' @param BYOC_MODE If `TRUE`, read the lookup from Denodo; otherwise read | ||
| #' the local file at `file_path`. | ||
| #' | ||
| #' @return a [tibble][tibble::tibble-package]. | ||
| #' @export | ||
| #' | ||
| #' @family lookup files | ||
| get_locality_data <- function(denodo_connect = get_denodo_connection(BYOC_MODE = BYOC_MODE), | ||
| file_path = get_locality_path(), | ||
| BYOC_MODE) { | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment also applies to |
||
| if (isTRUE(BYOC_MODE)) { | ||
| log_slf_event(stage = "read", status = "start", type = "HSCP Localities Lookup", year = "all") # TODO: Check whether to add hscp_locality to log_slf_event mapping list | ||
|
|
||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add add hscp_locality / simd / datazone_pop to |
||
| # Disconnect on exit; try() keeps a failed disconnect from masking the result. | ||
| on.exit(try(DBI::dbDisconnect(denodo_connect), silent = TRUE), add = TRUE) | ||
|
|
||
| locality_data <- dplyr::tbl( | ||
| denodo_connect, | ||
| dbplyr::in_schema("sdl", "sdl_hscp_locality_source") # TODO: Check table name | ||
| ) %>% | ||
| dplyr::select( | ||
| locality = "hscp_locality", | ||
| tidyselect::matches("datazone\\d{4}$") | ||
| ) %>% # TODO: Check whether we need to select columns | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to select columns when data is read in (either from Denodo or disk)?Columns could be selected when the data is read in or when it is used. NOTE: For
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment also applies to |
||
| # Namespaced like every other call here so it does not rely on an import. | ||
| dplyr::collect() | ||
|
|
||
| log_slf_event(stage = "read", status = "complete", type = "HSCP Localities Lookup", year = "all") # TODO: Check whether to add hscp_locality to log_slf_event mapping list | ||
| } else { # TODO: Check logic - are we reading the local file when BYOC_MODE = FALSE or are we still reading from Denodo? | ||
|
|
||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we reading the local file when
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment also applies to
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| locality_data <- read_file(file_path) %>% | ||
| dplyr::select( | ||
| locality = "hscp_locality", | ||
| tidyselect::matches("datazone\\d{4}$") | ||
| ) # TODO: Check whether we need to select columns | ||
| } | ||
|
|
||
| return(locality_data) | ||
| } | ||
|
|
||
|
|
||
| #' Scottish Postcode Directory File Path | ||
| #' | ||
| #' @description Get the path to the centrally held Scottish Postcode Directory | ||
|
|
@@ -95,6 +139,63 @@ get_simd_path <- function(file_name = NULL, ext = "parquet") { | |
| } | ||
|
|
||
|
|
||
| #' SIMD data | ||
| #' | ||
| #' @description Return the data for centrally held Scottish Index of Multiple | ||
| #' Deprivation (SIMD) file, restricted to `pc7` and the `simd2020v2_*` rank, | ||
| #' decile and quintile columns. | ||
| #' | ||
| #' @param denodo_connect Connection to denodo. Only used when `BYOC_MODE` is | ||
| #' `TRUE`; in that case it is disconnected when the function exits. | ||
| #' @param file_path Path to local SIMD file. Only read when `BYOC_MODE` is | ||
| #' not `TRUE`. | ||
| #' @param BYOC_MODE If `TRUE`, read the lookup from Denodo; otherwise read | ||
| #' the local file at `file_path`. | ||
| #' | ||
| #' @return a [tibble][tibble::tibble-package]. | ||
| #' @export | ||
| #' | ||
| #' @family lookup files | ||
| get_simd_data <- function(denodo_connect = get_denodo_connection(BYOC_MODE = BYOC_MODE), | ||
| file_path = get_simd_path(), | ||
| BYOC_MODE) { | ||
| # Columns kept from either source; defined once so both branches stay in step. | ||
| simd_cols <- c( | ||
| "pc7", | ||
| "simd2020v2_rank", | ||
| "simd2020v2_sc_decile", | ||
| "simd2020v2_sc_quintile", | ||
| "simd2020v2_hb2019_decile", | ||
| "simd2020v2_hb2019_quintile", | ||
| "simd2020v2_hscp2019_decile", | ||
| "simd2020v2_hscp2019_quintile" | ||
| ) # TODO: Check whether we need to select columns. When a new version of the SIMD is released, the column names within the file will change. | ||
|
|
||
| if (isTRUE(BYOC_MODE)) { | ||
| log_slf_event(stage = "read", status = "start", type = "SIMD Lookup", year = "all") # TODO: Check whether to add simd to log_slf_event mapping list | ||
|
|
||
| # Disconnect on exit; try() keeps a failed disconnect from masking the result. | ||
| on.exit(try(DBI::dbDisconnect(denodo_connect), silent = TRUE), add = TRUE) | ||
|
|
||
| simd_data <- dplyr::tbl( | ||
| denodo_connect, | ||
| dbplyr::in_schema("sdl", "sdl_simd_source") # TODO: Check table name | ||
| ) %>% | ||
| dplyr::select(dplyr::all_of(simd_cols)) %>% | ||
| # Namespaced like every other call here so it does not rely on an import. | ||
| dplyr::collect() | ||
|
|
||
| log_slf_event(stage = "read", status = "complete", type = "SIMD Lookup", year = "all") # TODO: Check whether to add simd to log_slf_event mapping list | ||
| } else { # TODO: Check logic - are we reading the local file when BYOC_MODE = FALSE or are we still reading from Denodo? | ||
|
|
||
| simd_data <- read_file(file_path) %>% | ||
| dplyr::select(dplyr::all_of(simd_cols)) | ||
| } | ||
|
|
||
| return(simd_data) | ||
| } | ||
|
|
||
|
|
||
| #' Populations File Path for different types | ||
| #' | ||
| #' @description Get the path to the populations estimates | ||
|
|
@@ -138,6 +239,54 @@ get_pop_path <- function(file_name = NULL, | |
| } | ||
|
|
||
|
|
||
| #' DataZone population data | ||
| #' | ||
| #' @description Return the data for DataZone population estimates, restricted | ||
| #' to `year`, `datazone2011`, `sex` and every column whose name starts with | ||
| #' `age`. | ||
| #' | ||
| #' @param denodo_connect Connection to denodo. Only used when `BYOC_MODE` is | ||
| #' `TRUE`; in that case it is disconnected when the function exits. | ||
| #' @param file_path Path to local DataZone population file. Only read when | ||
| #' `BYOC_MODE` is not `TRUE`. | ||
| #' @param BYOC_MODE If `TRUE`, read the estimates from Denodo; otherwise read | ||
| #' the local file at `file_path`. | ||
| #' | ||
| #' @return a [tibble][tibble::tibble-package]. | ||
| #' @export | ||
| #' | ||
| #' @family lookup files | ||
| get_datazone_pop_data <- function(denodo_connect = get_denodo_connection(BYOC_MODE = BYOC_MODE), | ||
| file_path = get_pop_path(type = "datazone"), | ||
| BYOC_MODE) { | ||
| if (isTRUE(BYOC_MODE)) { | ||
| log_slf_event(stage = "read", status = "start", type = "DataZone Population Lookup", year = "all") # TODO: Check whether to add datazone_pop to log_slf_event mapping list | ||
|
|
||
| # Disconnect on exit; try() keeps a failed disconnect from masking the result. | ||
| on.exit(try(DBI::dbDisconnect(denodo_connect), silent = TRUE), add = TRUE) | ||
|
|
||
| datazone_pop_data <- dplyr::tbl( | ||
| denodo_connect, | ||
| dbplyr::in_schema("sdl", "sdl_datazone_population_source") # TODO: Check table name | ||
| ) %>% | ||
| dplyr::select( | ||
| "year", | ||
| "datazone2011", | ||
| "sex", | ||
| dplyr::starts_with("age") | ||
| ) %>% # TODO: Check whether we need to select columns | ||
| # Namespaced like every other call here so it does not rely on an import. | ||
| dplyr::collect() | ||
|
|
||
| log_slf_event(stage = "read", status = "complete", type = "DataZone Population Lookup", year = "all") # TODO: Check whether to add datazone_pop to log_slf_event mapping list | ||
| } else { # TODO: Check logic - are we reading the local file when BYOC_MODE = FALSE or are we still reading from Denodo? | ||
|
|
||
| datazone_pop_data <- read_file(file_path) %>% | ||
| dplyr::select( | ||
| "year", | ||
| "datazone2011", | ||
| "sex", | ||
| dplyr::starts_with("age") | ||
| ) # TODO: Check whether we need to select columns | ||
| } | ||
|
|
||
| return(datazone_pop_data) | ||
| } | ||
|
|
||
|
|
||
| #' GP Practice Reference File Path (gpprac) | ||
| #' | ||
| #' @description Get the path for the centrally held reference file `gpprac` | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,13 +9,23 @@ | |
| #' @export | ||
| #' @family slf lookup file path | ||
| #' @seealso [get_file_path()] for the generic function. | ||
| get_slf_postcode_path <- function(update = latest_update(), ...) { | ||
| get_file_path( | ||
| directory = fs::path(get_slf_dir(), "Lookups"), | ||
| file_name = stringr::str_glue("source_postcode_lookup_{update}"), | ||
| ext = "parquet", | ||
| ... | ||
| ) | ||
| get_slf_postcode_path <- function(update = latest_update(), BYOC_MODE, ...) { # TODO: Check whether to keep the update argument | ||
|
|
||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the |
||
| # The extension is part of the name so both branches resolve the same file name. | ||
| file_name <- stringr::str_glue("source_postcode_lookup_{update}.parquet") | ||
|
|
||
| # isTRUE() matches the guard used by the get_*_data() readers and treats | ||
| # NA / NULL as "not BYOC" instead of raising an error inside if(). | ||
| if (isTRUE(BYOC_MODE)) { | ||
| # file.path() only takes `...`, so the parts are passed positionally. | ||
| slf_postcode_path <- file.path( | ||
| denodo_output_path(), | ||
| file_name | ||
| ) | ||
| } else { | ||
| # Extra arguments in `...` are forwarded to get_file_path(). | ||
| slf_postcode_path <- get_file_path( | ||
| directory = fs::path(get_slf_dir(), "Lookups"), | ||
| file_name = file_name, | ||
| ... | ||
| ) | ||
| } | ||
| return(slf_postcode_path) | ||
| } | ||
|
|
||
| #' get uk postcode list file path | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do `get_datazone_pop_data` and `get_locality_data` need all arguments to be written, or is `BYOC_MODE` sufficient? I.e.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`process_lookup_sc_demographics()` in #1313 just uses the `BYOC_MODE` argument for `spd_data`, `uk_postcode_data` and `ch_name_lookup`.