Skip to content

Commit 619fb77

Browse files
authored
Merge pull request #20 from Public-Health-Scotland/refactor_archive_search
Generalise the ability to look for 'archived' file versions
2 parents 89fc87a + ea3f6c2 commit 619fb77

13 files changed

Lines changed: 395 additions & 129 deletions

DESCRIPTION

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ Imports:
1818
cli,
1919
dplyr,
2020
fs,
21-
glue,
22-
magrittr,
2321
readr,
2422
rlang,
2523
stringr,

R/find_latest_file.R

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#' passed to [fs::dir_info()] to search for the file.
1313
#' @param selection_method Valid arguments are "modification_date"
1414
#' (the default) or "file_name".
15+
#' @param quiet (default: FALSE) Used to suppress message output
1516
#'
1617
#' @return the [fs::path()] to the file
1718
#' @examples
@@ -23,11 +24,13 @@
2324
#' }
2425
#' @noRd
2526
#' @keywords internal
26-
find_latest_file <- function(directory,
27-
regexp,
28-
selection_method = "modification_date") {
27+
find_latest_file <- function(
28+
directory,
29+
regexp,
30+
selection_method = "modification_date",
31+
quiet = FALSE) {
2932
if (selection_method == "modification_date") {
30-
latest_file <- fs::dir_info(
33+
latest_file_options <- fs::dir_info(
3134
path = directory,
3235
type = "file",
3336
regexp = regexp,
@@ -38,9 +41,9 @@ find_latest_file <- function(directory,
3841
dplyr::desc(.data$modification_time),
3942
dplyr::desc(.data$path)
4043
) |>
41-
magrittr::extract(1L, )
44+
dplyr::pull(.data$path)
4245
} else if (selection_method == "file_name") {
43-
latest_file <- fs::dir_info(
46+
latest_file_options <- fs::dir_info(
4447
path = directory,
4548
type = "file",
4649
regexp = regexp,
@@ -51,24 +54,27 @@ find_latest_file <- function(directory,
5154
dplyr::desc(.data$birth_time),
5255
dplyr::desc(.data$modification_time)
5356
) |>
54-
magrittr::extract(1L, )
57+
dplyr::pull(.data$path)
5558
}
5659

57-
if (nrow(latest_file) == 1L) {
58-
cli::cli_alert_info(
59-
"Using the latest available version: {.val {fs::path_file(
60-
fs::path_ext_remove(latest_file$path))}}.
61-
If you require an older version specify an argument `version`."
62-
)
60+
if (length(latest_file_options) >= 1) {
61+
file_path <- latest_file_options |>
62+
dplyr::first()
63+
64+
if (!quiet) {
65+
cli::cli_alert_info(
66+
"Using the latest available version:
67+
{.val {fs::path_file(fs::path_ext_remove(file_path))}}.
68+
If you require an older version or for reproducibility purposes
69+
please specify the version argument accordingly."
70+
)
71+
}
72+
73+
return(file_path)
6374
} else {
6475
cli::cli_abort(
6576
"There was no file in {.path {directory}} that matched the
6677
regular expression {.val {regexp}}"
6778
)
6879
}
69-
70-
file_path <- latest_file |>
71-
dplyr::pull(.data$path)
72-
73-
return(file_path)
7480
}

R/find_specific_file.R

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#' Find a specific version of a lookup file
2+
#'
3+
#' @description
4+
#' This function retrieves the file path for a specific version of a lookup
5+
#' file based on the provided directory and lookup type. It validates the
6+
#' existence of the file and throws an error if no matching file is found.
7+
#' For lookup types requiring multiple versions (SIMD Postcode),
8+
#' `version` must be a named list or vector with the
9+
#' appropriate names (e.g., `simd_version` and `postcode_version`).
10+
#'
11+
#' @param directory The base directory where lookup files are stored.
12+
#' The function searches here and this directory's "Archive" subdirectory.
13+
#' @param lookup_type A string specifying the type of lookup file to find.
14+
#' Supported values include `"SPD"`, `"HSCP Locality"`, and `"SIMD Postcode"`.
15+
#' @param version A string defining the version to locate, or a
16+
#' named list/vector for lookup types requiring multiple versions.
17+
#' For example:
18+
#' - `"YYYY_1"` for `lookup_type = "SPD"`
19+
#' - `"YYYYMMDD"` for `lookup_type = "HSCP Locality"`
20+
#' - `list(postcode_version = "2023_2", simd_version = "2020v2")` for
21+
#' `lookup_type = "SIMD Postcode"`
22+
#'
23+
#' @return The [fs::path()] of the file if found.
24+
#' @keywords internal
25+
#' @noRd
26+
find_specific_file <- function(directory, lookup_type, version) {
27+
# Determine the file prefix and version handling based on the lookup_type
28+
if (length(version) == 1) {
29+
# Handle single-version cases
30+
file_prefix <- dplyr::case_match(
31+
lookup_type,
32+
"SPD" ~ paste0("Scottish_Postcode_Directory_", version),
33+
"HSCP Locality" ~ paste0("HSCP Localities_DZ11_Lookup_", version),
34+
.default = NA_character_
35+
)
36+
} else {
37+
# Handle multi-version cases
38+
file_prefix <- dplyr::case_match(
39+
lookup_type,
40+
"SIMD Postcode" ~
41+
paste0(
42+
"postcode_",
43+
version[["postcode_version"]],
44+
"_simd",
45+
version[["simd_version"]]
46+
),
47+
.default = NA_character_
48+
)
49+
}
50+
51+
if (is.na(file_prefix)) {
52+
cli::cli_abort("Unsupported lookup_type: {.val {lookup_type}}")
53+
}
54+
55+
# Generate possible file names and paths
56+
name_ver_list <- paste0(
57+
file_prefix,
58+
c(".parquet", ".rds", ".csv")
59+
)
60+
path_ver_list <- c(
61+
fs::path(directory, name_ver_list),
62+
fs::path(directory, "Archive", name_ver_list)
63+
)
64+
65+
# Find the first valid file that exists
66+
file_path <- path_ver_list[fs::file_exists(path_ver_list)][1]
67+
68+
# Handle case where no matching file is found
69+
if (is.na(file_path)) {
70+
cli::cli_abort(
71+
c(
72+
"x" = "{.val {lookup_type}} version {.val {version}} is NOT available",
73+
"i" = "Contact phs.geography@phs.scot"
74+
),
75+
call = NULL,
76+
rlang_backtrace_on_error = "none"
77+
)
78+
}
79+
80+
return(file_path)
81+
}

R/get_hscp_locality.R

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
1-
#' Get the HSCP Locality lookup
1+
#' Get HSCP Locality lookup
22
#'
3-
#' @param version Default is "latest", otherwise supply a date e.g. "20230804"
3+
#' Read a Health and Social Care Partnership (HSCP) Locality lookup file from
4+
#' cl-out into a tibble.
5+
#' @param version A string defining a version to read in. The default value
6+
#' is "latest", otherwise supply a date (file name suffix), e.g. "20230804".
47
#' @inheritParams readr::read_csv
58
#'
69
#' @return a [tibble][tibble::tibble-package] of the HSCP localities lookup
710
#' @export
811
#'
912
#' @examples
1013
#' get_hscp_locality()
14+
#' get_hscp_locality(version = "20240308")
15+
#' get_hscp_locality(col_select = c("datazone2011", "hscp_locality"))
1116
get_hscp_locality <- function(version = "latest", col_select = NULL) {
1217
dir <- fs::path(get_lookups_dir(), "Geography", "HSCP Locality")
1318

@@ -21,9 +26,21 @@ get_hscp_locality <- function(version = "latest", col_select = NULL) {
2126
selection_method = "file_name"
2227
)
2328
} else {
24-
hscp_locality_path <- fs::path(
25-
dir,
26-
glue::glue("HSCP Localities_DZ11_Lookup_{date}.{ext}")
29+
if (!stringr::str_detect(
30+
version,
31+
"^20\\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\\d|3[01])$"
32+
)) {
33+
cli::cli_abort(c(
34+
"x" = "Invalid version name: {.val {version}}",
35+
"i" = "It should follow pattern the YYYYMMDD",
36+
call = NULL
37+
))
38+
}
39+
40+
hscp_locality_path <- find_specific_file(
41+
directory = dir,
42+
lookup_type = "HSCP Locality",
43+
version = version
2744
)
2845
}
2946

R/get_simd_datazone.R

Lines changed: 49 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,69 @@
1-
#' Get a SIMD DataZone Lookup
1+
#' Get Data Zone-SIMD lookup
22
#'
3-
#' @param datazone_version Default is "latest", otherwise supply a year
4-
#' e.g. "2011"
5-
#' @param simd_version Default is "latest", otherwise supply a version
6-
#' e.g. "2020v2"
7-
#' @inheritParams readr::read_csv
3+
#' Read a Data Zone-Scottish Index of Multiple Deprivation (SIMD) lookup file
4+
#' from cl-out into a tibble.
5+
#' @inheritParams get_simd_postcode
86
#'
97
#' @return a [tibble][tibble::tibble-package] of the SIMD DataZone lookup
108
#' @export
119
#'
1210
#' @examples
1311
#' get_simd_datazone()
14-
get_simd_datazone <- function(
15-
datazone_version = "latest",
16-
simd_version = "latest",
17-
col_select = NULL) {
12+
#' get_simd_datazone(simd_version = "2016")
13+
#' get_simd_datazone(
14+
#' simd_version = "2016",
15+
#' col_select = c("DataZone2011", "simd2016rank")
16+
#' )
17+
get_simd_datazone <- function(simd_version = "latest", col_select = NULL) {
1818
dir <- fs::path(get_lookups_dir(), "Deprivation")
1919

20-
if (datazone_version != "latest" && simd_version != "latest") {
21-
simd_datazone_path <- fs::path(
22-
dir,
23-
glue::glue("DataZone_{datazone_version}_simd{simd_version}.rds")
24-
)
25-
} else {
20+
if (simd_version == "latest") {
2621
regexp <- paste0(
2722
"DataZone",
28-
ifelse(datazone_version == "latest", "\\d{4}", datazone_version),
23+
"\\d{4}",
2924
"_simd",
30-
ifelse(simd_version == "latest", "\\d{4}(:?v[1-2])?", simd_version),
25+
"\\d{4}(:?v2)?",
3126
"\\.rds"
3227
)
28+
} else {
29+
valid_simd_version <- stringr::str_detect(
30+
string = simd_version,
31+
pattern = "^20[0-9]{2}(:?v2)?$"
32+
)
33+
34+
if (!valid_simd_version) {
35+
cli::cli_abort(c(
36+
"x" = "Invalid version specification of SIMD: {.val {simd_version}}",
37+
"i" = "SIMD should follow the pattern YYYY or YYYYv2"
38+
))
39+
}
40+
41+
if (!(simd_version %in% c(
42+
"2004", "2006", "2009v2", "2012",
43+
"2016", "2020v2"
44+
))) {
45+
cli::cli_abort(c(
46+
"x" = "SIMD version {.val {simd_version}} does NOT exit.",
47+
"i" = "Note that \"2009\" and \"2020\" versions have been corrected and
48+
replaced with versions \"2009v2\" and \"2020v2\" respectively."
49+
))
50+
}
3351

34-
simd_datazone_path <- find_latest_file(
35-
directory = dir,
36-
regexp = regexp,
37-
selection_method = "file_name"
52+
regexp <- paste0(
53+
"DataZone",
54+
"\\d{4}",
55+
"_simd",
56+
simd_version,
57+
"\\.rds"
3858
)
3959
}
4060

61+
simd_datazone_path <- find_latest_file(
62+
directory = dir,
63+
regexp = regexp,
64+
selection_method = "file_name",
65+
quiet = simd_version != "latest"
66+
)
67+
4168
return(read_file(simd_datazone_path, col_select = {{ col_select }}))
4269
}

R/get_simd_postcode.R

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,36 @@
1-
#' Get a SIMD Postcode Lookup
1+
#' Get Postcode-SIMD lookup
22
#'
3-
#' @param postcode_version Default is "latest", otherwise supply a tag
4-
#' e.g. "2023_2"
5-
#' @param simd_version Default is "latest", otherwise supply a version
6-
#' e.g. "2020v2"
7-
#' @inheritParams arrow::read_parquet
3+
#' Read a Postcode-Scottish Index of Multiple Deprivation (SIMD) lookup file
4+
#' from cl-out into a tibble.
5+
#' @param postcode_version A string defining a postcode version. The default
6+
#' value is "latest". Alternatively you can supply a string defining
7+
#' a specific version that you would like to load. It should follow pattern
8+
#' "YYYY_1" or "YYYY_2", e.g. "2023_2".
9+
#' @param simd_version A string defining a SIMD version. The default value
10+
#' is "latest". Alternatively you can supply a string defining a specific
11+
#' version. It should follow pattern "YYYY" or "YYYYv2", e.g. "2020v2".
12+
#' @inheritParams readr::read_csv
813
#'
914
#' @return a [tibble][tibble::tibble-package] of the SIMD Postcode lookup
1015
#' @export
1116
#'
1217
#' @examples
1318
#' get_simd_postcode()
14-
#' get_simd_postcode(col_select = c("pc7", "simd2020v2_rank"))
19+
#' get_simd_postcode(postcode_version = "2016_1", simd_version = "2012")
20+
#'
21+
#' library(dplyr)
22+
#' get_simd_postcode(
23+
#' postcode_version = "2016_1",
24+
#' simd_version = "2012",
25+
#' col_select = c("pc7", starts_with("simd"))
26+
#' )
1527
get_simd_postcode <- function(
1628
postcode_version = "latest",
1729
simd_version = "latest",
1830
col_select = NULL) {
1931
dir <- fs::path(get_lookups_dir(), "Deprivation")
2032

21-
if (postcode_version != "latest" && simd_version != "latest") {
22-
simd_postcode_path <- fs::path(
23-
dir,
24-
glue::glue("postcode_{postcode_version}_simd{simd_version}.parquet")
25-
)
26-
} else {
33+
if (postcode_version == "latest" && simd_version == "latest") {
2734
regexp <- paste0(
2835
"postcode_",
2936
ifelse(postcode_version == "latest", "\\d{4}_[1-2]", postcode_version),
@@ -37,6 +44,39 @@ get_simd_postcode <- function(
3744
regexp = regexp,
3845
selection_method = "file_name"
3946
)
47+
} else if (postcode_version != "latest" && simd_version != "latest") {
48+
valid_postcode_version <- stringr::str_detect(
49+
postcode_version,
50+
"\\d{4}_[1-2]"
51+
)
52+
valid_simd_version <- stringr::str_detect(
53+
simd_version,
54+
"^20[0-9]{2}(:?v2)?$"
55+
)
56+
57+
if (!valid_postcode_version || !valid_simd_version) {
58+
cli::cli_abort(c(
59+
"x" = "Invalid version specification, Postcode:
60+
{.val {postcode_version}}, SIMD: {.val {simd_version}}",
61+
"i" = "Postcode should be follow the pattern YYYY_1 or YYYY_2",
62+
"i" = "SIMD should follow the pattern YYYY or YYYYv2"
63+
))
64+
}
65+
66+
simd_postcode_path <- find_specific_file(
67+
directory = dir,
68+
lookup_type = "SIMD Postcode",
69+
version = list(
70+
postcode_version = postcode_version,
71+
simd_version = simd_version
72+
)
73+
)
74+
} else {
75+
# Case when one of the versions is 'latest' but the other isn't
76+
cli::cli_abort(c(
77+
"x" = "When using a version other than {.val latest} both
78+
{.arg postcode_version} and {.arg simd_version} must be specified."
79+
))
4080
}
4181

4282
return(read_file(

0 commit comments

Comments
 (0)