From bcc221352d9b08538cc1f191ed5a6332c6515ad3 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 21 Apr 2026 15:18:47 +0200 Subject: [PATCH 1/7] feat: :sparkles: `list_parquet_files()` and `list_parquet_datasets()` helpers --- R/list.R | 40 +++++++++++++++++++++++++++++ man/list_parquet.Rd | 31 +++++++++++++++++++++++ tests/testthat/test-list.R | 52 ++++++++++++++++++++++++++++++++++++++ vignettes/fastreg.qmd | 22 ++++++++++++++++ 4 files changed, 145 insertions(+) create mode 100644 man/list_parquet.Rd diff --git a/R/list.R b/R/list.R index ac23340c..f2e408cf 100644 --- a/R/list.R +++ b/R/list.R @@ -30,3 +30,43 @@ list_sas_files <- function(path) { sas_files } + +#' List Parquet datasets or files in a project +#' +#' Only lists Parquet files that end in `part-*.parquet`. For datasets, +#' it will only look for Parquet files with a `year=YYYY` in its path. +#' This function will search the whole system for the project ID, so might +#' be slow sometimes. +#' +#' @name list_parquet +#' @rdname list_parquet +#' @param project_id The project ID to look for. +#' @returns The path(s) to the Parquet datasets (as directories) or files. +NULL + +#' @describeIn list_parquet List all Parquet (Hive partitioned by year) datasets. +#' @export +list_parquet_datasets <- function() { + list_parquet_files() |> + fs::path_filter(regexp = "year=[[:digit:]]{4}") |> + fs::path_dir() |> + fs::path_dir() |> + unique() |> + fs::path() +} + +#' @describeIn list_parquet List all Parquet files within a project. +#' @export +list_parquet_files <- function() { + rawdata_path <- get_project_rawdata_dir() + workdata_path <- get_project_workdata_dir() + + fs::dir_ls( + # Start from root of system. + c(rawdata_path, workdata_path), + regexp = glue::glue(".*/part-.*\\.parquet$"), + recurse = TRUE, + fail = FALSE, + type = "file" + ) +} diff --git a/man/list_parquet.Rd b/man/list_parquet.Rd new file mode 100644 index 00000000..45208f84 --- /dev/null +++ b/man/list_parquet.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/list.R +\name{list_parquet} +\alias{list_parquet} +\alias{list_parquet_datasets} +\alias{list_parquet_files} +\title{List Parquet datasets or files in a project} +\usage{ +list_parquet_datasets() + +list_parquet_files() +} +\arguments{ +\item{project_id}{The project ID to look for.} +} +\value{ +The path(s) to the Parquet datasets (as directories) or files. +} +\description{ +Only lists Parquet files that end in \verb{part-*.parquet}. For datasets, +it will only look for Parquet files with a \code{year=YYYY} in its path. +This function will search the whole system for the project ID, so might +be slow sometimes. +} +\section{Functions}{ +\itemize{ +\item \code{list_parquet_datasets()}: List all Parquet (Hive partitioned by year) datasets. + +\item \code{list_parquet_files()}: List all Parquet files within a project. + +}} diff --git a/tests/testthat/test-list.R b/tests/testthat/test-list.R index 25c63824..b0f533a8 100644 --- a/tests/testthat/test-list.R +++ b/tests/testthat/test-list.R @@ -42,3 +42,55 @@ test_that("list_sas_files() errors when path does not exist", { regexp = "does not exist" ) }) + +# Test list_parquet_datasets() ------------------------------------------------- + +# Make all combinations of paths to Parquet files for testing. +parquet_files <- tidyr::expand_grid( + root = c("rawdata", "workdata"), + project = "701010", + register = c("bef", "lmdb"), + year = c("year=2023", "year=2024", "year=__HIVE_DEFAULT_PARTITION__"), + file = c("part-bae04.parquet", "part-04df1.parquet") +) |> + purrr::pmap_chr( + \(root, project, register, year, file) { + fs::path(fs::path_temp(root), project, register, year, file) + } + ) |> + fs::path() + +purrr::walk(parquet_files, \(path) fs::dir_create(fs::path_dir(path))) +purrr::walk(parquet_files, fs::file_create) +# purrr::walk(parquet_files, \(path) fs::file_delete(path)) + +test_that("list expected Parquet files and datasets", { + withr::with_options( + list( + fastreg.project_rawdata_dir = fs::path_temp("rawdata/701010/"), + fastreg.project_workdata_dir = fs::path_temp("workdata/701010/") + ), + { + expected_files <- parquet_files |> + sort() + actual_files <- list_parquet_files() |> + # Need to remove name attributes for comparison. + unname() |> + sort() + + expect_identical(actual_files, expected_files) + + expected_datasets <- parquet_files |> + fs::path_dir() |> + fs::path_dir() |> + unique() |> + fs::path() |> + sort() + + actual_datasets <- list_parquet_datasets() |> + sort() + + expect_identical(actual_datasets, expected_datasets) + } + ) +}) diff --git a/vignettes/fastreg.qmd b/vignettes/fastreg.qmd index 03444048..b68fd584 100644 --- a/vignettes/fastreg.qmd +++ b/vignettes/fastreg.qmd @@ -204,6 +204,28 @@ file into a Parquet file, all done in parallel. Re-running `tar_make()` only re-converts registers whose source files have changed or if the pipeline itself has been edited. +## Listing available Parquet files and datasets + +To list what Parquet files or datasets are available, there's the +helpful `list_parquet_files()` and `list_parquet_datasets()` functions. +These look in the `fastreg.project_workdata_dir` and +`fastreg.project_rawdata_dir` directories (set with `options()`) for any +Parquet files following a specific pattern. See the function help docs +for more details. + +You can use them interactively in the Console: + + + +```{r list-files} +#| filename: "Console" +#| eval: false +# For individual files +list_parquet_files() +# For datasets (registers with all years). +list_parquet_datasets() +``` + ## Reading a Parquet register The final function reads the converted Parquet register data into R, From 6608e5056852df2cb48172ed38ed78056bef439e Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Fri, 24 Apr 2026 15:54:07 +0200 Subject: [PATCH 2/7] chore: :wrench: regen roxygen docs --- NAMESPACE | 2 ++ man/use_template.Rd | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 5b0aaf14..544c7088 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(convert) +export(list_parquet_datasets) +export(list_parquet_files) export(list_sas_files) export(read_parquet_file) export(read_parquet_partition) diff --git a/man/use_template.Rd b/man/use_template.Rd index 90bc67c8..d71a13a8 100644 --- a/man/use_template.Rd +++ b/man/use_template.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/use-targets.R +% Please edit documentation in R/use.R \name{use_template} \alias{use_template} \title{Use a targets pipeline template for converting SAS registers to Parquet} From 8df70f7589ff7e95caf7169a662f5e5a85984c67 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Fri, 24 Apr 2026 16:01:40 +0200 Subject: [PATCH 3/7] build: :heavy_plus_sign: move tidyr into Suggests --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index d07c801a..779a46ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -34,6 +34,7 @@ Imports: uuid Suggests: crew, + tidyr, dbplyr, devtools, duckdb, From 42f1f11670e0ca69d837d905c1495e139d05c3c7 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Mon, 27 Apr 2026 14:17:17 +0200 Subject: [PATCH 4/7] docs: :pencil2: forgot to remove param in roxygen docs --- R/list.R | 1 - man/list_parquet.Rd | 3 --- 2 files changed, 4 deletions(-) diff --git a/R/list.R b/R/list.R index f2e408cf..98b47210 100644 --- a/R/list.R +++ b/R/list.R @@ -40,7 +40,6 @@ list_sas_files <- function(path) { #' #' @name list_parquet #' @rdname list_parquet -#' @param project_id The project ID to look for. #' @returns The path(s) to the Parquet datasets (as directories) or files. NULL diff --git a/man/list_parquet.Rd b/man/list_parquet.Rd index 45208f84..9890afad 100644 --- a/man/list_parquet.Rd +++ b/man/list_parquet.Rd @@ -10,9 +10,6 @@ list_parquet_datasets() list_parquet_files() } -\arguments{ -\item{project_id}{The project ID to look for.} -} \value{ The path(s) to the Parquet datasets (as directories) or files. } From 2f61787acb5dd65cbe16c4885a92f2175cfa8813 Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 28 Apr 2026 13:01:11 +0200 Subject: [PATCH 5/7] docs: :pencil2: edits from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Signe Kirk Brødbæk <40836345+signekb@users.noreply.github.com> --- R/list.R | 2 +- vignettes/fastreg.qmd | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/list.R b/R/list.R index 98b47210..c2354e70 100644 --- a/R/list.R +++ b/R/list.R @@ -35,7 +35,7 @@ list_sas_files <- function(path) { #' #' Only lists Parquet files that end in `part-*.parquet`. For datasets, #' it will only look for Parquet files with a `year=YYYY` in its path. -#' This function will search the whole system for the project ID, so might +#' This function will search the whole system for the project ID, so it might #' be slow sometimes. #' #' @name list_parquet diff --git a/vignettes/fastreg.qmd b/vignettes/fastreg.qmd index b727c0e1..61f50259 100644 --- a/vignettes/fastreg.qmd +++ b/vignettes/fastreg.qmd @@ -213,12 +213,12 @@ pipeline itself has been edited. ## Listing available Parquet files and datasets -To list what Parquet files or datasets are available, there's the -helpful `list_parquet_files()` and `list_parquet_datasets()` functions. +To list what Parquet files or datasets are available, use the +`list_parquet_files()` and `list_parquet_datasets()` functions. These look in the `fastreg.project_workdata_dir` and `fastreg.project_rawdata_dir` directories (set with `options()`) for any -Parquet files following a specific pattern. See the function help docs -for more details. +Parquet files following a specific pattern. See the reference documentation +for more details. You can use them interactively in the Console: From c3e334b1a511feebfcf2140d15639eaf7d2631a6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Apr 2026 11:01:19 +0000 Subject: [PATCH 6/7] =?UTF-8?q?chore:=20=E2=9C=8F=EF=B8=8F=20automatic=20p?= =?UTF-8?q?re-commit=20hook=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/list.R | 2 +- vignettes/fastreg.qmd | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/list.R b/R/list.R index c2354e70..9ecacabe 100644 --- a/R/list.R +++ b/R/list.R @@ -35,7 +35,7 @@ list_sas_files <- function(path) { #' #' Only lists Parquet files that end in `part-*.parquet`. For datasets, #' it will only look for Parquet files with a `year=YYYY` in its path. -#' This function will search the whole system for the project ID, so it might +#' This function will search the whole system for the project ID, so it might #' be slow sometimes. #' #' @name list_parquet diff --git a/vignettes/fastreg.qmd b/vignettes/fastreg.qmd index 61f50259..be6f7922 100644 --- a/vignettes/fastreg.qmd +++ b/vignettes/fastreg.qmd @@ -213,12 +213,12 @@ pipeline itself has been edited. ## Listing available Parquet files and datasets -To list what Parquet files or datasets are available, use the -`list_parquet_files()` and `list_parquet_datasets()` functions. +To list what Parquet files or datasets are available, use the +`list_parquet_files()` and `list_parquet_datasets()` functions. These look in the `fastreg.project_workdata_dir` and `fastreg.project_rawdata_dir` directories (set with `options()`) for any -Parquet files following a specific pattern. See the reference documentation -for more details. +Parquet files following a specific pattern. See the reference documentation +for more details. You can use them interactively in the Console: From ec66745dea0f5a1cfbaa37e9b1d25d97726fb17a Mon Sep 17 00:00:00 2001 From: "Luke W. Johnston" Date: Tue, 28 Apr 2026 13:08:31 +0200 Subject: [PATCH 7/7] chore: :bento: rebuild Rd docs --- man/list_parquet.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/list_parquet.Rd b/man/list_parquet.Rd index 9890afad..8bd01da2 100644 --- a/man/list_parquet.Rd +++ b/man/list_parquet.Rd @@ -16,7 +16,7 @@ The path(s) to the Parquet datasets (as directories) or files. \description{ Only lists Parquet files that end in \verb{part-*.parquet}. For datasets, it will only look for Parquet files with a \code{year=YYYY} in its path. -This function will search the whole system for the project ID, so might +This function will search the whole system for the project ID, so it might be slow sometimes. } \section{Functions}{