diff --git a/DESCRIPTION b/DESCRIPTION
index f5cec82..e680d44 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
     uuid
 Suggests:
     crew,
+    tidyr,
     dbplyr,
     devtools,
     duckdb,
diff --git a/NAMESPACE b/NAMESPACE
index 5b0aaf1..544c708 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
 export(convert)
+export(list_parquet_datasets)
+export(list_parquet_files)
 export(list_sas_files)
 export(read_parquet_file)
 export(read_parquet_partition)
diff --git a/R/list.R b/R/list.R
index ac23340..9ecacab 100644
--- a/R/list.R
+++ b/R/list.R
@@ -30,3 +30,43 @@ list_sas_files <- function(path) {
 
   sas_files
 }
+
+#' List Parquet datasets or files in a project
+#'
+#' Only lists Parquet files that end in `part-*.parquet`. For datasets,
+#' it will only look for Parquet files with a `year=YYYY` in its path.
+#' The search recurses through the project's raw data and work data
+#' directories, so it might be slow for projects with many files.
+#'
+#' @name list_parquet
+#' @rdname list_parquet
+#' @returns The path(s) to the Parquet datasets (as directories) or files.
+NULL
+
+#' @describeIn list_parquet List all Parquet (Hive partitioned by year) datasets.
+#' @export
+list_parquet_datasets <- function() {
+  # Two `path_dir()` calls: drop the file name, then its `year=YYYY` parent.
+  list_parquet_files() |>
+    fs::path_filter(regexp = "year=[[:digit:]]{4}") |>
+    fs::path_dir() |>
+    fs::path_dir() |>
+    unique() |>
+    fs::path()
+}
+
+#' @describeIn list_parquet List all Parquet files within a project.
+#' @export
+list_parquet_files <- function() {
+  rawdata_path <- get_project_rawdata_dir()
+  workdata_path <- get_project_workdata_dir()
+
+  fs::dir_ls(
+    # Only search the project's raw data and work data directories.
+    c(rawdata_path, workdata_path),
+    regexp = ".*/part-.*\\.parquet$",
+    recurse = TRUE,
+    fail = FALSE,
+    type = "file"
+  )
+}
diff --git a/man/list_parquet.Rd b/man/list_parquet.Rd
new file mode 100644
index 0000000..8bd01da
--- /dev/null
+++ b/man/list_parquet.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/list.R
+\name{list_parquet}
+\alias{list_parquet}
+\alias{list_parquet_datasets}
+\alias{list_parquet_files}
+\title{List Parquet datasets or files in a project}
+\usage{
+list_parquet_datasets()
+
+list_parquet_files()
+}
+\value{
+The path(s) to the Parquet datasets (as directories) or files.
+}
+\description{
+Only lists Parquet files that end in \verb{part-*.parquet}. For datasets,
+it will only look for Parquet files with a \code{year=YYYY} in its path.
+The search recurses through the project's raw data and work data
+directories, so it might be slow for projects with many files.
+}
+\section{Functions}{
+\itemize{
+\item \code{list_parquet_datasets()}: List all Parquet (Hive partitioned by year) datasets.
+
+\item \code{list_parquet_files()}: List all Parquet files within a project.
+
+}}
diff --git a/man/use_template.Rd b/man/use_template.Rd
index 90bc67c..d71a13a 100644
--- a/man/use_template.Rd
+++ b/man/use_template.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/use-targets.R
+% Please edit documentation in R/use.R
 \name{use_template}
 \alias{use_template}
 \title{Use a targets pipeline template for converting SAS registers to Parquet}
diff --git a/tests/testthat/test-list.R b/tests/testthat/test-list.R
index 25c6382..b0f533a 100644
--- a/tests/testthat/test-list.R
+++ b/tests/testthat/test-list.R
@@ -42,3 +42,55 @@ test_that("list_sas_files() errors when path does not exist", {
     regexp = "does not exist"
   )
 })
+
+# Test list_parquet_files() and list_parquet_datasets() ------------------------
+
+# Make all combinations of paths to Parquet files for testing.
+test_that("list expected Parquet files and datasets", {
+  skip_if_not_installed("tidyr")
+
+  # Build the fixture inside a directory that is deleted when the test ends.
+  root_dir <- withr::local_tempdir()
+  parquet_files <- tidyr::expand_grid(
+    root = c("rawdata", "workdata"),
+    project = "701010",
+    register = c("bef", "lmdb"),
+    year = c("year=2023", "year=2024", "year=__HIVE_DEFAULT_PARTITION__"),
+    file = c("part-bae04.parquet", "part-04df1.parquet")
+  ) |>
+    purrr::pmap_chr(
+      \(root, project, register, year, file) {
+        fs::path(root_dir, root, project, register, year, file)
+      }
+    ) |>
+    fs::path()
+  fs::dir_create(fs::path_dir(parquet_files))
+  fs::file_create(parquet_files)
+
+  # Point the package at the fixture; the options are reset after the test.
+  withr::local_options(
+    fastreg.project_rawdata_dir = fs::path(root_dir, "rawdata", "701010"),
+    fastreg.project_workdata_dir = fs::path(root_dir, "workdata", "701010")
+  )
+
+  expected_files <- sort(parquet_files)
+  actual_files <- list_parquet_files() |>
+    # Need to remove name attributes for comparison.
+    unname() |>
+    sort()
+
+  expect_identical(actual_files, expected_files)
+
+  # Every register directory is a dataset: each one holds `year=YYYY`
+  # partitions next to the `__HIVE_DEFAULT_PARTITION__` one.
+  expected_datasets <- parquet_files |>
+    fs::path_dir() |>
+    fs::path_dir() |>
+    unique() |>
+    fs::path() |>
+    sort()
+
+  actual_datasets <- sort(list_parquet_datasets())
+
+  expect_identical(actual_datasets, expected_datasets)
+})
diff --git a/vignettes/fastreg.qmd b/vignettes/fastreg.qmd
index 3468e0e..be6f792 100644
--- a/vignettes/fastreg.qmd
+++ b/vignettes/fastreg.qmd
@@ -211,6 +211,26 @@ file into a Parquet file, all done in parallel.
 Re-running `tar_make()` only re-converts registers whose source files have
 changed or if the pipeline itself has been edited.
 
+## Listing available Parquet files and datasets
+
+To list what Parquet files or datasets are available, use the
+`list_parquet_files()` and `list_parquet_datasets()` functions.
+These look in the `fastreg.project_workdata_dir` and
+`fastreg.project_rawdata_dir` directories (set with `options()`) for any
+Parquet files named `part-*.parquet`. See the reference documentation
+for more details.
+
+You can use them interactively in the Console:
+
+```{r list-files}
+#| filename: "Console"
+#| eval: false
+# For individual files.
+list_parquet_files()
+# For datasets (registers with all years).
+list_parquet_datasets()
+```
+
 ## Reading a Parquet register
 
 The final function reads the converted Parquet register data into R,