Merged
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -35,6 +35,7 @@ Imports:
uuid
Suggests:
crew,
tidyr,
dbplyr,
devtools,
duckdb,
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -1,6 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(convert)
export(list_parquet_datasets)
export(list_parquet_files)
export(list_sas_files)
export(read_parquet_file)
export(read_parquet_partition)
39 changes: 39 additions & 0 deletions R/list.R
@@ -30,3 +30,42 @@ list_sas_files <- function(path) {

sas_files
}

#' List Parquet datasets or files in a project
#'
#' Only lists Parquet files whose names match `part-*.parquet`. For datasets,
#' only files with a `year=YYYY` segment in their path are considered.
#' These functions search the project's raw and working data directories
#' recursively, so they can be slow for projects with many files.
#'
#' @name list_parquet
#' @rdname list_parquet
#' @returns The path(s) to the Parquet datasets (as directories) or files.
NULL

#' @describeIn list_parquet List all Parquet datasets (Hive-partitioned by year).
#' @export
list_parquet_datasets <- function() {
list_parquet_files() |>
fs::path_filter(regexp = "year=[[:digit:]]{4}") |>
fs::path_dir() |>
fs::path_dir() |>
unique() |>
fs::path()
}

#' @describeIn list_parquet List all Parquet files within a project.
#' @export
list_parquet_files <- function() {
rawdata_path <- get_project_rawdata_dir()
workdata_path <- get_project_workdata_dir()

fs::dir_ls(
    # Search the raw data and working data directories recursively.
c(rawdata_path, workdata_path),
    regexp = ".*/part-.*\\.parquet$",
recurse = TRUE,
fail = FALSE,
type = "file"
)
}
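
The dataset derivation above (keep only files inside a `year=YYYY` partition, then go up two directory levels) can be sketched with base R. The paths below are hypothetical examples, not real project data:

```r
# Hypothetical partitioned Parquet file paths.
files <- c(
  "/rawdata/701010/bef/year=2023/part-bae04.parquet",
  "/rawdata/701010/bef/year=2024/part-04df1.parquet"
)
# Keep only files inside a `year=YYYY` partition.
partitioned <- files[grepl("year=[[:digit:]]{4}", files)]
# Going up two directory levels turns a file path into its dataset directory.
datasets <- unique(dirname(dirname(partitioned)))
datasets
#> "/rawdata/701010/bef"
```

The real function uses `fs::path_filter()` and `fs::path_dir()` instead, which behave the same way for this purpose but return tidy `fs_path` objects.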
28 changes: 28 additions & 0 deletions man/list_parquet.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/use_template.Rd

52 changes: 52 additions & 0 deletions tests/testthat/test-list.R
@@ -42,3 +42,55 @@ test_that("list_sas_files() errors when path does not exist", {
regexp = "does not exist"
)
})

# Test list_parquet_files() and list_parquet_datasets() ------------------------

# Make all combinations of paths to Parquet files for testing.
parquet_files <- tidyr::expand_grid(
root = c("rawdata", "workdata"),
project = "701010",
register = c("bef", "lmdb"),
year = c("year=2023", "year=2024", "year=__HIVE_DEFAULT_PARTITION__"),
file = c("part-bae04.parquet", "part-04df1.parquet")
) |>
purrr::pmap_chr(
\(root, project, register, year, file) {
fs::path(fs::path_temp(root), project, register, year, file)
}
) |>
fs::path()

purrr::walk(parquet_files, \(path) fs::dir_create(fs::path_dir(path)))
purrr::walk(parquet_files, fs::file_create)
# Clean up the temporary test files once this test file finishes.
withr::defer(fs::file_delete(parquet_files), teardown_env())

test_that("list expected Parquet files and datasets", {
withr::with_options(
list(
fastreg.project_rawdata_dir = fs::path_temp("rawdata/701010/"),
fastreg.project_workdata_dir = fs::path_temp("workdata/701010/")
),
{
expected_files <- parquet_files |>
sort()
actual_files <- list_parquet_files() |>
# Need to remove name attributes for comparison.
unname() |>
sort()

expect_identical(actual_files, expected_files)

expected_datasets <- parquet_files |>
fs::path_dir() |>
fs::path_dir() |>
unique() |>
fs::path() |>
sort()

actual_datasets <- list_parquet_datasets() |>
sort()

expect_identical(actual_datasets, expected_datasets)
}
)
})
22 changes: 22 additions & 0 deletions vignettes/fastreg.qmd
@@ -211,6 +211,28 @@ file into a Parquet file, all done in parallel. Re-running `tar_make()`
only re-converts registers whose source files have changed or if the
pipeline itself has been edited.

## Listing available Parquet files and datasets

To list the available Parquet files or datasets, use the
`list_parquet_files()` and `list_parquet_datasets()` functions.
They look in the `fastreg.project_workdata_dir` and
`fastreg.project_rawdata_dir` directories (set with `options()`) for any
Parquet files matching a specific naming pattern. See the reference
documentation for more details.
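
Before calling them, the two directory options need to be set, for instance in your project's `.Rprofile`. A minimal sketch with placeholder paths (replace them with your project's actual directories):

```{r set-options}
#| filename: "Console"
#| eval: false
# Placeholder paths; replace with your project's actual directories.
options(
  fastreg.project_rawdata_dir = "/path/to/rawdata/701010",
  fastreg.project_workdata_dir = "/path/to/workdata/701010"
)
```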

You can use them interactively in the Console:

<!-- TODO: remove eval false when other functions have been revised to match this style -->

```{r list-files}
#| filename: "Console"
#| eval: false
# For individual files.
list_parquet_files()
# For datasets (registers with all years).
list_parquet_datasets()
```

## Reading a Parquet register

The final function reads the converted Parquet register data into R,