#' Convert register SAS file(s) and save to Parquet format
#'
#' @description
#' This function reads one or more SAS files for a given register, and saves
#' the data in Parquet format. It expects the input SAS files to come from the
#' same register, e.g., different years of the same register. The function
#' checks that all files belong to the same register by comparing the
#' alphabetic characters in the file name(s).
#'
#' The function looks for a year (1900-2099) in the file names in `path` to use
#' the year as partition, see `vignette("design")` for more information about
#' the partitioning.
#'
#' If a year is found, the data is saved as a partition by year in the output
#' directory, e.g., `output_dir/register_name/year=2020/part-ad5b.parquet`
#' (the ending being a UUID). If no year is found in the file name, the data
#' is saved in a `year=__HIVE_DEFAULT_PARTITION__` partition, which is the
#' standard Hive convention for missing partition values.
#'
#' Two columns are added to the output: `source_file` (the original SAS file
#' path) and `year` (extracted from the file name, used as partition key).
#'
#' To be able to handle larger-than-memory SAS files, this function uses
#' `convert_file()` internally and only converts one file at a time in chunks.
#' As a result, identical rows are not deduplicated.
#'
#' Because part files are named with UUIDs, re-running the conversion into a
#' register directory that already holds files would duplicate data. The
#' function therefore aborts if the register directory exists and is not
#' empty.
#'
#' @param path Paths to SAS files for one register. See [list_sas_files()].
#' @param output_dir Directory to save the Parquet output to. Must not include
#'   the register name as this will be extracted from `path` to create the
#'   register folder.
#' @param chunk_size Number of rows to read and convert at a time.
#'
#' @returns `output_dir`, invisibly.
#'
#' @export
#' @examples
#' sas_file_directory <- fs::path_package("fastreg", "extdata")
#' convert_register(
#'   path = list_sas_files(sas_file_directory),
#'   output_dir = fs::path_temp("path/to/output/register/")
#' )
convert_register <- function(
  path,
  output_dir,
  chunk_size = 10000000L
) {
  # `get_register_name()` also checks that only one register is in `path`,
  # so this doubles as input validation before any file is converted.
  register_dir <- fs::path(output_dir, get_register_name(path))

  # Check that register dir is empty (if exists) to avoid duplicating data
  # since parts are named with UUIDs.
  if (fs::dir_exists(register_dir) && length(fs::dir_ls(register_dir)) > 0) {
    cli::cli_abort(c(
      "Output directory is not empty: {.path {register_dir}}",
      "i" = "Delete the directory manually before re-running."
    ))
  }

  # Convert files one at a time.
  purrr::walk(path, \(p) {
    convert_file(p, output_dir, chunk_size)
    # Presumably frees memory held from the file just converted before the
    # next one is read.
    gc()
  })

  # Success message. `register_dir` is reused rather than recomputed.
  cli::cli_alert_success("Successfully converted {length(path)} file{?s}.")
  cli::cli_bullets(c(
    "*" = "Input: {.val {fs::path_file(path)}}",
    "*" = "Output: Register files in {.path {register_dir}}"
  ))

  invisible(output_dir)
}
74-
751# ' Convert a single register SAS file to Parquet
762# '
773# ' To be able to handle larger-than-memory files, the SAS file is converted in
@@ -80,7 +6,10 @@ convert_register <- function(
806# ' exists in the directory, since files are saved with UUIDs in their names.
817# '
828# ' @param path Path to a single SAS file.
83- # ' @inheritParams convert_register
9+ # ' @param output_dir Directory to save the Parquet output to. Must not include
10+ # ' the register name as this will be extracted from `path` to create the
11+ # ' register folder.
12+ # ' @param chunk_size Number of rows to read and convert at a time.
8413# '
8514# ' @returns `output_dir`, invisibly.
8615# '
0 commit comments