Skip to content

Commit 4649f25

Browse files
authored
Merge pull request #115 from mountainMath/v0.3.14
better deal with semi-long tables, more robust header parsing
2 parents 96bdc4c + 2169db4 commit 4649f25

67 files changed

Lines changed: 194 additions & 124 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: cansim
22
Type: Package
33
Title: Accessing Statistics Canada Data Table and Vectors
4-
Version: 0.3.13
4+
Version: 0.3.14
55
Authors@R: c(
66
person("Jens", "von Bergmann", email = "jens@mountainmath.ca", role = c("cre")),
77
person("Dmitry", "Shkolnik", email = "shkolnikd@gmail.com", role = c("aut")))

NEWS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# cansim 0.3.14
2+
## Minor changes
3+
* Better header parsing to avoid warning messages
4+
* Fix problem with some semi-wide tables
5+
16
# cansim 0.3.13
27
## Minor changes
38
* Speed up access to cached sqlite tables

R/cansim.R

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,10 +524,27 @@ get_cansim <- function(cansimTableNumber, language="english", refresh=FALSE, tim
524524
value_column="VALEUR"
525525
}
526526

527+
header <- csv_reader(file.path(exdir, paste0(base_table, ".csv")), n_max=1,
528+
na=na_strings,
529+
locale=readr::locale(encoding="UTF-8"),
530+
col_types = list(.default = "c"),
531+
col_names = FALSE) %>%
532+
as.character()
533+
534+
symbols <- which(header=="Symbol")
535+
536+
if (length(symbols)>1) {
537+
header[symbols] <- paste0("Symbol ",seq(1,length(symbols)))
538+
}
539+
540+
541+
527542
data <- csv_reader(file.path(exdir, paste0(base_table, ".csv")),
528543
na=na_strings,
529544
locale=readr::locale(encoding="UTF-8"),
530-
col_types = list(.default = "c"))
545+
col_types = list(.default = "c"),
546+
skip=1,
547+
col_names = header)
531548

532549
data <- data %>% transform_value_column(value_column)
533550

R/cansim_helpers.R

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,8 @@ add_provincial_abbreviations <- function(data){
215215
short_prov <- short_prov.fr
216216
}
217217
data <- data %>%
218-
mutate(GEO.abb=factor(as.character(short_prov[!!as.name(data_geography_column)]), levels=c("CAN","BC","AB","SK","MB","ON","QC","NB","PE","NS","NL","YT","NT","NU","NTNU")))
218+
mutate(GEO.abb=factor(as.character(short_prov[!!as.name(data_geography_column)]),
219+
levels=c("CAN","BC","AB","SK","MB","ON","QC","NB","PE","NS","NL","YT","NT","NU","NTNU")))
219220
}
220221

221222

@@ -262,44 +263,43 @@ get_cansim_code_set <- function(code_set=c("scalar", "frequency", "symbol", "sta
262263
# transforms the value column to numeric. If table is in semi-wide form it converts the wide-form dimension
263264
# to long form and creates and modifies the COORDINATE column as needed.
264265
transform_value_column <- function(data,value_column){
265-
symbol_grep_string <- "^Symbol...\\d+$|^Symbol$|^Symbol_\\d+$"
266-
if (!(value_column %in% names(data)) & sum(grepl(symbol_grep_string,names(data)))>1) {
267-
symbols <- which(grepl(symbol_grep_string,names(data)))
268-
dimension_grep_string <- paste0("^.+ \\(",length(symbols),"\\):.+\\[\\d+\\]$")
266+
symbols <- which(grepl("^Symbol( \\d+)*$",names(data)))
267+
if (!(value_column %in% names(data)) & length(symbols)>1) {
268+
#message("\nTransforming to long form.")
269+
dimension_grep_string <- paste0("^.+ \\(",length(symbols),"[A-Z]*\\):.+\\[\\d+\\]$")
269270
dimensions <- which(grepl(dimension_grep_string,names(data)))
270271
if (sum(symbols!=dimensions+1)>0) {
271272
warning("Unable to identify dimensions")
272273
} else {
273-
dimension_members <- gsub(paste0("^.+ \\(",length(symbols),"\\): *"),"",names(data)[dimensions]) %>%
274+
count_type <- stringr::str_match(names(data)[dimensions][1],paste0("(\\(",length(symbols),"[A-Z]*\\))"))[1,2]
275+
dimension_members <- gsub(paste0("^.+ \\(",length(symbols),"[A-Z]*\\): *"),"",names(data)[dimensions]) %>%
274276
gsub(" *\\[\\d+\\]$","",.)
275277
member_ids <- stringr::str_extract(names(data)[dimensions],"\\[\\d+\\]$") %>% gsub("\\[|\\]","",.)
276-
dimension_name <- gsub(paste0(" \\(",length(symbols),"\\):.+\\[\\d+\\]"),"",names(data)[dimensions]) %>%
277-
unique() %>% paste0(.," (",length(symbols),")")
278+
dimension_name <- gsub(paste0(" \\(",length(symbols),"[A-Z]*\\):.+\\[\\d+\\]"),"",names(data)[dimensions]) %>%
279+
unique() %>% paste0(.," ",count_type)
278280

279281
if (length(dimension_name)>1) {
280282
warning("Unable to identify dimension name")
281283
} else {
282-
data_short <- data %>%
283-
select(-c(symbols,dimensions))
284-
data <- data_short %>%
285-
dplyr::left_join(
286-
data %>%
287-
dplyr::select(-symbols) %>%
288-
tidyr::pivot_longer(matches(dimension_grep_string),names_to=dimension_name,values_to="VALUE") %>%
289-
dplyr::mutate(!!paste0("Member ID: ",dimension_name):=
290-
stringr::str_extract(.data[[dimension_name]],"\\[\\d+\\]$") %>% gsub("\\[|\\]","",.)) %>%
291-
dplyr::mutate_at(dimension_name,function(d)
292-
gsub(paste0("^.+ \\(",length(symbols),"\\): *"),"",d) %>%
293-
gsub(" *\\[\\d+\\]$","",.)),
294-
by=names(data_short))
284+
renames <- c(setNames(names(data)[dimensions],paste0(member_ids," --- ",value_column)),
285+
setNames(names(data)[symbols],paste0(member_ids," --- Symbol")))
286+
287+
member_names <- dplyr::tibble(!!as.name(paste0("Member ID: ",dimension_name)):=member_ids,
288+
!!as.name(dimension_name):=dimension_members)
289+
290+
data <- data %>%
291+
dplyr::rename(!!!renames) %>%
292+
tidyr::pivot_longer(matches(" --- "), names_pattern="^(.+) --- (.+)$",
293+
names_to=c(paste0("Member ID: ",dimension_name),".value")) %>%
294+
dplyr::left_join(member_names,by=paste0("Member ID: ",dimension_name))
295295
if ("Coordinate" %in% names(data)) {
296296
data <- data %>%
297297
dplyr::mutate(COORDINATE = paste0(.data$Coordinate,".",!!as.name(paste0("Member ID: ",dimension_name)))) %>%
298-
select(-.data$Coordinate)
298+
dplyr::select(-.data$Coordinate)
299299
}
300+
300301
data <- data %>%
301302
dplyr::select(-dplyr::all_of(paste0("Member ID: ",dimension_name)))
302-
data_short <- NULL
303303
}
304304
}
305305
}
@@ -318,9 +318,12 @@ format_file_size <- function (x, units = "b", standard = "auto", digits = 1L, ..
318318
{
319319
known_bases <- c(legacy = 1024, IEC = 1024, SI = 1000)
320320
known_units <- list(SI = c("B", "kB", "MB", "GB", "TB", "PB",
321-
"EB", "ZB", "YB"), IEC = c("B", "KiB", "MiB", "GiB",
322-
"TiB", "PiB", "EiB", "ZiB", "YiB"), legacy = c("b", "Kb",
323-
"Mb", "Gb", "Tb", "Pb"), LEGACY = c("B", "KB", "MB",
321+
"EB", "ZB", "YB"),
322+
IEC = c("B", "KiB", "MiB", "GiB",
323+
"TiB", "PiB", "EiB", "ZiB", "YiB"),
324+
legacy = c("b", "Kb",
325+
"Mb", "Gb", "Tb", "Pb"),
326+
LEGACY = c("B", "KB", "MB",
324327
"GB", "TB", "PB"))
325328
units <- match.arg(units, c("auto", unique(unlist(known_units),
326329
use.names = FALSE)))

R/cansim_sql.R

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,12 +142,34 @@ get_cansim_sqlite <- function(cansimTableNumber, language="english", refresh=FAL
142142
hierarchy_name <- paste0(hierarchy_prefix," ", data_geography_column)
143143
}
144144

145+
146+
header <- readr::read_delim(file.path(exdir, paste0(base_table, ".csv")),
147+
n_max=1,
148+
delim=delim,
149+
na=na_strings,
150+
locale=readr::locale(encoding="UTF-8"),
151+
col_types = list(.default = "c"),
152+
col_names = FALSE) %>%
153+
as.character()
154+
155+
symbols <- which(grepl("^Symbol( .+)*$",header,ignore.case = TRUE))
156+
sl <- length(symbols)
157+
158+
if (sl>1) {
159+
header[symbols] <- paste0("Symbol ",seq(1,sl))
160+
}
161+
162+
chunk_size=ceiling(5000000/pmax(sl,1))
163+
145164
csv2sqlite(file.path(exdir, paste0(base_table, ".csv")),
146165
sqlite_file = sqlite_path,
147166
table_name=table_name,
148167
col_types = list(.default = "c"),
168+
col_names = header,
169+
skip=1,
149170
na = na_strings,
150171
delim = delim,
172+
chunk_size=chunk_size,
151173
transform=function(data){
152174
data <- data %>% transform_value_column(value_string)
153175
if (length(geo_column_pos)==1)
@@ -442,12 +464,13 @@ create_index <- function(connection,table_name,field){
442464
#' @param na na character strings
443465
#' @param text_encoding encoding of csv file (default UTF-8)
444466
#' @param delim (Optional) csv delimiter, default is ","
467+
#' @param ... (Optional) additional parameters passed to `readr::read_delim_chunked`
445468
#'
446469
#' @return A database connection
447470
#' @keywords internal
448471
csv2sqlite <- function(csv_file, sqlite_file, table_name, transform=NULL,chunk_size=5000000,
449472
append=FALSE,col_types=NULL,na=c(NA,"..","","...","F"),
450-
text_encoding="UTF-8",delim = ",") {
473+
text_encoding="UTF-8",delim = ",",...) {
451474
# Connect to database.
452475
if (!append && file.exists(sqlite_file)) file.remove(sqlite_file)
453476
con <- DBI::dbConnect(RSQLite::SQLite(), dbname=sqlite_file)
@@ -463,7 +486,8 @@ csv2sqlite <- function(csv_file, sqlite_file, table_name, transform=NULL,chunk_s
463486
col_types=col_types,
464487
chunk_size = chunk_size,
465488
locale=readr::locale(encoding = text_encoding),
466-
na=na)
489+
na=na,
490+
...)
467491

468492
DBI::dbDisconnect(con)
469493
}

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,15 +173,15 @@ If you want to get in touch, we are pretty good at responding via email or via t
173173

174174
If you wish to cite the `cansim` package in your work:
175175

176-
von Bergmann, J., Dmitry Shkolnik (2022). cansim: functions and convenience tools for accessing Statistics Canada data tables. v0.3.13.
176+
von Bergmann, J., Dmitry Shkolnik (2022). cansim: functions and convenience tools for accessing Statistics Canada data tables. v0.3.14.
177177

178178
A BibTeX entry for LaTeX users is
179179
```
180180
@Manual{cansim,
181181
author = {Jens {von Bergmann} and Dmitry Shkolnik},
182182
title = {cansim: functions and convenience tools for accessing Statistics Canada data tables},
183183
year = {2022},
184-
note = {R package version 0.3.13},
184+
note = {R package version 0.3.14},
185185
url = {https://mountainmath.github.io/cansim/}
186186
}
187187
```

cran-comments.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
## Test environments
2-
* local OS X install, R 4.0.5
3-
* GitHub Action macOS-latest, windows-lastest (3.6), ubuntu-16.04 (devel, release, oldrel), ubuntu-16.04 (3.4, 3.5)
2+
* local OS X install, R 4.2.2
3+
* GitHub Action macOS-latest, windows-latest (3.6), ubuntu-20.04 (devel, release)
44

55
## R CMD check results
66
There were no ERRORs or WARNINGs or NOTEs.
@@ -91,4 +91,9 @@ There were no ERRORs or WARNINGs or NOTEs.
9191
* Speed up access to cached sqlite tables
9292
* Fix problem with `get_cansim_vector_info()`
9393

94+
# cansim 0.3.14
95+
## Minor changes
96+
* Better header parsing to avoid warning messages
97+
* Fix problem with some semi-wide tables
98+
9499

docs/404.html

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/LICENSE-text.html

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/LICENSE.html

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)