diff --git a/.Rbuildignore b/.Rbuildignore index 1966f505..e823106f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -21,3 +21,5 @@ fix_meta.sh ^CRAN-SUBMISSION$ ^\.github$ R-CMD-check-old.yaml +^\.claude$ +^CLAUDE\.md$ diff --git a/DESCRIPTION b/DESCRIPTION index f6fd0b04..cb0e1cbf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: cansim Type: Package Title: Accessing Statistics Canada Data Table and Vectors -Version: 0.4.4 +Version: 0.4.5 Authors@R: c( person("Jens", "von Bergmann", email = "jens@mountainmath.ca", role = c("aut","cre")), person("Dmitry", "Shkolnik", email = "shkolnikd@gmail.com", role = c("aut"))) diff --git a/NEWS.md b/NEWS.md index b312c8b1..176e2b9d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# cansim 0.4.5 +## Minor changes +* better connection error handling + # cansim 0.4.4 ## Minor changes * fix a problem with metadata parsing does not work properly for table names diff --git a/R/cansim.R b/R/cansim.R index fbef3204..3f510bc8 100644 --- a/R/cansim.R +++ b/R/cansim.R @@ -152,9 +152,9 @@ normalize_cansim_values <- function(data, replacement_value="val_norm", normaliz for (field in fields) { if (!is.null(getOption("cansim.debug"))) message(paste0('Converting ',field,' to factors')) tryCatch({ + # get_deduped_column_level_data now returns pre-sorted data, no need for arrange() level_table <- get_deduped_column_level_data(cansimTableNumber = cansimTableNumber, - language=language,column=field) %>% - arrange(as.integer(.data$`...dim`),as.integer(.data$`...id`)) + language=language,column=field) if (!(field %in% names(data))) { geography_column <- ifelse(cleaned_language=="eng","Geography|Geographic name",paste0("G",intToUtf8(0x00E9),"ographie|Nom g",intToUtf8(0x00E9),"ographique")) data_geography_column <- ifelse(language=="eng","GEO",paste0("G",intToUtf8(0x00C9),"O")) @@ -191,8 +191,8 @@ normalize_cansim_values <- function(data, replacement_value="val_norm", normaliz "than with StatCan, or if this problem can't be resolved, please flag this as an issue in the\n", "{cansim} repository at https://github.com/mountainMath/cansim/issues.")) } else { - data <- data %>% - mutate(!!field:=factor(!!as.name(field),levels=level_table$...name)) + # Use base R for factor conversion - faster than dplyr's mutate for this operation + data[[field]] <- factor(data[[field]], levels = level_table$...name) } }, @@ -929,6 +929,10 @@ get_cansim_table_url <- function(cansimTableNumber, language = "en"){ l <- cleaned_ndm_language(language) %>% substr(1,2) url=paste0("https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/",naked_ndm_table_number(cansimTableNumber),"/",l) response <- httr::GET(url) + if (is.null(response)){return(response)} + if (is.null(response$status_code)) { + stop("Problem downloading data.\n",response$error,call.=FALSE) + } if (response$status_code!=200) { stop("Problem downloading data, status code ",response$status_code,"\n",httr::content(response),call.=FALSE) } @@ -976,6 +980,10 @@ get_cansim_changed_tables <- function(start_date,end_date=NULL){ lapply(function(date){ url=paste0("https://www150.statcan.gc.ca/t1/wds/rest/getChangedCubeList/",strftime(date,"%Y-%m-%d")) response <- httr::GET(url) + if (is.null(response)){return(response)} + if (is.null(response$status_code)) { + stop("Problem downloading data.\n",response$error,call.=FALSE) + } if (response$status_code!=200) { stop("Problem downloading data, status code ",response$status_code,"\n",httr::content(response),call.=FALSE) } diff --git a/R/cansim_helpers.R b/R/cansim_helpers.R index 2a433aa9..2f1db012 100644 --- a/R/cansim_helpers.R +++ b/R/cansim_helpers.R @@ -279,6 +279,10 @@ get_cansim_code_set <- function(code_set=c("scalar", "frequency", "symbol", "sta if (refresh | !file.exists(path)) { url='https://www150.statcan.gc.ca/t1/wds/rest/getCodeSets' r<-get_with_timeout_retry(url) + if (is.null(r)||is.null(r$status_code)){ + warning("Problem downloading code sets.") + return(NULL) + } if (r$status_code==200) { content <- httr::content(r) saveRDS(content,path) @@ -434,20 +438,24 @@ get_deduped_column_level_data <- function(cansimTableNumber,language,column) { column = column, language = language) - # full level values from metadata + # full level values from metadata - combine mutates for efficiency level_table <- columns %>% select(...dim=!!as.name(dimension_id_column), ...id=!!as.name(member_id_column), ...name=!!as.name(member_name_column), - ...parent_id=!!as.name(parent_member_id_column)) %>% - mutate(...n=as.integer(.data$...id)) %>% - arrange("...n") %>% - select(-"...n") %>% - mutate(...count=n(),.by=c("...dim","...name")) %>% - mutate(...duplicated=.data$...count>1) %>% - mutate(...original=!.data$...duplicated) %>% - mutate(...original_name=.data$...name) %>% - mutate(...last_parent_id=.data$...parent_id) + ...parent_id=!!as.name(parent_member_id_column)) + + # Sort once using base R for efficiency + level_table <- level_table[order(as.integer(level_table$...id)), ] + + # Compute duplicates in one pass + level_table <- level_table %>% + mutate(...count=n(), + ...duplicated=.data$...count>1, + ...original=.data$...count==1, + ...original_name=.data$...name, + ...last_parent_id=.data$...parent_id, + .by=c("...dim","...name")) fixed_level_table <- NULL # don't try to dedup census geographies, too messy @@ -459,27 +467,26 @@ get_deduped_column_level_data <- function(cansimTableNumber,language,column) { filter(.data$...dim!="1") } - # try to dedup + # try to dedup - only if there are duplicates max_run <- 30 while (sum(level_table$...duplicated)>0 && max_run>0) { # deals with 36-10-0580 max_run <- max_run - 1 + # Use join-based approach for deduplication as it handles dynamic parent chains efficiently level_table <- level_table %>% left_join(level_table %>% select("...id","...dim",...parent_name="...original_name",...new_parent_id="...last_parent_id"), by=c("...last_parent_id"="...id","...dim"="...dim")) %>% mutate(...name=case_when(.data$...duplicated & is.na(.data$...parent_name) ~ paste0(.data$...name," [",.data$...id,"]"), .data$...duplicated & !is.na(.data$...parent_name) ~ paste0(.data$...name," ==> ",.data$...parent_name), - TRUE ~ .data$...name)) %>% - mutate(...last_parent_id=ifelse(.data$...duplicated, - .data$...new_parent_id, - .data$...last_parent_id)) %>% - mutate(...count=n(),.by=c("...dim","...name")) %>% - mutate(...duplicated=.data$...count>1) %>% + TRUE ~ .data$...name), + ...last_parent_id=ifelse(.data$...duplicated, .data$...new_parent_id, .data$...last_parent_id)) %>% + mutate(...count=n(), ...duplicated=.data$...count>1, .by=c("...dim","...name")) %>% select(-any_of(c("...parent_name","...new_parent_id"))) - } - bind_rows(fixed_level_table,level_table) %>% - arrange("...dim") %>% + result <- bind_rows(fixed_level_table,level_table) + # Sort by dim and id for final output + result <- result[order(as.integer(result$...dim), as.integer(result$...id)), ] + result %>% select("...dim","...id","...name","...original","...original_name") } diff --git a/R/cansim_tables_list.R b/R/cansim_tables_list.R index 9c184a82..c0b5df3c 100644 --- a/R/cansim_tables_list.R +++ b/R/cansim_tables_list.R @@ -89,7 +89,11 @@ list_cansim_cubes <- function(lite=FALSE,refresh=FALSE,quiet=FALSE){ if (!quiet) message("Retrieving cube information from StatCan servers...") url=ifelse(lite,"https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite","https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesList") r<-get_with_timeout_retry(url,retry=0,warn_only=TRUE) - if (r$status_code==200) { + if (is.null(r)||is.null(r$status_code)){ + warning("Could not retrieve cube list from StatCan servers.") + return(NULL) + } + if (!is.null(r$status_code) && r$status_code==200) { content <- httr::content(r) header <- content[[1]] %>% @@ -173,7 +177,11 @@ list_cansim_cubes <- function(lite=FALSE,refresh=FALSE,quiet=FALSE){ #' #' @export search_cansim_cubes <- function(search_term, refresh=FALSE){ - list_cansim_cubes(refresh = refresh) %>% + cube_list <- list_cansim_cubes(refresh = refresh) + if (is.null(cube_list)) { + stop("Could not retrieve cube list from StatCan servers.",call.=FALSE) + } + cube_list %>% filter(grepl(search_term,.data$cubeTitleEn,ignore.case = TRUE) | grepl(search_term,.data$cubeTitleFr,ignore.case = TRUE) | grepl(search_term,.data$surveyEn,ignore.case = TRUE) | diff --git a/R/cansim_vectors.R b/R/cansim_vectors.R index 4945d21c..eb2a534b 100644 --- a/R/cansim_vectors.R +++ b/R/cansim_vectors.R @@ -207,6 +207,9 @@ get_cansim_vector<-function(vectors, start_time = as.Date("1800-01-01"), end_tim timeout = timeout) } if (is.null(response)) return(response) + if (is.null(response$status_code)) { + stop("Problem downloading data.\n",response$error,call.=FALSE) + } if (response$status_code!=200) { stop("Problem downloading data, status code ",response$status_code,"\n",httr::content(response),call.=FALSE) } @@ -317,6 +320,9 @@ get_cansim_vector_for_latest_periods<-function(vectors, periods=NULL, message(paste0("Accessing CANSIM NDM vectors from Statistics Canada",addition)) response <- post_with_timeout_retry(url, body=vectors_string, timeout = timeout) if (is.null(response)) return(response) + if (is.null(response$status_code)) { + stop("Problem downloading data.\n",response$error,call.=FALSE) + } if (response$status_code!=200) { stop("Problem downloading data, status code ",response$status_code,"\n",httr::content(response),call.=FALSE) } @@ -411,7 +417,7 @@ get_cansim_data_for_table_coord_periods<-function(tableCoordinates, periods=NULL if ("list" %in% class(tableCoordinates)) { tableCoordinates <- tibble::enframe(tableCoordinates) %>% setNames(c("cansimTableNumber","COORDINATE")) %>% - tidyr::unnest_longer(.data$COORDINATE) + tidyr::unnest_longer("COORDINATE") } tableCoordinates <- tableCoordinates %>% mutate(cansimTableNumber=naked_ndm_table_number(.data$cansimTableNumber)) %>% @@ -457,6 +463,10 @@ get_cansim_data_for_table_coord_periods<-function(tableCoordinates, periods=NULL } message(paste0("Accessing CANSIM NDM coordinates from Statistics Canada",addition)) response <- post_with_timeout_retry(url, body=body_string, timeout = timeout) + if (is.null(response)) {return(response)} + if (is.null(response$status_code)) { + stop("Problem downloading data.\n",response$error,call.=FALSE) + } if (response$status_code!=200) { stop("Problem downloading data, status code ",response$status_code,"\n",httr::content(response),call.=FALSE) } @@ -564,6 +574,10 @@ get_cansim_vector_info <- function(vectors){ url="https://www150.statcan.gc.ca/t1/wds/rest/getSeriesInfoFromVector" vectors_string=paste0("[",paste(purrr::map(as.character(vectors),function(x)paste0('{"vectorId":',x,'}')),collapse = ", "),"]") response <- post_with_timeout_retry(url, body=vectors_string) + if (is.null(response)){return(response)} + if (is.null(response$status_code)) { + stop("Problem downloading data.\n",response$error,call.=FALSE) + } if (response$status_code!=200) { stop("Problem downloading data, status code ",response$status_code,"\n",httr::content(response),call.=FALSE) } diff --git a/R/hex_sticker.R b/R/hex_sticker.R index ac716383..5c554008 100644 --- a/R/hex_sticker.R +++ b/R/hex_sticker.R @@ -1,14 +1,13 @@ #' Internal function to generate hex sticker #' @keywords internal generate_cansim_hex_sticker <- function (){ - income_data <- cansim::get_cansim("11-10-0239") %>% - cansim::normalize_cansim_values() income_age_groups <- c("16 to 24 years", "25 to 34 years" , "35 to 44 years" , "45 to 54 years" ,"55 to 64 years", "65 years and over") - income_plot_data <- income_data %>% + income_plot_data <- cansim::get_cansim_connection("11-10-0239") %>% dplyr::filter(Sex=="Both sexes", Statistics=="Median income (excluding zeros)", `Income source`=="Total income", `Age group` %in% income_age_groups) %>% + cansim::collect_and_normalize() %>% dplyr::mutate(`Age group`=factor(`Age group`,levels=income_age_groups)) %>% dplyr::group_by(GEO,`Age group`) %>% dplyr::left_join(dplyr::filter(.,Date==min(Date)) %>% @@ -19,7 +18,31 @@ generate_cansim_hex_sticker <- function (){ pd <- income_plot_data %>% dplyr::filter(GEO=="Canada") ed <- pd %>% dplyr::filter(Date==max(Date)) - ca_data <- cancensus::get_census("CA16",regions=list(C="01"),geo_format='sf') %>% sf::st_transform(102002) + crs <- 'PROJCS["Canada_Lambert_Conformal_Conic", + GEOGCS["NAD83", + DATUM["North_American_Datum_1983", + SPHEROID["GRS 1980",6378137,298.257222101, + AUTHORITY["EPSG","7019"]], + AUTHORITY["EPSG","6269"]], + PRIMEM["Greenwich",0, + AUTHORITY["EPSG","8901"]], + UNIT["degree",0.0174532925199433, + AUTHORITY["EPSG","9122"]], + AUTHORITY["EPSG","4269"]], + PROJECTION["Lambert_Conformal_Conic_2SP"], + PARAMETER["latitude_of_origin",40], + PARAMETER["central_meridian",-96], + PARAMETER["standard_parallel_1",50], + PARAMETER["standard_parallel_2",70], + PARAMETER["false_easting",0], + PARAMETER["false_northing",0], + UNIT["metre",1, + AUTHORITY["EPSG","9001"]], + AXIS["Easting",EAST], + AXIS["Northing",NORTH], + AUTHORITY["ESRI","102002"]]' + + ca_data <- cancensus::get_census("CA16",regions=list(C="01"),geo_format='sf') %>% sf::st_transform(crs) q <- ggplot2::ggplot(ca_data) + ggplot2::geom_sf(fill="grey20",size=0.01) + ggplot2::theme_void() + @@ -27,7 +50,7 @@ generate_cansim_hex_sticker <- function (){ bbox=sf::st_bbox(ca_data) p<-ggplot2::ggplot(pd,ggplot2::aes(x=Date,y=VALUE,color=`Age group`)) + ggplot2::geom_line() + - ggplot2::scale_color_brewer(palette="Dark2",guide=FALSE) + + ggplot2::scale_color_brewer(palette="Dark2",guide='none') + ggplot2::labs(x="",y="") + ggplot2::theme_void() + hexSticker::theme_transparent() @@ -45,6 +68,16 @@ generate_cansim_hex_sticker <- function (){ p_color="white", filename=here::here("images/cansim-sticker.png")) + if (FALSE) { + hexSticker::sticker(pp, package="CanViz", + p_size=8, p_y=1.5, + s_x=1, s_y=0.78, s_width=1.5, s_height=1.5, + h_color="#FF0000", + h_fill= "grey40", + p_color="white", + filename=here::here("~/Downloads/canviz-sticker.svg")) + } + } #' Internal function to update table list diff --git a/README.md b/README.md index 0e73634b..3673b670 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,10 @@ [![CRAN status](https://www.r-pkg.org/badges/version/cansim)](https://CRAN.R-project.org/package=cansim) [![CRAN_Downloads_Badge](https://cranlogs.r-pkg.org/badges/cansim)](https://cranlogs.r-pkg.org/badges/cansim) [![R-CMD-check](https://github.com/mountainMath/cansim/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mountainMath/cansim/actions/workflows/R-CMD-check.yaml) -[![DOI](https://img.shields.io/badge/doi-10.32614/CRAN.package.cansim-#d2b24a.svg)](https://doi.org/10.32614/CRAN.package.cansim) +[![DOI](https://img.shields.io/badge/DOI-10.32614/CRAN.package.cansim-d2b24a.svg)](https://doi.org/10.32614/CRAN.package.cansim) -cansim logo + An R package to retrieve and work with public Statistics Canada data tables. diff --git a/cran-comments.md b/cran-comments.md index 56481467..6b59b4a9 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -151,3 +151,8 @@ There were no ERRORs or WARNINGs or NOTEs. * fix a problem with metadata parsing does not work properly for table names * make documentations more consistent wrt default langauge names * add convenience functions for setting cache paths for data accessed via get_cansim_connection + +# cansim 0.4.5 +## Minor changes +* better connection error handling + diff --git a/docs/404.html b/docs/404.html index c73f2820..d0ec05d7 100644 --- a/docs/404.html +++ b/docs/404.html @@ -38,7 +38,7 @@ cansim - 0.4.4 + 0.4.5