From 3039d03afe7497f861b87e5aedc5c0bd6e5eb6fc Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 00:55:49 -0500 Subject: [PATCH 01/16] Create col_anomaly_check.R --- R/col_anomaly_check.R | 94 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 R/col_anomaly_check.R diff --git a/R/col_anomaly_check.R b/R/col_anomaly_check.R new file mode 100644 index 000000000..27f476375 --- /dev/null +++ b/R/col_anomaly_check.R @@ -0,0 +1,94 @@ +# +# _ _ _ _ _ +# (_) | | | | | | | | +# _ __ ___ _ _ __ | |_ | |__ | | __ _ _ __ | | __ +# | '_ \ / _ \ | || '_ \ | __|| '_ \ | | / _` || '_ \ | |/ / +# | |_) || (_) || || | | || |_ | |_) || || (_| || | | || < +# | .__/ \___/ |_||_| |_| \__||_.__/ |_| \__,_||_| |_||_|\_\ +# | | +# |_| +# +# This file is part of the 'rich-iannone/pointblank' package. +# +# (c) Richard Iannone +# +# For full copyright and license information, please look at +# https://rich-iannone.github.io/pointblank/LICENSE.html +# + +#' Do columns in the table contain anomalous values? +#' +#' @name col_anomaly_check +NULL + +#' @rdname col_anomaly_check +#' @import rlang +#' @export +col_anomaly_check <- function(x, + x_column, + y_column, + preconditions = NULL, + actions = NULL, + step_id = NULL, + label = NULL, + brief = NULL, + active = TRUE) { + + x_column <- rlang::enquo(x_column) + y_column <- rlang::enquo(y_column) + + # Resolve the columns based on the expression + x_column <- resolve_columns(x = x, var_expr = x_column, preconditions) + y_column <- resolve_columns(x = x, var_expr = y_column, preconditions) + + if (is_a_table_object(x)) { + + secret_agent <- + create_agent(x, label = "::QUIET::") %>% + col_anomaly_check( + x_column = x_column, + y_column = y_column, + preconditions = preconditions, + label = label, + brief = brief, + actions = prime_actions(actions), + active = active + ) %>% + interrogate() + + return(x) + } + + agent <- x + + if (is.null(brief)) { + + brief <- + create_autobrief( + agent = agent, + assertion_type = "col_anomaly_check" + ) + } + + # Normalize any provided `step_id` value(s) + step_id <- normalize_step_id(step_id, columns, agent) + + # Check `step_id` value(s) against all other `step_id` + # values in earlier validation steps + check_step_id_duplicates(step_id, agent) + + # Add a validation step + agent <- + create_validation_step( + agent = agent, + assertion_type = "col_anomaly_check", + column = x_column, + values = y_column, + preconditions = preconditions, + actions = covert_actions(actions, agent), + step_id = step_id, + label = label, + brief = brief, + active = active + ) +} From e28385ec21b031781bf7515a5af2e7dc9e612c6c Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 00:55:54 -0500 Subject: [PATCH 02/16] Create col_anomaly_check.Rd --- man/col_anomaly_check.Rd | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 man/col_anomaly_check.Rd diff --git a/man/col_anomaly_check.Rd b/man/col_anomaly_check.Rd new file mode 100644 index 000000000..b1f8a78da --- /dev/null +++ b/man/col_anomaly_check.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/col_anomaly_check.R +\name{col_anomaly_check} +\alias{col_anomaly_check} +\title{Do columns in the table contain anomalous values?} +\usage{ +col_anomaly_check( + x, + x_column, + y_column, + preconditions = NULL, + actions = NULL, + step_id = NULL, + label = NULL, + brief = NULL, + active = TRUE +) +} +\description{ +Do columns in the table contain anomalous values? +} From 96f0f8b35d0342c0a93797c6bb6ce675bdfa5516 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 00:55:57 -0500 Subject: [PATCH 03/16] Update NAMESPACE --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index 7ba1ab726..e439551dd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(affix_datetime) export(all_passed) export(between) export(case_when) +export(col_anomaly_check) export(col_exists) export(col_is_character) export(col_is_date) From c5977f05f96a96cb9b12ce77746c59c63fa1e22a Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 00:57:45 -0500 Subject: [PATCH 04/16] Update DESCRIPTION --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index dbf41f1d9..b03d4ace1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,8 +41,9 @@ Imports: htmltools (>= 0.4.0), log4r (>= 0.3.2), knitr (>= 1.30), - rlang (>= 0.4.9), magrittr, + mgcv (>= 1.8-33), + rlang (>= 0.4.9), scales (>= 1.1.1), testthat (>= 2.3.2), tibble (>= 3.0.4), From b500637700d57e364dddb4dcdc4d9bddce663106 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:26:13 -0500 Subject: [PATCH 05/16] Add `anomaly_detection_ts()` function --- R/anomaly_detection.R | 419 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 R/anomaly_detection.R diff --git a/R/anomaly_detection.R b/R/anomaly_detection.R new file mode 100644 index 000000000..3da881cc0 --- /dev/null +++ b/R/anomaly_detection.R @@ -0,0 +1,419 @@ +anomaly_detection_ts <- function(data_tbl, + time_col, + value_col) { + + data_tbl <- + data_tbl %>% + dplyr::select(dplyr::all_of(c(time_col, value_col))) %>% + dplyr::rename(time = 1, value = 2) %>% + dplyr::filter(!is.na(time)) %>% + dplyr::filter(!is.na(value)) + + n_rows_data_tbl <- + data_tbl %>% + dplyr::count() %>% + dplyr::pull(n) + + mean <- get_tbl_value_mean(data_tbl) + sd <- get_tbl_value_sd(data_tbl) + + # Create a standardized version of the input data set; this centers the + # data (yielding a `z` value of approximately 0) and scales the data so that + # the standard deviation is 1 + data_tbl_standardized <- + data_tbl %>% + dplyr::select(time, value) %>% + dplyr::mutate(z = (value - mean) / sd) + + # Sample random datapoints from the db table; this is needed for the fit + # model which will be used for detrending the standardized dataset + random_rows <- + c( + 1:5, + sample( + x = seq_len(n_rows_data_tbl), + size = floor(0.05 * n_rows_data_tbl), + replace = FALSE + ), + (n_rows_data_tbl - 5):n_rows_data_tbl + ) %>% + sort() %>% + unique() + + # Collect random rows from the standardized dataset + data_tbl_standardized_sample <- + data_tbl_standardized %>% + dplyr::filter(dplyr::row_number() %in% random_rows) %>% + dplyr::collect() + + # Prepare a data frame suitable for spline generation via + # the `mgcv::gam()` function + data_tbl_gam_df <- + data_tbl_standardized_sample %>% + dplyr::mutate(time = as.integer(time)) %>% + dplyr::rename(x = time, y = z) %>% + dplyr::select(-value) %>% + as.data.frame() + + # Generate a GAM model for the sampled points + gam_model <- mgcv::gam(y ~ s(x), data = data_tbl_gam_df) + + # Create a table with the predicted values + gam_model_tbl <- + dplyr::tibble( + time = data_tbl_standardized_sample$time, + fit = unname(mgcv::predict.gam(gam_model)), + ) + + # # Plot the sampled data points along with the model fit + # ggplot(data_tbl_standardized_sample) + + # geom_point(aes(x = time, y = z)) + + # geom_line(aes(x = time, y = fit), data = gam_model_tbl, color = "blue") + + # Perform an analysis on the standardized data; this focuses only on the + # unchanged `time` variable, giving us columns that indicate how many data + # points are in each time grouping + time_interval_n_analysis <- add_time_granularity_cols(data_tbl_standardized) + + # This provides a time interval summary and should be + # used to detemrine which time granularity to use + time_interval_summary <- + time_interval_n_analysis %>% + dplyr::select(starts_with("n_")) %>% + dplyr::group_by() %>% + dplyr::summarize_all( + .funs = list( + ~ mean(., na.rm = TRUE), + ~ min(., na.rm = TRUE), + ~ max(., na.rm = TRUE) + ) + ) %>% + dplyr::collect() %>% + tidyr::pivot_longer( + cols = dplyr::everything(), + names_prefix = "n_", + names_to = c("timespan", "stat"), + values_to = "values", + names_sep = "_" + ) %>% + tidyr::pivot_wider( + names_from = "stat", + values_from = "values" + ) + + # Automatically select the time granularity with the highest + # average number of data points above 20 per window + time_int_gt_20 <- + time_interval_summary %>% dplyr::filter(mean >= 20) + + if (nrow(time_int_gt_20) < 1) { + stop( + "There aren't enough data points in any of the possible time granularity types.", + call. = FALSE + ) + } + + time_granularity_list <- + time_int_gt_20 %>% + dplyr::arrange(dplyr::desc(mean)) %>% + utils::head(1) %>% + as.list() + + timespan_select <- time_granularity_list$timespan + + # This provides the min and max of `z` + z_interval_n_analysis <- + time_interval_n_analysis %>% + dplyr::group_by() %>% + dplyr::summarize( + min = min(z, na.rm = TRUE), + max = max(z, na.rm = TRUE) + ) + + z_interval_breaks <- + c( + seq(from = 0, to = 1, by = 0.5), + seq(from = 0, to = -1, by = -0.5) + ) %>% + unique() %>% + sort() + + # Simplify the `time_interval_n_analysis` by only keeping key columns + data_tbl <- + time_interval_n_analysis %>% + dplyr::select(1, 2, z, dplyr::all_of(paste0("time_", timespan_select))) + + # Create a local lookup table that contains all of the time groups and + # an empty column with the `model_fit` values; this is important because the + # sampling used to create the spline fit likely won't contain values for all + # time groups, but, we can use this template to account for all available + # time groups and fill in missing values + time_groups <- + data_tbl %>% + dplyr::select(starts_with("time_")) %>% + dplyr::distinct() %>% + dplyr::collect() %>% + dplyr::mutate(model_fit = NA_real_) + + # Create a piecewise representation of the GAM model. This necessary (but + # also not very lossy approximation) makes it possible to join the GAM + # predictions to the DB data + gam_model_piecewise_tbl <- + gam_model_tbl %>% + add_time_granularity_cols() %>% + dplyr::select(1, 2, ends_with(timespan_select)) %>% + dplyr::group_by(time_1d) %>% + dplyr::summarize( + model_fit = mean(fit, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::bind_rows(time_groups) %>% + dplyr::arrange(time_1d) %>% + tidyr::fill(model_fit) %>% + dplyr::distinct() + + data_tbl <- + data_tbl %>% + dplyr::left_join( + gam_model_piecewise_tbl, + by = paste0("time_", timespan_select), + copy = TRUE + ) %>% + dplyr::mutate(z_detrend = model_fit - z) + + encoded_windows <- + data_tbl %>% + dplyr::group_by(time_1d) %>% + dplyr::summarize( + mean = mean(z_detrend, na.rm = TRUE) + ) %>% + dplyr::mutate(code_n_w = dplyr::case_when( + mean < -2.0 ~ 1, + mean < -1.5 ~ 2, + mean < -1.0 ~ 3, + mean < -0.5 ~ 4, + mean < -0.0 ~ 5, + mean < 0.5 ~ 6, + mean < 1.0 ~ 7, + mean < 1.5 ~ 8, + mean < 2.0 ~ 9, + mean < 9999 ~ 10 + )) %>% + dplyr::mutate(code_l_w = dplyr::case_when( + mean < -2.0 ~ "A", + mean < -1.5 ~ "B", + mean < -1.0 ~ "C", + mean < -0.5 ~ "D", + mean < -0.0 ~ "E", + mean < 0.5 ~ "F", + mean < 1.0 ~ "G", + mean < 1.5 ~ "H", + mean < 2.0 ~ "I", + mean < 9999 ~ "J" + )) + + encoded_points <- + data_tbl %>% + dplyr::mutate(code_n_p = dplyr::case_when( + z_detrend < -2.0 ~ 1, + z_detrend < -1.5 ~ 2, + z_detrend < -1.0 ~ 3, + z_detrend < -0.5 ~ 4, + z_detrend < -0.0 ~ 5, + z_detrend < 0.5 ~ 6, + z_detrend < 1.0 ~ 7, + z_detrend < 1.5 ~ 8, + z_detrend < 2.0 ~ 9, + z_detrend < 9999 ~ 10 + )) %>% + dplyr::mutate(code_l_p = dplyr::case_when( + z_detrend < -2.0 ~ "A", + z_detrend < -1.5 ~ "B", + z_detrend < -1.0 ~ "C", + z_detrend < -0.5 ~ "D", + z_detrend < -0.0 ~ "E", + z_detrend < 0.5 ~ "F", + z_detrend < 1.0 ~ "G", + z_detrend < 1.5 ~ "H", + z_detrend < 2.0 ~ "I", + z_detrend < 9999 ~ "J" + )) + + unified_data <- + encoded_points %>% + dplyr::left_join( + encoded_windows, by = "time_1d" + ) %>% + dplyr::mutate(code_n_dist = abs(code_n_p - code_n_w)) %>% + dplyr::mutate( + sd_z_detrend_ul = 2, + sd_z_detrend_ll = -2 + ) %>% + dplyr::mutate(outlier_sd_z = dplyr::case_when( + z_detrend >= sd_z_detrend_ul | z_detrend <= sd_z_detrend_ll ~ TRUE, + TRUE ~ FALSE + )) %>% + dplyr::mutate(outlier_dist = dplyr::case_when( + code_n_dist > 2 ~ TRUE, + TRUE ~ FALSE + )) %>% + dplyr::mutate(outlier = dplyr::if_else( + outlier_sd_z | outlier_dist, TRUE, FALSE + )) %>% + dplyr::mutate(`pb_is_good_` = !outlier) + + + # plot_data <- + # unified_data %>% + # dplyr::select(1, 2, z, z_detrend, sd_z_detrend_ul, sd_z_detrend_ll, outlier) %>% + # dplyr::collect() + # + # # Plot of actual data values with seeded values and marked outliers + # ggplot() + + # geom_point( + # data = plot_data %>% dplyr::filter(outlier), + # aes(x = time, y = value), color = "red" + # ) + + # geom_point( + # data = plot_data %>% dplyr::filter(!outlier), + # aes(x = time, y = value) + # ) + + # scale_x_datetime(date_breaks = "1 month") + # + # # Plot of standardized data values with seeded values and marked outliers + # ggplot() + + # geom_point( + # data = plot_data %>% dplyr::filter(outlier), + # aes(x = time, y = z_detrend), color = "red" + # ) + + # geom_point( + # data = plot_data %>% dplyr::filter(!outlier), + # aes(x = time, y = z_detrend) + # ) + + # geom_hline(yintercept = 1, color = "gray") + + # geom_hline(yintercept = -1, color = "gray") + + # geom_line(data = plot_data, aes(x = time, y = sd_z_detrend_ul), color = "red") + + # geom_line(data = plot_data, aes(x = time, y = sd_z_detrend_ll), color = "red") + + # scale_y_continuous(breaks = seq(-5.0, 5.0, 0.5)) + + # scale_x_datetime(date_breaks = "1 month") + + + unified_data +} + +get_tbl_value_mean <- function(data_tbl) { + + data_tbl %>% + dplyr::summarize(value_mean = mean(value, na.rm = TRUE)) %>% + dplyr::pull(value_mean) +} + +get_tbl_value_sd <- function(data_tbl) { + + mean <- get_tbl_value_mean(data_tbl = data_tbl) + + variance <- + data_tbl %>% + dplyr::select(value) %>% + dplyr::mutate( + "diff" = (!!mean - value)^2 + ) %>% + dplyr::group_by() %>% + dplyr::summarize( + "var" = mean(diff, na.rm = TRUE) + ) %>% + dplyr::pull(var) + + (abs(variance))^0.5 +} + +add_time_granularity_cols <- function(data_tbl) { + + data_tbl %>% + dplyr::mutate( + time_1d = substr(as.character(time), 1, 10), + time_12h = + dplyr::case_when( + substr(as.character(time), 12, 13) %in% + c("00", "01", "02", "03", "04", "05", "06", + "07", "08", "09", "10", "11") ~ + paste0(substr(as.character(time), 1, 11), "00"), + substr(as.character(time), 12, 13) %in% + c("12", "13", "14", "15", "16", "17", "18", + "19", "20", "21", "22", "23") ~ + paste0(substr(as.character(time), 1, 11), "12") + ), + time_8h = + dplyr::case_when( + substr(as.character(time), 12, 13) %in% + c("00", "01", "02", "03", "04", "05", "06", "07") ~ + paste0(substr(as.character(time), 1, 11), "00"), + substr(as.character(time), 12, 13) %in% + c("08", "09", "10", "11", "12", "13", "14", "15") ~ + paste0(substr(as.character(time), 1, 11), "08"), + substr(as.character(time), 12, 13) %in% + c("16", "17", "18", "19", "20", "21", "22", "23") ~ + paste0(substr(as.character(time), 1, 11), "16") + ), + time_6h = + dplyr::case_when( + substr(as.character(time), 12, 13) %in% + c("00", "01", "02", "03", "04", "05") ~ + paste0(substr(as.character(time), 1, 11), "00"), + substr(as.character(time), 12, 13) %in% + c("06", "07", "08", "09", "10", "11") ~ + paste0(substr(as.character(time), 1, 11), "06"), + substr(as.character(time), 12, 13) %in% + c("12", "13", "14", "15", "16", "17") ~ + paste0(substr(as.character(time), 1, 11), "12"), + substr(as.character(time), 12, 13) %in% + c("18", "19", "20", "21", "22", "23") ~ + paste0(substr(as.character(time), 1, 11), "16") + ), + time_1h = substr(as.character(time), 1, 13), + time_30m = + dplyr::case_when( + substr(as.character(time), 15, 15) %in% c("3", "4", "5") ~ + paste0(substr(as.character(time), 1, 14), "3"), + substr(as.character(time), 15, 15) %in% c("0", "1", "2") ~ + paste0(substr(as.character(time), 1, 14), "0") + ), + time_15m = + dplyr::case_when( + substr(as.character(time), 15, 16) %in% + c("00", "01", "02", "03", "04", "05", "06", "07", + "08", "09", "10", "11", "12", "13", "14") ~ + paste0(substr(as.character(time), 1, 14), "00"), + substr(as.character(time), 15, 16) %in% + c("15", "16", "17", "18", "19", "20", "21", "22", + "23", "24", "25", "26", "27", "28", "29") ~ + paste0(substr(as.character(time), 1, 14), "15"), + substr(as.character(time), 15, 16) %in% + c("30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44") ~ + paste0(substr(as.character(time), 1, 14), "30"), + substr(as.character(time), 15, 16) %in% + c("45", "46", "47", "48", "49", "50", "51", "52", + "53", "54", "55", "56", "57", "58", "59") ~ + paste0(substr(as.character(time), 1, 14), "45") + ), + time_1m = substr(as.character(time), 1, 16) + ) %>% + dplyr::group_by(time_1d) %>% + dplyr::mutate(n_1d = n()) %>% + dplyr::group_by(time_12h) %>% + dplyr::mutate(n_12h = n()) %>% + dplyr::group_by(time_8h) %>% + dplyr::mutate(n_8h = n()) %>% + dplyr::group_by(time_6h) %>% + dplyr::mutate(n_6h = n()) %>% + dplyr::group_by(time_1h) %>% + dplyr::mutate(n_1h = n()) %>% + dplyr::group_by(time_30m) %>% + dplyr::mutate(n_30m = n()) %>% + dplyr::group_by(time_15m) %>% + dplyr::mutate(n_15m = n()) %>% + dplyr::group_by(time_1m) %>% + dplyr::mutate(n_1m = n()) %>% + dplyr::ungroup() +} From cf212fd56c7448c84d13c0b555b3c4ec77cf60d7 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:26:18 -0500 Subject: [PATCH 06/16] Update col_anomaly_check.R --- R/col_anomaly_check.R | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/R/col_anomaly_check.R b/R/col_anomaly_check.R index 27f476375..1a180f213 100644 --- a/R/col_anomaly_check.R +++ b/R/col_anomaly_check.R @@ -18,6 +18,30 @@ #' Do columns in the table contain anomalous values? #' +#' The `col_anomaly_check()` validation function, the +#' `expect_col_anomaly_check()` expectation function, and the +#' `test_col_anomaly_check()` test function all check whether column values +#' contain anomalous values in a time series. Because this family of functions +#' only currently supports a time-series analysis for the detection of +#' anomalies, the `x_column` must contain date-time values and the `y_column` +#' must contain numeric values. The validation function can be used directly on +#' a data table or with an *agent* object (technically, a `ptblank_agent` +#' object) whereas the expectation and test functions can only be used with a +#' data table. The types of data tables that can be used include data frames, +#' tibbles, database tables (`tbl_dbi`), and Spark DataFrames (`tbl_spark`). +#' Each validation step or expectation will operate over the number of test +#' units that is equal to the number of rows in the table (after any +#' `preconditions` have been applied). +#' +#' @inheritParams col_vals_gt +#' @param x_column The column that contains the *x* values. This should be a +#' date-time column because, currently, the `col_anomaly_check()` function +#' only supports anomaly detection as part of a time-series analysis. +#' @param y_column The column that contains the *y* values, which must be +#' numeric. +#' +#' @family validation functions +#' #' @name col_anomaly_check NULL @@ -69,9 +93,9 @@ col_anomaly_check <- function(x, assertion_type = "col_anomaly_check" ) } - + # Normalize any provided `step_id` value(s) - step_id <- normalize_step_id(step_id, columns, agent) + step_id <- normalize_step_id(step_id, columns = "column", agent) # Check `step_id` value(s) against all other `step_id` # values in earlier validation steps From a486558a080f962bed81d8d0e14a05fe89d3ade9 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:26:53 -0500 Subject: [PATCH 07/16] Update interrogate.R --- R/interrogate.R | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/R/interrogate.R b/R/interrogate.R index 45e10a5de..59dae9e12 100644 --- a/R/interrogate.R +++ b/R/interrogate.R @@ -649,6 +649,11 @@ check_table_with_assertion <- function(agent, agent = agent, idx = idx, table = table + ), + "col_anomaly_check" = interrogate_col_anomaly_check( + agent = agent, + idx = idx, + table = table ) ) @@ -1718,6 +1723,74 @@ interrogate_col_schema_match <- function(agent, ) } +interrogate_col_anomaly_check <- function(agent, + idx, + table) { + # Obtain the `x_column` as a symbol + x_column <- get_column_as_sym_at_idx(agent = agent, idx = idx) + + # Convert the symbol to a character vector + x_column <- as.character(x_column) + + # Obtain the `y_column` as a symbol + y_column <- get_values_at_idx(agent = agent, idx = idx) + + # nocov start + + # Create function for validating the `col_anomaly_check()` step function + tbl_col_anomaly_check <- function(table, + x_column, + y_column) { + + # Stop function if `x_column` is not a time column + x_column_value_1 <- + table %>% + dplyr::select({{ x_column }}) %>% + utils::head(1) %>% + dplyr::pull(1) + + if (!inherits(x_column_value_1, "POSIXct")) { + stop( + "The column provided as the `x_column` must contain date-time values.", + call. = FALSE + ) + } + + # Stop function if `y_column` is not numeric + y_column_value_1 <- + table %>% + dplyr::select({{ y_column }}) %>% + utils::head(1) %>% + dplyr::pull(1) + + if (!is.numeric(y_column_value_1)) { + stop( + "The column provided as the `y_column` must contain numeric values.", + call. = FALSE + ) + } + + # Get the augmented table with anomalies values + table_anomalies <- + anomaly_detection_ts( + data_tbl = table, + time_col = x_column, + value_col = y_column + ) + + table_anomalies + } + + # Perform the validation of the table + pointblank_try_catch( + tbl_col_anomaly_check( + table = table, + x_column = x_column, + y_column = y_column + ) + ) +} + # nolint start # Validity checks for the column and value From ba521f3ce0c43160f45fa9975be583ba01c37630 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:26:55 -0500 Subject: [PATCH 08/16] Update steps_and_briefs.R --- R/steps_and_briefs.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/steps_and_briefs.R b/R/steps_and_briefs.R index 4208d31bc..6557b235f 100644 --- a/R/steps_and_briefs.R +++ b/R/steps_and_briefs.R @@ -282,6 +282,12 @@ create_autobrief <- function(agent, autobrief <- finalize_autobrief(expectation_text, precondition_text) } + if (assertion_type == "col_anomaly_check") { + + # TODO: replace with prepared text + autobrief <- "The expectation is that anomalies are not present." + } + if (assertion_type == "conjointly") { values_text <- values_text %>% tidy_gsub("\"", "'") From 87c48698b2ed5be430244461c0694ff242cd33a9 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:26:57 -0500 Subject: [PATCH 09/16] Update zzz.R --- R/zzz.R | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/R/zzz.R b/R/zzz.R index 910fbce09..d43a99e99 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -59,6 +59,8 @@ utils::globalVariables( "assertion_type", "bin_num", "brief", + "code_n_p", + "code_n_w", "color", "column", "columns", @@ -81,6 +83,7 @@ utils::globalVariables( "f_failed", "f_pass", "f_passed", + "fit", "Freq", "frequency", "Frequency", @@ -88,6 +91,7 @@ utils::globalVariables( "item", "label", "::labels::", + "model_fit", "n", "n_fail", "n_pass", @@ -96,6 +100,9 @@ utils::globalVariables( "N_pts", "N_val", "notify", + "outlier", + "outlier_dist", + "outlier_sd_z", ".panel_x", ".panel_y", "pb_is_good_", @@ -117,10 +124,21 @@ utils::globalVariables( "status_color", "step_id", "tbl_checked", + "time", + "time_12h", + "time_15m", + "time_1d", + "time_1h", + "time_1m", + "time_30m", + "time_6h", + "time_8h", "total_pts", "type", "value", "values", + "value_mean", + "var", "Var1", "Var2", "view", @@ -128,7 +146,9 @@ utils::globalVariables( "W_pts", "W_val", "warn", - "x" + "x", + "z", + "z_detrend" ) ) From 31e50f9d8700cbab84653e4ae837eb264cbd5ca3 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:28:46 -0500 Subject: [PATCH 10/16] Update help files using roxygen --- man/col_anomaly_check.Rd | 97 ++++++++++++++++++++++++++++++++++++- man/col_exists.Rd | 1 + man/col_is_character.Rd | 1 + man/col_is_date.Rd | 1 + man/col_is_factor.Rd | 1 + man/col_is_integer.Rd | 1 + man/col_is_logical.Rd | 1 + man/col_is_numeric.Rd | 1 + man/col_is_posix.Rd | 1 + man/col_schema_match.Rd | 1 + man/col_vals_between.Rd | 1 + man/col_vals_decreasing.Rd | 1 + man/col_vals_equal.Rd | 1 + man/col_vals_expr.Rd | 1 + man/col_vals_gt.Rd | 1 + man/col_vals_gte.Rd | 1 + man/col_vals_in_set.Rd | 1 + man/col_vals_increasing.Rd | 1 + man/col_vals_lt.Rd | 1 + man/col_vals_lte.Rd | 1 + man/col_vals_not_between.Rd | 1 + man/col_vals_not_equal.Rd | 1 + man/col_vals_not_in_set.Rd | 1 + man/col_vals_not_null.Rd | 1 + man/col_vals_null.Rd | 1 + man/col_vals_regex.Rd | 1 + man/conjointly.Rd | 1 + man/rows_distinct.Rd | 1 + 28 files changed, 123 insertions(+), 1 deletion(-) diff --git a/man/col_anomaly_check.Rd b/man/col_anomaly_check.Rd index b1f8a78da..55c860422 100644 --- a/man/col_anomaly_check.Rd +++ b/man/col_anomaly_check.Rd @@ -16,6 +16,101 @@ col_anomaly_check( active = TRUE ) } +\arguments{ +\item{x}{A data frame, tibble (\code{tbl_df} or \code{tbl_dbi}), Spark DataFrame +(\code{tbl_spark}), or, an agent object of class \code{ptblank_agent} that is created +with \code{\link[=create_agent]{create_agent()}}.} + +\item{x_column}{The column that contains the \emph{x} values. This should be a +date-time column because, currently, the \code{col_anomaly_check()} function +only supports anomaly detection as part of a time-series analysis.} + +\item{y_column}{The column that contains the \emph{y} values, which must be +numeric.} + +\item{preconditions}{An optional expression for mutating the input table +before proceeding with the validation. This is ideally as a one-sided R +formula using a leading \code{~}. In the formula representation, the \code{.} serves +as the input data table to be transformed (e.g., +\code{~ . \%>\% dplyr::mutate(col = col + 10)}.} + +\item{actions}{A list containing threshold levels so that the validation step +can react accordingly when exceeding the set levels. This is to be created +with the \code{\link[=action_levels]{action_levels()}} helper function.} + +\item{step_id}{One or more optional identifiers for the single or multiple +validation steps generated from calling a validation function. The use of +step IDs serves to distinguish validation steps from each other and provide +an opportunity for supplying a more meaningful label compared to the step +index. By default this is \code{NULL}, and \strong{pointblank} will automatically +generate the step ID value (based on the step index) in this case. One or +more values can be provided, and the exact number of ID values should (1) +match the number of validation steps that the validation function call will +produce (influenced by the number of \code{columns} provided), (2) be an ID +string not used in any previous validation step, and (3) be a vector with +unique values.} + +\item{label}{An optional label for the validation step. This label appears in +the agent report and for the best appearance it should be kept short.} + +\item{brief}{An optional, text-based description for the validation step. If +nothing is provided here then an \emph{autobrief} is generated by the agent, +using the language provided in \code{\link[=create_agent]{create_agent()}}'s \code{lang} argument (which +defaults to \code{"en"} or English). The \emph{autobrief} incorporates details of the +validation step so it's often the preferred option in most cases (where a +\code{label} might be better suited to succinctly describe the validation).} + +\item{active}{A logical value indicating whether the validation step should +be active. If the step function is working with an agent, \code{FALSE} will make +the validation step inactive (still reporting its presence and keeping +indexes for the steps unchanged). If the step function will be operating +directly on data, then any step with \code{active = FALSE} will simply pass the +data through with no validation whatsoever. The default for this is \code{TRUE}.} +} \description{ -Do columns in the table contain anomalous values? +The \code{col_anomaly_check()} validation function, the +\code{expect_col_anomaly_check()} expectation function, and the +\code{test_col_anomaly_check()} test function all check whether column values +contain anomalous values in a time series. Because this family of functions +only currently supports a time-series analysis for the detection of +anomalies, the \code{x_column} must contain date-time values and the \code{y_column} +must contain numeric values. The validation function can be used directly on +a data table or with an \emph{agent} object (technically, a \code{ptblank_agent} +object) whereas the expectation and test functions can only be used with a +data table. The types of data tables that can be used include data frames, +tibbles, database tables (\code{tbl_dbi}), and Spark DataFrames (\code{tbl_spark}). +Each validation step or expectation will operate over the number of test +units that is equal to the number of rows in the table (after any +\code{preconditions} have been applied). +} +\seealso{ +Other validation functions: +\code{\link{col_exists}()}, +\code{\link{col_is_character}()}, +\code{\link{col_is_date}()}, +\code{\link{col_is_factor}()}, +\code{\link{col_is_integer}()}, +\code{\link{col_is_logical}()}, +\code{\link{col_is_numeric}()}, +\code{\link{col_is_posix}()}, +\code{\link{col_schema_match}()}, +\code{\link{col_vals_between}()}, +\code{\link{col_vals_decreasing}()}, +\code{\link{col_vals_equal}()}, +\code{\link{col_vals_expr}()}, +\code{\link{col_vals_gte}()}, +\code{\link{col_vals_gt}()}, +\code{\link{col_vals_in_set}()}, +\code{\link{col_vals_increasing}()}, +\code{\link{col_vals_lte}()}, +\code{\link{col_vals_lt}()}, +\code{\link{col_vals_not_between}()}, +\code{\link{col_vals_not_equal}()}, +\code{\link{col_vals_not_in_set}()}, +\code{\link{col_vals_not_null}()}, +\code{\link{col_vals_null}()}, +\code{\link{col_vals_regex}()}, +\code{\link{conjointly}()}, +\code{\link{rows_distinct}()} } +\concept{validation functions} diff --git a/man/col_exists.Rd b/man/col_exists.Rd index a3505d6e9..224ea651a 100644 --- a/man/col_exists.Rd +++ b/man/col_exists.Rd @@ -189,6 +189,7 @@ tbl \%>\% test_col_exists(vars(a, b)) } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, \code{\link{col_is_factor}()}, diff --git a/man/col_is_character.Rd b/man/col_is_character.Rd index ed79f8eba..2b8b29121 100644 --- a/man/col_is_character.Rd +++ b/man/col_is_character.Rd @@ -184,6 +184,7 @@ tbl \%>\% test_col_is_character(vars(b)) } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_date}()}, \code{\link{col_is_factor}()}, diff --git a/man/col_is_date.Rd b/man/col_is_date.Rd index 836d458ce..08eb434f6 100644 --- a/man/col_is_date.Rd +++ b/man/col_is_date.Rd @@ -188,6 +188,7 @@ small_table \%>\% } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_factor}()}, diff --git a/man/col_is_factor.Rd b/man/col_is_factor.Rd index ac6794856..66335e417 100644 --- a/man/col_is_factor.Rd +++ b/man/col_is_factor.Rd @@ -190,6 +190,7 @@ tbl \%>\% test_col_is_factor(vars(f)) } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_is_integer.Rd b/man/col_is_integer.Rd index e8601ff52..f13e010e3 100644 --- a/man/col_is_integer.Rd +++ b/man/col_is_integer.Rd @@ -185,6 +185,7 @@ tbl \%>\% test_col_is_integer(vars(b)) } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_is_logical.Rd b/man/col_is_logical.Rd index 5507e993c..3be5512b0 100644 --- a/man/col_is_logical.Rd +++ b/man/col_is_logical.Rd @@ -189,6 +189,7 @@ small_table \%>\% } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_is_numeric.Rd b/man/col_is_numeric.Rd index 0bc32e5c1..a98efd417 100644 --- a/man/col_is_numeric.Rd +++ b/man/col_is_numeric.Rd @@ -189,6 +189,7 @@ small_table \%>\% } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_is_posix.Rd b/man/col_is_posix.Rd index dd02e487d..4aee212a8 100644 --- a/man/col_is_posix.Rd +++ b/man/col_is_posix.Rd @@ -191,6 +191,7 @@ small_table \%>\% } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_schema_match.Rd b/man/col_schema_match.Rd index 2b11aca2a..3c12d679b 100644 --- a/man/col_schema_match.Rd +++ b/man/col_schema_match.Rd @@ -252,6 +252,7 @@ tbl \%>\% test_col_schema_match(schema_obj) } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_between.Rd b/man/col_vals_between.Rd index 863368b9e..a79f6f6d7 100644 --- a/man/col_vals_between.Rd +++ b/man/col_vals_between.Rd @@ -295,6 +295,7 @@ small_table \%>\% The analogue to this function: \code{\link[=col_vals_not_between]{col_vals_not_between()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_decreasing.Rd b/man/col_vals_decreasing.Rd index dd8c3bca9..7b9ff6c02 100644 --- a/man/col_vals_decreasing.Rd +++ b/man/col_vals_decreasing.Rd @@ -181,6 +181,7 @@ The analogous function that moves in the opposite direction: \code{\link[=col_vals_increasing]{col_vals_increasing()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_equal.Rd b/man/col_vals_equal.Rd index 2074811d8..840a7a5a3 100644 --- a/man/col_vals_equal.Rd +++ b/man/col_vals_equal.Rd @@ -245,6 +245,7 @@ test_col_vals_equal(tbl, vars(a), 5) The analogue to this function: \code{\link[=col_vals_not_equal]{col_vals_not_equal()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_expr.Rd b/man/col_vals_expr.Rd index e06648683..fe9d27c38 100644 --- a/man/col_vals_expr.Rd +++ b/man/col_vals_expr.Rd @@ -232,6 +232,7 @@ nicely within \code{col_vals_expr()} and its variants: \code{\link[rlang:nse-def \code{\link[dplyr:between]{dplyr::between()}}, and \code{\link[dplyr:case_when]{dplyr::case_when()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_gt.Rd b/man/col_vals_gt.Rd index 7499bcd75..051e87fae 100644 --- a/man/col_vals_gt.Rd +++ b/man/col_vals_gt.Rd @@ -246,6 +246,7 @@ test_col_vals_gt(tbl, vars(a), 4) The analogous function with a left-closed bound: \code{\link[=col_vals_gte]{col_vals_gte()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_gte.Rd b/man/col_vals_gte.Rd index c6705d860..c66767063 100644 --- a/man/col_vals_gte.Rd +++ b/man/col_vals_gte.Rd @@ -247,6 +247,7 @@ test_col_vals_gte(tbl, vars(a), 5) The analogous function with a left-open bound: \code{\link[=col_vals_gt]{col_vals_gt()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_in_set.Rd b/man/col_vals_in_set.Rd index 6a5d9bdc5..331b9d742 100644 --- a/man/col_vals_in_set.Rd +++ b/man/col_vals_in_set.Rd @@ -226,6 +226,7 @@ small_table \%>\% The analogue to this function: \code{\link[=col_vals_not_in_set]{col_vals_not_in_set()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_increasing.Rd b/man/col_vals_increasing.Rd index 7540678b1..ba8dacbdb 100644 --- a/man/col_vals_increasing.Rd +++ b/man/col_vals_increasing.Rd @@ -181,6 +181,7 @@ The analogous function that moves in the opposite direction: \code{\link[=col_vals_decreasing]{col_vals_decreasing()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_lt.Rd b/man/col_vals_lt.Rd index 32b7080b7..42f8b60b4 100644 --- a/man/col_vals_lt.Rd +++ b/man/col_vals_lt.Rd @@ -246,6 +246,7 @@ test_col_vals_lt(tbl, vars(c), 5) The analogous function with a right-closed bound: \code{\link[=col_vals_lte]{col_vals_lte()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_lte.Rd b/man/col_vals_lte.Rd index ed37343f5..6344da5fc 100644 --- a/man/col_vals_lte.Rd +++ b/man/col_vals_lte.Rd @@ -249,6 +249,7 @@ test_col_vals_lte(tbl, vars(c), 4) The analogous function with a right-open bound: \code{\link[=col_vals_lt]{col_vals_lt()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_not_between.Rd b/man/col_vals_not_between.Rd index b96b24019..e72f125d4 100644 --- a/man/col_vals_not_between.Rd +++ b/man/col_vals_not_between.Rd @@ -292,6 +292,7 @@ small_table \%>\% The analogue to this function: \code{\link[=col_vals_between]{col_vals_between()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_not_equal.Rd b/man/col_vals_not_equal.Rd index 2a52827a5..1a0389ab0 100644 --- a/man/col_vals_not_equal.Rd +++ b/man/col_vals_not_equal.Rd @@ -246,6 +246,7 @@ test_col_vals_not_equal(tbl, vars(a), 6) The analogue to this function: \code{\link[=col_vals_equal]{col_vals_equal()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_not_in_set.Rd b/man/col_vals_not_in_set.Rd index 87f64305d..fd76dbffe 100644 --- a/man/col_vals_not_in_set.Rd +++ b/man/col_vals_not_in_set.Rd @@ -233,6 +233,7 @@ small_table \%>\% The analogue to this function: \code{\link[=col_vals_in_set]{col_vals_in_set()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_not_null.Rd b/man/col_vals_not_null.Rd index 71e4d9de9..bd700e875 100644 --- a/man/col_vals_not_null.Rd +++ b/man/col_vals_not_null.Rd @@ -216,6 +216,7 @@ tbl \%>\% test_col_vals_not_null(vars(b)) The analogue to this function: \code{\link[=col_vals_null]{col_vals_null()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_null.Rd b/man/col_vals_null.Rd index 7504353a8..59061fac6 100644 --- a/man/col_vals_null.Rd +++ b/man/col_vals_null.Rd @@ -215,6 +215,7 @@ tbl \%>\% test_col_vals_null(vars(c)) The analogue to this function: \code{\link[=col_vals_not_null]{col_vals_not_null()}}. Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/col_vals_regex.Rd b/man/col_vals_regex.Rd index 35002195d..8ffe52d9e 100644 --- a/man/col_vals_regex.Rd +++ b/man/col_vals_regex.Rd @@ -245,6 +245,7 @@ small_table \%>\% } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/conjointly.Rd b/man/conjointly.Rd index 9690c78b7..f9d34ae79 100644 --- a/man/conjointly.Rd +++ b/man/conjointly.Rd @@ -272,6 +272,7 @@ tbl \%>\% } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, diff --git a/man/rows_distinct.Rd b/man/rows_distinct.Rd index baa9e5648..4606d684a 100644 --- a/man/rows_distinct.Rd +++ b/man/rows_distinct.Rd @@ -177,6 +177,7 @@ all_passed(agent) } \seealso{ Other validation functions: +\code{\link{col_anomaly_check}()}, \code{\link{col_exists}()}, \code{\link{col_is_character}()}, \code{\link{col_is_date}()}, From 5474f3cc1371a066421399d8bd5814f260cd7b3d Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sat, 19 Dec 2020 22:40:03 -0500 Subject: [PATCH 11/16] Remove min version requirement for mgcv --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b03d4ace1..49f3f5818 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,7 +42,7 @@ Imports: log4r (>= 0.3.2), knitr (>= 1.30), magrittr, - mgcv (>= 1.8-33), + mgcv, rlang (>= 0.4.9), scales (>= 1.1.1), testthat (>= 2.3.2), From e80b1fa302b0570796ab7744cd0ff1d13fcc434b Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sun, 20 Dec 2020 21:36:01 -0500 Subject: [PATCH 12/16] Create col_anomaly_check.png --- inst/img/function_icons/col_anomaly_check.png | Bin 0 -> 2948 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 inst/img/function_icons/col_anomaly_check.png diff --git a/inst/img/function_icons/col_anomaly_check.png b/inst/img/function_icons/col_anomaly_check.png new file mode 100644 index 0000000000000000000000000000000000000000..07d01cb265492817f105342ff13b586e05c79794 GIT binary patch literal 2948 zcmV-~3w!j5P)1^@s6$I7^K00004XF*Lt006O% z3;baP0000uWmrjOO-%qQ0000800D<-00aO40096102%-Q00002paK8{000010000( zpaTE|000010000(00000aic-r000X2Nkl-kIH2m*t*cY{R3~KQ;|5?% zwP?{oef;<_6mZ~k4=YxzPze$wFpx6Xju5cRX&|Cy%a%ST2(^dXwrx|fV#NYNzH;!u~<~_wR2|$F84vkjq{mW6`2T_WmJ#4|$k0X;K3rX@H63T)wiV zP^^nXdh@J0d$Px4POr>=8>{)vL{JHro7!}x00M;() zfDEyu;>V9K4wPL;zC4vEQG$*hJxaZL^`dg+%F(@h_hbn$mH;gAAN-E6tugTe`9)qiff$(YSHrXztv(6j{j7*BM;2>u<8v+3}+|ajf5h ztzH7_0N?_Q963_h$Tl>nw+(rA)v#ehU&1zM&pv$kAX1z+Ebq8`_paP;-n=m{>x~*UB9zF4C}Wez z+$7$0`0!zxG-;CSYeOCx2n=UpgWrWRZdBR`1@uDS!U_#2Sh^ zckV29499aFkrCxG!*ON5*#J9b+O%n6yuE(?nmDp0>;a^BK}b=?+O=!NSQ{{4fT-%- zyLS@@afCX&5P|kcr;PLG&(p<=7d_Y3xN&3AG;7wZ@$8!~Uq110_wV0N-@bjLB}YQef##6xtT2_Qnz>S zUWF*uym|8!`#-)6PxF~>fd-8-Zr{F5g9i^LkS?sv6DLkky?XViQ>RYE$npHzDN>}M z*s){N!Gi}q`)10NN%|yDo?PM|ty;A*$ADG*p)O;W?bxw{a^=b; z@3Fi7`uL$khY~`D@87?d$GF#h9=)%x`fszz=g*&o{Mgx2nKNfr3l}a_$Xw|Fp@ueo zu@vTO1$W(FSh;egITmv?Yu3zj=z|9j6nn$!W24RJKf=OQ`e7eT%RDg%5@?D9}YMnFc_r%#{6vdft>C(W8QOAH*t z#dacHXc9Zv`5N(1_zwvEWz3k7x^(Gc4zc$QL#~6qAwRQh*|G@KvvcQ8nS;Ge#XN7ej~Fq+oZFuY$6D3Efdkd_>C+W| z%m_nt$^gz>k--rs#rX?a4APHC+5$O&01&c$`*wvF`$}lDH)(E`*CTZpOa?f~0AF(% zV{qNIYnPEvl{u?ctr}tNej}qtj~+4!gx!U$*qsk>RRZYt*PA_WiJ7!zfj%RG=b>UW0F) zWQ24I@K2EY=e_VK!0|Y)Z)d9)u2Ix9KV`AZ=iL^EZ%^nfR*fU>Cz>|@e8wG_wL>8 zU4(Tp7oy`pieJcBw{D#|sj(0^VZNYEn>LodP_H_0;6=@uGe_)_ckkZmO@7rO5REn9 z3Kj1JU@0~N6&qqcyXb7fbNPpi;lqbp7Jx(v2H8h_w{u&YjyF#)viRfmZxR1~$%efQl9^N~KDb zqKz9j$|A;(A1|SPg-tl8s@SR0yC{xSnR8-( z_BtcXV;R6V2o>mSVMK}`xA(K(>ulnxTJ2b~-y@T*12X3b;y_j6+y#mqQm0N$=gysz zpbykDpDc`^?(Fj-$cwWYpgt!+W=;o2I&pMO%zOR0TitIq@ij^5(xokrahAdt0xX^Q z2I2ayUAs!u%a?m?pBM26u%{bTuU_5Kj4VF<90P#R)2DHu(H(x9O+a$6H`1m}E2Iw> z{!akReKx$rM#zyPhg!dWz2~&hFn#*;5jD%?B73|=N5;Qm6L*4(y4LI0uS?Pz8W5@*X+=D1*RGvw`oL3w zB7BBpMM&2{*deSJT>*#TaN38hTUjFH67HFe{a&&on{doh6)#@gR9Wy#SX%_d4KTNn zD>3{6wm}pdWu65p7?!C&ef#z6Cu^!*ySA*ge*OB&?wvXbA*jA2t*JVUQBQPr+fhN}Q&7Lizifkhl3MVj9U&YU?z(D%rk z!RSGJg57B1!*fnUqTO>x$}qkba+h%7-F}Dza~R$#e#Ze9+pf=zxuV-urwmt_VSF}@ z9U4{36-glgJXJVo$Z>*i-OOgo1kvoql}+5%%2uJAdzd_VvUCGWa1iIOMFchxFfwr1 z9cR9qHf>5AohG=;0A>-n{>^`Y3}^{rUUuEbty{N1e8s3yqvU51p@D*5y8tpC;1*vQ)S;6*!tijNam0YFJ2_%d zuwcPoqeH9)Ho3>;Ho|7GICX$lfQI1Al;kx+84=jGAwbvTX2OICxL9Duj2VPvQcM5? z8UN+YzQf_s4I4JZgnj$mqk)X4-1;B%fMbM1VKEU5WMJwih``kIKuj~zP}=gb^`a)7Y+elFs+$Al{q&vwe-eOn+s uhw{sDRxCR^V(HihiDzjIp&Jfd{qui&oo;vg8tm)<0000 Date: Sun, 20 Dec 2020 21:36:03 -0500 Subject: [PATCH 13/16] Create col_anomaly_check.svg --- inst/img/function_icons/col_anomaly_check.svg | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 inst/img/function_icons/col_anomaly_check.svg diff --git a/inst/img/function_icons/col_anomaly_check.svg b/inst/img/function_icons/col_anomaly_check.svg new file mode 100644 index 000000000..823fa3cab --- /dev/null +++ b/inst/img/function_icons/col_anomaly_check.svg @@ -0,0 +1,51 @@ + + + col_anomaly_check() + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 9d356bc8ac4177e42964866b9a64c75c391632a0 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sun, 20 Dec 2020 21:51:19 -0500 Subject: [PATCH 14/16] Update interrogate.R --- R/interrogate.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/R/interrogate.R b/R/interrogate.R index 59dae9e12..c3b80486a 100644 --- a/R/interrogate.R +++ b/R/interrogate.R @@ -346,6 +346,26 @@ interrogate <- function(agent, ) } + # Add extracts of rows for anomalous values + if (assertion_type == "col_anomaly_check") { + + tbl_checked$value <- + tbl_checked$value %>% + dplyr::select(time, value, pb_is_good_) + + agent <- + add_table_extract( + agent = agent, + idx = i, + tbl_checked = tbl_checked, + extract_failed = extract_failed, + get_first_n = get_first_n, + sample_n = sample_n, + sample_frac = sample_frac, + sample_limit = sample_limit + ) + } + # Get the ending time for the validation step validation_end_time <- Sys.time() From 9d9a30b6dfaa482cf8c5c8f42521c61061975d52 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Sun, 20 Dec 2020 22:52:15 -0500 Subject: [PATCH 15/16] Update interrogate.R --- R/interrogate.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/interrogate.R b/R/interrogate.R index c3b80486a..0326a31a6 100644 --- a/R/interrogate.R +++ b/R/interrogate.R @@ -1801,6 +1801,8 @@ interrogate_col_anomaly_check <- function(agent, table_anomalies } + # nocov end + # Perform the validation of the table pointblank_try_catch( tbl_col_anomaly_check( From 9b87c1500f550a0925b7e6d363d76d713522cb42 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 21 Dec 2020 16:50:15 -0500 Subject: [PATCH 16/16] Update anomaly_detection.R --- R/anomaly_detection.R | 100 ++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/R/anomaly_detection.R b/R/anomaly_detection.R index 3da881cc0..79eb37b43 100644 --- a/R/anomaly_detection.R +++ b/R/anomaly_detection.R @@ -65,11 +65,6 @@ anomaly_detection_ts <- function(data_tbl, fit = unname(mgcv::predict.gam(gam_model)), ) - # # Plot the sampled data points along with the model fit - # ggplot(data_tbl_standardized_sample) + - # geom_point(aes(x = time, y = z)) + - # geom_line(aes(x = time, y = fit), data = gam_model_tbl, color = "blue") - # Perform an analysis on the standardized data; this focuses only on the # unchanged `time` variable, giving us columns that indicate how many data # points are in each time grouping @@ -108,7 +103,8 @@ anomaly_detection_ts <- function(data_tbl, if (nrow(time_int_gt_20) < 1) { stop( - "There aren't enough data points in any of the possible time granularity types.", + "There aren't enough data points in any of the available time ", + "granularity types.", call. = FALSE ) } @@ -242,7 +238,7 @@ anomaly_detection_ts <- function(data_tbl, unified_data <- encoded_points %>% dplyr::left_join( - encoded_windows, by = "time_1d" + encoded_windows, by = paste0("time_", timespan_select) ) %>% dplyr::mutate(code_n_dist = abs(code_n_p - code_n_w)) %>% dplyr::mutate( @@ -262,42 +258,6 @@ anomaly_detection_ts <- function(data_tbl, )) %>% dplyr::mutate(`pb_is_good_` = !outlier) - - # plot_data <- - # unified_data %>% - # dplyr::select(1, 2, z, z_detrend, sd_z_detrend_ul, sd_z_detrend_ll, outlier) %>% - # dplyr::collect() - # - # # Plot of actual data values with seeded values and marked outliers - # ggplot() + - # geom_point( - # data = plot_data %>% dplyr::filter(outlier), - # aes(x = time, y = value), color = "red" - # ) + - # geom_point( - # data = plot_data %>% dplyr::filter(!outlier), - # aes(x = time, y = value) - # ) + - # scale_x_datetime(date_breaks = "1 month") - # - # # Plot of standardized data values with seeded values and marked outliers - # ggplot() + - # geom_point( - # data = plot_data %>% dplyr::filter(outlier), - # aes(x = time, y = z_detrend), color = "red" - # ) + - # geom_point( - # data = plot_data %>% dplyr::filter(!outlier), - # aes(x = time, y = z_detrend) - # ) + - # geom_hline(yintercept = 1, color = "gray") + - # geom_hline(yintercept = -1, color = "gray") + - # geom_line(data = plot_data, aes(x = time, y = sd_z_detrend_ul), color = "red") + - # geom_line(data = plot_data, aes(x = time, y = sd_z_detrend_ll), color = "red") + - # scale_y_continuous(breaks = seq(-5.0, 5.0, 0.5)) + - # scale_x_datetime(date_breaks = "1 month") - - unified_data } @@ -417,3 +377,57 @@ add_time_granularity_cols <- function(data_tbl) { dplyr::mutate(n_1m = n()) %>% dplyr::ungroup() } + +plot_fit_standardized_sample <- function(data_tbl_standardized_sample) { + + # Plot the sampled data points along with the model fit + ggplot(data_tbl_standardized_sample) + + geom_point(aes(x = time, y = z)) + + geom_line(aes(x = time, y = fit), data = gam_model_tbl, color = "blue") +} + +plot_anomalies <- function(unified_data) { + + plot_data <- + unified_data %>% + dplyr::select( + 1, 2, z, z_detrend, + sd_z_detrend_ul, sd_z_detrend_ll, outlier + ) %>% + dplyr::collect() + + # Plot of actual data values with seeded values and marked outliers + ggplot() + + geom_point( + data = plot_data %>% dplyr::filter(outlier), + aes(x = time, y = value), color = "red" + ) + + geom_point( + data = plot_data %>% dplyr::filter(!outlier), + aes(x = time, y = value) + ) + + scale_x_datetime(date_breaks = "1 month") + + # Plot of standardized data values with seeded values and marked outliers + ggplot() + + geom_point( + data = plot_data %>% dplyr::filter(outlier), + aes(x = time, y = z_detrend), color = "red" + ) + + geom_point( + data = plot_data %>% dplyr::filter(!outlier), + aes(x = time, y = z_detrend) + ) + + geom_hline(yintercept = 1, color = "gray") + + geom_hline(yintercept = -1, color = "gray") + + geom_line( + data = plot_data, + aes(x = time, y = sd_z_detrend_ul), color = "red" + ) + + geom_line( + data = plot_data, aes(x = time, y = sd_z_detrend_ll), + color = "red" + ) + + scale_y_continuous(breaks = seq(-5.0, 5.0, 0.5)) + + scale_x_datetime(date_breaks = "1 month") +}