diff --git a/.gitignore b/.gitignore index d0719ceaeb..6f8c50d114 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,7 @@ pointblank/py.typed # Great Docs build directory (ephemeral, do not commit) great-docs/ + +# Other directories +vhs/ +pdf-export/ diff --git a/Makefile b/Makefile index 7e9b75d69c..82fcaefb73 100644 --- a/Makefile +++ b/Makefile @@ -110,10 +110,10 @@ docs-pdf: ## Build PDF version of User Guide (HTML to PDF preserving graphics) uv run python scripts/create_toc_pdf.py docs/user-guide.pdf @echo "PDF available at docs/user-guide.pdf" -docs-llms: ## Generate llms.txt and llms-full.txt files for LLM consumption - @uv run python scripts/generate_llms_txt.py +docs-api-text: ## Regenerate api-docs.txt for DraftValidation/assistant + @uv run python scripts/generate_api_docs.py -docs-full: docs-build docs-llms ## Build docs and generate llms.txt files +docs-full: docs-build ## Build docs install: dist ## install the package to the active Python's site-packages python3 -m pip install --force-reinstall dist/pointblank*.whl diff --git a/README.md b/README.md index 716c140a95..c1ccf63af2 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # A sample dataset # Use DraftValidation to generate a validation plan -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` The output is a complete validation plan with intelligent suggestions based on your data: diff --git a/docs/.gitignore b/docs/.gitignore deleted file mode 100644 index 490f32af35..0000000000 --- a/docs/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -/.quarto/ - -# quartodoc -_site -_sidebar.yml -objects.json -reference diff --git a/docs/_extensions/machow/interlinks/.gitignore b/docs/_extensions/machow/interlinks/.gitignore deleted file mode 100644 index 5a1bf0b4e2..0000000000 --- a/docs/_extensions/machow/interlinks/.gitignore +++ /dev/null @@ -1,3 +0,0 
@@ -*.html -*.pdf -*_files/ diff --git a/docs/_extensions/machow/interlinks/_extension.yml b/docs/_extensions/machow/interlinks/_extension.yml deleted file mode 100644 index c8a812136b..0000000000 --- a/docs/_extensions/machow/interlinks/_extension.yml +++ /dev/null @@ -1,7 +0,0 @@ -title: Interlinks -author: Michael Chow -version: 1.1.0 -quarto-required: ">=1.2.0" -contributes: - filters: - - interlinks.lua diff --git a/docs/_extensions/machow/interlinks/interlinks.lua b/docs/_extensions/machow/interlinks/interlinks.lua deleted file mode 100644 index cc538a20f1..0000000000 --- a/docs/_extensions/machow/interlinks/interlinks.lua +++ /dev/null @@ -1,411 +0,0 @@ -local inventory = {} -- sphinx inventories -local autolink -- set in Meta -local autolink_ignore_token = "qd-no-link" - -local function _debug_log(text, debug) - if debug then - quarto.log.warning(text) - end -end - -local function read_inv_text(filename) - -- read file - local file = io.open(filename, "r") - if file == nil then - return nil - end - local str = file:read("a") - file:close() - - - local project = str:match("# Project: (%S+)") - local version = str:match("# Version: (%S+)") - - local data = { project = project, version = version, items = {} } - - local ptn_data = - "^" .. - "(.-)%s+" .. -- name - "([%S:]-):" .. -- domain - "([%S]+)%s+" .. -- role - "(%-?%d+)%s+" .. -- priority - "(%S*)%s+" .. -- uri - "(.-)\r?$" -- dispname - - - -- Iterate through each line in the file content - for line in str:gmatch("[^\r\n]+") do - if not line:match("^#") then - -- Match each line against the pattern - local name, domain, role, priority, uri, dispName = line:match(ptn_data) - - -- if name is nil, raise an error - if name == nil then - error("Error parsing line: " .. 
line) - end - - data.items[#data.items + 1] = { - name = name, - domain = domain, - role = role, - priority = priority, - uri = uri, - dispName = dispName - } - end - end - return data -end - -local function read_json(filename) - local file = io.open(filename, "r") - if file == nil then - return nil - end - local str = file:read("a") - file:close() - - local decoded = quarto.json.decode(str) - return decoded -end - -local function read_inv_text_or_json(base_name) - local file = io.open(base_name .. ".txt", "r") - if file then - -- TODO: refactors so we don't just close the file immediately - io.close(file) - json = read_inv_text(base_name .. ".txt") - else - json = read_json(base_name .. ".json") - end - - return json -end - --- each inventory has entries: project, version, items -local function lookup(search_object, debug) - local results = {} - for _, inv in ipairs(inventory) do - for _, item in ipairs(inv.items) do - -- e.g. :external+:::`` - if item.inv_name and item.inv_name ~= search_object.inv_name then - goto continue - end - - if item.name ~= search_object.name then - goto continue - end - - if search_object.role and item.role ~= search_object.role then - goto continue - end - - if search_object.domain and item.domain ~= search_object.domain then - goto continue - else - if search_object.domain or item.domain == "py" then - table.insert(results, item) - end - - goto continue - end - - ::continue:: - end - end - - if #results == 1 then - return results[1] - end - if #results > 1 then - _debug_log("Found multiple matches for " .. search_object.name .. ", using the first match.", debug) - return results[1] - end - if #results == 0 then - _debug_log("Found no matches for object:\n", debug) - _debug_log(search_object, debug) - end - - return nil -end - -local function mysplit(inputstr, sep) - if sep == nil then - sep = "%s" - end - local t = {} - for str in string.gmatch(inputstr, "([^" .. sep .. 
"]+)") do - table.insert(t, str) - end - return t -end - -local function normalize_role(role) - if role == "func" then - return "function" - end - return role -end - -local function copy_replace(original, key, new_value) - -- First create a copy of the table - local copy = {} - for k, v in pairs(original) do - copy[k] = v - end - - -- Then replace the specific value - copy[key] = new_value - - return copy -end - -local function contains(list, value) - -- check if list contains a value - for i, v in ipairs(list) do - if v == value then - return true - end - end - return false -end - -local function flatten_alias_list(list) - -- flatten a list of lists into a single list, - -- where each entry has the form {key, subvalue}} - -- e.g. - -- input: {key1 = {subval1, subval2}, key2 = subval3} - -- output: {{key1, subval1}, {key1, subval2}, {key2, subval3}} - local flat = {} - for key, sublist in pairs(list) do - if type(sublist) == "table" then - for _, subvalue in ipairs(sublist) do - table.insert(flat, { key, subvalue }) - end - else - table.insert(flat, { key, sublist }) - end - end - return flat -end - -local function prepend_aliases(flat_aliases) - -- if str up to first period starts with an alias, then - -- replace it with the full name. - -- For example, suppose we have the alias quartodoc -> qd - -- e.g. qd.Auto -> quartodoc.Auto - -- e.g. qda.Auto -> qda.Auto - - local new_inv = {} - new_inv["project"] = "aliases" - new_inv["version"] = "0.0.9999" -- I have not begun to think about version... - new_inv["items"] = {} - - for _, name_pair in pairs(flat_aliases) do - local full = name_pair[1] - local alias = name_pair[2] - for _, inv in ipairs(inventory) do - for _, item in ipairs(inv.items) do - if string.sub(item.name, 1, string.len(full) + 1) == (full .. ".") then - -- replace full .. "." with alias .. "." 
- local prefix - if not alias or pandoc.utils.stringify(alias) == "" then - prefix = "" - else - -- TODO: ensure alias doesn't end with period - prefix = pandoc.utils.stringify(alias) .. "." - end - local new_name = prefix .. string.sub(item.name, string.len(full) + 2) - table.insert(new_inv.items, copy_replace(item, "name", new_name)) - end - end - end - end - table.insert(inventory, new_inv) -end - -local function build_search_object(str, debug) - local starts_with_colon = str:sub(1, 1) == ":" - local search = {} - if starts_with_colon then - local t = mysplit(str, ":") - if #t == 2 then - -- e.g. :py:func:`my_func` - search.role = normalize_role(t[1]) - search.name = t[2]:match("%%60(.*)%%60") - elseif #t == 3 then - -- e.g. :py:func:`my_func` - search.domain = t[1] - search.role = normalize_role(t[2]) - search.name = t[3]:match("%%60(.*)%%60") - elseif #t == 4 then - -- e.g. :ext+inv:py:func:`my_func` - search.external = true - - search.inv_name = t[1]:match("external%+(.*)") - search.domain = t[2] - search.role = normalize_role(t[3]) - search.name = t[4]:match("%%60(.*)%%60") - else - _debug_log("couldn't parse this link: " .. str, debug) - return {} - end - else - search.name = str:match("%%60(.*)%%60") - end - - if search.name == nil then - _debug_log("couldn't parse this link: " .. str, debug) - return {} - end - - if search.name:sub(1, 1) == "~" then - search.shortened = true - search.name = search.name:sub(2, -1) - end - return search -end - -local function report_broken_link(link, search_object, replacement) - -- TODO: how to unescape html elements like [? 
- return pandoc.Code(pandoc.utils.stringify(link.content)) -end - -function Link(link) - -- do not process regular links ---- - if not link.target:match("%%60") then - return link - end - - -- lookup item ---- - local search = build_search_object(link.target) - local item = lookup(search) - - -- determine replacement, used if no link text specified ---- - local original_text = pandoc.utils.stringify(link.content) - local replacement = search.name - if search.shortened then - local t = mysplit(search.name, ".") - replacement = t[#t] - end - - -- set link text ---- - if original_text == "" and replacement ~= nil then - link.content = pandoc.Code(replacement) - end - - -- report broken links ---- - if item == nil then - return report_broken_link(link, search) - end - link.target = item.uri:gsub("%$$", search.name) - - - return link -end - -function Code(code) - if (not autolink) or contains(code.classes, autolink_ignore_token) then - return code - end - - -- allow text for lookup to be simple function call - -- and also support shortened syntax (~~ prefix) - -- e.g. my_func() -> my_func - -- e.g. a.b.call() -> a.b.call - -- e.g. ~~my_func() -> my_func - local text - - -- detect and remove shortening syntax (~~ prefix) - local is_shortened = code.text:sub(1, 2) == "~~" - local is_short_dot = code.text:sub(1, 3) == "~~." - local unprefixed = code.text:gsub("^~~%.?", "") - if unprefixed:match("%(%s*%)") then - text = unprefixed:gsub("%(%s*%)", "") - else - text = unprefixed - end - - - -- return code.attr - local search = build_search_object("%60" .. text .. "%60") - local item = lookup(search) - - -- determine replacement, used if no link text specified ---- - if item == nil then - code.text = unprefixed - return code - end - - -- shorten text if shortening syntax used - if is_shortened then - -- keep text after last period (.) 
- local split = mysplit(unprefixed, ".") - if #split > 0 then - local new_name = split[#split] - if is_short_dot then - -- if shortened with dot, keep the dot - new_name = "." .. new_name - end - code.text = new_name - else - code.text = unprefixed - end - end - - - return pandoc.Link(code, item.uri:gsub("%$$", search.name)) -end - -local function fixup_json(json, prefix, attach) - for _, item in ipairs(json.items) do - item.uri = prefix .. item.uri - end - table.insert(inventory, json) -end - -return { - { - Meta = function(meta) - local json - local prefix - local aliases - - -- set globals from config - if meta.interlinks and meta.interlinks.autolink then - autolink = true - else - autolink = false - end - - local aliases - if meta.interlinks and meta.interlinks.aliases then - aliases = meta.interlinks.aliases - else - aliases = {} - end - - -- process sources - if meta.interlinks and meta.interlinks.sources then - for k, v in pairs(meta.interlinks.sources) do - local base_name = quarto.project.offset .. "/_inv/" .. k .. "_objects" - json = read_inv_text_or_json(base_name) - prefix = pandoc.utils.stringify(v.url) - if json ~= nil then - fixup_json(json, prefix) - end - end - end - json = read_inv_text_or_json(quarto.project.offset .. 
"/objects") - if json ~= nil then - fixup_json(json, "/") - end - - prepend_aliases(flatten_alias_list(aliases)) - end - }, - { - Link = Link, - Code = Code - } -} diff --git a/docs/_quarto.yml b/docs/_quarto.yml deleted file mode 100644 index 3057fb7e89..0000000000 --- a/docs/_quarto.yml +++ /dev/null @@ -1,337 +0,0 @@ -project: - type: website - post-render: scripts/post-render.py - resources: - - "assets/**" - - "user-guide.pdf" - -format: - html: - include-in-header: - - text: | - - - - theme: flatly - css: - - styles.css - - reference/_styles-quartodoc.css - toc: true - grid: - sidebar-width: 270px - body-width: 950px - margin-width: 225px - gutter-width: 1rem - -filters: - - interlinks - -interlinks: - fast: true - autolink: true - aliases: - pointblank: [null, pb] - sources: - numpy: - url: https://numpy.org/doc/stable/ - python: - url: https://docs.python.org/3/ - -website: - title: Pointblank - google-analytics: "G-XSFKYZM9GW" - search: - show-item-context: true - page-navigation: true - bread-crumbs: false - favicon: assets/fav-logo.png - site-url: https://posit-dev.github.io/pointblank/ - repo-url: https://github.com/posit-dev/pointblank - description: "Find out if your data is what you think it is" - navbar: - logo: assets/pointblank_logo_small.svg - title: false - left: - - href: index.qmd - text: Get Started - - href: demos/index.qmd - text: Examples - - href: reference/index.qmd - text: API Reference - - href: blog/index.qmd - text: Pointblog - right: - - icon: discord - href: https://discord.com/invite/YH7CybCNCQ - - icon: github - href: https://github.com/posit-dev/pointblank - sidebar: - id: user-guide - contents: - - section: "Get Started" - contents: - - text: "Welcome to Pointblank" - href: index.qmd - - user-guide/quickstart.qmd - - user-guide/installation.qmd - - section: "Validation Plan" - contents: - - user-guide/validation-overview.qmd - - user-guide/validation-methods.qmd - - user-guide/column-selection-patterns.qmd - - 
user-guide/preprocessing.qmd - - user-guide/segmentation.qmd - - user-guide/thresholds.qmd - - user-guide/actions.qmd - - user-guide/briefs.qmd - - section: "Advanced Validation" - contents: - - user-guide/expressions.qmd - - user-guide/schema-validation.qmd - - user-guide/assertions.qmd - - user-guide/draft-validation.qmd - - section: "YAML" - contents: - - user-guide/yaml-validation-workflows.qmd - - user-guide/yaml-reference.qmd - - section: "Post Interrogation" - contents: - - user-guide/validation-reports.qmd - - user-guide/step-reports.qmd - - user-guide/extracts.qmd - - user-guide/sundering.qmd - - section: "Data Inspection" - contents: - - user-guide/preview.qmd - - user-guide/col-summary-tbl.qmd - - user-guide/missing-vals-tbl.qmd - - section: "Test Data Generation" - contents: - - user-guide/test-data-generation.qmd - - section: "The Pointblank CLI" - contents: - - user-guide/cli-data-inspection.qmd - - user-guide/cli-data-validation.qmd - - user-guide/cli-reference.qmd - - section: "MCP Server" - contents: - - user-guide/mcp-quick-start.qmd - - page-footer: - left: 'Proudly supported by Posit' - right: "© 2024–2026 Posit Software, PBC." - -html-table-processing: none - -quartodoc: - package: pointblank - dir: reference - title: API Reference - style: pkgdown - dynamic: true - render_interlinks: true - renderer: - style: markdown - table_style: description-list - sections: - - title: Validate - desc: > - When performing data validation, you'll need the `Validate` class to get the process started. - It's given the target table and you can optionally provide some metadata and/or failure - thresholds (using the `Thresholds` class or through shorthands for this task). The - `Validate` class has numerous methods for defining validation steps and for obtaining - post-interrogation metrics and data. 
- contents: - - name: Validate - members: [] - - name: Thresholds - - name: Actions - members: [] - - name: FinalActions - - name: Schema - members: [] - - name: DraftValidation - members: [] - - title: Validation Steps - desc: > - Validation steps can be thought of as sequential validations on the target data. We call - `Validate`'s validation methods to build up a validation plan: a collection of steps that, - in the aggregate, provides good validation coverage. - contents: - - name: Validate.col_vals_gt - - name: Validate.col_vals_lt - - name: Validate.col_vals_ge - - name: Validate.col_vals_le - - name: Validate.col_vals_eq - - name: Validate.col_vals_ne - - name: Validate.col_vals_between - - name: Validate.col_vals_outside - - name: Validate.col_vals_in_set - - name: Validate.col_vals_not_in_set - - name: Validate.col_vals_increasing - - name: Validate.col_vals_decreasing - - name: Validate.col_vals_null - - name: Validate.col_vals_not_null - - name: Validate.col_vals_regex - - name: Validate.col_vals_within_spec - - name: Validate.col_vals_expr - - name: Validate.col_sum_gt - - name: Validate.col_sum_lt - - name: Validate.col_sum_ge - - name: Validate.col_sum_le - - name: Validate.col_sum_eq - - name: Validate.col_avg_gt - - name: Validate.col_avg_lt - - name: Validate.col_avg_ge - - name: Validate.col_avg_le - - name: Validate.col_avg_eq - - name: Validate.col_sd_gt - - name: Validate.col_sd_lt - - name: Validate.col_sd_ge - - name: Validate.col_sd_le - - name: Validate.col_sd_eq - - name: Validate.rows_distinct - - name: Validate.rows_complete - - name: Validate.col_exists - - name: Validate.col_pct_null - - name: Validate.data_freshness - - name: Validate.col_schema_match - - name: Validate.row_count_match - - name: Validate.col_count_match - - name: Validate.tbl_match - - name: Validate.conjointly - - name: Validate.specially - - name: Validate.prompt - - title: Column Selection - desc: > - A flexible way to select columns for validation is to use the 
`col()` function along with - column selection helper functions. A combination of `col()` + `starts_with()`, `matches()`, - etc., allows for the selection of multiple target columns (mapping a validation across many - steps). Furthermore, the `col()` function can be used to declare a comparison column (e.g., - for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value - for comparison. - contents: - - name: col - - name: starts_with - - name: ends_with - - name: contains - - name: matches - - name: everything - - name: first_n - - name: last_n - - name: expr_col - - title: Segment Groups - desc: > - Combine multiple values into a single segment using `seg_*()` helper functions. - contents: - - name: seg_group - - title: Interrogation and Reporting - desc: > - The validation plan is put into action when `interrogate()` is called. The workflow for - performing a comprehensive validation is then: (1) `Validate()`, (2) adding validation - steps, (3) `interrogate()`. After interrogation of the data, we can view a validation report - table (by printing the object or using `get_tabular_report()`), extract key metrics, or we - can split the data based on the validation results (with `get_sundered_data()`). 
- contents: - - name: Validate.interrogate - - name: Validate.set_tbl - - name: Validate.get_tabular_report - - name: Validate.get_step_report - - name: Validate.get_json_report - - name: Validate.get_sundered_data - - name: Validate.get_data_extracts - - name: Validate.all_passed - - name: Validate.assert_passing - - name: Validate.assert_below_threshold - - name: Validate.above_threshold - - name: Validate.n - - name: Validate.n_passed - - name: Validate.n_failed - - name: Validate.f_passed - - name: Validate.f_failed - - name: Validate.warning - - name: Validate.error - - name: Validate.critical - - title: Inspection and Assistance - desc: > - The *Inspection and Assistance* group contains functions that are helpful for getting to - grips on a new data table. Use the `DataScan` class to get a quick overview of the data, - `preview()` to see the first and last few rows of a table, `col_summary_tbl()` for a - column-level summary of a table, and `missing_vals_tbl()` to see where there are missing - values in a table. Several datasets included in the package can be accessed via the - `load_dataset()` function. On the assistance side, the `assistant()` function can be used to - get help with Pointblank. - contents: - - name: DataScan - - name: preview - - name: col_summary_tbl - - name: missing_vals_tbl - - name: assistant - - name: load_dataset - - name: get_data_path - - name: connect_to_table - - name: print_database_tables - - title: Table Pre-checks - desc: > - The *Table Pre-checks* group contains helper functions that are designed for use with the - `active=` parameter of validation methods. These callables inspect the target table before - a validation step runs and conditionally skip the step when a precondition is not met - (e.g., a required column is missing or the table does not have enough rows). A descriptive, - locale-aware note is automatically attached to any step that is skipped. 
- contents: - - name: has_columns - - name: has_rows - - title: YAML - desc: > - The *YAML* group contains functions that allow for the use of YAML to orchestrate validation - workflows. The `yaml_interrogate()` function can be used to run a validation workflow from - YAML strings or files. The `validate_yaml()` function checks if the YAML configuration - passes its own validity checks. The `yaml_to_python()` function converts YAML configuration - to equivalent Python code. - contents: - - name: yaml_interrogate - - name: validate_yaml - - name: yaml_to_python - - title: Utility Functions - desc: > - The *Utility Functions* group contains functions that are useful accessing metadata about - the target data. Use `get_column_count()` or `get_row_count()` to get the number of columns - or rows in a table. The `get_action_metadata()` function is useful when building custom - actions since it returns metadata about the validation step that's triggering the action. - Lastly, the `config()` utility lets us set global configuration parameters. - contents: - - name: get_column_count - - name: get_row_count - - name: get_action_metadata - - name: get_validation_summary - - name: write_file - - name: read_file - - name: config - - title: Test Data Generation - desc: > - Generate synthetic test data based on schema definitions. Use `generate_dataset()` to - create data from a `Schema` object. The helper functions define typed fields with - constraints for realistic test data generation. The `profile_fields()` helper creates a - complete person-profile schema (name, email, address, phone, etc.) in a single call. 
- contents: - - name: generate_dataset - - name: int_field - - name: float_field - - name: string_field - - name: bool_field - - name: date_field - - name: datetime_field - - name: time_field - - name: duration_field - - name: profile_fields - - title: Prebuilt Actions - desc: > - The *Prebuilt Actions* group contains a function that can be used to send a Slack - notification when validation steps exceed failure threshold levels or just to provide a - summary of the validation results, including the status, number of steps, passing and - failing steps, table information, and timing details. - contents: - - name: send_slack_notification diff --git a/docs/assets/fav-logo.png b/docs/assets/fav-logo.png deleted file mode 100644 index b6d80dc311..0000000000 Binary files a/docs/assets/fav-logo.png and /dev/null differ diff --git a/docs/assets/pb-info-worldcities-csv.png b/docs/assets/pb-info-worldcities-csv.png deleted file mode 100644 index e52bbea84a..0000000000 Binary files a/docs/assets/pb-info-worldcities-csv.png and /dev/null differ diff --git a/docs/assets/pb-info-worldcities-github-csv.png b/docs/assets/pb-info-worldcities-github-csv.png deleted file mode 100644 index dbe5eec5a6..0000000000 Binary files a/docs/assets/pb-info-worldcities-github-csv.png and /dev/null differ diff --git a/docs/assets/pb-make-template.png b/docs/assets/pb-make-template.png deleted file mode 100644 index 9aef7a8a55..0000000000 Binary files a/docs/assets/pb-make-template.png and /dev/null differ diff --git a/docs/assets/pb-missing-worldcities-csv.png b/docs/assets/pb-missing-worldcities-csv.png deleted file mode 100644 index 9c1c7e5be9..0000000000 Binary files a/docs/assets/pb-missing-worldcities-csv.png and /dev/null differ diff --git a/docs/assets/pb-preview-game_revenue-all-columns.png b/docs/assets/pb-preview-game_revenue-all-columns.png deleted file mode 100644 index b16e40d431..0000000000 Binary files a/docs/assets/pb-preview-game_revenue-all-columns.png and /dev/null differ diff 
--git a/docs/assets/pb-preview-game_revenue-column-names.png b/docs/assets/pb-preview-game_revenue-column-names.png deleted file mode 100644 index 12e7569197..0000000000 Binary files a/docs/assets/pb-preview-game_revenue-column-names.png and /dev/null differ diff --git a/docs/assets/pb-preview-game_revenue-column-range.png b/docs/assets/pb-preview-game_revenue-column-range.png deleted file mode 100644 index 8dd48811d7..0000000000 Binary files a/docs/assets/pb-preview-game_revenue-column-range.png and /dev/null differ diff --git a/docs/assets/pb-preview-worldcities-csv-no-row-numbers.png b/docs/assets/pb-preview-worldcities-csv-no-row-numbers.png deleted file mode 100644 index d31b95d9fe..0000000000 Binary files a/docs/assets/pb-preview-worldcities-csv-no-row-numbers.png and /dev/null differ diff --git a/docs/assets/pb-preview-worldcities-csv.png b/docs/assets/pb-preview-worldcities-csv.png deleted file mode 100644 index c45f6fba01..0000000000 Binary files a/docs/assets/pb-preview-worldcities-csv.png and /dev/null differ diff --git a/docs/assets/pb-run-worldcities_validation-extracts.png b/docs/assets/pb-run-worldcities_validation-extracts.png deleted file mode 100644 index f07b0b2ad3..0000000000 Binary files a/docs/assets/pb-run-worldcities_validation-extracts.png and /dev/null differ diff --git a/docs/assets/pb-run-worldcities_validation-fail-on-warning.png b/docs/assets/pb-run-worldcities_validation-fail-on-warning.png deleted file mode 100644 index d6ba622010..0000000000 Binary files a/docs/assets/pb-run-worldcities_validation-fail-on-warning.png and /dev/null differ diff --git a/docs/assets/pb-run-worldcities_validation-output.png b/docs/assets/pb-run-worldcities_validation-output.png deleted file mode 100644 index 6cdbec2f57..0000000000 Binary files a/docs/assets/pb-run-worldcities_validation-output.png and /dev/null differ diff --git a/docs/assets/pb-run-worldcities_validation.png b/docs/assets/pb-run-worldcities_validation.png deleted file mode 100644 index 
eb32f19262..0000000000 Binary files a/docs/assets/pb-run-worldcities_validation.png and /dev/null differ diff --git a/docs/assets/pb-scan-worldcities-csv.png b/docs/assets/pb-scan-worldcities-csv.png deleted file mode 100644 index 707d0b0b9a..0000000000 Binary files a/docs/assets/pb-scan-worldcities-csv.png and /dev/null differ diff --git a/docs/assets/pb-validate-exit-code.png b/docs/assets/pb-validate-exit-code.png deleted file mode 100644 index bf2c9193bf..0000000000 Binary files a/docs/assets/pb-validate-exit-code.png and /dev/null differ diff --git a/docs/assets/pb-validate-multi-check.png b/docs/assets/pb-validate-multi-check.png deleted file mode 100644 index 26b96d97d8..0000000000 Binary files a/docs/assets/pb-validate-multi-check.png and /dev/null differ diff --git a/docs/assets/pb-validate-rows-complete-worldcities-csv.png b/docs/assets/pb-validate-rows-complete-worldcities-csv.png deleted file mode 100644 index 189c9ae377..0000000000 Binary files a/docs/assets/pb-validate-rows-complete-worldcities-csv.png and /dev/null differ diff --git a/docs/assets/pb-validate-rows-distinct-worldcities-csv.png b/docs/assets/pb-validate-rows-distinct-worldcities-csv.png deleted file mode 100644 index 851a429499..0000000000 Binary files a/docs/assets/pb-validate-rows-distinct-worldcities-csv.png and /dev/null differ diff --git a/docs/assets/pb-validate-show-extract.png b/docs/assets/pb-validate-show-extract.png deleted file mode 100644 index fa15e7a457..0000000000 Binary files a/docs/assets/pb-validate-show-extract.png and /dev/null differ diff --git a/docs/assets/pb-validate-worldcities-gt-0-population.png b/docs/assets/pb-validate-worldcities-gt-0-population.png deleted file mode 100644 index 220ee185b5..0000000000 Binary files a/docs/assets/pb-validate-worldcities-gt-0-population.png and /dev/null differ diff --git a/docs/assets/pb-validate-worldcities-not-null-city_name.png b/docs/assets/pb-validate-worldcities-not-null-city_name.png deleted file mode 100644 index 
5af9cff15e..0000000000 Binary files a/docs/assets/pb-validate-worldcities-not-null-city_name.png and /dev/null differ diff --git a/docs/assets/pb-validate-write-extract.png b/docs/assets/pb-validate-write-extract.png deleted file mode 100644 index 5b4c11508a..0000000000 Binary files a/docs/assets/pb-validate-write-extract.png and /dev/null differ diff --git a/docs/assets/pointblank-data-generation.png b/docs/assets/pointblank-data-generation.png deleted file mode 100644 index aeff48159d..0000000000 Binary files a/docs/assets/pointblank-data-generation.png and /dev/null differ diff --git a/docs/assets/pointblank-draft-validation-report.png b/docs/assets/pointblank-draft-validation-report.png deleted file mode 100644 index c88a3a2e0f..0000000000 Binary files a/docs/assets/pointblank-draft-validation-report.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.de.png b/docs/assets/pointblank-sales-data.de.png deleted file mode 100644 index 88bc5bca85..0000000000 Binary files a/docs/assets/pointblank-sales-data.de.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.es.png b/docs/assets/pointblank-sales-data.es.png deleted file mode 100644 index 4f5114dacd..0000000000 Binary files a/docs/assets/pointblank-sales-data.es.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.fr.png b/docs/assets/pointblank-sales-data.fr.png deleted file mode 100644 index 21b8ae8c37..0000000000 Binary files a/docs/assets/pointblank-sales-data.fr.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.it.png b/docs/assets/pointblank-sales-data.it.png deleted file mode 100644 index 8aa48d7a7d..0000000000 Binary files a/docs/assets/pointblank-sales-data.it.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.ja.png b/docs/assets/pointblank-sales-data.ja.png deleted file mode 100644 index 8f93949a34..0000000000 Binary files a/docs/assets/pointblank-sales-data.ja.png and /dev/null differ diff --git 
a/docs/assets/pointblank-sales-data.ko.png b/docs/assets/pointblank-sales-data.ko.png deleted file mode 100644 index 7246a2009e..0000000000 Binary files a/docs/assets/pointblank-sales-data.ko.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.nl.png b/docs/assets/pointblank-sales-data.nl.png deleted file mode 100644 index ceb9f713e2..0000000000 Binary files a/docs/assets/pointblank-sales-data.nl.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.png b/docs/assets/pointblank-sales-data.png deleted file mode 100644 index e07a465b49..0000000000 Binary files a/docs/assets/pointblank-sales-data.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.pt-BR.png b/docs/assets/pointblank-sales-data.pt-BR.png deleted file mode 100644 index 9f83628c56..0000000000 Binary files a/docs/assets/pointblank-sales-data.pt-BR.png and /dev/null differ diff --git a/docs/assets/pointblank-sales-data.zh-CN.png b/docs/assets/pointblank-sales-data.zh-CN.png deleted file mode 100644 index abb4508c9d..0000000000 Binary files a/docs/assets/pointblank-sales-data.zh-CN.png and /dev/null differ diff --git a/docs/assets/pointblank-step-report.png b/docs/assets/pointblank-step-report.png deleted file mode 100644 index c1695462da..0000000000 Binary files a/docs/assets/pointblank-step-report.png and /dev/null differ diff --git a/docs/assets/pointblank-tabular-report.png b/docs/assets/pointblank-tabular-report.png deleted file mode 100644 index ce6355f346..0000000000 Binary files a/docs/assets/pointblank-tabular-report.png and /dev/null differ diff --git a/docs/assets/pointblank_logo.png b/docs/assets/pointblank_logo.png deleted file mode 100644 index 120dfbdd72..0000000000 Binary files a/docs/assets/pointblank_logo.png and /dev/null differ diff --git a/docs/assets/pointblank_logo.svg b/docs/assets/pointblank_logo.svg deleted file mode 100644 index b5e06e726b..0000000000 --- a/docs/assets/pointblank_logo.svg +++ /dev/null @@ -1,176 +0,0 @@ - - 
- pointblank_logo - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/assets/pointblank_logo_small.svg b/docs/assets/pointblank_logo_small.svg deleted file mode 100644 index 4c8c11f874..0000000000 --- a/docs/assets/pointblank_logo_small.svg +++ /dev/null @@ -1,12 +0,0 @@ - - - pointblank_logo_small - - - - - - - - - \ No newline at end of file diff --git a/docs/assets/posit-logo-black.svg b/docs/assets/posit-logo-black.svg deleted file mode 100644 index b85676deac..0000000000 --- a/docs/assets/posit-logo-black.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/docs/assets/validation-table-diagram.png b/docs/assets/validation-table-diagram.png deleted file mode 100644 index 4e9a8ecd27..0000000000 Binary files a/docs/assets/validation-table-diagram.png and /dev/null differ diff --git a/docs/assets/validation-test-units.png b/docs/assets/validation-test-units.png deleted file mode 100644 index 441c0b5552..0000000000 Binary files a/docs/assets/validation-test-units.png and /dev/null differ diff --git a/docs/assets/vhs/.gitignore b/docs/assets/vhs/.gitignore deleted file mode 100644 index 9a798285cd..0000000000 --- a/docs/assets/vhs/.gitignore +++ /dev/null @@ -1 +0,0 @@ -worldcities.csv diff --git a/docs/assets/vhs/cli-cicd-workflows.gif b/docs/assets/vhs/cli-cicd-workflows.gif deleted file mode 100644 index 1b8848eced..0000000000 Binary files a/docs/assets/vhs/cli-cicd-workflows.gif and /dev/null differ diff --git a/docs/assets/vhs/cli-cicd-workflows.tape b/docs/assets/vhs/cli-cicd-workflows.tape deleted file mode 100644 index 0bf1795f40..0000000000 --- a/docs/assets/vhs/cli-cicd-workflows.tape +++ /dev/null @@ -1,42 +0,0 @@ -Output cli-cicd-workflows.gif - -Set FontSize 16 -Set Width 1200 -Set Height 700 -Set Theme 
"Dracula" - -Type "# Example of a Production CI/CD Data Validation Workflow with the Pointblank CLI" -Enter -Sleep 3s - -Type "" -Enter -Type "# Pipeline Step 1: Checking for duplicates while using exit codes (this fails)" -Enter -Type "pb validate small_table --check rows-distinct --exit-code" -Enter -Sleep 10s - -Type "" -Enter -Type "# Pipeline Step 2: Checking for null values in an important column (this passes)" -Enter -Type "pb validate small_table --check col-vals-not-null --column date_time --exit-code" -Enter -Sleep 10s - -Type "" -Enter -Type "# Pipeline Step 3: Checking for values greater than 2 (this one fails)" -Enter -Type "pb validate small_table --check col-vals-gt --column a --value 2 --exit-code" -Enter -Sleep 10s - -Type "" -Enter -Type "# Make use of --exit-code in GitHub Actions or any other automation platform" -Enter -Type "# Exit codes: 0 = success, 1 = validation failed" -Enter -Sleep 6s diff --git a/docs/assets/vhs/cli-complete-workflow.gif b/docs/assets/vhs/cli-complete-workflow.gif deleted file mode 100644 index e0e27b50cf..0000000000 Binary files a/docs/assets/vhs/cli-complete-workflow.gif and /dev/null differ diff --git a/docs/assets/vhs/cli-complete-workflow.tape b/docs/assets/vhs/cli-complete-workflow.tape deleted file mode 100644 index 42e71990c3..0000000000 --- a/docs/assets/vhs/cli-complete-workflow.tape +++ /dev/null @@ -1,80 +0,0 @@ -Output cli-complete-workflow.gif - -Set FontSize 13 -Set Width 1000 -Set Height 1100 -Set Theme "Dracula" - -Type "# This is a Demo of a Complete Data Quality Workflow Using the Pointblank CLI" -Enter -Sleep 1s - -Type "" -Enter -Type "# Step 1: Quickly preview a dataset" -Enter -Type "pb preview worldcities.csv" -Enter -Sleep 5s - -Type "" -Enter -Type "# Step 2: Quickly check for missing values in the dataset" -Enter -Type "pb missing worldcities.csv" -Enter -Sleep 6s - -Type "" -Enter -Type "# Step 3: Validate that population values are provided for each city (i.e., not null)" -Enter -Type "pb 
validate worldcities.csv --check col-vals-not-null --column population" -Enter -Sleep 8s - -Type "" -Enter -Type "# Step 4: Check for missing city names (and show the problematic rows)" -Enter -Type "pb validate worldcities.csv --check col-vals-not-null --column city_name --show-extract" -Enter -Sleep 8s - -Type "" -Enter -Type "# Step 5: Validate latitude range (latitude: -90 to 90) in two steps" -Enter -Type "pb validate worldcities.csv --check col-vals-ge --column latitude --value -90 --check col-vals-le --column latitude --value 90" -Enter -Sleep 8s - -Type "" -Enter -Type "# Step 6: Create a comprehensive validation script for production use" -Enter -Type "cat validation.py" -Enter -Sleep 8s - -Type "" -Enter -Type "# Step 7: Run that comprehensive validation and get a validation reporting table" -Enter -Type "pb run validation.py" -Enter -Sleep 10s - -Type "" -Enter -Type "# Step 8: Generate an HTML validation report file and show the failing rows" -Enter -Type "pb run validation.py --output-html validation_report.html --show-extract" -Enter -Sleep 8s - -Type "" -Enter -Type "# That was a complete workflow, moving from quick checks to more involved validations" -Enter -Sleep 3s diff --git a/docs/assets/vhs/cli-data-exploration.gif b/docs/assets/vhs/cli-data-exploration.gif deleted file mode 100644 index 4dfc440054..0000000000 Binary files a/docs/assets/vhs/cli-data-exploration.gif and /dev/null differ diff --git a/docs/assets/vhs/cli-data-exploration.tape b/docs/assets/vhs/cli-data-exploration.tape deleted file mode 100644 index 23e037d90f..0000000000 --- a/docs/assets/vhs/cli-data-exploration.tape +++ /dev/null @@ -1,51 +0,0 @@ -# VHS tape for data exploration and profiling -# Shows preview, missing values, and column summaries using real-world city data - -Output cli-data-exploration.gif - -Set FontSize 14 -Set Width 1200 -Set Height 800 -Set Theme "Dracula" - -Type "# Data Exploration & Profiling with the Pointblank CLI" -Enter -Sleep 2s - -Type "" -Enter 
-Type "# 1. Ensure that data can be loaded and see some info about the dataset" -Enter -Type "pb info worldcities.csv" -Enter -Sleep 4s - -Type "" -Enter -Type "# 2. Get a quick data preview to see structure and sample rows" -Enter -Type "pb preview worldcities.csv --head 3 --tail 3" -Enter -Sleep 8s - -Type "" -Enter -Type "# 3. Missing values analysis: identify data gaps" -Enter -Type "pb missing worldcities.csv" -Enter -Sleep 8s - -Type "" -Enter -Type "# 4. Comprehensive data scan: statistical overview" -Enter -Type "pb scan worldcities.csv" -Enter -Sleep 10s - -Type "" -Enter -Type "# That's data profiling in seconds with the Pointblank CLI" -Enter -Sleep 4s diff --git a/docs/assets/vhs/cli-essential-validations.gif b/docs/assets/vhs/cli-essential-validations.gif deleted file mode 100644 index 8c9bdd888b..0000000000 Binary files a/docs/assets/vhs/cli-essential-validations.gif and /dev/null differ diff --git a/docs/assets/vhs/cli-essential-validations.tape b/docs/assets/vhs/cli-essential-validations.tape deleted file mode 100644 index e2f69ae7f1..0000000000 --- a/docs/assets/vhs/cli-essential-validations.tape +++ /dev/null @@ -1,48 +0,0 @@ -Output cli-essential-validations.gif - -Set FontSize 16 -Set Width 1200 -Set Height 700 -Set Theme "Dracula" - -Type "# Essential Data Validation with Pointblank CLI" -Enter -Sleep 4s - -Type "" -Enter -Type "# 1. Get the table dimensions, just to know what you're working with" -Enter -Type "pb info small_table" -Enter -Sleep 8s - -Type "" -Enter -Type "# 2. Check for duplicate rows (the most common data quality issue)" -Enter -Type "pb validate small_table --check rows-distinct" -Enter -Sleep 8s - -Type "" -Enter -Type "# 3. Find missing values in important columns" -Enter -Type "pb validate small_table --check col-vals-not-null --column c" -Enter -Sleep 8s - -Type "" -Enter -Type "# 4. 
When validation fails, see exactly which rows are problematic with --show-extract" -Enter -Type "pb validate small_table --check col-vals-not-null --column c --show-extract" -Enter -Sleep 10s - -Type "" -Enter -Type "# That is just a sampling of the essentials with duplicates, nulls, with data extracts" -Enter -Sleep 3s diff --git a/docs/assets/vhs/cli-getting-started.gif b/docs/assets/vhs/cli-getting-started.gif deleted file mode 100644 index a16dfc3b03..0000000000 Binary files a/docs/assets/vhs/cli-getting-started.gif and /dev/null differ diff --git a/docs/assets/vhs/cli-getting-started.tape b/docs/assets/vhs/cli-getting-started.tape deleted file mode 100644 index 6fda1e8ce4..0000000000 --- a/docs/assets/vhs/cli-getting-started.tape +++ /dev/null @@ -1,48 +0,0 @@ -Output cli-getting-started.gif - -Set FontSize 16 -Set Width 1200 -Set Height 700 -Set Theme "Dracula" - -Type "# Getting Started with the Pointblank CLI" -Enter -Sleep 5s - -Type "" -Enter -Type "# See all available commands for the Pointblank CLI" -Enter -Type "pb --help" -Enter -Sleep 10s - -Type "" -Enter -Type "# There's a number of datasets you can use for testing" -Enter -Type "pb datasets" -Enter -Sleep 8s - -Type "" -Enter -Type "# Learn about all the `pb validate` options" -Enter -Type "pb validate --help" -Enter -Sleep 12s - -Type "" -Enter -Type "# Your first validation: check for duplicates with rows-distinct" -Enter -Type "pb validate small_table --check rows-distinct" -Enter -Sleep 10s - -Type "" -Enter -Type "# You can validate any sort of dataset you want: CSV, Parquet, or a DB table" -Enter -Sleep 5s diff --git a/docs/assets/vhs/cli-using-polars.gif b/docs/assets/vhs/cli-using-polars.gif deleted file mode 100644 index 509c43e28f..0000000000 Binary files a/docs/assets/vhs/cli-using-polars.gif and /dev/null differ diff --git a/docs/assets/vhs/cli-using-polars.tape b/docs/assets/vhs/cli-using-polars.tape deleted file mode 100644 index 6ca94e222f..0000000000 --- 
a/docs/assets/vhs/cli-using-polars.tape +++ /dev/null @@ -1,64 +0,0 @@ -Output cli-using-polars.gif - -Set FontSize 13 -Set Width 1000 -Set Height 1100 -Set Theme "Dracula" - -Type "# Polars Data Manipulation with the Pointblank CLI" -Enter -Sleep 4s - -Type "" -Enter -Type "# 1. Load and explore the worldcities CSV dataset" -Enter -Type `pb pl 'pl.read_csv("worldcities.csv").head(10)'` -Enter -Sleep 8s - -Type "" -Enter -Type "# 2. Use the Polars methods for filtering and selecting columns" -Enter -Type `pb pl 'pl.read_csv("worldcities.csv").select(["city_name", "country", "population"]).filter(pl.col("population") > 1000000)'` -Enter -Sleep 10s - -Type "" -Enter -Type "# 3. Data manipulation -> missing data analysis via --output-format missing" -Enter -Type `pb pl 'pl.read_csv("worldcities.csv").select(["city_name", "country", "population"])' --output-format missing` -Enter -Sleep 10s - -Type "" -Enter -Type "# 4. Behold the power of piping! Filter large cities and validate for duplicates with --pipe | pb validate" -Enter -Type `pb pl 'pl.read_csv("worldcities.csv").filter(pl.col("population") > 5000000)' --pipe | pb validate --check rows-distinct` -Enter -Sleep 10s - -Type "" -Enter -Type "# 5. Do a complex transformation and pipe into a data scan" -Enter -Type `pb pl 'pl.read_csv("worldcities.csv").filter(pl.col("country").is_in(["United States", "China", "India"])).select(["city_name", "latitude", "longitude"])' --pipe | pb scan` -Enter -Sleep 12s - -Type "" -Enter -Type "# 6. Create derived metrics and pipe to a validation: so cool!" -Enter -Type `pb pl 'pl.read_csv("worldcities.csv").with_columns((pl.col("population") / 1000000).alias("pop_millions")).filter(pl.col("pop_millions") > 0.5)' --pipe | pb validate --check col-vals-gt --column pop_millions --value 0` -Enter -Sleep 12s - -Type "" -Enter -Type "# The pb pl command truly enables a powerful new class of data quality pipelines!" 
-Enter -Sleep 5s diff --git a/docs/assets/vhs/validation.py b/docs/assets/vhs/validation.py deleted file mode 100644 index 2cbad15e99..0000000000 --- a/docs/assets/vhs/validation.py +++ /dev/null @@ -1,28 +0,0 @@ -import pointblank as pb - -validation = ( - pb.Validate( - data="worldcities.csv", - thresholds=pb.Thresholds( - warning=1, # 1 failure - error=0.05, # 5% of rows failing - ), - ) - .col_schema_match( - schema=pb.Schema( - columns=[ - ("city_name", "object"), - ("latitude", "float64"), - ("longitude", "float64"), - ("country", "object"), - ("population", "float64"), - ] - ), - ) - .col_vals_not_null(columns="city_name") - .col_vals_not_null(columns="population") - .col_vals_gt(columns="population", value=0, na_pass=True) - .col_vals_between(columns="latitude", left=-90, right=90) - .col_vals_between(columns="longitude", left=-180, right=180) - .interrogate() -) diff --git a/docs/blog/all-about-actions/index.qmd b/docs/blog/all-about-actions/index.qmd deleted file mode 100644 index 2ab5cf5aac..0000000000 --- a/docs/blog/all-about-actions/index.qmd +++ /dev/null @@ -1,434 +0,0 @@ ---- -jupyter: python3 -html-table-processing: none -title: "Level Up Your Data Validation with `Actions` and `FinalActions`" -author: Rich Iannone -date: 2025-05-02 -freeze: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer=False) -``` - -Data validation is only useful if you can respond appropriately when problems arise. That's why -Pointblank's recent `v0.8.0` and `v0.8.1` releases have significantly enhanced our action framework, -allowing you to create sophisticated, automated responses to validation failures. - -In this post, we'll explore how to use: - -1. **Actions** to respond to individual validation failures -2. **FinalActions** to execute code after your entire validation plan completes -3. 
New customization features that make your validation workflows more expressive - -Let's dive into how these features can transform your data validation process from passive reporting -to active response. - -## From Passive Validation to Active Response - -Traditional data validation simply reports problems: "Column X has invalid values." But what if you -want to: - -- send a Slack message when critical errors occur? -- log detailed diagnostics about failing data? -- trigger automatic data cleaning processes? -- generate custom reports for stakeholders? - -This is where Pointblank's action system can help. By pairing thresholds with actions, you can -create automated responses that trigger exactly when needed. - -## Getting Started with Actions - -Actions are executed when validation steps fail to meet certain thresholds. Let's start with a -simple example: - -```{python} -import pointblank as pb - -validation_1 = ( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_gt( - columns="d", - value=1000, - thresholds=pb.Thresholds(warning=1, error=5), - actions=pb.Actions( - warning="⚠️ WARNING: Some values in column 'd' are below the minimum threshold!" - ) - ) - .interrogate() -) - -validation_1 -``` - -In this example: - -1. we're validating that values in column "d" are greater than 1000 -2. we set a warning threshold of 1 (triggers if any values fail) -3. we define an action that prints a warning message when the threshold is exceeded - -Since several values in column `d` are below `1000`, our 'warning' action is triggered and the -message appears above the validation report. - -## The Anatomy of Actions - -The [`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) class is a very -important piece of Pointblank's response system. Actions can be defined in several ways: - -1. **String messages**: simple text output to the console -2. **Callable functions**: custom Python functions that execute when triggered -3. 
**Lists of strings/callables**: multiple actions that execute in sequence - -Actions can be paired with different severity levels: - -- 'warning': for minor issues that need attention -- 'error': for more significant problems -- 'critical': for severe issues that require immediate action - -The `v0.8.0` release added two (very) useful new parameters: - -- `default=`: apply the same action to all threshold levels -- `highest_only=`: only trigger the action for the highest threshold level reached (`True` by -default) - -Let's see how these work in practice: - -```{python} -def log_problem(): - # Simple action that runs when thresholds are exceeded - print("A validation threshold has been exceeded!") - -validation_2 = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions(default=log_problem) # Apply this action to all threshold levels - ) - .col_vals_regex( - columns="player_id", - pattern=r"[A-Z]{12}\d{3}" - ) - .col_vals_gt( - columns="item_revenue", - value=0.10 - ) - .interrogate() -) - -validation_2 -``` - -In this example, we're using a simple function that prints a generic message whenever any threshold -is exceeded. By using the `Actions(default=)` parameter, this same function gets applied to all -threshold levels ('warning', 'error', and 'critical'). This saves you from having to define separate -actions for each level when you want the same behavior for all of them. The `highest_only=` -parameter (`True` by default, so not shown here) is complementary and it ensures that only the -action for the highest threshold level reached will be triggered, preventing multiple notifications -for the same validation failure. - -## Dynamic Messages with Templating - -Actions don't have to be static messages. With Pointblank's templating system, you can create -context-aware notifications that include details about the specific validation failure. 
- -Available placeholders include: - -- `{type}`: the validation step type (e.g., `"col_vals_gt"`) -- `{level}`: the threshold level ('warning', 'error', 'critical') -- `{step}` or `{i}`: the step number in the validation workflow -- `{col}` or `{column}`: the column name being validated -- `{val}` or `{value}`: the comparison value used in the validation -- `{time}`: when the action was executed - -You can also capitalize placeholders (like `{LEVEL}`) to get uppercase text. - -```{python} -action_template = "[{LEVEL}] Step {step}: Values in '{column}' failed validation against {value}." - -validation_3 = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - thresholds=pb.Thresholds(warning=1, error=4, critical=10), - actions=pb.Actions(default=action_template) - ) - .col_vals_lt( - columns="d", - value=3000 - ) - .interrogate() -) - -validation_3 -``` - -This templating approach is a great way to create context-aware notifications that adapt to the -specific validation failures occurring. As the example shows, when values in column `d` fail -validation against the limit of `3000`, the template automatically generates a meaningful error -message showing exactly which step, column, and threshold value was involved. - -## Accessing Metadata in Custom Action Functions - -For more sophisticated actions, you often need access to details about the validation failure. 
The -`get_action_metadata()` function provides this context when called inside an action function: - -```{python} -def send_detailed_alert(): - # Get metadata about the validation failure - metadata = pb.get_action_metadata() - - # Create a customized alert message - print(f""" - VALIDATION FAILURE DETAILS - ------------------------- - Step: {metadata['step']} - Column: {metadata['column']} - Validation type: {metadata['type']} - Severity: {metadata['level']} (level {metadata['level_num']}) - Time: {metadata['time']} - - Explanation: {metadata['failure_text']} - """) - -validation_4 = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - thresholds=pb.Thresholds(critical=1), - actions=pb.Actions(critical=send_detailed_alert) - ) - .col_vals_gt( - columns="d", - value=5000 - ) - .interrogate() -) - -validation_4 -``` - -The metadata dictionary contains essential fields for a given validation step, including the step -number, column name, validation type, severity level, and failure explanation. This gives you -complete flexibility to create highly customized responses based on the specific nature of the -validation failure. - -## Final Actions with `FinalActions` - -While regular [`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) are great -for responding to individual validation steps, sometimes you need to take action based on the -overall validation results. This is where the new `FinalActions` feature from `v0.8.1` comes in. - -Unlike regular [`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) that -trigger during validation, -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html) execute after -all validation steps are complete. -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html) accepts any -number of actions (strings or callables) and executes them in sequence. 
Each argument can be a -string message to display in the console, a callable function, or a list of strings/callables for -multiple actions to execute in sequence. - -The real power of [`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html) -comes from the ability to access comprehensive information about your validation results using -[`get_validation_summary()`](https://posit-dev.github.io/pointblank/reference/get_validation_summary.html). -When called inside a function passed to -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html), this function -provides a dictionary containing counts of passing/failing steps and test units, threshold levels -exceeded, and much more: - -```{python} -def generate_summary(): - # Access comprehensive validation results - summary = pb.get_validation_summary() - - print("\n=== VALIDATION SUMMARY ===") - print(f"Total steps: {summary['n_steps']}") - print(f"Passing steps: {summary['n_passing_steps']}") - print(f"Failing steps: {summary['n_failing_steps']}") - - if summary['highest_severity'] == "critical": - print("\n⚠️ CRITICAL FAILURES DETECTED - immediate action required!") - elif summary['highest_severity'] == "error": - print("\n⚠️ ERRORS DETECTED - review needed") - elif summary['highest_severity'] == "warning": - print("\n⚠️ WARNINGS DETECTED - please investigate") - else: - print("\n✅ All validations passed!") - -validation_5 = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - tbl_name="small_table", - thresholds=pb.Thresholds(warning=1, error=5, critical=10), - final_actions=pb.FinalActions( - "Validation process complete.", # A simple string message - generate_summary # Our function using get_validation_summary() - ) - ) - .col_vals_gt(columns="a", value=1) - .col_vals_lt(columns="d", value=10000) - .interrogate() -) - -validation_5 -``` - -The 
[`get_validation_summary()`](https://posit-dev.github.io/pointblank/reference/get_validation_summary.html) -function is only available within functions passed to -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html). It gives you -access to these key dictionary fields: - -- `tbl_name`: name of the validated table -- `n_steps`: total number of validation steps -- `n_passing_steps`, `n_failing_steps`: count of passing/failing steps -- `n`, `n_passed`, `n_failed`: total test units and their pass/fail counts -- `highest_severity`: the most severe threshold level reached ('warning', 'error', 'critical') -- and many more detailed statistics - -This information allows you to create detailed and specific final actions that can respond -appropriately to the overall validation results. - -## Combining Regular and Final Actions - -You can use both [`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) and -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html) together for -comprehensive control over your validation workflow: - -```{python} -def step_alert(): - metadata = pb.get_action_metadata() - print(f"Step {metadata['step']} failed with {metadata['level']} severity") - - -def final_summary(): - summary = pb.get_validation_summary() - - # Get counts by checking each step's status in the dictionaries - steps = range(1, summary['n_steps'] + 1) - n_critical = sum(1 for step in steps if summary['dict_critical'].get(step, False)) - n_error = sum(1 for step in steps if summary['dict_error'].get(step, False)) - n_warning = sum(1 for step in steps if summary['dict_warning'].get(step, False)) - - print(f"\nValidation complete with:") - print(f"- {n_critical} critical issues") - print(f"- {n_error} errors") - print(f"- {n_warning} warnings") - - -validation_6 = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - thresholds=pb.Thresholds(warning=1, error=5, critical=10), - 
actions=pb.Actions(default=step_alert), - final_actions=pb.FinalActions(final_summary), - ) - .col_vals_gt(columns="a", value=5) - .col_vals_lt(columns="d", value=1000) - .interrogate() -) - -validation_6 -``` - -This approach allows you to log individual step failures during the validation process using -[`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) and generate a -comprehensive report after all validation steps are complete using -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html). Using both -action types gives you fine-grained control over when and how notifications and other actions are -triggered in your validation workflow. - -## Real-World Example: Building an Automated Validation Pipeline - -Let's put everything together in a more realistic example. Imagine you're validating a gaming -revenue dataset and want to: - -1. log detailed information about each failure -2. send a Slack notification if critical failures occur -3. generate a comprehensive report after validation completes - -```{python} -def log_step_failure(): - metadata = pb.get_action_metadata() - print(f"[{metadata['level'].upper()}] Step {metadata['step']}: {metadata['failure_text']}") - -def analyze_results(): - summary = pb.get_validation_summary() - - # Calculate overall pass rate - pass_rate = (summary['n_passing_steps'] / summary['n_steps']) * 100 - - print(f"\n==== VALIDATION RESULTS ====") - print(f"Table: {summary['tbl_name']}") - print(f"Pass rate: {pass_rate:.2f}%") - print(f"Failing steps: {summary['n_failing_steps']} of {summary['n_steps']}") - - # In a real scenario, here you might: - # 1. Save results to a database - # 2. Generate and email an HTML report - # 3. 
Trigger data cleansing workflows - - # Simulate a Slack notification - if summary['highest_severity'] == "critical": - print("\n🚨 [SLACK NOTIFICATION] Critical data quality issues detected!") - print("@data-team Please investigate immediately.") - -# Create our validation workflow with actions -validation_7 = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue"), - tbl_name="game_revenue", - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions(default=log_step_failure, highest_only=True), - final_actions=pb.FinalActions(analyze_results), - brief=True # Add automatically-generated briefs - ) - .col_vals_regex( - columns="player_id", - pattern=r"[A-Z]{12}\d{3}", - brief="Player IDs must follow standard format" # Custom brief text - ) - .col_vals_gt( - columns="item_revenue", - value=0.10 - ) - .col_vals_gt( - columns="session_duration", - value=15 - ) - .interrogate() -) - -validation_7 -``` - -## Wrapping Up: from Passive Validation to Active Data Quality Management - -With [`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) and -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html), Pointblank is -now more of a complete data quality management system. Instead of just detecting problems, you can -now: - -1. respond immediately to validation failures -2. customize responses based on severity level -3. generate comprehensive reports after validation completes -4. integrate with other systems through custom action functions -5. automate workflows based on validation results - -These capabilities transform data validation from a passive reporting activity into an active -component of your data pipeline, helping ensure that data quality issues are detected, reported, and -addressed efficiently. 
- -As we continue to enhance Pointblank, we'd love to hear how you're using -[`Actions`](https://posit-dev.github.io/pointblank/reference/Actions.html) and -[`FinalActions`](https://posit-dev.github.io/pointblank/reference/FinalActions.html) in your -workflows. Share your experiences or suggestions with us on -[Discord](https://discord.gg/YH7CybCNCQ) or file an issue on -[GitHub](https://github.com/posit-dev/pointblank/issues). - -## Learn More - -Explore our documentation to learn more about Pointblank's action capabilities: - -- [Actions documentation](https://posit-dev.github.io/pointblank/reference/Actions.html) -- [FinalActions documentation](https://posit-dev.github.io/pointblank/reference/FinalActions.html) -- [User Guide on Triggering Actions](https://posit-dev.github.io/pointblank/user-guide/actions.html) diff --git a/docs/blog/index.qmd b/docs/blog/index.qmd deleted file mode 100644 index 25318cff8c..0000000000 --- a/docs/blog/index.qmd +++ /dev/null @@ -1,10 +0,0 @@ ---- -listing: - type: table - sort: "date desc" - feed: true - contents: - - "**.qmd" ---- - -![](./pointblank_blog_logo.png) diff --git a/docs/blog/intro-pointblank/index.qmd b/docs/blog/intro-pointblank/index.qmd deleted file mode 100644 index 5af99fa5c1..0000000000 --- a/docs/blog/intro-pointblank/index.qmd +++ /dev/null @@ -1,263 +0,0 @@ ---- -jupyter: python3 -html-table-processing: none -title: "Introducing Pointblank" -author: Rich Iannone -date: 2025-04-04 -freeze: true ---- - -If you have tabular data (and who doesn't?) this is the package for you! I've long been interested -in data quality and so I've spent a lot of time building tooling that makes it possible to perform -data quality checks. And there's so many reasons to care about data quality. If I were to put down -just one good reason for why data quality is worth your time it is because having good data quality -strongly determines the quality of decisions. 
- -Having the ability to distinguish bad data from good data is the first step in solving DQ issues, -and the sustained practice of doing data validation will guard against intrusions of poor-quality -data. Pointblank has been designed to really help here. Though it's a fairly new package it is -currently quite capable. And it's available on PyPI, so you can install it by using: - -```bash -pip install pointblank -``` - -::: {.callout-note} -To run the examples in this post, you'll need to have a DataFrame library installed. Pointblank -works seamlessly with both Polars and Pandas but you'll need to install at least one of them on your -own. We also have a DuckDB example that's running via Ibis (so, you'll have to install Ibis with -the DuckDB backend for that to work). -::: - -## How Pointblank Transforms Your Data Validation Workflow - -What sets Pointblank apart is its intuitive, expressive approach to data validation. Rather than -writing dozens of ad-hoc checks scattered throughout your codebase, Pointblank lets you define a -comprehensive validation plan with just a few lines of code. The fluent API makes your validation -intentions crystal clear, whether you're ensuring numeric values fall within expected ranges, text -fields match specific patterns, or relationships between columns remain consistent. - -But say you find problems. What are you gonna do about it? Well, Pointblank wants to help not -just with finding problems but with understanding them. When validation failures occur, the detailed -reporting capabilities (in the form of beautiful, sharable tables) show you exactly where issues -are. Right down to the specific rows and columns. This transforms data validation from a binary -pass/fail exercise into a super-insightful diagnostic tool. - -![](./step_report.png) - -Here's the best part: Pointblank is designed to work with your existing data stack. 
Whether -you're using Polars, Pandas, DuckDB, or other database systems, Pointblank tries hard to integrate -without forcing you to change your workflow. We also have international spoken language support for -reporting, meaning that validation reports can be localized to your team's preferred language. This -makes data quality accessible to everyone in your organization (like a team sport!). - -![](./pointblank-localized.png) - -Alright! Let's look at a few demonstrations of Pointblank's capabilities for data validation. - -## The Data Validation Workflow - -Let's get right to performing a basic check of a Polars DataFrame. We'll make use of the included -`small_table` dataset. - -```{python} -import pointblank as pb - -small_table = pb.load_dataset(dataset="small_table", tbl_type="polars") - -validation_1 = ( - pb.Validate( - data=small_table, - tbl_name="small_table", - label="Example Validation" - ) - .col_vals_lt(columns="a", value=10) - .col_vals_between(columns="d", left=0, right=5000) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) - .col_vals_regex(columns="b", pattern=r"^[0-9]-[a-z]{3}-[0-9]{3}$") - .interrogate() -) - -validation_1 -``` - -There's a lot to take in here so let's break down the code first! Note these three key pieces: - -- the `Validate(data=...)` argument takes a DataFrame (or database table) that you want to validate -- the methods starting with `col_*` specify validation steps that run on specific columns -- the `interrogate()` method executes the validation plan on the table (it's the finishing step) - -This common pattern is used in a validation workflow, where -[`Validate`](https://posit-dev.github.io/pointblank/reference/Validate.html) and -[`interrogate()`](https://posit-dev.github.io/pointblank/reference/Validate.interrogate.html) -bookend a validation plan generated through calling validation methods. - -Now, onto the result: it's a table!
Naturally, we're using the awesome Great Tables package here in -Pointblank to really give you the goods on how the validation went down. Each row in this reporting -table represents a single validation step (one for each invocation of a `col_vals_*()` validation -method). Generally speaking, the left side of the validation report tables outlines the key -validation rules, and the right side provides the results of each validation step. - -We tried to keep it simple in principle, but a lot of useful information can be packed into this -validation table. Here's a diagram that describes a few of the important parts of the validation -report table: - -![](validation-table-diagram.png){width=100%} - -All of those numbers under the `UNITS`, `PASS`, and `FAIL` columns have to do with test units, a -measure of central importance in Pointblank. Each validation step will execute a type of validation -test on the target table. For example, a -[`col_vals_lt()`](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_lt.html) -validation step can test that each value in a column is less than a specified number. The key -finding that’s reported as a result of this test is the number of test units that pass or fail. This -little diagram explains what those numbers mean: - -![](validation-test-units.png){width=50%, text-align=center} - -Failing test units can be tied to threshold levels, which can provide a better indication of whether -failures should raise some basic awareness or spur you into action. 
Here's a validation workflow -that sets three failure threshold levels that signal the severity of data quality problems: - -```{python} -import pointblank as pb -import polars as pl - -validation_2 = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - tbl_name="game_revenue", - label="Data validation with threshold levels set.", - thresholds=pb.Thresholds(warning=1, error=20, critical=0.10), - ) - .col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 - .col_vals_gt(columns="session_duration", value=5) # STEP 2 - .col_vals_ge(columns="item_revenue", value=0.02) # STEP 3 - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 - .col_vals_in_set( # STEP 5 - columns="acquisition", - set=["google", "facebook", "organic", "crosspromo", "other_campaign"] - ) - .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 - .col_vals_between( # STEP 7 - columns="session_duration", - left=10, right=50, - pre = lambda df: df.select(pl.median("session_duration")) - ) - .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 8 - .row_count_match(count=2000) # STEP 9 - .col_exists(columns="start_day") # STEP 10 - .interrogate() -) - -validation_2 -``` - -This data validation makes use of the many -[validation methods available in the library](https://posit-dev.github.io/pointblank/reference/#validation-steps). -Because thresholds have been set at the `Validate(thresholds=)` parameter, we can now see where -certain validation steps have greater amounts of failures. Any validation steps with green -indicators passed with flying colors, whereas: (1) gray indicates the 'warning' condition was met -(at least one test unit failing), (2) yellow is for the 'error' condition (20 or more test units -failing), and (3) red means 'critical' and that's tripped when 10% of all test units are failing -ones. 
- -Reporting tables are essential to the package and they help communicate what went wrong (or well) in -a validation workflow. Now let's look at some additional reporting that Pointblank can give you to -better understand *where* things might've gone wrong. - -## Reporting for Individual Validation Steps - -The second validation step of the previous data validation showed 18 failing test units. That -translates to 18 spots in a 2,000 row DataFrame where a data quality assertion failed. We often -would like to know exactly what that failing data is; it's usually the next step toward addressing -data quality issues. - -Pointblank offers a method that gives you a tabular report on a specific step: -[`get_step_report()`](https://posit-dev.github.io/pointblank/reference/Validate.get_step_report.html). -The previous tables you've seen (the validation report table) dealt with providing a summary of all -validation steps. In contrast, a focused report on a single step can help to get to the heart of a data -quality issue. Here's how that looks for Step 2: - -```{python} -validation_2.get_step_report(i=2) -``` - -This report provides the 18 rows where the failure occurred. If you scroll the table to the right -you'll see the column that underwent testing (`session_duration`) is highlighted in red. All of -these values are `5.0` or less, which is in violation of the assertion (in the header) that -`session_duration > 5`. - -These types of bespoke reports are useful for finding a needle in a haystack. Another good use for -a step report is when validating a table schema. Using the -[`col_schema_match()`](https://posit-dev.github.io/pointblank/reference/Validate.col_schema_match.html) -validation method with a table schema prepared with the -[`Schema`](https://posit-dev.github.io/pointblank/reference/Schema.html) class allows us to verify -our understanding of the table structure. 
Here is a validation that performs a schema validation -with the `small_table` dataset prepared as a DuckDB table: - -```{python} -import pointblank as pb - -# Create a schema for the target table (`small_table` as a DuckDB table) -schema = pb.Schema( - columns=[ - ("date_time", "timestamp(6)"), - ("dates", "date"), - ("a", "int64"), - ("b",), - ("c",), - ("d", "float64"), - ("e", ["bool", "boolean"]), - ("f", "str"), - ] -) - -# Use the `col_schema_match()` validation method to perform a schema check -validation_3 = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="duckdb"), - tbl_name="small_table", - label="Schema check" - ) - .col_schema_match(schema=schema) - .interrogate() -) - -validation_3 -``` - -This step fails, but the validation report table doesn't tell us how (or where). Using -[`get_step_report()`](https://posit-dev.github.io/pointblank/reference/Validate.get_step_report.html) -will show us what the underlying issues are: - -```{python} -validation_3.get_step_report(i=1) -``` - -The step report here shows the target table's schema on the left side and the expectation of the -schema on the right side. There appear to be two problems with our supplied schema: - -1. the second column is actually `date` instead of `dates` -2. the dtype of the `f` column is `"string"` and not `"str"` - -The convenience of this step report means we only have to look at one display of information, rather -than having to collect up the individual pieces and make careful comparisons. - -## Much More in Store - -Pointblank tries really hard to make it easy for you to test your data. All sorts of input tables -are supported since we integrate with the brilliant -[Narwhals](https://narwhals-dev.github.io/narwhals/) and -[Ibis](https://ibis-project.org) libraries. And even though the project only started four -months ago, we already have an extensive catalog of well-tested validation methods.
- -We care a great deal about documentation so much recent effort has been placed on getting the -[**User Guide**](https://posit-dev.github.io/pointblank/user-guide/) written. We hope it provides -a gentle introduction to the major features of the library. If you want some quick examples to get -your imagination going, check out our -[gallery of examples](https://posit-dev.github.io/pointblank/demos/). - -We really care about what **you** want in a validation package, so talk to us :) We just started a -[Discord](https://discord.com/invite/YH7CybCNCQ) so feel free to hop on and ask us anything. -Alternatively, we always like to get [issues](https://github.com/posit-dev/pointblank/issues) so -don't be shy in letting us know how we could improve! diff --git a/docs/blog/intro-pointblank/pointblank-localized.png b/docs/blog/intro-pointblank/pointblank-localized.png deleted file mode 100644 index aceb78507a..0000000000 Binary files a/docs/blog/intro-pointblank/pointblank-localized.png and /dev/null differ diff --git a/docs/blog/intro-pointblank/step_report.png b/docs/blog/intro-pointblank/step_report.png deleted file mode 100644 index 51b82b52c9..0000000000 Binary files a/docs/blog/intro-pointblank/step_report.png and /dev/null differ diff --git a/docs/blog/intro-pointblank/validation-table-diagram.png b/docs/blog/intro-pointblank/validation-table-diagram.png deleted file mode 100644 index 4723dabc90..0000000000 Binary files a/docs/blog/intro-pointblank/validation-table-diagram.png and /dev/null differ diff --git a/docs/blog/intro-pointblank/validation-test-units.png b/docs/blog/intro-pointblank/validation-test-units.png deleted file mode 100644 index ee319cda8b..0000000000 Binary files a/docs/blog/intro-pointblank/validation-test-units.png and /dev/null differ diff --git a/docs/blog/lets-workshop-together/index.qmd b/docs/blog/lets-workshop-together/index.qmd deleted file mode 100644 index 5630f0245f..0000000000 --- a/docs/blog/lets-workshop-together/index.qmd +++
/dev/null @@ -1,35 +0,0 @@ ---- -jupyter: python3 -html-table-processing: none -title: "C'mon C'mon: Let's Do a Pointblank Workshop!" -author: Rich Iannone -date: 2025-06-03 -toc: false ---- - -Recently, I've been giving free workshops to data teams on Pointblank. These sorts of engagements -energize me, and I truly enjoy hearing what people are concerned with when it comes to data -validation. If *your* team is interested, I would love to schedule something with y'all! Please -reach out at rich@posit.co or message me on the ([Pointblank Discord](https://discord.com/invite/YH7CybCNCQ)). - -Workshops have been especially helpful for understanding how people use and share Pointblank's -[validation results table](../../user-guide/index.qmd). For example, I learned that people like to -email or pull up the table for stakeholders so they can walk through problem cases (it's a great -conversation starter for getting to the heart of data quality issues). - -A couple of months ago, I gave a workshop to one of Apple's data teams. During the Q&A afterwards, -someone asked whether you can perform the same validation on different chunks of table rows. This -was a very helpful nudge for me to add a [`segments=` argument](../../user-guide/segmentation.qmd) -to all of Pointblank's validation methods! - -We are excited about how things are going with the Pointblank project and are *always up* for -providing a workshop to your data team (at no cost). If this sounds interesting to you please feel -free to contact me through email or via Discord. Don't be shy. Just know that if you have a need for -data validation, we're here to help! - -## Acknowledgment - -I'd like to give special thanks to Rami Krispin for his interest in Pointblank and for facilitating -a recent workshop. If you're interested in more data science insights, check out -[Rami's Data Newsletter](https://ramikrispin.substack.com) where he shares valuable perspectives -on data engineering, LLMs, and analytics. 
diff --git a/docs/blog/overhauled-user-guide/advanced-in-sidebar.png b/docs/blog/overhauled-user-guide/advanced-in-sidebar.png deleted file mode 100644 index 9508383fe5..0000000000 Binary files a/docs/blog/overhauled-user-guide/advanced-in-sidebar.png and /dev/null differ diff --git a/docs/blog/overhauled-user-guide/breadth-and-depth.png b/docs/blog/overhauled-user-guide/breadth-and-depth.png deleted file mode 100644 index f6f2f9adf3..0000000000 Binary files a/docs/blog/overhauled-user-guide/breadth-and-depth.png and /dev/null differ diff --git a/docs/blog/overhauled-user-guide/explanation-of-visual.png b/docs/blog/overhauled-user-guide/explanation-of-visual.png deleted file mode 100644 index 8358ef53de..0000000000 Binary files a/docs/blog/overhauled-user-guide/explanation-of-visual.png and /dev/null differ diff --git a/docs/blog/overhauled-user-guide/index.qmd b/docs/blog/overhauled-user-guide/index.qmd deleted file mode 100644 index 65a6f228f1..0000000000 --- a/docs/blog/overhauled-user-guide/index.qmd +++ /dev/null @@ -1,154 +0,0 @@ ---- -jupyter: python3 -html-table-processing: none -title: Overhauling Pointblank's User Guide -author: Rich Iannone and Michael Chow -date: 2025-05-20 -freeze: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer=False) -``` - -The Pointblank documentation just got a major upgrade! We've completely overhauled our -[**User Guide**](../../user-guide/index.qmd). Our goal was to enable readers to start fast on -validation and work through the many pieces needed in realistic situations. - -We realized that at the core of Pointblank is the validation plan. It's made up of rules, results, -and steps. - -![](validation-three-things.png){width=80%} - -For example, the first row is a step that checks whether values in column 'a' are less than `10`. -The `COLUMNS` and `VALUES` columns contain the rules used to configure the step.
The `PASS` column -for the first row indicates that all 13 values in the column passed. - -In this post, we'll cover: - -- using a spiral sequence to balance introductory breadth with subsequent depth -- improving examples with direct explanatory text -- rounding out API coverage for cross-linking -- surfacing advanced topics - -Let us walk you through the key improvements in our refreshed [**User Guide**](../../user-guide/index.qmd)! - -## Introduction: Embracing the Spiral Sequence - -We chose to use a spiral sequence for our [Introduction](../../user-guide/index.qmd) and -*Validation Plan* section. The [Introduction](../../user-guide/index.qmd) quickly covers parts of -validation plan, while each article of the *Validation Plan* section dives -deeper into different aspects of defining validation rules. - -
- -![](breadth-and-depth.png){width=50%} - -
- -The introduction does a broad pass on the validation plan table diagram, identifying the core pieces -of the output and then giving a quick overview of the code that produced it. This sets people up for -the *Validation Plan* section in the guide, where each concept is discussed in depth. - -## Improving Examples - -Examples are everywhere in the [**User Guide**](../../user-guide/index.qmd). We've tightened up our -approach to examples by: - -- presenting example code, output, or both, early in each section -- showing the actual output you'll see in your environment -- following up with explanatory text that guides attention to specific places in the output - -This approach makes learning more intuitive. Here's an excerpt that shows this in practice. - -![](explanation-of-visual.png){width=80%} - -The blue arrow marks the flow of reading and the red arrows map where we anticipate people will look -from the text to the output. Focusing explicitly on where we think attention will go forces us to -think carefully about exactly what readers will get from the output. The hope is that readers get to -work more quickly on new concepts. - -## Rounding Out API Coverage - -Documentation has to balance jobs between a user guide and an API Reference: - -- User Guide: explains concepts that cut across functions (like common arguments across -[validation methods](/user-guide/validation-methods.qmd)) -- API Reference: explains each individual function - -Importantly, user guides often link to the API reference so, as part of this work, we made sure that -all individual API entries are well-documented and linked to from the guide. -Here's an excerpt from the [**User Guide**](../../user-guide/index.qmd) that shows links marked: - -![](links-in-guide.png){width=80%} - -## Surfacing Advanced Topics - -There's a lot of potential slicing and dicing involved in validation, as well as work after -validation (post interrogation) to make sense of the results. 
We added pages to the -[**User Guide**](../../user-guide/index.qmd) for some core situations. -In this section, I'll highlight two advanced topics we added pages for: - -- segmentation: splitting a column into groups, and validating each group -- step reports: view failing cases (e.g., view rows of data that failed validation) - -These are marked in the [**User Guide**](../../user-guide/index.qmd) sidebar screenshot below: - -
- -![](advanced-in-sidebar.png){width=35%} - -
- -### Segmentation - -Here's a screenshot of a validation report with two validation steps, one for each segment (`"low"` -and `"high"`) in the `f` column of the `small_table` dataset. - -![](segments.png){width=80%} - -Notice that segments split columns into groups and apply the same validation to each of the groups. -Each group is given its own step. - -Each of the 20+ validation methods accepts a `segments=` argument. The value of the -[Segmentation article in the User Guide](../../user-guide/segmentation.qmd) is to describe this -cross-cutting behavior in a single place. - -Compare the `segments=` parameter in the API Reference (e.g., look at -[`col_vals_gt()`](../../reference/Validate.col_vals_gt.qmd)) and the -[Segmentation article](../../user-guide/segmentation.qmd) to get a feel for how each location -documents the segments feature. - -### Step Report - -Step reports display failing cases (e.g., rows) for a validation step, so you can dig deeper into -validation failures. Here's a screenshot of a step report for some validation step 2: - -![](step-report.png){width=80%} - -Notice the arrow pointing to 'Step 2' in the title. Failing values are highlighted in red. Once we -know we have failures, it's important to take action and discover *why* data is failing. Looking at -failing cases in step reports often uncovers obvious causes behind failures. - -The `get_step_report()` entry is one of 50 in the API Reference. Here it is listed in the API -Reference, in the [Interrogation and Reporting section](../../reference/index.qmd#interrogation-and-reporting). -Critically, it's only one of 20 entries in the [User Guide](../../user-guide/index.qmd), which -emphasizes its importance in validation workflows. - -## Looking Forward - -The refreshed **User Guide** is just the beginning of our documentation improvements. We're -committed to continuously enhancing our documentation to support your data validation needs.
- -Michael Chow gave feedback on this **User Guide** in preparation for his -[upcoming talk at SciPy 2025](https://cfp.scipy.org/scipy2025/talk/NRMNDX/). - -We'd love to hear your feedback on the new User Guide! Feel free to open an issue on our -[GitHub repository](https://github.com/posit-dev/pointblank/issues) with suggestions, corrections, -or requests for additional topics you'd like to see covered. You can also join our community -discussions in the dedicated `#Documentation` channel on our -[Discord server](https://discord.com/invite/YH7CybCNCQ), where you can share ideas, ask questions, -and get help directly from the Pointblank team and other users. diff --git a/docs/blog/overhauled-user-guide/links-in-guide.png b/docs/blog/overhauled-user-guide/links-in-guide.png deleted file mode 100644 index 146bf7b9da..0000000000 Binary files a/docs/blog/overhauled-user-guide/links-in-guide.png and /dev/null differ diff --git a/docs/blog/overhauled-user-guide/segments.png b/docs/blog/overhauled-user-guide/segments.png deleted file mode 100644 index 3d6fed340c..0000000000 Binary files a/docs/blog/overhauled-user-guide/segments.png and /dev/null differ diff --git a/docs/blog/overhauled-user-guide/step-report.png b/docs/blog/overhauled-user-guide/step-report.png deleted file mode 100644 index 3e1cf9dbf5..0000000000 Binary files a/docs/blog/overhauled-user-guide/step-report.png and /dev/null differ diff --git a/docs/blog/overhauled-user-guide/validation-three-things.png b/docs/blog/overhauled-user-guide/validation-three-things.png deleted file mode 100644 index 4379e5bd61..0000000000 Binary files a/docs/blog/overhauled-user-guide/validation-three-things.png and /dev/null differ diff --git a/docs/blog/pointblank_blog_logo.png b/docs/blog/pointblank_blog_logo.png deleted file mode 100644 index 52c1af385d..0000000000 Binary files a/docs/blog/pointblank_blog_logo.png and /dev/null differ diff --git a/docs/blog/validation-libs-2025/index.qmd 
b/docs/blog/validation-libs-2025/index.qmd deleted file mode 100644 index b5488f10be..0000000000 --- a/docs/blog/validation-libs-2025/index.qmd +++ /dev/null @@ -1,728 +0,0 @@ ---- -jupyter: python3 -html-table-processing: none -title: "Data Validation Libraries for Polars (2025 Edition)" -author: Rich Iannone -date: 2025-06-04 -freeze: true ---- - -Data validation is a very important part of any data pipeline. And with Polars gaining popularity as -a superfast and feature-packed DataFrame library, developers need validation tools that work -seamlessly with it. But here's the thing: not all validation libraries are created equal, and -choosing the wrong one can lead to frustration, technical debt, or validation gaps that could bite -you later. - -In this survey (conducted halfway through 2025) we'll explore five Python validation libraries that -support Polars DataFrames, each bringing distinct strengths to different validation challenges. - -::: {.callout-note} -Great Expectations, while being one of the most established data validation frameworks in the Python -ecosystem, is not included in this survey as it doesn't yet offer native Polars support. See [this -issue](https://github.com/great-expectations/great_expectations/issues/10702) and -[this discussion](https://github.com/great-expectations/great_expectations/discussions/10144) for -the inside baseball. 
-::: - -## Recommendations - -Here are the unique strengths for each library: - -```{python} -#| echo: false -import polars as pl -from great_tables import GT - -library_features = pl.DataFrame( - { - "lib": [ - 'Pandera', - 'Patito', - 'Pointblank', - 'Validoopsie', - 'Dataframely', - ], - "stars": [3838, 468, 173, 63, 319], - "feat": [ - "Statistical testing, schema-centric validation, mypy integration", - "Pydantic integration, model-based validation, row-level objects", - "Interactive reports, threshold management, stakeholder communication", - "Built-in logging, composable validation, impact levels, lightweight Great Expectations alternative", - "Collection validation, advanced type safety, failure analysis", - ], - } -) - -( - GT(library_features) - .cols_label(lib="Library", stars="⭐", feat="Best Features") - .fmt_markdown(columns="lib") - .fmt_integer(columns="stars") - .opt_horizontal_padding(scale=2) -) -``` - -Based on these strengths, here are my recommendations for which libraries to use according to use case: - -```{python} -#| echo: false - -use_cases = pl.DataFrame({ - "use_case": [ - "Type-safe pipelines", - "Stakeholder reporting", - "Row-level object modeling", - "Statistical validation", - "Data quality improvement" - ], - "libs": [ - "Pandera, Dataframely, Patito", - "Pointblank", - "Patito", - "Pandera", - "Pointblank, Validoopsie" - ], - "desc": [ - "Static type checking and compile-time validation", - "Sharing validation results with non-technical teams", - "Converting DataFrame rows to Python objects with business logic", - "Testing data distributions and statistical properties", - "Gradual quality improvement with threshold tracking" - ] -}) - -( - GT(use_cases) - .cols_label( - use_case="Use Case", - libs="Best Libraries", - desc="Description" - ) - .opt_horizontal_padding(scale=2) -) -``` - -## Setup - -We are going to run through examples with **Pandera**, **Patito**, **Pointblank**, **Validoopsie**, -and **Dataframely**, using this 
Polars DataFrame as our test case: - -```{python} -import polars as pl - -# Standard dataset for all validation examples -user_data = pl.DataFrame({ - "user_id": [1, 2, 3, 4, 5], - "age": [25, 30, 22, 45, 95], # <- includes a very high age - "email": [ - "user1@example.com", "user2@example.com", "invalid-email", # <- has an invalid email - "user4@example.com", "user5@example.com" - ], - "score": [85.5, 92.0, 78.3, 88.7, 95.2] -}) -``` - -We'll try to run the same data validation across the surveyed libraries, so we'll check: - -- schema validation (correct column types) -- `user_id` values greater than `0` -- `age` values between `18` and `80` (inclusive) -- `email` strings matching a basic email regex pattern -- `score` values between `0` and `100` (inclusive) - -Now let's dive into each library, starting with the statistically-focused Pandera. - -## 1. Pandera: Schema-First Validation with Statistical Checks - -Pandera is a statistical data validation toolkit designed to provide a flexible and expressive API -for performing data validation on dataframe-like objects. The library centers on schema-centric -validation, where you define the expected structure and constraints of your data upfront. You can -enable both runtime validation and static type checking integration. Pandera added Polars support in -version `0.19.0` (early 2024). 
- -### Example - -```{python} -import pandera.polars as pa - -# Define schema using our standard dataset -schema = pa.DataFrameSchema({ - "user_id": pa.Column(pl.Int64, checks=pa.Check.gt(0)), - "age": pa.Column(pl.Int64, checks=[pa.Check.ge(18), pa.Check.le(80)]), - "email": pa.Column(pl.Utf8, checks=pa.Check.str_matches(r"^[^@]+@[^@]+\.[^@]+$")), - "score": pa.Column(pl.Float64, checks=pa.Check.in_range(0, 100)) -}) - -# Validate the schema -try: - validated_data = schema.validate(user_data) - print("Validation successful!") -except pa.errors.SchemaError as e: - print(f"Validation failed: {e}") -``` - -This example demonstrates Pandera's declarative approach, where you define what your data should -look like rather than writing imperative validation logic. The schema acts as both documentation and -as a validation contract. Notice how multiple checks can be applied to a single column (here, the -`age` column receives two checks), and the validation either succeeds completely or provides -error information about what failed. - -### Comparisons - -Both Pandera and Patito use declarative, schema-centric approaches, but differ in their design -philosophies: - -- Pandera uses a dictionary-like schema structure with Column objects for defining validation rules -- Patito uses Pydantic model classes with familiar Field syntax for validation constraints -- Pandera focuses heavily on statistical validation capabilities like hypothesis testing -- Patito emphasizes integration with existing Pydantic workflows and object modeling -- a key behavioral difference: Patito reports all validation errors in a single pass, while Pandera -stops at the first failure - -The choice between them often comes down to whether you prefer Pandera's statistical focus or -Patito's Pydantic integration. - -Unlike Pointblank's step-by-step validation reporting, Pandera validates the entire schema at once. 
-Compared to Patito's model-based approach, Pandera focuses more on statistical validation -capabilities. Unlike Validoopsie's and Pointblank's method chaining style, Pandera uses a more -declarative, schema-centric approach. - -### Unique Strengths and When to Use - -Here are some of the stand-out features that Pandera has: - -- type-safe schema definitions with `mypy` integration -- statistical hypothesis testing for data distributions: perform t-tests, chi-square tests, and -custom statistical tests directly in your validation schema -- excellent integration with Pandas, Polars, and Arrow support -- declarative schema syntax that serves as documentation -- built-in support for data coercion and transformation - -This statistical validation capability goes beyond basic type and range checking to test actual data -relationships and distributional assumptions. For example, you can validate that the mean height of -group `"M"` is significantly greater than group `"F"` using a two-sample t-test, or test whether a -column follows a normal distribution. This makes Pandera uniquely powerful for data science -workflows where the statistical properties of your data are as important as individual data points -meeting basic constraints. - -Data practitioners should choose Pandera when building type-safe data pipelines where schema -validation is critical, especially in data science workflows that require statistical validation. -It's ideal for users that value static type checking, need to validate statistical properties of -their data, or want schemas that double as documentation. - -Pandera also excels in environments where data contracts between teams are important and where the -statistical properties of data matter as much as basic type checking. - -## 2.
Patito: Pydantic-Style Data Models for DataFrames - -Patito brings Pydantic's well-received model-based validation approach to DataFrame validation, -creating a bridge between Pydantic-style data validation and DataFrame processing. The library's -primary goal is to provide a familiar, Pydantic-style interface for defining and validating -DataFrame schemas, making it particularly appealing to developers already using Pydantic in their -applications. - -Patito launched with Polars support from the beginning (in late 2022). Native Polars integration is -touted as one of its core features, reflecting the growing adoption of Polars in the Python -ecosystem. - -### Example - -```{python} -import patito as pt -from typing import Annotated - -class UserModel(pt.Model): - user_id: int = pt.Field(gt=0) - age: Annotated[int, pt.Field(ge=18, le=80)] - email: str = pt.Field(pattern=r"^[^@]+@[^@]+\.[^@]+$") - score: float = pt.Field(ge=0.0, le=100.0) - -# Validate using the model -try: - UserModel.validate(user_data) - print("Validation successful!") -except pt.exceptions.DataFrameValidationError as e: - print(f"Validation failed: {e}") -``` - -This example showcases Patito's model-centric approach where validation rules are embedded in class -definitions. The use of Python's type hints and Pydantic's Field syntax makes the validation rules -self-documenting. Notably, Patito reports all validation errors at once, providing a fairly -comprehensive view of data quality issues, whereas other libraries (e.g., Pandera) stop at the first -failure. - -### Column Validation Approaches: Pandera vs Patito - -**Pandera offers a much more extensive and flexible system for column validation** compared to -Patito's field-based approach. While Patito provides a solid set of built-in field constraints -(like `gt`, `le`, `regex`, `unique`, etc.) that cover common validation scenarios, Pandera's Check -system is designed for both simple and highly sophisticated validation logic. 
- -The key architectural difference seems to lie in extensibility and complexity. Pandera's `Check` -objects accept arbitrary functions, allowing you to write custom validation logic that can be as -simple as `lambda s: s > 0` or as complex as statistical hypothesis tests using scipy. You can -create vectorized checks that operate on entire Series objects for performance, element-wise checks -for atomic validation, and even grouped checks that validate subsets of data based on other columns. -Patito's `Field` constraints, while clean and declarative, are more limited to the predefined -validation types that Pydantic and Patito provide. - -Pandera also supports advanced validation patterns that Patito doesn't directly offer, such as -wide-form data checks (validating relationships across multiple columns), grouped validation (where -checks are applied to subsets of data based on grouping columns), and the ability to raise warnings -instead of errors for non-critical validation failures. While Patito does support custom constraints -through Polars expressions via the `constraints` parameter, this requires knowledge of Polars -expression syntax and, depending on where you're coming from, could be less intuitive than Pandera's -function-based approach. - -For most common validation scenarios, Patito's field-based validation is simpler and more readable, -especially for teams already familiar with Pydantic. However, for complex data validation -requirements, statistical validation, or when you need maximum flexibility in defining validation -logic, Pandera's Check system provides significantly more power and extensibility. 
- -### Unique Strengths and When to Use - -- Pydantic-style model definitions with familiar syntax for Pydantic users -- rich type system integration with Python's typing system -- model inheritance and composition for complex data structures -- seamless integration with existing Pydantic-based applications -- row-level object modeling for converting DataFrame rows to Python objects with methods -- mock data generation for testing with `.examples()` method - -People should choose Patito when they're already using Pydantic in their applications and want -consistent validation patterns across data processing and application logic. It's great when you -need to validate DataFrames and then work with individual rows as rich Python objects with embedded -business logic and methods (e.g., a `Product` row that has a `.url` property or -`.calculate_discount()` method). Patito is also good when you need to generate realistic test data -and want object-oriented interfaces for your data models. - -## 3. Pointblank: Comprehensive Validation with Beautiful Reports - -Pointblank is a comprehensive data validation framework designed to make data quality assessment -both thorough and accessible to stakeholders. Originally inspired by the R package of the same name, -Pointblank's primary mission is to provide validation workflows that generate beautiful, interactive -reports that can be shared with both technical and non-technical team members. - -Pointblank launched with Polars support as a core feature from its initial Python release in late -2024, built on top of the Narwhals and Ibis compatibility layers to provide consistent DataFrame -operations across multiple backends including Polars, Pandas, and database connections. 
- -### Example - -```{python} -import pointblank as pb - -schema = pb.Schema( - columns=[("user_id", "Int64"), ("age", "Int64"), ("email", "String"), ("score", "Float64")] -) - -validation = ( - pb.Validate(data=user_data, label="An example.", tbl_name="users", thresholds=(0.1, 0.2, 0.3)) - .col_vals_gt(columns="user_id", value=0) - .col_vals_between(columns="age", left=18, right=80) - .col_vals_regex(columns="email", pattern=r"^[^@]+@[^@]+\.[^@]+$") - .col_vals_between(columns="score", left=0, right=100) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` - -This example demonstrates Pointblank's chainable validation approach where each validation step is -clearly defined and can be configured with different threshold levels. The resulting validation -object provides rich, interactive reporting that shows not just what passed or failed, but detailed -statistics about the validation process. The threshold system allows for nuanced responses to data -quality issues. - -### Comparisons - -Unlike Pandera's schema-first approach, Pointblank focuses on step-by-step validation with detailed -reporting and flexible failure thresholds that can be set at both the global and individual -validation step level. Both Pointblank and Validoopsie use numeric threshold values for granular -control over acceptable failure rates, but they differ in their primary focus: Pointblank emphasizes -comprehensive reporting and stakeholder communication, while Validoopsie prioritizes operational -resilience through its impact level system (low/medium/high) that controls whether threshold -breaches are logged, reported, or raise exceptions. - -While both libraries support custom validation logic, Pointblank's `specially()` method integrates -seamlessly with its reporting system, whereas Validoopsie provides a structured framework for -creating custom validation classes that fit into its modular validation catalog. 
- -### Unique Strengths and When to Use - -- beautiful, interactive HTML reports perfect for sharing with stakeholders -- threshold-based alerting system with configurable actions -- segmented validation for analyzing subsets of data -- LLM-powered validation suggestions via `DraftValidation` -- comprehensive data inspection tools and summary tables -- step-by-step validation reporting with detailed failure analysis (via `.get_step_report()`) - -Data practitioners might want to choose Pointblank when stakeholder communication and comprehensive -data quality reporting are priorities. Because of the reporting tables it can generate, it's -well-suited for data teams that need to regularly report on data quality to relevant stakeholders. -Pointblank also excels in production data monitoring scenarios, data observability workflows, and -situations where understanding the nuances of data quality issues matters more than simple pass/fail -validation. - -## 4. Validoopsie: Composable Checks with Smart Failure Handling - -Validoopsie is built around composable validation principles, providing a toolkit for creating -reusable validation functions organized into logical modules. Drawing inspiration from Great -Expectations but with a much lighter footprint, Validoopsie emphasizes building validation logic -from modular, testable components that can be combined in flexible ways to create complex validation -workflows. The library had Polars support from its very first release (early 2025). - -What sets Validoopsie apart is its sophisticated approach to handling validation failures through -*impact levels* and *threshold tolerances*. These features give you fine-grained control over -how your validation pipeline behaves when things go wrong. 
- -### Example - -```{python} -from validoopsie import Validate -from narwhals.dtypes import Int64, Float64, String - -# Composable validation checks with impact levels and thresholds -validation = ( - Validate(user_data) - .ValuesValidation.ColumnValuesToBeBetween( - column="user_id", - min_value=0, - impact="high" # Critical - will raise exception - ) - .ValuesValidation.ColumnValuesToBeBetween( - column="age", - min_value=18, - max_value=80, - threshold=0.1, # Allow 10% failures - impact="medium" # Important but not critical - ) - .StringValidation.PatternMatch( - column="email", - pattern=r"^[^@]+@[^@]+\.[^@]+$", - threshold=0.05, # Allow 5% malformed emails - impact="low" # Record but don't interrupt - ) - .ValuesValidation.ColumnValuesToBeBetween( - column="score", - min_value=0, - max_value=100, - impact="medium" - ) - .TypeValidation.TypeCheck( - frame_schema_definition={ - "user_id": Int64, - "age": Int64, - "email": String, - "score": Float64 - }, - impact="high" # Schema compliance is critical - ) -) - -# Get validation results -validation.validate() - -# Access detailed results for analysis -print("Validation results:", validation.results) -``` - -This example showcases Validoopsie's key differentiators: modular validation categories -(`ValuesValidation`, `StringValidation`, `TypeValidation`) combined with *impact levels* that -control failure behavior and *thresholds* that allow controlled tolerance for data quality issues. -Unlike other libraries that treat all validation failures equally, Validoopsie lets you specify -which validations are critical ("high" impact raises exceptions) versus informational ("low" impact -just logs results). 
- -Validoopsie's most powerful feature is its three-tier `impact=` system combined with `threshold=` -tolerance: - -```{python} -# Example showing sophisticated failure handling -validation = ( - Validate(user_data) - # Critical validation - no tolerance - .NullValidation.ColumnNotBeNull( - column="user_id", - impact="high" # Will raise an exception if any Null values found - ) - # Important validation with tolerance - .StringValidation.PatternMatch( - column="email", - pattern=r"^[^@]+@[^@]+\.[^@]+$", - threshold=0.15, # Allow up to 15% malformed emails - impact="medium" # Log failures but don't stop processing - ) - # Informational validation - .ValuesValidation.ColumnValuesToBeBetween( - column="score", - min_value=90, - max_value=100, - threshold=0.8, # Allow 80% to be outside "excellent" range - impact="low" # Just track high performers - ) -) - -validation.validate() -``` - -Validoopsie strikes a unique balance between operational flexibility and production reliability, -making it an excellent choice for teams that need sophisticated failure handling without the -complexity of larger validation frameworks. - -### Comparisons - -Validoopsie's functional approach contrasts with Pandera's schema-centric methodology and Patito's -object-oriented models. While Pandera focuses on statistical validation and Patito emphasizes -Pydantic integration, Validoopsie prioritizes flexibility and operational robustness. - -Compared to Pointblank, both libraries offer sophisticated threshold-based failure handling using -numeric values (e.g., 0.1 for 10% tolerance), but they differ in their architectural approach: -Validoopsie combines numeric thresholds with impact levels (low/medium/high) that control the -behavioral response to threshold breaches, while Pointblank integrates thresholds directly into its -comprehensive reporting and alerting system. 
Both support custom validation, but Validoopsie uses a -modular validation catalog approach while Pointblank's `specially()` method integrates seamlessly -with its step-by-step reporting workflow. - -Validoopsie is the only library in this survey that provides built-in logging capabilities, making -it particularly valuable for production environments where validation events need to be tracked and -monitored. - -The library's Great Expectations inspiration is evident in its modular design, but Validoopsie -delivers this functionality with a much lighter dependency footprint and simpler API. Teams -familiar with Great Expectations will find Validoopsie's approach familiar but more streamlined. - -### Unique Strengths and When to Use - -Validoopsie's standout features include: - -- graduated failure handling through impact levels (low/medium/high) combined with numeric - thresholds that control both tolerance levels and behavioral responses to failures -- numeric threshold tolerance allowing controlled acceptance of data quality issues (e.g., "allow - 10% email format failures" with `threshold=0.1`) -- built-in structured logging using loguru allows for automatic logging of validation results, -failures, and performance metrics (unique among these libraries) -- being a lightweight Great Expectations alternative with similar composability but minimal -dependencies -- an extensive validation catalog organized into logical namespaces (Date, String, Null, Values, -etc.) 
-- custom validation framework with consistent patterns for creating domain-specific rules - -Choose Validoopsie when you need: - -- operational resilience in production pipelines where partial data quality issues shouldn't - stop processing -- comprehensive validation logging and monitoring for observability in production environments -- fine-grained control over validation failure behavior with different criticality levels -- lightweight Great Expectations functionality without the complexity and dependencies -- custom validation development with a clear, consistent framework -- modular validation design that promotes reusability across projects - -Validoopsie is particularly well-suited for data engineering teams building robust production -pipelines where data quality monitoring is important but pipeline availability is critical. Its -impact/threshold system makes it uniquely powerful for environments where you need to distinguish -between "nice to have" and "must have" data quality requirements. - -## 5. Dataframely: Type-Safe Schema Validation with Advanced Features - -Dataframely is a comprehensive data validation framework that brings type-safe schema validation to -Polars DataFrames with some of the most advanced features in the ecosystem. The library focuses on -providing both runtime validation and static type checking, with particular strengths in -collection validation for related DataFrames and extensive integration capabilities with external -tools. - -Dataframely launched in early 2025 with native Polars support as a core feature, built specifically -for the modern data ecosystem with first-class support for complex validation scenarios. 
- -### Example - -```{python} -#| eval: false -import polars as pl -import dataframely as dy - -class UserSchema(dy.Schema): - user_id = dy.Int64(primary_key=True, min=1, nullable=False) - age = dy.Int64(nullable=False) - email = dy.String(nullable=False, regex=r"^[^@]+@[^@]+\.[^@]+$") - score = dy.Float64(nullable=False, min=0.0, max=100.0) - - # Use @dy.rule() for age range validation - @dy.rule() - def age_in_range(cls) -> pl.Expr: - return pl.col("age").is_between(18, 80, closed="both") - -# Validate using the schema -try: - validated_data = UserSchema.validate(user_data, cast=True) - print("Validation successful!") - print(validated_data) -except Exception as e: - print(f"Validation failed: {e}") -``` - -This example showcases Dataframely's class-based schema approach with several notable features: -primary key constraints, comprehensive type validation with bounds, regex pattern matching, and -custom validation rules using the `@dy.rule()` decorator (used here for age range checking). - -The `cast=True` parameter automatically coerces column types to match the schema definitions. This -is really useful when working with data from external sources where column types might not exactly -match your schema expectations (e.g., integers loaded as strings from CSV files). - -Dataframely features soft validation and failure introspection. As one of Dataframely's standout -features, it brings a fairly sophisticated approach to validation failures. 
Rather than just raising -exceptions, it provides detailed failure analysis: - -```{python} -#| eval: false -# Soft validation: separate valid and invalid rows -good_data, failure_info = UserSchema.filter(user_data, cast=True) - -print("Valid rows:", len(good_data)) -print("Failure counts:", failure_info.counts()) -print("Co-occurrence analysis:", failure_info.cooccurrence_counts()) - -# Inspect the actual failed rows -failed_rows = failure_info.invalid() -print("Failed data:", failed_rows) -``` - -### Comparisons - -While both Dataframely and Pandera offer schema-centric validation approaches, they serve different -validation philosophies. Pandera excels in statistical validation with hypothesis testing and -distribution checks, making it ideal for data science workflows where statistical properties matter. -Dataframely, by contrast, emphasizes relational data integrity and type safety, providing more -sophisticated failure analysis and collection-level validation capabilities that Pandera doesn't -offer. - -The relationship between Dataframely and Patito is particularly interesting since both use -class-based schema definitions. However, Dataframely extends far beyond Patito's Pydantic-focused -approach. Where Patito provides clean, simple validation with excellent Pydantic integration, -Dataframely offers advanced features like collection validation, group rules, and comprehensive -failure introspection. Teams already invested in Pydantic workflows might prefer Patito's -simplicity, while those building complex data systems will appreciate Dataframely's feature set. - -Dataframely and Pointblank represent two different approaches to comprehensive data validation. -Pointblank shines in stakeholder communication with its beautiful interactive reports and -threshold-based alerting systems, making it perfect for data quality reporting. 
Dataframely focuses -instead on type safety and complex validation logic, with unique collection validation capabilities -that no other library in this survey provides. The choice between these two will come down to -whether your priority is communicating validation results or ensuring complex data relationships -remain consistent. - -When compared to Validoopsie's method chaining approach, Dataframely offers a more structured, -schema-centric methodology with advanced type safety features that Validoopsie doesn't provide. -While Validoopsie excels in operational flexibility and lightweight design for building reusable -validation components, Dataframely's strength lies in its comprehensive type system integration, -collection validation capabilities, and sophisticated failure analysis. And that makes it ideal for -complex data engineering workflows where relationships between multiple DataFrames matter as much as -individual DataFrame validation. - -### Unique Strengths and When to Use - -Dataframely's standout features include: - -- advanced type safety with full mypy integration and generic DataFrame types -- collection validation for ensuring consistency across related DataFrames -- group-based validation rules using `@dy.rule(group_by=[...])` for aggregate constraints -- schema inheritance for reducing code duplication in related schemas -- production-ready soft validation that separates valid and invalid data - -One might choose Dataframely when building complex data systems where: - -- type safety and static analysis are critical for code quality -- you need to validate relationships between multiple related DataFrames -- you're working with production pipelines that need to handle partial data quality issues -gracefully -- schema reuse and inheritance would benefit your codebase organization - -Dataframely is particularly well-suited for data engineering teams building robust, type-safe data -pipelines where the relationships between different data 
entities are as important as the validation -of individual DataFrames. Its collection validation capabilities make it uniquely powerful for -ensuring referential integrity in complex data workflows. - -## Choosing the Right Library - -With five solid validation libraries to choose from, the decision often comes down to your team's -specific workflow, existing tech stack, and validation requirements. Here are some practical -considerations to help guide your choice: - -*Start with your existing tools* - -If you're already using Pydantic extensively, Patito will feel natural. Teams that are heavily -invested in type checking and statistical analysis should probably gravitate toward Pandera. If -you're building data products that need stakeholder buy-in, Pointblank's reporting capabilities -become incredibly useful in that context. For teams already committed to strong typing and static -analysis workflows, Dataframely's advanced type safety features will feel like a natural extension -of your existing practices. - -*Consider your validation complexity* - -For straightforward schema validation and type checking, any of these libraries will work well. But -if you need statistical hypothesis testing, Pandera is your best bet. For highly custom validation -logic that needs to be composed and reused, Validoopsie shines. When validation results need to be -communicated to non-technical stakeholders, Pointblank's interactive reports are basically -unmatched. If you're dealing with complex relational data where multiple DataFrames need to maintain -consistency with each other, Dataframely's collection validation capabilities are unique in the -ecosystem. - -*Think about failure tolerance requirements* - -One of the most important architectural differences among these libraries is how they handle -validation failures. Only Pointblank and Validoopsie offer numeric threshold-based failure -tolerance. 
This is the ability to accept a controlled percentage of validation failures without -treating the entire validation as failed. - -This distinction can be crucial for production environments where some level of data quality issues -is acceptable and you need fine-grained control over when validations should fail versus warn. In -many real-world scenarios, poor data quality is a given reality, and the goal becomes gradually -improving quality over time rather than enforcing perfection. Thresholds can then be seen not as -simple failure tolerances but more like data quality metrics and improvement goals (e.g., you might -start with `threshold=0.15` for email validation and progressively tighten to `0.05` as upstream -systems improve). - -*Think about your team's preferences* - -There's a human dimension here. Some data teams might prefer the declarative, schema-first approach -of Pandera, Patito, and Dataframely, whereas others like the step-by-step, method-chaining style of -Pointblank and Validoopsie. There's really no right or wrong choice here. It's all about what feels -right and most natural for your team's coding style and mental model. - -*Don't feel locked into one choice* - -My hunch is that many teams already successfully use different libraries for different parts of -their data pipeline. They're leveraging each tool's strengths where they matter most. So you could -conceivably use Patito for Pydantic-style validation, Pandera for statistical checks in your -analysis pipeline, Pointblank for generating stakeholder reports, and Dataframely for complex data -engineering workflows (use 'em all!). This multi-library approach can be particularly effective in -larger organizations with diverse validation needs. - -I suppose the key is to start with one library that fits your immediate needs, learn it well, and -then consider expanding your toolkit as your validation requirements evolve. 
- -## Summary and Wrapping Up - -The Python ecosystem offers truly excellent options for validating Polars DataFrames! Choosing is -always tough but this is how one could make the decision based on specific needs: - -- for type-safe pipelines, **Pandera**, **Dataframely**, or **Patito** are ideal -- for stakeholder reporting, **Pointblank** is a great choice -- for row-level object modeling, go with **Patito** -- for statistical validation, **Pandera** is perfect -- for data quality improvement, **Pointblank** or **Validoopsie** fit well - -Each library has evolved to serve different aspects of the data validation ecosystem. Try them all -and, with a little understanding of their strengths, you'll get good at picking the right data -validation tool for your specific use case. - -This survey represents our understanding of these libraries as of mid-2025. Given the rapid pace of -development in the Python data ecosystem, some details may become outdated or contain inaccuracies -(we may have even gotten things wrong at the outset). If you notice any errors or have updates to -share, we'd love to hear from you! Please reach out through: - -- [GitHub Issues](https://github.com/posit-dev/pointblank/issues) -- [GitHub Discussions](https://github.com/posit-dev/pointblank/discussions) -- Our [Discord Server](https://discord.com/invite/YH7CybCNCQ) - -Any feedback you provide helps keep this resource accurate and useful for the community! diff --git a/docs/demos/01-starter/index.qmd b/docs/demos/01-starter/index.qmd deleted file mode 100644 index 55405f1263..0000000000 --- a/docs/demos/01-starter/index.qmd +++ /dev/null @@ -1,59 +0,0 @@ ---- -pagetitle: Examples -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Starter Validation - -A validation with the basics. 
- -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( # Use pb.Validate to start - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="A starter validation" - ) - .col_vals_gt(columns="d", value=1000) # STEP 1 | - .col_vals_le(columns="c", value=5) # STEP 2 | <-- Build up a validation plan - .col_exists(columns=["date", "date_time"]) # STEP 3 | - .interrogate() # This will execute all validation steps and collect intel -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( # Use pb.Validate to start - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="A starter validation" - ) - .col_vals_gt(columns="d", value=1000) # STEP 1 | - .col_vals_le(columns="c", value=5) # STEP 2 | <-- Build up a validation plan - .col_exists(columns=["date", "date_time"]) # STEP 3 | - .interrogate() # This will execute all validation steps and collect intel -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/02-advanced/index.qmd b/docs/demos/02-advanced/index.qmd deleted file mode 100644 index b128b1cf57..0000000000 --- a/docs/demos/02-advanced/index.qmd +++ /dev/null @@ -1,95 +0,0 @@ ---- -pagetitle: "Examples: Advanced Validation" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Advanced Validation - -A validation with a comprehensive set of rules. - -```{python} -#| echo: false - -import pointblank as pb -import polars as pl - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - tbl_name="game_revenue", - label="Comprehensive validation example", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35), - ) - .col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 - .col_vals_gt(columns="session_duration", value=5) # STEP 2 - .col_vals_ge(columns="item_revenue", value=0.02) # STEP 3 - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 - .col_vals_in_set( # STEP 5 - columns="acquisition", - set=["google", "facebook", "organic", "crosspromo", "other_campaign"] - ) - .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 - .col_vals_between( # STEP 7 - columns="session_duration", - left=10, right=50, - pre = lambda df: df.select(pl.median("session_duration")) - ) - .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 8 - .row_count_match(count=2000) # STEP 9 - .col_count_match(count=11) # STEP 10 - .col_vals_not_null(columns=pb.starts_with("item")) # STEPS 11-13 - .col_exists(columns="start_day") # STEP 14 - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import polars as pl - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - tbl_name="game_revenue", - label="Comprehensive validation example", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35), - ) - 
.col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 - .col_vals_gt(columns="session_duration", value=5) # STEP 2 - .col_vals_ge(columns="item_revenue", value=0.02) # STEP 3 - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 - .col_vals_in_set( # STEP 5 - columns="acquisition", - set=["google", "facebook", "organic", "crosspromo", "other_campaign"] - ) - .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 - .col_vals_between( # STEP 7 - columns="session_duration", - left=10, right=50, - pre = lambda df: df.select(pl.median("session_duration")) - ) - .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 8 - .row_count_match(count=2000) # STEP 9 - .col_count_match(count=11) # STEP 10 - .col_vals_not_null(columns=pb.starts_with("item")) # STEPS 11-13 - .col_exists(columns="start_day") # STEP 14 - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue"), n_head=10, n_tail=10) -``` - -
diff --git a/docs/demos/03-data-extracts/index.qmd b/docs/demos/03-data-extracts/index.qmd deleted file mode 100644 index 4591a365c9..0000000000 --- a/docs/demos/03-data-extracts/index.qmd +++ /dev/null @@ -1,69 +0,0 @@ ---- -pagetitle: "Examples: Data Extracts" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Data Extracts - -Pulling out data extracts that highlight rows with validation failures. - -

Validation with failures at *Step 2*:

- -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue"), - tbl_name="game_revenue", - label="Validation with test unit failures available as an extract" - ) - .col_vals_gt(columns="item_revenue", value=0) - .col_vals_ge(columns="session_duration", value=5) - .interrogate() -) - -validation -``` - -
-

Extract from *Step 2* (which has 14 failing test units):

- -```{python} -#| echo: false -pb.preview(validation.get_data_extracts(i=2, frame=True), n_head=20, n_tail=20) -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue"), - tbl_name="game_revenue", - label="Validation with test unit failures available as an extract" - ) - .col_vals_gt(columns="item_revenue", value=0) # STEP 1: no test unit failures - .col_vals_ge(columns="session_duration", value=5) # STEP 2: 14 test unit failures -> extract - .interrogate() -) -``` - -```python -pb.preview(validation.get_data_extracts(i=2, frame=True), n_head=20, n_tail=20) -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue")) -``` - -
diff --git a/docs/demos/04-sundered-data/index.qmd b/docs/demos/04-sundered-data/index.qmd deleted file mode 100644 index 2041a842a7..0000000000 --- a/docs/demos/04-sundered-data/index.qmd +++ /dev/null @@ -1,67 +0,0 @@ ---- -pagetitle: "Examples: Sundered Data" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Sundered Data - -Splitting your data into 'pass' and 'fail' subsets. - -```{python} -# | echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), - tbl_name="small_table", - label="Sundering Data" - ) - .col_vals_gt(columns="d", value=1000) - .col_vals_le(columns="c", value=5) - .interrogate() -) - -validation -``` - -```{python} -#| echo: false -pb.preview(validation.get_sundered_data(type="pass")) -``` - -```python -import pointblank as pb -import polars as pl - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), - tbl_name="small_table", - label="Sundering Data" - ) - .col_vals_gt(columns="d", value=1000) - .col_vals_le(columns="c", value=5) - .interrogate() -) - -validation -``` - -```python -pb.preview(validation.get_sundered_data(type="pass")) -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="pandas"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/05-step-report-column-check/index.qmd b/docs/demos/05-step-report-column-check/index.qmd deleted file mode 100644 index a36ae538f2..0000000000 --- a/docs/demos/05-step-report-column-check/index.qmd +++ /dev/null @@ -1,75 +0,0 @@ ---- -pagetitle: "Examples: Step Reports for Column Data Checks" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Step Report: Column Data Checks - -A step report for column checks shows what went wrong. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - tbl_name="small_table", - label="Step reports for column data checks" - ) - .col_vals_ge(columns="c", value=4, na_pass=True) - .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}") - .interrogate() -) - -validation -``` - -```{python} -#| echo: false -validation.get_step_report(i=1) -``` - -```{python} -#| echo: false -validation.get_step_report(i=2) -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - tbl_name="small_table", - label="Step reports for column data checks" - ) - .col_vals_ge(columns="c", value=4, na_pass=True) # has failing test units - .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}") # no failing test units - .interrogate() -) - -validation -``` - -```python -validation.get_step_report(i=1) -``` - -```python -validation.get_step_report(i=2) -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/06-step-report-schema-check/index.qmd b/docs/demos/06-step-report-schema-check/index.qmd deleted file mode 100644 index e05b8b4438..0000000000 --- a/docs/demos/06-step-report-schema-check/index.qmd +++ /dev/null @@ -1,92 +0,0 @@ ---- -pagetitle: "Examples: Step Report for a Schema Check" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Step Report: Schema Check - -When a schema doesn't match, a step report gives you the details. - -```{python} -# | echo: false - -import pointblank as pb - -schema = pb.Schema( - columns=[ - ("date_time", "timestamp"), - ("dates", "date"), - ("a", "int64"), - ("b",), - ("c",), - ("d", "float64"), - ("e", ["bool", "boolean"]), - ("f", "str"), - ] -) - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="duckdb"), - tbl_name="small_table", - label="Step report for a schema check" - ) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` - -```{python} -#| echo: false -validation.get_step_report(i=1) -``` - -```python -import pointblank as pb - -# Create a schema for the target table (`small_table` as a DuckDB table) -schema = pb.Schema( - columns=[ - ("date_time", "timestamp"), # this dtype doesn't match - ("dates", "date"), # this column name doesn't match - ("a", "int64"), - ("b",), # omit dtype to not check for it - ("c",), # "" "" "" "" - ("d", "float64"), - ("e", ["bool", "boolean"]), # try several dtypes (second one matches) - ("f", "str"), # this dtype doesn't match - ] -) - -# Use the `col_schema_match()` validation method to perform a schema check -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="duckdb"), - tbl_name="small_table", - label="Step report for a schema check" - ) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` - -```python -validation.get_step_report(i=1) -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="duckdb"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/07-validation-with-actions/index.qmd b/docs/demos/07-validation-with-actions/index.qmd deleted file mode 100644 index ea18193389..0000000000 --- a/docs/demos/07-validation-with-actions/index.qmd +++ /dev/null @@ -1,115 +0,0 @@ ---- -pagetitle: "Examples: Validation with Actions" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Validation with Actions - -Configure actions to trigger when validation thresholds are exceeded, such as logging warnings or errors. - -```{python} -#| echo: false - -import pointblank as pb - -def log_warning(): - """Custom action to log validation warnings""" - metadata = pb.get_action_metadata() - print(f"⚠️ WARNING: Validation step '{metadata['step']}' exceeded threshold!") - -def log_error(): - """Custom action to log validation errors""" - metadata = pb.get_action_metadata() - print(f"❌ ERROR: Critical validation failure in step '{metadata['step']}'!") - print(f" This requires immediate attention.") - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="nycflights", tbl_type="polars"), - label="Validation with actions" - ) - .col_vals_between( - columns="distance", - left=100, right=2000, - thresholds=pb.Thresholds(warning=0.1), # Allow 10% failures before warning - actions=pb.Actions(warning=log_warning), - brief="Column 'distance' range check." - ) - .col_vals_gt( - columns="air_time", - value=25, - na_pass=True, - thresholds=pb.Thresholds(error=200), # Allow only 200 failures before error - actions=pb.Actions(error=log_error), - brief="Column 'origin' check for minimum value." - ) - .col_vals_not_null( - columns="carrier", - thresholds=(1, 0.05), # No tolerance for null values - actions=pb.Actions(warning=log_warning, error=log_error), - brief="Column 'carrier' completeness check." 
- ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -def log_warning(): - """Custom action to log validation warnings""" - metadata = pb.get_action_metadata() - print(f"⚠️ WARNING: Validation step '{metadata['step']}' exceeded threshold!") - -def log_error(): - """Custom action to log validation errors""" - metadata = pb.get_action_metadata() - print(f"❌ ERROR: Critical validation failure in step '{metadata['step']}'!") - print(f" This requires immediate attention.") - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="nycflights", tbl_type="polars"), - label="Validation with actions" - ) - .col_vals_between( - columns="distance", - left=100, right=2000, - thresholds=pb.Thresholds(warning=0.1), # Allow 10% failures before warning - actions=pb.Actions(warning=log_warning), - brief="Column 'distance' range check." - ) - .col_vals_gt( - columns="air_time", - value=25, - na_pass=True, - thresholds=pb.Thresholds(error=200), # Allow only 200 failures before error - actions=pb.Actions(error=log_error), - brief="Column 'origin' check for minimum value." - ) - .col_vals_not_null( - columns="carrier", - thresholds=(1, 0.05), # No tolerance for null values - actions=pb.Actions(warning=log_warning, error=log_error), - brief="Column 'carrier' completeness check." - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="nycflights", tbl_type="polars")) -``` - -
diff --git a/docs/demos/08-validation-with-final-actions/index.qmd b/docs/demos/08-validation-with-final-actions/index.qmd deleted file mode 100644 index 96789290ba..0000000000 --- a/docs/demos/08-validation-with-final-actions/index.qmd +++ /dev/null @@ -1,113 +0,0 @@ ---- -pagetitle: "Examples: Validation with Final Actions" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Validation with Final Actions - -Execute actions after validation completes, such as sending alerts or generating summary reports. - -```{python} -#| echo: false - -import pointblank as pb - -def send_alert(): - """Check validation summary and send alert if critical failures found""" - summary = pb.get_validation_summary() - if summary and summary.get("highest_severity") == "critical": - print(f"🚨 ALERT: Critical validation failures found!") - print(f" Failed steps: {summary['n_failing_steps']}") - elif summary and summary.get("highest_severity") == "error": - print(f"⚠️ WARNING: Error-level validation failures detected.") - else: - print("✅ All validation checks passed successfully!") - -def generate_summary_report(): - """Generate a summary report of validation results""" - summary = pb.get_validation_summary() - if summary: - print("\n--- Validation Summary Report ---") - print(f"Total validation steps: {summary['n_steps']}") - print(f"Passed steps: {summary['n_passing_steps']}") - print(f"Failed steps: {summary['n_failing_steps']}") - print(f"Highest severity: {summary['highest_severity']}") - print("--- End of Report ---") - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - label="Validation with final actions", - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - final_actions=pb.FinalActions( - "Validation workflow completed.", # String message - send_alert, # Alert function - generate_summary_report # Report function - ) - ) - .col_vals_regex(columns="player_id", 
pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -def send_alert(): - """Check validation summary and send alert if critical failures found""" - summary = pb.get_validation_summary() - if summary and summary.get("highest_severity") == "critical": - print(f"🚨 ALERT: Critical validation failures found!") - print(f" Failed steps: {summary['n_failing_steps']}") - elif summary and summary.get("highest_severity") == "error": - print(f"⚠️ WARNING: Error-level validation failures detected.") - else: - print("✅ All validation checks passed successfully!") - -def generate_summary_report(): - """Generate a summary report of validation results""" - summary = pb.get_validation_summary() - if summary: - print("\n--- Validation Summary Report ---") - print(f"Total validation steps: {summary['n_steps']}") - print(f"Passed steps: {summary['n_passing_steps']}") - print(f"Failed steps: {summary['n_failing_steps']}") - print(f"Highest severity: {summary['highest_severity']}") - print("--- End of Report ---") - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - label="Validation with final actions", - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - final_actions=pb.FinalActions( - "Validation workflow completed.", # String message - send_alert, # Alert function - generate_summary_report # Report function - ) - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue", tbl_type="polars")) -``` - -
diff --git a/docs/demos/apply-checks-to-several-columns/index.qmd b/docs/demos/apply-checks-to-several-columns/index.qmd deleted file mode 100644 index 890fae44b2..0000000000 --- a/docs/demos/apply-checks-to-several-columns/index.qmd +++ /dev/null @@ -1,53 +0,0 @@ ---- -pagetitle: "Examples: Apply Validation Rules to Multiple Columns" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Apply Validation Rules to Multiple Columns - -Create multiple validation steps by using a list of column names with `columns=`. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_ge(columns=["a", "c", "d"], value=0) # check values in 'a', 'c', and 'd' - .col_exists(columns=["date_time", "date"]) # check for the existence of two columns - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_ge(columns=["a", "c", "d"], value=0) # check values in 'a', 'c', and 'd' - .col_exists(columns=["date_time", "date"]) # check for the existence of two columns - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/check-for-freshness/index.qmd b/docs/demos/check-for-freshness/index.qmd deleted file mode 100644 index b29012f1bc..0000000000 --- a/docs/demos/check-for-freshness/index.qmd +++ /dev/null @@ -1,125 +0,0 @@ ---- -pagetitle: "Examples: Validating Data Freshness" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Validating Data Freshness - -Use date/datetime-based validations to ensure your data is current and recent. This is critical for applications that depend on timely data updates. - -```{python} -#| echo: false - -import pointblank as pb -import polars as pl -from datetime import date, datetime, timedelta - -# Create sample data with mixed freshness levels -freshness_data = pl.DataFrame({ - "data_timestamp": [ - datetime(2023, 12, 28, 10, 30), # 3 days ago from Dec 31 - datetime(2023, 12, 29, 14, 15), # 2 days ago - datetime(2023, 12, 30, 9, 45), # 1 day ago - datetime(2023, 12, 31, 16, 20), # Today - ], - "sensor_id": ["TEMP_01", "TEMP_02", "TEMP_01", "TEMP_03"], - "reading": [22.5, 21.8, 23.1, 22.9], - "quality_score": [0.95, 0.88, 0.92, 0.97] -}) - -# Assuming today is 2023-12-31, check for data freshness -current_date = date(2023, 12, 31) -freshness_cutoff = current_date - timedelta(days=2) # Data should be within 2 days - -validation = ( - pb.Validate(freshness_data) - .specially( - expr=lambda df: df.filter( - pl.col("data_timestamp").dt.date() >= freshness_cutoff - ).height > 0, - brief=f"Recent data available (within 2 days of {current_date})" - ) - .col_vals_ge( - columns="data_timestamp", - value=current_date - timedelta(days=7), # Within last week - brief="All data points are from the last week" - ) - .specially( - expr=lambda df: ( - df.select(pl.col("data_timestamp").max()).item().date() >= current_date - ), - brief="Most recent data is from today" - ) - .col_vals_not_null( - columns="data_timestamp", - brief="No missing timestamps" - ) - .interrogate() -) - -validation -``` - 
-```python -import pointblank as pb -import polars as pl -from datetime import date, datetime, timedelta - -# Create sample data with mixed freshness levels -freshness_data = pl.DataFrame({ - "data_timestamp": [ - datetime(2023, 12, 28, 10, 30), # 3 days ago from Dec 31 - datetime(2023, 12, 29, 14, 15), # 2 days ago - datetime(2023, 12, 30, 9, 45), # 1 day ago - datetime(2023, 12, 31, 16, 20), # Today - ], - "sensor_id": ["TEMP_01", "TEMP_02", "TEMP_01", "TEMP_03"], - "reading": [22.5, 21.8, 23.1, 22.9], - "quality_score": [0.95, 0.88, 0.92, 0.97] -}) - -# Assuming today is 2023-12-31, check for data freshness -current_date = date(2023, 12, 31) -freshness_cutoff = current_date - timedelta(days=2) # Data should be within 2 days - -validation = ( - pb.Validate(freshness_data) - .specially( - expr=lambda df: df.filter( - pl.col("data_timestamp").dt.date() >= freshness_cutoff - ).height > 0, - brief=f"Recent data available (within 2 days of {current_date})" - ) - .col_vals_ge( - columns="data_timestamp", - value=current_date - timedelta(days=7), # Within last week - brief="All data points are from the last week" - ) - .specially( - expr=lambda df: ( - df.select(pl.col("data_timestamp").max()).item().date() >= current_date - ), - brief="Most recent data is from today" - ) - .col_vals_not_null( - columns="data_timestamp", - brief="No missing timestamps" - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(freshness_data) -``` - -
diff --git a/docs/demos/check-row-column-counts/index.qmd b/docs/demos/check-row-column-counts/index.qmd deleted file mode 100644 index 1d624302d2..0000000000 --- a/docs/demos/check-row-column-counts/index.qmd +++ /dev/null @@ -1,65 +0,0 @@ ---- -pagetitle: "Examples: Verifying Row and Column Counts" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Verifying Row and Column Counts - -Check the dimensions of the table with the `*_count_match()` validation methods. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") - ) - .col_count_match(count=11) # expect 11 columns in the table - .row_count_match(count=2000) # expect 2,000 rows in the table - .row_count_match(count=0, inverse=True) # expect that the table has rows - .col_count_match( # compare column count against - count=pb.load_dataset( # that of another table - dataset="game_revenue", tbl_type="pandas" - ) - ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") - ) - .col_count_match(count=11) # expect 11 columns in the table - .row_count_match(count=2000) # expect 2,000 rows in the table - .row_count_match(count=0, inverse=True) # expect that the table has rows - .col_count_match( # compare column count against - count=pb.load_dataset( # that of another table - dataset="game_revenue", tbl_type="pandas" - ) - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue", tbl_type="duckdb")) -``` - -
diff --git a/docs/demos/checks-for-missing/index.qmd b/docs/demos/checks-for-missing/index.qmd deleted file mode 100644 index c418236741..0000000000 --- a/docs/demos/checks-for-missing/index.qmd +++ /dev/null @@ -1,59 +0,0 @@ ---- -pagetitle: "Examples: Checks for Missing Values" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Checks for Missing Values - -Perform validations that check whether missing/NA/Null values are present. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_not_null(columns="a") # expect no Null values - .col_vals_not_null(columns="b") # "" "" - .col_vals_not_null(columns="c") # "" "" - .col_vals_not_null(columns="d") # "" "" - .col_vals_null(columns="a") # expect all values to be Null - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_not_null(columns="a") # expect no Null values - .col_vals_not_null(columns="b") # "" "" - .col_vals_not_null(columns="c") # "" "" - .col_vals_not_null(columns="d") # "" "" - .col_vals_null(columns="a") # expect all values to be Null - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/cli-interactive/index.qmd b/docs/demos/cli-interactive/index.qmd deleted file mode 100644 index a04fed36c1..0000000000 --- a/docs/demos/cli-interactive/index.qmd +++ /dev/null @@ -1,147 +0,0 @@ ---- -title: "CLI Interactive Demos" -pagetitle: "CLI Interactive Demos" -format: - html: - toc: true - css: | - .demo-section { - margin-bottom: 40px; - } - .gif-container { - margin: 20px 0; - text-align: center; - border: 1px solid #e1e5e9; - border-radius: 8px; - padding: 20px; - background: #f8f9fa; - } - .gif-container img { - max-width: 100%; - border-radius: 4px; - box-shadow: 0 4px 8px rgba(0,0,0,0.1); - } ---- - -These CLI demos showcase practical data quality workflows that you can use! - -::: {.callout-tip} -## 🎬 Workflow-Based Demonstrations -- **Essential validations** for everyday data quality checks -- **Data exploration** tools that require no Python knowledge -- **CI/CD integration** patterns for automated data quality -- **Complete pipelines** from exploration to production validation -::: - -::: {.callout-note} -## Prerequisites -To follow along with these demonstrations: -```bash -pip install pointblank -pb --help # Verify installation -``` -::: - -### Getting Started with the CLI - -Learn the basics of Pointblank's CLI and run your first validation: - -::: {.gif-container} -![Getting Started](../../assets/vhs/cli-getting-started.gif) -*CLI overview and your first data quality validation* -::: - -### Essential Data Quality Validations - -See the most commonly used validation checks that catch critical data issues: - -::: {.gif-container} -![Essential Validations](../../assets/vhs/cli-essential-validations.gif) -*Duplicate detection, null checks, and data extract debugging* -::: - -### Data Exploration Tools - -Discover how to profile and explore data using CLI tools that are quick and easy to use: - -::: {.gif-container} -![Data Exploration](../../assets/vhs/cli-data-exploration.gif) -*Preview data, find missing values, and 
generate column summaries* -::: - -### Using Polars - -You can use Polars in the CLI to load and transform data, and, pass the data to other CLI tools: - -::: {.gif-container} -![Using Polars](../../assets/vhs/cli-using-polars.gif) -*Preview data, find missing values, and generate column summaries* -::: - -### CI/CD Integration & Automation - -Learn how to integrate data quality checks into automated pipelines: - -::: {.gif-container} -![CI/CD Integration](../../assets/vhs/cli-cicd-workflows.gif) -*Exit codes, pipeline integration, and automated quality gates* -::: - -### Complete Data Quality Workflow - -Follow an end-to-end data quality pipeline combining exploration, validation, and profiling: - -::: {.gif-container} -![Complete Workflow](../../assets/vhs/cli-complete-workflow.gif) -*Full pipeline: explore → validate → automate* -::: - -## Getting Started - -Ready to implement data quality workflows? Here's how to get started: - -#### 1. Install and Verify - -```bash -pip install pointblank -pb --help -``` - -#### 2. Explore Various Data Sources - -```bash -# Try previewing a built-in dataset -pb preview small_table - -# Access local files (even use patterns to combine multiple Parquet files) -pb preview sales_data.csv -pb scan "data/*.parquet" - -# Inspect datasets in GitHub repositories (no need to download the data!) -pb preview "https://github.com/user/repo/blob/main/data.csv" -pb missing "https://raw.githubusercontent.com/user/repo/main/sales.parquet" - -# Work with DB tables through connection strings -pb info "duckdb:///warehouse/analytics.ddb::customers" -``` - -#### 3. 
Run Essential Validations - -```bash -# Check for duplicate rows -pb validate small_table --check rows-distinct - -# Validate data from multiple sources -pb validate "data/*.parquet" --check col-vals-not-null --column customer_id -pb validate "https://github.com/user/repo/blob/main/sales.csv" --check rows-distinct - -# Extract failing data for debugging -pb validate small_table --check col-vals-gt --column a --value 5 --show-extract -``` - -#### 4. Integrate with CI/CD - -```bash -# Use exit codes for automation (0 = pass, 1 = fail) -pb validate small_table --check rows-distinct --exit-code -``` diff --git a/docs/demos/col-vals-custom-expr/index.qmd b/docs/demos/col-vals-custom-expr/index.qmd deleted file mode 100644 index 3f37a20852..0000000000 --- a/docs/demos/col-vals-custom-expr/index.qmd +++ /dev/null @@ -1,51 +0,0 @@ ---- -pagetitle: "Examples: Custom Expression for Checking Column Values" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Custom Expression for Checking Column Values - -A column expression can be used to check column values. Just use `col_vals_expr()` for this. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas") - ) - .col_vals_expr(expr=lambda df: (df["d"] % 1 != 0) & (df["a"] < 10)) # Pandas column expr - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas") - ) - .col_vals_expr(expr=lambda df: (df["d"] % 1 != 0) & (df["a"] < 10)) # Pandas column expr - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="pandas"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/column-selector-functions/index.qmd b/docs/demos/column-selector-functions/index.qmd deleted file mode 100644 index 34cff0f93e..0000000000 --- a/docs/demos/column-selector-functions/index.qmd +++ /dev/null @@ -1,81 +0,0 @@ ---- -pagetitle: "Examples: Column Selector Functions: Easily Pick Columns" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Column Selector Functions: Easily Pick Columns - -Use column selector functions in the `columns=` argument to conveniently choose columns. - -```{python} -#| echo: false - -import pointblank as pb -import narwhals.selectors as ncs - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") - ) - .col_vals_ge( - columns=pb.matches("rev|dur"), # check values in columns having 'rev' or 'dur' in name - value=0 - ) - .col_vals_regex( - columns=pb.ends_with("_id"), # check values in columns with names ending in '_id' - pattern=r"^[A-Z]{12}\d{3}" - ) - .col_vals_not_null( - columns=pb.last_n(2) # check that the last two columns don't have Null values - ) - .col_vals_regex( - columns=ncs.string(), # check that all string columns are non-empty strings - pattern=r"(.|\s)*\S(.|\s)*" - ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import narwhals.selectors as ncs - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") - ) - .col_vals_ge( - columns=pb.matches("rev|dur"), # check values in columns having 'rev' or 'dur' in name - value=0 - ) - .col_vals_regex( - columns=pb.ends_with("_id"), # check values in columns with names ending in '_id' - pattern=r"^[A-Z]{12}\d{3}" - ) - .col_vals_not_null( - columns=pb.last_n(2) # check that the last two columns don't have Null values - ) - .col_vals_regex( - columns=ncs.string(), # check that all string columns are non-empty strings - pattern=r"(.|\s)*\S(.|\s)*" - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue", tbl_type="polars")) -``` - -
diff --git a/docs/demos/comparisons-across-columns/index.qmd b/docs/demos/comparisons-across-columns/index.qmd deleted file mode 100644 index 62b4b513f7..0000000000 --- a/docs/demos/comparisons-across-columns/index.qmd +++ /dev/null @@ -1,63 +0,0 @@ ---- -pagetitle: "Examples: Comparison Checks Across Columns" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Comparison Checks Across Columns - -Perform comparisons of values in columns to values in other columns. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_lt(columns="a", value=pb.col("c")) # values in 'a' > values in 'c' - .col_vals_between( - columns="d", # values in 'd' are between values - left=pb.col("c"), # in 'c' and the fixed value of 12,000; - right=12000, # any missing values encountered result - na_pass=True # in a passing test unit - ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_lt(columns="a", value=pb.col("c")) # values in 'a' > values in 'c' - .col_vals_between( - columns="d", # values in 'd' are between values - left=pb.col("c"), # in 'c' and the fixed value of 12,000; - right=12000, # any missing values encountered result - na_pass=True # in a passing test unit - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/custom-validation-specially/index.qmd b/docs/demos/custom-validation-specially/index.qmd deleted file mode 100644 index 87faaa196f..0000000000 --- a/docs/demos/custom-validation-specially/index.qmd +++ /dev/null @@ -1,93 +0,0 @@ ---- -pagetitle: "Examples: Custom Validation with `specially()`" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Custom Validation with `specially()` - -Create bespoke validations using `specially()` to implement domain-specific business rules. - -```{python} -#| echo: false - -import pointblank as pb -import polars as pl - -def within_std_deviations(df, column, n_std=2): - """Check if all values are within n standard deviations of the mean""" - mean_val = df[column].mean() - std_val = df[column].std() - - lower_bound = mean_val - (n_std * std_val) - upper_bound = mean_val + (n_std * std_val) - - # Add a boolean column and return the modified DataFrame - return df.with_columns( - pl.col(column).is_between(lower_bound, upper_bound, closed="both").alias("validation_result") - ) - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") - ) - .specially( - expr=lambda df: within_std_deviations(df, column="session_duration", n_std=2), - brief="All values in column 'a' should be within 2 std devs of mean" - ) - .specially( - expr=lambda df: within_std_deviations(df, column="session_duration", n_std=3), - brief="All values in column 'c' should be within 3 std devs of mean" - ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import polars as pl - -def within_std_deviations(df, column, n_std=2): - """Check if all values are within n standard deviations of the mean""" - mean_val = df[column].mean() - std_val = df[column].std() - - lower_bound = mean_val - (n_std * std_val) - upper_bound = mean_val + (n_std * std_val) - - # Add a boolean column and return the modified DataFrame - return df.with_columns( - 
pl.col(column).is_between(lower_bound, upper_bound, closed="both").alias("validation_result") - ) - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") - ) - .specially( - expr=lambda df: within_std_deviations(df, column="session_duration", n_std=2), - brief="All values in column 'a' should be within 2 std devs of mean" - ) - .specially( - expr=lambda df: within_std_deviations(df, column="session_duration", n_std=3), - brief="All values in column 'c' should be within 3 std devs of mean" - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue", tbl_type="polars")) -``` - -
diff --git a/docs/demos/data/game_revenue.parquet b/docs/demos/data/game_revenue.parquet deleted file mode 100644 index 56c2bf60b8..0000000000 Binary files a/docs/demos/data/game_revenue.parquet and /dev/null differ diff --git a/docs/demos/datetime-validations/index.qmd b/docs/demos/datetime-validations/index.qmd deleted file mode 100644 index 7a7d12dd74..0000000000 --- a/docs/demos/datetime-validations/index.qmd +++ /dev/null @@ -1,155 +0,0 @@ ---- -pagetitle: "Examples: Date and Datetime Validations" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Date and Datetime Validations - -**pointblank** provides comprehensive support for validating date and datetime values, including timezone-aware comparisons. This ensures temporal data quality in applications that handle time-sensitive information. - -```{python} -#| echo: false - -import pointblank as pb -import polars as pl -from datetime import date, datetime -from zoneinfo import ZoneInfo - -# Create sample data with various temporal data types -temporal_data = pl.DataFrame({ - "order_date": [ - date(2023, 1, 15), - date(2023, 6, 10), - date(2023, 12, 5), - date(2024, 3, 20) - ], - "created_at": [ - datetime(2023, 1, 15, 9, 30, 0), - datetime(2023, 6, 10, 14, 45, 30), - datetime(2023, 12, 5, 8, 15, 0), - datetime(2024, 3, 20, 17, 22, 45) - ], - "event_time_tz": [ - datetime(2023, 1, 15, 9, 0, tzinfo=ZoneInfo("America/New_York")), - datetime(2023, 6, 10, 12, 30, tzinfo=ZoneInfo("America/New_York")), - datetime(2023, 12, 5, 15, 45, tzinfo=ZoneInfo("America/New_York")), - datetime(2024, 3, 20, 18, 15, tzinfo=ZoneInfo("America/New_York")) - ], - "order_id": [1001, 1002, 1003, 1004], - "amount": [150.0, 275.5, 89.99, 420.00] -}) - -validation = ( - pb.Validate(temporal_data) - .col_vals_ge( - columns="order_date", - value=date(2023, 1, 1), - brief="Orders are from 2023 or later" - ) - .col_vals_between( - columns="created_at", - left=datetime(2023, 1, 1, 0, 0, 0), - 
right=datetime(2024, 12, 31, 23, 59, 59), - brief="Creation timestamps within expected range" - ) - .col_vals_ge( - columns="event_time_tz", - value=datetime(2023, 1, 1, 8, 0, tzinfo=ZoneInfo("America/New_York")), - brief="Timezone-aware events after 8 AM Eastern" - ) - .col_schema_match( - pb.Schema( - columns=[ - ("order_date", "Date"), - ("created_at", "Datetime(time_unit='us', time_zone=None)"), - ("event_time_tz", "Datetime(time_unit='us', time_zone='America/New_York')"), - ("order_id", "Int64"), - ("amount", "Float64") - ] - ), - brief="Schema includes proper date/datetime types" - ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import polars as pl -from datetime import date, datetime -from zoneinfo import ZoneInfo - -# Create sample data with various temporal data types -temporal_data = pl.DataFrame({ - "order_date": [ - date(2023, 1, 15), - date(2023, 6, 10), - date(2023, 12, 5), - date(2024, 3, 20) - ], - "created_at": [ - datetime(2023, 1, 15, 9, 30, 0), - datetime(2023, 6, 10, 14, 45, 30), - datetime(2023, 12, 5, 8, 15, 0), - datetime(2024, 3, 20, 17, 22, 45) - ], - "event_time_tz": [ - datetime(2023, 1, 15, 9, 0, tzinfo=ZoneInfo("America/New_York")), - datetime(2023, 6, 10, 12, 30, tzinfo=ZoneInfo("America/New_York")), - datetime(2023, 12, 5, 15, 45, tzinfo=ZoneInfo("America/New_York")), - datetime(2024, 3, 20, 18, 15, tzinfo=ZoneInfo("America/New_York")) - ], - "order_id": [1001, 1002, 1003, 1004], - "amount": [150.0, 275.5, 89.99, 420.00] -}) - -validation = ( - pb.Validate(temporal_data) - .col_vals_ge( - columns="order_date", - value=date(2023, 1, 1), - brief="Orders are from 2023 or later" - ) - .col_vals_between( - columns="created_at", - left=datetime(2023, 1, 1, 0, 0, 0), - right=datetime(2024, 12, 31, 23, 59, 59), - brief="Creation timestamps within expected range" - ) - .col_vals_ge( - columns="event_time_tz", - value=datetime(2023, 1, 1, 8, 0, tzinfo=ZoneInfo("America/New_York")), - brief="Timezone-aware events 
after 8 AM Eastern" - ) - .col_schema_match( - pb.Schema( - columns=[ - ("order_date", "Date"), - ("created_at", "Datetime(time_unit='us', time_zone=None)"), - ("event_time_tz", "Datetime(time_unit='us', time_zone='America/New_York')"), - ("order_id", "Int64"), - ("amount", "Float64") - ] - ), - brief="Schema includes proper date/datetime types" - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(temporal_data) -``` - -
diff --git a/docs/demos/expect-no-duplicate-rows/index.qmd b/docs/demos/expect-no-duplicate-rows/index.qmd deleted file mode 100644 index 1de23b8fb9..0000000000 --- a/docs/demos/expect-no-duplicate-rows/index.qmd +++ /dev/null @@ -1,51 +0,0 @@ ---- -pagetitle: "Examples: Expect No Duplicate Rows" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Expect No Duplicate Rows - -We can check for duplicate rows in the table with `rows_distinct()`. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .rows_distinct() # expect no duplicate rows - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .rows_distinct() # expect no duplicate rows - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/expect-no-duplicate-values/index.qmd b/docs/demos/expect-no-duplicate-values/index.qmd deleted file mode 100644 index 8d611adf0c..0000000000 --- a/docs/demos/expect-no-duplicate-values/index.qmd +++ /dev/null @@ -1,51 +0,0 @@ ---- -pagetitle: "Examples: Checking for Duplicate Values" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Checking for Duplicate Values - -To check for duplicate values down a column, use `rows_distinct()` with a `columns_subset=` value. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .rows_distinct(columns_subset="b") # expect no duplicate values in 'b' - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .rows_distinct(columns_subset="b") # expect no duplicate values in 'b' - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/expect-text-pattern/index.qmd b/docs/demos/expect-text-pattern/index.qmd deleted file mode 100644 index 54c9f9048a..0000000000 --- a/docs/demos/expect-text-pattern/index.qmd +++ /dev/null @@ -1,53 +0,0 @@ ---- -pagetitle: "Examples: Expectations with a Text Pattern" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Expectations with a Text Pattern - -With the `col_vals_regex()`, check for conformance to a regular expression. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_regex(columns="b", pattern=r"^\d-[a-z]{3}-\d{3}$") # check pattern in 'b' - .col_vals_regex(columns="f", pattern=r"high|low|mid") # check pattern in 'f' - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_regex(columns="b", pattern=r"^\d-[a-z]{3}-\d{3}$") # check pattern in 'b' - .col_vals_regex(columns="f", pattern=r"high|low|mid") # check pattern in 'f' - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/failure-thresholds/index.qmd b/docs/demos/failure-thresholds/index.qmd deleted file mode 100644 index 500e4b5add..0000000000 --- a/docs/demos/failure-thresholds/index.qmd +++ /dev/null @@ -1,77 +0,0 @@ ---- -pagetitle: "Examples: Set Failure Threshold Levels" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Set Failure Threshold Levels - -Set threshold levels to better gauge adverse data quality. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds( # setting relative threshold defaults for all steps - warning=0.05, # 5% failing test units: warning threshold (gray) - error=0.10, # 10% failed test units: error threshold (yellow) - critical=0.15 # 15% failed test units: critical threshold (red) - ), - ) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", - value=4, - thresholds=(5, 10, 20) # setting absolute thresholds for *this* step (W, E, C) - ) - .col_exists(columns="end_day") - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds( # setting relative threshold defaults for all steps - warning=0.05, # 5% failing test units: warning threshold (gray) - error=0.10, # 10% failed test units: error threshold (yellow) - critical=0.15 # 15% failed test units: critical threshold (red) - ), - ) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", - value=4, - thresholds=(5, 10, 20) # setting 
absolute thresholds for *this* step (W, E, C) - ) - .col_exists(columns="end_day") - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="game_revenue", tbl_type="duckdb")) -``` - -
diff --git a/docs/demos/img/advanced_validation.png b/docs/demos/img/advanced_validation.png deleted file mode 100644 index 76c0e90e5a..0000000000 Binary files a/docs/demos/img/advanced_validation.png and /dev/null differ diff --git a/docs/demos/img/data_extracts.png b/docs/demos/img/data_extracts.png deleted file mode 100644 index 32d70c391c..0000000000 Binary files a/docs/demos/img/data_extracts.png and /dev/null differ diff --git a/docs/demos/img/starter_validation.png b/docs/demos/img/starter_validation.png deleted file mode 100644 index 0fad6b0389..0000000000 Binary files a/docs/demos/img/starter_validation.png and /dev/null differ diff --git a/docs/demos/img/step_report_column_schema.png b/docs/demos/img/step_report_column_schema.png deleted file mode 100644 index 9a6f72092b..0000000000 Binary files a/docs/demos/img/step_report_column_schema.png and /dev/null differ diff --git a/docs/demos/img/step_report_column_values.png b/docs/demos/img/step_report_column_values.png deleted file mode 100644 index 1d89f0197c..0000000000 Binary files a/docs/demos/img/step_report_column_values.png and /dev/null differ diff --git a/docs/demos/img/sundered_data.png b/docs/demos/img/sundered_data.png deleted file mode 100644 index 61c7927675..0000000000 Binary files a/docs/demos/img/sundered_data.png and /dev/null differ diff --git a/docs/demos/img/validation_with_actions.png b/docs/demos/img/validation_with_actions.png deleted file mode 100644 index 65fee603e2..0000000000 Binary files a/docs/demos/img/validation_with_actions.png and /dev/null differ diff --git a/docs/demos/img/validation_with_final_actions.png b/docs/demos/img/validation_with_final_actions.png deleted file mode 100644 index ac1d05b025..0000000000 Binary files a/docs/demos/img/validation_with_final_actions.png and /dev/null differ diff --git a/docs/demos/index.qmd b/docs/demos/index.qmd deleted file mode 100644 index 923d8678cc..0000000000 --- a/docs/demos/index.qmd +++ /dev/null @@ -1,180 +0,0 @@ ---- 
-pagetitle: Examples -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - - - -

A Selection of Examples

- -:::::: {.column-page} - - -::::: {.grid} -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Starter Validation](./01-starter/index.qmd) -
- -

A validation with the basics.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Advanced Validation](./02-advanced/index.qmd) -
- -

A validation with a comprehensive set of rules.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Data Extracts](./03-data-extracts/index.qmd) -
- -

Pulling out data extracts that highlight rows with validation failures.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Sundered Data](./04-sundered-data/index.qmd) -
- -

Splitting your data into 'pass' and 'fail' subsets.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Step Reports for Column Data Checks](./05-step-report-column-check/index.qmd) -
- -

A step report for column checks shows what went wrong.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Step Report for a Schema Check](./06-step-report-schema-check/index.qmd) -
- -

When a schema doesn't match, a step report gives you the details.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Step-Level Actions](./07-validation-with-actions/index.qmd) -
- -

Configure actions to trigger when validation thresholds are exceeded, such as logging warnings or errors.

-
-::: - -:::{.g-col-lg-6 .g-col-12 .example} -
-
-[Final Actions](./08-validation-with-final-actions/index.qmd) -
- -

Execute actions after validation completes, such as sending alerts or generating summary reports.

-
-::: - -::::: - -:::::: - -
- -
- -[Numeric Comparisons](./numeric-comparisons/index.qmd)
-Perform comparisons of values in columns to fixed values. - -[Comparison Checks Across Columns](./comparisons-across-columns/index.qmd)
-Perform comparisons of values in columns to values in other columns. - -[Apply Validation Rules to Multiple Columns](./apply-checks-to-several-columns/index.qmd)
-Create multiple validation steps by using a list of column names with `columns=`. - -[Checks for Missing Values](./checks-for-missing/index.qmd)
-Perform validations that check whether missing/NA/Null values are present. - -[Expectations with a Text Pattern](./expect-text-pattern/index.qmd)
-With `col_vals_regex()`, check for conformance to a regular expression. - -[Set Membership](./set-membership/index.qmd)
-Perform validations that check whether values are part of a set (or *not* part of one). - -[Expect No Duplicate Rows](./expect-no-duplicate-rows/index.qmd)
-We can check for duplicate rows in the table with `rows_distinct()`. - -[Checking for Duplicate Values](./expect-no-duplicate-values/index.qmd)
-To check for duplicate values down a column, use `rows_distinct()` with a `columns_subset=` value. - -[Custom Expression for Checking Column Values](./col-vals-custom-expr/index.qmd)
-A column expression can be used to check column values. Just use `col_vals_expr()` for this. - -[Mutate the Table in a Validation Step](./mutate-table-in-step/index.qmd)
-For far more specialized validations, modify the table with the `pre=` argument before checking it. - -[Verifying Row and Column Counts](./check-row-column-counts/index.qmd)
-Check the dimensions of the table with the `*_count_match()` validation methods. - -[Validating Data Freshness](./check-for-freshness/index.qmd)
-Use date-based validations to ensure your data is current and recent. - -[Date and Datetime Validations](./datetime-validations/index.qmd)
-Comprehensive examples of date, datetime, and timezone-aware datetime comparisons. - -[Custom Validation with `specially()`](./custom-validation-specially/index.qmd)
-Create bespoke validations using `specially()` to implement domain-specific business rules. - -[Set Failure Threshold Levels](./failure-thresholds/index.qmd)
-Set threshold levels to better gauge adverse data quality. - -[Column Selector Functions: Easily Pick Columns](./column-selector-functions/index.qmd)
-Use column selector functions in the `columns=` argument to conveniently choose columns. - -[Check the Schema of a Table](./schema-check/index.qmd)
-The schema of a table can be flexibly defined with `Schema` and verified with `col_schema_match()`. - -[Using Parquet Data](./using-parquet-data/index.qmd)
-A Parquet dataset can be used for data validation, thanks to Ibis. - -[CLI Interactive Demos](./cli-interactive/index.qmd)
-These CLI demos showcase practical data quality workflows that you can use! - -
diff --git a/docs/demos/mutate-table-in-step/index.qmd b/docs/demos/mutate-table-in-step/index.qmd deleted file mode 100644 index 66b9c3a5aa..0000000000 --- a/docs/demos/mutate-table-in-step/index.qmd +++ /dev/null @@ -1,97 +0,0 @@ ---- -pagetitle: "Examples: Mutate the Table in a Validation Step" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Mutate the Table in a Validation Step - -For far more specialized validations, modify the table with the `pre=` argument before checking it. - -```{python} -#| echo: false - -import pointblank as pb -import polars as pl -import narwhals as nw - -# Define preprocessing functions -def get_median_a(df): - """Use a Polars expression to aggregate column `a`.""" - return df.select(pl.median("a")) - -def add_b_length_column(df): - """Use Narwhals to add a string length column `b_len`.""" - return ( - nw.from_native(df) - .with_columns(b_len=nw.col("b").str.len_chars()) - ) - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_between( - columns="a", - left=3, right=6, - pre=get_median_a - ) - .col_vals_eq( - columns="b_len", - value=9, - pre=add_b_length_column - ) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import polars as pl -import narwhals as nw - -# Define preprocessing functions -def get_median_a(df): - """Use a Polars expression to aggregate column `a`.""" - return df.select(pl.median("a")) - -def add_b_length_column(df): - """Use Narwhals to add a string length column `b_len`.""" - return ( - nw.from_native(df) - .with_columns(b_len=nw.col("b").str.len_chars()) - ) - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_between( - columns="a", - left=3, right=6, - pre=get_median_a - ) - .col_vals_eq( - columns="b_len", - value=9, - pre=add_b_length_column - ) - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/numeric-comparisons/index.qmd b/docs/demos/numeric-comparisons/index.qmd deleted file mode 100644 index bd38dfbacc..0000000000 --- a/docs/demos/numeric-comparisons/index.qmd +++ /dev/null @@ -1,61 +0,0 @@ ---- -pagetitle: "Examples: Numeric Comparisons" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Numeric Comparisons - -Perform comparisons of values in columns to fixed values. - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_gt(columns="d", value=1000) # values in 'd' > 1000 - .col_vals_lt(columns="d", value=10000) # values in 'd' < 10000 - .col_vals_ge(columns="a", value=1) # values in 'a' >= 1 - .col_vals_le(columns="c", value=5) # values in 'c' <= 5 - .col_vals_ne(columns="a", value=7) # values in 'a' not equal to 7 - .col_vals_between(columns="c", left=0, right=15) # 0 <= 'c' values <= 15 - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_gt(columns="d", value=1000) # values in 'd' > 1000 - .col_vals_lt(columns="d", value=10000) # values in 'd' < 10000 - .col_vals_ge(columns="a", value=1) # values in 'a' >= 1 - .col_vals_le(columns="c", value=5) # values in 'c' <= 5 - .col_vals_ne(columns="a", value=7) # values in 'a' not equal to 7 - .col_vals_between(columns="c", left=0, right=15) # 0 <= 'c' values <= 15 - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/schema-check/index.qmd b/docs/demos/schema-check/index.qmd deleted file mode 100644 index c2470f537d..0000000000 --- a/docs/demos/schema-check/index.qmd +++ /dev/null @@ -1,75 +0,0 @@ ---- -pagetitle: "Examples: Check the Schema of a Table" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Check the Schema of a Table - -The schema of a table can be flexibly defined with `Schema` and verified with `col_schema_match()`. - -```{python} -#| echo: false - -import pointblank as pb -import polars as pl - -tbl = pl.DataFrame( - { - "a": ["apple", "banana", "cherry", "date"], - "b": [1, 6, 3, 5], - "c": [1.1, 2.2, 3.3, 4.4], - } -) - -# Use the Schema class to define the column schema as loosely or rigorously as required -schema = pb.Schema( - columns=[ - ("a", "String"), # Column 'a' has dtype 'String' - ("b", ["Int", "Int64"]), # Column 'b' has dtype 'Int' or 'Int64' - ("c", ) # Column 'c' follows 'b' but we don't specify a dtype here - ] -) - -# Use the `col_schema_match()` validation method to perform the schema check -validation = ( - pb.Validate(data=tbl) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import polars as pl - -tbl = pl.DataFrame( - { - "a": ["apple", "banana", "cherry", "date"], - "b": [1, 6, 3, 5], - "c": [1.1, 2.2, 3.3, 4.4], - } -) - -# Use the Schema class to define the column schema as loosely or rigorously as required -schema = pb.Schema( - columns=[ - ("a", "String"), # Column 'a' has dtype 'String' - ("b", ["Int", "Int64"]), # Column 'b' has dtype 'Int' or 'Int64' - ("c", ) # Column 'c' follows 'b' but we don't specify a dtype here - ] -) - -# Use the `col_schema_match()` validation method to perform the schema check -validation = ( - pb.Validate(data=tbl) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` diff --git a/docs/demos/set-membership/index.qmd 
b/docs/demos/set-membership/index.qmd deleted file mode 100644 index 6af556b868..0000000000 --- a/docs/demos/set-membership/index.qmd +++ /dev/null @@ -1,53 +0,0 @@ ---- -pagetitle: "Examples: Set Membership" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Set Membership - -Perform validations that check whether values are part of a set (or *not* part of one). - -```{python} -#| echo: false - -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) # part of this set - .col_vals_not_in_set(columns="f", set=["zero", "infinity"]) # not part of this set - .interrogate() -) - -validation -``` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) # part of this set - .col_vals_not_in_set(columns="f", set=["zero", "infinity"]) # not part of this set - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -
diff --git a/docs/demos/using-parquet-data/index.qmd b/docs/demos/using-parquet-data/index.qmd deleted file mode 100644 index 1c7922d434..0000000000 --- a/docs/demos/using-parquet-data/index.qmd +++ /dev/null @@ -1,61 +0,0 @@ ---- -pagetitle: "Examples: Using Parquet Data" -notebook-links: false -page-navigation: false -toc: false -html-table-processing: none ---- - -### Using Parquet Data - -A Parquet dataset can be used for data validation, thanks to Ibis. - -```{python} -# | echo: false - -import pointblank as pb -import ibis - -game_revenue = ibis.read_parquet("../data/game_revenue.parquet") - -validation = ( - pb.Validate(data=game_revenue, label="Example using a Parquet dataset.") - .col_vals_lt(columns="item_revenue", value=200) - .col_vals_gt(columns="item_revenue", value=0) - .col_vals_gt(columns="session_duration", value=5) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .interrogate() -) - -validation -``` - -```python -import pointblank as pb -import ibis - -game_revenue = ibis.read_parquet("data/game_revenue.parquet") - -validation = ( - pb.Validate(data=game_revenue, label="Example using a Parquet dataset.") - .col_vals_lt(columns="item_revenue", value=200) - .col_vals_gt(columns="item_revenue", value=0) - .col_vals_gt(columns="session_duration", value=5) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .interrogate() -) - -validation -``` - -
-Preview of Input Table - -```{python} -# | echo: false -pb.preview(game_revenue) -``` - -
diff --git a/docs/index.qmd b/docs/index.qmd deleted file mode 100644 index 3f53915a3e..0000000000 --- a/docs/index.qmd +++ /dev/null @@ -1,191 +0,0 @@ ---- -title: "" -jupyter: python3 -html-table-processing: none ---- - -
- -![](assets/pointblank_logo.svg){width=85%} - -**Data validation toolkit for assessing and monitoring data quality.** - -
- -Pointblank is a data validation framework for Python that makes data quality checks beautiful, -powerful, and stakeholder-friendly. Instead of cryptic error messages, get stunning interactive -reports that turn data issues into conversations. - -Here's what a validation looks like (click "Show the code" to see how it's done): - -```{python} -#| code-fold: true -#| code-summary: "Show the code" - -import pointblank as pb -import polars as pl - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - tbl_name="game_revenue", - label="Comprehensive validation of game revenue data", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35), - brief=True - ) - .col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 - .col_vals_gt(columns="session_duration", value=20) # STEP 2 - .col_vals_ge(columns="item_revenue", value=0.20) # STEP 3 - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 - .col_vals_in_set( # STEP 5 - columns="acquisition", - set=["google", "facebook", "organic", "crosspromo", "other_campaign"] - ) - .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 - .col_vals_between( # STEP 7 - columns="session_duration", - left=10, right=50, - pre = lambda df: df.select(pl.median("session_duration")), - brief="Expect that the median of `session_duration` should be between `10` and `50`." - ) - .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 8 - .row_count_match(count=2000) # STEP 9 - .col_count_match(count=11) # STEP 10 - .col_vals_not_null(columns="item_type") # STEP 11 - .col_exists(columns="start_day") # STEP 12 - .interrogate() -) - -validation.get_tabular_report(title="Game Revenue Validation Report") -``` - -That's the kind of report you get from Pointblank: clear, interactive, and designed for everyone on -your team. 
And if you need help getting started or want to work faster, Pointblank has built-in AI -support through the [`assistant()`](reference/assistant.qmd) function to guide you along the way. -You can also use [`DraftValidation`](user-guide/draft-validation.qmd) to quickly generate a -validation plan from your existing data (great for getting started fast). - -Ready to validate? Start with our [Installation](user-guide/installation.qmd) guide or jump straight -to the [User Guide](user-guide/index.qmd). - -By the way, Pointblank is made with 💙 by [Posit](https://posit.co/). - -## What is Data Validation? - -Data validation ensures your data meets quality standards before it's used in analysis, reports, or -downstream systems. Pointblank provides a structured way to define validation rules, execute them, -and communicate results to both technical and non-technical stakeholders. - -With Pointblank you can: - -- **Validate data** through a fluent, chainable API with [25+ validation methods](reference/index.qmd#validation-steps) -- **Set thresholds** to define acceptable levels of data quality (warning, error, critical) -- **Take actions** when thresholds are exceeded (notifications, logging, custom functions) -- **Generate reports** that make data quality issues immediately understandable -- **Inspect data** with built-in tools for previewing, summarizing, and finding missing values - -## Why Pointblank? 
- -Pointblank is designed for the entire data team, not just engineers: - -- 🎨 **Beautiful Reports**: Interactive validation reports that stakeholders actually want to read -- 📊 **Threshold Management**: Define quality standards with warning, error, and critical levels -- 🔍 **Error Drill-Down**: Inspect failing data to get to root causes quickly -- 🔗 **Universal Compatibility**: Works with Polars, Pandas, DuckDB, MySQL, PostgreSQL, SQLite, and more -- 🌍 **Multilingual Support**: Reports available in 40 languages for global teams -- 📝 **YAML Support**: Write validations in YAML for version control and team collaboration -- ⚡ **CLI Tools**: Run validations from the command line for CI/CD pipelines or as quick checks -- 📋 **Rich Inspection**: Preview data, analyze columns, and visualize missing values - -## Quick Examples - -### Threshold-Based Quality - -Set expectations and react when data quality degrades (with alerts, logging, or custom functions): - -```python -validation = ( - pb.Validate(data=sales_data, thresholds=(0.01, 0.02, 0.05)) # Three threhold levels set - .col_vals_not_null(columns="customer_id") - .col_vals_in_set(columns="status", set=["pending", "shipped", "delivered"]) - .interrogate() -) -``` - -### YAML Workflows - -Works wonderfully for CI/CD pipelines and team collaboration: - -```yaml -validate: - data: sales_data - tbl_name: "sales_data" - thresholds: [0.01, 0.02, 0.05] - -steps: - - col_vals_not_null: - columns: "customer_id" - - col_vals_in_set: - columns: "status" - set: ["pending", "shipped", "delivered"] -``` - -```python -validation = pb.yaml_interrogate("validation.yaml") -``` - -### Command Line Power - -Run validations without writing code: - -```bash -# Quick validation -pb validate sales_data.csv --check col-vals-not-null --column customer_id - -# Run YAML workflows -pb run validation.yaml --exit-code # <- Great for CI/CD! 
- -# Explore your data -pb scan sales_data.csv -pb missing sales_data.csv -``` - -## Installation - -Install Pointblank using pip or conda: - -```bash -pip install pointblank -# or -conda install conda-forge::pointblank -``` - -For specific backends: - -```bash -pip install "pointblank[pl]" # Polars support -pip install "pointblank[pd]" # Pandas support -pip install "pointblank[duckdb]" # DuckDB support -pip install "pointblank[postgres]" # PostgreSQL support -``` - -See the [Installation guide](user-guide/installation.qmd) for more details. - -## Text Formats - -The docs are also available in `llms.txt` format: - -- [`llms.txt`](llms.txt): a sitemap listing all documentation pages -- [`llms-full.txt`](llms-full.txt): all the documentation in one file - -## Join the Community - -We'd love to hear from you! Connect with us: - -- [GitHub Issues](https://github.com/posit-dev/pointblank/issues) for bug reports and feature requests -- [Discord server](https://discord.com/invite/YH7CybCNCQ) for discussions and help -- [Contributing guidelines](https://github.com/posit-dev/pointblank/blob/main/CONTRIBUTING.md) if you'd like to contribute - ---- - -**License**: MIT | **© 2024-2026 Posit Software, PBC** diff --git a/docs/llms-full.txt b/docs/llms-full.txt deleted file mode 100644 index 139fa3f825..0000000000 --- a/docs/llms-full.txt +++ /dev/null @@ -1,17983 +0,0 @@ ----------------------------------------------------------------------- -This is the API documentation for the Pointblank library. ----------------------------------------------------------------------- - - -## The Validate family - -When peforming data validation, you'll need the `Validate` class to get the -process started. It's given the target table and you can optionally provide some metadata and/or -failure thresholds (using the `Thresholds` class or through shorthands for this task). 
The -`Validate` class has numerous methods for defining validation steps and for obtaining -post-interrogation metrics and data. - -Validate(data: 'IntoDataFrame', reference: 'IntoFrame | None' = None, tbl_name: 'str | None' = None, label: 'str | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, final_actions: 'FinalActions | None' = None, brief: 'str | bool | None' = None, lang: 'str | None' = None, locale: 'str | None' = None, owner: 'str | None' = None, consumers: 'str | list[str] | None' = None, version: 'str | None' = None) -> None - - Workflow for defining a set of validations on a table and interrogating for results. - - The `Validate` class is used for defining a set of validation steps on a table and interrogating - the table with the *validation plan*. This class is the main entry point for the *data quality - reporting* workflow. The overall aim of this workflow is to generate comprehensive reporting - information to assess the level of data quality for a target table. - - We can supply as many validation steps as needed, and having a large number of them should - increase the validation coverage for a given table. The validation methods (e.g., - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_between()`](`pointblank.Validate.col_vals_between`), etc.) translate to discrete - validation steps, where each step will be sequentially numbered (useful when viewing the - reporting data). This process of calling validation methods is known as developing a - *validation plan*. - - The validation methods, when called, are merely instructions up to the point the concluding - [`interrogate()`](`pointblank.Validate.interrogate`) method is called. That kicks off the - process of acting on the *validation plan* by querying the target table getting reporting - results for each step. 
Once the interrogation process is complete, we can say that the workflow - now has reporting information. We can then extract useful information from the reporting data - to understand the quality of the table. Printing the `Validate` object (or using the - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`) method) will return a table - with the results of the interrogation and - [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) allows for the splitting of the - table based on passing and failing rows. - - Parameters - ---------- - data - The table to validate, which could be a DataFrame object, an Ibis table object, a CSV - file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file, or a - database connection string. When providing a CSV or Parquet file path (as a string or - `pathlib.Path` object), the file will be automatically loaded using an available DataFrame - library (Polars or Pandas). Parquet input also supports glob patterns, directories - containing .parquet files, and Spark-style partitioned datasets. GitHub URLs are - automatically transformed to raw content URLs and downloaded. Connection strings enable - direct database access via Ibis with optional table specification using the `::table_name` - suffix. Read the *Supported Input Table Types* section for details on the supported table - types. - tbl_name - An optional name to assign to the input table object. If no value is provided, a name will - be generated based on whatever information is available. This table name will be displayed - in the header area of the tabular report. - label - An optional label for the validation plan. If no value is provided, a label will be - generated based on the current system date and time. Markdown can be used here to make the - label more visually appealing (it will appear in the header area of the tabular report). 
- thresholds - Generate threshold failure levels so that all validation steps can report and react - accordingly when exceeding the set levels. The thresholds are set at the global level and - can be overridden at the validation step level (each validation step has its own - `thresholds=` parameter). The default is `None`, which means that no thresholds will be set. - Look at the *Thresholds* section for information on how to set threshold levels. - actions - The actions to take when validation steps meet or exceed any set threshold levels. These - actions are paired with the threshold levels and are executed during the interrogation - process when there are exceedances. The actions are executed right after each step is - evaluated. Such actions should be provided in the form of an `Actions` object. If `None` - then no global actions will be set. View the *Actions* section for information on how to set - actions. - final_actions - The actions to take when the validation process is complete and the final results are - available. This is useful for sending notifications or reporting the overall status of the - validation process. The final actions are executed after all validation steps have been - processed and the results have been collected. The final actions are not tied to any - threshold levels, they are executed regardless of the validation results. Such actions - should be provided in the form of a `FinalActions` object. If `None` then no finalizing - actions will be set. Please see the *Actions* section for information on how to set final - actions. - brief - A global setting for briefs, which are optional brief descriptions for validation steps - (they will be displayed in the reporting table). For such a global setting, templating elements - like `"{step}"` (to insert the step number) or `"{auto}"` (to include an automatically - generated brief) are useful. If `True` then each brief will be automatically generated. 
If - `None` (the default) then briefs aren't globally set. - lang - The language to use for various reporting elements. By default, `None` will select English - (`"en"`) as the default, but other options include French (`"fr"`), German (`"de"`), Italian - (`"it"`), Spanish (`"es"`), and several more. Have a look at the *Reporting Languages* - section for the full list of supported languages and information on how the language setting - is utilized. - locale - An optional locale ID to use for formatting values in the reporting table according to the - locale's rules. Examples include `"en-US"` for English (United States) and `"fr-FR"` for - French (France). More simply, this can be a language identifier without a designation of - territory, like `"es"` for Spanish. - owner - An optional string identifying the owner of the data being validated. This is useful for - governance purposes, indicating who is responsible for the quality and maintenance of the - data. For example, `"data-platform-team"` or `"analytics-engineering"`. - consumers - An optional string or list of strings identifying who depends on or consumes this data. - This helps document data dependencies and can be useful for impact analysis when data - quality issues are detected. For example, `"ml-team"` or `["ml-team", "analytics"]`. - version - An optional string representing the version of the validation plan or data contract. This - supports semantic versioning (e.g., `"1.0.0"`, `"2.1.0"`) and is useful for tracking changes - to validation rules over time and for organizational governance. - - Returns - ------- - Validate - A `Validate` object with the table and validations to be performed. 
- - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - Database connection strings (URI format with optional table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires - the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas - DataFrame, the Ibis library is not required. - - To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is - provided. The file will be automatically detected and loaded using the best available DataFrame - library. The loading preference is Polars first, then Pandas as a fallback. - - Connection strings follow database URL formats and must also specify a table using the - `::table_name` suffix. 
Examples include: - - ``` - "duckdb:///path/to/database.ddb::table_name" - "sqlite:///path/to/database.db::table_name" - "postgresql://user:password@localhost:5432/database::table_name" - "mysql://user:password@localhost:3306/database::table_name" - "bigquery://project/dataset::table_name" - "snowflake://user:password@account/database/schema::table_name" - ``` - - When using connection strings, the Ibis library with the appropriate backend driver is required. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for all validation - steps. They are set here at the global level but can be overridden at the validation step level - (each validation step has its own local `thresholds=` parameter). - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values can - either be set as a proportion of all test units that fail (a value between `0` and `1`) or as the - absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is the - 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units for - the 'warning' level only - - If the number of failing test units for a validation step exceeds set thresholds, the validation - step will be marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need - to be set, you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to take - for each level of failure (using the `actions=` parameter). 
- - Actions - ------- - The `actions=` and `final_actions=` parameters provide mechanisms to respond to validation - results. These actions can be used to notify users of validation failures, log issues, or - trigger other processes when problems are detected. - - *Step Actions* - - The `actions=` parameter allows you to define actions that are triggered when validation steps - exceed specific threshold levels (warning, error, or critical). These actions are executed - during the interrogation process, right after each step is evaluated. - - Step actions should be provided using the [`Actions`](`pointblank.Actions`) class, which lets - you specify different actions for different severity levels: - - ```python - # Define an action that logs a message when warning threshold is exceeded - def log_warning(): - metadata = pb.get_action_metadata() - print(f"WARNING: Step {metadata['step']} failed with type {metadata['type']}") - - # Define actions for different threshold levels - actions = pb.Actions( - warning = log_warning, - error = lambda: send_email("Error in validation"), - critical = "CRITICAL FAILURE DETECTED" - ) - - # Use in Validate - validation = pb.Validate( - data=my_data, - actions=actions # Global actions for all steps - ) - ``` - - You can also provide step-specific actions in individual validation methods: - - ```python - validation.col_vals_gt( - columns="revenue", - value=0, - actions=pb.Actions(warning=log_warning) # Only applies to this step - ) - ``` - - Step actions have access to step-specific context through the - [`get_action_metadata()`](`pointblank.get_action_metadata`) function, which provides details - about the current validation step that triggered the action. - - *Final Actions* - - The `final_actions=` parameter lets you define actions that execute after all validation steps - have completed. These are useful for providing summaries, sending notifications based on - overall validation status, or performing cleanup operations. 
- - Final actions should be provided using the [`FinalActions`](`pointblank.FinalActions`) class: - - ```python - def send_report(): - summary = pb.get_validation_summary() - if summary["status"] == "CRITICAL": - send_alert_email( - subject=f"CRITICAL validation failures in {summary['tbl_name']}", - body=f"{summary['critical_steps']} steps failed with critical severity." - ) - - validation = pb.Validate( - data=my_data, - final_actions=pb.FinalActions(send_report) - ) - ``` - - Final actions have access to validation-wide summary information through the - [`get_validation_summary()`](`pointblank.get_validation_summary`) function, which provides a - comprehensive overview of the entire validation process. - - The combination of step actions and final actions provides a flexible system for responding to - data quality issues at both the individual step level and the overall validation level. - - Reporting Languages - ------------------- - Various pieces of reporting in Pointblank can be localized to a specific language. This is done - by setting the `lang=` parameter in `Validate`. 
Any of the following languages can be used (just - provide the language code): - - - English (`"en"`) - - French (`"fr"`) - - German (`"de"`) - - Italian (`"it"`) - - Spanish (`"es"`) - - Portuguese (`"pt"`) - - Dutch (`"nl"`) - - Swedish (`"sv"`) - - Danish (`"da"`) - - Norwegian Bokmål (`"nb"`) - - Icelandic (`"is"`) - - Finnish (`"fi"`) - - Polish (`"pl"`) - - Czech (`"cs"`) - - Romanian (`"ro"`) - - Greek (`"el"`) - - Russian (`"ru"`) - - Turkish (`"tr"`) - - Arabic (`"ar"`) - - Hindi (`"hi"`) - - Simplified Chinese (`"zh-Hans"`) - - Traditional Chinese (`"zh-Hant"`) - - Japanese (`"ja"`) - - Korean (`"ko"`) - - Vietnamese (`"vi"`) - - Indonesian (`"id"`) - - Ukrainian (`"uk"`) - - Bulgarian (`"bg"`) - - Croatian (`"hr"`) - - Estonian (`"et"`) - - Hungarian (`"hu"`) - - Irish (`"ga"`) - - Latvian (`"lv"`) - - Lithuanian (`"lt"`) - - Maltese (`"mt"`) - - Slovak (`"sk"`) - - Slovenian (`"sl"`) - - Hebrew (`"he"`) - - Thai (`"th"`) - - Persian (`"fa"`) - - Automatically generated briefs (produced by using `brief=True` or `brief="...{auto}..."`) will - be written in the selected language. The language setting will also be used when generating the - validation report table through - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`) (or printing the `Validate` - object in a notebook environment). - - Examples - -------- - ### Creating a validation plan and interrogating - - Let's walk through a data quality analysis of an extremely small table. It's actually called - `"small_table"` and it's accessible through the [`load_dataset()`](`pointblank.load_dataset`) - function. - - ```python - import pointblank as pb - - # Load the `small_table` dataset - small_table = pb.load_dataset(dataset="small_table", tbl_type="polars") - - # Preview the table - pb.preview(small_table) - ``` - - We ought to think about what's tolerable in terms of data quality so let's designate - proportional failure thresholds to the 'warning', 'error', and 'critical' states. 
This can be - done by using the [`Thresholds`](`pointblank.Thresholds`) class. - - ```python - thresholds = pb.Thresholds(warning=0.10, error=0.25, critical=0.35) - ``` - - Now, we use the `Validate` class and give it the `thresholds` object (which serves as a default - for all validation steps but can be overridden). The static thresholds provided in `thresholds=` - will make the reporting a bit more useful. We also need to provide a target table and we'll use - `small_table` for this. - - ```python - validation = ( - pb.Validate( - data=small_table, - tbl_name="small_table", - label="`Validate` example.", - thresholds=thresholds - ) - ) - ``` - - Then, as with any `Validate` object, we can add steps to the validation plan by using as many - validation methods as we want. To conclude the process (and actually query the data table), we - use the [`interrogate()`](`pointblank.Validate.interrogate`) method. - - ```python - validation = ( - validation - .col_vals_gt(columns="d", value=100) - .col_vals_le(columns="c", value=5) - .col_vals_between(columns="c", left=3, right=10, na_pass=True) - .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}") - .col_exists(columns=["date", "date_time"]) - .interrogate() - ) - ``` - - The `validation` object can be printed as a reporting table. - - ```python - validation - ``` - - The report could be further customized by using the - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`) method, which contains - options for modifying the display of the table. - - ### Adding briefs - - Briefs are short descriptions of the validation steps. While they can be set for each step - individually, they can also be set globally. The global setting is done by using the - `brief=` argument in `Validate`. The global setting can be as simple as `True` to have - automatically-generated briefs for each step. 
Alternatively, we can use templating elements - like `"{step}"` (to insert the step number) or `"{auto}"` (to include an automatically generated - brief). Here's an example of a global setting for briefs: - - ```python - validation_2 = ( - pb.Validate( - data=pb.load_dataset(), - tbl_name="small_table", - label="Validation example with briefs", - brief="Step {step}: {auto}", - ) - .col_vals_gt(columns="d", value=100) - .col_vals_between(columns="c", left=3, right=10, na_pass=True) - .col_vals_regex( - columns="b", - pattern=r"[0-9]-[a-z]{3}-[0-9]{3}", - brief="Regex check for column {col}" - ) - .interrogate() - ) - - validation_2 - ``` - - We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore, - the global brief's template (`"Step {step}: {auto}"`) is applied to all steps except for the - final step, where the step-level `brief=` argument provided an override. - - If you should want to cancel the globally-defined brief for one or more validation steps, you - can set `brief=False` in those particular steps. - - ### Post-interrogation methods - - The `Validate` class has a number of post-interrogation methods that can be used to extract - useful information from the validation results. For example, the - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method can be used to get - the data extracts for each validation step. - - ```python - validation_2.get_data_extracts() - ``` - - We can also view step reports for each validation step using the - [`get_step_report()`](`pointblank.Validate.get_step_report`) method. This method adapts to the - type of validation step and shows the relevant information for a step's validation. - - ```python - validation_2.get_step_report(i=2) - ``` - - The `Validate` class also has a method for getting the sundered data, which is the data that - passed or failed the validation steps. 
This can be done using the - [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method. - - ```python - pb.preview(validation_2.get_sundered_data()) - ``` - - The sundered data is a DataFrame that contains the rows that passed or failed the validation. - The default behavior is to return the rows that failed the validation, as shown above. - - ### Working with CSV Files - - The `Validate` class can directly accept CSV file paths, making it easy to validate data stored - in CSV files without manual loading: - - ```python - # Get a path to a CSV file from the package data - csv_path = pb.get_data_path("global_sales", "csv") - - validation_3 = ( - pb.Validate( - data=csv_path, - label="CSV validation example" - ) - .col_exists(["customer_id", "product_id", "revenue"]) - .col_vals_not_null(["customer_id", "product_id"]) - .col_vals_gt(columns="revenue", value=0) - .interrogate() - ) - - validation_3 - ``` - - You can also use a Path object to specify the CSV file. Here's an example of how to do that: - - ```python - from pathlib import Path - - csv_file = Path(pb.get_data_path("game_revenue", "csv")) - - validation_4 = ( - pb.Validate(data=csv_file, label="Game Revenue Validation") - .col_exists(["player_id", "session_id", "item_name"]) - .col_vals_regex( - columns="session_id", - pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}" - ) - .col_vals_gt(columns="item_revenue", value=0, na_pass=True) - .interrogate() - ) - - validation_4 - ``` - - The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided, - Pointblank will automatically load the file using the best available DataFrame library (Polars - preferred, Pandas as fallback). The loaded data can then be used with all validation methods - just like any other supported table type. - - ### Working with Parquet Files - - The `Validate` class can directly accept Parquet files and datasets in various formats. 
The - following examples illustrate how to validate Parquet files: - - ```python - # Single Parquet file from package data - parquet_path = pb.get_data_path("nycflights", "parquet") - - validation_5 = ( - pb.Validate( - data=parquet_path, - tbl_name="NYC Flights Data" - ) - .col_vals_not_null(["carrier", "origin", "dest"]) - .col_vals_gt(columns="distance", value=0) - .interrogate() - ) - - validation_5 - ``` - - You can also use glob patterns and directories. Here are some examples for how to: - - 1. load multiple Parquet files - 2. load a Parquet-containing directory - 3. load a partitioned Parquet dataset - - ```python - # Multiple Parquet files with glob patterns - validation_6 = pb.Validate(data="data/sales_*.parquet") - - # Directory containing Parquet files - validation_7 = pb.Validate(data="parquet_data/") - - # Partitioned Parquet dataset - validation_8 = ( - pb.Validate(data="sales_data/") # Contains year=2023/quarter=Q1/region=US/sales.parquet - .col_exists(["transaction_id", "amount", "year", "quarter", "region"]) - .interrogate() - ) - ``` - - When you point to a directory that contains a partitioned Parquet dataset (with subdirectories - like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically: - - - discover all Parquet files recursively - - extract partition column values from directory paths - - add partition columns to the final DataFrame - - combine all partitions into a single table for validation - - Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with - either DataFrame library. The loading preference is Polars first, then Pandas as a fallback. - - ### Working with Database Connection Strings - - The `Validate` class supports database connection strings for direct validation of database - tables. 
Connection strings must specify a table using the `::table_name` suffix: - - ```python - # Get path to a DuckDB database file from package data - duckdb_path = pb.get_data_path("game_revenue", "duckdb") - - validation_9 = ( - pb.Validate( - data=f"duckdb:///{duckdb_path}::game_revenue", - label="DuckDB Game Revenue Validation" - ) - .col_exists(["player_id", "session_id", "item_revenue"]) - .col_vals_gt(columns="item_revenue", value=0) - .interrogate() - ) - - validation_9 - ``` - - For comprehensive documentation on supported connection string formats, error handling, and - installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`) - function. This function handles all the connection logic and provides helpful error messages - when table specifications are missing or backend dependencies are not installed. - - -Thresholds(warning: 'int | float | bool | None' = None, error: 'int | float | bool | None' = None, critical: 'int | float | bool | None' = None) -> None - - Definition of threshold values. - - Thresholds are used to set limits on the number of failing test units at different levels. The - levels are 'warning', 'error', and 'critical'. These levels correspond to different levels of - severity when a threshold is reached. The threshold values can be set as absolute counts or as - fractions of the total number of test units. When a threshold is reached, an action can be taken - (e.g., displaying a message or calling a function) if there is an associated action defined for - that level (defined through the [`Actions`](`pointblank.Actions`) class). - - Parameters - ---------- - warning - The threshold for the 'warning' level. This can be an absolute count or a fraction of the - total. Using `True` will set this threshold value to `1`. - error - The threshold for the 'error' level. This can be an absolute count or a fraction of the - total. Using `True` will set this threshold value to `1`. 
- critical - The threshold for the 'critical' level. This can be an absolute count or a fraction of the - total. Using `True` will set this threshold value to `1`. - - Returns - ------- - Thresholds - A `Thresholds` object. This can be used when using the [`Validate`](`pointblank.Validate`) - class (to set thresholds globally) or when defining validation steps like - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) (so that threshold values are scoped to - individual validation steps, overriding any global thresholds). - - Examples - -------- - In a data validation workflow, you can set thresholds for the number of failing test units at - different levels. For example, you can set a threshold for the 'warning' level when the number - of failing test units exceeds 10% of the total number of test units: - - ```python - thresholds_1 = pb.Thresholds(warning=0.1) - ``` - - You can also set thresholds for the 'error' and 'critical' levels: - - ```python - thresholds_2 = pb.Thresholds(warning=0.1, error=0.2, critical=0.05) - ``` - - Thresholds can also be set as absolute counts. Here's an example where the 'warning' level is - set to `5` failing test units: - - ```python - thresholds_3 = pb.Thresholds(warning=5) - ``` - - The `thresholds` object can be used to set global thresholds for all validation steps. Or, you - can set thresholds for individual validation steps, which will override the global thresholds. 
- Here's a data validation workflow example where we set global thresholds and then override with - different thresholds at the [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) step: - - ```python - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - label="Example Validation", - thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3) - ) - .col_vals_not_null(columns=["c", "d"]) - .col_vals_gt(columns="a", value=3, thresholds=pb.Thresholds(warning=5)) - .interrogate() - ) - - validation - ``` - - As can be seen, the last step ([`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)) has its own - thresholds, which override the global thresholds set at the beginning of the validation workflow - (in the [`Validate`](`pointblank.Validate`) class). - - -Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: 'str | Callable | list[str | Callable] | None' = None, critical: 'str | Callable | list[str | Callable] | None' = None, default: 'str | Callable | list[str | Callable] | None' = None, highest_only: 'bool' = True) -> None - - Definition of action values. - - Actions complement threshold values by defining what action should be taken when a threshold - level is reached. The action can be a string or a `Callable`. When a string is used, it is - interpreted as a message to be displayed. When a `Callable` is used, it will be invoked at - interrogation time if the threshold level is met or exceeded. - - There are three threshold levels: 'warning', 'error', and 'critical'. These levels correspond - to different levels of severity when a threshold is reached. Those thresholds can be defined - using the [`Thresholds`](`pointblank.Thresholds`) class or various shorthand forms. Actions - don't have to be defined for all threshold levels; if an action is not defined for a level in - exceedance, no action will be taken. 
Likewise, there is no negative consequence (other than a - no-op) for defining actions for thresholds that don't exist (e.g., setting an action for the - 'critical' level when no corresponding 'critical' threshold has been set). - - Parameters - ---------- - warning - A string, `Callable`, or list of `Callable`/string values for the 'warning' level. Using - `None` means no action should be performed at the 'warning' level. - error - A string, `Callable`, or list of `Callable`/string values for the 'error' level. Using - `None` means no action should be performed at the 'error' level. - critical - A string, `Callable`, or list of `Callable`/string values for the 'critical' level. Using - `None` means no action should be performed at the 'critical' level. - default - A string, `Callable`, or list of `Callable`/string values for all threshold levels. This - parameter can be used to set the same action for all threshold levels. If an action is - defined for a specific threshold level, it will override the action set for all levels. - highest_only - A boolean value that, when set to `True` (the default), results in executing only the action - for the highest threshold level that is exceeded. Useful when you want to ensure that only - the most severe action is taken when multiple threshold levels are exceeded. - - Returns - ------- - Actions - An `Actions` object. This can be used when using the [`Validate`](`pointblank.Validate`) - class (to set actions for meeting different threshold levels globally) or when defining - validation steps like [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) (so that actions - are scoped to individual validation steps, overriding any globally set actions). - - Types of Actions - ---------------- - Actions can be defined in different ways: - - 1. **String**: A message to be displayed when the threshold level is met or exceeded. - 2. **Callable**: A function that is called when the threshold level is met or exceeded. - 3. 
**List of Strings/Callables**: Multiple messages or functions to be called when the threshold - level is met or exceeded. - - The actions are executed at interrogation time when the threshold level assigned to the action - is exceeded by the number or proportion of failing test units. When providing a string, it will - simply be printed to the console. A callable will also be executed at the time of interrogation. - If providing a list of strings or callables, each item in the list will be executed in order. - Such a list can contain a mix of strings and callables. - - String Templating - ----------------- - When using a string as an action, you can include placeholders for the following variables: - - - `{type}`: The validation step type where the action is executed (e.g., 'col_vals_gt', - 'col_vals_lt', etc.) - - `{level}`: The threshold level where the action is executed ('warning', 'error', or - 'critical') - - `{step}` or `{i}`: The step number in the validation workflow where the action is executed - - `{col}` or `{column}`: The column name where the action is executed - - `{val}` or `{value}`: An associated value for the validation method (e.g., the value to - compare against in a 'col_vals_gt' validation step) - - `{time}`: A datetime value for when the action was executed - - The first two placeholders can also be used in uppercase (e.g., `{TYPE}` or `{LEVEL}`) and the - corresponding values will be displayed in uppercase. The placeholders are replaced with the - actual values during interrogation. - - For example, the string `"{LEVEL}: '{type}' threshold exceeded for column {col}."` will be - displayed as `"WARNING: 'col_vals_gt' threshold exceeded for column a."` when the 'warning' - threshold is exceeded in a 'col_vals_gt' validation step involving column `a`. 
- - Crafting Callables with `get_action_metadata()` - ----------------------------------------------- - When creating a callable function to be used as an action, you can use the - [`get_action_metadata()`](`pointblank.get_action_metadata`) function to retrieve metadata about - the step where the action is executed. This metadata contains information about the validation - step, including the step type, level, step number, column name, and associated value. You can - use this information to craft your action message or to take specific actions based on the - metadata provided. - - Examples - -------- - Let's define both threshold values and actions for a data validation workflow. We'll set these - thresholds and actions globally for all validation steps. In this specific example, the only - actions we'll define are for the 'critical' level: - - ```python - import pointblank as pb - - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions(critical="Major data quality issue found in step {step}."), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() - ) - - validation - ``` - - Because we set the 'critical' action to display `"Major data quality issue found in step {step}."` in the - console, this message will be displayed if the number of failing test units exceeds the - 'critical' threshold (set to 15% of the total number of test units). In step 3 of the validation - workflow, the 'critical' threshold is exceeded, so the message is displayed in the console. - - Actions can be defined locally for individual validation steps, which will override any global - actions set at the beginning of the validation workflow. 
Here's a variation of the above example - where we set global threshold values but assign an action only for an individual validation - step: - - ```python - def dq_issue(): - from datetime import datetime - - print(f"Data quality issue found ({datetime.now()}).") - - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", - value=15, - actions=pb.Actions(warning=dq_issue), - ) - .interrogate() - ) - - validation - ``` - - In this case, the 'warning' action is set to call the `dq_issue()` function. This action is - only executed when the 'warning' threshold is exceeded in the 'session_duration' column. Because - all three thresholds are exceeded in step 3, the 'warning' action of executing the function - occurs (resulting in a message being printed to the console). If actions were set for the other - two threshold levels, they would also be executed. - - See Also - -------- - The [`get_action_metadata()`](`pointblank.get_action_metadata`) function, which can be used to - retrieve metadata about the step where the action is executed. - - -FinalActions(*args) - - Define actions to be taken after validation is complete. - - Final actions are executed after all validation steps have been completed. They provide a - mechanism to respond to the overall validation results, such as sending alerts when critical - failures are detected or generating summary reports. - - Parameters - ---------- - *actions - One or more actions to execute after validation. An action can be (1) a callable function - that will be executed with no arguments, or (2) a string message that will be printed to the - console. - - Returns - ------- - FinalActions - A `FinalActions` object. 
This can be used when using the - [`Validate`](`pointblank.Validate`) class (to set final actions for the validation - workflow). - - Types of Actions - ---------------- - Final actions can be defined in two different ways: - - 1. **String**: A message to be displayed when the validation is complete. - 2. **Callable**: A function that is called when the validation is complete. - - The actions are executed at the end of the validation workflow. When providing a string, it will - simply be printed to the console. A callable will also be executed at the time of validation - completion. Several strings and callables can be provided to the `FinalActions` class, and - they will be executed in the order they are provided. - - Crafting Callables with `get_validation_summary()` - ------------------------------------------------- - When creating a callable function to be used as a final action, you can use the - [`get_validation_summary()`](`pointblank.get_validation_summary`) function to retrieve the - summary of the validation results. This summary contains information about the validation - workflow, including the number of test units, the number of failing test units, and the - threshold levels that were exceeded. You can use this information to craft your final action - message or to take specific actions based on the validation results. - - Examples - -------- - Final actions provide a powerful way to respond to the overall results of a validation workflow. - They're especially useful for sending notifications, generating reports, or taking corrective - actions based on the complete validation outcome. 
- - The following example shows how to create a final action that checks for critical failures - and sends an alert: - - ```python - import pointblank as pb - - def send_alert(): - summary = pb.get_validation_summary() - if summary["highest_severity"] == "critical": - print(f"ALERT: Critical validation failures found in {summary['tbl_name']}") - - validation = ( - pb.Validate( - data=my_data, - final_actions=pb.FinalActions(send_alert) - ) - .col_vals_gt(columns="revenue", value=0) - .interrogate() - ) - ``` - - In this example, the `send_alert()` function is defined to check the validation summary for - critical failures. If any are found, an alert message is printed to the console. The function is - passed to the `FinalActions` class, which ensures it will be executed after all validation steps - are complete. Note that we used the - [`get_validation_summary()`](`pointblank.get_validation_summary`) function to retrieve the - summary of the validation results to help craft the alert message. - - Multiple final actions can be provided in a sequence. They will be executed in the order they - are specified after all validation steps have completed: - - ```python - validation = ( - pb.Validate( - data=my_data, - final_actions=pb.FinalActions( - "Validation complete.", # a string message - send_alert, # a callable function - generate_report # another callable function - ) - ) - .col_vals_gt(columns="revenue", value=0) - .interrogate() - ) - ``` - - See Also - -------- - The [`get_validation_summary()`](`pointblank.get_validation_summary`) function, which can be - used to retrieve the summary of the validation results. - - -Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'Any | None' = None, **kwargs) -Definition of a schema object. - - The schema object defines the structure of a table. 
Once it is defined, the object can be used - in a validation workflow, using `Validate` and its methods, to ensure that the structure of a - table matches the expected schema. The validation method that works with the schema object is - called [`col_schema_match()`](`pointblank.Validate.col_schema_match`). - - A schema for a table can be constructed with the `Schema` class in a number of ways: - - 1. providing a list of column names to `columns=` (to check only the column names) - 2. using a list of one- or two-element tuples in `columns=` (to check both column names and - optionally dtypes, should be in the form of `[(column_name, dtype), ...]`) - 3. providing a dictionary to `columns=`, where the keys are column names and the values are - dtypes - 4. providing individual column arguments in the form of keyword arguments (constructed as - `column_name=dtype`) - - The schema object can also be constructed by providing a DataFrame or Ibis table object (using - the `tbl=` parameter) and the schema will be collected from either type of object. The schema - object can be printed to display the column names and dtypes. Note that if `tbl=` is provided - then there shouldn't be any other inputs provided through either `columns=` or `**kwargs`. - - Parameters - ---------- - columns - A list of strings (representing column names), a list of tuples (for column names and column - dtypes), or a dictionary containing column and dtype information. If any of these inputs are - provided here, it will take precedence over any column arguments provided via `**kwargs`. - tbl - A DataFrame (Polars or Pandas) or an Ibis table object from which the schema will be - collected. Read the *Supported Input Table Types* section for details on the supported - table types. - **kwargs - Individual column arguments that are in the form of `column=dtype` or - `column=[dtype1, dtype2, ...]`. These will be ignored if the `columns=` parameter is not - `None`. 
- - Returns - ------- - Schema - A schema object. - - Supported Input Table Types - --------------------------- - The `tbl=` parameter, if used, can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `Schema(tbl=)` with these types of tables - requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or - Pandas DataFrame, the availability of Ibis is not needed. - - Additional Notes on Schema Construction - --------------------------------------- - While there is flexibility in how a schema can be constructed, there is the potential for some - confusion. So let's go through each of the methods of constructing a schema in more detail and - single out some important points. - - When providing a list of column names to `columns=`, a - [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will only check - the column names. Any arguments pertaining to dtypes will be ignored. - - When using a list of tuples in `columns=`, the tuples could contain the column name and dtype - or just the column name. This construction allows for more flexibility in constructing the - schema as some columns will be checked for dtypes and others will not. This method is the only - way to have mixed checks of column names and dtypes in - [`col_schema_match()`](`pointblank.Validate.col_schema_match`). 
- - When providing a dictionary to `columns=`, the keys are the column names and the values are the - dtypes. This method of input is useful in those cases where you might already have a dictionary - of column names and dtypes that you want to use as the schema. - - If using individual column arguments in the form of keyword arguments, the column names are the - keyword arguments and the dtypes are the values. This method emphasizes readability and is - perhaps more convenient when manually constructing a schema with a small number of columns. - - Finally, multiple dtypes can be provided for a single column by providing a list or tuple of - dtypes in place of a scalar string value. Having multiple dtypes for a column allows for the - dtype check via [`col_schema_match()`](`pointblank.Validate.col_schema_match`) to make multiple - attempts at matching the column dtype. Should any of the dtypes match the column dtype, that - part of the schema check will pass. Here are some examples of how you could provide single and - multiple dtypes for a column: - - ```python - # list of tuples - schema_1 = pb.Schema(columns=[("name", "String"), ("age", ["Float64", "Int64"])]) - - # dictionary - schema_2 = pb.Schema(columns={"name": "String", "age": ["Float64", "Int64"]}) - - # keyword arguments - schema_3 = pb.Schema(name="String", age=["Float64", "Int64"]) - ``` - - All of the above examples will construct the same schema object. - - Examples - -------- - A schema can be constructed via the `Schema` class in multiple ways. 
Let's use the following - Polars DataFrame as a basis for constructing a schema: - - ```python - import pointblank as pb - import polars as pl - - df = pl.DataFrame({ - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], - "height": [5.6, 6.0, 5.8] - }) - ``` - - You could provide `Schema(columns=)` a list of tuples containing column names and data types: - - ```python - schema = pb.Schema(columns=[("name", "String"), ("age", "Int64"), ("height", "Float64")]) - ``` - - Alternatively, a dictionary containing column names and dtypes also works: - - ```python - schema = pb.Schema(columns={"name": "String", "age": "Int64", "height": "Float64"}) - ``` - - Another input method involves using individual column arguments in the form of keyword - arguments: - - ```python - schema = pb.Schema(name="String", age="Int64", height="Float64") - ``` - - Finally, you could also provide a DataFrame (Polars or Pandas) or an Ibis table object to `tbl=` - and the schema will be collected: - - ```python - schema = pb.Schema(tbl=df) - ``` - - Whichever method you choose, you can verify the schema inputs by printing the `schema` object: - - ```python - print(schema) - ``` - - The `Schema` object can be used to validate the structure of a table against the schema. The - relevant `Validate` method for this is - [`col_schema_match()`](`pointblank.Validate.col_schema_match`). In a validation workflow, you'll - have a target table (defined at the beginning of the workflow) and you might want to ensure that - your expectations of the table structure are met. The - [`col_schema_match()`](`pointblank.Validate.col_schema_match`) method works with a `Schema` - object to validate the structure of the table. 
Here's an example of how you could use - [`col_schema_match()`](`pointblank.Validate.col_schema_match`) in a validation workflow: - - ```python - # Define the schema - schema = pb.Schema(name="String", age="Int64", height="Float64") - - # Define a validation that checks the schema against the table (`df`) - validation = ( - pb.Validate(data=df) - .col_schema_match(schema=schema) - .interrogate() - ) - - # Display the validation results - validation - ``` - - The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation method will - validate the structure of the table against the schema during interrogation. If the structure of - the table does not match the schema, the single test unit will fail. In this case, the defined - schema matched the structure of the table, so the validation passed. - - We can also choose to check only the column names of the target table. This can be done by - providing a simplified `Schema` object, which is given a list of column names: - - ```python - schema = pb.Schema(columns=["name", "age", "height"]) - - validation = ( - pb.Validate(data=df) - .col_schema_match(schema=schema) - .interrogate() - ) - - validation - ``` - - In this case, the schema only checks the column names of the table against the schema during - interrogation. If the column names of the table do not match the schema, the single test unit - will fail. In this case, the defined schema matched the column names of the table, so the - validation passed. 
- - If you wanted to check column names and dtypes only for a subset of columns (and just the column - names for the rest), you could use a list of mixed one- or two-item tuples in `columns=`: - - ```python - schema = pb.Schema(columns=[("name", "String"), ("age", ), ("height", )]) - - validation = ( - pb.Validate(data=df) - .col_schema_match(schema=schema) - .interrogate() - ) - - validation - ``` - - Not specifying a dtype for a column (as is the case for the `age` and `height` columns in the - above example) will only check the column name. - - There may also be the case where you want to check the column names and specify multiple dtypes - for a column to have several attempts at matching the dtype. This can be done by providing a - list of dtypes where there would normally be a single dtype: - - ```python - schema = pb.Schema( - columns=[("name", "String"), ("age", ["Float64", "Int64"]), ("height", "Float64")] - ) - - validation = ( - pb.Validate(data=df) - .col_schema_match(schema=schema) - .interrogate() - ) - - validation - ``` - - For the `age` column, the schema will check for both `Float64` and `Int64` dtypes. If either of - these dtypes is found in the column, the portion of the schema check will succeed. - - See Also - -------- - The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation method, where a - `Schema` object is used in a validation workflow. - - -DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ssl: 'bool' = True) -> None - - Draft a validation plan for a given table using an LLM. - - By using a large language model (LLM) to draft a validation plan, you can quickly generate a - starting point for validating a table. This can be useful when you have a new table and you - want to get a sense of how to validate it (and adjustments could always be made later). 
The - `DraftValidation` class uses the `chatlas` package to draft a validation plan for a given table - using an LLM from either the `"anthropic"`, `"openai"`, `"ollama"` or `"bedrock"` provider. You - can install all requirements for the class through an optional 'generate' install of Pointblank - via `pip install pointblank[generate]`. - - :::{.callout-warning} - The `DraftValidation` class is still experimental. Please report any issues you encounter in - the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - data - The data to be used for drafting a validation plan. - model - The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, - `"ollama"`, and `"bedrock"`. - api_key - The API key to be used for the model. - verify_ssl - Whether to verify SSL certificates when making requests to the LLM provider. Set to `False` - to disable SSL verification (e.g., when behind a corporate firewall with self-signed - certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose - security risks. - - Returns - ------- - str - The drafted validation plan. - - Constructing the `model` Argument - --------------------------------- - The `model=` argument should be constructed using the provider and model name separated by a - colon (`provider:model`). The provider text can be any of: - - - `"anthropic"` (Anthropic) - - `"openai"` (OpenAI) - - `"ollama"` (Ollama) - - `"bedrock"` (Amazon Bedrock) - - The model name should be the specific model to be used from the provider. Model names are - subject to change so consult the provider's documentation for the most up-to-date model names. 
- - Notes on Authentication - ----------------------- - Providing a valid API key as a string in the `api_key` argument is adequate for getting started - but you should consider using a more secure method for handling API keys. - - One way to do this is to load the API key from an environment variable and retrieve it using the - `os` module (specifically the `os.getenv()` function). Places to store the API key might - include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`. - - Another solution is to store one or more model provider API keys in an `.env` file (in the root - of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or - `OPENAI_API_KEY`) then DraftValidation will automatically load the API key from the `.env` file - and there's no need to provide the `api_key` argument. An `.env` file might look like this: - - ```plaintext - ANTHROPIC_API_KEY="your_anthropic_api_key_here" - OPENAI_API_KEY="your_openai_api_key_here" - ``` - - There's no need to have the `python-dotenv` package installed when using `.env` files in this - way. - - Notes on SSL Certificate Verification - -------------------------------------- - By default, SSL certificate verification is enabled for all requests to LLM providers. However, - in certain network environments (such as corporate networks with self-signed certificates or - firewall proxies), you may encounter SSL certificate verification errors. - - To disable SSL verification, set the `verify_ssl` parameter to `False`: - - ```python - import pointblank as pb - - data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") - - # Disable SSL verification for networks with self-signed certificates - pb.DraftValidation( - data=data, - model="anthropic:claude-sonnet-4-5", - verify_ssl=False - ) - ``` - - :::{.callout-warning} - Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to - man-in-the-middle attacks. 
Only use this option in trusted network environments and when - absolutely necessary. - ::: - - Notes on Data Sent to the Model Provider - ---------------------------------------- - The data sent to the model provider is a JSON summary of the table. This data summary is - generated internally by `DraftValidation` using the `DataScan` class. The summary includes the - following information: - - - the number of rows and columns in the table - - the type of dataset (e.g., Polars, DuckDB, Pandas, etc.) - - the column names and their types - - column level statistics such as the number of missing values, min, max, mean, and median, etc. - - a short list of data values in each column - - The JSON summary is used to provide the model with the necessary information to draft a - validation plan. As such, even very large tables can be used with the `DraftValidation` class - since the contents of the table are not sent to the model provider. - - Amazon Bedrock is a special case since it is a self-hosted model and security controls are - in place to ensure that data is kept within the user's AWS environment. If using an Ollama - model, all data is handled locally, though only a few models are capable enough to perform the - task of drafting a validation plan. - - Examples - -------- - Let's look at how the `DraftValidation` class can be used to draft a validation plan for a - table. The table to be used is `"nycflights"`, which is available here via the - [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is - `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The - example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`. 
- - ```python - import pointblank as pb - - # Load the "nycflights" dataset as a DuckDB table - data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") - - # Draft a validation plan for the "nycflights" table - pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") - ``` - - The output will be a drafted validation plan for the `"nycflights"` table and this will appear - in the console. - - ````plaintext - ```python - import pointblank as pb - - # Define schema based on column names and dtypes - schema = pb.Schema(columns=[ - ("year", "int64"), - ("month", "int64"), - ("day", "int64"), - ("dep_time", "int64"), - ("sched_dep_time", "int64"), - ("dep_delay", "int64"), - ("arr_time", "int64"), - ("sched_arr_time", "int64"), - ("arr_delay", "int64"), - ("carrier", "string"), - ("flight", "int64"), - ("tailnum", "string"), - ("origin", "string"), - ("dest", "string"), - ("air_time", "int64"), - ("distance", "int64"), - ("hour", "int64"), - ("minute", "int64") - ]) - - # The validation plan - validation = ( - pb.Validate( - data=your_data, # Replace your_data with the actual data variable - label="Draft Validation", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35) - ) - .col_schema_match(schema=schema) - .col_vals_not_null(columns=[ - "year", "month", "day", "sched_dep_time", "carrier", "flight", - "origin", "dest", "distance", "hour", "minute" - ]) - .col_vals_between(columns="month", left=1, right=12) - .col_vals_between(columns="day", left=1, right=31) - .col_vals_between(columns="sched_dep_time", left=106, right=2359) - .col_vals_between(columns="dep_delay", left=-43, right=1301, na_pass=True) - .col_vals_between(columns="air_time", left=20, right=695, na_pass=True) - .col_vals_between(columns="distance", left=17, right=4983) - .col_vals_between(columns="hour", left=1, right=23) - .col_vals_between(columns="minute", left=0, right=59) - .col_vals_in_set(columns="origin", set=["EWR", "LGA", "JFK"]) - .col_count_match(count=18) - 
.row_count_match(count=336776) - .rows_distinct() - .interrogate() - ) - - validation - ``` - ```` - - The drafted validation plan can be copied and pasted into a Python script or notebook for - further use. In other words, the generated plan can be adjusted as needed to suit the specific - requirements of the table being validated. - - Note that the output does not know how the data was obtained, so it uses the placeholder - `your_data` in the `data=` argument of the `Validate` class. When adapted for use, this should - be replaced with the actual data variable. - - - -## The Validation Steps family - -Validation steps can be thought of as sequential validations on the target -data. We call `Validate`'s validation methods to build up a validation plan: a collection of steps -that, in the aggregate, provides good validation coverage. - -col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data greater than a fixed value or data in another column? - - The `col_vals_gt()` validation method checks whether column values in a table are - *greater than* a specified `value=` (the exact comparison used in this function is - `col_val > value`). The `value=` can be specified as a single, literal value or as a column - name given in [`col()`](`pointblank.col`). This validation will operate over the number of - test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. 
Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - value - The value to compare against. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison. For more information on which types of values are allowed, see the - *What Can Be Used in `value=`?* section. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. 
- brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `value=`? - ----------------------------- - The `value=` argument allows for a variety of input types. The most common are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column name - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value as the `value=` argument. There is flexibility in how - you provide the date or datetime value, as it can be: - - - a string-based date or datetime (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - a date or datetime object using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) 
- - Finally, when supplying a column name in the `value=` argument, it must be specified within - [`col()`](`pointblank.col`). This is a column-to-column comparison and, crucially, the - columns being compared must be of the same type (e.g., both numeric, both date, etc.). - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `value=col(...)` that are expected to be present in the - transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. 
For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as the proportion of all test units that fail (a value between `0` and `1`), - or as the absolute number of failing test units (as an integer that's `1` or greater). 
- - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 7, 6, 5], - "b": [1, 2, 1, 2, 2, 2], - "c": [2, 1, 2, 2, 3, 4], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all greater than the value of `4`. We'll - determine if this validation had any failing test units (there are six test units, one for - each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=4) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_gt()`. All test units passed, and there are no failing test units.
- - Aside from checking a column against a literal value, we can also use a column name in the - `value=` argument (with the helper function [`col()`](`pointblank.col`)) to perform a - column-to-column comparison. For the next example, we'll use `col_vals_gt()` to check - whether the values in column `c` are greater than values in column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="c", value=pb.col("b")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 1: `c` is `1` and `b` is `2`. - - Row 3: `c` is `2` and `b` is `2`. - - -col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data less than a fixed value or data in another column? - - The `col_vals_lt()` validation method checks whether column values in a table are - *less than* a specified `value=` (the exact comparison used in this function is - `col_val < value`). The `value=` can be specified as a single, literal value or as a column - name given in [`col()`](`pointblank.col`). This validation will operate over the number of - test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - value - The value to compare against.
This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison. For more information on which types of values are allowed, see the - *What Can Be Used in `value=`?* section. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple steps (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedances of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief.
If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `value=`? - ----------------------------- - The `value=` argument allows for a variety of input types. The most common are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column name - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value as the `value=` argument. There is flexibility in how - you provide the date or datetime value, as it can be: - - - a string-based date or datetime (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - a date or datetime object using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) - - Finally, when supplying a column name in the `value=` argument, it must be specified within - [`col()`](`pointblank.col`). This is a column-to-column comparison and, crucially, the - columns being compared must be of the same type (e.g., both numeric, both date, etc.). 
- - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `value=col(...)` that are expected to be present in the - transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). 
- - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of all test units that fail (a value between `0` and `1`) - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2.
provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 9, 7, 5], - "b": [1, 2, 1, 2, 2, 2], - "c": [2, 1, 1, 4, 3, 4], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all less than the value of `10`. We'll - determine if this validation had any failing test units (there are six test units, one for - each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_lt(columns="a", value=10) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_lt()`. All test units passed, and there are no failing test units. - - Aside from checking a column against a literal value, we can also use a column name in the - `value=` argument (with the helper function [`col()`](`pointblank.col`)) to perform a - column-to-column comparison.
For the next example, we'll use `col_vals_lt()` to check - whether the values in column `b` are less than values in column `c`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_lt(columns="b", value=pb.col("c")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 1: `b` is `2` and `c` is `1`. - - Row 2: `b` is `1` and `c` is `1`. - - -col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data greater than or equal to a fixed value or data in another column? - - The `col_vals_ge()` validation method checks whether column values in a table are - *greater than or equal to* a specified `value=` (the exact comparison used in this function - is `col_val >= value`). The `value=` can be specified as a single, literal value or as a - column name given in [`col()`](`pointblank.col`). This validation will operate over the - number of test units that is equal to the number of rows in the table (determined after any - `pre=` mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - value - The value to compare against. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison. 
For more information on which types of values are allowed, see the - *What Can Be Used in `value=`?* section. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple steps (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedances of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief.
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `value=`? - ----------------------------- - The `value=` argument allows for a variety of input types. The most common are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column name - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value as the `value=` argument. There is flexibility in how - you provide the date or datetime value, as it can be: - - - a string-based date or datetime (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - a date or datetime object using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) - - Finally, when supplying a column name in the `value=` argument, it must be specified within - [`col()`](`pointblank.col`). This is a column-to-column comparison and, crucially, the - columns being compared must be of the same type (e.g., both numeric, both date, etc.). 
- - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `value=col(...)` that are expected to be present in the - transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). 
- - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of all test units that fail (a value between `0` and `1`) - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2.
provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 9, 7, 5], - "b": [5, 3, 1, 8, 2, 3], - "c": [2, 3, 1, 4, 3, 4], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all greater than or equal to the value of `5`. - We'll determine if this validation had any failing test units (there are six test units, one - for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_ge(columns="a", value=5) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_ge()`. All test units passed, and there are no failing test units. - - Aside from checking a column against a literal value, we can also use a column name in the - `value=` argument (with the helper function [`col()`](`pointblank.col`)) to perform a - column-to-column comparison.
For the next example, we'll use `col_vals_ge()` to check - whether the values in column `b` are greater than or equal to the values in column `c`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_ge(columns="b", value=pb.col("c")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 4: `b` is `2` and `c` is `3`. - - Row 5: `b` is `3` and `c` is `4`. - - -col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data less than or equal to a fixed value or data in another column? - - The `col_vals_le()` validation method checks whether column values in a table are - *less than or equal to* a specified `value=` (the exact comparison used in this function is - `col_val <= value`). The `value=` can be specified as a single, literal value or as a column - name given in [`col()`](`pointblank.col`). This validation will operate over the number of - test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - value - The value to compare against. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison.
For more information on which types of values are allowed, see the - *What Can Be Used in `value=`?* section. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple steps (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedances of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief.
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `value=`? - ----------------------------- - The `value=` argument allows for a variety of input types. The most common are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column name - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value as the `value=` argument. There is flexibility in how - you provide the date or datetime value, as it can be: - - - a string-based date or datetime (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - a date or datetime object using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) - - Finally, when supplying a column name in the `value=` argument, it must be specified within - [`col()`](`pointblank.col`). This is a column-to-column comparison and, crucially, the - columns being compared must be of the same type (e.g., both numeric, both date, etc.). 
- - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `value=col(...)` that are expected to be present in the - transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). 
- - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of all test units that fail (a value between `0` and `1`) - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2.
provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 9, 7, 5], - "b": [1, 3, 1, 5, 2, 5], - "c": [2, 1, 1, 4, 3, 4], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all less than or equal to the value of `9`. - We'll determine if this validation had any failing test units (there are six test units, one - for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_le(columns="a", value=9) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_le()`. All test units passed, and there are no failing test units. - - Aside from checking a column against a literal value, we can also use a column name in the - `value=` argument (with the helper function [`col()`](`pointblank.col`)) to perform a - column-to-column comparison.
For the next example, we'll use `col_vals_le()` to check - whether the values in column `c` are less than or equal to the values in column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_le(columns="c", value=pb.col("b")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 0: `c` is `2` and `b` is `1`. - - Row 4: `c` is `3` and `b` is `2`. - - -col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data equal to a fixed value or data in another column? - - The `col_vals_eq()` validation method checks whether column values in a table are - *equal to* a specified `value=` (the exact comparison used in this function is - `col_val == value`). The `value=` can be specified as a single, literal value or as a column - name given in [`col()`](`pointblank.col`). This validation will operate over the number of - test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - value - The value to compare against. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison.
For more information on which types of values are allowed, see the - *What Can Be Used in `value=`?* section. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. 
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `value=`? - ----------------------------- - The `value=` argument allows for a variety of input types. The most common are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column name - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value as the `value=` argument. There is flexibility in how - you provide the date or datetime value, as it can be: - - - a string-based date or datetime (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - a date or datetime object using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) - - Finally, when supplying a column name in the `value=` argument, it must be specified within - [`col()`](`pointblank.col`). This is a column-to-column comparison and, crucially, the - columns being compared must be of the same type (e.g., both numeric, both date, etc.). 
- - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `value=col(...)` that are expected to be present in the - transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). 
- - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of all test units that fail (a value between `0` and `1`) - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2.
provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and - `b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 5, 5, 5, 5, 5], - "b": [5, 5, 5, 6, 5, 4], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all equal to the value of `5`. We'll determine - if this validation had any failing test units (there are six test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_eq(columns="a", value=5) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_eq()`. All test units passed, and there are no failing test units. - - Aside from checking a column against a literal value, we can also use a column name in the - `value=` argument (with the helper function [`col()`](`pointblank.col`)) to perform a - column-to-column comparison.
For the next example, we'll use `col_vals_eq()` to check - whether the values in column `a` are equal to the values in column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_eq(columns="a", value=pb.col("b")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 3: `a` is `5` and `b` is `6`. - - Row 5: `a` is `5` and `b` is `4`. - - -col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', value: 'float | int | Column', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data not equal to a fixed value or data in another column? - - The `col_vals_ne()` validation method checks whether column values in a table are - *not equal to* a specified `value=` (the exact comparison used in this function is - `col_val != value`). The `value=` can be specified as a single, literal value or as a column - name given in [`col()`](`pointblank.col`). This validation will operate over the number of - test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - value - The value to compare against. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison. 
For more information on which types of values are allowed, see the - *What Can Be Used in `value=`?* section. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. 
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `value=`? - ----------------------------- - The `value=` argument allows for a variety of input types. The most common are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column name - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value as the `value=` argument. There is flexibility in how - you provide the date or datetime value, as it can be: - - - a string-based date or datetime (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - a date or datetime object using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) - - Finally, when supplying a column name in the `value=` argument, it must be specified within - [`col()`](`pointblank.col`). This is a column-to-column comparison and, crucially, the - columns being compared must be of the same type (e.g., both numeric, both date, etc.). 
- - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `value=col(...)` that are expected to be present in the - transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). 
- - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of all test units that fail (a value between `0` and `1`) - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2.
provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and - `b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 5, 5, 5, 5, 5], - "b": [5, 6, 3, 6, 5, 8], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are not equal to the value of `3`. We'll determine - if this validation had any failing test units (there are six test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_ne(columns="a", value=3) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_ne()`. All test units passed, and there are no failing test units. - - Aside from checking a column against a literal value, we can also use a column name in the - `value=` argument (with the helper function [`col()`](`pointblank.col`)) to perform a - column-to-column comparison.
For the next example, we'll use `col_vals_ne()` to check - whether the values in column `a` aren't equal to the values in column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_ne(columns="a", value=pb.col("b")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are in rows - 0 and 4, where `a` is `5` and `b` is `5` in both cases (i.e., they are equal to each other). - - -col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Do column data lie between two specified values or data in other columns? - - The `col_vals_between()` validation method checks whether column values in a table fall - within a range. The range is specified with three arguments: `left=`, `right=`, and - `inclusive=`. The `left=` and `right=` values specify the lower and upper bounds. These - bounds can be specified as literal values or as column names provided within - [`col()`](`pointblank.col`). The validation will operate over the number of test units that - is equal to the number of rows in the table (determined after any `pre=` mutation has been - applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - left - The lower bound of the range. 
This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison for this bound. See the *What Can Be Used in `left=` and `right=`?* section - for details on this. - right - The upper bound of the range. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison for this bound. See the *What Can Be Used in `left=` and `right=`?* section - for details on this. - inclusive - A tuple of two boolean values indicating whether the comparison should be inclusive. The - positions of the boolean values correspond to the `left=` and `right=` values, - respectively. By default, both values are `True`. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect.
Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `left=` and `right=`? - ----------------------------------------- - The `left=` and `right=` arguments both allow for a variety of input types. The most common - are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column in the target table - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value within `left=` and `right=`. 
There is flexibility in how - you provide the date or datetime values for the bounds; they can be: - - - string-based dates or datetimes (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - date or datetime objects using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) - - Finally, when supplying a column name in either `left=` or `right=` (or both), it must be - specified within [`col()`](`pointblank.col`). This facilitates column-to-column comparisons - and, crucially, the columns being compared to either/both of the bounds must be of the same - type as the column data (e.g., all numeric, all dates, etc.). - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `left=col(...)`/`right=col(...)` that are expected to be present - in the transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. 
- - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. 
If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or as the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [2, 3, 2, 4, 3, 4], - "b": [5, 6, 1, 6, 8, 5], - "c": [9, 8, 8, 7, 7, 8], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all between the fixed boundary values of `1` - and `5`. We'll determine if this validation had any failing test units (there are six test - units, one for each row).
- - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_between(columns="a", left=1, right=5) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_between()`. All test units passed, and there are no failing test units. - - Aside from checking a column against two literal values representing the lower and upper - bounds, we can also provide column names to the `left=` and/or `right=` arguments (by using - the helper function [`col()`](`pointblank.col`)). In this way, we can perform three - additional comparison types: - - 1. `left=column`, `right=column` - 2. `left=literal`, `right=column` - 3. `left=column`, `right=literal` - - For the next example, we'll use `col_vals_between()` to check whether the values in column - `b` are between the corresponding values in columns `a` (lower bound) and `c` (upper - bound). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_between(columns="b", left=pb.col("a"), right=pb.col("c")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 2: `b` is `1` but the bounds are `2` (`a`) and `8` (`c`). - - Row 4: `b` is `8` but the bounds are `3` (`a`) and `7` (`c`).
- - -col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', left: 'float | int | Column', right: 'float | int | Column', inclusive: 'tuple[bool, bool]' = (True, True), na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Do column data lie outside of two specified values or data in other columns? - - The `col_vals_outside()` validation method checks whether column values in a table *do not* - fall within a certain range. The range is specified with three arguments: `left=`, `right=`, - and `inclusive=`. The `left=` and `right=` values specify the lower and upper bounds. These - bounds can be specified as literal values or as column names provided within - [`col()`](`pointblank.col`). The validation will operate over the number of test units that - is equal to the number of rows in the table (determined after any `pre=` mutation has been - applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - left - The lower bound of the range. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison for this bound. See the *What Can Be Used in `left=` and `right=`?* section - for details on this. - right - The upper bound of the range. This can be a single value or a single column name given - in [`col()`](`pointblank.col`). The latter option allows for a column-to-column - comparison for this bound.
See the *What Can Be Used in `left=` and `right=`?* section - for details on this. - inclusive - A tuple of two boolean values indicating whether the comparison should be inclusive. The - position of the boolean values correspond to the `left=` and `right=` values, - respectively. By default, both values are `True`. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. 
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - What Can Be Used in `left=` and `right=`? - ----------------------------------------- - The `left=` and `right=` arguments both allow for a variety of input types. The most common - are: - - - a single numeric value - - a single date or datetime value - - A [`col()`](`pointblank.col`) object that represents a column in the target table - - When supplying a number as the basis of comparison, keep in mind that all resolved columns - must also be numeric. Should you have columns that are of the date or datetime types, you - can supply a date or datetime value within `left=` and `right=`. There is flexibility in how - you provide the date or datetime values for the bounds; they can be: - - - string-based dates or datetimes (e.g., `"2023-10-01"`, `"2023-10-01 13:45:30"`, etc.) - - date or datetime objects using the `datetime` module (e.g., `datetime.date(2023, 10, 1)`, - `datetime.datetime(2023, 10, 1, 13, 45, 30)`, etc.) 
- - Finally, when supplying a column name in either `left=` or `right=` (or both), it must be - specified within [`col()`](`pointblank.col`). This facilitates column-to-column comparisons - and, crucially, the columns being compared to either/both of the bounds must be of the same - type as the column data (e.g., all numeric, all dates, etc.). - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns=` and `left=col(...)`/`right=col(...)` that are expected to be present - in the transformed table, but may not exist in the table before preprocessing. Regarding the - lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. 
- - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or as the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 7, 5, 5], - "b": [2, 3, 6, 4, 3, 6], - "c": [9, 8, 8, 9, 9, 7], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all outside the fixed boundary values of `1` - and `4`. We'll determine if this validation had any failing test units (there are six test - units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_outside(columns="a", left=1, right=4) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment.
- The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_outside()`. All test units passed, and there are no failing test units. - - Aside from checking a column against two literal values representing the lower and upper - bounds, we can also provide column names to the `left=` and/or `right=` arguments (by using - the helper function [`col()`](`pointblank.col`)). In this way, we can perform three - additional comparison types: - - 1. `left=column`, `right=column` - 2. `left=literal`, `right=column` - 3. `left=column`, `right=literal` - - For the next example, we'll use `col_vals_outside()` to check whether the values in column - `b` are outside of the range formed by the corresponding values in columns `a` (lower bound) - and `c` (upper bound). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_outside(columns="b", left=pb.col("a"), right=pb.col("c")) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are: - - - Row 2: `b` is `6` and the bounds are `5` (`a`) and `8` (`c`). - - Row 5: `b` is `6` and the bounds are `5` (`a`) and `7` (`c`). - - -col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether column values are in a set of values. - - The `col_vals_in_set()` validation method checks whether column values in a table are part - of a specified `set=` of values. This validation will operate over the number of test units - that is equal to the number of rows in the table (determined after any `pre=` mutation has - been applied).
- - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - set - A collection of values to compare against. Can be a list of values, a Python Enum class, - or a collection containing Enum instances. When an Enum class is provided, all enum - values will be used. When a collection contains Enum instances, their values will be - extracted automatically. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. 
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - a column via `columns=` that is expected to be present in the transformed table, but may not - exist in the table before preprocessing. Regarding the lifetime of the transformed table, it - only exists during the validation step and is not stored in the `Validate` object or used in - subsequent validation steps. 
- - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. 
Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or as the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and - `b`).
The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 2, 4, 6, 2, 5], - "b": [5, 8, 2, 6, 5, 1], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all in the set of `[2, 3, 4, 5, 6]`. We'll - determine if this validation had any failing test units (there are six test units, one for - each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_in_set(columns="a", set=[2, 3, 4, 5, 6]) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_in_set()`. All test units passed, and there are no failing test units. - - Now, let's use that same set of values for a validation on column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_in_set(columns="b", set=[2, 3, 4, 5, 6]) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are for the - column `b` values of `8` and `1`, which are not in the set of `[2, 3, 4, 5, 6]`. - - **Using Python Enums** - - The `col_vals_in_set()` method also supports Python Enum classes and instances, which can - make validations more readable and maintainable: - - ```python - from enum import Enum - - class Color(Enum): - RED = "red" - GREEN = "green" - BLUE = "blue" - - # Create a table with color data - tbl_colors = pl.DataFrame({ - "product": ["shirt", "pants", "hat", "shoes"], - "color": ["red", "blue", "green", "yellow"] - }) - - # Validate using an Enum class (all enum values are allowed) - validation = ( - pb.Validate(data=tbl_colors) - .col_vals_in_set(columns="color", set=Color) - .interrogate() - ) - - validation - ``` - - This validation will fail for the `"yellow"` value since it's not in the `Color` enum. 
- - You can also use specific Enum instances or mix them with regular values: - - ```python - # Validate using specific Enum instances - validation = ( - pb.Validate(data=tbl_colors) - .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE]) - .interrogate() - ) - - # Mix Enum instances with regular values - validation = ( - pb.Validate(data=tbl_colors) - .col_vals_in_set(columns="color", set=[Color.RED, Color.BLUE, "yellow"]) - .interrogate() - ) - - validation - ``` - - In this case, the `"green"` value will cause a failing test unit since it's not part of the - specified set. - - -col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', set: 'Collection[Any]', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether column values are not in a set of values. - - The `col_vals_not_in_set()` validation method checks whether column values in a table are - *not* part of a specified `set=` of values. This validation will operate over the number of - test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - set - A collection of values to compare against. Can be a list of values, a Python Enum class, - or a collection containing Enum instances. When an Enum class is provided, all enum - values will be used. When a collection contains Enum instances, their values will be - extracted automatically. 
- pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. 
The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - a column via `columns=` that is expected to be present in the transformed table, but may not - exist in the table before preprocessing. Regarding the lifetime of the transformed table, it - only exists during the validation step and is not stored in the `Validate` object or used in - subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. 
- - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values - can either be set as a proportion of failing test units (a value between `0` and `1`), - or as the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. provide a single integer/float value denoting the absolute number or fraction of failing - test units for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and - `b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 8, 1, 9, 1, 7], - "b": [1, 8, 2, 6, 9, 1], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that none of the values in column `a` are in the set of `[2, 3, 4, 5, 6]`. - We'll determine if this validation had any failing test units (there are six test units, one - for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_not_in_set(columns="a", set=[2, 3, 4, 5, 6]) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. 
- The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_not_in_set()`. All test units passed, and there are no failing test - units. - - Now, let's use that same set of values for a validation on column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_not_in_set(columns="b", set=[2, 3, 4, 5, 6]) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are for the - column `b` values of `2` and `6`, both of which are in the set of `[2, 3, 4, 5, 6]`. - - **Using Python Enums** - - Like `col_vals_in_set()`, this method also supports Python Enum classes and instances: - - ```python - from enum import Enum - - class InvalidStatus(Enum): - DELETED = "deleted" - ARCHIVED = "archived" - - # Create a table with status data - status_table = pl.DataFrame({ - "product": ["widget", "gadget", "tool", "device"], - "status": ["active", "pending", "deleted", "active"] - }) - - # Validate that no values are in the invalid status set - validation = ( - pb.Validate(data=status_table) - .col_vals_not_in_set(columns="status", set=InvalidStatus) - .interrogate() - ) - - validation - ``` - - The `"deleted"` value in the `status` column will fail since it matches one of the invalid - statuses in the `InvalidStatus` enum. - - -col_vals_increasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, decreasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data increasing by row? 
- - The `col_vals_increasing()` validation method checks whether column values in a table are - increasing when moving down a table. There are options for allowing missing values in the - target column, allowing stationary phases (where consecutive values don't change), and even - one for allowing decreasing movements up to a certain threshold. This validation will - operate over the number of test units that is equal to the number of rows in the table - (determined after any `pre=` mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - allow_stationary - An option to allow pauses in increasing values. For example, if the values for the test - units are `[80, 82, 82, 85, 88]` then the third unit (`82`, appearing a second time) - would be marked as failing when `allow_stationary` is `False`. Using - `allow_stationary=True` will result in all the test units in `[80, 82, 82, 85, 88]` to - be marked as passing. - decreasing_tol - An optional threshold value that allows for movement of numerical values in the negative - direction. By default this is `None` but using a numerical value will set the absolute - threshold of negative travel allowed across numerical test units. Note that setting a - value here also has the effect of setting `allow_stationary` to `True`. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. 
- Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. 
Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [1, 2, 2, 3, 4, 5], - "c": [1, 2, 1, 3, 4, 5], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are increasing. We'll determine if this validation - had any failing test units (there are six test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_increasing(columns="a") - .interrogate() - ) - - validation - ``` - - The validation passed as all values in column `a` are increasing. Now let's check column - `b` which has a stationary value: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_increasing(columns="b") - .interrogate() - ) - - validation - ``` - - This validation fails at the third row because the value `2` is repeated. 
If we want to - allow stationary values, we can use `allow_stationary=True`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_increasing(columns="b", allow_stationary=True) - .interrogate() - ) - - validation - ``` - - -col_vals_decreasing(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', allow_stationary: 'bool' = False, increasing_tol: 'float | None' = None, na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Are column data decreasing by row? - - The `col_vals_decreasing()` validation method checks whether column values in a table are - decreasing when moving down a table. There are options for allowing missing values in the - target column, allowing stationary phases (where consecutive values don't change), and even - one for allowing increasing movements up to a certain threshold. This validation will - operate over the number of test units that is equal to the number of rows in the table - (determined after any `pre=` mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - allow_stationary - An option to allow pauses in decreasing values. For example, if the values for the test - units are `[88, 85, 85, 82, 80]` then the third unit (`85`, appearing a second time) - would be marked as failing when `allow_stationary` is `False`. Using - `allow_stationary=True` will result in all the test units in `[88, 85, 85, 82, 80]` to - be marked as passing. 
- increasing_tol - An optional threshold value that allows for movement of numerical values in the positive - direction. By default this is `None` but using a numerical value will set the absolute - threshold of positive travel allowed across numerical test units. Note that setting a - value here also has the effect of setting `allow_stationary` to `True`. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. 
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [6, 5, 4, 3, 2, 1], - "b": [5, 4, 4, 3, 2, 1], - "c": [5, 4, 5, 3, 2, 1], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are decreasing. We'll determine if this validation - had any failing test units (there are six test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_decreasing(columns="a") - .interrogate() - ) - - validation - ``` - - The validation passed as all values in column `a` are decreasing. Now let's check column - `b` which has a stationary value: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_decreasing(columns="b") - .interrogate() - ) - - validation - ``` - - This validation fails at the third row because the value `4` is repeated. 
If we want to - allow stationary values, we can use `allow_stationary=True`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_decreasing(columns="b", allow_stationary=True) - .interrogate() - ) - - validation - ``` - - -col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether values in a column are Null. - - The `col_vals_null()` validation method checks whether column values in a table are Null. - This validation will operate over the number of test units that is equal to the number - of rows in the table. - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. 
- The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. 
- - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - a column via `columns=` that is expected to be present in the transformed table, but may not - exist in the table before preprocessing. Regarding the lifetime of the transformed table, it - only exists during the validation step and is not stored in the `Validate` object or used in - subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. 
The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` to `1`), - or, the absolute number of failing test units (as integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. 
provide a single integer/float value denoting the absolute number or fraction of failing - test units for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and - `b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [None, None, None, None], - "b": [None, 2, None, 9], - } - ).with_columns(pl.col("a").cast(pl.Int64)) - - pb.preview(tbl) - ``` - - Let's validate that values in column `a` are all Null values. We'll determine if this - validation had any failing test units (there are four test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_null(columns="a") - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_null()`. All test units passed, and there are no failing test units. - - Now, let's run the same validation on column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_null(columns="b") - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are for the - two non-Null values in column `b`. 
- - -col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether values in a column are not Null. - - The `col_vals_not_null()` validation method checks whether column values in a table are not - Null. This validation will operate over the number of test units that is equal to the number - of rows in the table. - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. 
- actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. 
Note that you can refer to - a column via `columns=` that is expected to be present in the transformed table, but may not - exist in the table before preprocessing. Regarding the lifetime of the transformed table, it - only exists during the validation step and is not stored in the `Validate` object or used in - subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. 
Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of test units that fail (a value between `0` and `1`), - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them.
- - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two numeric columns (`a` and - `b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [4, 7, 2, 8], - "b": [5, None, 1, None], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that none of the values in column `a` are Null values. We'll determine if - this validation had any failing test units (there are four test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_not_null(columns="a") - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_not_null()`. All test units passed, and there are no failing test units. - - Now, let's use that same set of values for a validation on column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_not_null(columns="b") - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are for the - two Null values in column `b`. - - -col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pattern: 'str', na_pass: 'bool' = False, inverse: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether column values match a regular expression pattern. 
- - The `col_vals_regex()` validation method checks whether column values in a table - correspond to a `pattern=` matching expression. This validation will operate over the number - of test units that is equal to the number of rows in the table (determined after any `pre=` - mutation has been applied). - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - pattern - A regular expression pattern to compare against. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - inverse - Should the validation step be inverted? If `True`, then the expectation is that column - values should *not* match the specified `pattern=` regex. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. 
Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. 
Note that you can refer to - a column via `columns=` that is expected to be present in the transformed table, but may not - exist in the table before preprocessing. Regarding the lifetime of the transformed table, it - only exists during the validation step and is not stored in the `Validate` object or used in - subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. 
Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of test units that fail (a value between `0` and `1`), - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them.
- - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with two string columns (`a` and - `b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": ["rb-0343", "ra-0232", "ry-0954", "rc-1343"], - "b": ["ra-0628", "ra-583", "rya-0826", "rb-0735"], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that all of the values in column `a` match a particular regex pattern. We'll - determine if this validation had any failing test units (there are four test units, one for - each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_regex(columns="a", pattern=r"r[a-z]-[0-9]{4}") - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_regex()`. All test units passed, and there are no failing test units. - - Now, let's use the same regex for a validation on column `b`. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_regex(columns="b", pattern=r"r[a-z]-[0-9]{4}") - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The specific failing cases are for the - string values of rows 1 and 2 in column `b`. 
- - -col_vals_within_spec(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', spec: 'str', na_pass: 'bool' = False, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether column values fit within a specification. - - The `col_vals_within_spec()` validation method checks whether column values in a table - correspond to a specification (`spec=`) type (details of which are available in the - *Specifications* section). Specifications include common data types like email addresses, - URLs, postal codes, vehicle identification numbers (VINs), International Bank Account - Numbers (IBANs), and more. This validation will operate over the number of test units that - is equal to the number of rows in the table. - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - spec - A specification string for defining the specification type. Examples are `"email"`, - `"url"`, and `"postal_code[USA]"`. See the *Specifications* section for all available - options. - na_pass - Should any encountered None, NA, or Null values be considered as passing test units? By - default, this is `False`. Set to `True` to pass test units with missing values. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. 
- segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. 
- - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Specifications - -------------- - A specification type must be used with the `spec=` argument. This is a string-based keyword - that corresponds to the type of data in the specified columns. The following keywords can - be used: - - - `"isbn"`: The International Standard Book Number (ISBN) is a unique numerical identifier - for books. This keyword validates both 10-digit and 13-digit ISBNs. - - - `"vin"`: A vehicle identification number (VIN) is a unique code used by the automotive - industry to identify individual motor vehicles. - - - `"postal_code[<country_code>]"`: A postal code (also known as a postcode, PIN, or ZIP - code) is a series of letters, digits, or both included in a postal address. Because the - coding varies by country, a country code in either the 2-letter (ISO 3166-1 alpha-2) or - 3-letter (ISO 3166-1 alpha-3) format needs to be supplied (e.g., `"postal_code[US]"` or - `"postal_code[USA]"`). The keyword alias `"zip"` can be used for US ZIP codes. - - - `"credit_card"`: A credit card number can be validated across a variety of issuers. The - validation uses the Luhn algorithm. - - - `"iban[<country_code>]"`: The International Bank Account Number (IBAN) is a system of - identifying bank accounts across countries. Because the length and coding vary by - country, a country code needs to be supplied (e.g., `"iban[DE]"` or `"iban[DEU]"`). - - - `"swift"`: Business Identifier Codes (also known as SWIFT-BIC, BIC, or SWIFT code) are - unique identifiers for financial and non-financial institutions. - - - `"phone"`, `"email"`, `"url"`, `"ipv4"`, `"ipv6"`, `"mac"`: Phone numbers, email - addresses, Internet URLs, IPv4 or IPv6 addresses, and MAC addresses can be validated with - their respective keywords. - - Only a single `spec=` value should be provided per function call.
- - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - a column via `columns=` that is expected to be present in the transformed table, but may not - exist in the table before preprocessing. Regarding the lifetime of the transformed table, it - only exists during the validation step and is not stored in the `Validate` object or used in - subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). 
- - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can be set either as the proportion of test units that fail (a value between `0` and `1`), - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2.
provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with an email column. The table - is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "email": [ - "user@example.com", - "admin@test.org", - "invalid-email", - "contact@company.co.uk", - ], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that all of the values in the `email` column are valid email addresses. - We'll determine if this validation had any failing test units (there are four test units, - one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_within_spec(columns="email", spec="email") - .interrogate() - ) - - validation - ``` - - The validation table shows that one test unit failed (the invalid email address in row 3). - - -col_vals_expr(self, expr: 'Any', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate column values using a custom expression.
- - The `col_vals_expr()` validation method checks whether column values in a table satisfy a - custom `expr=` expression. This validation will operate over the number of test units that - is equal to the number of rows in the table (determined after any `pre=` mutation has been - applied). - - Parameters - ---------- - expr - A column expression that will evaluate each row in the table, returning a boolean value - per table row. If the target table is a Polars DataFrame, the expression should either - be a Polars column expression or a Narwhals one. For a Pandas DataFrame, the expression - should either be a lambda expression or a Narwhals column expression. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. 
- brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Regarding the lifetime of the - transformed table, it only exists during the validation step and is not stored in the - `Validate` object or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. 
This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. 
- - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as the proportion of all test units that fail (a value between `0` and `1`), - or as the absolute number of failing test units (an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 1, 7, 8, 6], - "b": [0, 0, 0, 1, 1, 1], - "c": [0.5, 0.3, 0.8, 1.4, 1.9, 1.2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the values in column `a` are all integers. 
We'll determine if this - validation had any failing test units (there are six test units, one for each row). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_expr(expr=pl.col("a") % 1 == 0) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_vals_expr()`. All test units passed, with no failing test units. - - -col_sum_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column sum satisfy a greater than comparison? - - The `col_sum_gt()` validation method checks whether the sum of values in a column - is greater than a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single sum value that is then compared against the target. The - comparison used in this function is `sum(column) > value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the sum to be computed. - value - The value to compare the column sum against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - the number of failing test units reaches a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. 
Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sum_gt()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sum_gt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sum_gt(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_gt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. 
- - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the sum of column `a` is greater than `15`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_gt(columns="a", value=15) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the sum comparison passed or failed. 
Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_gt(columns=["a", "b"], value=15) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_gt(columns="a", value=15, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sum_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column sum satisfy a less than comparison? - - The `col_sum_lt()` validation method checks whether the sum of values in a column - is less than a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single sum value that is then compared against the target. The - comparison used in this function is `sum(column) < value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the sum to be computed. - value - The value to compare the column sum against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - the number of failing test units reaches a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. 
Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sum_lt()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sum_lt(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sum_lt(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_lt()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. 
- - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the sum of column `a` is less than `15`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_lt(columns="a", value=15) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the sum comparison passed or failed. 
Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_lt(columns=["a", "b"], value=15) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_lt(columns="a", value=15, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sum_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column sum satisfy a greater than or equal to comparison? - - The `col_sum_ge()` validation method checks whether the sum of values in a column - is at least a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single sum value that is then compared against the target. The - comparison used in this function is `sum(column) >= value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the sum to be computed. - value - The value to compare the column sum against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - the number of failing test units reaches a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. 
Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sum_ge()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sum_ge(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sum_ge(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_ge()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. 
- - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the sum of column `a` is at least `15`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_ge(columns="a", value=15) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the sum comparison passed or failed. 
Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_ge(columns=["a", "b"], value=15) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_ge(columns="a", value=15, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sum_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column sum satisfy a less than or equal to comparison? - - The `col_sum_le()` validation method checks whether the sum of values in a column - is at most a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single sum value that is then compared against the target. The - comparison used in this function is `sum(column) <= value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the sum to be computed. - value - The value to compare the column sum against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - the number of failing test units reaches a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. 
Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sum_le()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sum_le(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sum_le(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_sum_le()`, a tolerance of `tol=0.5` would mean the sum can be within `0.5` of the - target value and still pass validation. 
- - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting the absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the sum of column `a` is at most `15`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_le(columns="a", value=15) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the sum comparison passed or failed. 
Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_le(columns=["a", "b"], value=15) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_le(columns="a", value=15, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sum_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column sum satisfy an equal to comparison? - - The `col_sum_eq()` validation method checks whether the sum of values in a column - equals a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single sum value that is then compared against the target. The - comparison used in this function is `sum(column) == value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the sum to be computed. - value - The value to compare the column sum against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose sum will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a sum that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality - comparisons on floating-point aggregations can be problematic due to numerical precision. - Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from - floating-point arithmetic. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions.
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sum_eq()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sum_eq(columns="revenue") # Compares sum(current.revenue) vs sum(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sum_eq(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. 
- - The `tol=` parameter is particularly useful with `col_sum_eq()` since exact equality - comparisons on floating-point aggregations can be problematic due to numerical precision. - Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from - floating-point arithmetic. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. 
The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the sum of column `a` equals `15`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_eq(columns="a", value=15) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the sum comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_eq(columns=["a", "b"], value=15) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sum_eq(columns="a", value=15, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_avg_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column average satisfy a greater than comparison? - - The `col_avg_gt()` validation method checks whether the average of values in a column - is greater than a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single average value that is then compared against the target. The - comparison used in this function is `average(column) > value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. 
- - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the average to be computed. - value - The value to compare the column average against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_gt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief.
- actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_avg_gt()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_avg_gt(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data.
You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_avg_gt(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_gt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. 
a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the average of column `a` is greater than `3`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_gt(columns="a", value=3) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the average comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_gt(columns=["a", "b"], value=3) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_gt(columns="a", value=3, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_avg_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column average satisfy a less than comparison? - - The `col_avg_lt()` validation method checks whether the average of values in a column - is less than a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single average value that is then compared against the target. The - comparison used in this function is `average(column) < value`. 
- - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the average to be computed. - value - The value to compare the column average against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_lt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table.
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_avg_lt()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_avg_lt(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data.
You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_avg_lt(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_lt()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. 
a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the average of column `a` is less than `3`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_lt(columns="a", value=3) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the average comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_lt(columns=["a", "b"], value=3) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_lt(columns="a", value=3, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_avg_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column average satisfy a greater than or equal to comparison? - - The `col_avg_ge()` validation method checks whether the average of values in a column - is at least a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single average value that is then compared against the target. The - comparison used in this function is `average(column) >= value`. 
- - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the average to be computed. - value - The value to compare the column average against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_ge()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a set level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table.
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_avg_ge()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_avg_ge(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data.
You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_avg_ge(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_ge()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. 
a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the average of column `a` is at least `3`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_ge(columns="a", value=3) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the average comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_ge(columns=["a", "b"], value=3) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_ge(columns="a", value=3, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_avg_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column average satisfy a less than or equal to comparison? - - The `col_avg_le()` validation method checks whether the average of values in a column - is at most a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single average value that is then compared against the target. The - comparison used in this function is `average(column) <= value`. 
- - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the average to be computed. - value - The value to compare the column average against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_le()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table.
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_avg_le()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_avg_le(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data.
You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_avg_le(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_avg_le()`, a tolerance of `tol=0.5` would mean the average can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4.
a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the average of column `a` is at most `3`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_le(columns="a", value=3) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the average comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_le(columns=["a", "b"], value=3) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_le(columns="a", value=3, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_avg_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column average satisfy an equal to comparison? - - The `col_avg_eq()` validation method checks whether the average of values in a column - equals a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single average value that is then compared against the target. The - comparison used in this function is `average(column) == value`. 
- - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the average to be computed. - value - The value to compare the column average against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose average will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - an average that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality - comparisons on floating-point aggregations can be problematic due to numerical precision. - Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from - floating-point arithmetic. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable.
- brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_avg_eq()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. 
- - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_avg_eq(columns="revenue") # Compares avg(current.revenue) vs avg(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_avg_eq(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter is particularly useful with `col_avg_eq()` since exact equality - comparisons on floating-point aggregations can be problematic due to numerical precision. - Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from - floating-point arithmetic. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'.
Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the average of column `a` equals `3`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_eq(columns="a", value=3) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the average comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column.
- - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_eq(columns=["a", "b"], value=3) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_avg_eq(columns="a", value=3, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sd_gt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column standard deviation satisfy a greater than comparison? - - The `col_sd_gt()` validation method checks whether the standard deviation of values in a column - is greater than a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single standard deviation value that is then compared against the target. The - comparison used in this function is `standard deviation(column) > value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the standard deviation to be computed. - value - The value to compare the column standard deviation against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sd_gt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions.
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sd_gt()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sd_gt(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sd_gt(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison.
For - `col_sd_gt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns.
The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the standard deviation of column `a` is greater than `2`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_gt(columns="a", value=2) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the standard deviation comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_gt(columns=["a", "b"], value=2) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_gt(columns="a", value=2, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sd_lt(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column standard deviation satisfy a less than comparison? - - The `col_sd_lt()` validation method checks whether the standard deviation of values in a column - is less than a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single standard deviation value that is then compared against the target. The - comparison used in this function is `standard deviation(column) < value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. 
The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the standard deviation to be computed. - value - The value to compare the column standard deviation against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sd_lt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table.
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sd_lt()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sd_lt(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data.
You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sd_lt(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_sd_lt()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4.
a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the standard deviation of column `a` is less than `2`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_lt(columns="a", value=2) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the standard deviation comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_lt(columns=["a", "b"], value=2) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_lt(columns="a", value=2, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sd_ge(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column standard deviation satisfy a greater than or equal to comparison? - - The `col_sd_ge()` validation method checks whether the standard deviation of values in a column - is at least a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single standard deviation value that is then compared against the target. 
The - comparison used in this function is `standard deviation(column) >= value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the standard deviation to be computed. - value - The value to compare the column standard deviation against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sd_ge()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable.
- brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sd_ge()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. 
- - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sd_ge(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sd_ge(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. For - `col_sd_ge()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. 
use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the standard deviation of column `a` is at least `2`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_ge(columns="a", value=2) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the standard deviation comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. 
- - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_ge(columns=["a", "b"], value=2) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_ge(columns="a", value=2, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sd_le(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column standard deviation satisfy a less than or equal to comparison? - - The `col_sd_le()` validation method checks whether the standard deviation of values in a column - is at most a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single standard deviation value that is then compared against the target. The - comparison used in this function is `standard deviation(column) <= value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the standard deviation to be computed. - value - The value to compare the column standard deviation against. 
This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter expands the acceptable range for the comparison. For - `col_sd_le()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. 
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sd_le()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sd_le(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sd_le(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter expands the acceptable range for the comparison. 
For - `col_sd_le()`, a tolerance of `tol=0.5` would mean the standard deviation can be within `0.5` of the - target value and still pass validation. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. 
The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the standard deviation of column `a` is at most `2`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_le(columns="a", value=2) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the standard deviation comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_le(columns=["a", "b"], value=2) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_le(columns="a", value=2, tol=1.0) - .interrogate() - ) - - validation - ``` - -col_sd_eq(self: 'Validate', columns: 'str | Collection[str]', value: 'float | int | Column | ReferenceColumn | None' = None, tol: 'float' = 0, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, brief: 'str | bool | None' = None, actions: 'Actions | None' = None, active: 'bool | Callable' = True) -> 'Validate' -Does the column standard deviation satisfy an equal to comparison? - - The `col_sd_eq()` validation method checks whether the standard deviation of values in a column - equals a specified `value=`. This is an aggregation-based validation where the entire - column is reduced to a single standard deviation value that is then compared against the target. The - comparison used in this function is `standard deviation(column) == value`. - - Unlike row-level validations (e.g., `col_vals_gt()`), this method treats the entire column as - a single test unit. 
The validation either passes completely (if the aggregated value satisfies - the comparison) or fails completely. - - Parameters - ---------- - columns - A single column or a list of columns to validate. If multiple columns are supplied, - there will be a separate validation step generated for each column. The columns must - contain numeric data for the standard deviation to be computed. - value - The value to compare the column standard deviation against. This can be: (1) a numeric literal - (`int` or `float`), (2) a [`col()`](`pointblank.col`) object referencing another column - whose standard deviation will be used for comparison, (3) a [`ref()`](`pointblank.ref`) object - referencing a column in reference data (when `Validate(reference=)` has been set), or (4) - `None` to automatically compare against the same column in reference data (shorthand for - `ref(column_name)` when reference data is set). - tol - A tolerance value for the comparison. The default is `0`, meaning exact comparison. When - set to a positive value, the comparison becomes more lenient. For example, with `tol=0.5`, - a standard deviation that differs from the target by up to `0.5` will still pass. The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality - comparisons on floating-point aggregations can be problematic due to numerical precision. - Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from - floating-point arithmetic. - thresholds - Failure threshold levels so that the validation step can react accordingly when - failing test units reach a threshold level. Since this is an aggregation-based validation with only - one test unit, threshold values typically should be set as absolute counts (e.g., `1`) to - indicate pass/fail, or as proportions where any value less than `1.0` means failure is - acceptable. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. 
You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. - Inspection functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate - a step based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Using Reference Data - -------------------- - The `col_sd_eq()` method supports comparing column aggregations against reference data. This - is useful for validating that statistical properties remain consistent across different - versions of a dataset, or for comparing current data against historical baselines. - - To use reference data, set the `reference=` parameter when creating the `Validate` object: - - ```python - validation = ( - pb.Validate(data=current_data, reference=baseline_data) - .col_sd_eq(columns="revenue") # Compares sd(current.revenue) vs sd(baseline.revenue) - .interrogate() - ) - ``` - - When `value=None` and reference data is set, the method automatically compares against the - same column in the reference data. 
You can also explicitly specify reference columns using - the `ref()` helper: - - ```python - .col_sd_eq(columns="revenue", value=pb.ref("baseline_revenue")) - ``` - - Understanding Tolerance - ----------------------- - The `tol=` parameter allows for fuzzy comparisons, which is especially important for - floating-point aggregations where exact equality is often unreliable. - - The `tol=` parameter is particularly useful with `col_sd_eq()` since exact equality - comparisons on floating-point aggregations can be problematic due to numerical precision. - Setting a small tolerance (e.g., `tol=0.001`) allows for minor differences that arise from - floating-point arithmetic. - - For equality comparisons (`col_*_eq`), the tolerance creates a range `[value - tol, value + tol]` - within which the aggregation is considered valid. For inequality comparisons, the tolerance - shifts the comparison boundary. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. Since aggregation - validations operate on a single test unit (the aggregated value), threshold values are - typically set as absolute counts: - - - `thresholds=1` means any failure triggers a 'warning' - - `thresholds=(1, 1, 1)` means any failure triggers all three levels - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. 
a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - Examples - -------- - For the examples, we'll use a simple Polars DataFrame with numeric columns. The table is - shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [2, 2, 2, 2, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the standard deviation of column `a` equals `2`: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_eq(columns="a", value=2) - .interrogate() - ) - - validation - ``` - - The validation result shows whether the standard deviation comparison passed or failed. Since this - is an aggregation-based validation, there is exactly one test unit per column. - - When validating multiple columns, each column gets its own validation step: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_eq(columns=["a", "b"], value=2) - .interrogate() - ) - - validation - ``` - - Using tolerance for flexible comparisons: - - ```python - validation = ( - pb.Validate(data=tbl) - .col_sd_eq(columns="a", value=2, tol=1.0) - .interrogate() - ) - - validation - ``` - -rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether rows in the table are distinct. - - The `rows_distinct()` method checks whether rows in the table are distinct. This validation - will operate over the number of test units that is equal to the number of rows in the table - (determined after any `pre=` mutation has been applied). - - Parameters - ---------- - columns_subset - A single column or a list of columns to use as a subset for the distinct comparison. 
- If `None`, then all columns in the table will be used for the comparison. If multiple - columns are supplied, the distinct comparison will be made over the combination of - values in those columns. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. 
Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns_subset=` that are expected to be present in the transformed table, but - may not exist in the table before preprocessing. Regarding the lifetime of the transformed - table, it only exists during the validation step and is not stored in the `Validate` object - or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. 
For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. - - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. 
- - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as the proportion of all test units that fail (a value between `0` and `1`), - or as the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three string columns - (`col_1`, `col_2`, and `col_3`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "col_1": ["a", "b", "c", "d"], - "col_2": ["a", "a", "c", "d"], - "col_3": ["a", "a", "d", "e"], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the rows in the table are distinct with `rows_distinct()`. We'll - determine if this validation had any failing test units (there are four test units, one for - each row). A failing test unit means that a given row is not distinct from every other row. 
- - ```python - validation = ( - pb.Validate(data=tbl) - .rows_distinct() - .interrogate() - ) - - validation - ``` - - From this validation table we see that there are no failing test units. All rows in the - table are distinct from one another. - - We can also use a subset of columns to determine distinctness. Let's specify the subset - using columns `col_2` and `col_3` for the next validation. - - ```python - validation = ( - pb.Validate(data=tbl) - .rows_distinct(columns_subset=["col_2", "col_3"]) - .interrogate() - ) - - validation - ``` - - The validation table reports two failing test units. The first and second rows are - duplicated when considering only the values in columns `col_2` and `col_3`. There's only - one set of duplicates but there are two failing test units since each row is compared to all - others. - - -rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether row data are complete by having no missing values. - - The `rows_complete()` method checks whether rows in the table are complete. Completeness - of a row means that there are no missing values within the row. This validation will operate - over the number of test units that is equal to the number of rows in the table (determined - after any `pre=` mutation has been applied). A subset of columns can be specified for the - completeness check. If no subset is provided, all columns in the table will be used. - - Parameters - ---------- - columns_subset - A single column or a list of columns to use as a subset for the completeness check. If - `None` (the default), then all columns in the table will be used. 
- pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). Read the *Segmentation* section for usage information. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. 
The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that you can refer to - columns via `columns_subset=` that are expected to be present in the transformed table, but - may not exist in the table before preprocessing. Regarding the lifetime of the transformed - table, it only exists during the validation step and is not stored in the `Validate` object - or used in subsequent validation steps. - - Segmentation - ------------ - The `segments=` argument allows for the segmentation of a validation step into multiple - segments. This is useful for applying the same validation step to different subsets of the - data. The segmentation can be done based on a single column or specific fields within a - column. - - Providing a single column name will result in a separate validation step for each unique - value in that column. For example, if you have a column called `"region"` with values - `"North"`, `"South"`, and `"East"`, the validation step will be applied separately to each - region. 
- - Alternatively, you can provide a tuple that specifies a column name and its corresponding - values to segment on. For example, if you have a column called `"date"` and you want to - segment on only specific dates, you can provide a tuple like - `("date", ["2023-01-01", "2023-01-02"])`. Any other values in the column will be disregarded - (i.e., no validation steps will be created for them). - - A list with a combination of column names and tuples can be provided as well. This allows - for more complex segmentation scenarios. The following inputs are both valid: - - ``` - # Segments from all unique values in the `region` column - # and specific dates in the `date` column - segments=["region", ("date", ["2023-01-01", "2023-01-02"])] - - # Segments from all unique values in the `region` and `date` columns - segments=["region", "date"] - ``` - - The segmentation is performed during interrogation, and the resulting validation steps will - be numbered sequentially. Each segment will have its own validation step, and the results - will be reported separately. This allows for a more granular analysis of the data and helps - identify issues within specific segments. - - Importantly, the segmentation process will be performed after any preprocessing of the data - table. Because of this, one can conceivably use the `pre=` argument to generate a column - that can be used for segmentation. For example, you could create a new column called - `"segment"` through use of `pre=` and then use that column for segmentation. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or, the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three string columns - (`col_1`, `col_2`, and `col_3`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "col_1": ["a", None, "c", "d"], - "col_2": ["a", "a", "c", None], - "col_3": ["a", "a", "d", None], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the rows in the table are complete with `rows_complete()`. We'll - determine if this validation had any failing test units (there are four test units, one for - each row). A failing test unit means that a given row is not complete (i.e., has at least - one missing value).
- - ```python - validation = ( - pb.Validate(data=tbl) - .rows_complete() - .interrogate() - ) - - validation - ``` - - From this validation table we see that there are two failing test units. This is because - two rows in the table have at least one missing value (the second row and the last row). - - We can also use a subset of columns to determine completeness. Let's specify the subset - using columns `col_2` and `col_3` for the next validation. - - ```python - validation = ( - pb.Validate(data=tbl) - .rows_complete(columns_subset=["col_2", "col_3"]) - .interrogate() - ) - - validation - ``` - - The validation table reports a single failing test unit. The last row contains missing - values in both the `col_2` and `col_3` columns. - - -col_exists(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether one or more columns exist in the table. - - The `col_exists()` method checks whether one or more columns exist in the target table. The - only requirement is specification of the column names. Each validation step or expectation - will operate over a single test unit, which is whether the column exists or not. - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - thresholds - Set threshold failure levels for reporting and reacting to exceedances of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`.
The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or, the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with a string column (`a`) and a - numeric column (`b`). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": ["apple", "banana", "cherry", "date"], - "b": [1, 6, 3, 5], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the columns `a` and `b` actually exist in the table. We'll determine if - this validation had any failing test units (each validation will have a single test unit). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_exists(columns=["a", "b"]) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment.
- The validation table shows two entries (one check per column) generated by the - `col_exists()` validation step. Both steps passed since both columns provided in `columns=` - are present in the table. - - Now, let's check for the existence of a different set of columns. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_exists(columns=["b", "c"]) - .interrogate() - ) - - validation - ``` - - The validation table reports one passing validation step (the check for column `b`) and one - failing validation step (the check for column `c`, which doesn't exist). - - -col_pct_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', p: 'float', tol: 'Tolerance' = 0, thresholds: 'int | float | None | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether a column has a specific percentage of Null values. - - The `col_pct_null()` validation method checks whether the percentage of Null values in a - column matches a specified percentage `p=` (within an optional tolerance `tol=`). This - validation operates at the column level, generating a single validation step per column that - passes or fails based on whether the actual percentage of Null values falls within the - acceptable range defined by `p ± tol`. - - Parameters - ---------- - columns - A single column or a list of columns to validate. Can also use - [`col()`](`pointblank.col`) with column selectors to specify one or more columns. If - multiple columns are supplied or resolved, there will be a separate validation step - generated for each column. - p - The expected percentage of Null values in the column, expressed as a decimal between - `0.0` and `1.0`. For example, `p=0.5` means 50% of values should be Null. - tol - The tolerance allowed when comparing the actual percentage of Null values to the - expected percentage `p=`. 
The validation passes if the actual percentage falls within - the range `[p - tol, p + tol]`. Default is `0`, meaning an exact match is required. See - the *Tolerance* section for details on all supported formats (absolute, relative, - symmetric, and asymmetric bounds). - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step(s) meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. 
- - Tolerance - --------- - The `tol=` parameter accepts several different formats to specify the acceptable deviation - from the expected percentage `p=`. The tolerance can be expressed as: - - 1. *single integer* (absolute tolerance): the exact number of test units that can deviate. - For example, `tol=2` means the actual count can differ from the expected count by up to 2 - units in either direction. - - 2. *single float between 0 and 1* (relative tolerance): a proportion of the expected - count. For example, if the expected count is 50 and `tol=0.1`, the acceptable range is - 45 to 55 (50 ± 10% of 50 = 50 ± 5). - - 3. *tuple of two integers* (absolute bounds): explicitly specify the lower and upper - bounds as absolute deviations. For example, `tol=(1, 3)` means the actual count can be - 1 unit below or 3 units above the expected count. - - 4. *tuple of two floats between 0 and 1* (relative bounds): explicitly specify the lower - and upper bounds as proportional deviations. For example, `tol=(0.05, 0.15)` means the - lower bound is 5% below and the upper bound is 15% above the expected count. - - When using a single value (integer or float), the tolerance is applied symmetrically in both - directions. When using a tuple, you can specify asymmetric tolerances where the lower and - upper bounds differ. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` to `1`), - or, the absolute number of failing test units (as integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. 
use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be - set, you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three columns (`a`, `b`, - and `c`) that have different percentages of Null values. The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7, 8], - "b": [1, None, 3, None, 5, None, 7, None], - "c": [None, None, None, None, None, None, 1, 2], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that column `a` has 0% Null values (i.e., no Null values at all). - - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="a", p=0.0) - .interrogate() - ) - - validation - ``` - - Printing the `validation` object shows the validation table in an HTML viewing environment. - The validation table shows the single entry that corresponds to the validation step created - by using `col_pct_null()`. The validation passed since column `a` has no Null values. - - Now, let's check that column `b` has exactly 50% Null values. 
- - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="b", p=0.5) - .interrogate() - ) - - validation - ``` - - This validation also passes, as column `b` has exactly 4 out of 8 values as Null (50%). - - Finally, let's validate column `c` with a tolerance. Column `c` has 75% Null values, so - we'll check if it's approximately 70% Null with a tolerance of 10%. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="c", p=0.70, tol=0.10) - .interrogate() - ) - - validation - ``` - - This validation passes because the actual percentage (75%) falls within the acceptable - range of 60% to 80% (70% ± 10%). - - The `tol=` parameter supports multiple formats to express tolerance. Let's explore all the - different ways to specify tolerance using column `b`, which has exactly 50% Null values - (4 out of 8 values). - - *Using an absolute tolerance (integer)*: Specify the exact number of rows that can - deviate. With `tol=1`, we allow the count to differ by 1 row in either direction. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="b", p=0.375, tol=1) # Expect 3 nulls, allow ±1 (range: 2-4) - .interrogate() - ) - - validation - ``` - - This passes because column `b` has 4 Null values, which falls within the acceptable range - of 2 to 4 (3 ± 1). - - *Using a relative tolerance (float)*: Specify the tolerance as a proportion of the - expected count. With `tol=0.25`, we allow a 25% deviation from the expected count. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="b", p=0.375, tol=0.25) # Expect 3 nulls, allow ±25% (range: 2.25-3.75) - .interrogate() - ) - - validation - ``` - - This passes because 4 Null values falls within the acceptable range (3 ± 0.75 calculates - to 2.25 to 3.75, which rounds down to 2 to 3 rows). - - *Using asymmetric absolute bounds (tuple of integers)*: Specify different lower and - upper bounds as absolute values. 
With `tol=(0, 2)`, we allow no deviation below but up - to 2 rows above the expected count. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="b", p=0.25, tol=(0, 2)) # Expect 2 Nulls, allow -0/+2 (range: 2-4) - .interrogate() - ) - - validation - ``` - - This passes because 4 Null values fall within the acceptable range of 2 to 4. - - *Using asymmetric relative bounds (tuple of floats)*: Specify different lower and upper - bounds as proportions. With `tol=(0.1, 0.3)`, we allow 10% below and 30% above the - expected count. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_pct_null(columns="b", p=0.375, tol=(0.1, 0.3)) # Expect 3 Nulls, allow -10%/+30% - .interrogate() - ) - - validation - ``` - - This passes because 4 Null values fall within the acceptable range (3 - 0.3 to 3 + 0.9 - calculates to 2.7 to 3.9, which rounds down to 2 to 3 rows). - - -data_freshness(self, column: 'str', max_age: 'str | datetime.timedelta', reference_time: 'datetime.datetime | str | None' = None, timezone: 'str | None' = None, allow_tz_mismatch: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate that data in a datetime column is not older than a specified maximum age. - - The `data_freshness()` validation method checks whether the most recent timestamp in the - specified datetime column is within the allowed `max_age=` from the `reference_time=` (which - defaults to the current time). This is useful for ensuring data pipelines are delivering - fresh data and for enforcing data SLAs. - - This method helps detect stale data by comparing the maximum (most recent) value in a - datetime column against an expected freshness threshold. - - Parameters - ---------- - column - The name of the datetime column to check for freshness.
This column should contain - date or datetime values. - max_age - The maximum allowed age of the data. Can be specified as: (1) a string with a - human-readable duration like `"24 hours"`, `"1 day"`, `"30 minutes"`, `"2 weeks"`, etc. - (supported units: `seconds`, `minutes`, `hours`, `days`, `weeks`), or (2) a - `datetime.timedelta` object for precise control. - reference_time - The reference point in time to compare against. Defaults to `None`, which uses the - current time (UTC if `timezone=` is not specified). Can be: (1) a `datetime.datetime` - object (timezone-aware recommended), (2) a string in ISO 8601 format (e.g., - `"2024-01-15T10:30:00"` or `"2024-01-15T10:30:00+05:30"`), or (3) `None` to use the - current time. - timezone - The timezone to use for interpreting the data and reference time. Accepts IANA - timezone names (e.g., `"America/New_York"`), hour offsets (e.g., `"-7"`), or ISO 8601 - offsets (e.g., `"-07:00"`). When `None` (default), naive datetimes are treated as UTC. - See the *The `timezone=` Parameter* section for details. - allow_tz_mismatch - Whether to allow timezone mismatches between the column data and reference time. - By default (`False`), a warning note is added when comparing timezone-naive with - timezone-aware datetimes. Set to `True` to suppress these warnings. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. 
If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - How Timezones Affect Freshness Checks - ------------------------------------- - Freshness validation involves comparing two times: the **data time** (the most recent - timestamp in your column) and the **execution time** (when and where the validation runs). - Timezone confusion typically arises because these two times may originate from different - contexts. 
- - Consider these common scenarios: - - - your data timestamps are stored in UTC (common for databases), but you're running - validation on your laptop in New York (Eastern Time) - - you develop and test validation locally, then deploy it to a cloud workflow that runs - in UTC—suddenly your 'same' validation behaves differently - - your data comes from servers in multiple regions, each recording timestamps in their - local timezone - - The `timezone=` parameter exists to solve this problem by establishing a single, explicit - timezone context for the freshness comparison. When you specify a timezone, Pointblank - interprets both the data timestamps (if naive) and the execution time in that timezone, - ensuring consistent behavior whether you run validation on your laptop or in a cloud - workflow. - - **Scenario 1: Data has timezone-aware datetimes** - - ```python - # Your data column has values like: 2024-01-15 10:30:00+00:00 (UTC) - # Comparison is straightforward as both sides have explicit timezones - .data_freshness(column="updated_at", max_age="24 hours") - ``` - - **Scenario 2: Data has naive datetimes (no timezone)** - - ```python - # Your data column has values like: 2024-01-15 10:30:00 (no timezone) - # Specify the timezone the data was recorded in: - .data_freshness(column="updated_at", max_age="24 hours", timezone="America/New_York") - ``` - - **Scenario 3: Ensuring consistent behavior across environments** - - ```python - # Pin the timezone to ensure identical results whether running locally or in the cloud - .data_freshness( - column="updated_at", - max_age="24 hours", - timezone="UTC", # Explicit timezone removes environment dependence - ) - ``` - - The `timezone=` Parameter - --------------------------- - The `timezone=` parameter accepts several convenient formats, making it easy to specify - timezones in whatever way is most natural for your use case. The following examples - illustrate the three supported input styles. 
- - **IANA Timezone Names** (recommended for regions with daylight saving time): - - ```python - timezone="America/New_York" # Eastern Time (handles DST automatically) - timezone="Europe/London" # UK time - timezone="Asia/Tokyo" # Japan Standard Time - timezone="Australia/Sydney" # Australian Eastern Time - timezone="UTC" # Coordinated Universal Time - ``` - - **Simple Hour Offsets** (quick and easy): - - **ISO 8601 Offset Format** (precise, including fractional hours): - - When a timezone is specified: - - - naive datetime values in the column are assumed to be in this timezone. - - the reference time (if naive) is assumed to be in this timezone. - - the validation report will show times in this timezone. - - When `None` (default): - - - if your column has timezone-aware datetimes, those timezones are used - - if your column has naive datetimes, they're treated as UTC - - the current time reference uses UTC - - Note that IANA timezone names are preferred when daylight saving time transitions matter, as - they automatically handle the offset changes. Fixed offsets like `"-7"` or `"-07:00"` do not - account for DST. - - Recommendations for Working with Timestamps - ------------------------------------------- - When working with datetime data, storing timestamps in UTC in your databases is strongly - recommended since it provides a consistent reference point regardless of where your data - originates or where it's consumed. Using timezone-aware datetimes whenever possible helps - avoid ambiguity—when a datetime has an explicit timezone, there's no guessing about what - time it actually represents. - - If you're working with naive datetimes (which lack timezone information), always specify the - `timezone=` parameter so Pointblank knows how to interpret those values. When providing - `reference_time=` as a string, use ISO 8601 format with the timezone offset included (e.g., - `"2024-01-15T10:30:00+00:00"`) to ensure unambiguous parsing. 
Finally, prefer IANA timezone - names (like `"America/New_York"`) over fixed offsets (like `"-05:00"`) when daylight saving - time transitions matter, since IANA names automatically handle the twice-yearly offset - changes. To see all available IANA timezone names in Python, use - `zoneinfo.available_timezones()` from the standard library's `zoneinfo` module. - - Examples - -------- - The simplest use of `data_freshness()` requires just two arguments: the `column=` containing - your timestamps and `max_age=` specifying how old the data can be. In this first example, - we create sample data with an `"updated_at"` column containing timestamps from 1, 12, and - 20 hours ago. By setting `max_age="24 hours"`, we're asserting that the most recent - timestamp should be within 24 hours of the current time. Since the newest record is only - 1 hour old, this validation passes. - - ```python - import pointblank as pb - import polars as pl - from datetime import datetime, timedelta - - # Create sample data with recent timestamps - recent_data = pl.DataFrame({ - "id": [1, 2, 3], - "updated_at": [ - datetime.now() - timedelta(hours=1), - datetime.now() - timedelta(hours=12), - datetime.now() - timedelta(hours=20), - ] - }) - - validation = ( - pb.Validate(data=recent_data) - .data_freshness(column="updated_at", max_age="24 hours") - .interrogate() - ) - - validation - ``` - - The `max_age=` parameter accepts human-readable strings with various time units. You can - chain multiple `data_freshness()` calls to check different freshness thresholds - simultaneously—useful for tiered SLAs where you might want warnings at 30 minutes but - errors at 2 days. 
- - ```python - # Check data is fresh within different time windows - validation = ( - pb.Validate(data=recent_data) - .data_freshness(column="updated_at", max_age="30 minutes") # Very fresh - .data_freshness(column="updated_at", max_age="2 days") # Reasonably fresh - .data_freshness(column="updated_at", max_age="1 week") # Within a week - .interrogate() - ) - - validation - ``` - - When your data contains naive datetimes (timestamps without timezone information), use the - `timezone=` parameter to specify what timezone those values represent. Here we have event - data recorded in Eastern Time, so we set `timezone="America/New_York"` to ensure the - freshness comparison is done correctly. - - ```python - # Data with naive datetimes (assume they're in Eastern Time) - eastern_data = pl.DataFrame({ - "event_time": [ - datetime.now() - timedelta(hours=2), - datetime.now() - timedelta(hours=5), - ] - }) - - validation = ( - pb.Validate(data=eastern_data) - .data_freshness( - column="event_time", - max_age="12 hours", - timezone="America/New_York" # Interpret times as Eastern - ) - .interrogate() - ) - - validation - ``` - - For reproducible validations or historical checks, you can use `reference_time=` to compare - against a specific point in time instead of the current time. This is particularly useful - for testing or when validating data snapshots. The reference time should include a timezone - offset (like `+00:00` for UTC) to avoid ambiguity. 
- - ```python - validation = ( - pb.Validate(data=recent_data) - .data_freshness( - column="updated_at", - max_age="24 hours", - reference_time="2024-01-15T12:00:00+00:00" - ) - .interrogate() - ) - - validation - ``` - - -col_schema_match(self, schema: 'Schema', complete: 'bool' = True, in_order: 'bool' = True, case_sensitive_colnames: 'bool' = True, case_sensitive_dtypes: 'bool' = True, full_match_dtypes: 'bool' = True, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Do columns in the table (and their types) match a predefined schema? - - The `col_schema_match()` method works in conjunction with an object generated by the - [`Schema`](`pointblank.Schema`) class. That class object is the expectation for the actual - schema of the target table. The validation step operates over a single test unit, which is - whether the schema matches that of the table (within the constraints enforced by the - `complete=`, and `in_order=` options). - - Parameters - ---------- - schema - A `Schema` object that represents the expected schema of the table. This object is - generated by the [`Schema`](`pointblank.Schema`) class. - complete - Should the schema match be complete? If `True`, then the target table must have all - columns specified in the schema. If `False`, then the table can have additional columns - not in the schema (i.e., the schema is a subset of the target table's columns). - in_order - Should the schema match be in order? If `True`, then the columns in the schema must - appear in the same order as they do in the target table. If `False`, then the order of - columns in the schema and the target table can differ. - case_sensitive_colnames - Should the schema match be case-sensitive with regard to column names? 
If `True`, then - the column names in the schema and the target table must match exactly. If `False`, then - the column names are compared in a case-insensitive manner. - case_sensitive_dtypes - Should the schema match be case-sensitive with regard to column data types? If `True`, - then the column data types in the schema and the target table must match exactly. If - `False`, then the column data types are compared in a case-insensitive manner. - full_match_dtypes - Should the schema match require a full match of data types? If `True`, then the column - data types in the schema and the target table must match exactly. If `False` then - substring matches are allowed, so a schema data type of `Int` would match a target table - data type of `Int64`. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. 
If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. Regarding the lifetime of the transformed table, it only exists during the - validation step and is not stored in the `Validate` object or used in subsequent validation - steps. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` to `1`), - or, the absolute number of failing test units (as integer that's `1` or greater). 
- - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three columns (string, - integer, and float). The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": ["apple", "banana", "cherry", "date"], - "b": [1, 6, 3, 5], - "c": [1.1, 2.2, 3.3, 4.4], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the columns in the table match a predefined schema. A schema can be - defined using the [`Schema`](`pointblank.Schema`) class. - - ```python - schema = pb.Schema( - columns=[("a", "String"), ("b", "Int64"), ("c", "Float64")] - ) - ``` - - You can print the schema object to verify that the expected schema is as intended. - - ```python - print(schema) - ``` - - Now, we'll use the `col_schema_match()` method to validate the table against the expected - `schema` object. There is a single test unit for this validation step (whether the schema - matches the table or not).
- - ```python - validation = ( - pb.Validate(data=tbl) - .col_schema_match(schema=schema) - .interrogate() - ) - - validation - ``` - - The validation table shows that the schema matches the table. The single test unit passed - since the table columns and their types match the schema. - - -row_count_match(self, count: 'int | Any', tol: 'Tolerance' = 0, inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether the row count of the table matches a specified count. - - The `row_count_match()` method checks whether the row count of the target table matches a - specified count. This validation will operate over a single test unit, which is whether the - row count matches the specified count. - - We also have the option to invert the validation step by setting `inverse=True`. This will - make the expectation that the row count of the target table *does not* match the specified - count. - - Parameters - ---------- - count - The expected row count of the table. This can be an integer value, a Polars or Pandas - DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the row - count of that object will be used as the expected count. - tol - The tolerance allowable for the row count match. This can be specified as a single - numeric value (integer or float) or as a tuple of two integers representing the lower - and upper bounds of the tolerance range. If a single integer value (greater than 1) is - provided, it represents the absolute bounds of the tolerance, ie. plus or minus the value. - If a float value (between 0-1) is provided, it represents the relative tolerance, ie. - plus or minus the relative percentage of the target. If a tuple is provided, it represents - the lower and upper absolute bounds of the tolerance range. 
See the examples for more. - inverse - Should the validation step be inverted? If `True`, then the expectation is that the row - count of the target table should not match the specified `count=` value. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. 
Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Regarding the lifetime of the - transformed table, it only exists during the validation step and is not stored in the - `Validate` object or used in subsequent validation steps. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` to `1`), - or, the absolute number of failing test units (as integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. 
create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use the built-in dataset `"small_table"`. The table can be - obtained by calling `load_dataset("small_table")`. - - Let's validate that the number of rows in the table matches a fixed value. In this case, we - will use the value `13` as the expected row count. - - ```python - validation = ( - pb.Validate(data=small_table) - .row_count_match(count=13) - .interrogate() - ) - - validation - ``` - - The validation table shows that the expectation value of `13` matches the actual count of - rows in the target table. So, the single test unit passed. - - - Let's modify our example to show the different ways we can allow some tolerance to our validation - by using the `tol=` argument. - - ```python - smaller_small_table = small_table.sample(n = 12) # within the lower bound - validation = ( - pb.Validate(data=smaller_small_table) - .row_count_match(count=13,tol=(2, 0)) # minus 2 but plus 0, i.e.
11-13 - .interrogate() - ) - - validation - - validation = ( - pb.Validate(data=smaller_small_table) - .row_count_match(count=13,tol=.05) # 5% tolerance of 13 - .interrogate() - ) - - even_smaller_table = small_table.sample(n = 2) - validation = ( - pb.Validate(data=even_smaller_table) - .row_count_match(count=13,tol=5) # plus or minus 5; this test will fail - .interrogate() - ) - - validation - ``` - - - -col_count_match(self, count: 'int | Any', inverse: 'bool' = False, pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether the column count of the table matches a specified count. - - The `col_count_match()` method checks whether the column count of the target table matches a - specified count. This validation will operate over a single test unit, which is whether the - column count matches the specified count. - - We also have the option to invert the validation step by setting `inverse=True`. This will - make the expectation that the column count of the target table *does not* match the - specified count. - - Parameters - ---------- - count - The expected column count of the table. This can be an integer value, a Polars or Pandas - DataFrame object, or an Ibis backend table. If a DataFrame/table is provided, the column - count of that object will be used as the expected count. - inverse - Should the validation step be inverted? If `True`, then the expectation is that the - column count of the target table should not match the specified `count=` value. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument.
- thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. 
This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Regarding the lifetime of the - transformed table, it only exists during the validation step and is not stored in the - `Validate` object or used in subsequent validation steps. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` to `1`), - or, the absolute number of failing test units (as integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be - set, you're free to set any combination of them. 
- - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use the built in dataset `"game_revenue"`. The table can be - obtained by calling `load_dataset("game_revenue")`. - - Let's validate that the number of columns in the table matches a fixed value. In this case, - we will use the value `11` as the expected column count. - - ```python - validation = ( - pb.Validate(data=game_revenue) - .col_count_match(count=11) - .interrogate() - ) - - validation - ``` - - The validation table shows that the expectation value of `11` matches the actual count of - columns in the target table. So, the single test unit passed. - - -tbl_match(self, tbl_compare: 'Any', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate whether the target table matches a comparison table. - - The `tbl_match()` method checks whether the target table's composition matches that of a - comparison table. The validation performs a comprehensive comparison using progressively - stricter checks (from least to most stringent): - - 1. **Column count match**: both tables must have the same number of columns - 2. **Row count match**: both tables must have the same number of rows - 3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order) - 4. **Schema match (order)**: columns in the correct order (case-insensitive names) - 5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order) - 6. **Data match**: values in corresponding cells must be identical - - This progressive approach helps identify exactly where tables differ. 
The validation will - fail at the first check that doesn't pass, making it easier to diagnose mismatches. This - validation operates over a single test unit (pass/fail for complete table match). - - Parameters - ---------- - tbl_compare - The comparison table to validate against. This can be a DataFrame object (Polars or - Pandas), an Ibis table object, or a callable that returns a table. If a callable is - provided, it will be executed during interrogation to obtain the comparison table. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. 
Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Note that the same preprocessing - is **not** applied to the comparison table; only the target table is preprocessed. Regarding - the lifetime of the transformed table, it only exists during the validation step and is not - stored in the `Validate` object or used in subsequent validation steps. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. 
The threshold values - can either be set as a proportion failing of all test units (a value between `0` to `1`), - or, the absolute number of failing test units (as integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys: are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. All of the threshold levels don't need to be - set, you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Cross-Backend Validation - ------------------------ - The `tbl_match()` method supports **automatic backend coercion** when comparing tables from - different backends (e.g., comparing a Polars DataFrame against a Pandas DataFrame, or - comparing database tables from DuckDB/SQLite against in-memory DataFrames). When tables with - different backends are detected, the comparison table is automatically converted to match the - data table's backend before validation proceeds. 
- - **Certified Backend Combinations:** - - All combinations of the following backends have been tested and certified to work (in both - directions): - - - Pandas DataFrame - - Polars DataFrame - - DuckDB (native) - - DuckDB (as Ibis table) - - SQLite (via Ibis) - - Note that database backends (DuckDB, SQLite, PostgreSQL, MySQL, Snowflake, BigQuery) are - automatically materialized during validation: - - - if comparing **against Polars**: materialized to Polars - - if comparing **against Pandas**: materialized to Pandas - - if **both tables are database backends**: both materialized to Polars - - This ensures optimal performance and type consistency. - - **Data Types That Work Best in Cross-Backend Validation:** - - - numeric types: int, float columns (including proper NaN handling) - - string types: text columns with consistent encodings - - boolean types: True/False values - - null values: `None` and `NaN` are treated as equivalent across backends - - list columns: nested list structures (with basic types) - - **Known Limitations:** - - While many data types work well in cross-backend validation, there are some known - limitations to be aware of: - - - date/datetime types: When converting between Polars and Pandas, date objects may be - represented differently. For example, `datetime.date` objects in Pandas may become - `pd.Timestamp` objects when converted from Polars, leading to false mismatches. To work - around this, ensure both tables use the same datetime representation before comparison. - - custom types: User-defined types or complex nested structures may not convert cleanly - between backends and could cause unexpected comparison failures. - - categorical types: Categorical/factor columns may have different internal - representations across backends. - - timezone-aware datetimes: Timezone handling differs between backends and may cause - comparison issues. 
- - Here are some ideas to overcome such limitations: - - - for date/datetime columns, consider using `pre=` preprocessing to normalize representations - before comparison. - - when working with custom types, manually convert tables to the same backend before using - `tbl_match()`. - - use the same datetime precision (e.g., milliseconds vs microseconds) in both tables. - - Examples - -------- - For the examples here, we'll create two simple tables to demonstrate the `tbl_match()` - validation. - - ```python - import pointblank as pb - import polars as pl - - # Create the first table - tbl_1 = pl.DataFrame({ - "a": [1, 2, 3, 4], - "b": ["w", "x", "y", "z"], - "c": [4.0, 5.0, 6.0, 7.0] - }) - - # Create an identical table - tbl_2 = pl.DataFrame({ - "a": [1, 2, 3, 4], - "b": ["w", "x", "y", "z"], - "c": [4.0, 5.0, 6.0, 7.0] - }) - - pb.preview(tbl_1) - ``` - - Let's validate that `tbl_1` matches `tbl_2`. Since these tables are identical, the - validation should pass. - - ```python - validation = ( - pb.Validate(data=tbl_1) - .tbl_match(tbl_compare=tbl_2) - .interrogate() - ) - - validation - ``` - - The validation table shows that the single test unit passed, indicating that the two tables - match completely. - - Now, let's create a table with a slight difference and see what happens. - - ```python - # Create a table with one different value - tbl_3 = pl.DataFrame({ - "a": [1, 2, 3, 4], - "b": ["w", "x", "y", "z"], - "c": [4.0, 5.5, 6.0, 7.0] # Changed 5.0 to 5.5 - }) - - validation = ( - pb.Validate(data=tbl_1) - .tbl_match(tbl_compare=tbl_3) - .interrogate() - ) - - validation - ``` - - The validation table shows that the single test unit failed because the tables don't match - (one value is different in column `c`). 
- - -conjointly(self, *exprs: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Perform multiple row-wise validations for joint validity. - - The `conjointly()` validation method checks whether each row in the table passes multiple - validation conditions simultaneously. This enables compound validation logic where a test - unit (typically a row) must satisfy all specified conditions to pass the validation. - - This method accepts multiple validation expressions as callables, which should return - boolean expressions when applied to the data. You can use lambdas that incorporate - Polars/Pandas/Ibis expressions (based on the target table type) or create more complex - validation functions. The validation will operate over the number of test units that is - equal to the number of rows in the table (determined after any `pre=` mutation has been - applied). - - Parameters - ---------- - *exprs - Multiple validation expressions provided as callable functions. Each callable should - accept a table as its single argument and return a boolean expression or Series/Column - that evaluates to boolean values for each row. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. 
Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. 
Regarding the lifetime of the - transformed table, it only exists during the validation step and is not stored in the - `Validate` object or used in subsequent validation steps. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or, the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - For the examples here, we'll use a simple Polars DataFrame with three numeric columns (`a`, - `b`, and `c`).
The table is shown below: - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 7, 1, 3, 9, 4], - "b": [6, 3, 0, 5, 8, 2], - "c": [10, 4, 8, 9, 10, 5], - } - ) - - pb.preview(tbl) - ``` - - Let's validate that the values in each row satisfy multiple conditions simultaneously: - - 1. Column `a` should be greater than 2 - 2. Column `b` should be less than 7 - 3. The sum of `a` and `b` should be less than the value in column `c` - - We'll use `conjointly()` to check all these conditions together: - - ```python - validation = ( - pb.Validate(data=tbl) - .conjointly( - lambda df: pl.col("a") > 2, - lambda df: pl.col("b") < 7, - lambda df: pl.col("a") + pl.col("b") < pl.col("c") - ) - .interrogate() - ) - - validation - ``` - - The validation table shows that not all rows satisfy all three conditions together. For a - row to pass the conjoint validation, all three conditions must be true for that row. - - We can also use preprocessing to filter the data before applying the conjoint validation: - - ```python - # Define preprocessing function for serialization compatibility - def filter_by_c_gt_5(df): - return df.filter(pl.col("c") > 5) - - validation = ( - pb.Validate(data=tbl) - .conjointly( - lambda df: pl.col("a") > 2, - lambda df: pl.col("b") < 7, - lambda df: pl.col("a") + pl.col("b") < pl.col("c"), - pre=filter_by_c_gt_5 - ) - .interrogate() - ) - - validation - ``` - - This allows for more complex validation scenarios where the data is first prepared and then - validated against multiple conditions simultaneously. 
- - Or, you can use the backend-agnostic column expression helper - [`expr_col()`](`pointblank.expr_col`) to write expressions that work across different table - backends: - - ```python - tbl = pl.DataFrame( - { - "a": [5, 7, 1, 3, 9, 4], - "b": [6, 3, 0, 5, 8, 2], - "c": [10, 4, 8, 9, 10, 5], - } - ) - - # Using backend-agnostic syntax with expr_col() - validation = ( - pb.Validate(data=tbl) - .conjointly( - lambda df: pb.expr_col("a") > 2, - lambda df: pb.expr_col("b") < 7, - lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c") - ) - .interrogate() - ) - - validation - ``` - - Using [`expr_col()`](`pointblank.expr_col`) allows your validation code to work consistently - across Pandas, Polars, and Ibis table backends without changes, making your validation - pipelines more portable. - - See Also - -------- - Look at the documentation of the [`expr_col()`](`pointblank.expr_col`) function for more - information on how to use it with different table backends. - - -specially(self, expr: 'Callable', pre: 'Callable | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Perform a specialized validation with customized logic. - - The `specially()` validation method allows for the creation of specialized validation - expressions that can be used to validate specific conditions or logic in the data. This - method provides maximum flexibility by accepting a custom callable that encapsulates - your validation logic. - - The callable function can have one of two signatures: - - - a function accepting a single parameter (the data table): `def validate(data): ...` - - a function with no parameters: `def validate(): ...` - - The second form is particularly useful for environment validations that don't need to - inspect the data table. - - The callable function must ultimately return one of: - - 1. 
a single boolean value or boolean list - 2. a table where the final column contains boolean values (column name is unimportant) - - The validation will operate over the number of test units that is equal to the number of - rows in the data table (if returning a table with boolean values). If returning a scalar - boolean value, the validation will operate over a single test unit. For a return of a list - of boolean values, the length of the list constitutes the number of test units. - - Parameters - ---------- - expr - A callable function that defines the specialized validation logic. This function should: - (1) accept the target data table as its single argument (though it may ignore it), or - (2) take no parameters at all (for environment validations). The function must - ultimately return boolean values representing validation results. Design your function - to incorporate any custom parameters directly within the function itself using closure - variables or default parameters. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - Have a look at the *Preprocessing* section for more information on how to use this - argument. - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. Look at the *Thresholds* - section for information on how to set threshold levels. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. 
- brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. - active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Preprocessing - ------------- - The `pre=` argument allows for a preprocessing function or lambda to be applied to the data - table during interrogation. This function should take a table as input and return a modified - table. This is useful for performing any necessary transformations or filtering on the data - before the validation step is applied. - - The preprocessing function can be any callable that takes a table as input and returns a - modified table. For example, you could use a lambda function to filter the table based on - certain criteria or to apply a transformation to the data. Regarding the lifetime of the - transformed table, it only exists during the validation step and is not stored in the - `Validate` object or used in subsequent validation steps. - - Thresholds - ---------- - The `thresholds=` parameter is used to set the failure-condition levels for the validation - step. 
If they are set here at the step level, these thresholds will override any thresholds - set at the global level in `Validate(thresholds=...)`. - - There are three threshold levels: 'warning', 'error', and 'critical'. The threshold values - can either be set as a proportion failing of all test units (a value between `0` and `1`), - or, the absolute number of failing test units (as an integer that's `1` or greater). - - Thresholds can be defined using one of these input schemes: - - 1. use the [`Thresholds`](`pointblank.Thresholds`) class (the most direct way to create - thresholds) - 2. provide a tuple of 1-3 values, where position `0` is the 'warning' level, position `1` is - the 'error' level, and position `2` is the 'critical' level - 3. create a dictionary of 1-3 value entries; the valid keys are 'warning', 'error', and - 'critical' - 4. a single integer/float value denoting absolute number or fraction of failing test units - for the 'warning' level only - - If the number of failing test units exceeds set thresholds, the validation step will be - marked as 'warning', 'error', or 'critical'. Not all of the threshold levels need to be - set; you're free to set any combination of them. - - Aside from reporting failure conditions, thresholds can be used to determine the actions to - take for each level of failure (using the `actions=` parameter). - - Examples - -------- - The `specially()` method offers maximum flexibility for validation, allowing you to create - custom validation logic that fits your specific needs. The following examples demonstrate - different patterns and use cases for this powerful validation approach. - - ### Simple validation with direct table access - - This example shows the most straightforward use case where we create a function that - directly checks if the sum of two columns is positive.
- - ```python - import pointblank as pb - import polars as pl - - simple_tbl = pl.DataFrame({ - "a": [5, 7, 1, 3, 9, 4], - "b": [6, 3, 0, 5, 8, 2] - }) - - # Simple function that validates directly on the table - def validate_sum_positive(data): - return data.select(pl.col("a") + pl.col("b") > 0) - - ( - pb.Validate(data=simple_tbl) - .specially(expr=validate_sum_positive) - .interrogate() - ) - ``` - - The function returns a Polars DataFrame with a single boolean column indicating whether - the sum of columns `a` and `b` is positive for each row. Each row in the resulting DataFrame - is a distinct test unit. This pattern works well for simple validations where you don't need - configurable parameters. - - ### Advanced validation with closure variables for parameters - - When you need to make your validation configurable, you can use the function factory pattern - (also known as closures) to create parameterized validations: - - ```python - # Create a parameterized validation function using closures - def make_column_ratio_validator(col1, col2, min_ratio): - def validate_column_ratio(data): - return data.select((pl.col(col1) / pl.col(col2)) > min_ratio) - return validate_column_ratio - - ( - pb.Validate(data=simple_tbl) - .specially( - expr=make_column_ratio_validator(col1="a", col2="b", min_ratio=0.5) - ) - .interrogate() - ) - ``` - - This approach allows you to create reusable validation functions that can be configured with - different parameters without modifying the function itself. 
- - ### Validation function returning a list of booleans - - This example demonstrates how to create a validation function that returns a list of boolean - values, where each element represents a separate test unit: - - ```python - import pointblank as pb - import polars as pl - import random - - # Create sample data - transaction_tbl = pl.DataFrame({ - "transaction_id": [f"TX{i:04d}" for i in range(1, 11)], - "amount": [120.50, 85.25, 50.00, 240.75, 35.20, 150.00, 85.25, 65.00, 210.75, 90.50], - "category": ["food", "shopping", "entertainment", "travel", "utilities", - "food", "shopping", "entertainment", "travel", "utilities"] - }) - - # Define a validation function that returns a list of booleans - def validate_transaction_rules(data): - # Create a list to store individual test results - test_results = [] - - # Check each row individually against multiple business rules - for row in data.iter_rows(named=True): - # Rule: transaction IDs must start with "TX" and be 6 chars long - valid_id = row["transaction_id"].startswith("TX") and len(row["transaction_id"]) == 6 - - # Rule: Amounts must be appropriate for their category - valid_amount = True - if row["category"] == "food" and (row["amount"] < 10 or row["amount"] > 200): - valid_amount = False - elif row["category"] == "utilities" and (row["amount"] < 20 or row["amount"] > 300): - valid_amount = False - elif row["category"] == "entertainment" and row["amount"] > 100: - valid_amount = False - - # A transaction passes if it satisfies both rules - test_results.append(valid_id and valid_amount) - - return test_results - - ( - pb.Validate(data=transaction_tbl) - .specially( - expr=validate_transaction_rules, - brief="Validate transaction IDs and amounts by category." - ) - .interrogate() - ) - ``` - - This example shows how to create a validation function that applies multiple business rules - to each row and returns a list of boolean results. 
Each boolean in the list represents a - separate test unit, and a test unit passes only if all rules are satisfied for a given row. - - The function iterates through each row in the data table, checking: - - 1. if transaction IDs follow the required format - 2. if transaction amounts are appropriate for their respective categories - - This approach is powerful when you need to apply complex, conditional logic that can't be - easily expressed using the built-in validation functions. - - ### Table-level validation returning a single boolean - - Sometimes you need to validate properties of the entire table rather than row-by-row. In - these cases, your function can return a single boolean value: - - ```python - def validate_table_properties(data): - # Check if table has at least one row with column 'a' > 10 - has_large_values = data.filter(pl.col("a") > 10).height > 0 - - # Check if mean of column 'b' is positive - has_positive_mean = data.select(pl.mean("b")).item() > 0 - - # Return a single boolean for the entire table - return has_large_values and has_positive_mean - - ( - pb.Validate(data=simple_tbl) - .specially(expr=validate_table_properties) - .interrogate() - ) - ``` - - This example demonstrates how to perform multiple checks on the table as a whole and combine - them into a single validation result. 
- - ### Environment validation that doesn't use the data table - - The `specially()` validation method can even be used to validate aspects of your environment - that are completely independent of the data: - - ```python - def validate_pointblank_version(): - try: - import importlib.metadata - version = importlib.metadata.version("pointblank") - version_parts = version.split(".") - - # Get major and minor components regardless of how many parts there are - major = int(version_parts[0]) - minor = int(version_parts[1]) - - # Check both major and minor components for version `0.9+` - return (major > 0) or (major == 0 and minor >= 9) - - except Exception as e: - # More specific error handling could be added here - print(f"Version check failed: {e}") - return False - - ( - pb.Validate(data=simple_tbl) - .specially( - expr=validate_pointblank_version, - brief="Check Pointblank version `>=0.9.0`." - ) - .interrogate() - ) - ``` - - This pattern shows how to validate external dependencies or environment conditions as part - of your validation workflow. Notice that the function doesn't take any parameters at all, - which makes it cleaner when the validation doesn't need to access the data table. - - By combining these patterns, you can create sophisticated validation workflows that address - virtually any data quality requirement in your organization. - - -prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | None' = None, batch_size: 'int' = 1000, max_concurrent: 'int' = 3, pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds | None' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool | Callable' = True) -> 'Validate' - - Validate rows using AI/LLM-powered analysis. - - The `prompt()` validation method uses Large Language Models (LLMs) to validate rows of data - based on natural language criteria. 
Similar to other Pointblank validation methods, this - generates binary test results (pass/fail) that integrate seamlessly with the standard - reporting framework. - - Like `col_vals_*()` methods, `prompt()` evaluates data against specific criteria, but - instead of using programmatic rules, it uses natural language prompts interpreted by an LLM. - Like `rows_distinct()` and `rows_complete()`, it operates at the row level and allows you to - specify a subset of columns for evaluation using `columns_subset=`. - - The system automatically combines your validation criteria from the `prompt=` parameter with - the necessary technical context, data formatting instructions, and response structure - requirements. This is all so you only need to focus on describing your validation logic in - plain language. - - Each row becomes a test unit that either passes or fails the validation criteria, producing - the familiar True/False results that appear in Pointblank validation reports. This method - is particularly useful for complex validation rules that are difficult to express with - traditional validation methods, such as semantic checks, context-dependent validation, or - subjective quality assessments. - - Parameters - ---------- - prompt - A natural language description of the validation criteria. This prompt should clearly - describe what constitutes valid vs invalid rows. Some examples: - `"Each row should contain a valid email address and a realistic person name"`, - `"Values should indicate positive sentiment"`, - `"The description should mention a country name"`. - columns_subset - A single column or list of columns to include in the validation. If `None`, all columns - will be included. Specifying fewer columns can improve performance and reduce API costs - so try to include only the columns necessary for the validation. - model - The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). 
Supported providers are `"anthropic"`, `"openai"`, - `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from - the provider. Model names are subject to change so consult the provider's documentation - for the most up-to-date model names. - batch_size - Number of rows to process in each batch. Larger batches are more efficient but may hit - API limits. Default is `1000`. - max_concurrent - Maximum number of concurrent API requests. Higher values speed up processing but may - hit rate limits. Default is `3`. - pre - An optional preprocessing function or lambda to apply to the data table during - interrogation. This function should take a table as input and return a modified table. - segments - An optional directive on segmentation, which serves to split a validation step into - multiple (one step per segment). Can be a single column name, a tuple that specifies a - column name and its corresponding values to segment on, or a combination of both - (provided as a list). - thresholds - Set threshold failure levels for reporting and reacting to exceedences of the levels. - The thresholds are set at the step level and will override any global thresholds set in - `Validate(thresholds=...)`. The default is `None`, which means that no thresholds will - be set locally and global thresholds (if any) will take effect. - actions - Optional actions to take when the validation step meets or exceeds any set threshold - levels. If provided, the [`Actions`](`pointblank.Actions`) class should be used to - define the actions. - brief - An optional brief description of the validation step that will be displayed in the - reporting table. You can use the templating elements like `"{step}"` to insert - the step number, or `"{auto}"` to include an automatically generated brief. If `True` - the entire brief will be automatically generated. If `None` (the default) then there - won't be a brief. 
- active - A boolean value or callable that determines whether the validation step should be - active. Using `False` will make the validation step inactive (still reporting its - presence and keeping indexes for the steps unchanged). A callable can also be - provided; it will receive the data table as its single argument and must return a - boolean value. The callable is evaluated *before* any `pre=` processing. Inspection - functions like [`has_columns()`](`pointblank.has_columns`) and - [`has_rows()`](`pointblank.has_rows`) can be used here to conditionally activate a step - based on properties of the target table. - - Returns - ------- - Validate - The `Validate` object with the added validation step. - - Constructing the `model` Argument - --------------------------------- - The `model=` argument should be constructed using the provider and model name separated by a - colon (`provider:model`). The provider text can be any of: - - - `"anthropic"` (Anthropic) - - `"openai"` (OpenAI) - - `"ollama"` (Ollama) - - `"bedrock"` (Amazon Bedrock) - - The model name should be the specific model to be used from the provider. Model names are - subject to change so consult the provider's documentation for the most up-to-date model - names. - - Notes on Authentication - ----------------------- - API keys are automatically loaded from environment variables or `.env` files and are **not** - stored in the validation object for security reasons. You should consider using a secure - method for handling API keys. - - One way to do this is to load the API key from an environment variable and retrieve it using - the `os` module (specifically the `os.getenv()` function). Places to store the API key might - include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zprofile`. - - Another solution is to store one or more model provider API keys in an `.env` file (in the - root of your project).
If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or - `OPENAI_API_KEY`) then the AI validation will automatically load the API key from the `.env` - file. An `.env` file might look like this: - - ```plaintext - ANTHROPIC_API_KEY="your_anthropic_api_key_here" - OPENAI_API_KEY="your_openai_api_key_here" - ``` - - There's no need to have the `python-dotenv` package installed when using `.env` files in - this way. - - **Provider-specific setup**: - - - **OpenAI**: set `OPENAI_API_KEY` environment variable or create `.env` file - - **Anthropic**: set `ANTHROPIC_API_KEY` environment variable or create `.env` file - - **Ollama**: no API key required, just ensure Ollama is running locally - - **Bedrock**: configure AWS credentials through standard AWS methods - - AI Validation Process - --------------------- - The AI validation process works as follows: - - 1. data batching: the data is split into batches of the specified size - 2. row deduplication: duplicate rows (based on selected columns) are identified and only - unique combinations are sent to the LLM for analysis - 3. json conversion: each batch of unique rows is converted to JSON format for the LLM - 4. prompt construction: the user prompt is embedded in a structured system prompt - 5. llm processing: each batch is sent to the LLM for analysis - 6. response parsing: LLM responses are parsed to extract validation results - 7. result projection: results are mapped back to all original rows using row signatures - 8. result aggregation: results from all batches are combined - - **Performance Optimization**: the process uses row signature memoization to avoid redundant - LLM calls. When multiple rows have identical values in the selected columns, only one - representative row is validated, and the result is applied to all matching rows. This can - dramatically reduce API costs and processing time for datasets with repetitive patterns. 
- - The LLM receives data in this JSON format: - - ```json - { - "columns": ["col1", "col2", "col3"], - "rows": [ - {"col1": "value1", "col2": "value2", "col3": "value3", "_pb_row_index": 0}, - {"col1": "value4", "col2": "value5", "col3": "value6", "_pb_row_index": 1} - ] - } - ``` - - The LLM returns validation results in this format: - ```json - [ - {"index": 0, "result": true}, - {"index": 1, "result": false} - ] - ``` - - Prompt Design Tips - ------------------ - For best results, design prompts that are: - - - boolean-oriented: frame validation criteria to elicit clear valid/invalid responses - - specific: clearly define what makes a row valid/invalid - - unambiguous: avoid subjective language that could be interpreted differently - - context-aware: include relevant business rules or domain knowledge - - example-driven: consider providing examples in the prompt when helpful - - **Critical**: Prompts must be designed so the LLM can determine whether each row passes or - fails the validation criteria. The system expects binary validation responses, so avoid - open-ended questions or prompts that might generate explanatory text instead of clear - pass/fail judgments. - - Good prompt examples: - - - "Each row should contain a valid email address in the 'email' column and a non-empty name - in the 'name' column" - - "The 'sentiment' column should contain positive sentiment words (happy, good, excellent, - etc.)" - - "Product descriptions should mention at least one technical specification" - - Poor prompt examples (avoid these): - - - "What do you think about this data?" (too open-ended) - - "Describe the quality of each row" (asks for description, not validation) - - "How would you improve this data?" (asks for suggestions, not pass/fail) - - Performance Considerations - -------------------------- - AI validation is significantly slower than traditional validation methods due to API calls - to LLM providers. 
However, performance varies dramatically based on data characteristics: - - **High Memoization Scenarios** (seconds to minutes): - - - data with many duplicate rows in the selected columns - - low cardinality data (repeated patterns) - - small number of unique row combinations - - **Low Memoization Scenarios** (minutes to hours): - - - high cardinality data with mostly unique rows - - large datasets with few repeated patterns - - all or most rows requiring individual LLM evaluation - - The row signature memoization optimization can reduce processing time significantly when - data has repetitive patterns. For datasets where every row is unique, expect longer - processing times similar to validating each row individually. - - **Strategies to Reduce Processing Time**: - - - test on data slices: define a sampling function like `def sample_1000(df): return df.head(1000)` - and use `pre=sample_1000` to validate on smaller samples - - filter relevant data: define filter functions like `def active_only(df): return df.filter(df["status"] == "active")` - and use `pre=active_only` to focus on a specific subset - - optimize column selection: use `columns_subset=` to include only the columns necessary - for validation - - start with smaller batches: begin with `batch_size=100` for testing, then increase - gradually - - reduce concurrency: lower `max_concurrent=1` if hitting rate limits - - use faster/cheaper models: consider using smaller or more efficient models for initial - testing before switching to more capable models - - Examples - -------- - The following examples demonstrate how to use AI validation for different types of data - quality checks. These examples show both basic usage and more advanced configurations with - custom thresholds and actions. - - **Basic AI validation example:** - - This first example shows a simple validation scenario where we want to check that customer - records have both valid email addresses and non-empty names. 
Notice how we use - `columns_subset=` to focus only on the relevant columns, which improves both performance - and cost-effectiveness. - - ```python - import pointblank as pb - import polars as pl - - # Sample data with email and name columns - tbl = pl.DataFrame({ - "email": ["john@example.com", "invalid-email", "jane@test.org"], - "name": ["John Doe", "", "Jane Smith"], - "age": [25, 30, 35] - }) - - # Validate using AI - validation = ( - pb.Validate(data=tbl) - .prompt( - prompt="Each row should have a valid email address and a non-empty name", - columns_subset=["email", "name"], # Only check these columns - model="openai:gpt-4o-mini", - ) - .interrogate() - ) - - validation - ``` - - In this example, the AI will identify that the second row fails validation because it has - both an invalid email format (`"invalid-email"`) and an empty name field, while the first and - third rows pass. The validation results will show 1 out of 3 rows failing the criteria. - - **Advanced example with custom thresholds:** - - This more sophisticated example demonstrates how to use AI validation with custom thresholds - and actions. Here we're validating phone number formats to ensure they include area codes, - which is a common data quality requirement for customer contact information. 
- - ```python - customer_data = pl.DataFrame({ - "customer_id": [1, 2, 3, 4, 5], - "name": ["John Doe", "Jane Smith", "Bob Johnson", "Alice Brown", "Charlie Davis"], - "phone_number": [ - "(555) 123-4567", # Valid with area code - "555-987-6543", # Valid with area code - "123-4567", # Missing area code - "(800) 555-1234", # Valid with area code - "987-6543" # Missing area code - ] - }) - - validation = ( - pb.Validate(data=customer_data) - .prompt( - prompt="Do all the phone numbers include an area code?", - columns_subset="phone_number", # Only check the `phone_number` column - model="openai:gpt-4o", - batch_size=500, - max_concurrent=5, - thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3), - actions=pb.Actions(error="Too many phone numbers missing area codes.") - ) - .interrogate() - ) - ``` - - This validation will identify that 2 out of 5 phone numbers (40%) are missing area codes, - which exceeds all threshold levels. The validation will trigger the specified error action - since the failure rate (40%) is above the error threshold (20%). The AI can recognize - various phone number formats and determine whether they include area codes. - - - -## The Column Selection family - -A flexible way to select columns for validation is to use the `col()` -function along with column selection helper functions. A combination of `col()` + `starts_with()`, -`matches()`, etc., allows for the selection of multiple target columns (mapping a validation across -many steps). Furthermore, the `col()` function can be used to declare a comparison column (e.g., -for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value -for comparison. - -col(exprs: 'str | ColumnSelector | ColumnSelectorNarwhals | nw.selectors.Selector') -> 'Column | ColumnLiteral | ColumnSelectorNarwhals' - - Helper function for referencing a column in the input table. 
- - Many of the validation methods (i.e., `col_vals_*()` methods) in Pointblank have a `value=` - argument. These validations are comparisons between column values and a literal value, or, - between column values and adjacent values in another column. The `col()` helper function is used - to specify that it is a column being referenced, not a literal value. - - The `col()` function doesn't check that the column exists in the input table. It acts to signal that the - value being compared is a column value. During validation (i.e., when - [`interrogate()`](`pointblank.Validate.interrogate`) is called), Pointblank will then check that - the column exists in the input table. - - For creating expressions to use with the `conjointly()` validation method, use the - [`expr_col()`](`pointblank.expr_col`) function instead. - - Parameters - ---------- - exprs - Either the name of a single column in the target table, provided as a string, or, an - expression involving column selector functions (e.g., `starts_with("a")`, - `ends_with("e") | starts_with("a")`, etc.). - - Returns - ------- - Column | ColumnLiteral | ColumnSelectorNarwhals: - A column object or expression representing the column reference. 
- - Usage with the `columns=` Argument - ----------------------------------- - The `col()` function can be used in the `columns=` argument of the following validation methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) - - If specifying a single column with certainty (you have the exact name), `col()` is not necessary - since you can just pass the column name as a string (though it is still valid to use - `col("column_name")`, if preferred). However, if you want to select columns based on complex - logic involving multiple column selector functions (e.g., columns that start with `"a"` but - don't end with `"e"`), you need to use `col()` to wrap expressions involving column selector - functions and logical operators such as `&`, `|`, `-`, and `~`. 
- - Here is an example of such usage with the [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - validation method: - - ```python - col_vals_gt(columns=col(starts_with("a") & ~ends_with("e")), value=10) - ``` - - If using only a single column selector function, you can pass the function directly to the - `columns=` argument of the validation method, or, you can use `col()` to wrap the function - (either is valid though the first is more concise). Here is an example of that simpler usage: - - ```python - col_vals_gt(columns=starts_with("a"), value=10) - ``` - - Usage with the `value=`, `left=`, and `right=` Arguments - -------------------------------------------------------- - The `col()` function can be used in the `value=` argument of the following validation methods - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - and in the `left=` and `right=` arguments (either or both) of these two validation methods - - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - You cannot use column selector functions such as [`starts_with()`](`pointblank.starts_with`) - in either of the `value=`, `left=`, or `right=` arguments since there would be no guarantee that - a single column will be resolved from the target table with this approach. The `col()` function - is used to signal that the value being compared is a column value and not a literal value. - - Available Selectors - ------------------- - There is a collection of selectors available in pointblank, allowing you to select columns based - on attributes of column names and positions. 
The selectors are: - - - [`starts_with()`](`pointblank.starts_with`) - - [`ends_with()`](`pointblank.ends_with`) - - [`contains()`](`pointblank.contains`) - - [`matches()`](`pointblank.matches`) - - [`everything()`](`pointblank.everything`) - - [`first_n()`](`pointblank.first_n`) - - [`last_n()`](`pointblank.last_n`) - - Alternatively, we support selectors from the Narwhals library! Those selectors can additionally - take advantage of the data types of the columns. The selectors are: - - - `boolean()` - - `by_dtype()` - - `categorical()` - - `matches()` - - `numeric()` - - `string()` - - Have a look at the [Narwhals API documentation on selectors](https://narwhals-dev.github.io/narwhals/api-reference/selectors/) - for more information. - - Examples - -------- - Suppose we have a table with columns `a` and `b` and we'd like to validate that the values in - column `a` are greater than the values in column `b`. We can use the `col()` helper function to - reference the comparison column when creating the validation step. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 7, 6, 5], - "b": [4, 2, 3, 3, 4, 3], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=pb.col("b")) - .interrogate() - ) - - validation - ``` - - From results of the validation table it can be seen that values in `a` were greater than values - in `b` for every row (or test unit). Using `value=pb.col("b")` specified that the greater-than - comparison is across columns, not with a fixed literal value. - - If you want to select an arbitrary set of columns upon which to base a validation, you can use - column selector functions (e.g., [`starts_with()`](`pointblank.starts_with`), - [`ends_with()`](`pointblank.ends_with`), etc.) to specify columns in the `columns=` argument of - a validation method. 
Let's use the [`starts_with()`](`pointblank.starts_with`) column selector - function to select columns that start with `"paid"` and validate that the values in those - columns are greater than `10`. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "paid_2021": [16.32, 16.25, 15.75], - "paid_2022": [18.62, 16.95, 18.25], - "person_id": ["A123", "B456", "C789"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.col(pb.starts_with("paid")), value=10) - .interrogate() - ) - - validation - ``` - - In the above example the `col()` function contains the invocation of the - [`starts_with()`](`pointblank.starts_with`) column selector function. This is not strictly - necessary when using a single column selector function, so `columns=pb.starts_with("paid")` - would be equivalent usage here. However, the use of `col()` is required when using multiple - column selector functions with logical operators. Here is an example of that more complex usage: - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "hours_2022": [160, 180, 160], - "hours_2023": [182, 168, 175], - "hours_2024": [200, 165, 190], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns=pb.col(pb.starts_with("paid") & pb.matches("2023|2024")), - value=10 - ) - .interrogate() - ) - - validation - ``` - - In the above example the `col()` function contains the invocation of the - [`starts_with()`](`pointblank.starts_with`) and [`matches()`](`pointblank.matches`) column - selector functions, combined with the `&` operator. This is necessary to specify the set of - columns that start with `"paid"` *and* match the text `"2023"` or `"2024"`. - - If you'd like to take advantage of Narwhals selectors, that's also possible. 
Here is an example - of using the `numeric()` column selector function to select all numeric columns for validation, - checking that their values are greater than or equal to `0`. - - ```python - import narwhals.selectors as ncs - - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "hours_2022": [160, 180, 160], - "hours_2023": [182, 168, 175], - "hours_2024": [200, 165, 190], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_ge(columns=pb.col(ncs.numeric()), value=0) - .interrogate() - ) - - validation - ``` - - In the above example the `col()` function contains the invocation of the `numeric()` column - selector function from Narwhals. As with the other selectors, this is not strictly necessary - when using a single column selector, so `columns=ncs.numeric()` would also be fine here. - - Narwhals selectors can also use operators to combine multiple selectors. Here is an example of - using the `numeric()` and [`matches()`](`pointblank.matches`) selectors together to select all - numeric columns that fit a specific pattern. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "2022_status": ["ft", "ft", "pt"], - "2023_status": ["ft", "pt", "ft"], - "2024_status": ["ft", "pt", "ft"], - "2022_pay_total": [18.62, 16.95, 18.25], - "2023_pay_total": [19.29, 17.75, 18.35], - "2024_pay_total": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_lt(columns=pb.col(ncs.numeric() & ncs.matches("2023|2024")), value=30) - .interrogate() - ) - - validation - ``` - - In the above example the `col()` function contains the invocation of the `numeric()` and - [`matches()`](`pointblank.matches`) column selector functions from Narwhals, combined with the - `&` operator. This is necessary to specify the set of columns that are numeric *and* match the - text `"2023"` or `"2024"`. 
- - See Also - -------- - Create a column expression for use in `conjointly()` validation with the - [`expr_col()`](`pointblank.expr_col`) function. - - -starts_with(text: 'str', case_sensitive: 'bool' = False) -> 'StartsWith' - - Select columns that start with specified text. - - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). The `starts_with()` selector - function can be used to select one or more columns that start with some specified text. So if - the set of table columns consists of - - `[name_first, name_last, age, address]` - - and you want to validate columns that start with `"name"`, you can use - `columns=starts_with("name")`. This will select the `name_first` and `name_last` columns. - - There will be a validation step created for every resolved column. Note that if there aren't any - columns resolved from using `starts_with()` (or any other expression using selector functions), - the validation step will fail to be evaluated during the interrogation process. Such a failure - to evaluate will be reported in the validation results but it won't affect the interrogation - process overall (i.e., the process won't be halted). - - Parameters - ---------- - text - The text that the column name should start with. - case_sensitive - Whether column names should be treated as case-sensitive. The default is `False`. - - Returns - ------- - StartsWith - A `StartsWith` object, which can be used to select columns that start with the specified - text. 
- - Relevant Validation Methods where `starts_with()` can be Used - ------------------------------------------------------------- - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) - - The `starts_with()` selector function doesn't need to be used in isolation. Read the next - section for information on how to compose it with other column selectors for more refined ways - to select columns. - - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `starts_with()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select columns that start with `"a"` and end with `"e"`, you - can use the `starts_with()` and [`ends_with()`](`pointblank.ends_with`) functions together. 
The - only condition is that the expressions are wrapped in the [`col()`](`pointblank.col`) function, - like this: - - ```python - col(starts_with("a") & ends_with("e")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). - - Examples - -------- - Suppose we have a table with columns `name`, `paid_2021`, `paid_2022`, and `person_id` and - we'd like to validate that the values in columns that start with `"paid"` are greater than `10`. - We can use the `starts_with()` column selector function to specify the columns that start with - `"paid"` as the columns to validate. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "paid_2021": [16.32, 16.25, 15.75], - "paid_2022": [18.62, 16.95, 18.25], - "person_id": ["A123", "B456", "C789"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.starts_with("paid"), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `paid_2021` and - one for `paid_2022`. The values in both columns were all greater than `10`. 
- - We can also use the `starts_with()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select columns that start with - `"paid"` and match the text `"2023"` or `"2024"`, we can use the `&` operator to combine column - selectors. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "hours_2022": [160, 180, 160], - "hours_2023": [182, 168, 175], - "hours_2024": [200, 165, 190], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns=pb.col(pb.starts_with("paid") & pb.matches("23|24")), - value=10 - ) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `paid_2023` and - one for `paid_2024`. - - -ends_with(text: 'str', case_sensitive: 'bool' = False) -> 'EndsWith' - - Select columns that end with specified text. - - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). The `ends_with()` selector - function can be used to select one or more columns that end with some specified text. So if the - set of table columns consists of - - `[first_name, last_name, age, address]` - - and you want to validate columns that end with `"name"`, you can use - `columns=ends_with("name")`. This will select the `first_name` and `last_name` columns. - - There will be a validation step created for every resolved column. 
Note that if there aren't any - columns resolved from using `ends_with()` (or any other expression using selector functions), - the validation step will fail to be evaluated during the interrogation process. Such a failure - to evaluate will be reported in the validation results but it won't affect the interrogation - process overall (i.e., the process won't be halted). - - Parameters - ---------- - text - The text that the column name should end with. - case_sensitive - Whether column names should be treated as case-sensitive. The default is `False`. - - Returns - ------- - EndsWith - An `EndsWith` object, which can be used to select columns that end with the specified text. - - Relevant Validation Methods where `ends_with()` can be Used - ----------------------------------------------------------- - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) 
- - The `ends_with()` selector function doesn't need to be used in isolation. Read the next section - for information on how to compose it with other column selectors for more refined ways to select - columns. - - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `ends_with()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select columns that end with `"e"` and start with `"a"`, you - can use the `ends_with()` and [`starts_with()`](`pointblank.starts_with`) functions together. - The only condition is that the expressions are wrapped in the [`col()`](`pointblank.col`) - function, like this: - - ```python - col(ends_with("e") & starts_with("a")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). - - Examples - -------- - Suppose we have a table with columns `name`, `2021_pay`, `2022_pay`, and `person_id` and - we'd like to validate that the values in columns that end with `"pay"` are greater than `10`. - We can use the `ends_with()` column selector function to specify the columns that end with - `"pay"` as the columns to validate. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "2021_pay": [16.32, 16.25, 15.75], - "2022_pay": [18.62, 16.95, 18.25], - "person_id": ["A123", "B456", "C789"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.ends_with("pay"), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `2021_pay` and one - for `2022_pay`. The values in both columns were all greater than `10`. - - We can also use the `ends_with()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select columns that end with `"pay"` - and match the text `"2023"` or `"2024"`, we can use the `&` operator to combine column - selectors. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "2022_hours": [160, 180, 160], - "2023_hours": [182, 168, 175], - "2024_hours": [200, 165, 190], - "2022_pay": [18.62, 16.95, 18.25], - "2023_pay": [19.29, 17.75, 18.35], - "2024_pay": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns=pb.col(pb.ends_with("pay") & pb.matches("2023|2024")), - value=10 - ) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `2023_pay` and one - for `2024_pay`. - - -contains(text: 'str', case_sensitive: 'bool' = False) -> 'Contains' - - Select columns that contain specified text. - - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). 
The `contains()` selector - function can be used to select one or more columns that contain some specified text. So if the - set of table columns consists of - - `[profit, conv_first, conv_last, highest_conv, age]` - - and you want to validate columns that have `"conv"` in the name, you can use - `columns=contains("conv")`. This will select the `conv_first`, `conv_last`, and `highest_conv` - columns. - - There will be a validation step created for every resolved column. Note that if there aren't any - columns resolved from using `contains()` (or any other expression using selector functions), the - validation step will fail to be evaluated during the interrogation process. Such a failure to - evaluate will be reported in the validation results but it won't affect the interrogation - process overall (i.e., the process won't be halted). - - Parameters - ---------- - text - The text that the column name should contain. - case_sensitive - Whether column names should be treated as case-sensitive. The default is `False`. - - Returns - ------- - Contains - A `Contains` object, which can be used to select columns that contain the specified text. 
- - Relevant Validation Methods where `contains()` can be Used - ---------------------------------------------------------- - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) - - The `contains()` selector function doesn't need to be used in isolation. Read the next section - for information on how to compose it with other column selectors for more refined ways to select - columns. - - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `contains()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select columns that have the text `"_n"` and start with - `"item"`, you can use the `contains()` and [`starts_with()`](`pointblank.starts_with`) functions - together. 
The only condition is that the expressions are wrapped in the - [`col()`](`pointblank.col`) function, like this: - - ```python - col(contains("_n") & starts_with("item")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). - - Examples - -------- - Suppose we have a table with columns `name`, `2021_pay_total`, `2022_pay_total`, and `person_id` - and we'd like to validate that the values in columns having `"pay"` in the name are greater than - `10`. We can use the `contains()` column selector function to specify the column names that - contain `"pay"` as the columns to validate. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "2021_pay_total": [16.32, 16.25, 15.75], - "2022_pay_total": [18.62, 16.95, 18.25], - "person_id": ["A123", "B456", "C789"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.contains("pay"), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `2021_pay_total` - and one for `2022_pay_total`. The values in both columns were all greater than `10`. 
- - We can also use the `contains()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select columns that contain `"pay"` - and match the text `"2023"` or `"2024"`, we can use the `&` operator to combine column - selectors. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "2022_hours": [160, 180, 160], - "2023_hours": [182, 168, 175], - "2024_hours": [200, 165, 190], - "2022_pay_total": [18.62, 16.95, 18.25], - "2023_pay_total": [19.29, 17.75, 18.35], - "2024_pay_total": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns=pb.col(pb.contains("pay") & pb.matches("2023|2024")), - value=10 - ) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `2023_pay_total` - and one for `2024_pay_total`. - - -matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches' - - Select columns that match a specified regular expression pattern. - - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). The `matches()` selector - function can be used to select one or more columns matching a provided regular expression - pattern. So if the set of table columns consists of - - `[rev_01, rev_02, profit_01, profit_02, age]` - - and you want to validate columns that have two digits at the end of the name, you can use - `columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and - `profit_02` columns. - - There will be a validation step created for every resolved column. 
Note that if there aren't any - columns resolved from using `matches()` (or any other expression using selector functions), the - validation step will fail to be evaluated during the interrogation process. Such a failure to - evaluate will be reported in the validation results but it won't affect the interrogation - process overall (i.e., the process won't be halted). - - Parameters - ---------- - pattern - The regular expression pattern that the column name should match. - case_sensitive - Whether column names should be treated as case-sensitive. The default is `False`. - - Returns - ------- - Matches - A `Matches` object, which can be used to select columns that match the specified pattern. - - Relevant Validation Methods where `matches()` can be Used - --------------------------------------------------------- - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - 
[`col_exists()`](`pointblank.Validate.col_exists`) - - The `matches()` selector function doesn't need to be used in isolation. Read the next section - for information on how to compose it with other column selectors for more refined ways to select - columns. - - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `matches()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select columns whose names start with five digits - and end with `"_id"`, you can use the `matches()` and [`ends_with()`](`pointblank.ends_with`) - functions together. The only condition is that the expressions are wrapped in the - [`col()`](`pointblank.col`) function, like this: - - ```python - col(matches(r"^[0-9]{5}") & ends_with("_id")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). - - Examples - -------- - Suppose we have a table with columns `name`, `id_old`, `new_identifier`, and `pay_2021` and we'd - like to validate that text values in columns having `"id"` or `"identifier"` in the name have a - specific syntax. We can use the `matches()` column selector function to specify the columns that - match the pattern. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "id_old": ["ID0021", "ID0032", "ID0043"], - "new_identifier": ["ID9054", "ID9065", "ID9076"], - "pay_2021": [16.32, 16.25, 15.75], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}") - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `id_old` and one - for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`. - - We can also use the `matches()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select columns that contain `"pay"` - and match the text `"2023"` or `"2024"`, we can use the `&` operator to combine column - selectors. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "2022_hours": [160, 180, 160], - "2023_hours": [182, 168, 175], - "2024_hours": [200, 165, 190], - "2022_pay_total": [18.62, 16.95, 18.25], - "2023_pay_total": [19.29, 17.75, 18.35], - "2024_pay_total": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns=pb.col(pb.contains("pay") & pb.matches("2023|2024")), - value=10 - ) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `2023_pay_total` - and one for `2024_pay_total`. - - -everything() -> 'Everything' - - Select all columns. - - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). 
The `everything()` selector - function can be used to select every column in the table. If you have a table with six columns - and they're all suitable for a specific type of validation, you can use `columns=everything()` - and all six columns will be selected for validation. - - Returns - ------- - Everything - An `Everything` object, which can be used to select all columns. - - Relevant Validation Methods where `everything()` can be Used - ------------------------------------------------------------ - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) - - The `everything()` selector function doesn't need to be used in isolation. Read the next section - for information on how to compose it with other column selectors for more refined ways to select - columns. 
- - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `everything()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select all column names except those starting with - "id_", you can use the `everything()` and [`starts_with()`](`pointblank.starts_with`) - functions together. The only condition is that the expressions are wrapped in the - [`col()`](`pointblank.col`) function, like this: - - ```python - col(everything() - starts_with("id_")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). - - Examples - -------- - Suppose we have a table with several numeric columns and we'd like to validate that all values in - these columns are less than `1000`. We can use the `everything()` column selector function to select - all columns for validation. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "2023_hours": [182, 168, 175], - "2024_hours": [200, 165, 190], - "2023_pay_total": [19.29, 17.75, 18.35], - "2024_pay_total": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_lt(columns=pb.everything(), value=1000) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get four validation steps, one for each column in the - table. The values in every column were all lower than `1000`. - - We can also use the `everything()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select every column except those that - begin with `"2023"` we can use the `-` operator to combine column selectors. - - ```python - tbl = pl.DataFrame( - { - "2023_hours": [182, 168, 175], - "2024_hours": [200, 165, 190], - "2023_pay_total": [19.29, 17.75, 18.35], - "2024_pay_total": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_lt(columns=pb.col(pb.everything() - pb.starts_with("2023")), value=1000) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get two validation steps, one for `2024_hours` and - one for `2024_pay_total`. - - -first_n(n: 'int', offset: 'int' = 0) -> 'FirstN' - - Select the first `n` columns in the column list. - - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). The `first_n()` selector - function can be used to select *n* columns positioned at the start of the column list. 
So if the - set of table columns consists of - - `[rev_01, rev_02, profit_01, profit_02, age]` - - and you want to validate the first two columns, you can use `columns=first_n(2)`. This will - select the `rev_01` and `rev_02` columns and a validation step will be created for each. - - The `offset=` parameter can be used to skip a certain number of columns from the start of the - column list. So if you want to select the third and fourth columns, you can use - `columns=first_n(2, offset=2)`. - - Parameters - ---------- - n - The number of columns to select from the start of the column list. Should be a positive - integer value. If `n` is greater than the number of columns in the table, all columns will - be selected. - offset - The offset from the start of the column list. The default is `0`. If `offset` is greater - than the number of columns in the table, no columns will be selected. - - Returns - ------- - FirstN - A `FirstN` object, which can be used to select the first `n` columns. - - Relevant Validation Methods where `first_n()` can be Used - --------------------------------------------------------- - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - 
[`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) - - The `first_n()` selector function doesn't need to be used in isolation. Read the next section - for information on how to compose it with other column selectors for more refined ways to select - columns. - - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `first_n()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select all column names starting with "rev" along with the - first two columns, you can use the `first_n()` and [`starts_with()`](`pointblank.starts_with`) - functions together. The only condition is that the expressions are wrapped in the - [`col()`](`pointblank.col`) function, like this: - - ```python - col(first_n(2) | starts_with("rev")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). 
- - Examples - -------- - Suppose we have a table with columns `paid_2021`, `paid_2022`, `paid_2023`, `paid_2024`, and - `name` and we'd like to validate that the values in the first four columns are greater than - `10`. We can use the `first_n()` column selector function to specify that the first four columns - in the table are the columns to validate. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "paid_2021": [17.94, 16.55, 17.85], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - "name": ["Alice", "Bob", "Charlie"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.first_n(4), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get four validation steps. The values in all those - columns were all greater than `10`. - - We can also use the `first_n()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select the first four columns but - also omit those columns that end with `"2023"`, we can use the `-` operator to combine column - selectors. - - ```python - tbl = pl.DataFrame( - { - "paid_2021": [17.94, 16.55, 17.85], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - "name": ["Alice", "Bob", "Charlie"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.col(pb.first_n(4) - pb.ends_with("2023")), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get three validation steps, one for `paid_2021`, - `paid_2022`, and `paid_2024`. - - -last_n(n: 'int', offset: 'int' = 0) -> 'LastN' - - Select the last `n` columns in the column list. 
- - Many validation methods have a `columns=` argument that can be used to specify the columns for - validation (e.g., [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). The `last_n()` selector - function can be used to select *n* columns positioned at the end of the column list. So if the - set of table columns consists of - - `[age, rev_01, rev_02, profit_01, profit_02]` - - and you want to validate the last two columns, you can use `columns=last_n(2)`. This will select - the `profit_01` and `profit_02` columns and a validation step will be created for each. - - The `offset=` parameter can be used to skip a certain number of columns from the end of the - column list. So if you want to select the third and fourth columns from the end, you can use - `columns=last_n(2, offset=2)`. - - Parameters - ---------- - n - The number of columns to select from the end of the column list. Should be a positive - integer value. If `n` is greater than the number of columns in the table, all columns will - be selected. - offset - The offset from the end of the column list. The default is `0`. If `offset` is greater than - the number of columns in the table, no columns will be selected. - - Returns - ------- - LastN - A `LastN` object, which can be used to select the last `n` columns. 
- - Relevant Validation Methods where `last_n()` can be Used - -------------------------------------------------------- - This selector function can be used in the `columns=` argument of the following validation - methods: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_exists()`](`pointblank.Validate.col_exists`) - - The `last_n()` selector function doesn't need to be used in isolation. Read the next section for - information on how to compose it with other column selectors for more refined ways to select - columns. - - Additional Flexibility through Composition with Other Column Selectors - ---------------------------------------------------------------------- - The `last_n()` function can be composed with other column selectors to create fine-grained - column selections. For example, to select all column names starting with "rev" along with the - last two columns, you can use the `last_n()` and [`starts_with()`](`pointblank.starts_with`) - functions together. 
The only condition is that the expressions are wrapped in the - [`col()`](`pointblank.col`) function, like this: - - ```python - col(last_n(2) | starts_with("rev")) - ``` - - There are four operators that can be used to compose column selectors: - - - `&` (*and*) - - `|` (*or*) - - `-` (*difference*) - - `~` (*not*) - - The `&` operator is used to select columns that satisfy both conditions. The `|` operator is - used to select columns that satisfy either condition. The `-` operator is used to select columns - that satisfy the first condition but not the second. The `~` operator is used to select columns - that don't satisfy the condition. As many selector functions can be used as needed and the - operators can be combined to create complex column selection criteria (parentheses can be used - to group conditions and control the order of evaluation). - - Examples - -------- - Suppose we have a table with columns `name`, `paid_2021`, `paid_2022`, `paid_2023`, and - `paid_2024` and we'd like to validate that the values in the last four columns are greater than - `10`. We can use the `last_n()` column selector function to specify that the last four columns - in the table are the columns to validate. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "paid_2021": [17.94, 16.55, 17.85], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.last_n(4), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get four validation steps. The values in all those - columns were all greater than `10`. 
- - We can also use the `last_n()` function in combination with other column selectors (within - [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select - columns that satisfy multiple conditions). For example, to select the last four columns but also - omit those columns that end with `"2023"`, we can use the `-` operator to combine column - selectors. - - ```python - tbl = pl.DataFrame( - { - "name": ["Alice", "Bob", "Charlie"], - "paid_2021": [17.94, 16.55, 17.85], - "paid_2022": [18.62, 16.95, 18.25], - "paid_2023": [19.29, 17.75, 18.35], - "paid_2024": [20.73, 18.35, 20.10], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns=pb.col(pb.last_n(4) - pb.ends_with("2023")), value=10) - .interrogate() - ) - - validation - ``` - - From the results of the validation table we get three validation steps, one for `paid_2021`, - `paid_2022`, and `paid_2024`. - - -expr_col(column_name: 'str') -> 'ColumnExpression' - - Create a column expression for use in `conjointly()` validation. - - This function returns a ColumnExpression object that supports operations like `>`, `<`, `+`, - etc. for use in [`conjointly()`](`pointblank.Validate.conjointly`) validation expressions. - - Parameters - ---------- - column_name - The name of the column to reference. - - Returns - ------- - ColumnExpression - A column expression that can be used in comparisons and operations. - - Examples - -------- - Let's say we have a table with three columns: `a`, `b`, and `c`. We want to validate that: - - - The values in column `a` are greater than `2`. - - The values in column `b` are less than `7`. - - The sum of columns `a` and `b` is less than the values in column `c`. - - We can use the `expr_col()` function to create a column expression for each of these conditions. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 7, 1, 3, 9, 4], - "b": [6, 3, 0, 5, 8, 2], - "c": [10, 4, 8, 9, 10, 5], - } - ) - - # Using expr_col() to create backend-agnostic validation expressions - validation = ( - pb.Validate(data=tbl) - .conjointly( - lambda df: pb.expr_col("a") > 2, - lambda df: pb.expr_col("b") < 7, - lambda df: pb.expr_col("a") + pb.expr_col("b") < pb.expr_col("c") - ) - .interrogate() - ) - - validation - ``` - - The above code creates a validation object that checks the specified conditions using the - `expr_col()` function. The resulting validation table will show whether each condition was - satisfied for each row in the table. - - See Also - -------- - The [`conjointly()`](`pointblank.Validate.conjointly`) validation method, which is where this - function should be used. - - - -## The Segments family - -Combine multiple values into a single segment using `seg_*()` helper functions. - -seg_group(values: 'list[Any]') -> 'Segment' - - Group together values for segmentation. - - Many validation methods have a `segments=` argument that can be used to specify one or more - columns, or certain values within a column, to create segments for validation (e.g., - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`), etc.). When passing in a column, or - a tuple with a column and certain values, a segment will be created for each individual value - within the column or given values. The `seg_group()` selector enables values to be grouped - together into a segment. For example, if you were to create a segment for a column "region", - investigating just "North" and "South" regions, a typical segment would look like: - - `segments=("region", ["North", "South"])` - - This would create two validation steps, one for each of the regions. 
If you wanted to group - these two regions into a single segment, you could use the `seg_group()` function like this: - - `segments=("region", pb.seg_group(["North", "South"]))` - - You could create a second segment for "East" and "West" regions like this: - - `segments=("region", pb.seg_group([["North", "South"], ["East", "West"]]))` - - There will be a validation step created for every segment. Note that if there aren't any - segments created using `seg_group()` (or any other segment expression), the validation step will - fail to be evaluated during the interrogation process. Such a failure to evaluate will be - reported in the validation results but it won't affect the interrogation process overall - (i.e., the process won't be halted). - - Parameters - ---------- - values - A list of values to be grouped into a segment. This can be a single list or a list of lists. - - Returns - ------- - Segment - A `Segment` object, which can be used to combine values into a segment. - - Examples - -------- - Let's say we're analyzing sales from our local bookstore, and want to check the number of books - sold for the month exceeds a certain threshold. We could pass in the argument - `segments="genre"`, which would return a segment for each unique genre in the datasets. We could - also pass in `segments=("genre", ["Fantasy", "Science Fiction"])`, to only create segments for - those two genres. However, if we wanted to group these two genres into a single segment, we - could use the `seg_group()` function. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "title": [ - "The Hobbit", - "Harry Potter and the Sorcerer's Stone", - "The Lord of the Rings", - "A Game of Thrones", - "The Name of the Wind", - "The Girl with the Dragon Tattoo", - "The Da Vinci Code", - "The Hitchhiker's Guide to the Galaxy", - "The Martian", - "Brave New World" - ], - "genre": [ - "Fantasy", - "Fantasy", - "Fantasy", - "Fantasy", - "Fantasy", - "Mystery", - "Mystery", - "Science Fiction", - "Science Fiction", - "Science Fiction", - ], - "units_sold": [875, 932, 756, 623, 445, 389, 678, 534, 712, 598], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns="units_sold", - value=500, - segments=("genre", pb.seg_group(["Fantasy", "Science Fiction"])) - ) - .interrogate() - ) - - validation - ``` - - What's more, we can create multiple segments, combining the genres in different ways. - - ```python - validation = ( - pb.Validate(data=tbl) - .col_vals_gt( - columns="units_sold", - value=500, - segments=("genre", pb.seg_group([ - ["Fantasy", "Science Fiction"], - ["Fantasy", "Mystery"], - ["Mystery", "Science Fiction"] - ])) - ) - .interrogate() - ) - - validation - ``` - - - - -## The Interrogation and Reporting family - -The validation plan is put into action when `interrogate()` is called. -The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding -validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation -report table (by printing the object or using `get_tabular_report()`), extract key metrics, or we -can split the data based on the validation results (with `get_sundered_data()`). 
- -interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' = True, get_first_n: 'int | None' = None, sample_n: 'int | None' = None, sample_frac: 'int | float | None' = None, extract_limit: 'int' = 500) -> 'Validate' - - Execute each validation step against the table and store the results. - - When a validation plan has been set with a series of validation steps, the interrogation - process through `interrogate()` should then be invoked. Interrogation will evaluate each - validation step against the table and store the results. - - The interrogation process will collect extracts of failing rows if the `collect_extracts=` - option is set to `True` (the default). We can control the number of rows collected using the - `get_first_n=`, `sample_n=`, and `sample_frac=` options. The `extract_limit=` option will - enforce a hard limit on the number of rows collected when `collect_extracts=True`. - - After interrogation is complete, the `Validate` object will have gathered information, and - we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`), - [`f_failed()`](`pointblank.Validate.f_failed`), etc., to understand how the table performed - against the validation plan. A visual representation of the validation results can be viewed - by printing the `Validate` object; this will display the validation table in an HTML viewing - environment. - - Parameters - ---------- - collect_extracts - An option to collect rows of the input table that didn't pass a particular validation - step. The default is `True` and further options (i.e., `get_first_n=`, `sample_*=`) - allow for fine control of how these rows are collected. - collect_tbl_checked - The processed data frames produced by executing the validation steps is collected and - stored in the `Validate` object if `collect_tbl_checked=True`. 
This information is - necessary for some methods (e.g., - [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`)), but it can - potentially make the object grow to a large size. To opt out of attaching this data, set - this to `False`. - get_first_n - If the option to collect non-passing rows is chosen, there is the option here to - collect the first `n` rows. Supply an integer number of rows to extract from the top of - the subset table containing non-passing rows (the ordering of data from the original table - is retained). - sample_n - If the option to collect non-passing rows is chosen, this option allows for the - sampling of `n` rows. Supply an integer number of rows to sample from the subset table. - If `n` happens to be greater than the number of non-passing rows, then all such rows - will be returned. - sample_frac - If the option to collect non-passing rows is chosen, this option allows for the sampling - of a fraction of those rows. Provide a number in the range of `0` to `1`. The number of - rows to return could be very large, however, the `extract_limit=` option will apply a - hard limit to the returned rows. - extract_limit - A value that limits the possible number of rows returned when extracting non-passing - rows. The default is `500` rows. This limit is applied after any sampling or limiting - options are applied. If the number of rows to be returned is greater than this limit, - then the number of rows returned will be limited to this value. This is useful for - preventing the collection of too many rows when the number of non-passing rows is very - large. - - Returns - ------- - Validate - The `Validate` object with the results of the interrogation. - - Examples - -------- - Let's use a built-in dataset (`"game_revenue"`) to demonstrate some of the options of the - interrogation process. A series of validation steps will populate our validation plan. 
After - setting up the plan, the next step is to interrogate the table and see how well it aligns - with our expectations. We'll use the `get_first_n=` option so that any extracts of failing - rows are limited to the first `n` rows. - - ```python - import pointblank as pb - import polars as pl - - validation = ( - pb.Validate(data=pb.load_dataset(dataset="game_revenue")) - .col_vals_lt(columns="item_revenue", value=200) - .col_vals_gt(columns="item_revenue", value=0) - .col_vals_gt(columns="session_duration", value=5) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - ) - - validation.interrogate(get_first_n=10) - ``` - - The validation table shows that step 3 (checking for `session_duration` greater than `5`) - has 18 failing test units. This means that 18 rows in the table are problematic. We'd like - to see the rows that failed this validation step and we can do that with the - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method. - - ```python - pb.preview(validation.get_data_extracts(i=3, frame=True)) - ``` - - The [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`) method will return a - Polars DataFrame here with the first 10 rows that failed the validation step (we passed that - into the [`preview()`](`pointblank.preview`) function for a better display). There are - actually 18 rows that failed but we limited the collection of extracts with - `get_first_n=10`. - - -set_tbl(self, tbl: 'Any', tbl_name: 'str | None' = None, label: 'str | None' = None) -> 'Validate' - - Set or replace the table associated with the Validate object. - - This method allows you to replace the table associated with a Validate object with a - different (but presumably similar) table. This is useful when you want to apply the same - validation plan to multiple tables or when you have a validation workflow defined but want - to swap in a different data source. 
- - Parameters - ---------- - tbl - The table to replace the existing table with. This can be any supported table type - including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, - GitHub URLs, or database connection strings. The same table type constraints apply as in - the `Validate` constructor. - tbl_name - An optional name to assign to the new input table object. If no value is provided, the - existing table name will be retained. - label - An optional label for the validation plan. If no value is provided, the existing label - will be retained. - - Returns - ------- - Validate - A new `Validate` object with the replacement table. - - When to Use - ----------- - The `set_tbl()` method is particularly useful in scenarios where you have: - - - multiple similar tables that need the same validation checks - - a template validation workflow that should be applied to different data sources - - YAML-defined validations where you want to override the table specified in the YAML - - The `set_tbl()` method creates a copy of the validation object with the new table, so the - original validation object remains unchanged. This allows you to reuse validation plans - across multiple tables without interference. - - Examples - -------- - We will first create two similar tables for our future validation plans. - - ```python - import pointblank as pb - import polars as pl - - # Create two similar tables - table_1 = pl.DataFrame({ - "x": [1, 2, 3, 4, 5], - "y": [5, 4, 3, 2, 1], - "z": ["a", "b", "c", "d", "e"] - }) - - table_2 = pl.DataFrame({ - "x": [2, 4, 6, 8, 10], - "y": [10, 8, 6, 4, 2], - "z": ["f", "g", "h", "i", "j"] - }) - ``` - - Create a validation plan with the first table. 
- - ```python - validation_table_1 = ( - pb.Validate( - data=table_1, - tbl_name="Table 1", - label="Validation applied to the first table" - ) - .col_vals_gt(columns="x", value=0) - .col_vals_lt(columns="y", value=10) - ) - ``` - - Now apply the same validation plan to the second table. - - ```python - validation_table_2 = ( - validation_table_1 - .set_tbl( - tbl=table_2, - tbl_name="Table 2", - label="Validation applied to the second table" - ) - ) - ``` - - Here is the interrogation of the first table: - - ```python - validation_table_1.interrogate() - ``` - - And the second table: - - ```python - validation_table_2.interrogate() - ``` - - -get_tabular_report(self, title: 'str | None' = ':default:', incl_header: 'bool | None' = None, incl_footer: 'bool | None' = None, incl_footer_timings: 'bool | None' = None, incl_footer_notes: 'bool | None' = None) -> 'GT' - - Validation report as a GT table. - - The `get_tabular_report()` method returns a GT table object that represents the validation - report. This validation table provides a summary of the validation results, including the - validation steps, the number of test units, the number of failing test units, and the - fraction of failing test units. The table also includes status indicators for the 'warning', - 'error', and 'critical' levels. - - You could simply display the validation table without the use of the `get_tabular_report()` - method. However, the method provides a way to customize the title of the report. In the - future this method may provide additional options for customizing the report. - - Parameters - ---------- - title - Options for customizing the title of the report. The default is the `":default:"` value - which produces a generic title. Another option is `":tbl_name:"`, and that presents the - name of the table as the title for the report. If no title is wanted, then `":none:"` - can be used. Aside from keyword options, text can be provided for the title. 
This will - be interpreted as Markdown text and transformed internally to HTML. - incl_header - Controls whether the header section should be displayed. If `None`, uses the global - configuration setting. The header contains the table name, label, and threshold - information. - incl_footer - Controls whether the footer section should be displayed. If `None`, uses the global - configuration setting. The footer can contain validation timing information and notes. - incl_footer_timings - Controls whether validation timing information (start time, duration, end time) should - be displayed in the footer. If `None`, uses the global configuration setting. Only - applies when `incl_footer=True`. - incl_footer_notes - Controls whether notes from validation steps should be displayed in the footer. If - `None`, uses the global configuration setting. Only applies when `incl_footer=True`. - - Returns - ------- - GT - A GT table object that represents the validation report. - - Examples - -------- - Let's create a `Validate` object with a few validation steps and then interrogate the data - table to see how it performs against the validation plan. We can then generate a tabular - report to get a summary of the results. - - ```python - import pointblank as pb - import polars as pl - - # Create a Polars DataFrame - tbl_pl = pl.DataFrame({"x": [1, 2, 3, 4], "y": [4, 5, 6, 7]}) - - # Validate data using Polars DataFrame - validation = ( - pb.Validate(data=tbl_pl, tbl_name="tbl_xy", thresholds=(2, 3, 4)) - .col_vals_gt(columns="x", value=1) - .col_vals_lt(columns="x", value=3) - .col_vals_le(columns="y", value=7) - .interrogate() - ) - - # Look at the validation table - validation - ``` - - The validation table is displayed with a default title ('Validation Report'). We can use the - `get_tabular_report()` method to customize the title of the report. For example, we can set - the title to the name of the table by using the `title=":tbl_name:"` option. 
This will use - the string provided in the `tbl_name=` argument of the `Validate` object. - - ```python - validation.get_tabular_report(title=":tbl_name:") - ``` - - The title of the report is now set to the name of the table, which is 'tbl_xy'. This can be - useful if you have multiple tables and want to keep track of which table the validation - report is for. - - Alternatively, you can provide your own title for the report. - - ```python - validation.get_tabular_report(title="Report for Table XY") - ``` - - The title of the report is now set to 'Report for Table XY'. This can be useful if you want - to provide a more descriptive title for the report. - - -get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None' = None, header: 'str' = ':default:', limit: 'int | None' = 10) -> 'GT' - - Get a detailed report for a single validation step. - - The `get_step_report()` method returns a report of what went well---or what failed - spectacularly---for a given validation step. The report includes a summary of the validation - step and a detailed breakdown of the interrogation results. The report is presented as a GT - table object, which can be displayed in a notebook or exported to an HTML file. - - :::{.callout-warning} - The `get_step_report()` method is still experimental. Please report any issues you encounter - in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - i - The step number for which to get the report. - columns_subset - The columns to display in a step report that shows errors in the input table. By default - all columns are shown (`None`). If a subset of columns is desired, we can provide a list - of column names, a string with a single column name, a `Column` object, or a - `ColumnSelector` object. The last two options allow for more flexible column selection - using column selector functions. 
Errors are raised if the column names provided don't - match any columns in the table (when provided as a string or list of strings) or if - column selector expressions don't resolve to any columns. - header - Options for customizing the header of the step report. The default is the `":default:"` - value which produces a header with a standard title and set of details underneath. Aside - from this default, free text can be provided for the header. This will be interpreted as - Markdown text and transformed internally to HTML. You can provide one of two templating - elements: `{title}` and `{details}`. The default header has the template - `"{title}{details}"` so you can easily start from that and modify as you see fit. If you - don't want a header at all, you can set `header=None` to remove it entirely. - limit - The number of rows to display for those validation steps that check values in rows (the - `col_vals_*()` validation steps). The default is `10` rows and the limit can be removed - entirely by setting `limit=None`. - - Returns - ------- - GT - A GT table object that represents the detailed report for the validation step. - - Types of Step Reports - --------------------- - The `get_step_report()` method produces a report based on the *type* of validation step. 
- The following column-value or row-based validation methods will produce a - report that shows the rows of the data that failed: - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`) - - [`conjointly()`](`pointblank.Validate.conjointly`) - - [`prompt()`](`pointblank.Validate.prompt`) - - [`rows_complete()`](`pointblank.Validate.rows_complete`) - - The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a - report that shows duplicate rows (or duplicate values in one or a set of columns as defined - in that method's `columns_subset=` parameter). - - The [`col_schema_match()`](`pointblank.Validate.col_schema_match`) validation step will - produce a report that shows the schema of the data table and the schema of the validation - step. The report will indicate whether the schemas match or not. - - Examples - -------- - Let's create a validation plan with a few validation steps and interrogate the data.
With - that, we'll have a look at the validation reporting table for the entire collection of - steps and what went well or what failed. - - ```python - import pointblank as pb - - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), - tbl_name="small_table", - label="Example for the get_step_report() method", - thresholds=(1, 0.20, 0.40) - ) - .col_vals_lt(columns="d", value=3500) - .col_vals_between(columns="c", left=1, right=8) - .col_vals_gt(columns="a", value=3) - .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}") - .interrogate() - ) - - validation - ``` - - There were four validation steps performed, where the first three steps had failing test - units and the last step had no failures. Let's get a detailed report for the first step by - using the `get_step_report()` method. - - ```python - validation.get_step_report(i=1) - ``` - - The report for the first step is displayed. The report includes a summary of the validation - step and a detailed breakdown of the interrogation results. The report provides details on - what the validation step was checking, the extent to which the test units failed, and a - table that shows the failing rows of the data with the column of interest highlighted. - - The second and third steps also had failing test units. Reports for those steps can be - viewed by using `get_step_report(i=2)` and `get_step_report(i=3)` respectively. - - The final step did not have any failing test units. A report for the final step can still be - viewed by using `get_step_report(i=4)`. The report will indicate that every test unit passed - and a preview of the target table will be provided. - - ```python - validation.get_step_report(i=4) - ``` - - If you'd like to trim down the number of columns shown in the report, you can provide a - subset of columns to display. For example, if you only want to see the columns `a`, `b`, and - `c`, you can provide those column names as a list.
- - ```python - validation.get_step_report(i=1, columns_subset=["a", "b", "c"]) - ``` - - If you'd like to increase or reduce the maximum number of rows shown in the report, you can - provide a different value for the `limit` parameter. For example, if you'd like to see only - up to 5 rows, you can set `limit=5`. - - ```python - validation.get_step_report(i=3, limit=5) - ``` - - Step 3 actually had 7 failing test units, but only the first 5 rows are shown in the step - report because of the `limit=5` parameter. - - -get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'list[str] | None' = None) -> 'str' - - Get a report of the validation results as a JSON-formatted string. - - The `get_json_report()` method provides a machine-readable report of validation results in - JSON format. This is particularly useful for programmatic processing, storing validation - results, or integrating with other systems. The report includes detailed information about - each validation step, such as assertion type, columns validated, threshold values, test - results, and more. - - By default, all available validation information fields are included in the report. However, - you can customize the fields to include or exclude using the `use_fields=` and - `exclude_fields=` parameters. - - Parameters - ---------- - use_fields - An optional list of specific fields to include in the report. If provided, only these - fields will be included in the JSON output. If `None` (the default), all standard - validation report fields are included. Have a look at the *Available Report Fields* - section below for a list of fields that can be included in the report. - exclude_fields - An optional list of fields to exclude from the report. If provided, these fields will - be omitted from the JSON output. If `None` (the default), no fields are excluded. - This parameter cannot be used together with `use_fields=`. 
The *Available Report Fields* - section provides a listing of fields that can be excluded from the report. - - Returns - ------- - str - A JSON-formatted string representing the validation report, with each validation step - as an object in the report array. - - Available Report Fields - ----------------------- - The JSON report can include any of the standard validation report fields, including: - - - `i`: the step number (1-indexed) - - `i_o`: the original step index from the validation plan (pre-expansion) - - `assertion_type`: the type of validation assertion (e.g., `"col_vals_gt"`, etc.) - - `column`: the column being validated (or columns used in certain validations) - - `values`: the comparison values or parameters used in the validation - - `inclusive`: whether the comparison is inclusive (for range-based validations) - - `na_pass`: whether `NA`/`Null` values are considered passing (for certain validations) - - `pre`: preprocessing function applied before validation - - `segments`: data segments to which the validation was applied - - `thresholds`: threshold level statement that was used for the validation step - - `label`: custom label for the validation step - - `brief`: a brief description of the validation step - - `active`: whether the validation step is active - - `all_passed`: whether all test units passed in the step - - `n`: total number of test units - - `n_passed`, `n_failed`: number of test units that passed and failed - - `f_passed`, `f_failed`: fraction of test units that passed and failed - - `warning`, `error`, `critical`: whether the namesake threshold level was exceeded (is - `null` if threshold not set) - - `time_processed`: when the validation step was processed (ISO 8601 format) - - `proc_duration_s`: the processing duration in seconds - - Examples - -------- - Let's create a validation plan with a few validation steps and generate a JSON report of the - results: - - ```python - import pointblank as pb - import polars as pl - - # Create a
sample DataFrame - tbl = pl.DataFrame({ - "a": [5, 7, 8, 9], - "b": [3, 4, 2, 1] - }) - - # Create and execute a validation plan - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=6) - .col_vals_lt(columns="b", value=4) - .interrogate() - ) - - # Get the full JSON report - json_report = validation.get_json_report() - - print(json_report) - ``` - - You can also customize which fields to include: - - ```python - json_report = validation.get_json_report( - use_fields=["i", "assertion_type", "column", "n_passed", "n_failed"] - ) - - print(json_report) - ``` - - Or which fields to exclude: - - ```python - json_report = validation.get_json_report( - exclude_fields=[ - "i_o", "thresholds", "pre", "segments", "values", - "na_pass", "inclusive", "label", "brief", "active", - "time_processed", "proc_duration_s" - ] - ) - - print(json_report) - ``` - - The JSON output can be further processed or analyzed programmatically: - - ```python - import json - - # Parse the JSON report - report_data = json.loads(validation.get_json_report()) - - # Extract and analyze validation results - failing_steps = [step for step in report_data if step["n_failed"] > 0] - print(f"Number of failing validation steps: {len(failing_steps)}") - ``` - - See Also - -------- - - [`get_tabular_report()`](`pointblank.Validate.get_tabular_report`): Get a formatted HTML - report as a GT table - - [`get_data_extracts()`](`pointblank.Validate.get_data_extracts`): Get rows that - failed validation - - -get_sundered_data(self, type='pass') -> 'Any' - - Get the data that passed or failed the validation steps. - - Validation of the data is one thing but, sometimes, you want to use the best part of the - input dataset for something else. The `get_sundered_data()` method works with a `Validate` - object that has been interrogated (i.e., the - [`interrogate()`](`pointblank.Validate.interrogate`) method was used). 
We can get either the - 'pass' data piece (rows with no failing test units across all column-value based validation - functions), or, the 'fail' data piece (rows with at least one failing test unit across the - same series of validations). - - Details - ------- - There are some caveats to sundering. The validation steps considered for this splitting will - only involve steps where: - - - the check type is one where test units are cells checked down a column (e.g., the - `col_vals_*()` methods) - - `active=` is not set to `False` - - `pre=` has not been given an expression for modifying the input table - - So long as these conditions are met, the data will be split into two constituent tables: one - with the rows that passed all validation steps and another with the rows that failed at - least one validation step. - - Parameters - ---------- - type - The type of data to return. Options are `"pass"` or `"fail"`, where the former returns - a table only containing rows where test units always passed validation steps, and the - latter returns a table only containing rows that had test units that failed in at least one - validation step. - - Returns - ------- - Any - A table containing the data that passed or failed the validation steps. - - Examples - -------- - Let's create a `Validate` object with three validation steps and then interrogate the data. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 6, 9, 7, 3, 2], - "b": [9, 8, 10, 5, 10, 6], - "c": ["c", "d", "a", "b", "a", "b"] - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=5) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation - ``` - - From the validation table, we can see that the first and second steps each had 4 passing - test units. A failing test unit will mark the entire row as failing in the context of the - `get_sundered_data()` method.
We can use this method to get the rows of data that passed - during interrogation. - - ```python - pb.preview(validation.get_sundered_data()) - ``` - - The returned DataFrame contains the rows that passed all validation steps (we passed this - object to [`preview()`](`pointblank.preview`) to show it in an HTML view). From the six-row - input DataFrame, the first two rows and the last two rows had test units that failed - validation. Thus the middle two rows are the only ones that passed all validation steps and - that's what we see in the returned DataFrame. - - -get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = False) -> 'dict[int, Any] | Any' - - Get the rows that failed for each validation step. - - After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the - `get_data_extracts()` method can be used to extract the rows that failed in each - column-value or row-based validation step (e.g., - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`), - [`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a - dictionary of tables containing the rows that failed in every validation step. If - `frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing - the dictionary structure). - - Parameters - ---------- - i - The validation step number(s) from which the failed rows are obtained. Can be provided - as a list of integers or a single integer. If `None`, all steps are included. - frame - If `True` and `i=` is a scalar, return the value as a DataFrame instead of a dictionary. - - Returns - ------- - dict[int, Any] | Any - A dictionary of tables containing the rows that failed in every compatible validation - step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
- - Compatible Validation Methods for Yielding Extracted Rows - --------------------------------------------------------- - The following validation methods operate on column values and will have rows extracted when - there are failing test units. - - - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) - - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) - - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) - - [`col_vals_le()`](`pointblank.Validate.col_vals_le`) - - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`) - - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`) - - [`col_vals_between()`](`pointblank.Validate.col_vals_between`) - - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`) - - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`) - - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`) - - [`col_vals_increasing()`](`pointblank.Validate.col_vals_increasing`) - - [`col_vals_decreasing()`](`pointblank.Validate.col_vals_decreasing`) - - [`col_vals_null()`](`pointblank.Validate.col_vals_null`) - - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`) - - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`) - - [`col_vals_within_spec()`](`pointblank.Validate.col_vals_within_spec`) - - [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`) - - [`conjointly()`](`pointblank.Validate.conjointly`) - - [`prompt()`](`pointblank.Validate.prompt`) - - An extracted row for these validation methods means that a test unit failed for that row in - the validation step. - - These row-based validation methods will also have rows extracted should there be failing - rows: - - - [`rows_distinct()`](`pointblank.Validate.rows_distinct`) - - [`rows_complete()`](`pointblank.Validate.rows_complete`) - - The extracted rows are a subset of the original table and are useful for further analysis - or for understanding the nature of the failing test units. 
- - Examples - -------- - Let's perform a series of validation steps on a Polars DataFrame. We'll use the - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) in the first step, - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) in the second step, and - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`) in the third step. The - [`interrogate()`](`pointblank.Validate.interrogate`) method executes the validation; then, - we can extract the rows that failed for each validation step. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [5, 6, 5, 3, 6, 1], - "b": [1, 2, 1, 5, 2, 6], - "c": [3, 7, 2, 6, 3, 1], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=4) - .col_vals_lt(columns="c", value=5) - .col_vals_ge(columns="b", value=1) - .interrogate() - ) - - validation.get_data_extracts() - ``` - - The `get_data_extracts()` method returns a dictionary of tables, where each table contains - a subset of rows from the table. These are the rows that failed for each validation step. - - In the first step, the [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`) method was used - to check if the values in column `a` were greater than `4`. The extracted table shows the - rows where this condition was not met; look at the `a` column: all values are less than `4`. - - In the second step, the [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`) method was - used to check if the values in column `c` were less than `5`. In the extracted two-row - table, we see that the values in column `c` are greater than `5`. - - The third step ([`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)) checked if the values - in column `b` were greater than or equal to `1`. There were no failing test units, so the - extracted table is empty (i.e., has columns but no rows). - - The `i=` argument can be used to narrow down the extraction to one or more steps.
For - example, to extract the rows that failed in the first step only: - - ```python - validation.get_data_extracts(i=1) - ``` - - Note that the first validation step is indexed at `1` (not `0`). This 1-based indexing is - in place here to match the step numbers reported in the validation table. What we get back - is still a dictionary, but it only contains one table (the one for the first step). - - If you want to get the extracted table as a DataFrame, set `frame=True` and provide a scalar - value for `i`. For example, to get the extracted table for the second step as a DataFrame: - - ```python - pb.preview(validation.get_data_extracts(i=2, frame=True)) - ``` - - The extracted table is now a DataFrame, which can serve as a more convenient format for - further analysis or visualization. We further used the [`preview()`](`pointblank.preview`) - function to show the DataFrame in an HTML view. - - -all_passed(self) -> 'bool' - - Determine if every validation step passed perfectly, with no failing test units. - - The `all_passed()` method determines if every validation step passed perfectly, with no - failing test units. This method is useful for quickly checking if the table passed all - validation steps with flying colors. If there's even a single failing test unit in any - validation step, this method will return `False`. - - This validation metric might be overly stringent for some validation plans where failing - test units are generally expected (and the strategy is to monitor data quality over time). - However, the value of `all_passed()` could be suitable for validation plans designed to - ensure that every test unit passes perfectly (e.g., checks for column presence, - null-checking tests, etc.). - - Returns - ------- - bool - `True` if all validation steps had no failing test units, `False` otherwise. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). 
There will be three validation steps, and the second step will have a failing test - unit (the value `10` isn't less than `9`). After interrogation, the `all_passed()` method is - used to determine if all validation steps passed perfectly. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 9, 5], - "b": [5, 6, 10, 3], - "c": ["a", "b", "a", "a"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=0) - .col_vals_lt(columns="b", value=9) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.all_passed() - ``` - - The returned value is `False` since the second validation step had a failing test unit. If - it weren't for that one failing test unit, the return value would have been `True`. - - -assert_passing(self) -> 'None' - - Raise an `AssertionError` if all tests are not passing. - - The `assert_passing()` method will raise an `AssertionError` if a test does not pass. This - method simply wraps `all_passed` for more ready use in test suites. The step number and - assertion made is printed in the `AssertionError` message if a failure occurs, ensuring - some details are preserved. - - If the validation has not yet been interrogated, this method will automatically call - [`interrogate()`](`pointblank.Validate.interrogate`) with default parameters before checking - for passing tests. - - Raises - ------- - AssertionError - If any validation step has failing test units. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps, and the second step will have a failing test - unit (the value `10` isn't less than `9`). The `assert_passing()` method is used to assert - that all validation steps passed perfectly, automatically performing the interrogation if - needed. 
- - ```python - #| error: True - - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 9, 5], - "b": [5, 6, 10, 3], - "c": ["a", "b", "a", "a"], - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=0) - .col_vals_lt(columns="b", value=9) # this assertion is false - .col_vals_in_set(columns="c", set=["a", "b"]) - ) - - # No need to call [`interrogate()`](`pointblank.Validate.interrogate`) explicitly - validation.assert_passing() - ``` - - -assert_below_threshold(self, level: 'str' = 'warning', i: 'int | None' = None, message: 'str | None' = None) -> 'None' - - Raise an `AssertionError` if validation steps exceed a specified threshold level. - - The `assert_below_threshold()` method checks whether validation steps' failure rates are - below a given threshold level (`"warning"`, `"error"`, or `"critical"`). This is - particularly useful in automated testing environments where you want to ensure your data - quality meets minimum standards before proceeding. - - If any validation step exceeds the specified threshold level, an `AssertionError` will be - raised with details about which steps failed. If the validation has not yet been - interrogated, this method will automatically call - [`interrogate()`](`pointblank.Validate.interrogate`) with default parameters. - - Parameters - ---------- - level - The threshold level to check against, which could be any of `"warning"` (the default), - `"error"`, or `"critical"`. An `AssertionError` will be raised if any validation step - exceeds this level. - i - Specific validation step number(s) to check. Can be provided as a single integer or a - list of integers. If `None` (the default), all steps are checked. - message - Custom error message to use if assertion fails. If `None`, a default message will be - generated that lists the specific steps that exceeded the threshold. 
- - Returns - ------- - None - - Raises - ------ - AssertionError - If any specified validation step exceeds the given threshold level. - ValueError - If an invalid threshold level is provided. - - Examples - -------- - Below are some examples of how to use the `assert_below_threshold()` method. First, we'll - create a simple Polars DataFrame with two columns (`a` and `b`). - - ```python - import polars as pl - - tbl = pl.DataFrame({ - "a": [7, 4, 9, 7, 12], - "b": [9, 8, 10, 5, 10] - }) - ``` - - Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`, - `critical=0.3`). After interrogating, we display the validation report table: - - ```python - import pointblank as pb - - validation = ( - pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3)) - .col_vals_gt(columns="a", value=5) # 1 failing test unit - .col_vals_lt(columns="b", value=10) # 2 failing test units - .interrogate() - ) - - validation - ``` - - Using `assert_below_threshold(level="warning")` will raise an `AssertionError` if any step - exceeds the 'warning' threshold: - - Check a specific step against the 'critical' threshold using the `i=` parameter: - - ```python - validation.assert_below_threshold(level="critical", i=1) # Won't raise an error - ``` - - As the first step is below the 'critical' threshold (it exceeds the 'warning' and 'error' - thresholds), no error is raised and nothing is printed. - - We can also provide a custom error message with the `message=` parameter. Let's try that - here: - - ```python - try: - validation.assert_below_threshold( - level="error", - message="Data quality too low for processing!" 
- ) - except AssertionError as e: - print(f"Custom error: {e}") - ``` - - See Also - -------- - - [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation - step - - [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step - - [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each - validation step - - [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass - completely - - -above_threshold(self, level: 'str' = 'warning', i: 'int | None' = None) -> 'bool' - - Check if any validation steps exceed a specified threshold level. - - The `above_threshold()` method checks whether validation steps exceed a given threshold - level. This provides a non-exception-based alternative to - [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional - workflow control based on validation results. - - This method is useful in scenarios where you want to check if any validation steps failed - beyond a certain threshold without raising an exception, allowing for more flexible - programmatic responses to validation issues. - - Parameters - ---------- - level - The threshold level to check against. Valid options are: `"warning"` (the least severe - threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the - most severe threshold level). The default is `"warning"`. - i - Specific validation step number(s) to check. If a single integer, checks only that step. - If a list of integers, checks all specified steps. If `None` (the default), checks all - validation steps. Step numbers are 1-based (first step is `1`, not `0`). - - Returns - ------- - bool - `True` if any of the specified validation steps exceed the given threshold level, - `False` otherwise. - - Raises - ------ - ValueError - If an invalid threshold level is provided. 
- - Examples - -------- - Below are some examples of how to use the `above_threshold()` method. First, we'll create a - simple Polars DataFrame with a single column (`values`). - - Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`, - `critical=0.3`). After interrogating, we display the validation report table: - - ```python - import pointblank as pb - - validation = ( - pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3)) - .col_vals_gt(columns="values", value=0) - .col_vals_lt(columns="values", value=10) - .col_vals_between(columns="values", left=0, right=5) - .interrogate() - ) - - validation - ``` - - Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method. - A message will be printed if that's the case: - - ```python - if validation.above_threshold(level="warning"): - print("Some steps have exceeded the warning threshold") - ``` - - Check if only steps 2 and 3 exceed the 'error' threshold through use of the `i=` argument: - - ```python - if validation.above_threshold(level="error", i=[2, 3]): - print("Steps 2 and/or 3 have exceeded the error threshold") - ``` - - You can use this in a workflow to conditionally trigger processes. Here's a snippet of how - you might use this in a function: - - ```python - def process_data(validation_obj): - # Only continue processing if validation passes critical thresholds - if not validation_obj.above_threshold(level="critical"): - # Continue with processing - print("Data meets critical quality thresholds, proceeding...") - return True - else: - # Log failure and stop processing - print("Data fails critical quality checks, aborting...") - return False - ``` - - Note that this is just a suggestion for how to implement conditional workflow processes. 
You - should adapt this pattern to your specific requirements, which might include different - threshold levels, custom logging mechanisms, or integration with your organization's data - pipelines and notification systems. - - See Also - -------- - - [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar - method that raises an exception if thresholds are exceeded - - [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation - step - - [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step - - [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each - validation step - - -n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, int] | int' - - Provides a dictionary of the number of test units for each validation step. - - The `n()` method provides the number of test units for each validation step. This is the - total number of test units that were evaluated in the validation step. It is always an - integer value. - - Test units are the atomic units of the validation process. Different validations can have - different numbers of test units. For example, a validation that checks for the presence of - a column in a table will have a single test unit. A validation that checks for the presence - of a value in a column will have as many test units as there are rows in the table. - - The method provides a dictionary of the number of test units for each validation step. If - the `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a - scalar instead of a dictionary. The total number of test units for a validation step is the - sum of the number of passing and failing test units (i.e., `n = n_passed + n_failed`). - - Parameters - ---------- - i - The validation step number(s) from which the number of test units is obtained. - Can be provided as a list of integers or a single integer. 
If `None`, all steps are - included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, int] | int - A dictionary of the number of test units for each validation step or a scalar value. - - Examples - -------- - Different types of validation steps can have different numbers of test units. In the example - below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and `c`). There - will be three validation steps, and the number of test units for each step will be a little - bit different. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [1, 2, 9, 5], - "b": [5, 6, 10, 3], - "c": ["a", "b", "a", "a"], - } - ) - - # Define a preprocessing function - def filter_by_a_gt_1(df): - return df.filter(pl.col("a") > 1) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=0) - .col_exists(columns="b") - .col_vals_lt(columns="b", value=9, pre=filter_by_a_gt_1) - .interrogate() - ) - ``` - - The first validation step checks that all values in column `a` are greater than `0`. Let's - use the `n()` method to determine the number of test units for this validation step. - - ```python - validation.n(i=1, scalar=True) - ``` - - The returned value of `4` is the number of test units for the first validation step. This - value is the same as the number of rows in the table. - - The second validation step checks for the existence of column `b`. Using the `n()` method - we can get the number of test units for this second step. - - ```python - validation.n(i=2, scalar=True) - ``` - - There's a single test unit here because the validation step is checking for the presence of - a single column. - - The third validation step checks that all values in column `b` are less than `9` after - filtering the table to only include rows where the value in column `a` is greater than `1`.
- Because the table is filtered, the number of test units will be less than the total number - of rows in the input table. Let's prove this by using the `n()` method. - - ```python - validation.n(i=3, scalar=True) - ``` - - The returned value of `3` is the number of test units for the third validation step. When - using the `pre=` argument, the input table can be mutated before performing the validation. - The `n()` method is a good way to determine whether the mutation performed as expected. - - In all of these examples, the `scalar=True` argument was used to return the value as a - scalar integer value. If `scalar=False`, the method will return a dictionary with an entry - for the validation step number (from the `i=` argument) and the number of test units. - Furthermore, leaving out the `i=` argument altogether will return a dictionary filled - with the number of test units for each validation step. Here's what that looks like: - - ```python - validation.n() - ``` - - -n_passed(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, int] | int' - - Provides a dictionary of the number of test units that passed for each validation step. - - The `n_passed()` method provides the number of test units that passed for each validation - step. This is the number of test units that passed in the validation step. It is always - some integer value between `0` and the total number of test units. - - Test units are the atomic units of the validation process. Different validations can have - different numbers of test units. For example, a validation that checks for the presence of - a column in a table will have a single test unit. A validation that checks for the presence - of a value in a column will have as many test units as there are rows in the table. - - The method provides a dictionary of the number of passing test units for each validation - step.
If the `scalar=True` argument is provided and `i=` is a scalar, the value is returned - as a scalar instead of a dictionary. Furthermore, a value obtained here will be the - complement to the analogous value returned by the - [`n_failed()`](`pointblank.Validate.n_failed`) method (i.e., `n - n_failed`). - - Parameters - ---------- - i - The validation step number(s) from which the number of passing test units is obtained. - Can be provided as a list of integers or a single integer. If `None`, all steps are - included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, int] | int - A dictionary of the number of passing test units for each validation step or a scalar - value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps and, as it turns out, all of them will have - failing test units. After interrogation, the `n_passed()` method is used to determine the - number of passing test units for each validation step. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 4, 9, 7, 12], - "b": [9, 8, 10, 5, 10], - "c": ["a", "b", "c", "a", "b"] - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=5) - .col_vals_gt(columns="b", value=pb.col("a")) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.n_passed() - ``` - - The returned dictionary shows that all validation steps had some failing test units (each - value was less than `5`, which is the total number of test units for each step). - - If we wanted to check the number of passing test units for a single validation step, we can - provide the step number. Also, we could forego the dictionary and get a scalar value by - setting `scalar=True` (ensuring that `i=` is a scalar).
- - ```python - validation.n_passed(i=1) - ``` - - The returned value of `4` is the number of passing test units for the first validation step. - - -n_failed(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, int] | int' - - Provides a dictionary of the number of test units that failed for each validation step. - - The `n_failed()` method provides the number of test units that failed for each validation - step. This is the number of test units that did not pass in the validation step. It is - always some integer value between `0` and the total number of test units. - - Test units are the atomic units of the validation process. Different validations can have - different numbers of test units. For example, a validation that checks for the presence of - a column in a table will have a single test unit. A validation that checks for the presence - of a value in a column will have as many test units as there are rows in the table. - - The method provides a dictionary of the number of failing test units for each validation - step. If the `scalar=True` argument is provided and `i=` is a scalar, the value is returned - as a scalar instead of a dictionary. Furthermore, a value obtained here will be the - complement to the analogous value returned by the - [`n_passed()`](`pointblank.Validate.n_passed`) method (i.e., `n - n_passed`). - - Parameters - ---------- - i - The validation step number(s) from which the number of failing test units is obtained. - Can be provided as a list of integers or a single integer. If `None`, all steps are - included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, int] | int - A dictionary of the number of failing test units for each validation step or a scalar - value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`).
There will be three validation steps and, as it turns out, all of them will have - failing test units. After interrogation, the `n_failed()` method is used to determine the - number of failing test units for each validation step. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 4, 9, 7, 12], - "b": [9, 8, 10, 5, 10], - "c": ["a", "b", "c", "a", "b"] - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=5) - .col_vals_gt(columns="b", value=pb.col("a")) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.n_failed() - ``` - - The returned dictionary shows that all validation steps had failing test units. - - If we wanted to check the number of failing test units for a single validation step, we can - provide the step number. Also, we could forego the dictionary and get a scalar value by - setting `scalar=True` (ensuring that `i=` is a scalar). - - ```python - validation.n_failed(i=1) - ``` - - The returned value of `1` is the number of failing test units for the first validation step. - - -f_passed(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, float] | float' - - Provides a dictionary of the fraction of test units that passed for each validation step. - - A measure of the fraction of test units that passed is provided by the `f_passed` attribute. - This is the fraction of test units that passed the validation step over the total number of - test units. Given this is a fractional value, it will always be in the range of `0` to `1`. - - Test units are the atomic units of the validation process. Different validations can have - different numbers of test units. For example, a validation that checks for the presence of - a column in a table will have a single test unit. A validation that checks for the presence - of a value in a column will have as many test units as there are rows in the table. 
- - This method provides a dictionary of the fraction of passing test units for each validation - step. If the `scalar=True` argument is provided and `i=` is a scalar, the value is returned - as a scalar instead of a dictionary. Furthermore, a value obtained here will be the - complement to the analogous value returned by the - [`f_failed()`](`pointblank.Validate.f_failed`) method (i.e., `1 - f_failed()`). - - Parameters - ---------- - i - The validation step number(s) from which the fraction of passing test units is obtained. - Can be provided as a list of integers or a single integer. If `None`, all steps are - included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, float] | float - A dictionary of the fraction of passing test units for each validation step or a scalar - value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps, all having some failing test units. After - interrogation, the `f_passed()` method is used to determine the fraction of passing test - units for each validation step. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 4, 9, 7, 12, 3, 10], - "b": [9, 8, 10, 5, 10, 6, 2], - "c": ["a", "b", "c", "a", "b", "d", "c"] - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=5) - .col_vals_gt(columns="b", value=pb.col("a")) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.f_passed() - ``` - - The returned dictionary shows the fraction of passing test units for each validation step. - The values are all less than `1` since there were failing test units in each step. - - If we wanted to check the fraction of passing test units for a single validation step, we - can provide the step number. 
Also, we could have the value returned as a scalar by setting - `scalar=True` (ensuring that `i=` is a scalar). - - ```python - validation.f_passed(i=1) - ``` - - The returned value is the proportion of passing test units for the first validation step - (5 passing test units out of 7 total test units). - - -f_failed(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, float] | float' - - Provides a dictionary of the fraction of test units that failed for each validation step. - - A measure of the fraction of test units that failed is provided by the `f_failed` attribute. - This is the fraction of test units that failed the validation step over the total number of - test units. Given this is a fractional value, it will always be in the range of `0` to `1`. - - Test units are the atomic units of the validation process. Different validations can have - different numbers of test units. For example, a validation that checks for the presence of - a column in a table will have a single test unit. A validation that checks for the presence - of a value in a column will have as many test units as there are rows in the table. - - This method provides a dictionary of the fraction of failing test units for each validation - step. If the `scalar=True` argument is provided and `i=` is a scalar, the value is returned - as a scalar instead of a dictionary. Furthermore, a value obtained here will be the - complement to the analogous value returned by the - [`f_passed()`](`pointblank.Validate.f_passed`) method (i.e., `1 - f_passed()`). - - Parameters - ---------- - i - The validation step number(s) from which the fraction of failing test units is obtained. - Can be provided as a list of integers or a single integer. If `None`, all steps are - included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. 
- - Returns - ------- - dict[int, float] | float - A dictionary of the fraction of failing test units for each validation step or a scalar - value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps, all having some failing test units. After - interrogation, the `f_failed()` method is used to determine the fraction of failing test - units for each validation step. - - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 4, 9, 7, 12, 3, 10], - "b": [9, 8, 10, 5, 10, 6, 2], - "c": ["a", "b", "c", "a", "b", "d", "c"] - } - ) - - validation = ( - pb.Validate(data=tbl) - .col_vals_gt(columns="a", value=5) - .col_vals_gt(columns="b", value=pb.col("a")) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.f_failed() - ``` - - The returned dictionary shows the fraction of failing test units for each validation step. - The values are all greater than `0` since there were failing test units in each step. - - If we wanted to check the fraction of failing test units for a single validation step, we - can provide the step number. Also, we could have the value returned as a scalar by setting - `scalar=True` (ensuring that `i=` is a scalar). - - ```python - validation.f_failed(i=1) - ``` - - The returned value is the proportion of failing test units for the first validation step - (2 failing test units out of 7 total test units). - - -warning(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, bool] | bool' - - Get the 'warning' level status for each validation step. - - The 'warning' status for a validation step is `True` if the fraction of failing test units - meets or exceeds the threshold for the 'warning' level. Otherwise, the status is `False`. 
- - The ascribed name of 'warning' is semantic and does not imply that a warning message is - generated, it is simply a status indicator that could be used to trigger some action to be - taken. Here's how it fits in with other status indicators: - - - 'warning': the status obtained by calling 'warning()', least severe - - 'error': the status obtained by calling [`error()`](`pointblank.Validate.error`), middle - severity - - 'critical': the status obtained by calling [`critical()`](`pointblank.Validate.critical`), - most severe - - This method provides a dictionary of the 'warning' status for each validation step. If the - `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar - instead of a dictionary. - - Parameters - ---------- - i - The validation step number(s) from which the 'warning' status is obtained. Can be - provided as a list of integers or a single integer. If `None`, all steps are included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, bool] | bool - A dictionary of the 'warning' status for each validation step or a scalar value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps, and the first step will have some failing test - units, the rest will be completely passing. We've set thresholds here for each of the steps - by using `thresholds=(2, 4, 5)`, which means: - - - the 'warning' threshold is `2` failing test units - - the 'error' threshold is `4` failing test units - - the 'critical' threshold is `5` failing test units - - After interrogation, the `warning()` method is used to determine the 'warning' status for - each validation step. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [7, 4, 9, 7, 12, 3, 10], - "b": [9, 8, 10, 5, 10, 6, 2], - "c": ["a", "b", "a", "a", "b", "b", "a"] - } - ) - - validation = ( - pb.Validate(data=tbl, thresholds=(2, 4, 5)) - .col_vals_gt(columns="a", value=5) - .col_vals_lt(columns="b", value=15) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.warning() - ``` - - The returned dictionary provides the 'warning' status for each validation step. The first - step has a `True` value since the number of failing test units meets the threshold for the - 'warning' level. The second and third steps have `False` values since the number of failing - test units was `0`, which is below the threshold for the 'warning' level. - - We can also visually inspect the 'warning' status across all steps by viewing the validation - table: - - ```python - validation - ``` - - We can see that there's a filled gray circle in the first step (look to the far right side, - in the `W` column) indicating that the 'warning' threshold was met. The other steps have - empty gray circles. This means that thresholds were 'set but not met' in those steps. - - If we wanted to check the 'warning' status for a single validation step, we can provide the - step number. Also, we could have the value returned as a scalar by setting `scalar=True` - (ensuring that `i=` is a scalar). - - ```python - validation.warning(i=1) - ``` - - The returned value is `True`, indicating that the first validation step had met the - 'warning' threshold. - - -error(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, bool] | bool' - - Get the 'error' level status for each validation step. - - The 'error' status for a validation step is `True` if the fraction of failing test units - meets or exceeds the threshold for the 'error' level. Otherwise, the status is `False`. 
- - The ascribed name of 'error' is semantic and does not imply that the validation process - is halted, it is simply a status indicator that could be used to trigger some action to be - taken. Here's how it fits in with other status indicators: - - - 'warning': the status obtained by calling [`warning()`](`pointblank.Validate.warning`), - least severe - - 'error': the status obtained by calling `error()`, middle severity - - 'critical': the status obtained by calling [`critical()`](`pointblank.Validate.critical`), - most severe - - This method provides a dictionary of the 'error' status for each validation step. If the - `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar - instead of a dictionary. - - Parameters - ---------- - i - The validation step number(s) from which the 'error' status is obtained. Can be - provided as a list of integers or a single integer. If `None`, all steps are included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, bool] | bool - A dictionary of the 'error' status for each validation step or a scalar value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps, and the first step will have some failing test - units, the rest will be completely passing. We've set thresholds here for each of the steps - by using `thresholds=(2, 4, 5)`, which means: - - - the 'warning' threshold is `2` failing test units - - the 'error' threshold is `4` failing test units - - the 'critical' threshold is `5` failing test units - - After interrogation, the `error()` method is used to determine the 'error' status for each - validation step. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [3, 4, 9, 7, 2, 3, 8], - "b": [9, 8, 10, 5, 10, 6, 2], - "c": ["a", "b", "a", "a", "b", "b", "a"] - } - ) - - validation = ( - pb.Validate(data=tbl, thresholds=(2, 4, 5)) - .col_vals_gt(columns="a", value=5) - .col_vals_lt(columns="b", value=15) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.error() - ``` - - The returned dictionary provides the 'error' status for each validation step. The first step - has a `True` value since the number of failing test units meets the threshold for the - 'error' level. The second and third steps have `False` values since the number of failing - test units was `0`, which is below the threshold for the 'error' level. - - We can also visually inspect the 'error' status across all steps by viewing the validation - table: - - ```python - validation - ``` - - We can see that there are filled gray and yellow circles in the first step (far right side, - in the `W` and `E` columns) indicating that the 'warning' and 'error' thresholds were met. - The other steps have empty gray and yellow circles. This means that thresholds were 'set but - not met' in those steps. - - If we wanted to check the 'error' status for a single validation step, we can provide the - step number. Also, we could have the value returned as a scalar by setting `scalar=True` - (ensuring that `i=` is a scalar). - - ```python - validation.error(i=1) - ``` - - The returned value is `True`, indicating that the first validation step had the 'error' - threshold met. - - -critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, bool] | bool' - - Get the 'critical' level status for each validation step. - - The 'critical' status for a validation step is `True` if the fraction of failing test units - meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`. 
- - The ascribed name of 'critical' is semantic and is thus simply a status indicator that could - be used to trigger some action to be taken. Here's how it fits in with other status - indicators: - - - 'warning': the status obtained by calling [`warning()`](`pointblank.Validate.warning`), - least severe - - 'error': the status obtained by calling [`error()`](`pointblank.Validate.error`), middle - severity - - 'critical': the status obtained by calling `critical()`, most severe - - This method provides a dictionary of the 'critical' status for each validation step. If the - `scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar - instead of a dictionary. - - Parameters - ---------- - i - The validation step number(s) from which the 'critical' status is obtained. Can be - provided as a list of integers or a single integer. If `None`, all steps are included. - scalar - If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary. - - Returns - ------- - dict[int, bool] | bool - A dictionary of the 'critical' status for each validation step or a scalar value. - - Examples - -------- - In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and - `c`). There will be three validation steps, and the first step will have many failing test - units, the rest will be completely passing. We've set thresholds here for each of the steps - by using `thresholds=(2, 4, 5)`, which means: - - - the 'warning' threshold is `2` failing test units - - the 'error' threshold is `4` failing test units - - the 'critical' threshold is `5` failing test units - - After interrogation, the `critical()` method is used to determine the 'critical' status for - each validation step. 
- - ```python - import pointblank as pb - import polars as pl - - tbl = pl.DataFrame( - { - "a": [2, 4, 4, 7, 2, 3, 8], - "b": [9, 8, 10, 5, 10, 6, 2], - "c": ["a", "b", "a", "a", "b", "b", "a"] - } - ) - - validation = ( - pb.Validate(data=tbl, thresholds=(2, 4, 5)) - .col_vals_gt(columns="a", value=5) - .col_vals_lt(columns="b", value=15) - .col_vals_in_set(columns="c", set=["a", "b"]) - .interrogate() - ) - - validation.critical() - ``` - - The returned dictionary provides the 'critical' status for each validation step. The first - step has a `True` value since the number of failing test units meets the threshold for the - 'critical' level. The second and third steps have `False` values since the number of failing - test units was `0`, which is below the threshold for the 'critical' level. - - We can also visually inspect the 'critical' status across all steps by viewing the - validation table: - - ```python - validation - ``` - - We can see that there are filled gray, yellow, and red circles in the first step (far right - side, in the `W`, `E`, and `C` columns) indicating that the 'warning', 'error', and - 'critical' thresholds were met. The other steps have empty gray, yellow, and red circles. - This means that thresholds were 'set but not met' in those steps. - - If we wanted to check the 'critical' status for a single validation step, we can provide the - step number. Also, we could have the value returned as a scalar by setting `scalar=True` - (ensuring that `i=` is a scalar). - - ```python - validation.critical(i=1) - ``` - - The returned value is `True`, indicating that the first validation step had the 'critical' - threshold met. - - - -## The Inspection and Assistance family - -The *Inspection and Assistance* group contains functions that are helpful for -getting to grips on a new data table. 
Use the `DataScan` class to get a quick overview of the data, -`preview()` to see the first and last few rows of a table, `col_summary_tbl()` for a column-level -summary of a table, `missing_vals_tbl()` to see where there are missing values in a table, and -`get_column_count()`/`get_row_count()` to get the number of columns and rows in a table. Several -datasets included in the package can be accessed via the `load_dataset()` function. Finally, the -`config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use -the `assistant()` function to get help with Pointblank. - -DataScan(data: 'Any', tbl_name: 'str | None' = None) -> 'None' - - Get a summary of a dataset. - - The `DataScan` class provides a way to get a summary of a dataset. The summary includes the - following information: - - - the name of the table (if provided) - - the type of the table (e.g., `"polars"`, `"pandas"`, etc.) - - the number of rows and columns in the table - - column-level information, including: - - the column name - - the column type - - measures of missingness and distinctness - - measures of negative, zero, and positive values (for numerical columns) - - a sample of the data (the first 5 values) - - statistics (if the column contains numbers, strings, or datetimes) - - To obtain a dictionary representation of the summary, you can use the `to_dict()` method. To - get a JSON representation of the summary, you can use the `to_json()` method. To save the JSON - text to a file, the `save_to_json()` method could be used. - - :::{.callout-warning} - The `DataScan()` class is still experimental. Please report any issues you encounter in the - [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - data - The data to scan and summarize. 
This could be a DataFrame object, an Ibis table object, - a CSV file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file, - or a database connection string. - tbl_name - Optionally, the name of the table could be provided as `tbl_name`. - - Measures of Missingness and Distinctness - ---------------------------------------- - For each column, the following measures are provided: - - - `n_missing_values`: the number of missing values in the column - - `f_missing_values`: the fraction of missing values in the column - - `n_unique_values`: the number of unique values in the column - - `f_unique_values`: the fraction of unique values in the column - - The fractions are calculated as the ratio of the measure to the total number of rows in the - dataset. - - Counts and Fractions of Negative, Zero, and Positive Values - ----------------------------------------------------------- - For numerical columns, the following measures are provided: - - - `n_negative_values`: the number of negative values in the column - - `f_negative_values`: the fraction of negative values in the column - - `n_zero_values`: the number of zero values in the column - - `f_zero_values`: the fraction of zero values in the column - - `n_positive_values`: the number of positive values in the column - - `f_positive_values`: the fraction of positive values in the column - - The fractions are calculated as the ratio of the measure to the total number of rows in the - dataset. - - Statistics for Numerical and String Columns - ------------------------------------------- - For numerical and string columns, several statistical measures are provided. Please note that - for string columns, the statistics are based on the lengths of the strings in the column. 
- - The following descriptive statistics are provided: - - - `mean`: the mean of the column - - `std_dev`: the standard deviation of the column - - Additionally, the following quantiles are provided: - - - `min`: the minimum value in the column - - `p05`: the 5th percentile of the column - - `q_1`: the first quartile of the column - - `med`: the median of the column - - `q_3`: the third quartile of the column - - `p95`: the 95th percentile of the column - - `max`: the maximum value in the column - - `iqr`: the interquartile range of the column - - Statistics for Date and Datetime Columns - ---------------------------------------- - For date/datetime columns, the following statistics are provided: - - - `min`: the minimum date/datetime in the column - - `max`: the maximum date/datetime in the column - - Returns - ------- - DataScan - A DataScan object. - - -preview(data: 'Any', columns_subset: 'str | list[str] | Column | None' = None, n_head: 'int' = 5, n_tail: 'int' = 5, limit: 'int' = 50, show_row_numbers: 'bool' = True, max_col_width: 'int' = 250, min_tbl_width: 'int' = 500, incl_header: 'bool | None' = None) -> 'GT' - - Display a table preview that shows some rows from the top, some from the bottom. - - To get a quick look at the data in a table, we can use the `preview()` function to display a - preview of the table. The function shows a subset of the rows from the start and end of the - table, with the number of rows from the start and end determined by the `n_head=` and `n_tail=` - parameters (set to `5` by default). This function works with any table that is supported by the - `pointblank` library, including Pandas, Polars, and Ibis backend tables (e.g., DuckDB, MySQL, - PostgreSQL, SQLite, Parquet, etc.). - - The view is optimized for readability, with column names and data types displayed in a compact - format. The column widths are sized to fit the column names, dtypes, and column content up to - a configurable maximum width of `max_col_width=` pixels. 
The table can be scrolled horizontally - to view even very large datasets. Since the output is a Great Tables (`GT`) object, it can be - further customized using the `great_tables` API. - - Parameters - ---------- - data - The table to preview, which could be a DataFrame object, an Ibis table object, a CSV - file path, a Parquet file path, or a database connection string. When providing a CSV or - Parquet file path (as a string or `pathlib.Path` object), the file will be automatically - loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports - glob patterns, directories containing .parquet files, and Spark-style partitioned datasets. - Connection strings enable direct database access via Ibis with optional table specification - using the `::table_name` suffix. Read the *Supported Input Table Types* section for details - on the supported table types. - columns_subset - The columns to display in the table, by default `None` (all columns are shown). This can - be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter - two options allow for more flexible column selection using column selector functions. Errors - are raised if the column names provided don't match any columns in the table (when provided - as a string or list of strings) or if column selector expressions don't resolve to any - columns. - n_head - The number of rows to show from the start of the table. Set to `5` by default. - n_tail - The number of rows to show from the end of the table. Set to `5` by default. - limit - The limit value for the sum of `n_head=` and `n_tail=` (the total number of rows shown). - If the sum of `n_head=` and `n_tail=` exceeds the limit, an error is raised. The default - value is `50`. - show_row_numbers - Should row numbers be shown? The numbers shown reflect the row numbers of the head and tail - in the input `data=` table. By default, this is set to `True`. 
- max_col_width - The maximum width of the columns (in pixels) before the text is truncated. The default value - is `250` (`"250px"`). - min_tbl_width - The minimum width of the table in pixels. If the sum of the column widths is less than this - value, then all columns are sized up to reach this minimum width value. The default value is - `500` (`"500px"`). - incl_header - Should the table include a header with the table type and table dimensions? Set to `True` by - default. - - Returns - ------- - GT - A GT object that displays the preview of the table. - - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - Database connection strings (URI format with optional table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables - requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or - Pandas DataFrame, the availability of Ibis is not needed. - - To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is - provided. The file will be automatically detected and loaded using the best available DataFrame - library. 
The loading preference is Polars first, then Pandas as a fallback. - - Connection strings follow database URL formats and must also specify a table using the - `::table_name` suffix. Examples include: - - ``` - "duckdb:///path/to/database.ddb::table_name" - "sqlite:///path/to/database.db::table_name" - "postgresql://user:password@localhost:5432/database::table_name" - "mysql://user:password@localhost:3306/database::table_name" - "bigquery://project/dataset::table_name" - "snowflake://user:password@account/database/schema::table_name" - ``` - - When using connection strings, the Ibis library with the appropriate backend driver is required. - - Examples - -------- - It's easy to preview a table using the `preview()` function. Here's an example using the - `small_table` dataset (itself loaded using the [`load_dataset()`](`pointblank.load_dataset`) - function): - - This table is a Polars DataFrame, but the `preview()` function works with any table supported - by `pointblank`, including Pandas DataFrames and Ibis backend tables. Here's an example using - a DuckDB table handled by Ibis: - - ```python - small_table_duckdb = pb.load_dataset("small_table", tbl_type="duckdb") - - pb.preview(small_table_duckdb) - ``` - - The blue dividing line marks the end of the first `n_head=` rows and the start of the last - `n_tail=` rows. - - We can adjust the number of rows shown from the start and end of the table by setting the - `n_head=` and `n_tail=` parameters. Let's enlarge each of these to `10`: - - ```python - pb.preview(small_table_polars, n_head=10, n_tail=10) - ``` - - In the above case, the entire dataset is shown since the sum of `n_head=` and `n_tail=` is - greater than the number of rows in the table (which is 13). - - The `columns_subset=` parameter can be used to show only specific columns in the table. You can - provide a list of column names to make the selection. 
Let's try that with the `"game_revenue"` - dataset as a Pandas DataFrame: - - ```python - game_revenue_pandas = pb.load_dataset("game_revenue", tbl_type="pandas") - - pb.preview(game_revenue_pandas, columns_subset=["player_id", "item_name", "item_revenue"]) - ``` - - Alternatively, we can use column selector functions like - [`starts_with()`](`pointblank.starts_with`) and [`matches()`](`pointblank.matches`) to select - columns based on text or patterns: - - ```python - pb.preview(game_revenue_pandas, n_head=2, n_tail=2, columns_subset=pb.starts_with("session")) - ``` - - Multiple column selector functions can be combined within [`col()`](`pointblank.col`) using - operators like `|` and `&`: - - ```python - pb.preview( - game_revenue_pandas, - n_head=2, - n_tail=2, - columns_subset=pb.col(pb.starts_with("item") | pb.matches("player")) - ) - ``` - - ### Working with CSV Files - - The `preview()` function can directly accept CSV file paths, making it easy to preview data - stored in CSV files without manual loading: - - You can also use a Path object to specify the CSV file: - - ### Working with Parquet Files - - The `preview()` function can directly accept Parquet files and datasets in various formats: - - You can also use glob patterns and directories: - - ```python - # Multiple Parquet files with glob patterns - pb.preview("data/sales_*.parquet") - - # Directory containing Parquet files - pb.preview("parquet_data/") - - # Partitioned Parquet dataset - pb.preview("sales_data/") # Auto-discovers partition columns - ``` - - ### Working with Database Connection Strings - - The `preview()` function supports database connection strings for direct preview of database - tables. Connection strings must specify a table using the `::table_name` suffix: - - For comprehensive documentation on supported connection string formats, error handling, and - installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`) - function. 
- - -col_summary_tbl(data: 'Any', tbl_name: 'str | None' = None) -> 'GT' - - Generate a column-level summary table of a dataset. - - The `col_summary_tbl()` function generates a summary table of a dataset, focusing on providing - column-level information about the dataset. The summary includes the following information: - - - the type of the table (e.g., `"polars"`, `"pandas"`, etc.) - - the number of rows and columns in the table - - column-level information, including: - - the column name - - the column type - - measures of missingness and distinctness - - descriptive stats and quantiles - - statistics for datetime columns - - The summary table is returned as a GT object, which can be displayed in a notebook or saved to - an HTML file. - - :::{.callout-warning} - The `col_summary_tbl()` function is still experimental. Please report any issues you encounter - in the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - data - The table to summarize, which could be a DataFrame object, an Ibis table object, a CSV - file path, a Parquet file path, or a database connection string. Read the *Supported Input - Table Types* section for details on the supported table types. - tbl_name - Optionally, the name of the table could be provided as `tbl_name=`. - - Returns - ------- - GT - A GT object that displays the column-level summaries of the table. 
- - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - GitHub URLs (direct links to CSV or Parquet files on GitHub) - - Database connection strings (URI format with optional table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `col_summary_tbl()` with these types of - tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a - Polars or Pandas DataFrame, the availability of Ibis is not needed. - - Examples - -------- - It's easy to get a column-level summary of a table using the `col_summary_tbl()` function. - Here's an example using the `small_table` dataset (itself loaded using the - [`load_dataset()`](`pointblank.load_dataset`) function): - - This table used above was a Polars DataFrame, but the `col_summary_tbl()` function works with - any table supported by `pointblank`, including Pandas DataFrames and Ibis backend tables. - Here's an example using a DuckDB table handled by Ibis: - - ```python - nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") - - pb.col_summary_tbl(data=nycflights, tbl_name="nycflights") - ``` - - -missing_vals_tbl(data: 'Any') -> 'GT' - - Display a table that shows the missing values in the input table. - - The `missing_vals_tbl()` function generates a table that shows the missing values in the input - table. 
The table is displayed using the Great Tables API, which allows for further customization - of the table's appearance if so desired. - - Parameters - ---------- - data - The table for which to display the missing values. This could be a DataFrame object, an - Ibis table object, a CSV file path, a Parquet file path, or a database connection string. - Read the *Supported Input Table Types* section for details on the supported table types. - - Returns - ------- - GT - A GT object that displays the table of missing values in the input table. - - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - Database connection strings (URI format with optional table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `missing_vals_tbl()` with these types of - tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a - Polars or Pandas DataFrame, the availability of Ibis is not needed. - - The Missing Values Table - ------------------------ - The missing values table shows the proportion of missing values in each column of the input - table. The table is divided into sectors, with each sector representing a range of rows in the - table. 
The proportion of missing values in each sector is calculated for each column. The table - is displayed using the Great Tables API, which allows for further customization of the table's - appearance. - - To ensure that the table can scale to tables with many columns, each row in the reporting table - represents a column in the input table. There are 10 sectors shown in the table, where the first - sector represents the first 10% of the rows, the second sector represents the next 10% of the - rows, and so on. Any sectors that are light blue indicate that there are no missing values in - that sector. If there are missing values, the proportion of missing values is shown by a gray - color (light gray for low proportions, dark gray to black for very high proportions). - - Examples - -------- - The `missing_vals_tbl()` function is useful for quickly identifying columns with missing values - in a table. Here's an example using the `nycflights` dataset (loaded as a Polars DataFrame using - the [`load_dataset()`](`pointblank.load_dataset`) function): - - The table shows the proportion of missing values in each column of the `nycflights` dataset. The - table is divided into sectors, with each sector representing a range of rows in the table (with - around 34,000 rows per sector). The proportion of missing values in each sector is calculated - for each column. The various shades of gray indicate the proportion of missing values in each - sector. Many columns have no missing values at all, and those sectors are colored light blue. - - -assistant(model: 'str', data: 'Any' = None, tbl_name: 'str | None' = None, api_key: 'str | None' = None, display: 'str | None' = None) -> 'None' - - Chat with the PbA (Pointblank Assistant) about your data validation needs. - - The `assistant()` function provides an interactive chat session with the PbA (Pointblank - Assistant) to help you with your data validation needs. 
The PbA can help you with constructing - validation plans, suggesting validation methods, and providing code snippets for using the - Pointblank Python package. Feel free to ask the PbA about any aspect of the Pointblank package - and it will do its best to assist you. - - The PbA can also help you with constructing validation plans for your data tables. If you - provide a data table to the PbA, it will internally generate a JSON summary of the table and - use that information to suggest validation methods that can be used with the Pointblank package. - If using a Polars table as the data source, the PbA will be knowledgeable about the Polars API - and can smartly suggest validation steps that use aggregate measures with up-to-date Polars - methods. - - The PbA can be used with models from the following providers: - - - Anthropic - - OpenAI - - Ollama - - Amazon Bedrock - - The PbA can be displayed in a browser (the default) or in the terminal. You can choose one or - the other by setting the `display=` parameter to `"browser"` or `"terminal"`. - - :::{.callout-warning} - The `assistant()` function is still experimental. Please report any issues you encounter in - the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - model - The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, - `"ollama"`, and `"bedrock"`. - data - An optional data table to focus on during discussion with the PbA, which could be a - DataFrame object, an Ibis table object, a CSV file path, a Parquet file path, or a database - connection string. Read the *Supported Input Table Types* section for details on the - supported table types. - tbl_name : str, optional - The name of the data table. This is optional and is only used to provide a more detailed - prompt to the PbA. 
- api_key : str, optional - The API key to be used for the model. - display : str, optional - The display mode to use for the chat session. Supported values are `"browser"` and - `"terminal"`. If not provided, the default value is `"browser"`. - - Returns - ------- - None - Nothing is returned. Rather, you get an interactive chat session with the PbA, which is - displayed in a browser or in the terminal. - - Constructing the `model` Argument - --------------------------------- - The `model=` argument should be constructed using the provider and model name separated by a - colon (`provider:model`). The provider text can be any of: - - - `"anthropic"` (Anthropic) - - `"openai"` (OpenAI) - - `"ollama"` (Ollama) - - `"bedrock"` (Amazon Bedrock) - - The model name should be the specific model to be used from the provider. Model names are - subject to change so consult the provider's documentation for the most up-to-date model names. - - Notes on Authentication - ----------------------- - Providing a valid API key as a string in the `api_key` argument is adequate for getting started - but you should consider using a more secure method for handling API keys. - - One way to do this is to load the API key from an environment variable and retrieve it using the - `os` module (specifically the `os.getenv()` function). Places to store the API key might - include `.bashrc`, `.bash_profile`, `.zshrc`, or `.zsh_profile`. - - Another solution is to store one or more model provider API keys in an `.env` file (in the root - of your project). If the API keys have correct names (e.g., `ANTHROPIC_API_KEY` or - `OPENAI_API_KEY`) then DraftValidation will automatically load the API key from the `.env` file - and there's no need to provide the `api_key` argument. 
An `.env` file might look like this: - - ```plaintext - ANTHROPIC_API_KEY="your_anthropic_api_key_here" - OPENAI_API_KEY="your_openai_api_key_here" - ``` - - There's no need to have the `python-dotenv` package installed when using `.env` files in this - way. - - Notes on Data Sent to the Model Provider - ---------------------------------------- - If `data=` is provided then what is sent to the model provider is a JSON summary of the - table. This data summary is generated internally by use of the `DataScan` class. The summary - includes the following information: - - - the number of rows and columns in the table - - the type of dataset (e.g., Polars, DuckDB, Pandas, etc.) - - the column names and their types - - column level statistics such as the number of missing values, min, max, mean, and median, etc. - - a short list of data values in each column - - The JSON summary is used to provide the model with the necessary information to be knowledgeable - about the data table. Compared to the size of the entire table, the JSON summary is quite small - and can be safely sent to the model provider. - - The Amazon Bedrock provider is a special case since it is a self-hosted model and security - controls are in place to ensure that data is kept within the user's AWS environment. If using an - Ollama model all data is handled locally. 
- - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - Database connection strings (URI format with optional table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `assistant()` with these types of tables - requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or - Pandas DataFrame, the availability of Ibis is not needed. - - To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is - provided. The file will be automatically detected and loaded using the best available DataFrame - library. The loading preference is Polars first, then Pandas as a fallback. - - -load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', tbl_type: "Literal['polars', 'pandas', 'duckdb']" = 'polars') -> 'Any' - - Load a dataset hosted in the library as specified table type. - - The Pointblank library includes several datasets that can be loaded using the `load_dataset()` - function. The datasets can be loaded as a Polars DataFrame, a Pandas DataFrame, or as a DuckDB - table (which uses the Ibis library backend). 
These datasets are used throughout the - documentation's examples to demonstrate the functionality of the library. They're also useful - for experimenting with the library and trying out different validation scenarios. - - Parameters - ---------- - dataset - The name of the dataset to load. Current options are `"small_table"`, `"game_revenue"`, - `"nycflights"`, and `"global_sales"`. - tbl_type - The type of table to generate from the dataset. The named options are `"polars"`, - `"pandas"`, and `"duckdb"`. - - Returns - ------- - Any - The dataset for the `Validate` object. This could be a Polars DataFrame, a Pandas DataFrame, - or a DuckDB table as an Ibis table. - - Included Datasets - ----------------- - There are four included datasets that can be loaded using the `load_dataset()` function: - - - `"small_table"`: A small dataset with 13 rows and 8 columns. This dataset is useful for - testing and demonstration purposes. - - `"game_revenue"`: A dataset with 2000 rows and 11 columns. Provides revenue data for a game - development company. For the particular game, there are records of player sessions, the items - they purchased, ads viewed, and the revenue generated. - - `"nycflights"`: A dataset with 336,776 rows and 18 columns. This dataset provides information - about flights departing from New York City airports (JFK, LGA, or EWR) in 2013. - - `"global_sales"`: A dataset with 50,000 rows and 20 columns. Provides information about - global sales of products across different regions and countries. - - Supported DataFrame Types - ------------------------- - The `tbl_type=` parameter can be set to one of the following: - - - `"polars"`: A Polars DataFrame. - - `"pandas"`: A Pandas DataFrame. - - `"duckdb"`: An Ibis table for a DuckDB database. 
- - Examples - -------- - Load the `"small_table"` dataset as a Polars DataFrame by calling `load_dataset()` with - `dataset="small_table"` and `tbl_type="polars"`: - - Note that the `"small_table"` dataset is a Polars DataFrame and using the - [`preview()`](`pointblank.preview`) function will display the table in an HTML viewing - environment. - - The `"game_revenue"` dataset can be loaded as a Pandas DataFrame by specifying the dataset name - and setting `tbl_type="pandas"`: - - ```python - game_revenue = pb.load_dataset(dataset="game_revenue", tbl_type="pandas") - - pb.preview(game_revenue) - ``` - - The `"game_revenue"` dataset is a more real-world dataset with a mix of data types, and it's - significantly larger than the `small_table` dataset at 2000 rows and 11 columns. - - The `"nycflights"` dataset can be loaded as a DuckDB table by specifying the dataset name and - setting `tbl_type="duckdb"`: - - ```python - nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") - - pb.preview(nycflights) - ``` - - The `"nycflights"` dataset is a large dataset with 336,776 rows and 18 columns. This dataset is - truly a real-world dataset and provides information about flights originating from New York City - airports in 2013. - - Finally, the `"global_sales"` dataset can be loaded as a Polars table by specifying the dataset - name. Since `tbl_type=` is set to `"polars"` by default, we don't need to specify it: - - ```python - global_sales = pb.load_dataset(dataset="global_sales") - - pb.preview(global_sales) - ``` - - The `"global_sales"` dataset is a large dataset with 50,000 rows and 20 columns. Each record - describes the sales of a particular product to a customer located in one of three global - regions: North America, Europe, or Asia. 
- - -get_data_path(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', file_type: "Literal['csv', 'parquet', 'duckdb']" = 'csv') -> 'str' - - Get the file path to a dataset included with the Pointblank package. - - This function provides direct access to the file paths of datasets included with Pointblank. - These paths can be used in examples and documentation to demonstrate file-based data loading - without requiring the actual data files. The returned paths can be used with - `Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities. - - Parameters - ---------- - dataset - The name of the dataset to get the path for. Current options are `"small_table"`, - `"game_revenue"`, `"nycflights"`, and `"global_sales"`. - file_type - The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`. - - Returns - ------- - str - The file path to the requested dataset file. - - Included Datasets - ----------------- - The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`): - - - `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples. - - `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company. - - `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports. - - `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions. 
- - File Types - ---------- - Each dataset is available in multiple formats: - - - `"csv"`: Comma-separated values file (`.csv`) - - `"parquet"`: Parquet file (`.parquet`) - - `"duckdb"`: DuckDB database file (`.ddb`) - - Examples - -------- - Get the path to a CSV file and use it with `Validate`: - - ```python - import pointblank as pb - - # Get path to the small_table CSV file - csv_path = pb.get_data_path("small_table", "csv") - print(csv_path) - - # Use the path directly with Validate - validation = ( - pb.Validate(data=csv_path) - .col_exists(["a", "b", "c"]) - .col_vals_gt(columns="d", value=0) - .interrogate() - ) - - validation - ``` - - Get a Parquet file path for validation examples: - - ```python - # Get path to the game_revenue Parquet file - parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet") - - # Validate the Parquet file directly - validation = ( - pb.Validate(data=parquet_path, label="Game Revenue Data Validation") - .col_vals_not_null(columns=["player_id", "session_id"]) - .col_vals_gt(columns="item_revenue", value=0) - .interrogate() - ) - - validation - ``` - - This is particularly useful for documentation examples where you want to demonstrate - file-based workflows without requiring users to have specific data files: - - ```python - # Example showing CSV file validation - sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv") - - validation = ( - pb.Validate(data=sales_csv, label="Sales Data Validation") - .col_exists(["customer_id", "product_id", "amount"]) - .col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}") - .interrogate() - ) - ``` - - See Also - -------- - [`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects. - - -connect_to_table(connection_string: 'str') -> 'Any' - - Connect to a database table using a connection string. 
- - This utility function tests whether a connection string leads to a valid table and returns - the table object if successful. It provides helpful error messages when no table is specified - or when backend dependencies are missing. - - Parameters - ---------- - connection_string - A database connection string with a required table specification using the `::table_name` - suffix. Supported formats are outlined in the *Supported Connection String Formats* section. - - Returns - ------- - Any - An Ibis table object for the specified database table. - - Supported Connection String Formats - ----------------------------------- - The `connection_string` parameter must include a valid connection string with a table name - specified using the `::` syntax. Here are some examples on how to format connection strings - for various backends: - - ``` - DuckDB: "duckdb:///path/to/database.ddb::table_name" - SQLite: "sqlite:///path/to/database.db::table_name" - PostgreSQL: "postgresql://user:password@localhost:5432/database::table_name" - MySQL: "mysql://user:password@localhost:3306/database::table_name" - BigQuery: "bigquery://project/dataset::table_name" - Snowflake: "snowflake://user:password@account/database/schema::table_name" - ``` - - If the connection string does not include a table name, the function will attempt to connect to - the database and list available tables, providing guidance on how to specify a table. 
- - Examples - -------- - Connect to a DuckDB table: - - ```python - import pointblank as pb - - # Get path to a DuckDB database file from package data - duckdb_path = pb.get_data_path("game_revenue", "duckdb") - - # Connect to the `game_revenue` table in the DuckDB database - game_revenue = pb.connect_to_table(f"duckdb:///{duckdb_path}::game_revenue") - - # Use with the `preview()` function - pb.preview(game_revenue) - ``` - - Here are some backend-specific connection examples: - - ```python - # PostgreSQL - pg_table = pb.connect_to_table( - "postgresql://user:password@localhost:5432/warehouse::customer_data" - ) - - # SQLite - sqlite_table = pb.connect_to_table("sqlite:///local_data.db::products") - - # BigQuery - bq_table = pb.connect_to_table("bigquery://my-project/analytics::daily_metrics") - ``` - - This function requires the Ibis library with appropriate backend drivers: - - ```bash - # You can install a set of common backends: - pip install 'ibis-framework[duckdb,postgres,mysql,sqlite]' - - # ...or specific backends as needed: - pip install 'ibis-framework[duckdb]' # for DuckDB - pip install 'ibis-framework[postgres]' # for PostgreSQL - ``` - See Also - -------- - print_database_tables : List all available tables in a database for discovery - - -print_database_tables(connection_string: 'str') -> 'list[str]' - - List all tables in a database from a connection string. - - The `print_database_tables()` function connects to a database and returns a list of all - available tables. This is particularly useful for discovering what tables exist in a database - before connecting to a specific table with `connect_to_table()`. The function automatically - filters out temporary Ibis tables (memtables) to show only user tables. It supports all database - backends available through Ibis, including DuckDB, SQLite, PostgreSQL, MySQL, BigQuery, and - Snowflake. - - Parameters - ---------- - connection_string - A database connection string *without* the `::table_name` suffix. 
Example: - `"duckdb:///path/to/database.ddb"`. - - Returns - ------- - list[str] - List of table names, excluding temporary Ibis tables. - - See Also - -------- - connect_to_table : Connect to a database table with full connection string documentation - - - -## The YAML family - -The *YAML* group contains functions that allow for the use of YAML to orchestrate -validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow -from YAML strings or files. The `validate_yaml()` function checks if the YAML configuration passes -its own validity checks. The `yaml_to_python()` function converts YAML configuration to equivalent -Python code. - -yaml_interrogate(yaml: 'Union[str, Path]', set_tbl: 'Any' = None, namespaces: 'Optional[Union[Iterable[str], Mapping[str, str]]]' = None) -> 'Validate' -Execute a YAML-based validation workflow. - - This is the main entry point for YAML-based validation workflows. It takes YAML configuration - (as a string or file path) and returns a validated `Validate` object with interrogation results. - - The YAML configuration defines the data source, validation steps, and optional settings like - thresholds and labels. This function automatically loads the data, builds the validation plan, - executes all validation steps, and returns the interrogated results. - - Parameters - ---------- - yaml - YAML configuration as string or file path. Can be: (1) a YAML string containing the - validation configuration, or (2) a Path object or string path to a YAML file. - set_tbl - An optional table to override the table specified in the YAML configuration. This allows you - to apply a YAML-defined validation workflow to a different table than what's specified in - the configuration. If provided, this table will replace the table defined in the YAML's - `tbl` field before executing the validation workflow. 
This can be any supported table type - including DataFrame objects, Ibis table objects, CSV file paths, Parquet file paths, GitHub - URLs, or database connection strings. - namespaces - Optional module namespaces to make available for Python code execution in YAML - configurations. Can be a dictionary mapping aliases to module names or a list of module - names. See the "Using Namespaces" section below for detailed examples. - - Returns - ------- - Validate - An instance of the `Validate` class that has been configured based on the YAML input. This - object contains the results of the validation steps defined in the YAML configuration. It - includes metadata like table name, label, language, and thresholds if specified. - - Raises - ------ - YAMLValidationError - If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing - required fields, unknown validation methods, or data loading failures. - - Using Namespaces - ---------------- - The `namespaces=` parameter enables custom Python modules and functions in YAML configurations. - This is particularly useful for custom action functions and advanced Python expressions. 
- - **Namespace formats:** - - - Dictionary format: `{"alias": "module.name"}` maps aliases to module names - - List format: `["module.name", "another.module"]` imports modules directly - - **Option 1: Inline expressions (no namespaces needed)** - - ```python - import pointblank as pb - - # Simple inline custom action - yaml_config = ''' - tbl: small_table - thresholds: - warning: 0.01 - actions: - warning: - python: "lambda: print('Custom warning triggered')" - steps: - - col_vals_gt: - columns: [a] - value: 1000 - ''' - - result = pb.yaml_interrogate(yaml_config) - result - ``` - - **Option 2: External functions with namespaces** - - ```python - # Define a custom action function - def my_custom_action(): - print("Data validation failed: please check your data.") - - # Add to current module for demo - import sys - sys.modules[__name__].my_custom_action = my_custom_action - - # YAML that references the external function - yaml_config = ''' - tbl: small_table - thresholds: - warning: 0.01 - actions: - warning: - python: actions.my_custom_action - steps: - - col_vals_gt: - columns: [a] - value: 1000 # This will fail - ''' - - # Use namespaces to make the function available - result = pb.yaml_interrogate(yaml_config, namespaces={'actions': '__main__'}) - result - ``` - - This approach enables modular, reusable validation workflows with custom business logic. - - Examples - -------- - For the examples here, we'll use YAML configurations to define validation workflows. Let's start - with a basic YAML workflow that validates the built-in `small_table` dataset. - - ```python - import pointblank as pb - - # Define a basic YAML validation workflow - yaml_config = ''' - tbl: small_table - steps: - - rows_distinct - - col_exists: - columns: [date, a, b] - ''' - - # Execute the validation workflow - result = pb.yaml_interrogate(yaml_config) - result - ``` - - The validation table shows the results of our YAML-defined workflow. 
We can see that the - `rows_distinct()` validation failed (because there are duplicate rows in the table), while the - column existence checks passed. - - Now let's create a more comprehensive validation workflow with thresholds and metadata: - - ```python - # Advanced YAML configuration with thresholds and metadata - yaml_config = ''' - tbl: small_table - tbl_name: small_table_demo - label: Comprehensive data validation - thresholds: - warning: 0.1 - error: 0.25 - critical: 0.35 - steps: - - col_vals_gt: - columns: [d] - value: 100 - - col_vals_regex: - columns: [b] - pattern: '[0-9]-[a-z]{3}-[0-9]{3}' - - col_vals_not_null: - columns: [date, a] - ''' - - # Execute the validation workflow - result = pb.yaml_interrogate(yaml_config) - print(f"Table name: {result.tbl_name}") - print(f"Label: {result.label}") - print(f"Total validation steps: {len(result.validation_info)}") - ``` - - The validation results now include our custom table name and label. The thresholds we defined - will determine when validation steps are marked as warnings, errors, or critical failures. - - You can also load YAML configurations from files. Here's how you would work with a YAML file: - - ```python - from pathlib import Path - import tempfile - - # Create a temporary YAML file for demonstration - yaml_content = ''' - tbl: small_table - tbl_name: File-based Validation - steps: - - col_vals_between: - columns: [c] - left: 1 - right: 10 - - col_vals_in_set: - columns: [f] - set: [low, mid, high] - ''' - - with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: - f.write(yaml_content) - yaml_file_path = Path(f.name) - - # Load and execute validation from file - result = pb.yaml_interrogate(yaml_file_path) - result - ``` - - This approach is particularly useful for storing validation configurations as part of your data - pipeline or version control system, allowing you to maintain validation rules alongside your - code. 
- - ### Governance Metadata - - YAML workflows support governance metadata via `owner`, `consumers`, and `version` top-level - keys. These are forwarded to the `Validate` constructor and embedded in the validation report: - - ```python - yaml_config = ''' - tbl: small_table - tbl_name: sales_pipeline - owner: Data Engineering - consumers: [Analytics, Finance, Compliance] - version: "2.1.0" - steps: - - col_vals_not_null: - columns: [a, b] - ''' - - result = pb.yaml_interrogate(yaml_config) - print(f"Owner: {result.owner}") - print(f"Consumers: {result.consumers}") - print(f"Version: {result.version}") - ``` - - ### Aggregate Validations - - YAML supports aggregate validation methods for checking column-level statistics. These methods - validate that a column's sum, average, or standard deviation meets a threshold: - - ```python - yaml_config = ''' - tbl: small_table - steps: - - col_sum_gt: - columns: [d] - value: 0 - - col_avg_le: - columns: [a] - value: 10 - ''' - - result = pb.yaml_interrogate(yaml_config) - result - ``` - - The 15 available aggregate methods follow the pattern `col_{stat}_{comparator}` where - `{stat}` is `sum`, `avg`, or `sd` and `{comparator}` is `gt`, `lt`, `ge`, - `le`, or `eq`. - - ### Data Freshness - - Check that a date/datetime column has recent data using `data_freshness`: - - ```yaml - tbl: events.csv - steps: - - data_freshness: - columns: event_date - freshness: "24h" - ``` - - ### Active Parameter Shortcut - - The `active=` parameter controls whether a validation step runs. 
It supports boolean values - and Python expression shortcuts: - - ```yaml - steps: - - col_vals_gt: - columns: [d] - value: 100 - active: false # Skip this step - - - col_vals_not_null: - columns: [a] - active: true # Always run (default) - ``` - - ### Null Percentage Check - - Use `col_pct_null` to validate that the percentage of null values in a column is within bounds: - - ```yaml - steps: - - col_pct_null: - columns: [a, b] - value: 0.05 - ``` - - ### Using `set_tbl=` to Override the Table - - The `set_tbl=` parameter allows you to override the table specified in the YAML configuration. - This is useful when you have a template validation workflow but want to apply it to different - tables: - - ```python - import polars as pl - - # Create a test table with similar structure to small_table - test_table = pl.DataFrame({ - "date": ["2023-01-01", "2023-01-02", "2023-01-03"], - "a": [1, 2, 3], - "b": ["1-abc-123", "2-def-456", "3-ghi-789"], - "d": [150, 200, 250] - }) - - # Use the same YAML config but apply it to our test table - yaml_config = ''' - tbl: small_table # This will be overridden - tbl_name: Test Table # This name will be used - steps: - - col_exists: - columns: [date, a, b, d] - - col_vals_gt: - columns: [d] - value: 100 - ''' - - # Execute with table override - result = pb.yaml_interrogate(yaml_config, set_tbl=test_table) - print(f"Validation applied to: {result.tbl_name}") - result - ``` - - This feature makes YAML configurations more reusable and flexible, allowing you to define - validation logic once and apply it to multiple similar tables. - - -validate_yaml(yaml: 'Union[str, Path]') -> 'None' -Validate YAML configuration against the expected structure. - - This function validates that a YAML configuration conforms to the expected structure for - validation workflows. It checks for required fields, proper data types, and valid - validation method names. 
This is useful for validating configurations before execution or - for building configuration editors and validators. - - The function performs comprehensive validation including: - - - required fields ('tbl' and 'steps') - - proper data types for all fields - - valid threshold configurations - - known validation method names - - proper step configuration structure - - Parameters - ---------- - yaml - YAML configuration as string or file path. Can be: (1) a YAML string containing the - validation configuration, or (2) a Path object or string path to a YAML file. - - Raises - ------ - YAMLValidationError - If the YAML is invalid or malformed. This includes syntax errors, missing required fields, - improper data types, invalid threshold configurations, or unknown validation methods. - - Examples - -------- - For the examples here, we'll demonstrate how to validate YAML configurations before using them - with validation workflows. This is particularly useful for building robust data validation - systems where you want to catch configuration errors early. - - Let's start with validating a basic configuration: - - ```python - import pointblank as pb - - # Define a basic YAML validation configuration - yaml_config = ''' - tbl: small_table - steps: - - rows_distinct - - col_exists: - columns: [a, b] - ''' - - # Validate the configuration: no exception means it's valid - pb.validate_yaml(yaml_config) - print("Basic YAML configuration is valid") - ``` - - The function completed without raising an exception, which means our configuration is valid and - follows the expected structure. 
- - Now let's validate a more complex configuration with thresholds and metadata: - - ```python - # Complex YAML configuration with all optional fields - yaml_config = ''' - tbl: small_table - tbl_name: My Dataset - label: Quality check - lang: en - locale: en - thresholds: - warning: 0.1 - error: 0.25 - critical: 0.35 - steps: - - rows_distinct - - col_vals_gt: - columns: [d] - value: 100 - - col_vals_regex: - columns: [b] - pattern: '[0-9]-[a-z]{3}-[0-9]{3}' - ''' - - # Validate the configuration - pb.validate_yaml(yaml_config) - print("Complex YAML configuration is valid") - - # Count the validation steps - import pointblank.yaml as pby - config = pby.load_yaml_config(yaml_config) - print(f"Configuration has {len(config['steps'])} validation steps") - ``` - - This configuration includes all the optional metadata fields and complex validation steps, - demonstrating that the validation handles the full range of supported options. - - Let's see what happens when we try to validate an invalid configuration: - - ```python - # Invalid YAML configuration: missing required 'tbl' field - invalid_yaml = ''' - steps: - - rows_distinct - ''' - - try: - pb.validate_yaml(invalid_yaml) - except pb.yaml.YAMLValidationError as e: - print(f"Validation failed: {e}") - ``` - - The validation correctly identifies that our configuration is missing the required `'tbl'` - field. 
- - Here's a practical example of using validation in a workflow builder: - - ```python - def safe_yaml_interrogate(yaml_config): - """Safely execute a YAML configuration after validation.""" - try: - # Validate the YAML configuration first - pb.validate_yaml(yaml_config) - print("✓ YAML configuration is valid") - - # Then execute the workflow - result = pb.yaml_interrogate(yaml_config) - print(f"Validation completed with {len(result.validation_info)} steps") - return result - - except pb.yaml.YAMLValidationError as e: - print(f"Configuration error: {e}") - return None - - # Test with a valid YAML configuration - test_yaml = ''' - tbl: small_table - steps: - - col_vals_between: - columns: [c] - left: 1 - right: 10 - ''' - - result = safe_yaml_interrogate(test_yaml) - ``` - - This pattern of validating before executing helps build more reliable data validation pipelines - by catching configuration errors early in the process. - - Note that this function only validates the structure and does not check if the specified data - source ('tbl') exists or is accessible. Data source validation occurs during execution with - `yaml_interrogate()`. 
- - Supported Top-level Keys - ------------------------ - The following top-level keys are recognized in the YAML configuration: - - - `tbl`: data source specification (required) - - `steps`: list of validation steps (required) - - `tbl_name`: human-readable table name - - `label`: validation description - - `df_library`: DataFrame library (`"polars"`, `"pandas"`, `"duckdb"`) - - `lang`: language code - - `locale`: locale setting - - `brief`: global brief template - - `thresholds`: global failure thresholds - - `actions`: global failure actions - - `final_actions`: actions triggered after all steps complete - - `owner`: data owner (governance metadata) - - `consumers`: data consumers (governance metadata) - - `version`: validation version string (governance metadata) - - `reference`: reference table for comparison-based validations - - Unknown top-level keys are rejected, which catches typos like `tbl_nmae` or `step`. - - Supported Validation Methods - ---------------------------- - In addition to all standard validation methods (e.g., `col_vals_gt`, `rows_distinct`, - `col_schema_match`), the following methods are also supported: - - - `col_pct_null`: check the percentage of null values in a column - - `data_freshness`: check that data is recent - - aggregate methods: `col_sum_gt`, `col_sum_lt`, `col_sum_ge`, `col_sum_le`, - `col_sum_eq`, `col_avg_gt`, `col_avg_lt`, `col_avg_ge`, `col_avg_le`, - `col_avg_eq`, `col_sd_gt`, `col_sd_lt`, `col_sd_ge`, `col_sd_le`, `col_sd_eq` - - See Also - -------- - yaml_interrogate : execute YAML-based validation workflows - - -yaml_to_python(yaml: 'Union[str, Path]') -> 'str' -Convert YAML validation configuration to equivalent Python code. - - This function takes a YAML validation configuration and generates the equivalent Python code - that would produce the same validation workflow. This is useful for documentation, code - generation, or learning how to translate YAML workflows into programmatic workflows. 
- - The generated Python code includes all necessary imports, data loading, validation steps, - and interrogation execution, formatted as executable Python code. - - Parameters - ---------- - yaml - YAML configuration as string or file path. Can be: (1) a YAML string containing the - validation configuration, or (2) a Path object or string path to a YAML file. - - Returns - ------- - str - A formatted Python code string enclosed in markdown code blocks that replicates the YAML - workflow. The code includes import statements, data loading, validation method calls, and - interrogation execution. - - Raises - ------ - YAMLValidationError - If the YAML is invalid, malformed, or contains unknown validation methods. - - Examples - -------- - Convert a basic YAML configuration to Python code: - - ```python - import pointblank as pb - - # Define a YAML validation workflow - yaml_config = ''' - tbl: small_table - tbl_name: Data Quality Check - steps: - - col_vals_not_null: - columns: [a, b] - - col_vals_gt: - columns: [c] - value: 0 - ''' - - # Generate equivalent Python code - python_code = pb.yaml_to_python(yaml_config) - print(python_code) - ``` - - The generated Python code shows exactly how to replicate the YAML workflow programmatically. - This is particularly useful when transitioning from YAML-based workflows to code-based - workflows, or when generating documentation that shows both YAML and Python approaches. 
- - For more complex workflows with thresholds and metadata: - - ```python - # Advanced YAML configuration - yaml_config = ''' - tbl: small_table - tbl_name: Advanced Validation - label: Production data check - thresholds: - warning: 0.1 - error: 0.2 - steps: - - col_vals_between: - columns: [c] - left: 1 - right: 10 - - col_vals_regex: - columns: [b] - pattern: '[0-9]-[a-z]{3}-[0-9]{3}' - ''' - - # Generate the equivalent Python code - python_code = pb.yaml_to_python(yaml_config) - print(python_code) - ``` - - The generated code includes all configuration parameters, thresholds, and maintains the exact - same validation logic as the original YAML workflow. - - Governance metadata (`owner`, `consumers`, `version`) and `reference` are also rendered - in the generated Python code: - - ```python - yaml_config = ''' - tbl: small_table - tbl_name: Sales Pipeline - owner: Data Engineering - consumers: [Analytics, Finance] - version: "2.1.0" - steps: - - col_vals_not_null: - columns: [a] - - col_sum_gt: - columns: [d] - value: 0 - ''' - - python_code = pb.yaml_to_python(yaml_config) - print(python_code) - ``` - - This function is also useful for educational purposes, helping users understand how YAML - configurations map to the underlying Python API calls. - - - -## The Utility Functions family - -The Utility Functions group contains functions that are useful for accessing -metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of -columns or rows in a table. The `get_action_metadata()` function is useful when building custom -actions since it returns metadata about the validation step that's triggering the action. Lastly, -the `config()` utility lets us set global configuration parameters. - -get_column_count(data: 'Any') -> 'int' - - Get the number of columns in a table. - - The `get_column_count()` function returns the number of columns in a table. 
The function works - with any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis - backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports - direct input of CSV files, Parquet files, and database connection strings. - - Parameters - ---------- - data - The table for which to get the column count, which could be a DataFrame object, an Ibis - table object, a CSV file path, a Parquet file path, or a database connection string. - Read the *Supported Input Table Types* section for details on the supported table types. - - Returns - ------- - int - The number of columns in the table. - - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - GitHub URLs (direct links to CSV or Parquet files on GitHub) - - Database connection strings (URI format with optional table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `get_column_count()` with these types of - tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a - Polars or Pandas DataFrame, the availability of Ibis is not needed. - - To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is - provided. 
The file will be automatically detected and loaded using the best available DataFrame - library. The loading preference is Polars first, then Pandas as a fallback. - - GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw - content URLs for downloading. The URL format should be: - `https://github.com/user/repo/blob/branch/path/file.csv` or - `https://github.com/user/repo/blob/branch/path/file.parquet` - - Connection strings follow database URL formats and must also specify a table using the - `::table_name` suffix. Examples include: - - ``` - "duckdb:///path/to/database.ddb::table_name" - "sqlite:///path/to/database.db::table_name" - "postgresql://user:password@localhost:5432/database::table_name" - "mysql://user:password@localhost:3306/database::table_name" - "bigquery://project/dataset::table_name" - "snowflake://user:password@account/database/schema::table_name" - ``` - - When using connection strings, the Ibis library with the appropriate backend driver is required. - - Examples - -------- - To get the number of columns in a table, we can use the `get_column_count()` function. Here's an - example using the `small_table` dataset (itself loaded using the - [`load_dataset()`](`pointblank.load_dataset`) function): - - This table is a Polars DataFrame, but the `get_column_count()` function works with any table - supported by `pointblank`, including Pandas DataFrames and Ibis backend tables. 
Here's an - example using a DuckDB table handled by Ibis: - - ```python - small_table_duckdb = pb.load_dataset("small_table", tbl_type="duckdb") - - pb.get_column_count(small_table_duckdb) - ``` - - #### Working with CSV Files - - The `get_column_count()` function can directly accept CSV file paths: - - #### Working with Parquet Files - - The function supports various Parquet input formats: - - You can also use glob patterns and directories: - - ```python - # Multiple Parquet files with glob patterns - pb.get_column_count("data/sales_*.parquet") - - # Directory containing Parquet files - pb.get_column_count("parquet_data/") - - # Partitioned Parquet dataset - pb.get_column_count("sales_data/") # Auto-discovers partition columns - ``` - - #### Working with Database Connection Strings - - The function supports database connection strings for direct access to database tables: - - The function always returns the number of columns in the table as an integer value, which is - `8` for the `small_table` dataset. - - -get_row_count(data: 'Any') -> 'int' - - Get the number of rows in a table. - - The `get_row_count()` function returns the number of rows in a table. The function works with - any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis - backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports - direct input of CSV files, Parquet files, and database connection strings. - - Parameters - ---------- - data - The table for which to get the row count, which could be a DataFrame object, an Ibis table - object, a CSV file path, a Parquet file path, or a database connection string. - Read the *Supported Input Table Types* section for details on the supported table types. - - Returns - ------- - int - The number of rows in the table. 
- - Supported Input Table Types - --------------------------- - The `data=` parameter can be given any of the following table types: - - - Polars DataFrame (`"polars"`) - - Pandas DataFrame (`"pandas"`) - - PySpark table (`"pyspark"`) - - DuckDB table (`"duckdb"`)* - - MySQL table (`"mysql"`)* - - PostgreSQL table (`"postgresql"`)* - - SQLite table (`"sqlite"`)* - - Microsoft SQL Server table (`"mssql"`)* - - Snowflake table (`"snowflake"`)* - - Databricks table (`"databricks"`)* - - BigQuery table (`"bigquery"`)* - - Parquet table (`"parquet"`)* - - CSV files (string path or `pathlib.Path` object with `.csv` extension) - - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet` - extension, or partitioned dataset) - - GitHub URLs (direct links to CSV or Parquet files on GitHub) - - Database connection strings (URI format with a `::table_name` table specification) - - The table types marked with an asterisk need to be prepared as Ibis tables (with type of - `ibis.expr.types.relations.Table`). Furthermore, using `get_row_count()` with these types of - tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a - Polars or Pandas DataFrame, Ibis is not required. - - To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is - provided. The file will be automatically detected and loaded using the best available DataFrame - library. The loading preference is Polars first, then Pandas as a fallback. - - GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw - content URLs for downloading. The URL format should be: - `https://github.com/user/repo/blob/branch/path/file.csv` or - `https://github.com/user/repo/blob/branch/path/file.parquet` - - Connection strings follow database URL formats and must also specify a table using the - `::table_name` suffix. 
Examples include: - - ``` - "duckdb:///path/to/database.ddb::table_name" - "sqlite:///path/to/database.db::table_name" - "postgresql://user:password@localhost:5432/database::table_name" - "mysql://user:password@localhost:3306/database::table_name" - "bigquery://project/dataset::table_name" - "snowflake://user:password@account/database/schema::table_name" - ``` - - When using connection strings, the Ibis library with the appropriate backend driver is required. - - Examples - -------- - Getting the number of rows in a table is easily done by using the `get_row_count()` function. - Here's an example using the `game_revenue` dataset (itself loaded using the - [`load_dataset()`](`pointblank.load_dataset`) function): - - This table is a Polars DataFrame, but the `get_row_count()` function works with any table - supported by `pointblank`, including Pandas DataFrames and Ibis backend tables. Here's an - example using a DuckDB table handled by Ibis: - - ```python - game_revenue_duckdb = pb.load_dataset("game_revenue", tbl_type="duckdb") - - pb.get_row_count(game_revenue_duckdb) - ``` - - #### Working with CSV Files - - The `get_row_count()` function can directly accept CSV file paths: - - #### Working with Parquet Files - - The function supports various Parquet input formats: - - You can also use glob patterns and directories: - - ```python - # Multiple Parquet files with glob patterns - pb.get_row_count("data/sales_*.parquet") - - # Directory containing Parquet files - pb.get_row_count("parquet_data/") - - # Partitioned Parquet dataset - pb.get_row_count("sales_data/") # Auto-discovers partition columns - ``` - - #### Working with Database Connection Strings - - The function supports database connection strings for direct access to database tables: - - The function always returns the number of rows in the table as an integer value, which is `2000` - for the `game_revenue` dataset. 
- - -get_action_metadata() -> 'dict | None' -Access step-level metadata when authoring custom actions. - - Get the metadata for the validation step where an action was triggered. This can be called by - user functions to get the metadata for the current action. This function can only be used within - callables crafted for the [`Actions`](`pointblank.Actions`) class. - - Returns - ------- - dict | None - A dictionary containing the metadata for the current step. If called outside of an action - (i.e., when no action is being executed), this function will return `None`. - - Description of the Metadata Fields - ---------------------------------- - The metadata dictionary contains the following fields for a given validation step: - - - `step`: The step number. - - `column`: The column name. - - `value`: The value being compared (only available in certain validation steps). - - `type`: The assertion type (e.g., `"col_vals_gt"`, etc.). - - `time`: The time the validation step was executed (in ISO format). - - `level`: The severity level (`"warning"`, `"error"`, or `"critical"`). - - `level_num`: The severity level as a numeric value (`30`, `40`, or `50`). - - `autobrief`: A localized and brief statement of the expectation for the step. - - `failure_text`: Localized text that explains how the validation step failed. - - Examples - -------- - When creating a custom action, you can access the metadata for the current step using the - `get_action_metadata()` function. 
Here's an example of a custom action that logs the metadata - for the current step: - - ```python - import pointblank as pb - - def log_issue(): - metadata = pb.get_action_metadata() - print(f"Type: {metadata['type']}, Step: {metadata['step']}") - - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions(warning=log_issue), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", - value=15, - ) - .interrogate() - ) - - validation - ``` - - Key pieces to note in the above example: - - - `log_issue()` (the custom action) collects `metadata` by calling `get_action_metadata()` - - the `metadata` is a dictionary that is used to craft the log message - - the action is passed as a bare function to the `Actions` object within the `Validate` object - (placing it within `Validate(actions=)` ensures it's set as an action for every validation step) - - See Also - -------- - Have a look at [`Actions`](`pointblank.Actions`) for more information on how to create custom - actions for validation steps that exceed a set threshold value. - - -get_validation_summary() -> 'dict | None' -Access validation summary information when authoring final actions. - - This function provides a convenient way to access summary information about the validation - process within a final action. It returns a dictionary with key metrics from the validation - process. This function can only be used within callables crafted for the - [`FinalActions`](`pointblank.FinalActions`) class. - - Returns - ------- - dict | None - A dictionary containing validation metrics. If called outside of a final action context, - this function will return `None`. 
- - Description of the Summary Fields - -------------------------------- - The summary dictionary contains the following fields: - - - `n_steps` (`int`): The total number of validation steps. - - `n_passing_steps` (`int`): The number of validation steps where all test units passed. - - `n_failing_steps` (`int`): The number of validation steps that had some failing test units. - - `n_warning_steps` (`int`): The number of steps that exceeded a 'warning' threshold. - - `n_error_steps` (`int`): The number of steps that exceeded an 'error' threshold. - - `n_critical_steps` (`int`): The number of steps that exceeded a 'critical' threshold. - - `list_passing_steps` (`list[int]`): List of step numbers where all test units passed. - - `list_failing_steps` (`list[int]`): List of step numbers for steps having failing test units. - - `dict_n` (`dict`): The number of test units for each validation step. - - `dict_n_passed` (`dict`): The number of test units that passed for each validation step. - - `dict_n_failed` (`dict`): The number of test units that failed for each validation step. - - `dict_f_passed` (`dict`): The fraction of test units that passed for each validation step. - - `dict_f_failed` (`dict`): The fraction of test units that failed for each validation step. - - `dict_warning` (`dict`): The 'warning' level status for each validation step. - - `dict_error` (`dict`): The 'error' level status for each validation step. - - `dict_critical` (`dict`): The 'critical' level status for each validation step. - - `all_passed` (`bool`): Whether or not every validation step had no failing test units. - - `highest_severity` (`str`): The highest severity level encountered during validation. This can - be one of the following: `"warning"`, `"error"`, `"critical"`, `"some failing"`, or - `"all passed"`. - - `tbl_row_count` (`int`): The number of rows in the target table. - - `tbl_column_count` (`int`): The number of columns in the target table. 
- - `tbl_name` (`str`): The name of the target table. - - `validation_duration` (`float`): The duration of the validation in seconds. - - Note that the summary dictionary is only available within the context of a final action. If - called outside of a final action (i.e., when no final action is being executed), this function - will return `None`. - - Examples - -------- - Final actions are executed after the completion of all validation steps. They provide an - opportunity to take appropriate actions based on the overall validation results. Here's an - example of a final action function (`send_report()`) that sends an alert when critical - validation failures are detected: - - ```python - import pointblank as pb - - def send_report(): - summary = pb.get_validation_summary() - if summary["highest_severity"] == "critical": - # Send an alert email - send_alert_email( - subject=f"CRITICAL validation failures in {summary['tbl_name']}", - body=f"{summary['n_critical_steps']} steps failed with critical severity." - ) - - validation = ( - pb.Validate( - data=my_data, - final_actions=pb.FinalActions(send_report) - ) - .col_vals_gt(columns="revenue", value=0) - .interrogate() - ) - ``` - - Note that `send_alert_email()` in the example above is a placeholder function that would be - implemented by the user to send email alerts. This function is not provided by the Pointblank - package. 
- - The `get_validation_summary()` function can also be used to create custom reporting for - validation results: - - ```python - def log_validation_results(): - summary = pb.get_validation_summary() - - print(f"Validation completed with status: {summary['highest_severity'].upper()}") - print(f"Steps: {summary['n_steps']} total") - print(f" - {summary['n_passing_steps']} passing, {summary['n_failing_steps']} failing") - print( - f" - Severity: {summary['n_warning_steps']} warnings, " - f"{summary['n_error_steps']} errors, " - f"{summary['n_critical_steps']} critical" - ) - - if summary['highest_severity'] in ["error", "critical"]: - print("⚠️ Action required: Please review failing validation steps!") - ``` - - Final actions work well with both simple logging and more complex notification systems, allowing - you to integrate validation results into your broader data quality workflows. - - See Also - -------- - Have a look at [`FinalActions`](`pointblank.FinalActions`) for more information on how to create - custom actions that are executed after all validation steps have been completed. - - -write_file(validation: 'Validate', filename: 'str', path: 'str | None' = None, keep_tbl: 'bool' = False, keep_extracts: 'bool' = False, quiet: 'bool' = False) -> 'None' - - Write a Validate object to disk as a serialized file. - - Writing a validation object to disk with `write_file()` can be useful for keeping data - validation results close at hand for later retrieval (with `read_file()`). By default, any data - table that the validation object holds will be removed before writing to disk (not applicable if - no data table is present). This behavior can be changed by setting `keep_tbl=True`, but this - only works when the table is not of a database type (e.g., DuckDB, PostgreSQL, etc.), as - database connections cannot be serialized. 
- - Extract data from failing validation steps can also be preserved by setting - `keep_extracts=True`, which is useful for later analysis of data quality issues. - - The serialized file uses Python's pickle format for storage of the validation object state, - including all validation results, metadata, and optionally the source data. - - **Important note.** If your validation uses custom preprocessing functions (via the `pre=` - parameter), these functions must be defined at the module level (not interactively or as lambda - functions) to ensure they can be properly restored when loading the validation in a different - Python session. Read the *Creating Serializable Validations* section below for more information. - - :::{.callout-warning} - The `write_file()` function is currently experimental. Please report any issues you encounter in - the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - validation - The `Validate` object to write to disk. - filename - The filename to create on disk for the validation object. Should not include the file - extension as `.pkl` will be added automatically. - path - An optional directory path where the file should be saved. If not provided, the file will be - saved in the current working directory. The directory will be created if it doesn't exist. - keep_tbl - An option to keep the data table that is associated with the validation object. The default - is `False` where the data table is removed before writing to disk. For database tables - (e.g., Ibis tables with database backends), the table is always removed even if - `keep_tbl=True`, as database connections cannot be serialized. - keep_extracts - An option to keep any collected extract data for failing rows from validation steps. By - default, this is `False` (i.e., extract data is removed to save space). - quiet - Should the function not inform when the file is written? 
By default, this is `False`, so a - message will be printed when the file is successfully written. - - Returns - ------- - None - This function doesn't return anything but saves the validation object to disk. - - Creating Serializable Validations - --------------------------------- - To ensure your validations work reliably across different Python sessions, the recommended - approach is to use module-level functions. So, create a separate Python file for your - preprocessing functions: - - ```python - # preprocessing_functions.py - import polars as pl - - def multiply_by_100(df): - return df.with_columns(pl.col("value") * 100) - - def add_computed_column(df): - return df.with_columns(computed=pl.col("value") * 2 + 10) - ``` - - Then import and use them in your validation: - - ```python - # your_main_script.py - import pointblank as pb - from preprocessing_functions import multiply_by_100, add_computed_column - - validation = ( - pb.Validate(data=my_data) - .col_vals_gt(columns="value", value=500, pre=multiply_by_100) - .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column) - .interrogate() - ) - - # Save validation and it will work reliably across sessions - pb.write_file(validation, "my_validation", keep_tbl=True) - ``` - - ### Problematic Patterns to Avoid - - Don't use lambda functions as they will cause immediate errors. - - Don't use interactive function definitions (as they may fail when loading). 
- - ```python - def my_function(df): # Defined in notebook/REPL - return df.with_columns(pl.col("value") * 2) - - validation = pb.Validate(data).col_vals_gt( - columns="value", value=100, pre=my_function - ) - ``` - - ### Automatic Analysis and Guidance - - When you call `write_file()`, it automatically analyzes your validation and provides: - - - confirmation when all functions will work reliably - - warnings for functions that may cause cross-session issues - - clear errors for unsupported patterns (lambda functions) - - specific recommendations and code examples - - loading instructions tailored to your validation - - ### Loading Your Validation - - To load a saved validation in a new Python session: - - ```python - # In a new Python session - import pointblank as pb - - # Import the same preprocessing functions used when creating the validation - from preprocessing_functions import multiply_by_100, add_computed_column - - # Upon loading the validation, functions will be automatically restored - validation = pb.read_file("my_validation.pkl") - ``` - - **Testing Your Validation:** - - To verify your validation works across sessions: - - 1. save your validation in one Python session - 2. start a fresh Python session (restart kernel/interpreter) - 3. import required preprocessing functions - 4. load the validation using `read_file()` - 5. 
test that preprocessing functions work as expected - - ### Performance and Storage - - - use `keep_tbl=False` (default) to reduce file size when you don't need the original data - - use `keep_extracts=False` (default) to save space by excluding extract data - - set `quiet=True` to suppress guidance messages in automated scripts - - files are saved using pickle's highest protocol for optimal performance - - Examples - -------- - Let's create a simple validation and save it to disk: - - ```python - import pointblank as pb - - # Create a validation - validation = ( - pb.Validate(data=pb.load_dataset("small_table"), label="My validation") - .col_vals_gt(columns="d", value=100) - .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}") - .interrogate() - ) - - # Save to disk (without the original table data) - pb.write_file(validation, "my_validation") - ``` - - To keep the original table data for later analysis: - - ```python - # Save with the original table data included - pb.write_file(validation, "my_validation_with_data", keep_tbl=True) - ``` - - You can also specify a custom directory and keep extract data: - - ```python - pb.write_file( - validation, - filename="detailed_validation", - path="/path/to/validations", - keep_tbl=True, - keep_extracts=True - ) - ``` - - ### Working with Preprocessing Functions - - For validations that use preprocessing functions to be portable across sessions, define your - functions in a separate `.py` file: - - ```python - # In `preprocessing_functions.py` - - import polars as pl - - def multiply_by_100(df): - return df.with_columns(pl.col("value") * 100) - - def add_computed_column(df): - return df.with_columns(computed=pl.col("value") * 2 + 10) - ``` - - Then import and use them in your validation: - - ```python - # In your main script - - import pointblank as pb - from preprocessing_functions import multiply_by_100, add_computed_column - - validation = ( - pb.Validate(data=my_data) - .col_vals_gt(columns="value", 
value=500, pre=multiply_by_100) - .col_vals_between(columns="computed", left=50, right=1000, pre=add_computed_column) - .interrogate() - ) - - # This validation can now be saved and loaded reliably - pb.write_file(validation, "my_validation", keep_tbl=True) - ``` - - When you load this validation in a new session, simply import the preprocessing functions - again and they will be automatically restored. - - See Also - -------- - Use the [`read_file()`](`pointblank.read_file`) function to load a validation object that was - previously saved with `write_file()`. - - -read_file(filepath: 'str | Path') -> 'Validate' - - Read a Validate object from disk that was previously saved with `write_file()`. - - This function loads a validation object that was previously serialized to disk using the - `write_file()` function. The validation object will be restored with all its validation results, - metadata, and optionally the source data (if it was saved with `keep_tbl=True`). - - :::{.callout-warning} - The `read_file()` function is currently experimental. Please report any issues you encounter in - the [Pointblank issue tracker](https://github.com/posit-dev/pointblank/issues). - ::: - - Parameters - ---------- - filepath - The path to the saved validation file. Can be a string or Path object. - - Returns - ------- - Validate - The restored validation object with all its original state, validation results, and - metadata. 
- - Examples - -------- - Load a validation object that was previously saved: - - ```python - import pointblank as pb - - # Load a validation object from disk - validation = pb.read_file("my_validation.pkl") - - # View the validation results - validation - ``` - - You can also load using just the filename (without extension): - - ```python - # This will automatically look for "my_validation.pkl" - validation = pb.read_file("my_validation") - ``` - - The loaded validation object retains all its functionality: - - ```python - # Get validation summary - summary = validation.get_json_report() - - # Get sundered data (if original table was saved) - if validation.data is not None: - failing_rows = validation.get_sundered_data(type="fail") - ``` - - See Also - -------- - Use the [`write_file()`](`pointblank.write_file`) function to save a validation object - to disk for later retrieval with this function. - - -config(report_incl_header: 'bool' = True, report_incl_footer: 'bool' = True, report_incl_footer_timings: 'bool' = True, report_incl_footer_notes: 'bool' = True, preview_incl_header: 'bool' = True) -> 'PointblankConfig' - - Configuration settings for the Pointblank library. - - Parameters - ---------- - report_incl_header - This controls whether the header should be present in the validation table report. The - header contains the table name, label information, and might contain global failure - threshold levels (if set). - report_incl_footer - Should the footer of the validation table report be displayed? The footer contains the - starting and ending times of the interrogation and any notes added to validation steps. - report_incl_footer_timings - Controls whether the validation timing information (start time, duration, and end time) - should be displayed in the footer. Only applies when `report_incl_footer=True`. - report_incl_footer_notes - Controls whether the notes from validation steps should be displayed in the footer. 
Only - applies when `report_incl_footer=True`. - preview_incl_header - Whether the header should be present in any preview table (generated via the - [`preview()`](`pointblank.preview`) function). - - Returns - ------- - PointblankConfig - A `PointblankConfig` object with the specified configuration settings. - - - -## The Test Data Generation family - -Generate synthetic test data based on schema definitions. Use -`generate_dataset()` to create data from a `Schema` object. The helper functions define typed fields -with constraints for realistic test data generation. - -generate_dataset(schema: 'Schema', n: 'int' = 100, seed: 'int | None' = None, output: "Literal['polars', 'pandas', 'dict']" = 'polars', country: 'str | list[str] | dict[str, float]' = 'US', shuffle: 'bool' = True, weighted: 'bool' = True) -> 'Any' - - Generate synthetic test data from a schema. - - This function generates random data that conforms to a schema's column definitions. When the - schema is defined using `Field` objects with constraints (e.g., `min_val=`, `max_val=`, - `pattern=`, `preset=`), the generated data will respect those constraints. - - Parameters - ---------- - schema - The schema object defining the structure and constraints of the data to generate. Each - column can be specified using a field helper function (e.g., `int_field()`, - `string_field()`) for fine-grained control, or as a simple dtype string (e.g., - `"Int64"`, `"String"`) for unconstrained generation. - n - Number of rows to generate. The default is `100`. - seed - Random seed for reproducibility. If provided, the same seed will produce - the same data. Default is `None` (non-deterministic). - output - Output format for the generated data. Options are: (1) `"polars"` (the default) returns a - Polars DataFrame, (2) `"pandas"` returns a Pandas DataFrame, and (3) `"dict"` returns - a dictionary of lists. - country - Country code(s) for locale-aware generation when using presets. 
Accepts a single - ISO 3166-1 alpha-2 or alpha-3 code (e.g., `"US"`, `"DEU"`), a list of codes for - uniform mixing (e.g., `["US", "DE", "JP"]`), or a dict mapping codes to positive - weights (e.g., `{"US": 60, "DE": 25, "JP": 15}`). See the *Locale Mixing* section - below for details. The default is `"US"`. - shuffle - When `country=` is a list or dict (multi-country mixing), controls whether rows from - different countries are interleaved randomly (`True`, the default) or grouped by country - in the order the countries are specified (`False`). Ignored when `country=` is a single - string. - weighted - When `True`, names and locations are sampled according to real-world frequency tiers. - Common names like "James" and "Smith" appear far more often than rare names. Large - cities like New York and Los Angeles dominate over small towns. Only affects data files - that have been migrated to the tiered format; flat-list data always uses uniform - sampling. Default is `True`. - - Returns - ------- - DataFrame or dict - Generated data in the requested format. - - Raises - ------ - ValueError - If the schema has no columns or if constraints cannot be satisfied. - ImportError - If required optional dependencies are not installed. - - Presets and the `country=` Parameter - ------------------------------------ - Several `string_field()` presets produce locale-aware data that varies depending on the - `country=` parameter. The following presets are particularly affected: - - - **Address-related presets** (`"address"`, `"city"`, `"state"`, `"postcode"`, - `"phone_number"`, `"latitude"`, `"longitude"`, `"license_plate"`): produce addresses, - cities, postal codes, phone numbers, and license plates formatted for the specified - country. For example, `country="DE"` yields German street names and PLZ postal codes, - while `country="JP"` yields Japanese addresses. License plates for CA, US, DE, AU, and - GB use province/state-specific formats when location fields are present. 
- - **Person-related presets** (`"name"`, `"name_full"`, `"first_name"`, `"last_name"`, - `"email"`, `"user_name"`) produce culturally appropriate names for the specified country. - For example, `country="FR"` produces French names, while `country="KR"` produces Korean - names. - - **Business-related presets** (`"job"`, `"company"`): when both are present, the job and - company are drawn from the same industry for realism. The `"name_full"` preset will also - add profession-matched titles (e.g., "Dr." for doctors, "Prof." for professors), and - integer columns named `age` are automatically constrained to working-age range (22--65). - - **Financial presets** (`"iban"`, `"ssn"`, `"license_plate"`): produce identifiers in the - format used by the specified country. - - When multiple columns in the same schema use related presets, the generated data is - automatically coherent across those columns within each row. Person-related presets will share - the same identity (e.g., the email is derived from the name), address-related presets will - share the same location (e.g., the city matches the address), and business-related presets - will share the same industry context. - - Locale Mixing - ------------- - The `country=` parameter accepts three input forms for flexible locale control: - - (1) a **single string** (the default), such as `"US"` or `"DEU"`, which generates - all rows from one locale; (2) a **list of strings**, such as `["US", "DE", "JP"]`, - which splits rows equally across the listed countries; and (3) a **dict of weights**, - such as `{"US": 0.6, "DE": 0.3, "FR": 0.1}`, which allocates rows proportionally - (weights are auto-normalized, so `{"US": 6, "DE": 3, "FR": 1}` is equivalent). - - Row counts are distributed using largest-remainder apportionment so they always sum - to exactly `n=`. 
Each country's rows are generated as an independent batch (preserving - all cross-column coherence within each batch), then either interleaved randomly - (`shuffle=True`, the default) or left in contiguous country blocks - (`shuffle=False`). - - Supported Countries - ------------------- - The `country=` parameter currently supports 100 countries with full locale data: - - **Europe (38 countries):** Armenia (`"AM"`), Austria (`"AT"`), Azerbaijan (`"AZ"`), - Belgium (`"BE"`), Bulgaria (`"BG"`), Croatia (`"HR"`), Cyprus (`"CY"`), - Czech Republic (`"CZ"`), Denmark (`"DK"`), Estonia (`"EE"`), Finland (`"FI"`), - France (`"FR"`), Georgia (`"GE"`), Germany (`"DE"`), Greece (`"GR"`), - Hungary (`"HU"`), Iceland (`"IS"`), Ireland (`"IE"`), Italy (`"IT"`), - Latvia (`"LV"`), Lithuania (`"LT"`), Luxembourg (`"LU"`), Malta (`"MT"`), - Moldova (`"MD"`), Netherlands (`"NL"`), Norway (`"NO"`), Poland (`"PL"`), - Portugal (`"PT"`), Romania (`"RO"`), Russia (`"RU"`), Serbia (`"RS"`), - Slovakia (`"SK"`), Slovenia (`"SI"`), Spain (`"ES"`), Sweden (`"SE"`), - Switzerland (`"CH"`), Ukraine (`"UA"`), United Kingdom (`"GB"`) - - **Americas (19 countries):** Argentina (`"AR"`), Bolivia (`"BO"`), Brazil (`"BR"`), - Canada (`"CA"`), Chile (`"CL"`), Colombia (`"CO"`), Costa Rica (`"CR"`), - Dominican Republic (`"DO"`), Ecuador (`"EC"`), El Salvador (`"SV"`), - Guatemala (`"GT"`), Honduras (`"HN"`), Jamaica (`"JM"`), Mexico (`"MX"`), - Panama (`"PA"`), Paraguay (`"PY"`), Peru (`"PE"`), United States (`"US"`), - Uruguay (`"UY"`) - - **Asia-Pacific (22 countries):** Australia (`"AU"`), Bangladesh (`"BD"`), - Cambodia (`"KH"`), China (`"CN"`), Hong Kong (`"HK"`), India (`"IN"`), - Indonesia (`"ID"`), Japan (`"JP"`), Kazakhstan (`"KZ"`), Malaysia (`"MY"`), - Myanmar (`"MM"`), Nepal (`"NP"`), New Zealand (`"NZ"`), Pakistan (`"PK"`), - Philippines (`"PH"`), Singapore (`"SG"`), South Korea (`"KR"`), - Sri Lanka (`"LK"`), Taiwan (`"TW"`), Thailand (`"TH"`), Uzbekistan (`"UZ"`), - Vietnam (`"VN"`) - 
- **Middle East & Africa (21 countries):** Algeria (`"DZ"`), Cameroon (`"CM"`), - Egypt (`"EG"`), Ethiopia (`"ET"`), Ghana (`"GH"`), Israel (`"IL"`), - Jordan (`"JO"`), Kenya (`"KE"`), Lebanon (`"LB"`), Morocco (`"MA"`), - Mozambique (`"MZ"`), Nigeria (`"NG"`), Rwanda (`"RW"`), Saudi Arabia (`"SA"`), - Senegal (`"SN"`), South Africa (`"ZA"`), Tanzania (`"TZ"`), Tunisia (`"TN"`), - Turkey (`"TR"`), Uganda (`"UG"`), United Arab Emirates (`"AE"`) - - Pytest Fixture - -------------- - When Pointblank is installed, a `generate_dataset` pytest fixture is automatically - available in all test files: no imports or `conftest.py` setup required. The fixture - behaves identically to this function, but derives a deterministic seed from the test's - fully-qualified name when `seed=` is not provided. - - This means: - - - the **same test** always produces the **same data**, with no manual seed management. - - **different tests** get different seeds, so they exercise different data. - - **you** can still pass an explicit `seed=` to override the automatic seed. - - **calling** the fixture **multiple times** within one test produces different (but still - deterministic) data on each call. - - the fixture exposes `.default_seed` and `.last_seed` attributes for debugging. 
- - ```python - def test_my_pipeline(generate_dataset): - import pointblank as pb - - schema = pb.Schema( - user_id=pb.int_field(unique=True), - email=pb.string_field(preset="email"), - age=pb.int_field(min_val=18, max_val=100), - ) - df = generate_dataset(schema, n=500, country="DE") - # seed is derived from "test_my_pipeline" — same data every run - result = my_pipeline(df) - assert result.shape[0] == 500 - ``` - - Multiple datasets can be generated within the same test, each with its own - deterministic seed: - - When a test fails, include the seed in the assertion message so the failure is easy to - reproduce: - - ```python - def test_age_range(generate_dataset): - df = generate_dataset(schema, n=100) - assert df["age"].min() >= 18, f"Failed with seed {generate_dataset.last_seed}" - ``` - - Seed Stability - -------------- - A given seed (whether explicit or auto-derived) is guaranteed to produce identical output - **within the same Pointblank version**. Across versions, changes to country data files or - generator logic may alter the output for a given seed. - - For CI pipelines that require bit-exact data across library upgrades, save generated - DataFrames as Parquet or CSV snapshot files rather than relying on cross-version seed - stability. This is the same approach used by snapshot-testing tools like `pytest-snapshot` - and `syrupy`. - - Examples - -------- - Here we define a schema with field constraints and generate test data from it: - - ```python - import pointblank as pb - - schema = pb.Schema( - user_id=pb.int_field(min_val=1, unique=True), - email=pb.string_field(preset="email"), - age=pb.int_field(min_val=18, max_val=100), - status=pb.string_field(allowed=["active", "pending", "inactive"]), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - It's also possible to generate data from a simple, dtype-only schema. 
Setting - `output="pandas"` returns a Pandas DataFrame: - - ```python - schema = pb.Schema(name="String", age="Int64", active="Boolean") - - pb.preview(pb.generate_dataset(schema, n=50, seed=23, output="pandas")) - ``` - - When using presets, the `country=` parameter controls the locale. Here, `country="DE"` - produces German names and addresses: - - ```python - schema = pb.Schema( - name=pb.string_field(preset="name"), - address=pb.string_field(preset="address"), - city=pb.string_field(preset="city"), - ) - - pb.preview(pb.generate_dataset(schema, n=20, seed=23, country="DE")) - ``` - - We can combine several field types with nullable columns in a mixed-type dataset: - - ```python - from datetime import date, timedelta - - schema = pb.Schema( - id=pb.int_field(min_val=1, unique=True), - name=pb.string_field(preset="name"), - score=pb.float_field(min_val=0.0, max_val=100.0), - is_active=pb.bool_field(p_true=0.75), - joined=pb.date_field(min_date=date(2020, 1, 1), max_date=date(2024, 12, 31)), - session_time=pb.duration_field( - min_duration=timedelta(minutes=1), - max_duration=timedelta(hours=3), - nullable=True, null_probability=0.2, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - -int_field(min_val: 'int | None' = None, max_val: 'int | None' = None, allowed: 'list[int] | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None, dtype: 'str' = 'Int64') -> 'IntField' - - Create an integer column specification for use in a schema. - - The `int_field()` function defines the constraints and behavior for an integer column when - generating synthetic data with `generate_dataset()`. You can control the range of values - with `min_val=` and `max_val=`, restrict values to a specific set with `allowed=`, enforce - uniqueness with `unique=True`, and introduce null values with `nullable=True` and - `null_probability=`. 
The `dtype=` parameter lets you choose the specific integer type (e.g., - `"Int8"`, `"UInt16"`, `"Int64"`), which also determines the valid range of values. - - When no constraints are specified, values are drawn uniformly from the full range of the - chosen integer dtype. If both `min_val=` and `max_val=` are provided, values are drawn - uniformly from that range. If `allowed=` is provided, values are sampled from that specific - list. - - Parameters - ---------- - min_val - Minimum value (inclusive). Default is `None` (no minimum, uses dtype lower bound). - max_val - Maximum value (inclusive). Default is `None` (no maximum, uses dtype upper bound). - allowed - List of allowed values (categorical constraint). When provided, values are sampled from - this list. Cannot be combined with `min_val=`/`max_val=`. - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. When `True`, the generator will - retry until it produces `n` distinct values (subject to retry limits). - generator - Custom callable that generates values. When provided, this overrides all other - constraints (`min_val=`, `max_val=`, `allowed=`, etc.). The callable should take no - arguments and return a single integer value. - dtype - Integer dtype. Default is `"Int64"`. Options: `"Int8"`, `"Int16"`, `"Int32"`, - `"Int64"`, `"UInt8"`, `"UInt16"`, `"UInt32"`, `"UInt64"`. - - Returns - ------- - IntField - An integer field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `min_val` is greater than `max_val`, if `allowed` is an empty list, if - `null_probability` is not between `0.0` and `1.0`, or if `dtype` is not a valid - integer type. 
- - Examples - -------- - The `min_val=` and `max_val=` parameters constrain generated ranges, while `allowed=` - restricts values to a specific set: - - ```python - import pointblank as pb - - schema = pb.Schema( - user_id=pb.int_field(min_val=1, unique=True), - age=pb.int_field(min_val=0, max_val=120), - rating=pb.int_field(allowed=[1, 2, 3, 4, 5]), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - It's possible to introduce missing values with `nullable=True` and `null_probability=`, - and to select a smaller dtype with `dtype=`: - - ```python - schema = pb.Schema( - score=pb.int_field(min_val=0, max_val=255, dtype="UInt8"), - optional_val=pb.int_field( - min_val=1, max_val=50, - nullable=True, null_probability=0.3, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - We can also enforce uniqueness with `unique=True` to produce distinct identifiers within - a range: - - ```python - schema = pb.Schema( - record_id=pb.int_field(min_val=1000, max_val=9999, unique=True), - priority=pb.int_field(allowed=[1, 2, 3]), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=10)) - ``` - - For complete control, a custom `generator=` callable can be provided: - - ```python - import random - - rng = random.Random(0) - - schema = pb.Schema( - even_numbers=pb.int_field(generator=lambda: rng.choice(range(0, 100, 2))), - ) - - pb.preview(pb.generate_dataset(schema, n=20, seed=5)) - ``` - - -float_field(min_val: 'float | None' = None, max_val: 'float | None' = None, allowed: 'list[float] | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None, dtype: 'str' = 'Float64') -> 'FloatField' - - Create a floating-point column specification for use in a schema. - - The `float_field()` function defines the constraints and behavior for a floating-point column - when generating synthetic data with `generate_dataset()`. 
You can control the range of values - with `min_val=` and `max_val=`, restrict values to a specific set with `allowed=`, enforce - uniqueness with `unique=True`, and introduce null values with `nullable=True` and - `null_probability=`. The `dtype=` parameter lets you choose between `"Float32"` and - `"Float64"` precision. - - When both `min_val=` and `max_val=` are provided, values are drawn from a uniform - distribution across that range. If neither is specified, values are drawn uniformly from a - large default range. If `allowed=` is provided, values are sampled from that specific list. - - Parameters - ---------- - min_val - Minimum value (inclusive). Default is `None` (no minimum). - max_val - Maximum value (inclusive). Default is `None` (no maximum). - allowed - List of allowed values (categorical constraint). When provided, values are sampled from - this list. Cannot be combined with `min_val=`/`max_val=`. - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. When `True`, the generator will - retry until it produces `n` distinct values. - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single float value. - dtype - Float dtype. Default is `"Float64"`. Options: `"Float32"`, `"Float64"`. - - Returns - ------- - FloatField - A float field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `min_val` is greater than `max_val`, if `allowed` is an empty list, if - `null_probability` is not between `0.0` and `1.0`, or if `dtype` is not a valid - float type. 
- - Examples - -------- - The `min_val=` and `max_val=` parameters define the generated value ranges: - - ```python - import pointblank as pb - - schema = pb.Schema( - price=pb.float_field(min_val=0.01, max_val=9999.99), - probability=pb.float_field(min_val=0.0, max_val=1.0), - temperature=pb.float_field(min_val=-40.0, max_val=50.0), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - It's also possible to restrict values to a discrete set with `allowed=`, which is useful - for fixed pricing tiers or measurement levels: - - ```python - schema = pb.Schema( - discount=pb.float_field(allowed=[0.05, 0.10, 0.15, 0.20, 0.25]), - weight_kg=pb.float_field(min_val=0.5, max_val=100.0), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - We can simulate missing measurements by introducing null values: - - ```python - schema = pb.Schema( - reading=pb.float_field( - min_val=0.0, max_val=500.0, - nullable=True, null_probability=0.2, - ), - calibration=pb.float_field(min_val=0.9, max_val=1.1), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=7)) - ``` - - Setting `dtype="Float32"` gives reduced precision, and a custom `generator=` provides - full control over value generation: - - ```python - import random, math - - rng = random.Random(0) - - schema = pb.Schema( - sensor_value=pb.float_field(min_val=-10.0, max_val=10.0, dtype="Float32"), - log_value=pb.float_field(generator=lambda: math.log(rng.uniform(1, 1000))), - ) - - pb.preview(pb.generate_dataset(schema, n=20, seed=99)) - ``` - - -string_field(min_length: 'int | None' = None, max_length: 'int | None' = None, pattern: 'str | None' = None, preset: 'str | None' = None, allowed: 'list[str] | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'StringField' - - Create a string column specification for use in a schema. 
- - The `string_field()` function defines the constraints and behavior for a string column when - generating synthetic data with `generate_dataset()`. It provides three main modes of string - generation: (1) controlled random strings with `min_length=`/`max_length=`, (2) strings - matching a regular expression via `pattern=`, or (3) realistic data using `preset=` (e.g., - `"email"`, `"name"`, `"address"`). You can also restrict values to a fixed set with - `allowed=`. Only one of `preset=`, `pattern=`, or `allowed=` can be specified at a time. - - When no special mode is selected, random alphanumeric strings are generated with lengths - between `min_length=` and `max_length=` (defaulting to 1--20 characters). - - Parameters - ---------- - min_length - Minimum string length (for random string generation). Default is `None` (defaults to - `1`). Only applies when `preset=`, `pattern=`, and `allowed=` are all `None`. - max_length - Maximum string length (for random string generation). Default is `None` (defaults to - `20`). Only applies when `preset=`, `pattern=`, and `allowed=` are all `None`. - pattern - Regular expression pattern that generated strings must match. Supports character - classes (e.g., `[A-Z]`, `[0-9]`), quantifiers (e.g., `{3}`, `{2,5}`), alternation, - and groups. Cannot be combined with `preset=` or `allowed=`. - preset - Preset name for generating realistic data. When specified, values are produced using - locale-aware data generation, and the `country=` parameter of `generate_dataset()` - controls the locale. Cannot be combined with `pattern=` or `allowed=`. See the - **Available Presets** section below for the full list. - allowed - List of allowed string values (categorical constraint). Values are sampled uniformly - from this list. Cannot be combined with `preset=` or `pattern=`. - nullable - Whether the column can contain null values. Default is `False`. 
- null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. When `True`, the generator will - retry until it produces `n` distinct values. - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single string value. - - Returns - ------- - StringField - A string field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If more than one of `preset=`, `pattern=`, or `allowed=` is specified; if `allowed=` - is an empty list; if `min_length` or `max_length` is negative; if `min_length` exceeds - `max_length`; or if `preset` is not a recognized preset name. - - Available Presets - ----------------- - The `preset=` parameter accepts one of the following preset names, organized by category. - When a preset is used, the `country=` parameter of `generate_dataset()` controls the locale - for region-specific formatting (e.g., address formats, phone number patterns). 
- - **Personal:** `"name"` (first + last name), `"name_full"` (full name with possible prefix - or suffix), `"first_name"`, `"last_name"`, `"email"` (realistic email address), - `"phone_number"`, `"address"` (full street address), `"city"`, `"state"`, `"country"`, - `"country_code_2"` (ISO 3166-1 alpha-2 code, e.g., `"US"`), `"country_code_3"` (ISO - 3166-1 alpha-3 code, e.g., `"USA"`), `"postcode"`, `"latitude"`, `"longitude"` - - **Business:** `"company"` (company name), `"job"` (job title), `"catch_phrase"` - - **Internet:** `"url"`, `"domain_name"`, `"ipv4"`, `"ipv6"`, `"user_name"`, `"password"` - - **Text:** `"text"` (paragraph of text), `"sentence"`, `"paragraph"`, `"word"` - - **Financial:** `"credit_card_number"`, `"iban"`, `"currency_code"` - - **Identifiers:** `"uuid4"`, `"md5"` (MD5 hash, 32 hex chars), `"sha1"` (SHA-1 hash, - 40 hex chars), `"sha256"` (SHA-256 hash, 64 hex chars), `"ssn"` (social security number), - `"license_plate"` - - **Barcodes:** `"ean8"` (EAN-8 barcode with valid check digit), `"ean13"` (EAN-13 barcode - with valid check digit) - - **Date/Time (as strings):** `"date_this_year"`, `"date_this_decade"`, `"date_between"` - (random date between 2000 and 2025), `"date_range"` (two dates joined with an en-dash, e.g., - `"2012-05-12 – 2015-11-22"`), `"future_date"` (up to 1 year ahead), `"past_date"` - (up to 10 years back), `"time"` - - **Miscellaneous:** `"color_name"`, `"file_name"`, `"file_extension"`, `"mime_type"`, - `"user_agent"` (browser user agent string with country-specific browser weighting) - - Coherent Data Generation - ------------------------ - When multiple columns in the same schema use related presets, the generated data will be - coherent across those columns within each row. Specifically: - - - **Person-related presets** (`"name"`, `"name_full"`, `"first_name"`, `"last_name"`, - `"email"`, `"user_name"`): the email and username will be derived from the person's name. 
- - **Address-related presets** (`"address"`, `"city"`, `"state"`, `"postcode"`, - `"phone_number"`, `"latitude"`, `"longitude"`): the city, state, and postcode will - correspond to the same location within the address. - - This coherence is automatic and requires no additional configuration. - - Examples - -------- - The `preset=` parameter generates realistic personal data, while `allowed=` restricts - values to a categorical set: - - ```python - import pointblank as pb - - schema = pb.Schema( - name=pb.string_field(preset="name"), - email=pb.string_field(preset="email", unique=True), - status=pb.string_field(allowed=["active", "pending", "inactive"]), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - We can also generate strings that match a regular expression with `pattern=` (e.g., - product codes, identifiers): - - ```python - schema = pb.Schema( - product_code=pb.string_field(pattern=r"[A-Z]{3}-[0-9]{4}"), - batch_id=pb.string_field(pattern=r"BATCH-[A-Z][0-9]{3}"), - sku=pb.string_field(pattern=r"[A-Z]{2}[0-9]{6}"), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=23)) - ``` - - For random alphanumeric strings, `min_length=` and `max_length=` control the length. 
- Adding `nullable=True` introduces missing values: - - ```python - schema = pb.Schema( - short_code=pb.string_field(min_length=3, max_length=5), - notes=pb.string_field( - min_length=10, max_length=50, - nullable=True, null_probability=0.4, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=7)) - ``` - - It's possible to combine business and internet presets to build a company directory: - - ```python - schema = pb.Schema( - company=pb.string_field(preset="company"), - domain=pb.string_field(preset="domain_name"), - industry_tag=pb.string_field(allowed=["tech", "finance", "health", "retail"]), - ) - - pb.preview(pb.generate_dataset(schema, n=20, seed=55)) - ``` - - -bool_field(p_true: 'float' = 0.5, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'BoolField' - - Create a boolean column specification for use in a schema. - - The `bool_field()` function defines the constraints and behavior for a boolean column when - generating synthetic data with `generate_dataset()`. The `p_true=` parameter controls the - probability of generating `True` values, which is useful for simulating real-world - distributions where events may be rare or common (e.g., 5% fraud rate, 80% active users). - - By default, `True` and `False` are equally likely (`p_true=0.5`). Setting `p_true=0.0` - produces all `False` values, and `p_true=1.0` produces all `True` values. - - Parameters - ---------- - p_true - Probability of generating `True`. Default is `0.5` (equal probability). - Must be between `0.0` and `1.0`. - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. 
Note that boolean columns can - only have 2 unique non-null values, so `n` must be `<= 2` when `unique=True` (or - `<= 3` with `nullable=True`). - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single boolean value. - - Returns - ------- - BoolField - A boolean field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `p_true` is not between `0.0` and `1.0`, or if `null_probability` is not between - `0.0` and `1.0`. - - Examples - -------- - The `p_true=` parameter controls the distribution of `True`/`False` values, allowing - you to simulate different probabilities: - - ```python - import pointblank as pb - - schema = pb.Schema( - is_active=pb.bool_field(p_true=0.8), - is_premium=pb.bool_field(p_true=0.2), - is_verified=pb.bool_field(), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - Optional boolean flags can be simulated by combining `nullable=True` with - `null_probability=`: - - ```python - schema = pb.Schema( - opted_in=pb.bool_field(p_true=0.6), - has_referral=pb.bool_field( - p_true=0.3, - nullable=True, null_probability=0.25, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - Boolean fields can be combined with other field types in a realistic schema: - - ```python - schema = pb.Schema( - user_id=pb.int_field(min_val=1, unique=True), - name=pb.string_field(preset="name"), - email_verified=pb.bool_field(p_true=0.9), - is_admin=pb.bool_field(p_true=0.05), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=10)) - ``` - - -date_field(min_date: 'str | date | None' = None, max_date: 'str | date | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'DateField' - - Create a date column specification for use in a schema. 
- - The `date_field()` function defines the constraints and behavior for a date column when - generating synthetic data with `generate_dataset()`. You can control the date range with - `min_date=` and `max_date=`, enforce uniqueness with `unique=True`, and introduce null - values with `nullable=True` and `null_probability=`. - - Dates are generated uniformly within the specified range. If no range is provided, the - default range is 2000-01-01 to 2030-12-31. Both `min_date=` and `max_date=` accept either - `datetime.date` objects or ISO 8601 date strings (e.g., `"2024-06-15"`). - - Parameters - ---------- - min_date - Minimum date (inclusive). Can be an ISO format string (e.g., `"2020-01-01"`) or a - `datetime.date` object. Default is `None` (defaults to `2000-01-01`). - max_date - Maximum date (inclusive). Can be an ISO format string (e.g., `"2024-12-31"`) or a - `datetime.date` object. Default is `None` (defaults to `2030-12-31`). - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. When `True`, the generator will - retry until it produces `n` distinct dates. Ensure the date range is large enough to - accommodate the requested number of unique dates. - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single `datetime.date` - value. - - Returns - ------- - DateField - A date field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `min_date` is later than `max_date`, or if a date string cannot be parsed. 
- - Examples - -------- - The `min_date=` and `max_date=` parameters accept `datetime.date` objects to define date - ranges: - - ```python - import pointblank as pb - from datetime import date - - schema = pb.Schema( - birth_date=pb.date_field( - min_date=date(1960, 1, 1), - max_date=date(2005, 12, 31), - ), - hire_date=pb.date_field( - min_date=date(2020, 1, 1), - max_date=date(2024, 12, 31), - ), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - For convenience, ISO format strings can be used instead of `date` objects: - - ```python - schema = pb.Schema( - event_date=pb.date_field(min_date="2024-01-01", max_date="2024-12-31"), - signup_date=pb.date_field(min_date="2023-06-01", max_date="2024-06-01"), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - We can introduce missing dates with `nullable=True` and enforce distinct values using - `unique=True`: - - ```python - schema = pb.Schema( - order_date=pb.date_field( - min_date="2024-01-01", max_date="2024-03-31", - unique=True, - ), - cancel_date=pb.date_field( - min_date="2024-01-01", max_date="2024-12-31", - nullable=True, null_probability=0.5, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=7)) - ``` - - -datetime_field(min_date: 'str | datetime | None' = None, max_date: 'str | datetime | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'DatetimeField' - - Create a datetime column specification for use in a schema. - - The `datetime_field()` function defines the constraints and behavior for a datetime column - when generating synthetic data with `generate_dataset()`. You can control the datetime range - with `min_date=` and `max_date=`, enforce uniqueness with `unique=True`, and introduce null - values with `nullable=True` and `null_probability=`. - - Datetime values are generated uniformly (at second-level resolution) within the specified - range. 
If no range is provided, the default range is 2000-01-01T00:00:00 to - 2030-12-31T23:59:59. Both `min_date=` and `max_date=` accept `datetime` objects, `date` - objects (which are converted to datetimes at midnight), or ISO 8601 datetime strings. - - Parameters - ---------- - min_date - Minimum datetime (inclusive). Can be an ISO format string (e.g., - `"2024-01-01T00:00:00"`), a `datetime.datetime` object, or a `datetime.date` object. - Default is `None` (defaults to `2000-01-01 00:00:00`). - max_date - Maximum datetime (inclusive). Can be an ISO format string, a `datetime.datetime` - object, or a `datetime.date` object. Default is `None` (defaults to - `2030-12-31 23:59:59`). - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. With second-level resolution - over a wide range, collisions are unlikely for moderate dataset sizes. - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single - `datetime.datetime` value. - - Returns - ------- - DatetimeField - A datetime field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `min_date` is later than `max_date`, or if a datetime string cannot be parsed. 
- - Examples - -------- - The `min_date=` and `max_date=` parameters accept `datetime` objects for precise range - definitions: - - ```python - import pointblank as pb - from datetime import datetime - - schema = pb.Schema( - created_at=pb.datetime_field( - min_date=datetime(2024, 1, 1), - max_date=datetime(2024, 12, 31), - ), - updated_at=pb.datetime_field( - min_date=datetime(2024, 6, 1), - max_date=datetime(2024, 12, 31), - ), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - For a quick setup, ISO format strings work just as well: - - ```python - schema = pb.Schema( - event_time=pb.datetime_field( - min_date="2024-03-01T08:00:00", - max_date="2024-03-01T18:00:00", - ), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=23)) - ``` - - Optional timestamps can be simulated with `nullable=True`, and datetime fields work - nicely alongside other field types: - - ```python - schema = pb.Schema( - order_id=pb.int_field(min_val=1000, max_val=9999, unique=True), - placed_at=pb.datetime_field( - min_date=datetime(2024, 1, 1), - max_date=datetime(2024, 12, 31), - ), - shipped_at=pb.datetime_field( - min_date=datetime(2024, 1, 2), - max_date=datetime(2025, 1, 15), - nullable=True, null_probability=0.3, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=7)) - ``` - - -time_field(min_time: 'str | time | None' = None, max_time: 'str | time | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'TimeField' - - Create a time column specification for use in a schema. - - The `time_field()` function defines the constraints and behavior for a time-of-day column - when generating synthetic data with `generate_dataset()`. You can control the time range - with `min_time=` and `max_time=`, enforce uniqueness with `unique=True`, and introduce null - values with `nullable=True` and `null_probability=`. 
- - Time values are generated uniformly (at second-level resolution) within the specified range. - If no range is provided, the default range is 00:00:00 to 23:59:59. Both `min_time=` and - `max_time=` accept `datetime.time` objects or ISO format time strings (e.g., `"09:30:00"`). - - Parameters - ---------- - min_time - Minimum time (inclusive). Can be an ISO format string (e.g., `"08:00:00"`) or a - `datetime.time` object. Default is `None` (defaults to `00:00:00`). - max_time - Maximum time (inclusive). Can be an ISO format string (e.g., `"17:30:00"`) or a - `datetime.time` object. Default is `None` (defaults to `23:59:59`). - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. With second-level resolution - within a time range, uniqueness is feasible for moderate dataset sizes. - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single - `datetime.time` value. - - Returns - ------- - TimeField - A time field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `min_time` is later than `max_time`, or if a time string cannot be parsed. 
- - Examples - -------- - The `min_time=` and `max_time=` parameters accept `datetime.time` objects, making it - easy to define business-hours ranges: - - ```python - import pointblank as pb - from datetime import time - - schema = pb.Schema( - start_time=pb.time_field( - min_time=time(9, 0, 0), - max_time=time(12, 0, 0), - ), - end_time=pb.time_field( - min_time=time(13, 0, 0), - max_time=time(17, 0, 0), - ), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - ISO format strings can also be used for convenience: - - ```python - schema = pb.Schema( - login_time=pb.time_field(min_time="06:00:00", max_time="23:59:59"), - alarm_time=pb.time_field(min_time="05:00:00", max_time="09:00:00"), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=23)) - ``` - - It's possible to introduce optional time values with `nullable=True` and combine them - with other field types: - - ```python - schema = pb.Schema( - employee_id=pb.int_field(min_val=100, max_val=999, unique=True), - check_in=pb.time_field(min_time="07:00:00", max_time="10:00:00"), - check_out=pb.time_field( - min_time="16:00:00", max_time="20:00:00", - nullable=True, null_probability=0.15, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=7)) - ``` - - -duration_field(min_duration: 'str | timedelta | None' = None, max_duration: 'str | timedelta | None' = None, nullable: 'bool' = False, null_probability: 'float' = 0.0, unique: 'bool' = False, generator: 'Callable[[], Any] | None' = None) -> 'DurationField' - - Create a duration column specification for use in a schema. - - The `duration_field()` function defines the constraints and behavior for a duration - (timedelta) column when generating synthetic data with `generate_dataset()`. You can - control the duration range with `min_duration=` and `max_duration=`, enforce uniqueness - with `unique=True`, and introduce null values with `nullable=True` and `null_probability=`. 
- - Duration values are generated uniformly (at second-level resolution) within the specified - range. If no range is provided, the default range is 0 seconds to 30 days. Both - `min_duration=` and `max_duration=` accept `datetime.timedelta` objects or colon-separated - strings in `"HH:MM:SS"` or `"MM:SS"` format. - - Parameters - ---------- - min_duration - Minimum duration (inclusive). Can be a `"HH:MM:SS"` or `"MM:SS"` string, or a - `datetime.timedelta` object. Default is `None` (defaults to 0 seconds). - max_duration - Maximum duration (inclusive). Can be a `"HH:MM:SS"` or `"MM:SS"` string, or a - `datetime.timedelta` object. Default is `None` (defaults to 30 days). - nullable - Whether the column can contain null values. Default is `False`. - null_probability - Probability of generating a null value for each row when `nullable=True`. Must be - between `0.0` and `1.0`. Default is `0.0`. - unique - Whether all values must be unique. Default is `False`. With second-level resolution - within a duration range, uniqueness is feasible for moderate dataset sizes. - generator - Custom callable that generates values. When provided, this overrides all other - constraints. The callable should take no arguments and return a single - `datetime.timedelta` value. - - Returns - ------- - DurationField - A duration field specification that can be passed to `Schema()`. - - Raises - ------ - ValueError - If `min_duration` is greater than `max_duration`, or if a duration string cannot be - parsed. 
- - Examples - -------- - The `min_duration=` and `max_duration=` parameters accept `timedelta` objects for - defining duration ranges: - - ```python - import pointblank as pb - from datetime import timedelta - - schema = pb.Schema( - session_length=pb.duration_field( - min_duration=timedelta(minutes=5), - max_duration=timedelta(hours=2), - ), - wait_time=pb.duration_field( - min_duration=timedelta(seconds=30), - max_duration=timedelta(minutes=15), - ), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - Colon-separated strings can also be used for quick duration definitions: - - ```python - schema = pb.Schema( - call_duration=pb.duration_field(min_duration="0:01:00", max_duration="1:30:00"), - break_time=pb.duration_field(min_duration="0:05:00", max_duration="0:30:00"), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=23)) - ``` - - Optional durations can be created with `nullable=True`, and duration fields work well - alongside other field types: - - ```python - schema = pb.Schema( - task_id=pb.int_field(min_val=1, max_val=500, unique=True), - time_spent=pb.duration_field( - min_duration=timedelta(minutes=1), - max_duration=timedelta(hours=8), - ), - overtime=pb.duration_field( - min_duration=timedelta(0), - max_duration=timedelta(hours=4), - nullable=True, null_probability=0.6, - ), - ) - - pb.preview(pb.generate_dataset(schema, n=30, seed=7)) - ``` - - -profile_fields(*, set: "Literal['minimal', 'standard', 'full']" = 'standard', split_name: 'bool' = True, include: 'list[str] | None' = None, exclude: 'list[str] | None' = None, prefix: 'str | None' = None) -> 'dict[str, StringField]' - - Create a dict of string field specifications representing a person profile. - - Returns a dictionary of `StringField` objects suitable for `**`-unpacking into a `Schema()`. 
- Each field uses a preset that participates in the existing coherence system, so generated - data will have coherent names, emails, addresses, and phone numbers within each row. - - Parameters - ---------- - set - The base set of profile fields to include. Options are `"minimal"` (name, email, phone; - 3-4 columns depending on `split_name=`), `"standard"` (name, email, city, state, - postcode, phone; 6-7 columns), and `"full"` (name, email, address, city, state, - postcode, phone, company, job; 9-10 columns). Default is `"standard"`. - split_name - Whether to split the name into separate `first_name` and `last_name` columns (`True`, - the default) or use a single combined `name` column (`False`). - include - List of additional preset names to add to the base set. For example, - `include=["company"]` adds a company column to the `"standard"` set. Presets already - in the base set are silently ignored. - exclude - List of preset names to remove from the (possibly augmented) set. For example, - `exclude=["postcode"]` removes the postcode column. Presets not in the set are silently - ignored. - prefix - Optional string to prepend to every column name. For example, `prefix="customer_"` - produces keys like `"customer_first_name"`, `"customer_email"`, etc. - - Returns - ------- - dict[str, StringField] - A dictionary mapping column names to `StringField` objects, ordered logically (name fields - first, then contact, address, phone, business). - - Raises - ------ - ValueError - If `set=` is not one of `"minimal"`, `"standard"`, or `"full"`; if `include=` or `exclude=` - contain unknown preset names; if a preset appears in both `include=` and `exclude=`; or if - `include=` contains name presets incompatible with the `split_name=` setting. - - Examples - -------- - The default call returns the `"standard"` set of profile columns. The `**` operator unpacks the - returned dictionary directly into `Schema()`, as if each `string_field()` call had been written - by hand. 
All coherence rules apply automatically: emails are derived from names, and - city/state/postcode/phone are internally consistent. - - ```python - import pointblank as pb - - schema = pb.Schema( - user_id=pb.int_field(unique=True), - **pb.profile_fields(), - ) - - pb.preview(pb.generate_dataset(schema, n=100, seed=23)) - ``` - - Use `set=` to control how many columns are generated. The `"minimal"` set includes only `name`, - `email`, and `phone`, while `"full"` adds `address`, `company`, and `job`. Setting - `split_name=False` collapses `first_name` and `last_name` into a single combined `name` column: - - ```python - schema = pb.Schema( - **pb.profile_fields(set="minimal", split_name=False), - balance=pb.float_field(min_val=0, max_val=10000), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - The `include=` and `exclude=` parameters let you customize the column set without switching to a - different base set. Here we start from the `"full"` set but drop the business columns: - - ```python - schema = pb.Schema( - **pb.profile_fields(set="full", exclude=["company", "job"]), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23, country="DE")) - ``` - - The `prefix=` parameter prepends a string to every column name, which is especially useful when - a schema needs two independent profiles (e.g., a sender and a recipient). Each prefixed group - maintains its own coherence: - - ```python - schema = pb.Schema( - **pb.profile_fields(set="minimal", prefix="sender_"), - **pb.profile_fields(set="minimal", prefix="recipient_"), - ) - - pb.preview(pb.generate_dataset(schema, n=50, seed=23)) - ``` - - - -## The Prebuilt Actions family - -The Prebuilt Actions group contains a function that can be used to -send a Slack notification when validation steps exceed failure threshold levels or just to provide a -summary of the validation results, including the status, number of steps, passing and failing steps, -table information, and timing details. 
- -send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' = None, summary_msg: 'str | None' = None, debug: 'bool' = False) -> 'Callable | None' - - Create a Slack notification function using a webhook URL. - - This function can be used in two ways: - - 1. With [`Actions`](`pointblank.Actions`) to notify about individual validation step failures - 2. With [`FinalActions`](`pointblank.FinalActions`) to provide a summary notification after all - validation steps have undergone interrogation - - The function creates a callable that sends notifications through a Slack webhook. Message - formatting can be customized using templates for both individual steps and summary reports. - - Parameters - ---------- - webhook_url - The Slack webhook URL. If `None` (and `debug=True`), a dry run is performed (see the - *Offline Testing* section below for information on this). - step_msg - Template string for step notifications. Some of the available variables include: `"{step}"`, - `"{column}"`, `"{value}"`, `"{type}"`, `"{time}"`, `"{level}"`, etc. See the *Available - Template Variables for Step Notifications* section below for more details. If not provided, - a default step message template will be used. - summary_msg - Template string for summary notifications. Some of the available variables are: - `"{n_steps}"`, `"{n_passing_steps}"`, `"{n_failing_steps}"`, `"{all_passed}"`, - `"{highest_severity}"`, etc. See the *Available Template Variables for Summary - Notifications* section below for more details. If not provided, a default summary message - template will be used. - debug - Print debug information if `True`. This includes the message content and the response from - Slack. This is useful for testing and debugging the notification function. If `webhook_url` - is `None`, the function will print the message to the console instead of sending it to - Slack. This is useful for debugging and ensuring that your templates are formatted - correctly. 
- - Returns - ------- - Callable - A function that sends notifications to Slack. - - Available Template Variables for Step Notifications - --------------------------------------------------- - When creating a custom template for validation step alerts (`step_msg=`), the following - templating strings can be used: - - - `"{step}"`: The step number. - - `"{column}"`: The column name. - - `"{value}"`: The value being compared (only available in certain validation steps). - - `"{type}"`: The assertion type (e.g., `"col_vals_gt"`, etc.). - - `"{level}"`: The severity level (`"warning"`, `"error"`, or `"critical"`). - - `"{level_num}"`: The severity level as a numeric value (`30`, `40`, or `50`). - - `"{autobrief}"`: A localized and brief statement of the expectation for the step. - - `"{failure_text}"`: Localized text that explains how the validation step failed. - - `"{time}"`: The time of the notification. - - Here's an example of how to construct a `step_msg=` template: - - ```python - step_msg = '''🚨 *Validation Step Alert* - • Step Number: {step} - • Column: {column} - • Test Type: {type} - • Value Tested: {value} - • Severity: {level} (level {level_num}) - • Brief: {autobrief} - • Details: {failure_text} - • Time: {time}''' - ``` - - This template will be filled with the relevant information when a validation step fails. The - placeholders will be replaced with actual values when the Slack notification is sent. - - Available Template Variables for Summary Notifications - ------------------------------------------------------ - When creating a custom template for a validation summary (`summary_msg=`), the following - templating strings can be used: - - - `"{n_steps}"`: The total number of validation steps. - - `"{n_passing_steps}"`: The number of validation steps where all test units passed. - - `"{n_failing_steps}"`: The number of validation steps that had some failing test units. - - `"{n_warning_steps}"`: The number of steps that exceeded a 'warning' threshold. 
- - `"{n_error_steps}"`: The number of steps that exceeded an 'error' threshold. - - `"{n_critical_steps}"`: The number of steps that exceeded a 'critical' threshold. - - `"{all_passed}"`: Whether or not every validation step had no failing test units. - - `"{highest_severity}"`: The highest severity level encountered during validation. This can be - one of the following: `"warning"`, `"error"`, `"critical"`, `"some failing"`, or - `"all passed"`. - - `"{tbl_row_count}"`: The number of rows in the target table. - - `"{tbl_column_count}"`: The number of columns in the target table. - - `"{tbl_name}"`: The name of the target table. - - `"{validation_duration}"`: The duration of the validation in seconds. - - `"{time}"`: The time of the notification. - - Here's an example of how to put together a `summary_msg=` template: - - ```python - summary_msg = '''📊 *Validation Summary Report* - *Overview* - • Status: {highest_severity} - • All Passed: {all_passed} - • Total Steps: {n_steps} - - *Step Results* - • Passing Steps: {n_passing_steps} - • Failing Steps: {n_failing_steps} - • Warning Level: {n_warning_steps} - • Error Level: {n_error_steps} - • Critical Level: {n_critical_steps} - - *Table Info* - • Table Name: {tbl_name} - • Row Count: {tbl_row_count} - • Column Count: {tbl_column_count} - - *Timing* - • Duration: {validation_duration}s - • Completed: {time}''' - ``` - - This template will be filled with the relevant information when the validation summary is - generated. The placeholders will be replaced with actual values when the Slack notification is - sent. - - Offline Testing - --------------- - If you want to test the function without sending actual notifications, you can leave the - `webhook_url=` as `None` and set `debug=True`. This will print the message to the console - instead of sending it to Slack. This is useful for debugging and ensuring that your templates - are formatted correctly. 
Furthermore, the function could be run globally (i.e., outside of the - context of a validation plan) to show the message templates with all possible variables. Here's - an example of how to do this: - - ```python - import pointblank as pb - - # Create a Slack notification function - notify_slack = pb.send_slack_notification( - webhook_url=None, # Leave as None for dry run - debug=True, # Enable debug mode to print message previews - ) - # Call the function to see the message previews - notify_slack() - ``` - - This will print the step and summary message previews to the console, allowing you to see how - the templates will look when filled with actual data. You can then adjust your templates as - needed before using them in a real validation plan. - - When `step_msg=` and `summary_msg=` are not provided, the function will use default templates. - However, you can customize the templates to include additional information or change the format - to better suit your needs. Iterating on the templates can help you create more informative and - visually appealing messages. Here's an example of that: - - ```python - import pointblank as pb - - # Create a Slack notification function with custom templates - notify_slack = pb.send_slack_notification( - webhook_url=None, # Leave as None for dry run - step_msg='''*Data Validation Alert* - • Type: {type} - • Level: {level} - • Step: {step} - • Column: {column} - • Time: {time}''', - summary_msg='''*Data Validation Summary* - • Highest Severity: {highest_severity} - • Total Steps: {n_steps} - • Failed Steps: {n_failing_steps} - • Time: {time}''', - debug=True, # Enable debug mode to print message previews - ) - ``` - - These templates will be used with sample data when the function is called. The combination of - `webhook_url=None` and `debug=True` allows you to test your custom templates without having to - send actual notifications to Slack. 
- - Examples - -------- - When using an action with one or more validation steps, you typically provide callables that - fire when a matched threshold of failed test units is exceeded. The callable can be - a function or a lambda. The `send_slack_notification()` function creates a callable that sends - a Slack notification when the validation step fails. Here is how it can be set up to work for - multiple validation steps by using [`Actions`](`pointblank.Actions`): - - ```python - import pointblank as pb - - # Create a Slack notification function - notify_slack = pb.send_slack_notification( - webhook_url="https://hooks.slack.com/services/your/webhook/url" - ) - # Create a validation plan - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions(critical=notify_slack), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() - ) - - validation - ``` - - By placing the `notify_slack()` function in the `Validate(actions=Actions(critical=))` argument, - you can ensure that the notification is sent whenever the 'critical' threshold is reached (as - set here, when 15% or more of the test units fail). The notification will include information - about the validation step that triggered the alert. - - When using a [`FinalActions`](`pointblank.FinalActions`) object, the notification will be sent - after all validation steps have been completed. This is useful for providing a summary of the - validation process. 
Here is an example of how to set up a summary notification: - - ```python - import pointblank as pb - - # Create a Slack notification function - notify_slack = pb.send_slack_notification( - webhook_url="https://hooks.slack.com/services/your/webhook/url" - ) - # Create a validation plan - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - final_actions=pb.FinalActions(notify_slack), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() - ) - ``` - - In this case, the same `notify_slack()` function is used, but it is placed in - `Validate(final_actions=FinalActions())`. This results in the summary notification being sent - after all validation steps are completed, regardless of whether any steps failed or not. - - This simplicity is possible because the `send_slack_notification()` function creates a callable - that can be used in both contexts. The function will automatically determine whether to send a - step notification or a summary notification based on the context in which it is called. - - We can customize the message templates for both step and summary notifications. In that way, - it's possible to create a more informative and visually appealing message. For example, we can - use Markdown formatting to make the message more readable and visually appealing. 
Here is an - example of how to customize the templates: - - ```python - import pointblank as pb - # Create a Slack notification function - - notify_slack = pb.send_slack_notification( - webhook_url="https://hooks.slack.com/services/your/webhook/url", - step_msg=''' - 🚨 *Validation Step Alert* - • Step Number: {step} - • Column: {column} - • Test Type: {type} - • Value Tested: {value} - • Severity: {level} (level {level_num}) - • Brief: {autobrief} - • Details: {failure_text} - • Time: {time}''', - summary_msg=''' - 📊 *Validation Summary Report* - *Overview* - • Status: {highest_severity} - • All Passed: {all_passed} - • Total Steps: {n_steps} - - *Step Results* - • Passing Steps: {n_passing_steps} - • Failing Steps: {n_failing_steps} - • Warning Level: {n_warning_steps} - • Error Level: {n_error_steps} - • Critical Level: {n_critical_steps} - - *Table Info* - • Table Name: {tbl_name} - • Row Count: {tbl_row_count} - • Column Count: {tbl_column_count} - - *Timing* - • Duration: {validation_duration}s - • Completed: {time}''', - ) - - # Create a validation plan - validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions(default=notify_slack), - final_actions=pb.FinalActions(notify_slack), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() - ) - ``` - - In this example, we have customized the templates for both step and summary notifications. The - step notification includes details about the validation step, including the step number, column - name, test type, value tested, severity level, brief description, and time of the notification. 
- The summary notification includes an overview of the validation process, including the status, - number of steps, passing and failing steps, table information, and timing details. - - - - ----------------------------------------------------------------------- -This is a set of examples for the Pointblank library. ----------------------------------------------------------------------- - -### Starter Validation (https://posit-dev.github.io/pointblank/demos/01-starter/) - -A validation with the basics - -```python -import pointblank as pb - -validation = ( - pb.Validate( # Use pb.Validate to start - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="A starter validation" - ) - .col_vals_gt(columns="d", value=1000) # STEP 1 | - .col_vals_le(columns="c", value=5) # STEP 2 | <-- Build up a validation plan - .col_exists(columns=["date", "date_time"]) # STEP 3 | - .interrogate() # This will execute all validation steps and collect intel -) - -validation -``` - -### Advanced Validation (https://posit-dev.github.io/pointblank/demos/02-advanced/) - -A validation with a comprehensive set of rules - -```python -import pointblank as pb -import polars as pl - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), - tbl_name="game_revenue", - label="Comprehensive validation example", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35), - ) - .col_vals_regex(columns="player_id", pattern=r"^[A-Z]{12}[0-9]{3}$") # STEP 1 - .col_vals_gt(columns="session_duration", value=5) # STEP 2 - .col_vals_ge(columns="item_revenue", value=0.02) # STEP 3 - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) # STEP 4 - .col_vals_in_set( # STEP 5 - columns="acquisition", - set=["google", "facebook", "organic", "crosspromo", "other_campaign"] - ) - .col_vals_not_in_set(columns="country", set=["Mongolia", "Germany"]) # STEP 6 - .col_vals_between( # STEP 7 - columns="session_duration", - 
left=10, right=50, - pre = lambda df: df.select(pl.median("session_duration")) - ) - .rows_distinct(columns_subset=["player_id", "session_id", "time"]) # STEP 8 - .row_count_match(count=2000) # STEP 9 - .col_count_match(count=11) # STEP 10 - .col_vals_not_null(columns=pb.starts_with("item")) # STEPS 11-13 - .col_exists(columns="start_day") # STEP 14 - .interrogate() -) - -validation -``` - -### Data Extracts (https://posit-dev.github.io/pointblank/demos/03-data-extracts/) - -Pulling out data extracts that highlight rows with validation failures - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue"), - tbl_name="game_revenue", - label="Validation with test unit failures available as an extract" - ) - .col_vals_gt(columns="item_revenue", value=0) # STEP 1: no test unit failures - .col_vals_ge(columns="session_duration", value=5) # STEP 2: 14 test unit failures -> extract - .interrogate() -) -``` - -```python -pb.preview(validation.get_data_extracts(i=2, frame=True), n_head=20, n_tail=20) -``` - -### Sundered Data (https://posit-dev.github.io/pointblank/demos/04-sundered-data/) - -Splitting your data into 'pass' and 'fail' subsets - -```python -import pointblank as pb -import polars as pl - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), - tbl_name="small_table", - label="Sundering Data" - ) - .col_vals_gt(columns="d", value=1000) - .col_vals_le(columns="c", value=5) - .interrogate() -) - -validation -``` - -```python -pb.preview(validation.get_sundered_data(type="pass")) -``` - -### Step Report: Column Data Checks (https://posit-dev.github.io/pointblank/demos/05-step-report-column-check/) - -A step report for column checks shows what went wrong - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table"), - tbl_name="small_table", - label="Step reports for column data checks" - ) - 
.col_vals_ge(columns="c", value=4, na_pass=True) # has failing test units - .col_vals_regex(columns="b", pattern=r"\d-[a-z]{3}-\d{3}") # no failing test units - .interrogate() -) - -validation -``` - -```python -validation.get_step_report(i=1) -``` - -```python -validation.get_step_report(i=2) -``` - -### Step Report: Schema Check (https://posit-dev.github.io/pointblank/demos/06-step-report-schema-check/) - -When a schema doesn't match, a step report gives you the details - -```python -import pointblank as pb - -# Create a schema for the target table (`small_table` as a DuckDB table) -schema = pb.Schema( - columns=[ - ("date_time", "timestamp"), # this dtype doesn't match - ("dates", "date"), # this column name doesn't match - ("a", "int64"), - ("b",), # omit dtype to not check for it - ("c",), # "" "" "" "" - ("d", "float64"), - ("e", ["bool", "boolean"]), # try several dtypes (second one matches) - ("f", "str"), # this dtype doesn't match - ] -) - -# Use the `col_schema_match()` validation method to perform a schema check -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="duckdb"), - tbl_name="small_table", - label="Step report for a schema check" - ) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` - -```python -validation.get_step_report(i=1) -``` - -### Apply Validation Rules to Multiple Columns (https://posit-dev.github.io/pointblank/demos/apply-checks-to-several-columns/) - -Create multiple validation steps by using a list of column names with `columns=` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_ge(columns=["a", "c", "d"], value=0) # check values in 'a', 'c', and 'd' - .col_exists(columns=["date_time", "date"]) # check for the existence of two columns - .interrogate() -) - -validation -``` - -### Verifying Row and Column Counts 
(https://posit-dev.github.io/pointblank/demos/check-row-column-counts/) - -Check the dimensions of the table with the `*_count_match()` validation methods - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") - ) - .col_count_match(count=11) # expect 11 columns in the table - .row_count_match(count=2000) # expect 2,000 rows in the table - .row_count_match(count=0, inverse=True) # expect that the table has rows - .col_count_match( # compare column count against - count=pb.load_dataset( # that of another table - dataset="game_revenue", tbl_type="pandas" - ) - ) - .interrogate() -) - -validation -``` - -### Checks for Missing Values (https://posit-dev.github.io/pointblank/demos/checks-for-missing/) - -Perform validations that check whether missing/NA/Null values are present - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_not_null(columns="a") # expect no Null values - .col_vals_not_null(columns="b") # "" "" - .col_vals_not_null(columns="c") # "" "" - .col_vals_not_null(columns="d") # "" "" - .col_vals_null(columns="a") # expect all values to be Null - .interrogate() -) - -validation -``` - -### Custom Expression for Checking Column Values (https://posit-dev.github.io/pointblank/demos/col-vals-custom-expr/) - -A column expression can be used to check column values. 
Just use `col_vals_expr()` for this - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas") - ) - .col_vals_expr(expr=lambda df: (df["d"] % 1 != 0) & (df["a"] < 10)) # Pandas column expr - .interrogate() -) - -validation -``` - -### Column Selector Functions: Easily Pick Columns (https://posit-dev.github.io/pointblank/demos/column-selector-functions/) - -Use column selector functions in the `columns=` argument to conveniently choose columns - -```python -import pointblank as pb -import narwhals.selectors as ncs - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") - ) - .col_vals_ge( - columns=pb.matches("rev|dur"), # check values in columns having 'rev' or 'dur' in name - value=0 - ) - .col_vals_regex( - columns=pb.ends_with("_id"), # check values in columns with names ending in '_id' - pattern=r"^[A-Z]{12}\d{3}" - ) - .col_vals_not_null( - columns=pb.last_n(2) # check that the last two columns don't have Null values - ) - .col_vals_regex( - columns=ncs.string(), # check that all string columns are non-empty strings - pattern=r"(.|\s)*\S(.|\s)*" - ) - .interrogate() -) - -validation -``` - -### Comparison Checks Across Columns (https://posit-dev.github.io/pointblank/demos/comparisons-across-columns/) - -Perform comparisons of values in columns to values in other columns - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_lt(columns="a", value=pb.col("c")) # values in 'a' < values in 'c' - .col_vals_between( - columns="d", # values in 'd' are between values - left=pb.col("c"), # in 'c' and the fixed value of 12,000; - right=12000, # any missing values encountered result - na_pass=True # in a passing test unit - ) - .interrogate() -) - -validation -``` - -### Expect No Duplicate Rows 
(https://posit-dev.github.io/pointblank/demos/expect-no-duplicate-rows/) - -We can check for duplicate rows in the table with `rows_distinct()` - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .rows_distinct() # expect no duplicate rows - .interrogate() -) - -validation -``` - -### Checking for Duplicate Values (https://posit-dev.github.io/pointblank/demos/expect-no-duplicate-values/) - -To check for duplicate values down a column, use `rows_distinct()` with a `columns_subset=` value - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .rows_distinct(columns_subset="b") # expect no duplicate values in 'b' - .interrogate() -) - -validation -``` - -### Expectations with a Text Pattern (https://posit-dev.github.io/pointblank/demos/expect-text-pattern/) - -With the `col_vals_regex()`, check for conformance to a regular expression - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_regex(columns="b", pattern=r"^\d-[a-z]{3}-\d{3}$") # check pattern in 'b' - .col_vals_regex(columns="f", pattern=r"high|low|mid") # check pattern in 'f' - .interrogate() -) - -validation -``` - -### Set Failure Threshold Levels (https://posit-dev.github.io/pointblank/demos/failure-thresholds/) - -Set threshold levels to better gauge adverse data quality - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds( # setting relative threshold defaults for all steps - warning=0.05, # 5% failing test units: warning threshold (gray) - error=0.10, # 10% failed test units: error threshold (yellow) - critical=0.15 # 15% failed test units: critical threshold (red) - ), - ) - .col_vals_in_set(columns="item_type", 
set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", - value=4, - thresholds=(5, 10, 20) # setting absolute thresholds for *this* step (W, E, C) - ) - .col_exists(columns="end_day") - .interrogate() -) - -validation -``` - -### Mutate the Table in a Validation Step (https://posit-dev.github.io/pointblank/demos/mutate-table-in-step/) - -For far more specialized validations, modify the table with the `pre=` argument before checking it - -```python -import pointblank as pb -import polars as pl -import narwhals as nw - -# Define preprocessing functions -def get_median_a(df): - """Use a Polars expression to aggregate column `a`.""" - return df.select(pl.median("a")) - -def add_b_length_column(df): - """Use Narwhals to add a string length column `b_len`.""" - return ( - nw.from_native(df) - .with_columns(b_len=nw.col("b").str.len_chars()) - ) - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_between( - columns="a", - left=3, right=6, - pre=get_median_a - ) - .col_vals_eq( - columns="b_len", - value=9, - pre=add_b_length_column - ) - .interrogate() -) - -validation -``` - -### Numeric Comparisons (https://posit-dev.github.io/pointblank/demos/numeric-comparisons/) - -Perform comparisons of values in columns to fixed values - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_gt(columns="d", value=1000) # values in 'd' > 1000 - .col_vals_lt(columns="d", value=10000) # values in 'd' < 10000 - .col_vals_ge(columns="a", value=1) # values in 'a' >= 1 - .col_vals_le(columns="c", value=5) # values in 'c' <= 5 - .col_vals_ne(columns="a", value=7) # values in 'a' not equal to 7 - .col_vals_between(columns="c", left=0, right=15) # 0 <= 'c' values <= 15 - .interrogate() -) - -validation -``` - 
-### Check the Schema of a Table (https://posit-dev.github.io/pointblank/demos/schema-check/) - -The schema of a table can be flexibly defined with `Schema` and verified with `col_schema_match()` - -```python -import pointblank as pb -import polars as pl - -tbl = pl.DataFrame( - { - "a": ["apple", "banana", "cherry", "date"], - "b": [1, 6, 3, 5], - "c": [1.1, 2.2, 3.3, 4.4], - } -) - -# Use the Schema class to define the column schema as loosely or rigorously as required -schema = pb.Schema( - columns=[ - ("a", "String"), # Column 'a' has dtype 'String' - ("b", ["Int", "Int64"]), # Column 'b' has dtype 'Int' or 'Int64' - ("c", ) # Column 'c' follows 'b' but we don't specify a dtype here - ] -) - -# Use the `col_schema_match()` validation method to perform the schema check -validation = ( - pb.Validate(data=tbl) - .col_schema_match(schema=schema) - .interrogate() -) - -validation -``` - -### Set Membership (https://posit-dev.github.io/pointblank/demos/set-membership/) - -Perform validations that check whether values are part of a set (or *not* part of one) - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) # part of this set - .col_vals_not_in_set(columns="f", set=["zero", "infinity"]) # not part of this set - .interrogate() -) - -validation -``` - -### Using Parquet Data (https://posit-dev.github.io/pointblank/demos/using-parquet-data/) - -A Parquet dataset can be used for data validation, thanks to Ibis - -```python -import pointblank as pb -import ibis - -game_revenue = ibis.read_parquet("data/game_revenue.parquet") - -validation = ( - pb.Validate(data=game_revenue, label="Example using a Parquet dataset.") - .col_vals_lt(columns="item_revenue", value=200) - .col_vals_gt(columns="item_revenue", value=0) - .col_vals_gt(columns="session_duration", value=5) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - 
.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .interrogate() -) - -validation -``` - diff --git a/docs/llms.txt b/docs/llms.txt deleted file mode 100644 index b22f7969b3..0000000000 --- a/docs/llms.txt +++ /dev/null @@ -1,166 +0,0 @@ -# Pointblank - -## Docs - -### Examples - -- [Starter Validation](https://posit-dev.github.io/pointblank/demos/01-starter/): A validation with the basics. -- [Advanced Validation](https://posit-dev.github.io/pointblank/demos/02-advanced/): A validation with a comprehensive set of rules. -- [Data Extracts](https://posit-dev.github.io/pointblank/demos/03-data-extracts/): Pulling out data extracts that highlight rows with validation failures. -- [Sundered Data](https://posit-dev.github.io/pointblank/demos/04-sundered-data/): Splitting your data into 'pass' and 'fail' subsets. -- [Step Reports for Column Data Checks](https://posit-dev.github.io/pointblank/demos/05-step-report-column-check/): A step report for column checks shows what went wrong. -- [Step Report for a Schema Check](https://posit-dev.github.io/pointblank/demos/06-step-report-schema-check/): When a schema doesn't match, a step report gives you the details. -- [Step-Level Actions](https://posit-dev.github.io/pointblank/demos/07-validation-with-actions/): Configure actions to trigger when validation thresholds are exceeded, such as logging warnings or errors. -- [Final Actions](https://posit-dev.github.io/pointblank/demos/08-validation-with-final-actions/): Execute actions after validation completes, such as sending alerts or generating summary reports. -- [Numeric Comparisons](https://posit-dev.github.io/pointblank/demos/numeric-comparisons/): Perform comparisons of values in columns to fixed values. -- [Comparison Checks Across Columns](https://posit-dev.github.io/pointblank/demos/comparisons-across-columns/): Perform comparisons of values in columns to values in other columns. 
-- [Apply Validation Rules to Multiple Columns](https://posit-dev.github.io/pointblank/demos/apply-checks-to-several-columns/): Create multiple validation steps by using a list of column names with `columns=`. -- [Checks for Missing Values](https://posit-dev.github.io/pointblank/demos/checks-for-missing/): Perform validations that check whether missing/NA/Null values are present. -- [Expectations with a Text Pattern](https://posit-dev.github.io/pointblank/demos/expect-text-pattern/): With `col_vals_regex()`, check for conformance to a regular expression. -- [Set Membership](https://posit-dev.github.io/pointblank/demos/set-membership/): Perform validations that check whether values are part of a set (or *not* part of one). -- [Expect No Duplicate Rows](https://posit-dev.github.io/pointblank/demos/expect-no-duplicate-rows/): We can check for duplicate rows in the table with `rows_distinct()`. -- [Checking for Duplicate Values](https://posit-dev.github.io/pointblank/demos/expect-no-duplicate-values/): To check for duplicate values down a column, use `rows_distinct()` with a `columns_subset=` value. -- [Custom Expression for Checking Column Values](https://posit-dev.github.io/pointblank/demos/col-vals-custom-expr/): A column expression can be used to check column values. Just use `col_vals_expr()` for this. -- [Mutate the Table in a Validation Step](https://posit-dev.github.io/pointblank/demos/mutate-table-in-step/): For far more specialized validations, modify the table with the `pre=` argument before checking it. -- [Verifying Row and Column Counts](https://posit-dev.github.io/pointblank/demos/check-row-column-counts/): Check the dimensions of the table with the `*_count_match()` validation methods. -- [Validating Data Freshness](https://posit-dev.github.io/pointblank/demos/check-for-freshness/): Use date-based validations to ensure your data is current and recent. 
-- [Date and Datetime Validations](https://posit-dev.github.io/pointblank/demos/datetime-validations/): Comprehensive examples of date, datetime, and timezone-aware datetime comparisons. -- [Custom Validation with `specially()`](https://posit-dev.github.io/pointblank/demos/custom-validation-specially/): Create bespoke validations using `specially()` to implement domain-specific business rules. -- [Set Failure Threshold Levels](https://posit-dev.github.io/pointblank/demos/failure-thresholds/): Set threshold levels to better gauge adverse data quality. -- [Column Selector Functions: Easily Pick Columns](https://posit-dev.github.io/pointblank/demos/column-selector-functions/): Use column selector functions in the `columns=` argument to conveniently choose columns. -- [Check the Schema of a Table](https://posit-dev.github.io/pointblank/demos/schema-check/): The schema of a table can be flexibly defined with `Schema` and verified with `col_schema_match()`. -- [Using Parquet Data](https://posit-dev.github.io/pointblank/demos/using-parquet-data/): A Parquet dataset can be used for data validation, thanks to Ibis. -- [CLI Interactive Demos](https://posit-dev.github.io/pointblank/demos/cli-interactive/): These CLI demos showcase practical data quality workflows that you can use! - -### API Reference - -- [Validate](https://posit-dev.github.io/pointblank/reference/Validate.html): Workflow for defining a set of validations on a table and interrogating for results. -- [Thresholds](https://posit-dev.github.io/pointblank/reference/Thresholds.html): Definition of threshold values. -- [Actions](https://posit-dev.github.io/pointblank/reference/Actions.html): Definition of action values. -- [FinalActions](https://posit-dev.github.io/pointblank/reference/FinalActions.html): Define actions to be taken after validation is complete. -- [Schema](https://posit-dev.github.io/pointblank/reference/Schema.html): Definition of a schema object. 
-- [DraftValidation](https://posit-dev.github.io/pointblank/reference/DraftValidation.html): Draft a validation plan for a given table using an LLM. -- [Validate.col_vals_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_gt.html): Are column data greater than a fixed value or data in another column? -- [Validate.col_vals_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_lt.html): Are column data less than a fixed value or data in another column? -- [Validate.col_vals_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_ge.html): Are column data greater than or equal to a fixed value or data in another column? -- [Validate.col_vals_le](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_le.html): Are column data less than or equal to a fixed value or data in another column? -- [Validate.col_vals_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_eq.html): Are column data equal to a fixed value or data in another column? -- [Validate.col_vals_ne](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_ne.html): Are column data not equal to a fixed value or data in another column? -- [Validate.col_vals_between](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_between.html): Do column data lie between two specified values or data in other columns? -- [Validate.col_vals_outside](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_outside.html): Do column data lie outside of two specified values or data in other columns? -- [Validate.col_vals_in_set](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_in_set.html): Validate whether column values are in a set of values. -- [Validate.col_vals_not_in_set](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_not_in_set.html): Validate whether column values are not in a set of values. 
-- [Validate.col_vals_increasing](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_increasing.html): Are column data increasing by row? -- [Validate.col_vals_decreasing](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_decreasing.html): Are column data decreasing by row? -- [Validate.col_vals_null](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_null.html): Validate whether values in a column are Null. -- [Validate.col_vals_not_null](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_not_null.html): Validate whether values in a column are not Null. -- [Validate.col_vals_regex](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_regex.html): Validate whether column values match a regular expression pattern. -- [Validate.col_vals_within_spec](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_within_spec.html): Validate whether column values fit within a specification. -- [Validate.col_vals_expr](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_expr.html): Validate column values using a custom expression. -- [Validate.rows_distinct](https://posit-dev.github.io/pointblank/reference/Validate.rows_distinct.html): Validate whether rows in the table are distinct. -- [Validate.rows_complete](https://posit-dev.github.io/pointblank/reference/Validate.rows_complete.html): Validate whether row data are complete by having no missing values. -- [Validate.col_exists](https://posit-dev.github.io/pointblank/reference/Validate.col_exists.html): Validate whether one or more columns exist in the table. -- [Validate.col_pct_null](https://posit-dev.github.io/pointblank/reference/Validate.col_pct_null.html): Validate whether a column has a specific percentage of Null values. -- [Validate.col_schema_match](https://posit-dev.github.io/pointblank/reference/Validate.col_schema_match.html): Do columns in the table (and their types) match a predefined schema? 
-- [Validate.row_count_match](https://posit-dev.github.io/pointblank/reference/Validate.row_count_match.html): Validate whether the row count of the table matches a specified count. -- [Validate.col_count_match](https://posit-dev.github.io/pointblank/reference/Validate.col_count_match.html): Validate whether the column count of the table matches a specified count. -- [Validate.tbl_match](https://posit-dev.github.io/pointblank/reference/Validate.tbl_match.html): Validate whether the target table matches a comparison table. -- [Validate.conjointly](https://posit-dev.github.io/pointblank/reference/Validate.conjointly.html): Perform multiple row-wise validations for joint validity. -- [Validate.specially](https://posit-dev.github.io/pointblank/reference/Validate.specially.html): Perform a specialized validation with customized logic. -- [Validate.prompt](https://posit-dev.github.io/pointblank/reference/Validate.prompt.html): Validate rows using AI/LLM-powered analysis. -- [Validate.col_sum_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_eq.html): Does the column sum satisfy an equal to comparison? -- [Validate.col_sum_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_gt.html): Does the column sum satisfy a greater than comparison? -- [Validate.col_sum_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_ge.html): Does the column sum satisfy a greater than or equal to comparison? -- [Validate.col_sum_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_lt.html): Does the column sum satisfy a less than comparison? -- [Validate.col_sum_le](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_le.html): Does the column sum satisfy a less than or equal to comparison? -- [Validate.col_avg_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_eq.html): Does the column average satisfy an equal to comparison? 
-- [Validate.col_avg_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_gt.html): Does the column average satisfy a greater than comparison? -- [Validate.col_avg_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_ge.html): Does the column average satisfy a greater than or equal to comparison? -- [Validate.col_avg_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_lt.html): Does the column average satisfy a less than comparison? -- [Validate.col_avg_le](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_le.html): Does the column average satisfy a less than or equal to comparison? -- [Validate.col_sd_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_eq.html): Does the column standard deviation satisfy an equal to comparison? -- [Validate.col_sd_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_gt.html): Does the column standard deviation satisfy a greater than comparison? -- [Validate.col_sd_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_ge.html): Does the column standard deviation satisfy a greater than or equal to comparison? -- [Validate.col_sd_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_lt.html): Does the column standard deviation satisfy a less than comparison? -- [Validate.col_sd_le](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_le.html): Does the column standard deviation satisfy a less than or equal to comparison? -- [col](https://posit-dev.github.io/pointblank/reference/col.html): Helper function for referencing a column in the input table. -- [starts_with](https://posit-dev.github.io/pointblank/reference/starts_with.html): Select columns that start with specified text. -- [ends_with](https://posit-dev.github.io/pointblank/reference/ends_with.html): Select columns that end with specified text. 
-- [contains](https://posit-dev.github.io/pointblank/reference/contains.html): Select columns that contain specified text. -- [matches](https://posit-dev.github.io/pointblank/reference/matches.html): Select columns that match a specified regular expression pattern. -- [everything](https://posit-dev.github.io/pointblank/reference/everything.html): Select all columns. -- [first_n](https://posit-dev.github.io/pointblank/reference/first_n.html): Select the first `n` columns in the column list. -- [last_n](https://posit-dev.github.io/pointblank/reference/last_n.html): Select the last `n` columns in the column list. -- [expr_col](https://posit-dev.github.io/pointblank/reference/expr_col.html): Create a column expression for use in `conjointly()` validation. -- [seg_group](https://posit-dev.github.io/pointblank/reference/seg_group.html): Group together values for segmentation. -- [Validate.interrogate](https://posit-dev.github.io/pointblank/reference/Validate.interrogate.html): Execute each validation step against the table and store the results. -- [Validate.set_tbl](https://posit-dev.github.io/pointblank/reference/Validate.set_tbl.html): Set or replace the table associated with the Validate object. -- [Validate.get_tabular_report](https://posit-dev.github.io/pointblank/reference/Validate.get_tabular_report.html): Validation report as a GT table. -- [Validate.get_step_report](https://posit-dev.github.io/pointblank/reference/Validate.get_step_report.html): Get a detailed report for a single validation step. -- [Validate.get_json_report](https://posit-dev.github.io/pointblank/reference/Validate.get_json_report.html): Get a report of the validation results as a JSON-formatted string. -- [Validate.get_sundered_data](https://posit-dev.github.io/pointblank/reference/Validate.get_sundered_data.html): Get the data that passed or failed the validation steps. 
-- [Validate.get_data_extracts](https://posit-dev.github.io/pointblank/reference/Validate.get_data_extracts.html): Get the rows that failed for each validation step. -- [Validate.all_passed](https://posit-dev.github.io/pointblank/reference/Validate.all_passed.html): Determine if every validation step passed perfectly, with no failing test units. -- [Validate.assert_passing](https://posit-dev.github.io/pointblank/reference/Validate.assert_passing.html): Raise an `AssertionError` if all tests are not passing. -- [Validate.assert_below_threshold](https://posit-dev.github.io/pointblank/reference/Validate.assert_below_threshold.html): Raise an `AssertionError` if validation steps exceed a specified threshold level. -- [Validate.above_threshold](https://posit-dev.github.io/pointblank/reference/Validate.above_threshold.html): Check if any validation steps exceed a specified threshold level. -- [Validate.n](https://posit-dev.github.io/pointblank/reference/Validate.n.html): Provides a dictionary of the number of test units for each validation step. -- [Validate.n_passed](https://posit-dev.github.io/pointblank/reference/Validate.n_passed.html): Provides a dictionary of the number of test units that passed for each validation step. -- [Validate.n_failed](https://posit-dev.github.io/pointblank/reference/Validate.n_failed.html): Provides a dictionary of the number of test units that failed for each validation step. -- [Validate.f_passed](https://posit-dev.github.io/pointblank/reference/Validate.f_passed.html): Provides a dictionary of the fraction of test units that passed for each validation step. -- [Validate.f_failed](https://posit-dev.github.io/pointblank/reference/Validate.f_failed.html): Provides a dictionary of the fraction of test units that failed for each validation step. -- [Validate.warning](https://posit-dev.github.io/pointblank/reference/Validate.warning.html): Get the 'warning' level status for each validation step. 
-- [Validate.error](https://posit-dev.github.io/pointblank/reference/Validate.error.html): Get the 'error' level status for each validation step. -- [Validate.critical](https://posit-dev.github.io/pointblank/reference/Validate.critical.html): Get the 'critical' level status for each validation step. -- [DataScan](https://posit-dev.github.io/pointblank/reference/DataScan.html): Get a summary of a dataset. -- [preview](https://posit-dev.github.io/pointblank/reference/preview.html): Display a table preview that shows some rows from the top, some from the bottom. -- [col_summary_tbl](https://posit-dev.github.io/pointblank/reference/col_summary_tbl.html): Generate a column-level summary table of a dataset. -- [missing_vals_tbl](https://posit-dev.github.io/pointblank/reference/missing_vals_tbl.html): Display a table that shows the missing values in the input table. -- [assistant](https://posit-dev.github.io/pointblank/reference/assistant.html): Chat with the PbA (Pointblank Assistant) about your data validation needs. -- [load_dataset](https://posit-dev.github.io/pointblank/reference/load_dataset.html): Load a dataset hosted in the library as specified table type. -- [get_data_path](https://posit-dev.github.io/pointblank/reference/get_data_path.html): Get the file path to a dataset included with the Pointblank package. -- [connect_to_table](https://posit-dev.github.io/pointblank/reference/connect_to_table.html): Connect to a database table using a connection string. -- [print_database_tables](https://posit-dev.github.io/pointblank/reference/print_database_tables.html): List all tables in a database from a connection string. -- [yaml_interrogate](https://posit-dev.github.io/pointblank/reference/yaml_interrogate.html): Execute a YAML-based validation workflow. -- [validate_yaml](https://posit-dev.github.io/pointblank/reference/validate_yaml.html): Validate YAML configuration against the expected structure. 
-- [yaml_to_python](https://posit-dev.github.io/pointblank/reference/yaml_to_python.html): Convert YAML validation configuration to equivalent Python code. -- [get_column_count](https://posit-dev.github.io/pointblank/reference/get_column_count.html): Get the number of columns in a table. -- [get_row_count](https://posit-dev.github.io/pointblank/reference/get_row_count.html): Get the number of rows in a table. -- [get_action_metadata](https://posit-dev.github.io/pointblank/reference/get_action_metadata.html): Access step-level metadata when authoring custom actions. -- [get_validation_summary](https://posit-dev.github.io/pointblank/reference/get_validation_summary.html): Access validation summary information when authoring final actions. -- [write_file](https://posit-dev.github.io/pointblank/reference/write_file.html): Write a Validate object to disk as a serialized file. -- [read_file](https://posit-dev.github.io/pointblank/reference/read_file.html): Read a Validate object from disk that was previously saved with `write_file()`. -- [config](https://posit-dev.github.io/pointblank/reference/config.html): Configuration settings for the Pointblank library. -- [send_slack_notification](https://posit-dev.github.io/pointblank/reference/send_slack_notification.html): Create a Slack notification function using a webhook URL. 
- -### User Guide - -- [Actions](https://posit-dev.github.io/pointblank/user-guide/actions.html) -- [Assertions](https://posit-dev.github.io/pointblank/user-guide/assertions.html) -- [Briefs](https://posit-dev.github.io/pointblank/user-guide/briefs.html) -- [Data Inspection](https://posit-dev.github.io/pointblank/user-guide/cli-data-inspection.html) -- [Data Validation](https://posit-dev.github.io/pointblank/user-guide/cli-data-validation.html) -- [CLI Reference](https://posit-dev.github.io/pointblank/user-guide/cli-reference.html) -- [Column Summaries](https://posit-dev.github.io/pointblank/user-guide/col-summary-tbl.html) -- [Column Selection Patterns](https://posit-dev.github.io/pointblank/user-guide/column-selection-patterns.html) -- [Draft Validation](https://posit-dev.github.io/pointblank/user-guide/draft-validation.html) -- [Expression-Based Validation](https://posit-dev.github.io/pointblank/user-guide/expressions.html) -- [Data Extracts](https://posit-dev.github.io/pointblank/user-guide/extracts.html) -- [Installation](https://posit-dev.github.io/pointblank/user-guide/installation.html) -- [Languages](https://posit-dev.github.io/pointblank/user-guide/langs.html) -- [MCP Quick Start](https://posit-dev.github.io/pointblank/user-guide/mcp-quick-start.html) -- [Missing Values Reporting](https://posit-dev.github.io/pointblank/user-guide/missing-vals-tbl.html) -- [Preprocessing](https://posit-dev.github.io/pointblank/user-guide/preprocessing.html) -- [Previewing Data](https://posit-dev.github.io/pointblank/user-guide/preview.html) -- [Quickstart](https://posit-dev.github.io/pointblank/user-guide/quickstart.html) -- [Schema Validation](https://posit-dev.github.io/pointblank/user-guide/schema-validation.html) -- [Segmentation](https://posit-dev.github.io/pointblank/user-guide/segmentation.html) -- [Step Reports](https://posit-dev.github.io/pointblank/user-guide/step-reports.html) -- [Sundering Validated 
Data](https://posit-dev.github.io/pointblank/user-guide/sundering.html) -- [Test Data Generation](https://posit-dev.github.io/pointblank/user-guide/test-data-generation.html) -- [Thresholds](https://posit-dev.github.io/pointblank/user-guide/thresholds.html) -- [Validation Methods](https://posit-dev.github.io/pointblank/user-guide/validation-methods.html) -- [Overview](https://posit-dev.github.io/pointblank/user-guide/validation-overview.html) -- [Validation Reports](https://posit-dev.github.io/pointblank/user-guide/validation-reports.html) -- [YAML Reference](https://posit-dev.github.io/pointblank/user-guide/yaml-reference.html) -- [YAML Validation Workflows](https://posit-dev.github.io/pointblank/user-guide/yaml-validation-workflows.html) \ No newline at end of file diff --git a/docs/print-styles.css b/docs/print-styles.css deleted file mode 100644 index 16561db693..0000000000 --- a/docs/print-styles.css +++ /dev/null @@ -1,217 +0,0 @@ -/* Print-specific styles for PDF generation */ - -@media print { - /* Page settings - page numbers added via post-processing */ - @page { - size: letter landscape; - margin: 0.5in; - } - - /* Hide navigation and other UI elements for PDF (but keep TOC) */ - nav.navbar:not(#TOC), - .nav-footer, - #quarto-header, - .quarto-title-banner, - .quarto-title, - header.quarto-title-block, - .sidebar, - #quarto-sidebar, - .page-navigation { - display: none !important; - } - - /* Show and style the TOC for PDF */ - #TOC, - nav#TOC { - display: block !important; - page-break-after: always; - page-break-before: auto; - margin: 1in auto; - max-width: 8in; - padding: 1em; - } - - #TOC::before, - nav#TOC::before { - content: "Table of Contents"; - display: block; - font-size: 24pt; - font-weight: bold; - margin-bottom: 0.75em; - } - - #TOC ul, - nav#TOC ul { - list-style: none; - padding-left: 0; - } - - #TOC li, - nav#TOC li { - margin: 0.5em 0; - line-height: 1.4; - } - - #TOC ul ul, - nav#TOC ul ul { - padding-left: 1.5em; - font-size: 0.9em; - 
} - - #TOC a, - nav#TOC a { - text-decoration: none; - color: #333; - } - - #TOC a::after, - nav#TOC a::after { - content: leader('.') target-counter(attr(href), page); - } - - /* Manual Table of Contents */ - .manual-toc { - page-break-after: always; - page-break-inside: avoid; - padding: 2em; - max-width: 8in; - margin: 0 auto; - } - - .manual-toc h1 { - font-size: 24pt; - margin-bottom: 1.5em; - text-align: center; - } - - .manual-toc ol { - list-style: none; - padding: 0; - font-size: 14pt; - line-height: 2.5; - } - - .manual-toc li { - margin: 0.75em 0; - position: relative; - } - - .manual-toc a { - text-decoration: none; - color: #333; - } - - /* Style the title page */ - .title-page { - page-break-after: always; - page-break-inside: avoid; - text-align: center; - margin: 0; - padding: 0; - min-height: 8in; - } - - /* Page break utility */ - .page-break { - page-break-after: always; - height: 0; - margin: 0; - padding: 0; - } - - .title-page * { - page-break-before: avoid !important; - page-break-after: avoid !important; - } - - /* Page break helper */ - .page-break { - page-break-after: always; - height: 0; - margin: 0; - padding: 0; - } - - /* Avoid page breaks inside important elements */ - .validation-report, - pre, - code, - img { - page-break-inside: avoid; - } - - /* Ensure links are visible */ - a[href]:after { - content: none !important; - } - - /* Optimize table rendering - reduce font size for wide tables */ - table { - width: 100%; - border-collapse: collapse; - font-size: 9pt; - page-break-inside: avoid; - page-break-before: auto; - page-break-after: auto; - } - - table th, table td { - padding: 4px 6px; - font-size: 8pt; - page-break-inside: avoid; - } - - /* Make validation tables more compact */ - .validation-report table { - font-size: 7pt; - } - - .validation-report table th, - .validation-report table td { - padding: 2px 4px; - } - - /* Ensure table containers don't break */ - .cell-output, - .cell-output-display, - div:has(> table) { - 
page-break-inside: avoid; - } - - /* Ensure code blocks fit */ - pre code { - font-size: 8pt; - white-space: pre-wrap; - word-wrap: break-word; - } - - /* Better header spacing and page breaks */ - h1 { - page-break-before: always; - page-break-after: avoid; - page-break-inside: avoid; - margin-top: 0; - } - - /* Don't break page before the first h1 */ - body > h1:first-of-type, - main > h1:first-of-type, - #quarto-content > h1:first-of-type { - page-break-before: avoid; - } - - h2, h3, h4, h5, h6 { - page-break-after: avoid; - page-break-inside: avoid; - } - - /* Ensure images fit on page */ - img { - max-width: 100%; - height: auto; - } -} - -@media screen { - /* Screen-only: add print preview button styles if needed */ -} diff --git a/docs/scripts/post-render.py b/docs/scripts/post-render.py deleted file mode 100644 index 95782b3232..0000000000 --- a/docs/scripts/post-render.py +++ /dev/null @@ -1,386 +0,0 @@ -import os -import glob -import re - -# Print the working directory -print("Current working directory:", os.getcwd()) - -# Get a list of all files in the working directory -files = os.listdir(".") -print("Files in working directory:", files) - -site_files = os.listdir("_site") -print("Files in '_site' directory:", site_files) - -# Process all HTML files in the `_site/reference/` directory (except `index.html`) -# and apply the specified transformations -html_files = [f for f in glob.glob("_site/reference/*.html") if not f.endswith("index.html")] - -print(f"Found {len(html_files)} HTML files to process") - -for html_file in html_files: - print(f"Processing: {html_file}") - - with open(html_file, "r") as file: - content = file.readlines() - - # Determine the classification of each h1 tag based on its content - classification_info = {} - for i, line in enumerate(content): - # Look for both class="title" and styled h1 tags - h1_match = re.search(r'(.*?)', line) - if not h1_match: - # Also check for h1 tags with style attribute (for level1 section titles) - 
h1_match = re.search(r'(.*?)', line) - - if h1_match: - original_h1_content = h1_match.group(1).strip() - # Store classification based on original content - if original_h1_content and original_h1_content[0].isupper(): - if "." in original_h1_content: - classification_info[i] = ("method", "steelblue", "#E3F2FF") - else: - classification_info[i] = ("class", "darkgreen", "#E3FEE3") - else: - classification_info[i] = ("function", "darkorange", "#FFF1E0") - - # Remove the literal text `Validate.` from the h1 tag - # TODO: Add line below stating the class name for the method - content = [ - line.replace( - '

Validate.', - '

', - ) - for line in content - ] - - # If the inner content of the h1 tag either: - # - has a literal `.` in it, or - # - doesn't start with a capital letter, - # then add `()` to the end of the content of the h1 tag - for i, line in enumerate(content): - # Use regex to find h1 tags (both class="title" and styled versions) - h1_match = re.search(r'', line) - if not h1_match: - h1_match = re.search(r'', line) - - if h1_match: - # Extract the content of the h1 tag - start = h1_match.end() - end = line.find("

", start) - h1_content = line[start:end].strip() - - # Check if the content meets the criteria - if "." in h1_content or (h1_content and not h1_content[0].isupper()): - # Modify the content - h1_content += "()" - - # Replace the h1 tag with the modified content - content[i] = line[:start] + h1_content + line[end:] - - # Add classification labels using stored info - for i, line in enumerate(content): - if i in classification_info: - h1_match = re.search(r"]*>(.*?)", line) - if h1_match: - h1_content = h1_match.group(1) - label_type, label_color, background_color = classification_info[i] - - label_span = f'{label_type}' - - new_h1_content = h1_content + label_span - new_line = line.replace(h1_content, new_h1_content) - content[i] = new_line - - # Wrap bare h1 tags (those with style attribute but no quarto-title wrapper) in proper structure - for i, line in enumerate(content): - # Look for h1 tags with style attribute that aren't already wrapped - if "

\n{h1_content}\n\n' - content[i] = wrapped_h1 - - # Add a style attribute to the h1 tag to use a monospace font for code-like appearance - content = [ - line.replace( - '

', - "

", - ) - for line in content - ] - - # Some h1 tags may not have a class attribute, so we handle that case too - content = [ - line.replace( - "

", - "

", - ) - for line in content - ] - - # Move the first

tag (description) to immediately after the title header - header_end_line = None - first_p_line = None - first_p_content = None - found_sourcecode = False - title_line = None - sourcecode_line = None - - # First pass: find the header end, title, sourcecode, and the first

tag after sourceCode - for i, line in enumerate(content): - # Find where the header ends - if "" in line: - header_end_line = i - - # Find the title line (either in header or in level1 section) - if '

' in line: - found_sourcecode = True - sourcecode_line = i - - # Find the first

tag after we've seen the sourceCode div - if found_sourcecode and first_p_line is None and line.strip().startswith(" header_end_line: - # Title is in a separate section, insert after title - insert_after_line = title_line - else: - # Title is in header, insert after header - insert_after_line = header_end_line - - # Apply italic styling to the description - if "style=" not in first_p_content: - styled_p = first_p_content.replace( - "

", - '

', - ) - else: - styled_p = first_p_content - - # Remove the original

line - content.pop(first_p_line) - - # Adjust sourcecode_line since we removed a line before it - if first_p_line < sourcecode_line: - sourcecode_line -= 1 - - # Insert the styled

line after the determined position (accounting for the removed line) - insert_position = ( - insert_after_line + 1 if first_p_line > insert_after_line else insert_after_line - ) - content.insert(insert_position, "\n") # Add spacing - content.insert(insert_position + 1, styled_p) - content.insert(insert_position + 2, "\n") # Add spacing - - # Adjust sourcecode_line since we added lines before it - sourcecode_line += 3 - - # Add "USAGE" label before the sourceCode div - usage_label = '

USAGE

\n' - content.insert(sourcecode_line, usage_label) - - # Style the first and second
tags with different borders - dl_count = 0 - for i, line in enumerate(content): - if "
" in line: - dl_count += 1 - if dl_count == 1: - # First
tag - green border - content[i] = line.replace( - "
", - '
', - ) - elif dl_count == 2: - # Second
tag - indigo border - content[i] = line.replace( - "
", - '
', - ) - break # Stop after finding the second one - - # Fix return value formatting in individual function pages, removing the `:` before the - # return value and adjusting the style of the parameter annotation separator - content_str = "".join(content) - return_value_pattern = ( - r' :' - ) - return_value_replacement = r' ' - content_str = re.sub(return_value_pattern, return_value_replacement, content_str) - - # Fix double asterisks in kwargs parameters - content_str = content_str.replace("****kwargs**", "**kwargs") - - content = content_str.splitlines(keepends=True) - - # Turn all h3 tags into h4 tags - content = [line.replace("", "

") for line in content] - - # Turn all h2 tags into h3 tags - content = [line.replace("", "") for line in content] - - # Add gradient animation to Examples headers and horizontal rules - content_str = "".join(content) - - # Find and replace Examples headers with animated gradient styling - examples_pattern = ( - r'(]*class="[^"]*doc-section-examples[^"]*"[^>]*>)(.*?Examples.*?)()' - ) - examples_replacement = r"""\1\2\3 -
""" - - content_str = re.sub(examples_pattern, examples_replacement, content_str, flags=re.DOTALL) - - content = content_str.splitlines(keepends=True) - - # Place a horizontal rule at the end of each reference page - content_str = "".join(content) - main_end_pattern = r"" - main_end_replacement = ( - "\n" - '
\n' - '
' - ) - content_str = re.sub(main_end_pattern, main_end_replacement, content_str) - content = content_str.splitlines(keepends=True) - - with open(html_file, "w") as file: - file.writelines(content) - - -# Modify the `index.html` file in the `_site/reference/` directory -index_file = "_site/reference/index.html" - -if os.path.exists(index_file): - print(f"Processing index file: {index_file}") - - with open(index_file, "r") as file: - content = file.read() - - # Convert tables to dl/dt/dd format - def convert_table_to_dl(match): - table_content = match.group(1) - - # Extract all table rows - row_pattern = r"]*>(.*?)" - rows = re.findall(row_pattern, table_content, re.DOTALL) - - dl_items = [] - for row in rows: - # Extract the two td elements - td_pattern = r"]*>(.*?)" - tds = re.findall(td_pattern, row, re.DOTALL) - - if len(tds) == 2: - link_content = tds[0].strip() - description = tds[1].strip() - - dt = f"
{link_content}
" - dd = f'
{description}
' - dl_items.append(f"{dt}\n{dd}") - - dl_content = "\n\n".join(dl_items) - return f'
\n
\n\n{dl_content}\n\n
\n
' - - # Replace all table structures with dl/dt/dd - table_pattern = r'\s*(.*?)\s*
' - content = re.sub(table_pattern, convert_table_to_dl, content, flags=re.DOTALL) - - # Add () to methods and functions in tags within
elements - def add_parens_to_functions(match): - full_tag = match.group(0) - link_text = match.group(1) - - # Rules for adding (): - # - Don't touch capitalized content (classes) - # - Add () if text has a period (methods like Validate.col_vals_gt) - # - Add () if text doesn't start with capital (functions like starts_with, load_dataset) - if "." in link_text or (link_text and not link_text[0].isupper()): - # Replace the link text with the same text + () - return full_tag.replace(f">{link_text}", f">{link_text}()") - - return full_tag - - # Find all tags within
elements and apply the function - dt_link_pattern = r"
]*>([^<]+)
" - content = re.sub(dt_link_pattern, add_parens_to_functions, content) - - # Remove redundant "API Reference" top-level nav item - # Find the nav structure and flatten it by removing the top-level wrapper - nav_pattern = r'(]*>.*?]*>.*?\s*
    \s*)
  • ]*href="[^"]*#api-reference"[^>]*>API Reference\s*]*>(.*?)
\s*(\s*)' - nav_replacement = r"\1\2\3" - content = re.sub(nav_pattern, nav_replacement, content, flags=re.DOTALL) - - with open(index_file, "w") as file: - file.write(content) - - print("Index file processing complete") -else: - print(f"Index file not found: {index_file}") - - -# Update quarto-secondary-nav-title to display "User Guide" text -# This improves the mobile navigation by making it clear what the sidebar toggle reveals -all_html_files = glob.glob("_site/**/*.html", recursive=True) -print(f"Found {len(all_html_files)} HTML files to check for secondary nav title") - -for html_file in all_html_files: - with open(html_file, "r") as file: - content = file.read() - - # Replace empty h1.quarto-secondary-nav-title with h5 containing "User Guide" - original_pattern = r'

' - replacement = '
User Guide
' - - if original_pattern in content: - print(f"Updating secondary nav title in: {html_file}") - content = content.replace(original_pattern, replacement) - - with open(html_file, "w") as file: - file.write(content) - - -print("Finished processing all files") diff --git a/docs/styles.css b/docs/styles.css deleted file mode 100644 index cbe3c53b00..0000000000 --- a/docs/styles.css +++ /dev/null @@ -1,277 +0,0 @@ -.table a { - text-underline-offset: 4px; - font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; - color: inherit; - font-weight: bold; - font-size: 0.875em; - word-break: initial; - text-decoration-thickness: 2px; -} - -.navbar-brand img { - max-height: 45px; - padding-right: 0px; - margin-left: 5px; - margin-right: 12px; - border-style: solid; - border-width: 2px; - border-radius: 5px; - border-color: darkgray; - transition: transform 0.3s ease; -} - -.navbar-brand img:hover { - transform: scale(1.15); -} - -body { - background-color: #FCFEFF; -} - -p a { - color: black; - text-underline-offset: 4px; -} - -li a { - color: black; - text-underline-offset: 4px; -} - -dt { - height: 32px; - overflow-x: visible; - margin-left: -4px; -} - -.table td { - padding-left: 0px; -} - -code { - font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; - font-size: 0.875em; - color: rgb(18, 18, 18); -} - -.blockquote { - font-size: 13px; - background-color: #E8E8E8; - border-left-color: #4682B4; -} - -p,h1,h2,h3,#toc-title,#toc-function-reference,.nav-link,.table { - font-family: "Open Sans", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; -} - -.cell-output.cell-output-stdout { - border-style: solid; - border-width: 1px; - border-color: #A1A1A165; - border-radius: 4px; - padding: 0.4em; - background-color: #F7F1EB65; - margin-bottom: 12px; -} - -summary::marker { - font-size: 1.5em; -} - -.parameter-name { - 
margin-right: -10px; -} - -#toc-title { - font-size: 16px; -} - -#toc-title > ul { - font-size: 14px; -} - -#quarto-margin-sidebar { - margin-left: 20px; - padding-top: 30px; -} - -#quarto-margin-sidebar nav ul li { - padding-top: 3px; -} - -#navbarCollapse > ul.navbar-nav.navbar-nav-scroll.me-auto > li > a:hover { - background-color: rgb(240, 112, 0); - color: white; - border-radius: 5px; -} - -#navbarCollapse > ul.navbar-nav.navbar-nav-scroll.me-auto > li > a { - color: white; - font-weight: bold; -} - -a.sidebar-item-text.sidebar-link.text-start[data-bs-toggle="collapse"] { - pointer-events: none; - cursor: default; -} - -div.sidebar-item-container { - cursor: default !important; -} - -a.sidebar-item-toggle.text-start > i { - color: #FAFAFD; -} - -a.sidebar-item-text.sidebar-link.active { - color: rgb(244, 121, 0); - text-decoration: underline; - text-decoration-thickness: 2px; - text-decoration-color: rgba(101, 180, 237, 0.75); -} - -#api-reference > h1 { - margin: 0; -} - -.navbar-title { - font-family: Inter, Roboto, "Helvetica Neue", "Arial Nova", "Nimbus Sans", Arial, sans-serif; - font-weight: bold; - color: white; - border-style: solid; - border-width: 2px; - border-radius: 8px; - padding-left: 10px; - padding-right: 10px; - padding-top: 3px; - padding-bottom: 4px; -} - -#quarto-sidebar { - background-color: #FAFAFD; -} - -a.sidebar-item-text.sidebar-link.text-start > span { - text-transform: uppercase; - font-weight: 600; - color: dimgray; -} - -.sidebar-menu-container { - border-bottom-style: solid; - border-bottom-width: 3px; - border-bottom-color: #E7EAF6; -} - -.menu-text { - font-size: 15.3px; -} - -.sidebar-item-section { - padding-bottom: 8px; -} - -#quarto-header > nav > div { - margin-left: 5%; - margin-right: 5%; -} - -.navbar { - background: linear-gradient(-45deg, #04778B, #1C7FF0, #006C93, #05614C); - background-size: 200% 100%; - animation: gradient 20s ease infinite; -} - -@keyframes gradient { - 0% { - background-position: 0% 50%; - } - 
50% { - background-position: 100% 50%; - } - 100% { - background-position: 0% 50%; - } -} - -.nav-link.active { - text-decoration: underline; - text-decoration-thickness: 2px; - text-decoration-color: rgba(101, 180, 237, 0.75); - text-underline-offset: 4px; -} - -#navbarCollapse > ul.navbar-nav.navbar-nav-scroll.ms-auto > li { - padding-right: 5px -} - -.shrink-example .cell-output table { - zoom: 60%; -} - -.nav-footer { - background: linear-gradient(-45deg, - rgba(4, 119, 139, 0.15), - rgba(28, 127, 240, 0.2), - rgba(0, 108, 147, 0.12), - rgba(5, 97, 76, 0.18), - rgba(173, 216, 230, 0.25)); - background-size: 400% 400%; - animation: footerGradient 15s ease infinite; - padding-top: 20px; - border-top-color: rgba(4, 119, 139, 0.4); - border-top-style: solid; - border-top-width: 2px; -} - -.nav-footer-left { - color: rgb(48, 48, 48); -} - -.nav-footer-right { - color: rgb(48, 48, 48); -} - -#examples { - padding-left: 20px; - padding-right: 20px; - padding-top: 1px; - padding-bottom: 10px; - background-color: rgb(241, 252, 243); -} - -@keyframes footerGradient { - 0% { - background-position: 0% 50%; - } - 50% { - background-position: 100% 50%; - } - 100% { - background-position: 0% 50%; - } -} - -@keyframes examplesGradient { - 0% { - background-position: 0% 50%; - } - 50% { - background-position: 100% 50%; - } - 100% { - background-position: 0% 50%; - } -} - -/* Adjust mobile secondary nav positioning */ -@media (max-width: 991.98px) { - #quarto-header .container-fluid.d-flex { - margin-left: 0; - padding-left: 5px; - } - - .quarto-secondary-nav-title { - padding-top: 7px; - } -} diff --git a/docs/user-guide-pdf.qmd b/docs/user-guide-pdf.qmd deleted file mode 100644 index 42c2c305f7..0000000000 --- a/docs/user-guide-pdf.qmd +++ /dev/null @@ -1,97 +0,0 @@ ---- -format: - html: - toc: true - toc-depth: 3 - number-sections: true - embed-resources: true - theme: flatly - css: - - styles.css - - print-styles.css - page-layout: full - self-contained: true -jupyter: 
python3 ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer=False) -``` - -::: {.title-page} - -![](assets/pointblank_logo.png){width=400px style="display: block; margin: 0 auto; margin-top: 2in; margin-bottom: 0.75in;"} - -

-Data validation toolkit for assessing and monitoring data quality. -

- -

-© 2024–2026 Posit Software, PBC -

- -::: - -# Validation Plan - -{{< include user-guide/validation-overview.qmd >}} - -{{< include user-guide/validation-methods.qmd >}} - -{{< include user-guide/column-selection-patterns.qmd >}} - -{{< include user-guide/preprocessing.qmd >}} - -{{< include user-guide/segmentation.qmd >}} - -{{< include user-guide/thresholds.qmd >}} - -{{< include user-guide/actions.qmd >}} - -{{< include user-guide/briefs.qmd >}} - -# Advanced Validation - -{{< include user-guide/expressions.qmd >}} - -{{< include user-guide/schema-validation.qmd >}} - -{{< include user-guide/assertions.qmd >}} - -{{< include user-guide/draft-validation.qmd >}} - -# YAML - -{{< include user-guide/yaml-validation-workflows.qmd >}} - -{{< include user-guide/yaml-reference.qmd >}} - -# Post Interrogation - -{{< include user-guide/validation-reports.qmd >}} - -{{< include user-guide/step-reports.qmd >}} - -{{< include user-guide/extracts.qmd >}} - -{{< include user-guide/sundering.qmd >}} - -# Data Inspection - -{{< include user-guide/preview.qmd >}} - -{{< include user-guide/col-summary-tbl.qmd >}} - -{{< include user-guide/missing-vals-tbl.qmd >}} - -# The Pointblank CLI - -{{< include user-guide/cli-data-inspection.qmd >}} - -{{< include user-guide/cli-data-validation.qmd >}} - -{{< include user-guide/cli-reference.qmd >}} - diff --git a/docs/user-guide.pdf b/docs/user-guide.pdf deleted file mode 100644 index b201b32b7b..0000000000 Binary files a/docs/user-guide.pdf and /dev/null differ diff --git a/docs/user-guide/actions.qmd b/docs/user-guide/actions.qmd deleted file mode 100644 index ada5ff7203..0000000000 --- a/docs/user-guide/actions.qmd +++ /dev/null @@ -1,529 +0,0 @@ ---- -title: Actions -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -Actions transform data validation from passive reporting to active response by automatically 
-executing code when quality issues arise. They bridge the gap between detection and intervention, -enabling immediate notifications and comprehensive logging when thresholds are exceeded. - -Whether you need simple console messages for interactive analysis or complex alerting for production -pipelines, Actions provide the framework to make your validation workflows responsive. For example, -when validating revenue values, you can configure immediate alerts if failures exceed acceptable -thresholds, ensuring data issues are addressed promptly rather than discovered later. - -In this article, we'll explore how to use Actions to respond to threshold violations during data -validation, and Final Actions to execute code after all validation steps are complete, giving you -powerful tools to monitor, alert, and report on your data's quality. - -## How Actions Work - -Let's look at an example on how this works in practice. The following validation plan contains a -single step (using `~~Validate.col_vals_gt()`) where the `thresholds=` and `actions=` parameters are -set using `Thresholds` and `Actions` calls: - -```{python} -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_gt( - columns="c", value=2, - thresholds=pb.Thresholds(warning=1, error=5), - - # Emit a console message when the warning threshold is exceeded --- - actions=pb.Actions(warning="WARNING: failing test found.") - ) - .interrogate() -) -``` - -The code uses `thresholds=pb.Thresholds(warning=1, error=5)` to set a 'warning' threshold of `1` and -an 'error' threshold of `5` failing test units. 
The results part of the validation table shows that: - -- The `FAIL` column shows that 3 test units have failed -- The `W` column (short for 'warning') shows a filled gray circle indicating it's reached its -threshold level -- The `E` ('error') column shows an open yellow circle indicating it's below the threshold -level - -More importantly, the text `"WARNING: failing test found."` has been emitted. Here it appears above -the validation table and that's because the action is executed eagerly during interrogation -(before the report has even been generated). - -So, an action is executed for a particular condition (e.g., 'warning') within a validation step if -these three things are true: - -1. there is a threshold set for that condition (either globally, or as part of that step) -2. there is an associated action set for the condition (again, either set globally or within the -step) -3. during interrogation, the threshold value for the condition was exceeded by the number or -proportion of failing test units - -There is a lot of flexibility for setting both thresholds and actions and everything here is -considered optional. Put another way, you can set various thresholds and various actions as needed -and the interrogation phase will determine whether all the requirements are met for executing -an action. - -## Defining Actions - -Actions can be defined in several ways, providing flexibility for different notification needs. - -### Using String Messages - -There are a few options in how to define the actions: - -1. **String**: a message to be displayed in the console -2. **Callable**: a function to be called -3. **List of Strings/Callables**: for execution of multiple messages or functions - -The actions are executed at interrogation time when the threshold level assigned to the action -is exceeded by the number or proportion of failing test units. When providing a string, it will -simply be printed to the console. 
A callable will also be executed at the time of interrogation. -If providing a list of strings or callables, each item in the list will be executed in order. -Such a list can contain a mix of strings and callables. - -Displaying console messages may be a simple approach, but it is effective. And the strings don't -have to be static, there are templating features that can be useful for constructing strings for a -variety of situations. The following placeholders are available for use: - -- `{type}`: The validation step type where the action is executed (e.g., ‘col_vals_gt’, etc.) -- `{level}`: The threshold level where the action is executed (‘warning’, ‘error’, or ‘critical’) -- `{step}` or `{i}`: The step number in the validation workflow where the action is executed -- `{col}` or `{column}`: The column name where the action is executed -- `{val}` or `{value}`: An associated value for the validation method -- `{time}`: A datetime value for when the action was executed - -Here's an example where we prepare a console message with a number of value placeholders -(`action_str`) and use it globally at `Actions(critical=)`: - -```{python} -action_str = "[{LEVEL}: {TYPE}]: Step {step} has failed validation. ({time})" - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - - # Use `action_str` for any critical thresholds exceeded --- - actions=pb.Actions(critical=action_str), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.10) - .col_vals_ge(columns="session_duration", value=15) - .interrogate() -) -``` - -What we get here are two messages in the console, corresponding to critical failures in steps 2 and -3. The placeholders were replaced with the correct text for the context. Note that some of the -resulting text is capitalized (e.g., `"CRITICAL"`, `"COL_VALS_GT"`, etc.) 
and this is because we -capitalized the placeholder text itself. Have a look at the documentation article of `Actions` for -more details on this. - -### Using Callable Functions - -Aside from strings, any callable can be used as an action value. Here's an example where we use a -custom function as part of an action: - -```{python} -def duration_issue(): - from datetime import datetime - print(f"Data quality issue found ({datetime.now()}).") - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", value=15, - - # Use the `duration_issue()` function as an action for this step --- - actions=pb.Actions(warning=duration_issue), - ) - .interrogate() -) -``` - -In this case, the 'warning' action is set to call the user's `duration_issue()` function. This action is -only executed when the 'warning' threshold is exceeded in step 3. Because all three thresholds are -exceeded in that step, the 'warning' action of executing the function occurs (resulting in a -message being printed to the console). - -This is an example where actions can be defined locally for an individual validation step. The -global threshold setting applied to all three validation steps but the step-level action only -applied to step 3. You are free to mix and match both threshold and action settings at the global -level (i.e., set in the `Validate` call) or at the step level. The key thing to be aware of is that -step-level settings of thresholds and actions take precedence. - -## Accessing Context in Actions - -While string templates provide helpful placeholders to access information about validation steps, -callable functions offer more flexibility through access to detailed metadata. 
When using functions -as actions, you can retrieve comprehensive information about the validation context, allowing for -complex logic and dynamic responses to validation issues. - -### Using `get_action_metadata()`{.qd-no-link} in Callables - -To access information about the validation step where an action was triggered, we can call -`get_action_metadata()` in the body of a function to be used within `Actions`. This provides useful -context about the validation step that triggered the action. - -```{python} -def print_problem(): - m = pb.get_action_metadata() - print(f"{m['level']} ({m['level_num']}) for Step {m['step']}: {m['failure_text']}") - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - - # Use the `print_problem()` function as the action --- - actions=pb.Actions(default=print_problem), - brief=True, - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() -) -``` - -In this example, we're creating a function called `print_problem()` that prints information about -each validation step that fails. We then apply this function as the default action for all threshold -levels using `actions=pb.Actions(default=print_problem)`. (Note that the `default=` and -`highest_only=` parameters will be covered in more detail in following sections.) - -We end up seeing two messages printed for failures in Steps 2 and 3. And though those steps had more -than one threshold exceeded, only the most severe level in each yielded a console message (due to -the default `highest_only=True` behavior). - -By setting the action in `Validate(actions=)`, we applied it to all validation steps where -thresholds are exceeded. 
This eliminates the need to set `actions=` at every validation step (though -you can do this as a local override, even setting `actions=None` to disable globally set actions). - -### Available Metadata Fields - -The dictionary returned by `get_action_metadata()` contains the following fields: - -- `step`: The step number. -- `column`: The column name. -- `value`: The value being compared (only available in certain validation steps). -- `type`: The assertion type (e.g., `"col_vals_gt"`, etc.). -- `time`: The time the validation step was executed (in ISO format). -- `level`: The severity level (`"warning"`, `"error"`, or `"critical"`). -- `level_num`: The severity level as a numeric value (`30`, `40`, or `50`). -- `autobrief`: A localized and brief statement of the expectation for the step. -- `failure_text`: Localized text that explains how the validation step failed. - -## Customizing Action Behavior - -The `Actions` class has two additional parameters that provide more control over how actions are -executed: - -### Setting Default Actions with `default=` - -Instead of specifying actions separately for each threshold level, you can use the `default=` -parameter to set a common action for all levels: - -```{python} -def log_all_issues(): - m = pb.get_action_metadata() - print(f"[{m['level'].upper()}] Validation failed in step {m['step']} with level {m['level']}") - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - - # The `log_all_issues()` callable is set to every threshold --- - actions=pb.Actions(default=log_all_issues), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() -) -``` - -The `default=` parameter sets the same action for all threshold levels. 
If you later specify an -action for a specific level, it will override this default for that level only. - -When using the `default=` parameter, be aware that your action (whether a string template or -callable function) needs to work across all validation steps where thresholds might be exceeded. Not -all validation methods provide the same context for string templates or in the metadata dictionary -returned by `get_action_metadata()`. - -For example, some validation steps like `~~Validate.col_vals_gt()` provide a `value` field that can -be accessed with `{value}` in string templates, while others like `~~Validate.col_exists()` don't -have this concept. When creating default actions, either use only the universally available -placeholders (`{step}`, `{level}`, `{type}`, and `{time}`), or include conditional logic in your -callable functions to handle different validation types appropriately. - -### Controlling Action Execution with `highest_only=` - -By default, Pointblank only executes the action for the most severe threshold level that's been -exceeded. If you want actions for all exceeded thresholds to be executed, you can set -`highest_only=False`: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - actions=pb.Actions( - warning="Warning threshold exceeded in step {step}", - error="Error threshold exceeded in step {step}", - critical="Critical threshold exceeded in step {step}", - - # Execute all applicable actions --- - highest_only=False - ), - ) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() -) -``` - -In this example, if all three thresholds are exceeded in a step, you'll see all three messages -printed, rather than just the critical one. 
- -The default behavior (`highest_only=True`) helps prevent notification fatigue by limiting the number -of actions executed when multiple thresholds are exceeded in the same validation step. For example, -if a validation step fails with 60% of rows not passing, it would exceed 'warning', 'error', and -'critical' thresholds simultaneously. With `highest_only=True`, only the critical action would -execute. - -You might want to set `highest_only=False` when: - -- different threshold levels need to trigger different types of notifications (e.g., warnings to -Slack, errors to email, critical to urgent notifications) -- you need comprehensive logging of all severity levels for audit purposes -- you're building a dashboard that displays counts of issues at each severity level - -## Using Multiple Actions for a Threshold - -You can specify multiple actions to be executed for a single threshold level by providing a list: - -```{python} -def send_notification(): - print("📧 Notification sent to data team") - -def log_to_system(): - print("📝 Issue logged in system") - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(critical=0.15), - - # Set multiple actions for the critical threshold exceedance --- - actions=pb.Actions( - critical=[ - "CRITICAL: Data validation failed", # First action: display message - send_notification, # Second action: call function - log_to_system # Third action: call another function - ] - ), - ) - .col_vals_gt(columns="session_duration", value=15) - .interrogate() -) -``` - -When providing a list of actions, they will be executed in sequence when the threshold is exceeded. -This allows you to combine different types of actions such as displaying messages, sending -notifications, and logging events. - -## Final Actions - -### Creating Final Actions - -When you need to execute actions after all validation steps are complete, Pointblank provides the -`FinalActions` class. 
Unlike `Actions` which triggers on a per-step basis during the validation -process, `FinalActions` executes after the entire validation is complete, giving you a way to -respond to the overall validation results. - -Here's how to use `FinalActions`: - -```{python} -def send_alert(): - summary = pb.get_validation_summary() - if summary["highest_severity"] == "critical": - print(f"ALERT: Critical validation failures found in `{summary['tbl_name']}`") - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - tbl_name="game_revenue", - thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), - - # Set final actions to be executed after all interrogations --- - final_actions=pb.FinalActions( - "Validation complete.", # 1. a string message - send_alert # 2. a callable function - ) - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.10) - .interrogate() -) -``` - -In this example: - -- We define the function `send_alert()` that checks the validation summary for critical failures -- We provide a simple string message `"Validation complete."` that will print to the console -- Both actions will execute in order after all validation steps have completed - -Because the 'critical' threshold was exceeded in Step 2, we see the printed alert of `send_alert()` -after the simple string message. - -`FinalActions` accepts any number of actions as positional arguments. Each argument can be: - -1. **String**: A message to be displayed in the console -2. **Callable**: A function to be called with no arguments -3. **List of Strings/Callables**: Multiple actions to execute in sequence - -All actions will be executed in the order they are provided after all validation steps have -completed. 
- -### Using `get_validation_summary()`{.qd-no-link} in Final Actions - -When creating a callable function to use with `FinalActions`, you can access information about the -overall validation results using the `get_validation_summary()` function. This gives you a -dictionary with comprehensive information about the validation: - -```python -def comprehensive_report(): - summary = pb.get_validation_summary() - print(f"Validation Report for {summary['tbl_name']}:") - print(f"- Steps: {summary['n_steps']}") - print(f"- Passing steps: {summary['n_passing_steps']}") - print(f"- Failing steps: {summary['n_failing_steps']}") - - # Take additional actions based on results - if summary["n_failing_steps"] > 0: - - # Create a Slack notification function --- - notify = pb.send_slack_notification( - webhook_url="https://hooks.slack.com/services/your/webhook/url", - summary_msg=""" - 🚨 *Validation Failure Alert* - • Table: {tbl_name} - • Failed Steps: {n_failing_steps} of {n_steps} - • Highest Severity: {highest_severity} - • Time: {time} - """, - ) - - # Execute the notification function - notify() - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - tbl_name="game_revenue", - final_actions=pb.FinalActions(comprehensive_report), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .interrogate() -) -``` - -```{python} -# | echo: false - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - tbl_name="game_revenue", - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .interrogate() -) -``` - -Here we used the `send_slack_notification()` function, which is available in Pointblank as a -pre-built action. It can be used by itself in `final_actions=` but here it's integrated into the -user's `comprehensive_report()` function to provide finer control with conditional logic. 
- -### Combining Step-level and Final Actions - -You can use both `Actions` and `FinalActions` together for comprehensive validation control: - -```{python} -def log_step_failure(): - m = pb.get_action_metadata() - print(f"Step {m['step']} failed with {m['level']}") - - -def generate_summary(): - summary = pb.get_validation_summary() - # Sum up total failed test units across all steps - total_failed = sum(summary["dict_n_failed"].values()) - # Sum up total test units across all steps - total_units = sum(summary["dict_n"].values()) - print(f"Validation complete: {total_failed} failures out of {total_units} tests") - -( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds(warning=0.05, error=0.10), - - # Set an action for each step (highest threshold exceeded) --- - actions=pb.Actions(default=log_step_failure), - - # Set a final action to get a summary of the validation process --- - final_actions=pb.FinalActions(generate_summary), - ) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .interrogate() -) -``` - -This approach allows you to: - -1. log individual step failures during the validation process using `Actions` -2. generate a comprehensive report after all validation steps are complete using `FinalActions` - -Using both action types gives you fine-grained control over when and how notifications and other -actions are triggered in your validation workflow. - -## Conclusion - -Actions provide a powerful mechanism for responding to data validation results in Pointblank. 
By -combining threshold settings with appropriate actions, you can create sophisticated data quality -workflows that: - -- provide immediate feedback through console messages -- execute custom functions when validation thresholds are exceeded -- customize notifications based on severity levels -- generate comprehensive reports after validation is complete -- automate responses to data quality issues - -The flexible design of `Actions` and `FinalActions` allows you to start simple with basic console -messages and gradually build up to complex validation workflows with conditional logic, custom -reporting, and integrations with other systems like Slack, email, or logging services. - -When designing your validation strategy, consider leveraging both step-level actions for immediate -responses and final actions for holistic reporting. This combination provides comprehensive control -over your data validation process and helps ensure that data quality issues are detected, reported, -and addressed efficiently. diff --git a/docs/user-guide/assertions.qmd b/docs/user-guide/assertions.qmd deleted file mode 100644 index 6989a2da8d..0000000000 --- a/docs/user-guide/assertions.qmd +++ /dev/null @@ -1,377 +0,0 @@ ---- -title: Assertions -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -In addition to validation steps that create reports, Pointblank provides **assertions**. This is a -lightweight way to confirm data quality by raising exceptions when validation conditions aren't met. 
-Assertions are particularly useful in: - -- data processing pipelines where you need to halt execution if data doesn't meet expectations -- testing environments where you want to verify data properties programmatically -- scripts and functions where you need immediate notification of data problems - -## Basic Assertion Workflow - -The assertion workflow uses your familiar validation steps with assertion methods to check that -validations meet your requirements: - -```{python} -import pointblank as pb -import polars as pl - -# Create sample data -sample_data = pl.DataFrame({ - "id": [1, 2, 3, 4, 5], - "value": [10.5, 8.3, -2.1, 15.7, 7.2] -}) - -# Create a validation plan and assert that all steps pass -( - pb.Validate(data=sample_data) - .col_vals_gt(columns="id", value=0, brief="IDs must be positive") - .col_vals_gt(columns="value", value=-5, brief="Values should exceed -5") - - # Will automatically `interrogate()` and raise an AssertionError if any validation fails --- - .assert_passing() -) -``` - -This simple pattern allows you to integrate data quality checks into your data pipelines. With it, -you can create clear stopping points when data doesn't meet specified criteria. - -## Assertion Methods - -Pointblank offers two types of assertions: - -1. Full Passing Assertions: using `~~Validate.assert_passing()` to verify that every single test -unit passes -2. 
Threshold-Based Assertions: using `~~Validate.assert_below_threshold()` to verify that failure -rates stay within acceptable thresholds - -### `assert_passing()` - -The `~~Validate.assert_passing()` method is the strictest form of assertion, requiring every single -validation test unit to pass: - -```{python} -try: - ( - pb.Validate(data=sample_data) - .col_vals_gt(columns="value", value=0) - - # Direct assertion: automatically interrogates --- - .assert_passing() - ) -except AssertionError as e: - print("AssertionError:", str(e)) -``` - -### `assert_below_threshold()` - -The `~~Validate.assert_below_threshold()` method is more flexible as it allows some failures as long -as they stay below specified threshold levels. Pointblank uses three severity thresholds that -increase in order of seriousness: - -- **'warning'** (least severe): the first threshold that gets triggered when failures exceed this -level -- **'error'** (more severe): the middle threshold indicating more serious data quality issues -- **'critical'** (most severe): the highest threshold indicating critical data quality problems - -```{python} -# Create a two-column DataFrame for this example -tbl_pl = pl.DataFrame({ - "a": [4, 6, 9, 7, 12, 8, 7, 12, 10, 7], - "b": [9, 8, 10, 5, 10, 9, 14, 6, 6, 8], - -}) - -# Set thresholds: warning=0.2 (20%), error=0.3 (30%), critical=0.4 (40%) -validation = ( - pb.Validate(data=tbl_pl, thresholds=(0.2, 0.3, 0.4)) - .col_vals_gt(columns="b", value=5) # 1/10 failing (10% failure rate) - .col_vals_lt(columns="a", value=11) # 2/10 failing (20% failure rate) - .col_vals_ge(columns="b", value=8) # 3/10 failing (30% failure rate) - .interrogate() -) - -validation -``` - -The validation report above visually indicates threshold levels with colored circles: - -- gray circles in the `W` column indicate the 'warning' threshold -- yellow circles in the `E` column indicate the 'error' threshold -- red circles in the `C` column indicate the 'critical' threshold - -This won't 
pass the `~~Validate.assert_below_threshold()` assertion for the 'error' level because -step 3 exceeds this threshold (30% failure rate matches the error threshold): - -```{python} -try: - validation.assert_below_threshold(level="error") -except AssertionError as e: - print("AssertionError:", str(e)) -``` - -We can check against the 'error' threshold for specific steps with the `i=` parameter: - -```{python} -validation.assert_below_threshold(level="error", i=[1, 2]) -``` - -This passes because the highest threshold exceeded in steps 1 and 2 is 'warning'. - -The `~~Validate.assert_below_threshold()` method takes these parameters: - -- `level=`: threshold level to check against (`"warning"`, `"error"`, or `"critical"`) -- `i=`: optional specific step number(s) to check -- `message=`: optional custom error message - -This is particularly useful when: - -- working with real-world data where some percentage of failures is acceptable -- implementing different severity levels for data quality rules -- gradually improving data quality with stepped thresholds - -::: {.callout-note} -Assertion methods like `~~Validate.assert_passing()` and `~~Validate.assert_below_threshold()` -will automatically call `~~Validate.interrogate()` if needed, so you don't have to explicitly -include this step when using assertions directly. 
-::: - -## Using Status Check Methods - -In addition to assertion methods that raise exceptions, Pointblank provides status check methods -that return boolean values: - -### `all_passed()` - -The `~~Validate.all_passed()` method will return `True` only if every single test unit in every -validation step passed: - -```{python} -validation = ( - pb.Validate(data=sample_data) - .col_vals_gt(columns="value", value=0) - .interrogate() -) - -if not validation.all_passed(): - print("Validation failed: some values are not positive") -``` - -### `warning()`, `error()`, and `critical()` - -The methods `~~Validate.warning()`, `~~Validate.error()`, and `~~Validate.critical()` all return -information about whether validation steps exceeded that specific threshold level. - -While assertion methods raise exceptions to halt execution when thresholds are exceeded, these -status methods give you fine-grained control to implement custom logic based on different validation -quality levels. - -```{python} -validation = ( - pb.Validate(data=sample_data, thresholds=(0.05, 0.10, 0.20)) - .col_vals_gt(columns="value", value=0) # Some values are negative - .interrogate() -) - -validation -``` - -The `~~Validate.warning()` method returns a dictionary mapping step numbers to boolean values. 
A -`True` value means that step exceeds the warning threshold: - -```{python} -# Get dictionary of warning status for each step -warning_status = validation.warning() -print(f"Warning status: {warning_status}") # {1: True} means step 1 exceeds warning threshold -``` - -You can check a specific step using the `i=` parameter, and get a single boolean with `scalar=True`: - -```{python} -# Check error threshold for specific step -has_errors = validation.error(i=1, scalar=True) - -if has_errors: - print("Step 1 exceeded the error threshold.") -``` - -Similarly, we can check if any steps exceed the 'critical' threshold: - -```{python} -# Check against critical threshold -critical_status = validation.critical() -print(f"Critical status: {critical_status}") -``` - -These methods are particularly useful for: - -1. Conditional logic: taking different actions based on threshold severity -2. Reporting: generating summary reports about validation quality -3. Monitoring: tracking data quality trends over time -4. Graceful degradation: implementing fallback logic when quality decreases - -Each method has these options: - -- without parameters: returns a dictionary mapping step numbers to boolean status values -- with `i=`: check specific step(s) -- with `scalar=True`: return a single boolean instead of a dictionary (when checking a specific -step) - -While assertion methods raise exceptions to halt execution when thresholds are exceeded, these -methods give you fine-grained control to implement custom logic based on different validation -quality levels. 
- -## Customizing Error Messages - -You can provide custom error messages when assertions fail to make them more meaningful in your -specific workflow context: - -```{python} -# Create a validation with potential failures -validation = ( - pb.Validate(data=sample_data, thresholds=(0.2, 0.3, 0.4)) - .col_vals_gt(columns="value", value=0) - .interrogate() -) - -# Display the validation results -validation -``` - -When you need to customize the error message that appears when an assertion fails, use the -`message=` parameter: - -```{python} -try: - # Custom message for threshold assertion - validation.assert_below_threshold( - level="warning", - message="Data quality too low for processing!" - ) -except AssertionError as e: - print(f"Custom handling of failure: {e}") -``` - -Descriptive error messages are essential in production systems where multiple team members might -need to interpret validation failures. The custom message lets you provide context appropriate to -your specific workflow or data pipeline stage. - -## Combining Assertions with Actions - -Actions and assertions serve complementary but distinct purposes in data validation workflows: - -- Actions trigger during validation but shouldn't raise errors (as this would halt report -generation) -- Assertions are designed to raise errors based on specific conditions, making them ideal for -flow control after validation completes - -Here's a simplified example showing how to use them together. 
The print statements simulate logging -or monitoring that would be valuable in production data pipelines: - -```{python} -# Define a simple action function (won't raise errors) -def notify_quality_issue(message="Data quality issue detected"): - print(f"ACTION TRIGGERED: {message}") - -# Create data with known failures -problem_data = pl.DataFrame({ - "id": [1, 2, 3, -4, 5], # One negative ID - "value": [10.5, 8.3, -2.1, 15.7, 7.2] # One negative value -}) - -# First use actions for automated responses during validation -print("Running validation with actions...") -validation = ( - pb.Validate(data=problem_data, thresholds=(0.1, 0.2, 0.3)) - .col_vals_gt( - columns="id", value=0, - brief="IDs must be positive", - actions=pb.Actions(warning=notify_quality_issue) - ) - .interrogate() # Actions trigger here but won't stop report generation -) - -# Then use assertions after validation for workflow control -print("\nNow using assertion for flow control...") -try: - validation.assert_below_threshold(level="warning") - print("This line won't execute if the assertion fails") -except AssertionError as e: - print(f"Validation failed threshold check: {e}") - print("Implementing fallback process...") -``` - -This approach gives you the best of both worlds: - -- Actions provide immediate notification during validation without interrupting the process -- Assertions control workflow execution after validation when important thresholds are exceeded - -This pattern works well in data pipelines where you want both: (1) automated responses during -validation and (2) clear decision points after validation is complete. - -## Best Practices for Assertions - -When using assertions in your data workflows, consider these best practices: - -1. **Choose the right assertion type**: - - use `~~Validate.assert_passing()` for critical validations where any failure is unacceptable - - use `~~Validate.assert_below_threshold()` for validations where some failure rate is acceptable - -2. 
**Set appropriate thresholds** that match your data quality requirements: - ```python - # Example threshold strategy - validation = pb.Validate( - data=sample_data, - # warning at 1%, error at 5%, critical at 10% - thresholds=pb.Thresholds(warning=0.01, error=0.05, critical=0.10) - ) - ``` - -3. **Use a graduated approach** to validation severity: - ```python - # Critical validations: must be perfect - validation_1.assert_passing() - - # Important validations: must be below error threshold - validation_2.assert_below_threshold(level="error") - - # Monitor-only validations: check warning status - warning_status = validation_3.warning() - ``` - -4. **Placement in pipelines**: place assertions at critical points where data quality is essential - -5. **Error handling**: wrap assertions in try-except blocks for better error handling in production -systems - -6. **Combine with reporting**: use both assertions and reporting approaches for comprehensive -quality control - -## Conclusion - -Pointblank's assertion methods give you flexible options for enforcing data quality requirements: - -- `~~Validate.assert_passing()` for strict validation where every test unit must pass -- `~~Validate.assert_below_threshold()` for more flexible validation where some failures are -tolerable -- Status methods (`~~Validate.warning()`, `~~Validate.error()`, and `~~Validate.critical()`) for -programmatic threshold checking - -By using these assertion methods appropriately, you can build robust data pipelines with different -levels of quality enforcement (from strict validation of critical data properties to more lenient -checks for less critical aspects). This graduated approach to data quality helps create systems that -are both reliable and practical in real-world data environments. 
diff --git a/docs/user-guide/briefs.qmd b/docs/user-guide/briefs.qmd deleted file mode 100644 index a78d33438f..0000000000 --- a/docs/user-guide/briefs.qmd +++ /dev/null @@ -1,294 +0,0 @@ ---- -title: Briefs -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -When validating data with Pointblank, it's often helpful to have descriptive labels for each -validation step. This is where *briefs* come in. A brief is a short description of what a validation -step is checking and it appears in the `STEP` column of the validation report table. Briefs make -your validation reports more readable and they help others understand what each step is verifying -without needing to look at the code. - -Briefs can be set in two ways: - -1. Globally: applied to all validation steps via the `brief=` parameter in `Validate` -2. Locally: set for individual validation steps via the `brief=` parameter in each validation method - -Understanding these two approaches to adding briefs gives you flexibility in how you document your -validation process. Global briefs provide consistency across all steps and save time when you want -similar descriptions throughout, while step-level briefs allow for precise customization when -specific validations need more detailed or unique explanations. In practice, many validation -workflows will combine both approaches (i.e., setting a useful global brief template while -overriding it for steps that require special attention). 
- -## Global Briefs - -To set a global brief that applies to all validation steps, use the `Validate(brief=)` parameter -when creating a `Validate` object: - -```{python} -import pointblank as pb -import polars as pl - -# Sample data -data = pl.DataFrame({ - "id": [1, 2, 3, 4, 5], - "value": [10, 20, 30, 40, 50], - "category": ["A", "B", "C", "A", "B"] -}) - -# Create a validation with a global brief -( - pb.Validate( - data=data, - - # Global brief template --- - brief="Step {step}: {auto}" - ) - .col_vals_gt(columns="value", value=5) - .col_vals_in_set(columns="category", set=["A", "B", "C"]) - .interrogate() -) -``` - -In this example, every validation step will have a brief description that follows the pattern -`"Step X: [auto-generated description]"`. - -This is a simple example of template-based briefs. Later in this guide, we'll explore the full range -of templating elements available for creating custom brief descriptions that precisely communicate -what each validation step is checking. - -## Step-level Briefs - -You can also set briefs for individual validation steps: - -```{python} -( - pb.Validate(data=data) - .col_vals_gt( - columns="value", value=5, - brief="Check if values exceed minimum threshold of 5" - ) - .col_vals_in_set( - columns="category", set=["A", "B", "C"], - brief="Verify categories are valid" - ) - .interrogate() -) -``` - -Local briefs override any global briefs that might be set. 
- -## Brief Templating - -Briefs support templating elements that get replaced with specific values: - -- `{auto}`: an auto-generated description of the validation -- `{step}`: the step number in the validation plan -- `{col}`: the column name(s) being validated -- `{value}`: the comparison value used in the validation (when applicable) -- `{thresholds}`: a short summary of threshold levels set (or unset) for the step -- `{segment}`, `{segment_column}`, `{segment_value}`: information on the step's segment - -Here's how to use these templates: - -```{python} -( - pb.Validate(data=data) - .col_vals_gt( - columns="value", value=5, - brief="Step {step}: Checking column '{col}' for values `> 5`" - ) - .col_vals_in_set( - columns="category", set=["A", "B", "C"], - brief="{auto} **(Step {step})**" - ) - .interrogate() -) -``` - -These template elements make briefs highly flexible and customizable. You can combine multiple -templating elements in a single brief to create descriptive yet concise validation step -descriptions. The templates help maintain consistency across your validation reports while providing -enough detail to understand what each step is checking. - -Note that not all templating elements will be relevant for every validation step. For instance, -`{value}` is only applicable to validation functions that hold a comparison value like -`~~Validate.col_vals_gt()`. If you include a templating element that isn't relevant to a particular -step, it will not be replaced with a corresponding value. - -Briefs support the use of Markdown formatting, allowing you to add emphasis with **bold** or -_italic_ text, include `inline code` formatting, or other Markdown elements to make your briefs more -visually distinctive and informative. This can be especially helpful when you want certain parts of -your briefs to stand out in the validation report. - -## Automatic Briefs - -If you want Pointblank to generate briefs for you automatically, you can set `brief=True`.
Here, -we'll make that setting at the global level (by using `Validate(brief=True)`): - -```{python} -( - pb.Validate( - data=data, - - # Setting for automatically generated briefs --- - brief=True - ) - .col_vals_gt(columns="value", value=5) - .col_vals_in_set(columns="category", set=["A", "B", "C"]) - .interrogate() -) -``` - -Automatic briefs are descriptive and include information about what's being validated, including the -column names and the validation conditions. - -## Briefs Localized to a Specified Language - -When using the `lang=` parameter in `Validate`, automatically generated briefs will be created in -the specified language (along with other elements of the validation report table): - -```{python} -( - pb.Validate( - data=data, - - # Setting the language as Spanish --- - lang="es", - - # Automatically generate all briefs in Spanish - brief=True - ) - .col_vals_gt(columns="value", value=5) - .col_vals_in_set(columns="category", set=["A", "B", "C"]) - .interrogate() -) -``` - -When using the `lang=` parameter in combination with the `{auto}` templating element, the -auto-generated portion of the brief will also be translated to the specified language. This makes it -possible to create fully localized validation reports where both custom text and auto-generated -descriptions appear in the same language. - -Pointblank supports several languages for localized briefs, including French (`"fr"`), German -(`"de"`), Spanish (`"es"`), Italian (`"it"`), and Portuguese (`"pt"`). For the complete list of -supported languages, refer to the `Validate` documentation. 
- -## Disabling Briefs - -If you've set a global brief but want to disable it for specific validation steps, you can set -`brief=False`: - -```{python} -( - pb.Validate( - data=data, - - # Global brief template --- - brief="Step {step}: {auto}" - ) - .col_vals_gt(columns="value", value=5) # This step uses the global brief setting - .col_vals_in_set( - columns="category", - set=["A", "B", "C"], - - # No brief for this step --- - brief=False - ) - .interrogate() -) -``` - -## Practical Example: Comprehensive Validation with Briefs - -In real-world data validation scenarios, you'll likely work with more complex datasets and apply -various types of validation checks. This final example brings together many of the brief-generating -techniques we've covered, showing how you can mix different approaches in a single validation -workflow. - -```{python} -# Create a slightly larger dataset -data_2 = pl.DataFrame({ - "id": [1, 2, 3, 4, 5, 6, 7, 8], - "value": [10, 20, 30, 40, 50, 60, 70, 80], - "ratio": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], - "category": ["A", "B", "C", "A", "B", "C", "A", "B"], - "date": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", - "2023-01-05", "2023-01-06", "2023-01-07", "2023-01-08"] -}) - -( - pb.Validate(data=data_2) - .col_vals_gt( - columns="value", value=0, - - # Plaintext brief --- - brief="All values must be positive." - ) - .col_vals_between( - columns="ratio", left=0, right=1, - - # Template-based brief --- - brief="**Step {step}**: Ratios should be between `0` and `1`." 
- ) - .col_vals_in_set( - columns="category", set=["A", "B", "C"], - - # Automatically generated brief --- - brief=True - ) - .interrogate() -) -``` - -The example above demonstrates: - -- plaintext briefs with direct messages -- template-based briefs with Markdown formatting -- automatically generated briefs (`brief=True`) - -By combining these different brief styles, you can create validation reports that are informative, -consistent, and tailored to your specific data quality requirements. - -## Best Practices for Using Briefs - -Well-crafted briefs can significantly enhance the readability and usefulness of your validation -reports. Here are some guidelines to follow: - -1. Be concise: briefs should be short and to the point; they're meant to quickly communicate the -purpose of a validation step - -2. Be specific: include relevant details or conditions that make the validation meaningful - -3. Use templates consistently: if you're using template elements like `"{step}"` or `"{col}"`, try -to use them consistently across all briefs for a cleaner look - -4. Use auto-generated briefs as a starting point: you can start with `Validate(brief=True)` to see -what Pointblank generates automatically, then customize as needed - -5. Add custom briefs for complex validations: custom briefs are especially useful for complex -validations where the purpose might not be immediately obvious from the code - -Following these best practices will help ensure your validation reports are easy to understand for -everyone who needs to review them. - -## Conclusion - -Briefs help make validation reports more readable and understandable. By using global briefs, -step-level briefs, or a combination of both, you can create validation reports that clearly -communicate what each validation step is checking. 
- -Whether you want automatically generated descriptions or precisely tailored custom messages, the -brief system provides the flexibility to make your data validation work more transparent and easier -to interpret for all stakeholders. diff --git a/docs/user-guide/cli-data-inspection.qmd b/docs/user-guide/cli-data-inspection.qmd deleted file mode 100644 index 63ba2d509b..0000000000 --- a/docs/user-guide/cli-data-inspection.qmd +++ /dev/null @@ -1,218 +0,0 @@ ---- -title: Data Inspection -jupyter: python3 -toc-expand: 2 -bread-crumbs: true ---- - -Pointblank’s CLI (`pb`) makes it easy to view your data before running validations. It has several -commands that are exceedingly useful for understanding your data’s structure, checking for obvious -issues, and confirming that your data source is being read correctly. We also make it easy to -explore data in various formats and locations. Let's go through each of the commands for inspecting -and exploring data. - -## `pb info`: Inspecting the Data Structure - -Use `pb info` to display basic information about your data source. Here's how this works with a -local CSV file: - -```bash -pb info worldcities.csv -``` - -![](/assets/pb-info-worldcities-csv.png){width=100%} - -This command shows (1) the table type (e.g., `pandas`, `polars`, etc.), (2) the number of rows and -columns, and (3) the data source path or identifier. - -That example used a local CSV file. The same file is also present in Pointblank's GitHub repository -(in the `data_raw` directory) and the CLI is able to load the data from there as well: - -```bash -pb info https://github.com/posit-dev/pointblank/blob/main/data_raw/worldcities.csv -``` - -![](/assets/pb-info-worldcities-github-csv.png){width=100%} - -The `pb info` command is useful before running validations to confirm your data source's dimensions -and whether it can even be loaded. - -::: {.callout-note} - -You can inspect a wide variety of data sources using the CLI!
Here are some examples with `pb info`: - -```bash -pb info small_table # built-in dataset -pb info worldcities.csv # single CSV file -pb info meteo.parquet # single Parquet file -pb info "*.parquet" # several Parquet files -pb info "data/*.parquet" # partitioned Parquet files -pb info "duckdb:///warehouse/analytics.ddb::customer_metrics" # DB table via connection string -pb info https://github.com/posit-dev/pointblank/blob/main/data_raw/global_sales.csv # GitHub URL -``` - -And these input schemes work with all other commands that accept a `DATA_SOURCE`. -::: - -## `pb preview`: Previewing Data - -Use `pb preview` to view the first and last rows of your data. Let's try it out with the -`worldcities.csv` file: - -```bash -pb preview worldcities.csv -``` - -![](/assets/pb-preview-worldcities-csv.png){width=100%} - -As can be seen, `pb preview` gives you a preview of the dataset as a table in the console. The -dataset has 41K rows but we're electing to show only five rows from the head and from the tail. - -Let's go over some features of the table preview. First off, the table header provides information -on the data source and the DataFrame library that handled the reading of the CSV. Below the column -names are simplified representations of the data types (e.g., `<obj>` for `object`, `<f64>` for -`Float64`). We provide row numbers (in gray) in the table stub to indicate which of the rows are -from the head or the tail (and a divider helps to distinguish these row groups). If you'd prefer -to eliminate the row numbers, use the `--no-row-numbers` option: - -```bash -pb preview worldcities.csv --no-row-numbers -``` - -![](/assets/pb-preview-worldcities-csv-no-row-numbers.png){width=100%} - -While `pb preview` purposefully displays only a few rows, the number of columns shown can be more -than you might need. Furthermore, if a table has *a lot* of columns, you'll only see some of the -first and some of the last columns.
This is where column selection becomes useful and there are a -few methods available for subsetting the preview table's columns. A good one (provided you know the -column names) is to use the `--columns` option along with a comma-delimited set of column names. -Let's look at a preview of the included `game_revenue` dataset before subsetting the columns: - -```bash -pb preview game_revenue -``` - -![](/assets/pb-preview-game_revenue-all-columns.png){width=100%} - -That's 11 columns in total and while all the columns *are* shown (i.e., none in the middle are -truncated from view), we start to see some necessary instances of abbreviating via `…` within the -column names and in the displayed values. - -Let's now use the `--columns` option with a set of column names: - -```bash -pb preview game_revenue --columns "player_id, item_type, item_name, start_day" -``` - -![](/assets/pb-preview-game_revenue-column-names.png){width=100%} - -With that, the few columns that are displayed no longer have to abbreviate their data values. This -is an important consideration since a selective display of columns becomes more necessary if column -content is large or if the width of the terminal (in terms of characters) cannot be increased. - -You may want to view ranges of columns by their indices. This is convenient when you want to get a -closer look at a few side-by-side columns and you don't want to bother with getting the set of -column names exactly right (i.e., for quick inspection). For this, we need to use the `--col-range` -option with the desired left/right column bounds separated by a colon: - -```bash -pb preview game_revenue --col-range "3:6" -``` - -![](/assets/pb-preview-game_revenue-column-range.png){width=100%} - -In the case that you want to save a table preview as an HTML table in a standalone file, you can add -in the `--output-html` option (just add a path/filename with an .html extension).
- -And there are many more options that allow for quick iteration while previewing a table. Use -`pb preview --help` to get a helpful listing. - -## `pb scan`: Getting Column Summaries - -We can use `pb scan` for fairly comprehensive summaries of column data, including: - -- data types -- missing value counts -- unique value counts -- summary statistics (mean, standard deviation, min, max, quartiles, and the interquartile range) - -Let's use this on the `worldcities.csv` dataset: - -```bash -pb scan worldcities.csv -``` - -![](/assets/pb-scan-worldcities-csv.png){width=100%} - -Each row in the summary table represents a column in the input dataset. Just as in `pb preview` we -get simplified dtypes (in the `Type` column). The `NA` and `UQ` indicate how many missing and unique -values are in the column. The remaining columns are statistical measures and there's an important -thing to note here: the values provided for any string-based columns (here, `city_name` and -`country`) are derived from string lengths. - -When using `pb scan`, it's helpful to know that large numbers in the summary table are automatically -abbreviated for readability, so you'll see values like `39.8k` or `38.0M` instead of long numbers -that would require many more characters. For the best experience, try to use a terminal window -that's at least 150 characters wide. This will help ensure that all column values are fully visible -and not adversely abbreviated by the underlying table mechanism. - -If your table has many columns, that's not much of a problem for the reporting! Each column is -represented as a row in the report, so you'll simply see more lines in the output (and you could -always limit the number of columns reported). - -There are two options for `pb scan`: - -- `--columns "col1,col2"`: scan only specified columns -- `--output-html "file.html"`: save scan as an HTML file - -Both of these options are also in the `pb preview` command and they behave the same way here. 
- -## `pb missing`: Reporting on Missing Values - -Use `pb missing` to generate a missing values report, visualizing missingness across columns and 10 -*row sectors*. Here's an example using `worldcities.csv`: - -```bash -pb missing worldcities.csv -``` - -![](/assets/pb-missing-worldcities-csv.png){width=100%} - -This report is arranged similarly to that of `pb scan`, where each column in the input table gets a -row in this report table. Each of the 10 row sectors represents 1/10 of the rows in the dataset, -where sector `1` encompasses the head of the table, and `10` the tail. - -More often than not, we expect few missing values so a filled green circle signifies that the -collection of rows in a sector (for a column) has no missing values. We don't see any red circles in -the `worldcities.csv`-based example but, if we did, that would mean that sectors for a given column -are entirely filled with missing values. - -What's in between the no-missing and completely-missing cases are percentages of missing values. For -instance, we can see that row sector `3` of the `population` column has 18% missing values (which is -very odd for a table with the sole purpose of providing population values). - -We also have cases where we see <1% of values in a row sector missing. The reporting of `pb missing` -is very careful not to 'round down' in cases where there could be very few missing values (or even -just one) in a large table. - -Seeing this type of missing value report can be really important! You might not expect *any* missing -values but finding them will inform decisions on whether to institute checks for them. Another case -is that missing values will pop up in specific sectors, indicating a change in how data is processed -and appended to the table. - -By way of options, there's only one for `pb missing` and it is `--output-html`. With that (as in the -previous two commands discussed), we can write the missing values report to a standalone HTML file. 
- -## Wrapping Up - -Pointblank’s CLI provides a set of commands that make it easy to inspect, understand, and diagnose -your data before you move on to validation or analysis. Using these tools can help you catch issues -early and gain confidence in your data sources. - -- use `pb info` before running validations to confirm your data source can be loaded -- use `pb preview` to quickly understand what the data looks like -- use `pb scan` for a quick data profile and to spot outliers or data quality issues -- use `pb missing` to visualize and diagnose missing data patterns - -By incorporating these commands into your workflow, you’ll be better equipped to work efficiently -with your data (and avoid surprises down the line). diff --git a/docs/user-guide/cli-data-validation.qmd b/docs/user-guide/cli-data-validation.qmd deleted file mode 100644 index e8a023adb0..0000000000 --- a/docs/user-guide/cli-data-validation.qmd +++ /dev/null @@ -1,264 +0,0 @@ ---- -title: Data Validation -jupyter: python3 -toc-expand: 2 -bread-crumbs: true ---- - -Validating data directly in the terminal with the Pointblank CLI offers a fast, scriptable, and -repeatable way to check your data. This approach is especially useful for quick checks, CI/CD -pipelines, and automation workflows, where you want immediate feedback and clear pass/fail results. - -The CLI commands are designed for efficiency: you can run validations with a single line, integrate -them easily into shell scripts or data pipelines, and benefit from clear, color-coded output that’s -easy to interpret at a glance. - -The `pb validate` command lets you perform common validation checks directly on your data source -with a simple command-line interface. This works well both for quick, one-off checks and for use in -automated pipelines. 
- -For more complex validation logic, the `pb run` command serves as a runner for validation scripts -written with the Pointblank Python API, allowing you to execute custom validation workflows from the -command line. - -## `pb validate`: Quick, One-Line Data Checks - -The `pb validate` command is your go-to for running common validation checks directly on your data -source. It’s perfect for quick, one-off checks or for use in automated pipelines. You specify -exactly which check you want to run using the `--check` option, making your intent clear and your -validation explicit. - -Here’s how you construct a validation command: - -```bash -pb validate worldcities.csv --check CHECK_TYPE [other options] -``` - -You always provide the data source first, then specify one or more checks with `--check`. Each check -can have its own options, such as `--column` or `--value`, depending on what you want to validate. - -### Checking for Duplicate and Complete Rows - -To check for duplicate rows, use the `rows-distinct` check: - -```bash -pb validate worldcities.csv --check rows-distinct -``` - -![](/assets/pb-validate-rows-distinct-worldcities-csv.png){width=100%} - -The output shows you whether your data contains any duplicate rows, how many rows were checked, and -if any duplicates were found. The color-coding of the results helps you quickly interpret the -results, using green for pass and red for fail. Here, no duplicate rows were detected out of the 41K -rows checked. - -To check that every row is complete (i.e., no missing values in any column), use the `rows-complete` -check: - -```bash -pb validate worldcities.csv --check rows-complete -``` - -![](/assets/pb-validate-rows-complete-worldcities-csv.png){width=100%} - -With this check we see that the `worldcities.csv` dataset has 739 rows containing at least one -Null/missing value. And with any dataset, it's easy to quickly spot if there are any rows with -missing data using this command. 
- -### Checking for Nulls and Value Ranges - -You can easily check for missing values in a column, or ensure that values fall within a certain -range. Here’s how to check that all values in the `city_name` column are not null: - -```bash -pb validate worldcities.csv --check col-vals-not-null --column city_name -``` - -![](/assets/pb-validate-worldcities-not-null-city_name.png){width=100%} - -Perhaps surprisingly, we find that one row has a missing city name. - -Let's now check whether all values in the `population` column are greater than zero: - -```bash -pb validate worldcities.csv --check col-vals-gt --column population --value 0 -``` - -![](/assets/pb-validate-worldcities-gt-0-population.png){width=100%} - -With that we find that there are 741 rows where the `population` value is not greater than 0 (note -that this check also fails when cells are null or missing). - -### Multiple Checks in One Command - -You can chain several checks together in a single command. This is handy for comprehensive data -quality checks: - -```bash -pb validate worldcities.csv --check rows-distinct --check col-vals-not-null --column city_name --check col-vals-gt --column population --value 0 -``` - -![](/assets/pb-validate-multi-check.png){width=100%} - -Each check is shown one after the other in the terminal output, so you can review the result of each -validation step individually as the command proceeds. - -### Seeing and Saving Failing Rows - -If a check fails, you might want to see which rows caused the failure. 
Use the `--show-extract` -option to display failing rows right in the terminal: - -```bash -pb validate worldcities.csv --check rows-complete --show-extract -``` - -![](/assets/pb-validate-show-extract.png){width=100%} - -Or, save the failing rows to a CSV file for further investigation: - -```bash -pb validate worldcities.csv --check rows-complete --show-extract --write-extract incomplete_failing_rows -``` - -![](/assets/pb-validate-write-extract.png){width=100%} - -Note here in the output the additional lines stating that failing rows were saved to a folder -(`incomplete_failing_rows`) and, within that folder the `step_01_rows_complete.csv` file was -written. Using a folder for extracts is necessary in practice since there may be multiple -validations defined in a `pb validate` command. - -### Advanced Options and CI/CD Integration - -- use `--exit-code` to make the command exit with a non-zero code if any check fails; useful for -CI/CD pipelines -- use `--limit` to control how many failing rows are shown or saved -- use `--list-checks` to see all available validation checks and their options - -```bash -pb validate worldcities.csv --check col-vals-not-null --column city_name --exit-code -``` - -![](/assets/pb-validate-exit-code.png){width=100%} - -## `pb run`: Custom Validation Workflows with Python - -For more complex validation logic, use the `pb run` command. This lets you execute a Python script -containing Pointblank validation steps, combining the flexibility of the Python API with the -convenience of the CLI. - -You can always scaffold a template script using the `pb make-template` command: - -```bash -pb make-template my_validation.py -``` - -![](/assets/pb-make-template.png){width=100%} - -But for our example, we'll elect to make our own `worldcities_validation.py` file from scratch. 
It -will: - -- use the `worldcities.csv` file -- apply two thresholds (one for 'warning', another for 'error') -- have six validation steps - -Here's what it looks like: - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data="worldcities.csv", - thresholds=pb.Thresholds( - warning=1, # 1 failure - error=0.05, # 5% of rows failing - ), - ) - .col_schema_match( - schema=pb.Schema( - columns=[ - ("city_name", "object"), - ("latitude", "float64"), - ("longitude", "float64"), - ("country", "object"), - ("population", "float64"), - ] - ), - ) - .col_vals_not_null(columns="city_name") - .col_vals_not_null(columns="population") - .col_vals_gt(columns="population", value=0, na_pass=True) - .col_vals_between(columns="latitude", left=-90, right=90) - .col_vals_between(columns="longitude", left=-180, right=180) - .interrogate() -) -``` - -Now, we'll run the .py script from the terminal: - -```bash -pb run worldcities_validation.py -``` - -![](/assets/pb-run-worldcities_validation.png){width=100%} - -You’ll see a summary table that lists all of the steps and their results and you can include as many -steps and as much logic as you need. - -### Output Options - -You could save the validation report as HTML or JSON (or both) for the purposes of sharing or -for automation: - -```bash -pb run worldcities_validation.py --output-html report.html --output-json report.json -``` - -![](/assets/pb-run-worldcities_validation-output.png){width=100%} - -There are also the options to produce extracts (subset of failing rows) with `--show-extract` or -`--write-extract` (just like with `pb validate`). Let's do both in the following example: - -```bash -pb run worldcities_validation.py --show-extract --write-extract worldcities_failures -``` - -![](/assets/pb-run-worldcities_validation-extracts.png){width=100%} - -This shows a preview of each extract for those validation steps where extracts were produced (steps -2, 3, and 4). 
Individual CSV files with extracted rows for those steps were written to the -`worldcities_failures` directory. - -### Controlling Failure Behavior - -It's possible to use the `--fail-on` option to control when the command should exit with an error, -based on the severity of validation failures. This is especially useful for automated workflows and -CI/CD pipelines. - -Let's try that with our `worldcities_validation.py` validation, which we've seen exceeds the -'warning' threshold in steps 2, 3, and 4: - -```bash -pb run worldcities_validation.py --fail-on warning -``` - -![](/assets/pb-run-worldcities_validation-fail-on-warning.png){width=100%} - -Notice the final line states `Exiting with error due to warning, error, or critical validation -failures`. Because we applied `--fail-on warning`, any presence of 'warning' (or higher levels such -as 'error' or 'critical') will yield a non-zero exit code that should stop a pipeline process. We -can prove this by running the following lines in the terminal: - -```bash -pb run worldcities_validation.py --fail-on warning > /dev/null 2>&1 -echo $? -``` - -which returns `1`. - -## Wrapping Up - -Pointblank’s CLI gives you powerful tools for validating your data, whether you need a quick check -or a custom workflow. Use `pb validate` for fast, one-liner checks and `pb run` for more advanced, -scriptable validation logic. With clear output and flexible options, you can catch data issues early -and keep your workflows running smoothly. diff --git a/docs/user-guide/cli-reference.qmd b/docs/user-guide/cli-reference.qmd deleted file mode 100644 index 62dad778de..0000000000 --- a/docs/user-guide/cli-reference.qmd +++ /dev/null @@ -1,454 +0,0 @@ ---- -title: CLI Reference -jupyter: python3 -toc-expand: 2 -bread-crumbs: true ---- - -This page provides a complete reference for all Pointblank CLI commands. Each section shows the full help text as it appears in the terminal, giving you quick access to all available options and examples. 
- -For practical usage examples and workflows, see the [CLI Data Validation](cli-data-validation.qmd) and [CLI Data Inspection](cli-data-inspection.qmd) guides. - -## `pb` - Main Command - -The main entry point for all Pointblank CLI operations: - -> ``` -> Usage: pb [OPTIONS] COMMAND [ARGS]... -> -> Pointblank CLI: Data validation and quality tools for data engineers. -> -> Use this CLI to validate data quality, explore datasets, and generate -> comprehensive reports for CSV, Parquet, and database sources. Suitable for -> data pipelines, ETL validation, and exploratory data analysis from the -> command line. -> -> Quick Examples: -> -> pb preview data.csv Preview your data -> pb scan data.csv Generate data profile -> pb validate data.csv Run basic validation -> -> Use pb COMMAND --help for detailed help on any command. -> -> Options: -> -v, --version Show the version and exit. -> -h, --help Show this message and exit. -> -> Commands: -> info Display information about a data source. -> preview Preview a data table showing head and tail rows. -> scan Generate a data scan profile report. -> missing Generate a missing values report for a data table. -> validate Perform single or multiple data validations. -> run Run a Pointblank validation script or YAML configuration. -> make-template Create a validation script or YAML configuration template. -> pl Execute Polars expressions and display results. -> datasets List available built-in datasets. -> requirements Check installed dependencies and their availability. -> ``` - -## `pb info` - Data Source Information - -Display basic information about a data source: - -> ``` -> Usage: pb info [OPTIONS] [DATA_SOURCE] -> -> Display information about a data source. -> -> Shows table type, dimensions, column names, and data types. 
-> -> DATA_SOURCE can be: -> -> - CSV file path (e.g., data.csv) -> - Parquet file path or pattern (e.g., data.parquet, data/*.parquet) -> - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv) -> - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name) -> - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales) -> -> Options: -> --help Show this message and exit. -> ``` - -## `pb preview` - Data Table Preview - -Preview data showing head and tail rows: - -> ``` -> Usage: pb preview [OPTIONS] [DATA_SOURCE] -> -> Preview a data table showing head and tail rows. -> -> DATA_SOURCE can be: -> -> - CSV file path (e.g., data.csv) -> - Parquet file path or pattern (e.g., data.parquet, data/*.parquet) -> - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv) -> - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name) -> - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales) -> - Piped data from pb pl command -> -> COLUMN SELECTION OPTIONS: -> -> For tables with many columns, use these options to control which columns are -> displayed: -> -> - --columns: Specify exact columns (e.g., --columns "name,age,email") -> - --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15") -> - --col-first: Show first N columns (e.g., --col-first 5) -> - --col-last: Show last N columns (e.g., --col-last 3) -> -> Tables with >15 columns automatically show first 7 and last 7 columns with -> indicators. 
-> -> Options: -> --columns TEXT Comma-separated list of columns to display -> --col-range TEXT Column range like '1:10' or '5:' or ':15' -> (1-based indexing) -> --col-first INTEGER Show first N columns -> --col-last INTEGER Show last N columns -> --head INTEGER Number of rows from the top (default: 5) -> --tail INTEGER Number of rows from the bottom (default: 5) -> --limit INTEGER Maximum total rows to display (default: 50) -> --no-row-numbers Hide row numbers -> --max-col-width INTEGER Maximum column width in pixels (default: 250) -> --min-table-width INTEGER Minimum table width in pixels (default: 500) -> --no-header Hide table header -> --output-html PATH Save HTML output to file -> --help Show this message and exit. -> ``` - -## `pb scan` - Data Profile Reports - -Generate comprehensive data profiles: - -> ``` -> Usage: pb scan [OPTIONS] [DATA_SOURCE] -> -> Generate a data scan profile report. -> -> Produces a comprehensive data profile including: -> -> - Column types and distributions -> - Missing value patterns -> - Basic statistics -> - Data quality indicators -> -> DATA_SOURCE can be: -> -> - CSV file path (e.g., data.csv) -> - Parquet file path or pattern (e.g., data.parquet, data/*.parquet) -> - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv) -> - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name) -> - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales) -> - Piped data from pb pl command -> -> Options: -> --output-html PATH Save HTML scan report to file -> -c, --columns TEXT Comma-separated list of columns to scan -> --help Show this message and exit. -> ``` - -## `pb missing` - Missing Values Reports - -Generate reports focused on missing values: - -> ``` -> Usage: pb missing [OPTIONS] [DATA_SOURCE] -> -> Generate a missing values report for a data table. 
-> -> DATA_SOURCE can be: -> -> - CSV file path (e.g., data.csv) -> - Parquet file path or pattern (e.g., data.parquet, data/*.parquet) -> - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv) -> - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name) -> - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales) -> - Piped data from pb pl command -> -> Options: -> --output-html PATH Save HTML output to file -> --help Show this message and exit. -> ``` - -## `pb validate` - Quick Data Validations - -Perform single or multiple data validations: - -> ``` -> Usage: pb validate [OPTIONS] [DATA_SOURCE] -> -> Perform single or multiple data validations. -> -> Run one or more validation checks on your data in a single command. Use -> multiple --check options to perform multiple validations. -> -> DATA_SOURCE can be: -> -> - CSV file path (e.g., data.csv) -> - Parquet file path or pattern (e.g., data.parquet, data/*.parquet) -> - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv) -> - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name) -> - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales) -> -> AVAILABLE CHECK_TYPES: -> -> Require no additional options: -> -> - rows-distinct: Check if all rows in the dataset are unique (no duplicates) -> - rows-complete: Check if all rows are complete (no missing values in any column) -> -> Require --column: -> -> - col-exists: Check if a specific column exists in the dataset -> - col-vals-not-null: Check if all values in a column are not null/missing -> -> Require --column and --value: -> -> - col-vals-gt: Check if column values are greater than a fixed value -> - col-vals-ge: Check if column values are greater than or equal to a fixed value -> - col-vals-lt: Check if column values are less than a fixed value -> - col-vals-le: Check if column values are less than or equal to a fixed 
value -> -> Require --column and --set: -> -> - col-vals-in-set: Check if column values are in an allowed set -> -> Use --list-checks to see all available validation methods with examples. The -> default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows. -> -> Examples: -> -> pb validate data.csv # Uses default validation (rows-distinct) -> pb validate data.csv --list-checks # Show all available checks -> pb validate data.csv --check rows-distinct -> pb validate data.csv --check rows-distinct --show-extract -> pb validate data.csv --check rows-distinct --write-extract failing_rows_folder -> pb validate data.csv --check rows-distinct --exit-code -> pb validate data.csv --check col-exists --column price -> pb validate data.csv --check col-vals-not-null --column email -> pb validate data.csv --check col-vals-gt --column score --value 50 -> pb validate data.csv --check col-vals-in-set --column status --set "active,inactive,pending" -> -> Multiple validations in one command: pb validate data.csv --check rows- -> distinct --check rows-complete -> -> Options: -> --list-checks List available validation checks and exit -> --check CHECK_TYPE Type of validation check to perform. Can be used -> multiple times for multiple checks. -> --column TEXT Column name or integer position as #N (1-based index) -> for validation. -> --set TEXT Comma-separated allowed values for col-vals-in-set -> checks. -> --value FLOAT Numeric value for comparison checks. -> --show-extract Show extract of failing rows if validation fails -> --write-extract TEXT Save failing rows to folder. Provide base name for -> folder. -> --limit INTEGER Maximum number of failing rows to save to CSV -> (default: 500) -> --exit-code Exit with non-zero code if validation fails -> --help Show this message and exit. 
-> ``` - -## `pb run` - Validation Scripts and YAML - -Run Python validation scripts or YAML configurations: - -> ``` -> Usage: pb run [OPTIONS] [VALIDATION_FILE] -> -> Run a Pointblank validation script or YAML configuration. -> -> VALIDATION_FILE can be: - A Python file (.py) that defines validation logic -> - A YAML configuration file (.yaml, .yml) that defines validation steps -> -> Python scripts should load their own data and create validation objects. -> YAML configurations define data sources and validation steps declaratively. -> -> If --data is provided, it will automatically replace the data source in your -> validation objects (Python scripts) or override the 'tbl' field (YAML -> configs). -> -> To get started quickly, use 'pb make-template' to create templates. -> -> DATA can be: -> -> - CSV file path (e.g., data.csv) -> - Parquet file path or pattern (e.g., data.parquet, data/*.parquet) -> - GitHub URL to CSV/Parquet (e.g., https://github.com/user/repo/blob/main/data.csv) -> - Database connection string (e.g., duckdb:///path/to/db.ddb::table_name) -> - Dataset name from pointblank (small_table, game_revenue, nycflights, global_sales) -> -> Examples: -> -> pb make-template my_validation.py # Create a Python template -> pb run validation_script.py -> pb run validation_config.yaml -> pb run validation_script.py --data data.csv -> pb run validation_config.yaml --data small_table --output-html report.html -> pb run validation_script.py --show-extract --fail-on error -> pb run validation_config.yaml --write-extract extracts_folder --fail-on critical -> -> Options: -> --data TEXT Data source to replace in validation objects -> (Python scripts and YAML configs) -> --output-html PATH Save HTML validation report to file -> --output-json PATH Save JSON validation summary to file -> --show-extract Show extract of failing rows if validation -> fails -> --write-extract TEXT Save failing rows to folders (one CSV per -> step). Provide base name for folder. 
-> --limit INTEGER Maximum number of failing rows to save to -> CSV (default: 500) -> --fail-on [critical|error|warning|any] -> Exit with non-zero code when validation -> reaches this threshold level -> --help Show this message and exit. -> ``` - -## `pb make-template` - Template Generation - -Create validation script or YAML configuration templates: - -> ``` -> Usage: pb make-template [OPTIONS] [OUTPUT_FILE] -> -> Create a validation script or YAML configuration template. -> -> Creates a sample Python script or YAML configuration with examples showing -> how to use Pointblank for data validation. The template type is determined -> by the file extension: - .py files create Python script templates - -> .yaml/.yml files create YAML configuration templates -> -> Edit the template to add your own data loading and validation rules, then -> run it with 'pb run'. -> -> OUTPUT_FILE is the path where the template will be created. -> -> Examples: -> -> pb make-template my_validation.py # Creates Python script template -> pb make-template my_validation.yaml # Creates YAML config template -> pb make-template validation_template.yml # Creates YAML config template -> -> Options: -> --help Show this message and exit. -> ``` - -## `pb pl` - Polars Expression Execution - -Execute Polars expressions and display results: - -> ``` -> Usage: pb pl [OPTIONS] [POLARS_EXPRESSION] -> -> Execute Polars expressions and display results. -> -> Execute Polars DataFrame operations from the command line and display the -> results using Pointblank's visualization tools. -> -> POLARS_EXPRESSION should be a valid Polars expression that returns a -> DataFrame. The 'pl' module is automatically imported and available. 
-> -> Examples: -> -> # Direct expression -> pb pl "pl.read_csv('data.csv')" -> pb pl "pl.read_csv('data.csv').select(['name', 'age'])" -> pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" -> -> # Multi-line with editor (supports multiple statements) -> pb pl --edit -> -> # Multi-statement code example in editor: -> # csv = pl.read_csv('data.csv') -> # result = csv.select(['name', 'age']).filter(pl.col('age') > 25) -> -> # Multi-line with a specific editor -> pb pl --edit --editor nano -> pb pl --edit --editor code -> pb pl --edit --editor micro -> -> # From file -> pb pl --file query.py -> -> Piping to other pb commands -> pb pl "pl.read_csv('data.csv').head(20)" --pipe | pb validate --check rows-distinct -> pb pl --edit --pipe | pb preview --head 10 -> pb pl --edit --pipe | pb scan --output-html report.html -> pb pl --edit --pipe | pb missing --output-html missing_report.html -> -> Use --output-format to change how results are displayed: -> pb pl "pl.read_csv('data.csv')" --output-format scan -> pb pl "pl.read_csv('data.csv')" --output-format missing -> pb pl "pl.read_csv('data.csv')" --output-format info -> -> Note: For multi-statement code, assign your final result to a variable like -> 'result', 'df', 'data', or ensure it's the last expression. -> -> Options: -> -e, --edit Open editor for multi-line input -> -f, --file PATH Read query from file -> --editor TEXT Editor to use for --edit mode (overrides -> $EDITOR and auto-detection) -> -o, --output-format [preview|scan|missing|info] -> Output format for the result -> --preview-head INTEGER Number of head rows for preview -> --preview-tail INTEGER Number of tail rows for preview -> --output-html PATH Save HTML output to file -> --pipe Output data in a format suitable for piping -> to other pb commands -> --pipe-format [parquet|csv] Format for piped output (default: parquet) -> --help Show this message and exit. 
-> ``` - -## `pb datasets` - Built-in Datasets - -List available built-in datasets: - -> ``` -> Usage: pb datasets [OPTIONS] -> -> List available built-in datasets. -> -> Options: -> --help Show this message and exit. -> ``` - -## `pb requirements` - Dependency Check - -Check installed dependencies and their availability: - -> ``` -> Usage: pb requirements [OPTIONS] -> -> Check installed dependencies and their availability. -> -> Options: -> --help Show this message and exit. -> ``` - -## Common Data Source Types - -All commands that accept a `DATA_SOURCE` parameter support these formats: - -- **CSV files**: `data.csv`, `path/to/data.csv` -- **Parquet files**: `data.parquet`, `data/*.parquet` (patterns supported) -- **GitHub URLs**: `https://github.com/user/repo/blob/main/data.csv` -- **Database connections**: `duckdb:///path/to/db.ddb::table_name` -- **Built-in datasets**: `small_table`, `game_revenue`, `nycflights`, `global_sales` -- **Piped data**: Output from `pb pl` command (where supported) - -## Exit Codes and Automation - -Many commands support options useful for automation and CI/CD: - -- `--exit-code`: Exit with non-zero code on validation failure -- `--fail-on [critical|error|warning|any]`: Control failure thresholds -- `--output-html`, `--output-json`: Save reports for external consumption -- `--write-extract`: Save failing rows for investigation - -These features make Pointblank CLI commands suitable for integration into data pipelines, quality gates, and automated workflows. 
diff --git a/docs/user-guide/col-summary-tbl.qmd b/docs/user-guide/col-summary-tbl.qmd deleted file mode 100644 index db90672473..0000000000 --- a/docs/user-guide/col-summary-tbl.qmd +++ /dev/null @@ -1,102 +0,0 @@ ---- -title: Column Summaries -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -``` - -While previewing a table with `preview()` is undoubtedly a good thing to do, sometimes you need -more. This is where summarizing a table comes in. When you view a summary of a table, the -column-by-column info can quickly increase your understanding of a dataset. Plus, it allows you to -quickly catch anomalies in your data (e.g., the maximum value of a column could be far outside the -realm of possibility). - -Pointblank provides a function to make it extremely easy to view column-level summaries in a single -table. That function is called `col_summary_tbl()` and, just like `preview()` does, it supports the -use of any table that Pointblank can use for validation. And no matter what the input data is, the -resultant reporting table is consistent in its design and construction. - -## Trying out `col_summary_tbl()`{.qd-no-link} - -The function only requires a table. Let's use the `small_table` dataset (a very simple table) to -start us off: - -```{python} -import pointblank as pb - -small_table = pb.load_dataset(dataset="small_table", tbl_type="polars") - -pb.col_summary_tbl(small_table) -``` - -The header provides the type of table we're looking at (`POLARS`, since this is a Polars DataFrame) -and the table dimensions. The rest of the table focuses on the column-level summaries. As such, each -row represents a summary of a column in the `small_table` dataset. There's a lot of information in -this summary table to digest. Some of it is intuitive since this sort of table summarization isn't -all that uncommon, but other aspects of it could also give some pause. 
So we'll carefully wade -through how to interpret this report. - -## Data Categories in the Column Summary Table - -On the left side of the table are icons of different colors. These represent categories that the -columns fall into. There are only five categories and each column belongs to exactly one of them. The -categories (and their letter marks) are: - -- `N`: numeric -- `S`: string-based -- `D`: date/datetime -- `T/F`: boolean -- `O`: object - -The numeric category (`N`) takes data types such as floats and integers. The `S` category is for -string-based columns. Date or datetime values are lumped into the `D` category. Boolean columns -(`T/F`) have their own category and are *not* considered numeric (e.g., `0`/`1`). The `O` category -is a catchall for all other types of columns. Given the disparity of these categories and that we -want them in the same table, some statistical measures will be sensible for certain column -categories but not for others. Given that, we'll explain how each category is represented in the -column summary table. - -## Numeric Data - -Three columns in `small_table` are numeric: `a` (`Int64`), `c` (`Int64`), and `d` (`Float64`). The -common measures of the missing count/proportion (`NA`) and the unique value count/proportion (`UQ`) -are provided for the numeric data type. For these two measures, the top number is the absolute count -of missing values and the count of unique values. The bottom number is a proportion of the absolute -count divided by the row count; this makes each proportion a value between `0` and `1` (bounds -included). - -The next two columns represent the mean (`Mean`) and the standard deviation (`SD`). The minimum -(`Min`), maximum (`Max`), and a set of quantiles occupy the next few columns (includes `P5`, `Q1`, -`Med` for median, `Q3`, and `P95`). Finally, the interquartile range (`IQR`: `Q3` - `Q1`) is the -last measure provided. - -## String Data - -String data is present in `small_table`, being in columns `b` and `f`. 
The missing value (`NA`) and -uniqueness (`UQ`) measures are accounted for here. The statistical measures are all based on string -lengths, so what happens is that all strings in a column are converted to those numeric values and a -subset of stats values is presented. To avoid some understandable confusion when reading the table, -the stats values in each of the cells with values are annotated with the text `"SL"`. It makes less -sense to provide a full suite of quantile values so only the minimum (`Min`), median (`Med`), and -maximum (`Max`) are provided. - -## Date/Datetime Data and Boolean Data - -We see that in the first two rows of our summary table there are summaries of the `date_time` and -`date` columns. The summaries we provide for a date/datetime category (notice the green `D` to the -left of the column names) are: - -1. the missing count/proportion (`NA`) -2. the unique value count/proportion (`UQ`) -3. the minimum and maximum dates/datetimes - -One column, `e`, is of the `Boolean` type. Because columns of this type could only have `True`, -`False`, or missing values, we provide summary data for missingness (under `NA`) and proportions of -`True` and `False` values (under `UQ`). diff --git a/docs/user-guide/column-selection-patterns.qmd b/docs/user-guide/column-selection-patterns.qmd deleted file mode 100644 index 69e9c039a9..0000000000 --- a/docs/user-guide/column-selection-patterns.qmd +++ /dev/null @@ -1,342 +0,0 @@ ---- -title: Column Selection Patterns -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_header=False, report_incl_footer_timings=False) -``` - -Data validation often requires working with columns in flexible ways. Pointblank offers two powerful -approaches: - -1. Applying validation rules across multiple columns: validate many columns with a single rule -2. 
Comparing values between columns: create validations that compare values across different columns - -This guide covers both approaches in detail with practical examples. - -## Part 1: Applying Rules Across Multiple Columns - -Many of Pointblank's validation methods perform column-level checks. These methods provide the -`columns=` parameter, which accepts not just a single column name but multiple columns through -various selection methods. - -Why is this useful? Often you'll want to perform the same validation check (e.g., checking that -numerical values are all positive) across multiple columns. Rather than defining the same rules -multiple times, you can map the validation across those columns in a single step. - -Let's explore this using the `game_revenue` dataset: - -```{python} -#| echo: false -pb.preview(pb.load_dataset(dataset="game_revenue")) -``` - -### Using a List of Column Names - -The simplest way to validate multiple columns is to provide a list to the `columns=` parameter. In -the `game_revenue` dataset, we have two columns with numerical data: `item_revenue` and -`session_duration`. If we expect all values in both columns to be greater than `0`, we can write: - -```{python} -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_gt( - columns=["item_revenue", "session_duration"], - value=0 - ) - .interrogate() -) -``` - -The validation report shows two validation steps were created from a single method call! All -validation parameters are shared across all generated steps, including thresholds and briefs: - -```{python} -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_gt( - columns=["item_revenue", "session_duration"], - value=0, - thresholds=(0.1, 0.2, 0.3), - brief="`{col}` must be greater than zero." - ) - .interrogate() -) -``` - -In this example, you can see that the validation report displays customized briefs for each column -("`item_revenue` must be greater than zero." 
and "`session_duration` must be greater than zero."), -automatically substituting the column name using the `{col}` placeholder in the brief template. This -feature is particularly helpful when reviewing reports, as it provides clear, human-readable -descriptions of what each validation step is checking. When working with multiple columns through a -single validation call, these dynamically generated briefs make your validation reports more -understandable for both technical and non-technical stakeholders. - -### Using Pointblank's Column Selectors - -For more advanced column selection, Pointblank provides selector functions that resolve columns -based on: - -- text patterns in column names -- column position -- column data type - -Two common selectors, `starts_with()` and `ends_with()`, resolve columns based on text patterns in -column names. - -The `game_revenue` dataset has three columns starting with "item": `item_type`, `item_name`, and -`item_revenue`. Let's check that these columns contain no missing values: - -```{python} -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_not_null(columns=pb.starts_with("item")) - .interrogate() -) -``` - -Three validation steps were automatically created because three columns matched the pattern. - -The complete list of column selectors includes: - -- `starts_with()` -- `ends_with()` -- `contains()` -- `matches()` -- `everything()` -- `first_n()` -- `last_n()` - -### Combining Column Selectors - -Column selectors can be combined for more powerful selection. 
To do this, use the `col()` helper -function with logical operators: - -- `&` (*and*) -- `|` (*or*) -- `-` (*difference*) -- `~` (*not*) - -For example, to select all columns except the first four: - -```{python} -col_selection = pb.col(pb.everything() - pb.first_n(4)) - -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_not_null( - columns=col_selection, - thresholds=(1, 0.05, 0.1) - ) - .interrogate() -) -``` - -This selects every column except the first four, resulting in seven validation steps. - -### Narwhals Selectors - -Pointblank also supports column selectors from the -[Narwhals](https://narwhals-dev.github.io/narwhals/) library, which include: - -- `matches()` -- `by_dtype()` -- `boolean()` -- `categorical()` -- `datetime()` -- `numeric()` -- `string()` - -Here's an example selecting all numeric columns: - -```{python} -import narwhals.selectors as ncs - -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_gt( - columns=ncs.numeric(), - value=0 - ) - .interrogate() -) -``` - -And selecting all string columns matching "item_": - -```{python} -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_not_null(columns=pb.col(ncs.string() & ncs.matches("item_"))) - .interrogate() -) -``` - -This example demonstrates the power of combining Narwhals selectors with logical operators. By using -`ncs.string()` to select string columns and then filtering with `ncs.matches("item_")`, we can -precisely target text columns with specific naming patterns. This type of targeted selection is -particularly valuable when working with wide datasets that have consistent column naming -conventions, allowing you to apply appropriate validation rules to logically grouped columns without -explicitly listing each one. - -### Caveats for Using Column Selectors - -While column selectors are powerful, there are some caveats. 
If a selector doesn't match any -columns, the validation won't fail but will show an 'explosion' in the report: - -```{python} -( - pb.Validate(data=pb.load_dataset("game_revenue")) - .col_vals_not_null(columns=pb.starts_with("items")) - .col_vals_gt(columns="item_revenue", value=0) - .interrogate() -) -``` - -Notice that although there was a problem with Step 1 (that should be addressed), the interrogation -did move on to Step 2 without complication. - -To mitigate uncertainty, include validation steps that check for the existence of key columns with -`~~Validate.col_exists()` or verify the schema with `~~Validate.col_schema_match()`. - -## Part 2: Comparing Values Between Columns - -Sometimes you need to compare values across different columns rather than against fixed values. -Pointblank enables this through the `col()` helper function. - -Let's look at examples using the `small_table` dataset: - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table"), n_head=20, n_tail=20) -``` - -### Using `col()`{.qd-no-link} to Specify a Comparison Column - -While we typically use validation methods to compare column values against fixed values: - -```python -... -.col_vals_gt(columns="a", value=2, ...) -... -``` - -We can also compare values between columns by using `col()` in the `value=` parameter: - -```python -... -.col_vals_gt(columns="a", value=pb.col("x"), ...) -... -``` - -This checks that each value in column `a` is greater than the corresponding value in column `x`. -Here's a concrete example: - -```{python} -( - pb.Validate(data=pb.load_dataset("small_table")) - .col_vals_gt( - columns="d", - value=pb.col("c") - ) - .interrogate() -) -``` - -Notice that the validation report shows both column names (`d` and `c`). There are two failing test -units because of missing values in column `c`. When comparing across columns, missing values in -either column can cause failures. 
- -To handle missing values, use `na_pass=True`: - -```{python} -( - pb.Validate(data=pb.load_dataset("small_table")) - .col_vals_gt( - columns="d", - value=pb.col("c"), - na_pass=True - ) - .interrogate() -) -``` - -Now all tests pass. - -The following validation methods accept a `col()` expression in their `value=` parameter: - -- `~~Validate.col_vals_gt()` -- `~~Validate.col_vals_lt()` -- `~~Validate.col_vals_ge()` -- `~~Validate.col_vals_le()` -- `~~Validate.col_vals_eq()` -- `~~Validate.col_vals_ne()` - -### Using `col()` in Range Checks - -For range validations via `~~Validate.col_vals_between()` and `~~Validate.col_vals_outside()` you -can use a mix of column references and fixed values: - -```{python} -( - pb.Validate(data=pb.load_dataset("small_table")) - .col_vals_between( - columns="d", - left=pb.col("c"), - right=10_000, - na_pass=True - ) - .interrogate() -) -``` - -The validation report shows the range as `[c, 10000]`, indicating that the lower bound comes from -column `c` while the upper bound is fixed at `10000`. - -## Advanced Examples: Combining Both Approaches - -The true power comes from combining both approaches: validating multiple columns and using -cross-column comparisons: - -```{python} -validation = ( - pb.Validate(data=pb.load_dataset("small_table")) - .col_vals_gt( - columns=["c", "d"], - value=pb.col("a"), - na_pass=True - ) - .interrogate() -) - -validation -``` - -This creates validation steps checking that values in both columns `c` and `d` are greater than -their corresponding values in column `a`. - -## Conclusion - -Pointblank provides flexible approaches to working with columns: - -1. Column selection: validate multiple columns with a single validation rule -2.
Cross-column comparison: compare values between columns - -These capabilities allow you to: - -- write more concise validation code -- apply consistent validation rules across similar columns -- create dynamic validations that check relationships between columns -- build comprehensive data quality checks with minimal code - -By getting familiar with these techniques, you can create more elegant and powerful validation plans -while also reducing repetition in your code. diff --git a/docs/user-guide/draft-validation.qmd b/docs/user-guide/draft-validation.qmd deleted file mode 100644 index cfd7e2116f..0000000000 --- a/docs/user-guide/draft-validation.qmd +++ /dev/null @@ -1,425 +0,0 @@ ---- -title: Draft Validation -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -Draft validation in Pointblank leverages large language models (LLMs) to automatically generate -validation plans for your data. This feature is especially useful when starting validation on a new -dataset or when you need to quickly establish baseline validation coverage. - -The `DraftValidation` class connects to various LLM providers to analyze your data's characteristics -and generate a complete validation plan tailored to its structure and content. - -## How `DraftValidation`{.qd-no-link} Works - -When you use `DraftValidation`, the process works through these steps: - -1. a statistical summary of your data is generated using the `DataScan` class -2. this summary is converted to JSON format and sent to your selected LLM provider -3. the LLM uses the summary along with knowledge about Pointblank's validation capabilities to -generate a validation plan -4. the result is returned as executable Python code that you can use directly or modify as needed - -The entire process happens without sending all of the data to the LLM provider, but only a summary -that includes column names, data types, basic statistics, and a small sample of values. 
- -## Requirements and Setup - -To use the `DraftValidation` feature, you'll need: - -1. an API key from a supported LLM provider -2. the required Python packages installed - -You can install all necessary dependencies with: - -```bash -pip install pointblank[generate] -``` - -This will install the `chatlas` package and other dependencies required for `DraftValidation`. - -### Supported LLM Providers - -The `DraftValidation` class supports multiple LLM providers: - -- **Anthropic** (Claude models) -- **OpenAI** (GPT models) -- **Ollama** (local LLMs) -- **Amazon Bedrock** (AWS-hosted models) - -Each provider has different capabilities and performance characteristics, but all can be used to -generate validation plans through a consistent interface. - -## Basic Usage - -The simplest way to use `DraftValidation` is to provide your data and specify an LLM model. Let's -try it out with the `global_sales` dataset. - -```python -import pointblank as pb - -# Load a dataset -data = pb.load_dataset(dataset="global_sales", tbl_type="polars") - -# Generate a validation plan -pb.DraftValidation( - data=data, - model="anthropic:claude-sonnet-4-5", - api_key="your_api_key_here" # Replace with your actual API key -) -``` - -````plaintext -```python -import pointblank as pb - -# Define schema based on column names and dtypes -schema = pb.Schema(columns=[ - ("product_id", "String"), - ("product_category", "String"), - ("customer_id", "String"), - ("customer_segment", "String"), - ("region", "String"), - ("country", "String"), - ("city", "String"), - ("timestamp", "Datetime(time_unit='us', time_zone=None)"), - ("quarter", "String"), - ("month", "Int64"), - ("year", "Int64"), - ("price", "Float64"), - ("quantity", "Int64"), - ("status", "String"), - ("email", "String"), - ("revenue", "Float64"), - ("tax", "Float64"), - ("total", "Float64"), - ("payment_method", "String"), - ("sales_channel", "String") -]) - -# The validation plan -validation = ( - pb.Validate( - data=your_data, # 
Replace your_data with the actual data variable - label="Draft Validation", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35) - ) - .col_schema_match(schema=schema) - .col_vals_not_null(columns=[ - "product_category", "customer_segment", "region", "country", - "price", "quantity", "status", "email", "revenue", "tax", - "total", "payment_method", "sales_channel" - ]) - .col_vals_between(columns="month", left=1, right=12, na_pass=True) - .col_vals_between(columns="year", left=2021, right=2023, na_pass=True) - .col_vals_gt(columns="price", value=0) - .col_vals_gt(columns="quantity", value=0) - .col_vals_gt(columns="revenue", value=0) - .col_vals_gt(columns="tax", value=0) - .col_vals_gt(columns="total", value=0) - .col_vals_in_set(columns="product_category", set=[ - "Manufacturing", "Retail", "Healthcare" - ]) - .col_vals_in_set(columns="customer_segment", set=[ - "Government", "Consumer", "SMB" - ]) - .col_vals_in_set(columns="region", set=[ - "Asia Pacific", "Europe", "North America" - ]) - .col_vals_in_set(columns="status", set=[ - "returned", "shipped", "delivered" - ]) - .col_vals_in_set(columns="payment_method", set=[ - "Apple Pay", "PayPal", "Bank Transfer", "Credit Card" - ]) - .col_vals_in_set(columns="sales_channel", set=[ - "Partner", "Distributor", "Phone" - ]) - .row_count_match(count=50000) - .col_count_match(count=20) - .rows_distinct() - .interrogate() -) - -validation -``` -```` - -### Managing API Keys - -While you can directly provide API keys as shown above, there are more secure approaches: - -```python -import os - -# Get API key from environment variable -api_key = os.getenv("ANTHROPIC_API_KEY") - -draft_validation = pb.DraftValidation( - data=data, - model="anthropic:claude-sonnet-4-5", - api_key=api_key -) -``` - -You can also store API keys in a `.env` file in your project's root directory: - -``` -# Contents of .env file -ANTHROPIC_API_KEY=your_anthropic_api_key_here -OPENAI_API_KEY=your_openai_api_key_here -``` - -If your 
API keys have standard names (like `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`), -`DraftValidation` will automatically find and use them: - -```python -# No API key needed if stored in .env with standard names -draft_validation = pb.DraftValidation( - data=data, - model="anthropic:claude-sonnet-4-5" -) -``` - -## Example Output for `nycflights` - -Here's an example of a validation plan that might be generated by `DraftValidation` for the -`nycflights` dataset: - -```python -pb.DraftValidation( - pb.load_dataset(dataset="nycflights", tbl_type="duckdb"), - model="anthropic:claude-sonnet-4-5" -) -``` - -````plaintext -```python -import pointblank as pb - -# Define schema based on column names and dtypes -schema = pb.Schema(columns=[ - ("year", "int64"), - ("month", "int64"), - ("day", "int64"), - ("dep_time", "int64"), - ("sched_dep_time", "int64"), - ("dep_delay", "int64"), - ("arr_time", "int64"), - ("sched_arr_time", "int64"), - ("arr_delay", "int64"), - ("carrier", "string"), - ("flight", "int64"), - ("tailnum", "string"), - ("origin", "string"), - ("dest", "string"), - ("air_time", "int64"), - ("distance", "int64"), - ("hour", "int64"), - ("minute", "int64") -]) - -# The validation plan -validation = ( - pb.Validate( - data=your_data, # Replace your_data with the actual data variable - label="Draft Validation", - thresholds=pb.Thresholds(warning=0.10, error=0.25, critical=0.35) - ) - .col_schema_match(schema=schema) - .col_vals_not_null(columns=[ - "year", "month", "day", "sched_dep_time", "carrier", "flight", - "origin", "dest", "distance", "hour", "minute" - ]) - .col_vals_between(columns="month", left=1, right=12) - .col_vals_between(columns="day", left=1, right=31) - .col_vals_between(columns="sched_dep_time", left=106, right=2359) - .col_vals_between(columns="dep_delay", left=-43, right=1301, na_pass=True) - .col_vals_between(columns="air_time", left=20, right=695, na_pass=True) - .col_vals_between(columns="distance", left=17, right=4983) -
.col_vals_between(columns="hour", left=1, right=23) - .col_vals_between(columns="minute", left=0, right=59) - .col_vals_in_set(columns="origin", set=["EWR", "LGA", "JFK"]) - .col_count_match(count=18) - .row_count_match(count=336776) - .rows_distinct() - .interrogate() -) - -validation -``` -```` - -Notice how the generated plan includes: - -1. A schema validation with appropriate data types -2. Not-null checks for required columns -3. Range validations for numerical data -4. Set membership checks for categorical data -5. Row and column count validations -6. Appropriate handling of missing values with `na_pass=True` - -## Working with Model Providers - -### Specifying Models - -When using `DraftValidation`, you specify the model in the format `"provider:model_name"`: - -```python -# Using Anthropic's Claude model -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") - -# Using OpenAI's GPT model -pb.DraftValidation(data=data, model="openai:gpt-4-turbo") - -# Using a local model with Ollama -pb.DraftValidation(data=data, model="ollama:llama3:latest") - -# Using Amazon Bedrock -pb.DraftValidation(data=data, model="bedrock:anthropic.claude-3-sonnet-20240229-v1:0") -``` - -### Model Performance and Privacy - -Different models have different capabilities when it comes to generating validation plans: - -- Anthropic Claude Sonnet 4.5 generally provides the most comprehensive and accurate validation -plans -- OpenAI GPT-4 models also perform well -- Local models through Ollama can be useful for private data but they currently have reduced -capabilities here - -A key advantage of `DraftValidation` is that your actual dataset is not sent to the LLM provider. 
-Instead, only a summary is transmitted, which includes: - -- the number of rows and columns -- column names and data types -- basic statistics (min, max, mean, median, missing values count) -- a small sample of values from each column (usually 5-10 values) - -This approach protects your data while still providing enough context for the LLM to generate -relevant validation rules. - -## Customizing Generated Plans - -The validation plan generated by `DraftValidation` is just a starting point. You'll typically want -to: - -1. review the generated code for correctness -2. replace `your_data` with your actual data variable name that exists in your workspace -3. ensure the data object referenced is actually present in your workspace -4. adjust thresholds and validation parameters -5. add domain-specific validation rules -6. remove any unnecessary checks - -For example, you might start by capturing the text representation of your draft validation. This -will give you the raw Python code that you can copy into a new code cell in your notebook or script. -From there, you can customize it by modifying thresholds to match your organization's data quality -standards, adding business-specific validation rules that require domain knowledge, or removing -checks that aren't relevant to your use case. Once you've made your modifications, you can execute -the customized validation plan as you would any other Pointblank validation. 
- -## Under the Hood - -### The Generated Data Summary - -To understand what the LLM works with, here's an example of the data summary format that's sent: - -```json -{ - "table_info": { - "rows": 336776, - "columns": 18, - "table_type": "duckdb" - }, - "column_info": [ - { - "column_name": "year", - "column_type": "int64", - "missing_values": 0, - "min": 2013, - "max": 2013, - "mean": 2013.0, - "median": 2013, - "sample_values": [2013, 2013, 2013, 2013, 2013] - }, - { - "column_name": "month", - "column_type": "int64", - "missing_values": 0, - "min": 1, - "max": 12, - "mean": 6.548819, - "median": 7, - "sample_values": [1, 1, 1, 1, 1] - }, - // Additional columns... - ] -} -``` - -### The Prompt Strategy - -The `DraftValidation` class uses a carefully crafted prompt that instructs the LLM to: - -1. use the schema information to create a `Schema` object -2. include `~~Validate.col_vals_not_null()` for columns with no missing values -3. add appropriate range validations based on min/max values -4. include row and column count validations -5. format the output as clean, executable Python code - -The prompt also contains constraints to ensure consistent, high-quality results, such as using line -breaks in long lists for readability, applying `na_pass=True` for columns with missing values, and -avoiding duplicate validations. - -## Best Practices and Troubleshooting - -### When to Use `DraftValidation`{.qd-no-link} - -Drafting a validation is most useful when: - -- working with a new dataset for the first time -- needing to quickly establish baseline validation -- exploring potential validation rules before formalizing them -- validating columns with consistent patterns (numeric ranges, categories, etc.) - -Consider writing validation plans manually when you need very specific business rules, are working -with sensitive data, need complex validation logic, or need to validate relationships between -columns. 
- -### Recommended Workflow and Common Issues - -Here's a recommended workflow incorporating `DraftValidation`: - -1. generate an initial plan with `DraftValidation` -2. review the generated validations for relevance -3. adjust thresholds and parameters as needed -4. add specific business logic and cross-column validations -5. store the final validation plan in version control - -It's possible that you might bump up against some issues. Here are some common ones and solutions -you might try: - -- Authentication Errors: ensure your API key is valid and correctly passed to `DraftValidation` -- Package Not Found: make sure you've installed the required packages with -`pip install pointblank[generate]` -- Unsupported Model: verify you're using the correct `provider:model` format -- Poor Quality Plans: try a more capable model - -## Conclusion - -`DraftValidation` provides a powerful way to jumpstart your data validation process by leveraging -LLMs to generate context-aware validation plans. By analyzing your data's structure and content, -`DraftValidation` can create comprehensive validation rules that would otherwise take significant -time to develop manually. - -The feature balances privacy (by sending only data summaries) with utility (by generating executable -validation code). While the generated plans should always be reviewed and refined, they provide an -excellent starting point for ensuring your data meets your quality requirements. - -By understanding how `DraftValidation` works and how to customize its output, you can significantly -accelerate your data validation workflows and improve the quality of your data throughout your -projects. 
diff --git a/docs/user-guide/expressions.qmd b/docs/user-guide/expressions.qmd deleted file mode 100644 index 7e81e18f58..0000000000 --- a/docs/user-guide/expressions.qmd +++ /dev/null @@ -1,188 +0,0 @@ ---- -title: Expression-Based Validation -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -While Pointblank offers many specialized validation functions for common data quality checks, -sometimes you need more flexibility for complex validation requirements. This is where -expression-based validation with `~~Validate.col_vals_expr()` comes in. - -The `~~Validate.col_vals_expr()` method allows you to: - -- combine multiple conditions in a single validation step -- access row-wise values across multiple columns - -Now let's explore how to use these capabilities through a collection of examples! - -## Basic Usage - -At its core, `~~Validate.col_vals_expr()` validates whether an expression evaluates to `True` for -each row in your data. Here's a simple example: - -```{python} -import pointblank as pb -import polars as pl - -# Load small_table dataset as a Polars DataFrame -small_table_pl = pb.load_dataset(dataset="small_table", tbl_type="polars") - -( - pb.Validate(data=small_table_pl) - .col_vals_expr( - - # Use Polars expression syntax --- - expr=pl.col("d") > pl.col("a") * 50, - brief="Column `d` should be at least 50 times larger than `a`." - ) - .interrogate() -) -``` - -In this example, we're validating that for each row, the value in column `d` is at least 50 times -larger than the value in column `a`. 
- -## Notes on Expression Syntax - -The expression syntax depends on your table type: - -- **Polars**: uses Polars expression syntax with `pl.col("column_name")` -- **Pandas**: uses standard Python/NumPy syntax - -The expression should: - -- evaluate to a boolean result for each row -- reference columns using the appropriate syntax for your table type -- use standard operators (`+`, `-`, `*`, `/`, `>`, `<`, `==`, etc.) -- not include assignments - -## Complex Expressions - -The real power of `~~Validate.col_vals_expr()` comes with complex expressions that would be -difficult to represent using the standard validation functions: - -```{python} -# Load game_revenue dataset as a Polars DataFrame -game_revenue_pl = pb.load_dataset(dataset="game_revenue", tbl_type="polars") - -( - pb.Validate(data=game_revenue_pl) - .col_vals_expr( - - # Use Polars expression syntax --- - expr=(pl.col("session_duration") > 20) | (pl.col("item_revenue") > 10), - brief="Sessions should be either long (>20 min) or high-value (>$10)." - ) - .interrogate() -) -``` - -This validates that either the session duration is longer than 20 minutes OR the item revenue is -greater than $10. - -## Example: Multiple Conditions - -You can create sophisticated validations with multiple conditions: - -```{python} -# Create a simple Polars DataFrame -employee_df = pl.DataFrame({ - "age": [25, 30, 15, 40, 35], - "income": [50000, 75000, 0, 100000, 60000], - "years_experience": [3, 8, 0, 15, 7] -}) - -( - pb.Validate(data=employee_df, tbl_name="employee_data") - .col_vals_expr( - - # Complex condition with multiple comparisons --- - expr=( - (pl.col("age") >= 18) & - (pl.col("income") / (pl.col("years_experience") + 1) <= 25000) - ), - brief="Adults should have reasonable income-to-experience ratios." 
- ) - .interrogate() -) -``` - -## Example: Handling Null Values - -When working with expressions, consider how to handle null/missing values: - -```{python} -( - pb.Validate(data=small_table_pl) - .col_vals_expr( - - # Check for nulls before division --- - expr=(pl.col("c").is_not_null()) & ((pl.col("c") / pl.col("a")) > 1.5), - brief="Ratio of `c`/`a` should exceed 1.5 (when `c` is not null)." - ) - .interrogate() -) -``` - -## Best Practices - -Here are some tips and tricks for effectively using expression-based validation with -`~~Validate.col_vals_expr()`. - -### Document Your Expressions - -Always provide clear documentation in the `brief=` parameter: - -```{python} -( - pb.Validate(data=small_table_pl) - .col_vals_expr( - expr=pl.col("d") > pl.col("a") * 1.5, - - # Document which columns are being compared --- - brief="Column `d` should be at least 1.5 times larger than column `a`." - ) - .interrogate() -) -``` - -### Handle Edge Cases - -Consider potential edge cases like division by zero or nulls: - -```{python} -( - pb.Validate(data=small_table_pl) - .col_vals_expr( - - # Check denominator before division --- - expr=(pl.col("a") != 0) & (pl.col("d") / pl.col("a") > 1.5), - brief="Ratio of `d`/`a` should exceed 1.5 (avoiding division by zero)." - ) - .interrogate() -) -``` - -### Test on Small Datasets First - -When developing complex expressions, test on a small sample of your data first to ensure your logic -is correct before applying it to large datasets. - -## Conclusion - -The `~~Validate.col_vals_expr()` method provides a powerful way to implement complex validation -logic in Pointblank when standard validation methods aren't sufficient. By leveraging expressions, -you can create sophisticated data quality checks tailored to your specific requirements, combining -conditions across multiple columns and applying transformations as needed. 
- -This flexibility makes expression-based validation an essential tool for addressing complex data -quality scenarios in your validation workflows. diff --git a/docs/user-guide/extracts.qmd b/docs/user-guide/extracts.qmd deleted file mode 100644 index 957b5f319b..0000000000 --- a/docs/user-guide/extracts.qmd +++ /dev/null @@ -1,419 +0,0 @@ ---- -title: Data Extracts -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_header=False, report_incl_footer_timings=False) -``` - -When validating data, identifying exactly which rows failed is critical for diagnosing and resolving -data quality issues. This is where *data extracts* come in. Data extracts consist of target table -rows containing at least one cell that failed validation. While the validation report provides an -overview of pass/fail statistics, data extracts give you the actual problematic records for deeper -investigation. 
- -This article will cover: - -- which validation methods collect data extracts -- multiple ways to access and work with data extracts -- practical examples of using extracts for data quality improvement -- advanced techniques for analyzing extract patterns - -## The Validation Methods that Work with Data Extracts - -The following validation methods operate on column values and will have rows extracted when there -are failing test units in those rows: - -- `~~Validate.col_vals_gt()` -- `~~Validate.col_vals_lt()` -- `~~Validate.col_vals_ge()` -- `~~Validate.col_vals_le()` -- `~~Validate.col_vals_eq()` -- `~~Validate.col_vals_ne()` -- `~~Validate.col_vals_between()` -- `~~Validate.col_vals_outside()` -- `~~Validate.col_vals_in_set()` -- `~~Validate.col_vals_not_in_set()` -- `~~Validate.col_vals_null()` -- `~~Validate.col_vals_not_null()` -- `~~Validate.col_vals_regex()` -- `~~Validate.col_vals_expr()` -- `~~Validate.conjointly()` - -These row-based validation methods will also have rows extracted should there be failing rows: - -- `~~Validate.rows_distinct()` -- `~~Validate.rows_complete()` - -Note that some validation methods like `~~Validate.col_exists()` and `~~Validate.col_schema_match()` -don't generate data extracts because they validate structural aspects of the table rather than -checking column values. - -## Accessing Data Extracts - -There are three primary ways to access data extracts in Pointblank: - -1. the **CSV** buttons in validation reports -2. through the `~~Validate.get_data_extracts()` method -3. inspecting a subset of failed rows in step reports - -Let's explore each approach using examples. - -### CSV Data from Validation Reports - -Data extracts are embedded within validation report tables. 
Let's look at an example, using the -`small_table` dataset, where data extracts are collected in a single validation step due to failing -test units: - -```{python} -import pointblank as pb - -validation = ( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_lt( columns="d", value=3000) - .interrogate() -) - -validation -``` - -The single validation step checks whether values in `d` are less than `3000`. Within that column, -values range from `108.34` to `9999.99` so it makes sense that we can see 4 failing test units in -the `FAIL` column. - -If you look at the far right of the validation report you'll find there's a `CSV` button. Pressing -it initiates the download of a CSV file, and that file contains the data extract for this validation -step. The `CSV` button only appears when: - -1. there is a non-zero number of failing test units -2. the validation step is based on the use of a column-value or a row-based validation method (the -methods outlined in the section entitled *The Validation Methods that Work with Data Extracts*) - -Access to CSV data for the test unit errors is useful when the validation report is shared with -other data quality stakeholders, since it is easily accessible and doesn't require further use of -Pointblank. The stakeholder can simply open the downloaded CSV in their preferred spreadsheet -software, import it into a different analysis environment like R or Julia, or process it with any -tool that supports CSV files. This cross-platform compatibility makes the CSV export particularly -valuable in mixed-language data teams where different members might be working with different tools. - -### `get_data_extracts()` - -For programmatic access to data extracts, Pointblank provides the `~~Validate.get_data_extracts()` -method. 
This allows you to work with extract data directly in your Python workflow: - -```{python} -# Get data extracts from step 1 -extract_1 = validation.get_data_extracts(i=1, frame=True) - -extract_1 -``` - -The extracted table is of the same type (a Polars DataFrame) as the target table. Previously we used -`load_dataset()` with the `tbl_type="polars"` option to fetch the dataset in that form. - -Note these important details about using `~~Validate.get_data_extracts()`: - -- the parameter `i=1` corresponds to the step number shown in the validation report (1-indexed, not -0-indexed) -- setting `frame=True` returns the data as a DataFrame rather than a dictionary (only works when `i` -is a single integer) -- the extract includes all columns from the original data, not just the column being validated -- an additional `_row_num_` column is added to identify the original row positions - -### Step Reports - -Step reports provide another way to access and visualize failing data. When you generate a step -report for a validation step that has failing rows, those failing rows are displayed directly in the -report: - -```{python} -# Get a step report for the first validation step -step_report = validation.get_step_report(i=1) - -step_report -``` - -Step reports offer several advantages for working with data extracts as they: - -1. provide immediate visual context by highlighting the specific column being validated -2. format the data for better readability, especially useful when sharing results with colleagues -3. include additional metadata about the validation step and failure statistics - -For steps with many failures, you can customize how many rows to display: - -```{python} -# Limit to just 2 rows of failing data -limited_report = validation.get_step_report(i=1, limit=2) - -limited_report -``` - -Step reports are particularly valuable when you want to quickly inspect the failing data without -extracting it into a separate DataFrame. 
They provide a bridge between the high-level validation -report and the detailed data extracts. - -## Viewing Data Extracts with `preview()`{.qd-no-link} - -To get a consistent HTML representation of any data extract (regardless of the table type), we can -use the `preview()` function: - -```{python} -pb.preview(data=extract_1) -``` - -The view is optimized for readability, with column names and data types displayed in a compact -format. Notice that the `_row_num_` column is now part of the table stub and doesn't steal focus -from the table's original columns. - -The `preview()` function is designed to provide the head and tail (5 rows each) of the table so very -large extracts won't overflow the display. - -## Working with Multiple Validation Steps - -When validating data with multiple steps, you can extract failing rows from any step or combine -extracts from multiple steps: - -```{python} -# Create a validation with multiple steps -multi_validation = ( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_gt(columns="a", value=3) # Step 1 - .col_vals_lt(columns="d", value=3000) # Step 2 - .col_vals_regex(columns="b", pattern="^[0-9]-[a-z]{3}-[0-9]{3}$") # Step 3 - .interrogate() -) - -multi_validation -``` - -### Extracting Data from a Specific Step - -You can access extracts from any specific validation step: - -```{python} -# Get extracts from step 2 (`d < 3000` validation) -less_than_failures = multi_validation.get_data_extracts(i=2, frame=True) - -less_than_failures -``` - -Using `frame=True` means that the returned value will be a DataFrame (not a dictionary that contains a -single DataFrame). 
- -If a step has no failing rows, an empty DataFrame will be returned: - -```{python} -# Get extracts from step 3 (regex check) -regex_failures = multi_validation.get_data_extracts(i=3, frame=True) - -regex_failures -``` - -### Getting All Extracts at Once - -To retrieve extracts from all steps with failures in one command: - -```{python} -# Get all extracts -all_extracts = multi_validation.get_data_extracts() - -# Display the step numbers that have extracts -print(f"Steps with data extracts: {list(all_extracts.keys())}") -``` - -A dictionary of DataFrames is returned and only steps with failures will appear in this dictionary. - -### Getting Specific Extracts - -You can also retrieve data extracts from several specified steps as a dictionary: - -```{python} -# Get extracts from steps 1 and 2 as a dictionary -extract_dict = multi_validation.get_data_extracts(i=[1, 2]) - -# The keys are the step numbers -print(f"Dictionary keys: {list(extract_dict.keys())}") - -# Get the number of failing rows in each extract -for step, extract in extract_dict.items(): - print(f"Step {step}: {len(extract)} failing rows") -``` - -Note that `frame=True` cannot be used when retrieving multiple extracts. - -## Applications of Data Extracts - -Once you have extracted the failing data, there are numerous ways to analyze and use this -information to improve data quality. Let's explore some practical applications. 
- -### Finding Patterns Across Validation Steps - -You can analyze patterns across different validation steps by combining extracts: - -```{python} -# Get a consolidated view of all rows that failed any validation -all_failure_rows = set() -for step, extract in all_extracts.items(): - if len(extract) > 0: - all_failure_rows.update(extract["_row_num_"]) - -print(f"Total unique rows with failures: {len(all_failure_rows)}") -print(f"Row numbers with failures: {sorted(all_failure_rows)}") -``` - -### Identifying Rows with Multiple Failures - -You might want to find rows that failed multiple validation checks, as these often represent more -serious data quality issues: - -```{python} -# Get row numbers from each extract -step1_rows = set(multi_validation.get_data_extracts(i=1, frame=True)["_row_num_"]) -step2_rows = set(multi_validation.get_data_extracts(i=2, frame=True)["_row_num_"]) - -# Find rows that failed both validations -common_failures = step1_rows.intersection(step2_rows) -print(f"Rows failing both step 1 and step 2: {common_failures}") -``` - -### Statistical Analysis of Failing Values - -Once you have data extracts, you can perform statistical analysis to identify patterns in the -failing data: - -```{python} -# Get extracts from step 2 -d_value_failures = multi_validation.get_data_extracts(i=2, frame=True) - -# Basic statistical analysis of the failing values -if len(d_value_failures) > 0: - print(f"Min failing value: {d_value_failures['d'].min()}") - print(f"Max failing value: {d_value_failures['d'].max()}") - print(f"Mean failing value: {d_value_failures['d'].mean()}") -``` - -These analysis techniques help you thoroughly investigate data quality issues by examining failing -data from multiple perspectives. Rather than treating failures as isolated incidents, you can -identify patterns that might indicate systematic problems in your data pipeline. 
- -### Detailed Analysis with `col_summary_tbl()`{.qd-no-link} - -For a more comprehensive view of the statistical properties of your extract data, you can use the -`col_summary_tbl()` function: - -```{python} -# Get extracts from step 2 -d_value_failures = multi_validation.get_data_extracts(i=2, frame=True) - -# Generate a comprehensive statistical summary of the failing data -pb.col_summary_tbl(d_value_failures) -``` - -This statistical overview provides: - -1. a count of values (including missing values) -2. type information for each column -3. distribution metrics like min, max, mean, and quartiles for numeric columns -4. frequency of common values for categorical columns -5. missing value counts and proportions - -Using `col_summary_tbl()` on data extracts lets you quickly understand the characteristics of -failing data without writing custom analysis code. This approach is particularly valuable when: - -- You need to understand the statistical properties of failing records -- You want to compare distributions of failing vs passing data -- You're looking for anomalies or unexpected patterns within the failing rows - -For example, if values failing a validation check are concentrated at certain quantiles or have an -unusual distribution shape, this might indicate a systematic data collection or processing issue -rather than random errors. - -## Using Extracts for Data Quality Improvement - -Data extracts are especially valuable for: - -1. **Root Cause Analysis**: examining the full context of failing rows to understand why they failed -2. **Data Cleaning**: creating targeted cleanup scripts that focus only on problematic records -3. **Feedback Loops**: sharing specific examples with data providers to improve upstream quality -4. 
**Pattern Recognition**: identifying systemic issues by analyzing groups of failing records - -Here's an example of using extracts to create a corrective action plan: - -```{python} -import polars as pl - -# Create a new sample of an extract DF -sample_extract = pl.DataFrame({ - "id": range(1, 11), - "value": [3500, 4200, 3800, 9800, 5500, 7200, 8300, 4100, 7600, 3200], - "category": ["A", "B", "A", "C", "B", "A", "C", "B", "A", "B"], - "region": [ - "South", "South", "North", "East", "South", - "South", "East", "South", "West", "South" - ] -}) - -# Identify which regions have the most failures -region_counts = ( - sample_extract - .group_by("region") - .agg(pl.len().alias("failure_count")) - .sort("failure_count", descending=True) -) - -region_counts -``` - -Analysis shows that 6 out of 10 failing records (60%) are from the `"South"` region, making it the -highest priority area for data quality investigation. This suggests a potential systemic issue with -data collection or processing in that specific region. - -## Best Practices for Working with Data Extracts - -When incorporating data extracts into your data quality workflow: - -1. Use extracts for investigation, not just reporting: the real value is in the insights you gain -from analyzing the problematic data - -2. Combine with other Pointblank features: data extracts work well with step reports and can inform -threshold settings for future validations - -3. Consider sampling for very large datasets: if your extracts contain thousands of rows, focus your -investigation on a representative sample - -4. Look beyond individual validation steps: cross-reference extracts from different steps to -identify complex issues that span multiple validation rules - -5. Document patterns in failing data: record and share insights about common failure modes to build -organizational knowledge about data quality issues. 
- -By integrating these practices into your data validation workflow, you'll transform data extracts -from simple error lists into powerful diagnostic tools. The most successful data quality initiatives -treat extracts as the starting point for investigation rather than the end result of validation. -When systematically analyzed and documented, patterns in failing data can reveal underlying issues -in data systems, collection methods, or business processes that might otherwise remain hidden. -Remember that the ultimate goal isn't just to identify problematic records, but to use that -information to implement targeted improvements that prevent similar issues from occurring in the -future. - -## Conclusion - -Data extracts bridge the gap between high-level validation statistics and the detailed context -needed to fix data quality issues. By providing access to the actual failing records, Pointblank -enables you to: - -- pinpoint exactly which data points caused validation failures -- understand the full context around problematic values -- develop targeted strategies for data cleanup and quality improvement -- communicate specific examples to stakeholders - -Whether you're accessing extracts through CSV downloads, the `~~Validate.get_data_extracts()` -method, or step reports, this feature provides the detail needed to move from identifying problems -to implementing solutions. diff --git a/docs/user-guide/index.qmd b/docs/user-guide/index.qmd deleted file mode 100644 index 7b14853f8a..0000000000 --- a/docs/user-guide/index.qmd +++ /dev/null @@ -1,216 +0,0 @@ ---- -title: "Introduction" -format: - html: - include-in-header: - - text: | - ---- - -Redirecting to the [Introduction](../index.qmd)... 
- -## A Simple Validation Table - -This is a validation report table that is produced from a validation of a Polars DataFrame: - -```{python} -#| code-fold: true -#| code-summary: "Show the code" -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table"), label="Example Validation") - .col_vals_lt(columns="a", value=10) - .col_vals_between(columns="d", left=0, right=5000) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) - .col_vals_regex(columns="b", pattern=r"^[0-9]-[a-z]{3}-[0-9]{3}$") - .interrogate() -) -``` - -Each row in this reporting table constitutes a single validation step. Roughly, the left-hand side -outlines the validation rules and the right-hand side provides the results of each validation step. -While simple in principle, there's a lot of useful information packed into this validation table. - -Here's a diagram that describes a few of the important parts of the validation table: - -![](/assets/validation-table-diagram.png){width=100%} - -There are three things that should be noted here: - -- validation steps: each step is a separate test on the table, focused on a certain aspect of the -table -- validation rules: the validation type is provided here along with key constraints -- validation results: interrogation results are provided here, with a breakdown of test units -(*total*, *passing*, and *failing*), threshold flags, and more - -The intent is to provide the key information in one place, and have it be interpretable by data -stakeholders. For example, a failure can be seen in the second row (notice there's a CSV button). A -data quality stakeholder could click this to download a CSV of the failing rows for that step. - -## Example Code, Step-by-Step - -This section will walk you through the example code used above. 
- -```python -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_lt(columns="a", value=10) - .col_vals_between(columns="d", left=0, right=5000) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) - .col_vals_regex(columns="b", pattern=r"^[0-9]-[a-z]{3}-[0-9]{3}$") - .interrogate() -) -``` - -Note these three key pieces in the code: - -- **data**: the `Validate(data=)` argument takes a DataFrame or database table that you want to validate -- **steps**: the methods starting with `col_vals_` specify validation steps that run on specific columns -- **execution**: the `~~Validate.interrogate()` method executes the validation plan on the table - -This common pattern is used in a validation workflow, where `Validate` and -`~~Validate.interrogate()` bookend a validation plan generated through calling validation methods. - -In the next few sections we'll go a bit further by understanding how we can measure data quality and -respond to failures. - -## Understanding Test Units - -Each validation step will execute a type of validation test on the target table. For example, a -`~~Validate.col_vals_lt()` validation step can test that each value in a column is less than a -specified number. And the key finding that's reported in each step is the number of *test units* -that pass or fail. - -In the validation report table, test unit metrics are displayed under the `UNITS`, `PASS`, and -`FAIL` columns. This diagram explains what the tabulated values signify: - -![](/assets/validation-test-units.png){width=100%} - -Test units are dependent on the test being run. Some validation methods might test every value in a -particular column, so each value will be a test unit. Others will only have a single test unit since -they aren't testing individual values but rather if the overall test passes or fails. 
- -## Setting Thresholds for Data Quality Signals - -Understanding test units is essential because they form the foundation of Pointblank's threshold -system. Thresholds let you define acceptable levels of data quality, triggering different severity -signals ('warning', 'error', or 'critical') when certain failure conditions are met. - -Here's a simple example that uses a single validation step along with thresholds set using the -`Thresholds` class: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_lt( - columns="a", - value=7, - - # Set the 'warning' and 'error' thresholds --- - thresholds=pb.Thresholds(warning=2, error=4) - ) - .interrogate() -) -``` - -If you look at the validation report table, we can see: - -- the `FAIL` column shows that 2 test units have failed -- the `W` column (short for 'warning') shows a filled gray circle indicating those failing test -units reached that threshold value -- the `E` column (short for 'error') shows an open yellow circle indicating that the number of -failing test units is below that threshold - -The one final threshold level, `C` (for 'critical'), wasn't set so it appears on the validation -table as a long dash. - -## Taking Action on Threshold Exceedances - -Pointblank becomes even more powerful when you combine thresholds with actions. The -`Actions` class lets you trigger responses when validation failures exceed threshold levels, turning -passive reporting into active notifications. - -Here's a simple example that adds an action to the previous validation: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_lt( - columns="a", - value=7, - thresholds=pb.Thresholds(warning=2, error=4), - - # Set an action for the 'warning' threshold --- - actions=pb.Actions( - warning="WARNING: Column 'a' has values that aren't less than 7." 
- ) - ) - .interrogate() -) -``` - -Notice the printed warning message: `"WARNING: Column 'a' has values that aren't less than -7."`. The warning indicator (filled gray circle) visually confirms this threshold was reached and -the action should trigger. - -Actions make your validation workflows more responsive and integrated with your data pipelines. For -example, you can generate console messages, Slack notifications, and more. - -## Navigating the User Guide - -As you continue exploring Pointblank's capabilities, you'll find the **User Guide** organized into -sections that will help you navigate the various features. - -### Getting Started - -The *Getting Started* section introduces you to Pointblank: - -- [Introduction](index.qmd): Overview of Pointblank and core concepts (**this article**) -- [Installation](installation.qmd): How to install and set up Pointblank - -### Validation Plan - -The *Validation Plan* section covers everything you need to know about creating robust -validation plans: - -- [Overview](validation-overview.qmd): Survey of validation methods and their shared parameters -- [Validation Methods](validation-methods.qmd): A closer look at the more common validation methods -- [Column Selection Patterns](column-selection-patterns.qmd): Techniques for targeting specific columns -- [Preprocessing](preprocessing.qmd): Transform data before validation -- [Segmentation](segmentation.qmd): Apply validations to specific segments of your data -- [Thresholds](thresholds.qmd): Set quality standards and trigger severity levels -- [Actions](actions.qmd): Respond to threshold exceedances with notifications or custom functions -- [Briefs](briefs.qmd): Add context to validation steps - -### Advanced Validation - -The *Advanced Validation* section explores more specialized validation techniques: - -- [Expression-Based Validation](expressions.qmd): Use column expressions for advanced validation -- [Schema Validation](schema-validation.qmd): Enforce table structure 
and column types -- [Assertions](assertions.qmd): Raise exceptions to enforce data quality requirements -- [Draft Validation](draft-validation.qmd): Create validation plans from existing data - -### Post Interrogation - -After validating your data, the *Post Interrogation* section helps you analyze and respond to -results: - -- [Validation Reports](validation-reports.qmd): Understand and customize the validation report table -- [Step Reports](step-reports.qmd): View detailed results for individual validation steps -- [Data Extracts](extracts.qmd): Extract and analyze failing data -- [Sundering Validated Data](sundering.qmd): Split data based on validation results - -### Data Inspection - -The *Data Inspection* section provides tools to explore and understand your data: - -- [Previewing Data](preview.qmd): View samples of your data -- [Column Summaries](col-summary-tbl.qmd): Get statistical summaries of your data -- [Missing Values Reporting](missing-vals-tbl.qmd): Identify and visualize missing data - -By following this guide, you'll gain a comprehensive understanding of how to validate, monitor, and -maintain high-quality data with Pointblank. diff --git a/docs/user-guide/installation.qmd b/docs/user-guide/installation.qmd deleted file mode 100644 index 90e36662bb..0000000000 --- a/docs/user-guide/installation.qmd +++ /dev/null @@ -1,250 +0,0 @@ ---- -title: "Installation" -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -Pointblank can be installed using various package managers. The base installation gives you the core -validation functionality, with optional dependencies for working with different data sources. 
- -## Basic Installation - -You can install Pointblank using your preferred package manager: - -::: {.panel-tabset} -## pip - -```bash -pip install pointblank -``` - -## uv - -```bash -uv pip install pointblank -``` - -## conda - -```bash -conda install -c conda-forge pointblank -``` - -## pixi - -```bash -# add pointblank to project -pixi init name-of-project -cd name-of-project -pixi add pointblank -``` -::: - -## DataFrame Libraries - -Pointblank requires a DataFrame library but doesn't include one by default, giving you the -flexibility to choose either [Pandas](https://pandas.pydata.org) or [Polars](https://pola.rs): - -::: {.panel-tabset} -## Polars - -```bash -# Using pip -pip install pointblank[pl] - -# Or manually -pip install polars>=1.24.0 -``` - -## Pandas - -```bash -# Using pip -pip install pointblank[pd] - -# Or manually -pip install pandas>=2.2.3 -``` -::: - -Pointblank works seamlessly with both libraries, and you can choose the one that best fits your -workflow and performance requirements. 
- -## Optional Dependencies - -### Ibis Backends - -To work with various database systems through [Ibis](https://ibis-project.org), you can install -additional backends: - -::: {.panel-tabset} -## pip - -```bash -pip install pointblank[sqlite] # SQLite -pip install pointblank[duckdb] # DuckDB -pip install pointblank[postgres] # PostgreSQL -pip install pointblank[mysql] # MySQL -pip install pointblank[mssql] # Microsoft SQL Server -pip install pointblank[bigquery] # BigQuery -pip install pointblank[pyspark] # Apache Spark -pip install pointblank[databricks] # Databricks -pip install pointblank[snowflake] # Snowflake - -# Example of installing multiple backends -pip install pointblank[duckdb,postgres,sqlite] -``` - -## uv - -```bash -uv pip install pointblank[sqlite] # SQLite -uv pip install pointblank[duckdb] # DuckDB -uv pip install pointblank[postgres] # PostgreSQL -uv pip install pointblank[mysql] # MySQL -uv pip install pointblank[mssql] # Microsoft SQL Server -uv pip install pointblank[bigquery] # BigQuery -uv pip install pointblank[pyspark] # Apache Spark -uv pip install pointblank[databricks] # Databricks -uv pip install pointblank[snowflake] # Snowflake - -# Example of installing multiple backends -uv pip install pointblank[duckdb,postgres,sqlite] -``` - -## conda - -```bash -conda install -c conda-forge pointblank-sqlite # SQLite -conda install -c conda-forge pointblank-duckdb # DuckDB -conda install -c conda-forge pointblank-postgres # PostgreSQL -conda install -c conda-forge pointblank-mysql # MySQL -conda install -c conda-forge pointblank-mssql # Microsoft SQL Server -conda install -c conda-forge pointblank-bigquery # BigQuery -conda install -c conda-forge pointblank-pyspark # Apache Spark -conda install -c conda-forge pointblank-databricks # Databricks -conda install -c conda-forge pointblank-snowflake # Snowflake - -# Example of installing multiple backends -conda install -c conda-forge pointblank-duckdb pointblank-postgres pointblank-sqlite -``` - -## 
pixi - -```bash -pixi add pointblank-sqlite # SQLite -pixi add pointblank-duckdb # DuckDB -pixi add pointblank-postgres # PostgreSQL -pixi add pointblank-mysql # MySQL -pixi add pointblank-mssql # Microsoft SQL Server -pixi add pointblank-bigquery # BigQuery -pixi add pointblank-pyspark # Apache Spark -pixi add pointblank-databricks # Databricks -pixi add pointblank-snowflake # Snowflake - -# Example of installing multiple backends -pixi add pointblank-duckdb pointblank-postgres pointblank-sqlite -``` -::: - -::: {.callout-note} -Even when using exclusively Ibis backends, you still need either Pandas or Polars installed since -Pointblank's reporting functionality (powered by -[Great Tables](https://posit-dev.github.io/great-tables)) requires a DataFrame library for rendering -tabular reporting results. -::: - -### AI-Assisted Validation (Experimental) - -Pointblank includes experimental support for AI-assisted validation plan generation: - -```bash -pip install pointblank[generate] -``` - -This installs the necessary dependencies for working with LLM providers to help generate validation -plans. See the [Draft Validation](draft-validation.qmd) article for how to create validation plans -from existing data. - -### Development Version - -If you want the latest development version with the newest features, you can install directly from -GitHub: - -```bash -pip install git+https://github.com/posit-dev/pointblank.git -``` - -## Verifying Your Installation - -You can verify your installation by importing Pointblank and checking the version: - -```python -import pointblank as pb -print(pb.__version__) -``` - -## System Requirements - -- Python 3.10 or higher -- a supported DataFrame library (Pandas or Polars) -- optional: Ibis (for database connectivity) - -## Next Steps - -Now that you've installed Pointblank, you're ready to start validating your data. If you haven't -read the [Introduction](index.qmd) yet, consider starting there to learn the basic concepts. 
- -If you encounter any installation issues, please -[open an issue on GitHub](https://github.com/posit-dev/pointblank/issues/new) with details about -your system and the specific error messages you're seeing. The maintainers actively monitor these -issues and can help troubleshoot problems. - -For a quick test of your installation, try running a simple validation: - -```python -import pointblank as pb - -# Load a small dataset -data = pb.load_dataset("small_table") - -# Create a simple validation -validation = ( - pb.Validate(data=data) - .col_exists(columns=["a", "b", "c"]) - .interrogate() -) - -# Display the validation results -validation -``` - -## Command Line Interface - -Once installed, Pointblank also provides a powerful command-line interface for quick data validation tasks: - -```bash -# Test the CLI with a built-in dataset -pb validate small_table --check rows-distinct - -# Check if a column exists -pb validate small_table --check col-exists --column a - -# Validate data ranges -pb validate small_table --check col-vals-lt --column a --value 10 -``` - -The CLI is perfect for: - -- quick data quality checks in CI/CD pipelines -- exploratory data analysis from the terminal -- integration with shell scripts and automation workflows - -::: {.callout-tip} -## See the CLI in Action -Watch our [interactive CLI demonstrations](../demos/cli-interactive/index.qmd) to see these commands executing in real-time with actual output formatting. -::: - -Learn more about the CLI capabilities in the [Command Line Interface](cli.qmd) guide. 
diff --git a/docs/user-guide/langs.qmd b/docs/user-guide/langs.qmd deleted file mode 100644 index eb98a2c49c..0000000000 --- a/docs/user-guide/langs.qmd +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: Languages -jupyter: python3 -html-table-processing: none ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_header=False, report_incl_footer_timings=False) -``` - -It's possible to generate reporting in various spoken languages. We do this via the `lang=` -argument in `Validate`. - -
- -![](/assets/pointblank-sales-data.fr.png){width=100%} - -![](/assets/pointblank-sales-data.de.png){width=100%} - -![](/assets/pointblank-sales-data.it.png){width=100%} - -![](/assets/pointblank-sales-data.es.png){width=100%} - -![](/assets/pointblank-sales-data.pt-BR.png){width=100%} - -![](/assets/pointblank-sales-data.nl.png){width=100%} - -![](/assets/pointblank-sales-data.ja.png){width=100%} - -![](/assets/pointblank-sales-data.ko.png){width=100%} - -![](/assets/pointblank-sales-data.zh-CN.png){width=100%} - -
- - diff --git a/docs/user-guide/mcp-quick-start.qmd b/docs/user-guide/mcp-quick-start.qmd deleted file mode 100644 index 156414ef90..0000000000 --- a/docs/user-guide/mcp-quick-start.qmd +++ /dev/null @@ -1,308 +0,0 @@ ---- -title: "MCP Quick Start" -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -Transform your data validation workflow with conversational AI in VS Code or Positron IDE. Here are three simple steps to start validating data through conversation (and no complex configuration required). - -### 1. Install - -```bash -pip install pointblank[mcp] -``` - -### 2. Configure Your IDE - -**For VS Code**: - -**Option 1: Workspace Configuration (Recommended for teams)** - -1. Create a `.vscode/mcp.json` file in your project folder -2. Add this configuration: - -```json -{ - "servers": { - "pointblank": { - "command": "python", - "args": ["-m", "pointblank_mcp_server.pointblank_server"] - } - } -} -``` - -**Option 2: User Configuration (Personal use)** - -1. Run command: `MCP: Open User Configuration` (Cmd/Ctrl + Shift + P) -2. Add the same JSON configuration above - -> ⚠️ **Security Note**: Only add MCP servers from trusted sources. VS Code will ask you to confirm trust when starting the server for the first time. - -**For Positron**: - -1. Open Positron Settings -2. Navigate to MCP Server configuration -3. Add the configuration (format may vary) - -> **Note**: If you don't see MCP settings, you may need to install an MCP extension first. Search for "MCP" in the Extensions marketplace. - -### 3. Start Chatting - -``` -"Load my sales data and check its quality" -``` - -That's basically how you get started. - -## Essential Commands - -Master these five command patterns and you'll be able to handle most data validation scenarios. Think of these as your fundamental vocabulary for talking to Pointblank. 
- -### Load Data - -``` -"Load the file /path/to/data.csv" -"Load my Netflix dataset from the working directory" -"Load the CSV file with sales metrics" -"Load customer_data.csv as my main dataset" -``` - -### Explore Data - -``` -"Analyze the data for netflix_data" -"Show me a preview of the loaded data" -"Create a column summary table" -"Generate a missing values analysis" -``` - -**What you'll get**: Comprehensive data profiling with statistics including missing values, data types, distributions, and summary statistics for each column. The preview and summary tables are automatically generated as beautiful HTML files that open in your browser. This gives you a complete picture of your dataset's structure and characteristics before you define quality rules. - -### Check Quality - -``` -"Create a validator for netflix_data" -"Add validation that ratings are between 0 and 10" -"Check that all release years are reasonable" -"Apply the basic_quality template" -``` - -**What you'll get**: Actual data quality validation that checks your data against business rules and domain knowledge. This tells you if your data meets your specific quality requirements and identifies rows that fail validation criteria. - -### Create Data Validations - -``` -"Add validation that show_id values are unique" -"Check that cast field is not empty for movies" -"Ensure vote_count is greater than 0" -"Validate that country field follows ISO format" -``` - -**What you'll get**: Individual validation rules added to your validator. Each rule tests a specific business requirement and can be customized with thresholds and actions. - -### Run and Export - -``` -"Run the validation and show results" -"Export validation failures to CSV" -"Get failed rows for step 2" -"Save the validation report" -``` - ---- - -## Top Three Use Cases - -Once you have Pointblank set up, these scenarios show the most common and powerful ways to use natural language data validation. 
Each example demonstrates the conversational flow and immediate value you'll experience. - -### 1. 📊 Quick Data Check - -**When**: You get a new dataset - -``` -👤 "I just got netflix_data.csv. Is it clean?" -🤖 "Loading dataset... Found 16,000 movies/shows with 18 columns. - Quality analysis complete: - - ✅ No duplicate show IDs - - ⚠️ 466 missing countries (2.9%) - - ⚠️ 132 missing directors (0.8%) - Opening data preview in your browser..." -``` - -### 2. 🔄 Daily Validation - -**When**: Regular data quality checks - -``` -👤 "Apply the basic_quality template to my data" -🤖 "Creating validator with basic quality checks... - ✅ Data types validated - ✅ Missing value thresholds checked - ✅ Duplicate detection passed - Validation complete! All checks passed." -``` - -### 3. 🚨 Issue Investigation - -**When**: Something looks wrong - -``` -👤 "Show me movies with ratings above 9.5" -🤖 "Creating validator for high ratings... - Found 47 entries with ratings > 9.5 - Exporting suspicious rows to CSV - Most are documentaries - could be valid!" -``` - -These examples show how natural conversation can quickly identify and resolve data quality issues that might take hours to diagnose manually. 
- -## Core Capabilities - -Pointblank's MCP server provides powerful tools for comprehensive data validation with beautiful, interactive HTML reports: - -### Data Exploration - -- **Interactive HTML previews** with automatic browser opening showing head/tail rows -- **Column summary tables** with detailed statistics and color-coded data types -- **Missing values analysis** with visual patterns and percentages -- **Data quality analysis** with comprehensive profiling insights - -### Validation Workflows - -- **Validator creation** with flexible thresholds and configuration -- **Many validation types** for comprehensive data quality checking -- **Step-by-step validation** building with natural language commands -- **Template-based validation** for common data quality patterns - -### HTML Reports & Analysis - -- **Interactive validation reports** automatically opened in your browser -- **Timestamped HTML files** for easy sharing and documentation -- **Python code generation** for reproducible validation scripts - -All interactions use natural language, making advanced data validation accessible to users at any technical level while producing publication-ready HTML reports. - -## Common Validation Rules - -Understanding what validation rules to ask for will help you quickly build comprehensive data quality checks. These examples cover the most frequent validation scenarios using Pointblank's built-in validation functions. 
- -### Data Integrity - -- "Check for duplicate show IDs" -- "Ensure no missing required fields like title" -- "Validate that release years are between 1900 and 2025" - -### Business Logic - -- "Ratings must be between 0 and 10" -- "Budget must be positive numbers" -- "Duration should be greater than 0" - -### Cross-Field Validation - -- "Release year should match date_added year" -- "Vote count should correlate with popularity" -- "Movies should have directors specified" - -### Available Templates - -Pointblank includes pre-built validation templates: - -- `basic_quality` - Essential data quality checks -- `financial_data` - Money and numeric validations -- `customer_data` - Personal information validations -- `sensor_data` - Time series and measurement checks -- `survey_data` - Response and rating validations - -These rule patterns can be combined and customized for your specific data and business requirements. The natural language interface makes it easy to express complex validation logic without learning technical syntax. - -## Some Tips and Tricks - -These recommendations will help you get more value from your Pointblank MCP server and avoid some common pitfalls. - -### Talk Naturally - -✅ **Good:** "Check if customer emails look valid" - -❌ **Avoid:** "Execute col_vals_regex on email column" - -### Provide Context - -✅ **Good:** "This is for the board presentation" - -❌ **Avoid:** Just asking for validation without explanation - -### Build Incrementally - -1. Start with data profiling -2. Add basic validation rules -3. Create templates for reuse -4. 
Set up automated checks - -### Save Templates - -``` -"Save these rules as 'customer_validation'" -"Apply the financial_data template" -"Use our standard survey validation" -``` - -### Interactive Visual Tables - -Pointblank automatically generates beautiful, interactive HTML tables for data exploration: - -``` -"Show me a preview of the data" -"Generate a column summary table" -"Create a missing values analysis" -``` - -These commands create professional HTML tables with: - -- **Color-coded data types** (numeric in purple, text in yellow) -- **Gradient styling** tailored to each table type -- **Automatic browser opening** for immediate viewing -- **Timestamped files** for easy reference and sharing - -The tables open automatically in your default browser, making it easy to share data insights with colleagues or include in presentations. - -These practices help you build data quality workflows that scale with your needs while remaining accessible to those with varying technical backgrounds. - -## File Support - -Pointblank works with many major data file formats, making it easy to validate data regardless of how it's stored. This support means you can maintain consistent validation practices across your entire data ecosystem. - -| Type | Extensions | Example | Backend Support | -|------|------------|---------|-----------------| -| **CSV** | `.csv` | `sales_data.csv` | pandas, polars | -| **Parquet** | `.parquet` | `big_data.parquet` | pandas, polars | -| **JSON** | `.json` | `api_response.json` | pandas, polars | -| **JSON Lines** | `.jsonl` | `streaming_data.jsonl` | pandas, polars | - -The consistent natural language interface works the same regardless of file format, so you can focus on validation logic rather than technical details. Polars provides faster processing for large datasets, while Pandas offers broader format support. - -## Quick Troubleshooting - -When you encounter issues, these quick fixes resolve the most common problems. 
Furthermore, the natural language interface means you can always ask for help and explanations. - -| Problem | Quick Fix | -|---------|-----------| -| "File not found" | Use absolute path: `/Users/name/Downloads/data.csv` | -| "DataFrame not found" | Check loaded datasets with "List my loaded dataframes" | -| "Validator not found" | Use "List active validators" to see available validators | -| "Validation too slow" | Try "Use pandas backend" or sample your data first | -| "HTML tables won't open" | Check your default browser settings | -| "Need validation ideas" | Ask "Show me validation templates" or "Suggest validations for my data" | - -**Browser Issues**: The HTML tables automatically open in your default browser. If they don't appear, check that your browser isn't blocking pop-ups and that you have a default browser set in your system preferences. - -Remember, you can always ask the AI to explain what's happening or suggest solutions when you run into problems. - -## Now You're Ready! - -You now have everything needed to start validating data through conversation. The beauty of Pointblank's MCP server is that it grows with your expertise: start simple and gradually build more sophisticated validation workflows as you become comfortable with the interface. - -Start with simple commands and build up to more complex validation workflows. The AI will guide you through the process and help you create robust data quality checks! diff --git a/docs/user-guide/missing-vals-tbl.qmd b/docs/user-guide/missing-vals-tbl.qmd deleted file mode 100644 index 951392099e..0000000000 --- a/docs/user-guide/missing-vals-tbl.qmd +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: Missing Values Reporting -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -``` - -Sometimes values just aren't there: they're missing. This can either be expected or another thing to -worry about. 
Either way, we can dig a little deeper if need be and use the `missing_vals_tbl()` -function to generate a summary table that can elucidate how many values are missing, and roughly -where. - -## Using and Understanding `missing_vals_tbl()`{.qd-no-link} - -The missing values table is arranged a lot like the column summary table (generated via the -`col_summary_tbl()` function) in that columns of the input table are arranged as rows in the -reporting table. Let's use `missing_vals_tbl()` on the `nycflights` dataset, which has a lot of -missing values: - -```{python} -import pointblank as pb - -nycflights = pb.load_dataset(dataset="nycflights", tbl_type="polars") - -pb.missing_vals_tbl(nycflights) -``` - -There are 18 columns in `nycflights` and they're arranged down the missing values table as rows. To -the right we see column headers indicating 10 columns that are row sectors. Row sectors are groups -of rows and each sector contains a tenth of the total rows in the table. The leftmost sectors are -the rows at the top of the table whereas the sectors on the right are closer to the bottom. If you'd -like to know which rows make up each row sector, there are details on this in the table footer area -(click the `ROW SECTORS` text or the disclosure triangle). - -Now that we know about row sectors, we need to understand the visuals here. A light blue cell -indicates there are no (`0`) missing values within a given row sector of a column. For `nycflights` -we can see that several columns have no missing values at all (i.e., the light blue color makes up -the entire row in the missing values table). - -When there are missing values in a column's row sector, you'll be met with a grayscale color. The -proportion of missing values corresponds to the color ramp from light gray to solid black. 
-Interestingly, most of the columns that have missing values appear to be related to each other in -terms of the extent of missing values (i.e., the appearance in the reporting table looks roughly the -same, indicating a sort of systematic missingness). These columns are `dep_time`, `dep_delay`, -`arr_time`, `arr_delay`, and `air_time`. - -The odd column out with regard to the distribution of missing values is `tailnum`. By scanning the -row and observing that the grayscale color values are all a little different we see that the degree -of missingness is more variable and not related to the other columns containing missing values. - -## Missing Value Tables from the Other Datasets - -The `small_table` dataset has only 13 rows to it. Let's use that as a Pandas DataFrame with -`missing_vals_tbl()`: - -```{python} -small_table = pb.load_dataset(dataset="small_table", tbl_type="pandas") - -pb.missing_vals_tbl(small_table) -``` - -It appears that only column `c` has missing values. And since the table is very small in terms of -row count, most of the row sectors contain only a single row. - -The `game_revenue` dataset has *no* missing values. And this can be easily proven by using -`missing_vals_tbl()` with it: - -```{python} -game_revenue = pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") - -pb.missing_vals_tbl(game_revenue) -``` - -We see nothing but light blue in this report! The header also indicates that there are no missing -values by displaying a large green check mark (the other report tables provided a count of total -missing values across all columns). 
diff --git a/docs/user-guide/preprocessing.qmd b/docs/user-guide/preprocessing.qmd deleted file mode 100644 index 2cd8f395e1..0000000000 --- a/docs/user-guide/preprocessing.qmd +++ /dev/null @@ -1,311 +0,0 @@ ---- -title: Preprocessing -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -While the available validation methods can do a lot for you, there's likewise a lot of things you -*can't* easily do with them. What if you wanted to validate that - -- string lengths in a column are less than 10 characters? -- the median of values in a column is less than the median of values in another column? -- there are at least three instances of every categorical value in a column? - -These constitute more sophisticated validation requirements, yet such examinations are quite -prevalent in practice. Rather than expanding our library to encompass every conceivable validation -scenario (a pursuit that would yield an unwieldy and potentially infinite collection) we instead -employ a more elegant approach. By transforming the table under examination through judicious -preprocessing and exposing key metrics, we may subsequently employ the existing collection of -validation methods. This compositional strategy affords us considerable analytical power while -maintaining conceptual clarity and implementation parsimony. - -Central to this approach is the idea of composability. Pointblank makes it easy to safely transform -the target table for a given validation via the `pre=` argument. Any computed columns are available -for the (short) lifetime of the validation step during interrogation. This composability means: - -1. we can validate on different forms of the initial dataset (e.g., validating on -aggregate forms, validating on calculated columns, etc.) -2. 
there's no need to start an entirely new validation process for each transformed version of the -data (i.e., one tabular report could be produced instead of several) - -This compositional paradigm allows us to use data transformation effectively within our validation -workflows, maintaining both flexibility and clarity in our data quality assessments. - -## Transforming Data with Lambda Functions - -Now, through examples, let's look at the process of performing the validations mentioned above. -We'll use the `small_table` dataset for all of the examples. Here it is in its entirety: - -```{python} -#| echo: false -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars"), n_head=20, n_tail=20) -``` - -In getting to grips with the basics, we'll try to validate that string lengths in the `b` column are -less than 10 characters. We can't directly use the `~~Validate.col_vals_lt()` validation method with -that column because it is meant to be used with a column of numeric values. Let's just give that -method what it needs and create a column with string lengths! 
- -The target table is a Polars DataFrame so we'll provide a function that uses the Polars API to add -in that numeric column: - -```{python} -import polars as pl - -# Define a preprocessing function that gets string lengths from column `b` -def add_string_length_column(df): - return df.with_columns(string_lengths=pl.col("b").str.len_chars()) - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="String lengths" - ) - .col_vals_lt( - - # The generated column, via `pre=` (see below) --- - columns="string_lengths", - - # The string length value to be less than --- - value=10, - - # The preprocessing function that modifies the table --- - pre=add_string_length_column - ) - .interrogate() -) -``` - -The validation was successfully constructed and we can see from the validation report table that all -strings in `b` had lengths less than 10 characters. Also note that the icon under the `TBL` column -is no longer a rightward-facing arrow, but one that is indicative of a transformation taking place. - -Let's examine the transformation approach more closely. In the previous example, we're not directly -testing the `b` column itself. Instead, we're validating the `string_lengths` column that was -generated by the preprocessing function provided to `pre=`. The Polars API's `with_columns()` method does -the heavy lifting, creating numerical values that represent each string's length in the original -column. - -That transformation occurs only during interrogation and only for that validation step. Any prior or -subsequent steps would normally use the as-provided `small_table`. Having the possibility for -data transformation being isolated at the step level means that you don't have to generate separate -validation plans for each form of the data, you're free to fluidly transform the target table as -necessary to perform validations on different representations of the data. 
- -## Using Custom Functions for Preprocessing - -While lambda functions work well for simple transformations, custom named functions can make your -validation code more organized and reusable, especially for complex preprocessing logic. Let's -implement the same string length validation using a dedicated function: - -```{python} -def add_string_lengths(df): - # This generates string length from a column `b`; the new column with - # the values is called `string_lengths` (will be placed as the last column) - return df.with_columns(string_lengths=pl.col("b").str.len_chars()) - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="String lengths for column `b`." - ) - .col_vals_lt( - - # Use of a column selector function to select the last column --- - columns=pb.last_n(1), - - # The string length to be less than --- - value=10, - - # Custom function for generating string lengths in a new column --- - pre=add_string_lengths - ) - .interrogate() -) -``` - -The column-generating logic was placed in the `add_string_lengths()` function, which is then passed -to `pre=`. Notice we're using `pb.last_n(1)` in the `columns` parameter. This is a convenient column -selector that targets the last column in the DataFrame, which in our case is the newly created -`string_lengths` column. This saves us from having to explicitly write out the column name, making -our code more adaptable if column names change. Despite not specifying the name directly, you'll -still see the actual column name (`string_lengths`) displayed in the validation report. - -## Creating Parameterized Preprocessing Functions - -So far we've used simple functions and lambdas, but sometimes you may want to create more flexible -preprocessing functions that can be configured with parameters. 
Let's create a reusable function -that can calculate string lengths for any column: - -```{python} -def string_length_calculator(column_name): - """Returns a preprocessing function that calculates string lengths for the specified column.""" - def preprocessor(df): - return df.with_columns(string_lengths=pl.col(column_name).str.len_chars()) - return preprocessor - -# Validate string lengths in column b -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="String lengths for column `b`." - ) - .col_vals_lt( - columns=pb.last_n(1), - value=10, - pre=string_length_calculator(column_name="b") - ) - .interrogate() -) -``` - -This pattern is called a *function factory*, which is a function that creates and returns another -function. The outer function (`string_length_calculator()`) accepts parameters that customize the -behavior of the returned preprocessing function. The inner function (`preprocessor()`) is what -actually gets called during validation. - -This approach offers several benefits as it: - -- creates reusable, configurable preprocessing functions -- keeps your validation code DRY -- allows you to separate configuration from implementation -- enables easy application of the same transformation to different columns - -You could extend this pattern to create even more sophisticated preprocessing functions with -multiple parameters, default values, and complex logic. - -## Using Narwhals to Preprocess Many Types of DataFrames - -In this previous example we used a Polars table. You might have a situation where you perform data -validation variously on Pandas and Polars DataFrames. This is where Narwhals becomes handy: it -provides a single, consistent API that works across multiple DataFrame types, eliminating the need -to learn and switch between different APIs depending on your data source. - -Let's obtain `small_table` as a Pandas DataFrame. 
We'll construct a validation step to verify that -the median of column `c` is greater than the median in column `a`. - -```{python} -import narwhals as nw - -# Define preprocessing function using Narwhals for cross-backend compatibility -def get_median_columns_c_and_a(df): - return nw.from_native(df).select(nw.median("c"), nw.median("a")) - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), - tbl_name="small_table", - label="Median comparison.", - ) - .col_vals_gt( - columns="c", - value=pb.col("a"), - - # Using Narwhals to modify the table; generates table with columns `c` and `a` --- - pre=get_median_columns_c_and_a - ) - .interrogate() -) -``` - -The goal is to check that the median value of `c` is greater than the corresponding median of -column `a`, which we set up through the `columns=` and `value=` parameters in the -`~~Validate.col_vals_gt()` method. - -There's a bit to unpack here so let's look at the preprocessing function first. Narwhals can translate -a Pandas DataFrame to a Narwhals DataFrame with its `from_native()` function. After that initiating -step, you're free to use the Narwhals API (which is modeled on a subset of the Polars API) to do the -necessary data transformation. In this case, we are getting the medians of the `c` and `a` columns -and ending up with a one-row, two-column table. - -We should note that the transformed table is, perhaps surprisingly, a Narwhals DataFrame (we didn't -have to go back to a Pandas DataFrame by using `.to_native()`). Pointblank is able to work directly -with the Narwhals DataFrame for validation purposes, which makes the workflow more concise. - -One more thing to note: Pointblank provides convenient syntactic sugar for working with Narwhals. -If you name the function parameter `dfn` instead of `df`, the system automatically applies -`nw.from_native()` to the input DataFrame first. 
This lets you write more concise code without -having to explicitly convert the DataFrame to a Narwhals format. - -## Swapping in a Totally Different DataFrame - -Sometimes data validation requires looking at completely transformed versions of your data (such as -aggregated summaries, pivoted views, or even reference tables). While this approach goes against the -typical paradigm of validating a single *target table*, there are legitimate use cases where you -might need to validate properties that only emerge after significant transformations. - -Let's now try to prepare the final validation scenario, checking that there are at least three -instances of every categorical value in column `f` (which contains string values in the set of -`"low"`, `"mid"`, and `"high"`). This time, we'll prepare the transformed table (transformed by -Polars expressions) outside of the Pointblank code. - -```{python} -data_original = pb.load_dataset(dataset="small_table", tbl_type="polars") -data_transformed = data_original.group_by("f").len(name="n") - -data_transformed -``` - -Then, we'll plug in the `data_transformed` DataFrame with a preprocessing function: - -```{python} -# Define preprocessing function to use the transformed data -def use_transformed_data(df): - return data_transformed - -( - pb.Validate( - data=data_original, - tbl_name="small_table", - label="Category counts.", - ) - .col_vals_ge( - columns="n", - value=3, - pre=use_transformed_data - ) - .interrogate() -) -``` - -We can see from the validation report table that there are three test units. This corresponds to a -row for each of the categorical value counts. From the report, we find that two of the three test -units are passing test units (turns out there are only two instances of `"mid"` in column `f`). - -Note that the swapped-in table can be any table type that Pointblank supports, like a Polars -DataFrame (as shown here), a Pandas DataFrame, a Narwhals DataFrame, or any other compatible format. 
-This flexibility allows you to validate properties of your data that might only be apparent after -significant reshaping or aggregation. - -## Conclusion - -The preprocessing capabilities in Pointblank provide the power and flexibility for validating -complex data properties beyond what's directly possible with the standard validation methods. -Through the `pre=` parameter, you can: - -- transform your data on-the-fly with computed columns -- generate aggregated metrics to validate statistical properties -- work seamlessly across different DataFrame types using Narwhals -- swap in completely different tables when validating properties that emerge only after -transformation - -By combining these preprocessing techniques with Pointblank's validation methods, you can create -comprehensive data quality checks that address virtually any validation scenario without needing an -endless library of specialized validation functions. This composable approach keeps your validation -code concise while allowing you to verify even the most complex data quality requirements. - -Remember that preprocessing happens just for the specific validation step, keeping your validation -plan organized and maintaining the integrity of your original data throughout the rest of the -validation process. diff --git a/docs/user-guide/preview.qmd b/docs/user-guide/preview.qmd deleted file mode 100644 index 7e958f5815..0000000000 --- a/docs/user-guide/preview.qmd +++ /dev/null @@ -1,136 +0,0 @@ ---- -title: Previewing Data -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -``` - -In many cases, it's *good* to look at your data tables. Before validating a table, you'll likely -want to inspect a portion of it before diving into the creation of data-quality rules. 
This is -pretty easily done with Polars and Pandas DataFrames, however, it's not as easy with database tables -and each table backend displays things differently. - -To make this common task a little better, you can use the `preview()` function in Pointblank. It has -been designed to work with every table that the package supports (i.e., DataFrames and Ibis-backend -tables, the latter of which are largely database tables). Plus, what's shown in the output is -consistent, no matter what type of data you're looking at. - -## Viewing a Table with `preview()`{.qd-no-link} - -Let's look at how `preview()` works. It requires only a table and, for this first example, let's use -the `nycflights` dataset: - -```{python} -import pointblank as pb - -nycflights = pb.load_dataset(dataset="nycflights", tbl_type="polars") - -pb.preview(nycflights) -``` - -This is an HTML table using the style of the other reporting tables in the library. The header is -more minimal here, only showing the type of table we're looking at (`POLARS` in this case) along -with the table dimensions. The column headers provide both the column names and the column data -types. - -By default, we're getting the first five rows and the last five rows. Row numbers (from the original -dataset) provide an indication of which rows are the head and tail rows. The blue lines provide -additional demarcation of the column containing the row numbers and the head and tail row groups. -Finally, any cells with missing values are prominently styled with red lettering and a lighter red -background. - -If you'd rather not see the row numbers in the table, you can use the `show_row_numbers=False` -option. Let's try that with the `game_revenue` dataset as a DuckDB table: - -```{python} -game_revenue = pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") - -pb.preview(game_revenue, show_row_numbers=False) -``` - -With the above preview, the row numbers are gone. 
The horizontal blue line still serves to divide -the top and bottom rows of the table, however. - -## Adjusting the Number of Rows Shown - -It could be that displaying the five top and bottom rows is not preferred. This can be changed with -the `n_head=` and `n_tail=` arguments. Maybe you want three from the top along with the last row? Let's try -that out with the `small_table` dataset as a Pandas DataFrame: - -```{python} -small_table = pb.load_dataset(dataset="small_table", tbl_type="pandas") - -pb.preview(small_table, n_head=3, n_tail=1) -``` - -If you're looking at a small table and want to see the entirety of it, you can enlarge the `n_head=` -and `n_tail=` values: - -```{python} -small_table = pb.load_dataset(dataset="small_table", tbl_type="pandas") - -pb.preview(small_table, n_head=10, n_tail=10) -``` - -Given that the table has 13 rows, asking for 20 rows to be displayed effectively shows the entire -table. - -## Previewing a Subset of Columns - -The preview scales well to tables that have many columns by allowing for a horizontal scroll. -However, previewing data from all columns can be impractical if you're only concerned with a key set -of them. To preview only a subset of a table's columns, we can use the `columns_subset=` argument. -Let's do this with the `nycflights` dataset and provide a list of six columns from that table. - -```{python} -pb.preview( - nycflights, - columns_subset=["hour", "minute", "sched_dep_time", "year", "month", "day"] -) -``` - -What we see are the six columns we specified from the `nycflights` dataset. - -Note that the columns are displayed in the order provided in the `columns_subset=` list. This can be -useful for making quick, side-by-side comparisons. In the example above, we placed `hour` and -`minute` next to the `sched_dep_time` column. 
In the original dataset, `sched_dep_time` is far -apart from the other two columns, but, it's useful to have them next to each other in the preview -since `hour` and `minute` are derived from `sched_dep_time` (and this lets us spot check any -issues). - -We can also use column selectors within `columns_subset=`. Suppose we want to only see those columns -that have `"dep_"` or `"arr_"` in the name. To do that, we use the `matches()` column selector -function: - -```{python} -pb.preview(nycflights, columns_subset=pb.matches("dep_|arr_")) -``` - -Several selectors can be combined together through use of the `col()` function and operators such as -`&` (*and*), `|` (*or*), `-` (*difference*), and `~` (*not*). Let's look at a column selection case -where: - -- the first three columns are selected -- all columns containing `"dep_"` or `"arr_"` are selected -- any columns beginning with `"sched"` are omitted - -This is how we put that together within `col()`: - -```{python} -pb.preview( - nycflights, - columns_subset=pb.col((pb.first_n(3) | pb.matches("dep_|arr_")) & ~ pb.starts_with("sched")) -) -``` - -This gives us a preview with only the columns that fit the specific selection rules. Incidentally, -using selectors with a dataset through `preview()` is a good way to test out the use of selectors -more generally. Since they are primarily used to select columns for validation, trying them -beforehand with `preview()` can help verify that your selection logic is sound. 
diff --git a/docs/user-guide/quickstart.qmd b/docs/user-guide/quickstart.qmd deleted file mode 100644 index 9d435dd125..0000000000 --- a/docs/user-guide/quickstart.qmd +++ /dev/null @@ -1,229 +0,0 @@ ---- -title: Quickstart -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -The Pointblank library is all about assessing the state of data quality for a table. You provide the -validation rules and the library will dutifully interrogate the data and provide useful reporting. -We can use different types of tables like Polars and Pandas DataFrames, Parquet files, or various -database tables. Let's walk through what data validation looks like in Pointblank. - -## A Simple Validation Table - -This is a validation report table that is produced from a validation of a Polars DataFrame: - -```{python} -#| code-fold: true -#| code-summary: "Show the code" -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table"), label="Example Validation") - .col_vals_lt(columns="a", value=10) - .col_vals_between(columns="d", left=0, right=5000) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) - .col_vals_regex(columns="b", pattern=r"^[0-9]-[a-z]{3}-[0-9]{3}$") - .interrogate() -) -``` - -Each row in this reporting table constitutes a single validation step. Roughly, the left-hand side -outlines the validation rules and the right-hand side provides the results of each validation step. -While simple in principle, there's a lot of useful information packed into this validation table. 
- -Here's a diagram that describes a few of the important parts of the validation table: - -![](/assets/validation-table-diagram.png){width=100%} - -There are three things that should be noted here: - -- validation steps: each step is a separate test on the table, focused on a certain aspect of the -table -- validation rules: the validation type is provided here along with key constraints -- validation results: interrogation results are provided here, with a breakdown of test units -(*total*, *passing*, and *failing*), threshold flags, and more - -The intent is to provide the key information in one place, and have it be interpretable by data -stakeholders. For example, a failure can be seen in the second row (notice there's a CSV button). A -data quality stakeholder could click this to download a CSV of the failing rows for that step. - -## Example Code, Step-by-Step - -This section will walk you through the example code used above. - -```python -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_lt(columns="a", value=10) - .col_vals_between(columns="d", left=0, right=5000) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) - .col_vals_regex(columns="b", pattern=r"^[0-9]-[a-z]{3}-[0-9]{3}$") - .interrogate() -) -``` - -Note these three key pieces in the code: - -- **data**: the `Validate(data=)` argument takes a DataFrame or database table that you want to validate -- **steps**: the methods starting with `col_vals_` specify validation steps that run on specific columns -- **execution**: the `~~Validate.interrogate()` method executes the validation plan on the table - -This common pattern is used in a validation workflow, where `Validate` and -`~~Validate.interrogate()` bookend a validation plan generated through calling validation methods. - -In the next few sections we'll go a bit further by understanding how we can measure data quality and -respond to failures. 
- -## Understanding Test Units - -Each validation step will execute a type of validation test on the target table. For example, a -`~~Validate.col_vals_lt()` validation step can test that each value in a column is less than a -specified number. And the key finding that's reported in each step is the number of *test units* -that pass or fail. - -In the validation report table, test unit metrics are displayed under the `UNITS`, `PASS`, and -`FAIL` columns. This diagram explains what the tabulated values signify: - -![](/assets/validation-test-units.png){width=100%} - -Test units are dependent on the test being run. Some validation methods might test every value in a -particular column, so each value will be a test unit. Others will only have a single test unit since -they aren't testing individual values but rather if the overall test passes or fails. - -## Setting Thresholds for Data Quality Signals - -Understanding test units is essential because they form the foundation of Pointblank's threshold -system. Thresholds let you define acceptable levels of data quality, triggering different severity -signals ('warning', 'error', or 'critical') when certain failure conditions are met. 
- -Here's a simple example that uses a single validation step along with thresholds set using the -`Thresholds` class: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_lt( - columns="a", - value=7, - - # Set the 'warning' and 'error' thresholds --- - thresholds=pb.Thresholds(warning=2, error=4) - ) - .interrogate() -) -``` - -If you look at the validation report table, we can see: - -- the `FAIL` column shows that 2 test units have failed -- the `W` column (short for 'warning') shows a filled gray circle indicating those failing test -units reached that threshold value -- the `E` column (short for 'error') shows an open yellow circle indicating that the number of -failing test units is below that threshold - -The one final threshold level, `C` (for 'critical'), wasn't set so it appears on the validation -table as a long dash. - -## Taking Action on Threshold Exceedances - -Pointblank becomes even more powerful when you combine thresholds with actions. The -`Actions` class lets you trigger responses when validation failures exceed threshold levels, turning -passive reporting into active notifications. - -Here's a simple example that adds an action to the previous validation: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_lt( - columns="a", - value=7, - thresholds=pb.Thresholds(warning=2, error=4), - - # Set an action for the 'warning' threshold --- - actions=pb.Actions( - warning="WARNING: Column 'a' has values that aren't less than 7." - ) - ) - .interrogate() -) -``` - -Notice the printed warning message: `"WARNING: Column 'a' has values that aren't less than -7."`. The warning indicator (filled gray circle) visually confirms this threshold was reached and -the action should trigger. - -Actions make your validation workflows more responsive and integrated with your data pipelines. For -example, you can generate console messages, Slack notifications, and more. 
- -## Navigating the User Guide - -As you continue exploring Pointblank's capabilities, you'll find the **User Guide** organized into -sections that will help you navigate the various features. - -### Getting Started - -The *Getting Started* section introduces you to Pointblank: - -- [Introduction](index.qmd): Overview of Pointblank and core concepts (**this article**) -- [Installation](installation.qmd): How to install and set up Pointblank - -### Validation Plan - -The *Validation Plan* section covers everything you need to know about creating robust -validation plans: - -- [Overview](validation-overview.qmd): Survey of validation methods and their shared parameters -- [Validation Methods](validation-methods.qmd): A closer look at the more common validation methods -- [Column Selection Patterns](column-selection-patterns.qmd): Techniques for targeting specific columns -- [Preprocessing](preprocessing.qmd): Transform data before validation -- [Segmentation](segmentation.qmd): Apply validations to specific segments of your data -- [Thresholds](thresholds.qmd): Set quality standards and trigger severity levels -- [Actions](actions.qmd): Respond to threshold exceedances with notifications or custom functions -- [Briefs](briefs.qmd): Add context to validation steps - -### Advanced Validation - -The *Advanced Validation* section explores more specialized validation techniques: - -- [Expression-Based Validation](expressions.qmd): Use column expressions for advanced validation -- [Schema Validation](schema-validation.qmd): Enforce table structure and column types -- [Assertions](assertions.qmd): Raise exceptions to enforce data quality requirements -- [Draft Validation](draft-validation.qmd): Create validation plans from existing data - -### Post Interrogation - -After validating your data, the *Post Interrogation* section helps you analyze and respond to -results: - -- [Validation Reports](validation-reports.qmd): Understand and customize the validation report table -- 
[Step Reports](step-reports.qmd): View detailed results for individual validation steps -- [Data Extracts](extracts.qmd): Extract and analyze failing data -- [Sundering Validated Data](sundering.qmd): Split data based on validation results - -### Data Inspection - -The *Data Inspection* section provides tools to explore and understand your data: - -- [Previewing Data](preview.qmd): View samples of your data -- [Column Summaries](col-summary-tbl.qmd): Get statistical summaries of your data -- [Missing Values Reporting](missing-vals-tbl.qmd): Identify and visualize missing data - -By following this guide, you'll gain a comprehensive understanding of how to validate, monitor, and -maintain high-quality data with Pointblank. - -::: {.callout-note} -A [PDF version of the User Guide](../user-guide.pdf) is also available for offline reading. -::: diff --git a/docs/user-guide/schema-validation.qmd b/docs/user-guide/schema-validation.qmd deleted file mode 100644 index 1ef5a5786c..0000000000 --- a/docs/user-guide/schema-validation.qmd +++ /dev/null @@ -1,551 +0,0 @@ ---- -title: Schema Validation -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -Schema validation in Pointblank allows you to verify that your data conforms to an expected -structure and type specification. This is particularly useful when ensuring data consistency across -systems or validating incoming data against predefined requirements. - -Let's first look at the dataset we'll use for the first example: - -```{python} -import pointblank as pb - -# Preview the small_table dataset we'll use throughout this guide -pb.preview(pb.load_dataset(dataset="small_table", tbl_type="polars")) -``` - -## Schema Definition and Validation - -A schema in Pointblank is created using the `Schema` class which defines the expected structure of a -table. 
Once created, you apply schema validation through the `~~Validate.col_schema_match()` -validation step. - -```{python} -# Create a schema definition matching small_table structure -schema = pb.Schema( - columns=[ - ("date_time",), # Only check column name - ("date",), # Only check column name - ("a", "Int64"), # Check name and type - ("b", "String"), # Check name and type - ("c", "Int64"), # Check name and type - ("d", "Float64"), # Check name and type - ("e", "Boolean"), # Check name and type - ("f",), # Only check column name - ] -) - -# Validate the small_table against the schema -small_table_validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Schema validation of `small_table`.", - ) - .col_schema_match(schema=schema) - .interrogate() -) - -small_table_validation -``` - -The output shows the validation passed successfully. When all columns have the correct names and -types as specified in the schema, the validation passes with a single passing test unit. If there -were discrepancies, this would fail, but the basic output wouldn't show specific issues. - -For detailed information about validation results, use `~~Validate.get_step_report()`: - -```{python} -small_table_validation.get_step_report(i=1) -``` - -The step report provides specific details about which columns were checked and whether they matched -the schema, helping diagnose issues when validation fails. - -## Schema Components and Column Types - -When defining a schema, you need to specify column names and optionally their data types. 
By -default, Pointblank enforces strict validation where: - -- all columns in your table must match the specified schema -- column order must match the schema -- column types are case-sensitive -- type names must match exactly - -The schema definition accepts column types as string representations, which vary depending on your -data source: - -- `string`: Character data (may also be `"String"`, `"varchar"`, `"character"`, etc.) -- `integer`: Integer values (may also be `"Int64"`, `"int"`, `"bigint"`, etc.) -- `numeric`: Numeric values including integers and floating-point numbers (may also be `"Float64"`, -`"double"`, `"decimal"`, etc.) -- `boolean`: Logical values (`True`/`False`) (may also be `"Boolean"`, `"bool"`, etc.) -- `datetime`: Date and time values (may also be `"Datetime"`, `"timestamp"`, etc.) -- `date`: Date values (may also be `"Date"`, etc.) -- `time`: Time values - -For specific database engines or DataFrame libraries, you may need to use their exact type names -(like `"VARCHAR(255)"` for SQL databases or `"Int64"` for Polars integers). - -## Discovering Column Types - -To easily determine the correct type string for columns in your data, Pointblank provides two -helpful functions: - -```{python} -import polars as pl -from datetime import date - -# Define a sample dataframe -sample_df = pl.DataFrame({ - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "join_date": [date(2020, 1, 1), date(2021, 3, 15), date(2022, 7, 10)] -}) -``` - -```{python} -# Method 1: Using `preview()` with `show_types=True` to see column types -pb.preview(sample_df) -``` - -```{python} -# Method 2: Using `col_summary_tbl()` which shows column types and other details -pb.col_summary_tbl(sample_df) -``` - -These functions help you identify the exact type strings to use in your schema definitions, -eliminating guesswork and ensuring compatibility with your data source. 
- -## Creating a Schema - -You can create a schema in four different ways, each with its own advantages. All schema objects -can be printed to display their column names and data types. - -### 1. Using a List of Tuples with `columns=` - -This approach allows for mixed validation: some columns checked for both name and type, others only -for name: - -```{python} -schema_tuples = pb.Schema( - - # List of tuples approach: flexible for mixed type/name checking --- - columns=[ - ("name", "String"), # Check name and type - ("age", "Int64"), # Check name and type - ("height",) # Check name only - ] -) - -print(schema_tuples) -``` - -This is the only method that allows checking just column names for some columns while checking both -names and types for others. - -### 2. Using a Dictionary with `columns=` - -This approach is often the most readable when defining a schema manually, especially for larger -schemas: - -```{python} -schema_dict = pb.Schema( - - # Dictionary approach (keys are column names, values are data types) --- - columns={ - "name": "String", - "age": "Int64", - "height": "Float64", - "created_at": "Datetime" - } -) - -print(schema_dict) -``` - -With this method, you must always provide both column names (as keys) and their types (as values). - -### 3. Using Keyword Arguments - -For more readable code with a small number of columns: - -```{python} -schema_kwargs = pb.Schema( - - # Keyword arguments approach (more readable for simple schemas) --- - name="String", - age="Int64", - height="Float64" -) - -print(schema_kwargs) -``` - -Like the dictionary method, this approach requires both column names and types. - -### 4. 
Extracting from an Existing Table with `tbl=` - -You can automatically extract a schema from an existing table: - -```{python} -import polars as pl - -# Create a sample dataframe -df = pl.DataFrame({ - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], - "height": [5.6, 6.0, 5.8] -}) - -# Extract schema from table -schema_from_table = pb.Schema(tbl=df) - -print(schema_from_table) -``` - -This is especially useful when you want to validate that future data matches the structure of a -reference dataset. - -## Multiple Data Types for a Column - -You can specify multiple acceptable types for a column by providing a list of types: - -```{python} -# Schema with multiple possible types for a column -schema_multi_types = pb.Schema( - columns={ - "name": "String", - "age": ["Int64", "Float64"], # Accept either integer or float - "active": "Boolean" - } -) - -print(schema_multi_types) -``` - -This is useful when working with data sources that might represent the same information in different -ways (e.g., integers sometimes stored as floats). - -## Schema Validation Options - -When using `col_schema_match()`, you can customize validation behavior with several important -options: - -| Option | Default | Description | -|--------|---------|-------------| -| `complete` | `True` | Require exact column presence (no extra columns allowed) | -| `in_order` | `True` | Enforce column order | -| `case_sensitive_colnames` | `True` | Make column name matching case-sensitive | -| `case_sensitive_dtypes` | `True` | Make data type matching case-sensitive | -| `full_match_dtypes` | `True` | Require exact (not partial) type name matches | - -### Controlling Column Presence - -By default, `~~Validate.col_schema_match()` requires a complete match between the schema's columns -and the table's columns. 
You can make this more flexible: - -```{python} -# Create a sample table -users_table_extra = pl.DataFrame({ - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], - "extra_col": ["a", "b", "c"] # Extra column not in schema -}) - -# Create a schema -schema = pb.Schema( - columns={"id": "Int64", "name": "String", "age": "Int64"} -) - -# Validate without requiring all columns to be present -validation = ( - pb.Validate(data=users_table_extra) - .col_schema_match( - schema=schema, - - # Allow schema columns to be a subset --- - complete=False - ) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -### Column Order Enforcement - -You can control whether column order matters in your validation: - -```{python} -# Create a sample table -users_table = pl.DataFrame({ - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], -}) - -# Create a schema -schema = pb.Schema( - columns={"name": "String", "age": "Int64", "id": "Int64"} -) - -# Validate without enforcing column order -validation = ( - pb.Validate(data=users_table) - .col_schema_match( - schema=schema, - - # Don't enforce column order --- - in_order=False - ) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -### Case Sensitivity - -Control whether column names and data types are case-sensitive: - -```{python} -# Create schema with different case characteristics -case_schema = pb.Schema( - columns={"ID": "int64", "NAME": "string", "AGE": "int64"} -) - -# Create validation with case-insensitive column names and types -validation = ( - pb.Validate(data=users_table) - .col_schema_match( - schema=case_schema, - - # Ignore case in column names --- - case_sensitive_colnames=False, - - # Ignore case in data type names --- - case_sensitive_dtypes=False - ) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -### Type Matching Precision - -Control how strictly data types must match: - -```{python} -# Create schema with simplified type names 
-type_schema = pb.Schema( - - # Using simplified type names --- - columns={"id": "int", "name": "str", "age": "int"} -) - -# Allow partial type matches -validation = ( - pb.Validate(data=users_table) - .col_schema_match( - schema=type_schema, - - # Ignore case in data type names --- - case_sensitive_dtypes=False, - - # Allow partial type name matches --- - full_match_dtypes=False - ) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -## Common Schema Validation Patterns - -This section explores common patterns for applying schema validation to different scenarios. Each -pattern addresses specific validation needs you might encounter when working with real-world data. -We'll examine the step reports (`~~Validate.get_step_report()`) for these validations since they -provide more detailed information about what was checked and how the validation performed, offering -an intuitive way to understand the results beyond simple pass/fail indicators. - -### Structural Validation Only - -When you only care about column names but not their types: - -```{python} -# Create a schema with only column names -structure_schema = pb.Schema( - columns=["id", "name", "age", "extra_col"] -) - -# Validate structure only -validation = ( - pb.Validate(data=users_table_extra) - .col_schema_match(schema=structure_schema) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -### Mixed Validation - -Validate types for critical columns but just presence for others: - -```{python} -# Mixed validation for different columns -mixed_schema = pb.Schema( - columns=[ - ("id", "Int64"), # Check name and type - ("name", "String"), # Check name and type - ("age",), # Check name only - ("extra_col",) # Check name only - ] -) - -# Validate with mixed approach -validation = ( - pb.Validate(data=users_table_extra) - .col_schema_match(schema=mixed_schema) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -### Progressive Schema Evolution - -As your data evolves, you might need 
to adapt your validation approach: - -```{python} -# Original schema -original_schema = pb.Schema( - columns={ - "id": "Int64", - "name": "String" - } -) - -# New data with additional columns -evolved_data = pl.DataFrame({ - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], # New column - "active": [True, False, True] # New column -}) - -# Validate with flexible parameters -validation = ( - pb.Validate(evolved_data) - .col_schema_match( - schema=original_schema, - - # Allow extra columns --- - complete=False, - - # Don't enforce order --- - in_order=False - ) - .interrogate() -) - -validation.get_step_report(i=1) -``` - -## Integrating with Larger Validation Workflows - -Schema validation is often just one part of a comprehensive data validation strategy. You can -combine schema checks with other validation steps: - -```{python} -# Define a schema -schema = pb.Schema( - columns={ - "id": "Int64", - "name": "String", - "age": "Int64" - } -) - -# Create a validation plan -validation = ( - pb.Validate( - users_table, - label="User data validation", - thresholds=pb.Thresholds(warning=0.05, error=0.1) - ) - - # Add schema validation --- - .col_schema_match(schema=schema) - - # Add other validation steps --- - .col_vals_not_null(columns="id") - .col_vals_gt(columns="age", value=26) - .interrogate() -) - -validation -``` - -This approach allows you to first validate the structure of your data and then check specific -business rules or constraints. - -## Best Practices - -1. Define schemas early: document and define expected data structures early in your data -workflow. - -2. Choose the right creation method: - - use `columns=` for readability with many columns - - use `columns=` for mixed name/type validation - - use `kwargs` for small schemas with simple column names - - use `tbl=` to extract schemas from reference datasets - -3. 
Be deliberate about strictness: choose validation parameters based on your specific needs: - - strict validation (`complete=True`) for critical data interfaces - - flexible validation (`complete=False`, `in_order=False`) for evolving datasets - -4. Reuse schemas: create schema definitions that can be reused across multiple validation -contexts. - -5. Version control schemas: as your data evolves, maintain versions of your schemas to track -changes. - -6. Extract schemas from reference data: when you have a 'golden' dataset that represents your -ideal structure, use `Schema(tbl=reference_data)` to extract its schema. - -7. Consider type flexibility: use multiple types per column (`["Int64", "Float64"]`) when -working with data from diverse sources. - -8. Combine with targeted validation: use schema validation for structural checks and add -specific validation steps for business rules. - -## Conclusion - -Schema validation provides a powerful mechanism for ensuring your data adheres to expected -structural requirements. It serves as an excellent first line of defense in your data validation -strategy, verifying that the data you're working with has the expected shape before applying more -detailed business rule validations. - -The `Schema` class offers multiple ways to define schemas, from manual specification with -dictionaries or keyword arguments to automatic extraction from reference tables. When combined with -the flexible options of `~~Validate.col_schema_match()`, you can implement validation approaches -ranging from strict structural enforcement to more flexible evolution-friendly checks. - -By understanding the different schema creation methods and validation options, you can efficiently -validate the structure of your data tables and ensure they meet your requirements before processing. 
diff --git a/docs/user-guide/segmentation.qmd b/docs/user-guide/segmentation.qmd deleted file mode 100644 index d1f42e2b5d..0000000000 --- a/docs/user-guide/segmentation.qmd +++ /dev/null @@ -1,285 +0,0 @@ ---- -title: Segmentation -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -When validating data, you often need to analyze specific subsets or segments of your data -separately. Maybe you want to ensure that data quality meets standards in each geographic region, -for each product category, or across different time periods. This is where the `segments=` argument -can be useful. - -Data segmentation lets you split a validation step into multiple segments, with each segment -receiving its own validation step. Rather than validating an entire table at once, you could instead -validate different partitions separately and get separate results for each. - -The `segments=` argument is available in many validation methods; typically it's in those methods -that check values within rows, and those methods that examine entire rows -(`~~Validate.rows_distinct()`, `~~Validate.rows_complete()`). When you use it, Pointblank will: - -1. split your data according to your segmentation criteria -2. run the validation separately on each segment -3. report results individually for each segment - -Let's explore how to use the `segments=` argument through a few practical examples. - -## Basic Segmentation by Column Values - -The simplest way to segment data is by the unique values in a column. For the upcoming example, -we'll use the `small_table` dataset, which contains a categorical-value column called `f`. 
- -First, let's preview the dataset: - -```{python} -table = pb.load_dataset() - -pb.preview(table) -``` - -Now, let's validate that values in column `d` are greater than `100`, but we'll also segment the -validation by the categorical values in column `f`: - -```{python} -validation_1 = ( - pb.Validate( - data=pb.load_dataset(), - tbl_name="small_table", - label="Segmented validation by category" - ) - .col_vals_gt( - columns="d", value=100, - - # Segment by unique values in column `f` --- - segments="f" - ) - .interrogate() -) - -validation_1 -``` - -In the validation report, notice that instead of a single validation step, we have multiple steps: -one for each unique value in the `f` column. The segmentation is clearly indicated in the `STEP` -column with labels like `SEGMENT f / high`, making it easy to identify which segment each -validation result belongs to. This clear labeling helps when reviewing reports, especially with -complex validations that use multiple segmentation criteria. - -## Segmenting on Specific Values - -Sometimes you don't want to segment on all unique values in a column, but only on specific ones of -interest. You can do this by providing a tuple with the column name and a list of values: - -```{python} -validation_2 = ( - pb.Validate( - data=pb.load_dataset(), - tbl_name="small_table", - label="Segmented validation on specific categories" - ) - .col_vals_gt( - columns="d", - value=100, - segments=("f", ["low", "high"]) # Only segment on "low" and "high" values in column `f` - ) - .interrogate() -) - -validation_2 -``` - -In this example, we only create validation steps for the `"low"` and `"high"` segments, ignoring any -rows with `f` equal to `"mid"`. - -## Multiple Segmentation Criteria - -For more complex segmentation, you can provide a list of columns or column-value tuples. 
This -creates segments based on combinations of criteria: - -```{python} -validation_3 = ( - pb.Validate( - data=pb.load_dataset(), - tbl_name="small_table", - label="Multiple segmentation criteria" - ) - .col_vals_gt( - columns="d", - value=100, - - # Segment by values in `f` AND specific values in `a` --- - segments=["f", ("a", [1, 2])] - ) - .interrogate() -) - -validation_3 -``` - -This creates validation steps for each combination of values in column `f` and the specified values -in column `a`. - -## Segmentation with Preprocessing - -You can combine segmentation with preprocessing for powerful and flexible validations. All -preprocessing is applied before segmentation occurs, which means you can create derived columns to -segment on: - -```{python} -import polars as pl - -# Define preprocessing function for creating a categorical column -def add_d_category_column(df): - return df.with_columns( - d_category=pl.when(pl.col("d") > 150).then(pl.lit("high")).otherwise(pl.lit("low")) - ) - -validation_4 = ( - pb.Validate( - data=pb.load_dataset(tbl_type="polars"), - tbl_name="small_table", - label="Segmentation with preprocessing", - ) - .col_vals_gt( - columns="d", value=100, - - # Create a column containing categorical values --- - pre=add_d_category_column, - - # Segment by the computed column `d_category` generated via `pre=` --- - segments="d_category", - ) - .interrogate() -) - -validation_4 -``` - -In this example, we first create a derived column `d_category` based on whether `d` is greater than -`150`. Then, we segment our validation based on this derived column by using -`segments="d_category"`. - -## When to Use Segmentation - -Segmentation is particularly useful when: - -1. Data quality standards vary by group: different regions, product lines, or customer segments -might have different acceptable thresholds -2. 
Identifying problem areas: segmentation helps pinpoint exactly where data quality issues -exist, rather than just knowing that some issue exists somewhere in the data -3. Generating detailed reports: by segmenting, you get more granular reporting that can be -shared with different stakeholders responsible for different parts of the data -4. Tracking improvements over time: segmented validations make it easier to see if data quality -is improving in specific areas that were previously problematic - -By using segmentation strategically in these scenarios, you can transform your data validation from -a simple pass/fail system into a much more nuanced diagnostic tool that provides actionable insights -about data quality across different dimensions. This targeted approach not only helps identify -issues more precisely but also enables more effective communication of data quality metrics to -relevant stakeholders. - -## Segmentation vs. Multiple Validation Steps - -So why use segmentation instead of just creating separate validation steps for each segment using -filtering in the `pre=` argument? Well, segmentation offers several nice advantages: - -1. Conciseness: you define your validation logic once, not repeatedly for each segment -2. Consistency: we can be certain that the same validation is applied uniformly across segments -3. Clarity: the validation report will clearly organize results by segment (with extra labeling) -4. Convenience: there's no need to manually extract and filter subsets of your data - -Segmentation can end up simplifying your validation code while also providing more structured and -informative reporting about different portions of your data. 
- -## Practical Example: Validating Sales Data by Region and Product Type - -Let's see a more realistic example where we validate sales data segmented by both region and product -type: - -```{python} -import pandas as pd -import numpy as np - -# Create a sample sales dataset -np.random.seed(123) - -# Create a simple sales dataset -sales_data = pd.DataFrame({ - "region": np.random.choice(["North", "South", "East", "West"], 100), - "product_type": np.random.choice(["Electronics", "Clothing", "Food"], 100), - "units_sold": np.random.randint(5, 100, 100), - "revenue": np.random.uniform(100, 10000, 100), - "cost": np.random.uniform(50, 5000, 100) -}) - -# Calculate profit -sales_data["profit"] = sales_data["revenue"] - sales_data["cost"] -sales_data["profit_margin"] = sales_data["profit"] / sales_data["revenue"] - -# Preview the dataset -pb.preview(sales_data) -``` - -Now, let's validate that profit margins are above 20% across different regions and product types: - -```{python} -validation_5 = ( - pb.Validate( - data=sales_data, - tbl_name="sales_data", - label="Sales data validation by region and product" - ) - .col_vals_gt( - columns="profit_margin", - value=0.2, - segments=["region", "product_type"], - brief="Profit margin > 20% check" - ) - .interrogate() -) - -validation_5 -``` - -This validation gives us a detailed breakdown of profit margin performance across the different -regions and product types, making it easy to identify areas that need attention. - -## Best Practices for Segmentation - -Effective data segmentation requires thoughtful planning about how to divide your data in ways that -make sense for your validation needs. When implementing segmentation in your data validation -workflow, consider these key principles: - -1. Choose meaningful segments: select segmentation columns that align with your business logic and -organizational structure - -2. 
Use preprocessing when needed: if your raw data doesn't have good segmentation columns, create -them through preprocessing (with the `pre=` argument) - -3. Combine with actions: for critical segments, define segment-specific actions using the `actions=` -parameter to respond to validation failures. - -By implementing these best practices, you'll create more targeted, maintainable, and actionable data -validations. Segmentation becomes most powerful when it aligns with natural divisions in your data -and analytical processes, allowing for more precise identification of quality issues while -maintaining a unified validation framework. - -## Conclusion - -Data segmentation can make your validations more targeted and informative. By dividing your data -into meaningful segments, you can identify quality issues with greater precision, apply appropriate -validation standards to different parts of your data, and generate more actionable reports. - -The `segments=` parameter transforms validation from a monolithic process into a granular assessment -of data quality across various dimensions of your dataset. Whether you're dealing with regional -differences, product categories, time periods, or any other meaningful divisions in your data, -segmentation makes it possible to validate each portion according to its specific requirements while -maintaining the simplicity of a unified validation framework. 
diff --git a/docs/user-guide/step-reports.qmd b/docs/user-guide/step-reports.qmd deleted file mode 100644 index 1011546c2b..0000000000 --- a/docs/user-guide/step-reports.qmd +++ /dev/null @@ -1,460 +0,0 @@ ---- -title: "Step Reports" -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -While validation reports provide a comprehensive overview of all validation steps, sometimes you -need to focus on a specific validation step in greater detail. This is where *step reports* come in. -A step report is a detailed examination of a single validation step, providing in-depth information -about the test units that were validated and their pass/fail status. - -Step reports are especially useful when debugging validation failures, investigating problematic -data, or communicating detailed findings to colleagues who are responsible for specific data quality -issues. 
- -## Creating a Step Report - -To create a step report, you first need to run a validation and then use the -`~~Validate.get_step_report()` method, specifying which validation step you want to examine: - -```{python} -import pointblank as pb -import polars as pl - -# Sample data as a Polars DataFrame -data = pl.DataFrame({ - "id": range(1, 11), - "value": [10, 20, 3, 35, 50, 2, 70, 8, 20, 4], - "category": ["A", "B", "C", "A", "D", "F", "A", "E", "H", "G"], - "ratio": [0.5, 0.7, 0.3, 1.2, 0.8, 0.9, 0.4, 1.5, 0.6, 0.2], - "status": ["active", "active", "inactive", "active", "inactive", - "active", "inactive", "active", "active", "inactive"] -}) - -# Create a validation -validation = ( - pb.Validate(data=data, tbl_name="example_data") - .col_vals_gt( - columns="value", - value=10 - ) - .col_vals_in_set( - columns="category", - set=["A", "B", "C"] - ) - .interrogate() -) - -# Get step report for the second validation step (i=2) -step_report = validation.get_step_report(i=2) - -step_report -``` - -In this example, we first create and interrogate a validation object with two steps. We then -generate a step report for the second validation step (`i=2`), which checks if the values in the -`category` column are in the set `["A", "B", "C"]`. - -Note that step numbers in Pointblank start at `1`, matching what you see in the validation report's -`STEP` column (i.e., not 0-based indexing). So the first step is referred to with `i=1`, the second -step with `i=2`, and so on. - -## Understanding Step Report Components - -A step report consists of several key components that provide detailed information about the -validation step: - -1. Header: displays the validation step number, type of validation, and a brief description -2. 
Table Body: presents either the failing rows, a sample of completely passing data, or an -expected/actual comparison (for a `~~Validate.col_schema_match()` step) - -The step report table highlights passing and failing rows, making it easy to identify problematic -data points. This is especially useful for diagnosing issues when dealing with large datasets. - -## Different Types of Step Reports - -It's important to note that step reports vary in appearance and structure depending on the type of -validation method used: - -- Value-based validations (like `~~Validate.col_vals_gt()`, `~~Validate.col_vals_in_set()`): show -individual rows that failed validation -- Uniqueness checks (`~~Validate.rows_distinct()`): group together the duplicate records in order of -appearance -- Schema validations (`~~Validate.col_schema_match()`): display column-level information about -expected vs. actual data types - -Additionally, step reports for value-based validations and uniqueness checks operate in two distinct -modes: - -1. When errors are present: The report shows only the failing rows and, for value-based validations, -clearly highlights the column under study -2. When no errors exist: The report header clearly indicates success, and a sample of the data is -shown (along with the studied column highlighted, for value-based validations) - -This variation in reporting style allows step reports to effectively communicate the specific type -of validation being performed and display relevant information in the most appropriate format. When -you're working with different validation types, expect to see different step report layouts -optimized for each context. - -### Value-Based Validation Step Reports - -Value-based step reports focus on showing individual rows where values in the target column failed -the validation check. These reports highlight the specific column being validated and clearly -display which values violated the condition. 
- -```{python} -# Create sample data with some validation failures -data = pl.DataFrame({ - "id": range(1, 8), - "value": [120, 85, 47, 210, 30, 10, 5], - "category": ["A", "B", "C", "A", "D", "B", "E"] -}) - -# Create a validation with a value-based check -validation_values = ( - pb.Validate(data=data, tbl_name="sales_data") - .col_vals_gt( - columns="value", - value=50, - brief="Sales values should exceed $50" - ) - .interrogate() -) - -# Display the step report for the value-based validation -validation_values.get_step_report(i=1) -``` - -This report clearly identifies which rows contain values that don't meet our threshold, making it -easy to investigate these specific data points. - -### Uniqueness Validation Step Reports - -Uniqueness checks produce a different type of step report that groups duplicate records together. -This format makes it easy to identify patterns in duplicate data. - -```{python} -# Create sample data with some duplicate rows based on the combination of columns -data = pl.DataFrame({ - "customer_id": [101, 102, 103, 101, 104, 105, 102], - "order_date": ["2023-01-15", "2023-01-16", "2023-01-16", - "2023-01-15", "2023-01-17", "2023-01-18", "2023-01-19"], - "product": ["Laptop", "Phone", "Tablet", "Laptop", - "Monitor", "Keyboard", "Headphones"] -}) - -# Create a validation checking for unique customer-product combinations -validation_duplicates = ( - pb.Validate(data=data, tbl_name="order_data") - .rows_distinct( - columns_subset=["customer_id", "product"], - brief="Customer should not order the same product twice" - ) - .interrogate() -) - -# Display the step report for the uniqueness validation -validation_duplicates.get_step_report(i=1) -``` - -The report organizes duplicate records together, making it easy to see which combinations are -repeated and how many times they appear. 
- -### Schema Validation Step Reports - -Schema validation step reports have a completely different structure, comparing expected versus -actual column data types and presence. - -```{python} -schema = pb.Schema( - columns=[ - ("date_time", "timestamp"), - ("dates", "date"), - ("a", "int64"), - ("b",), - ("c",), - ("d", "float64"), - ("e", ["bool", "boolean"]), - ("f", "str"), - ] -) - -validation_schema = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="duckdb"), - tbl_name="small_table", - label="Step report for a schema check" - ) - .col_schema_match(schema=schema) - .interrogate() -) - -# Display the step report for the schema validation -validation_schema.get_step_report(i=1) -``` - -This report style focuses on comparing the expected schema against the actual table structure, -highlighting mismatches in data types or missing/extra columns. The table format makes it easy to -see exactly where the schema expectations differ from reality. - -## Customizing Step Reports - -Step reports can be customized with several parameters to better focus your analysis and tailor the -output to your specific needs. The `~~Validate.get_step_report()` method offers multiple -customization options to help you create more effective reports. - -When a dataset has many columns, you might want to focus on just those relevant to your analysis. -You can create a step report containing only a subset of the columns in the target table: - -```{python} -validation.get_step_report( - i=2, - - # Only show these columns --- - columns_subset=["id", "category", "status"] -) -``` - -This approach makes step reports much easier to interpret by highlighting just the essential columns -that help understand the validation failures. 
- -For large datasets with many failing rows, you might want to use `limit=` to set a cap on the number -of rows shown in the report: - -```{python} -validation.get_step_report( - i=2, - - # Only show up to 2 failing rows --- - limit=2 -) -``` - -The report header can also be extensively customized to provide more specific context. You can -replace the default header with plain text or Markdown formatting: - -```{python} -validation.get_step_report( - i=2, - header="Category Values Validation: *Critical Analysis*" -) -``` - -For more advanced header customization, you can use the templating system with the `{title}` and -`{details}` elements to retain parts of the default header while adding your own content. The -`{title}` template is the default title whereas `{details}` provides information on the assertion, -number of failures, etc. Let's move away from the default template of `{title}{details}` and provide -a custom title to go with the details text: - -```{python} -validation.get_step_report( - i=2, - header="Custom Category Validation Report {details}" -) -``` - -We can keep `{title}` and `{details}` and add some more context in between the two: - -```{python} -validation.get_step_report( - i=2, - header=( - "{title}
" - "" - "This validation is critical for our data quality standards." - "
" - "{details}" - ) -) -``` - -You could always use more HTML and CSS to do *a lot* of customization: - -```{python} -validation.get_step_report( - i=2, - header=( - "VALIDATION SUMMARY\n\n{details}\n\n" - "
" - "
" - "{title}" - "
" - ) -) -``` - -If you prefer no header at all, simply set `header=None`: - -```{python} -validation.get_step_report( - i=2, - header=None -) -``` - -These customization options can be combined to create highly focused reports tailored to specific -needs: - -```{python} -validation.get_step_report( - i=2, - columns_subset=["id", "category"], - header="*Category Validation:* Top Issues", - limit=2 -) -``` - -Through these customization options, you can craft step reports that effectively communicate the -most important information to different audiences. Technical teams might benefit from seeing all -columns but with a limited number of examples. Business stakeholders might prefer a focused view -with only the most relevant columns. For documentation purposes, custom headers provide important -context about what's being validated. - -Remember that customizing your step reports is about more than aesthetics: it's about making complex -validation information more accessible and actionable for all stakeholders involved in data quality. - -## Using Step Reports for Data Investigation - -Step reports can be powerful tools for investigating data quality issues. 
Let's look at a more -complex example: - -```{python} -# Create a more complex dataset with multiple issues -complex_data = pl.DataFrame({ - "id": range(1, 11), - "value": [10, 20, 3, 40, 50, 2, 70, 80, 90, 7], - "ratio": [0.1, 0.2, 0.3, 1.4, 0.5, 0.6, 0.7, 0.8, 1.2, 0.9], - "category": ["A", "B", "C", "A", "D", "B", "A", "C", "B", "E"] -}) - -# Create a validation with multiple steps -validation_complex = ( - pb.Validate(data=complex_data, tbl_name="complex_data") - .col_vals_gt(columns="value", value=10) - .col_vals_le(columns="ratio", value=1.0) - .col_vals_in_set(columns="category", set=["A", "B", "C"]) - .interrogate() -) - -# Get step report for the ratio validation (step 2) -ratio_report = validation_complex.get_step_report(i=2) - -ratio_report -``` - -In this example, we're investigating issues with the `ratio` column by generating a step report -specifically for that validation step. The step report shows exactly which rows have values that -exceed our maximum threshold of `1.0`. - -## Combining Step Reports with Extracts - -For more advanced analysis, you can extract the actual data from a step report into a DataFrame: - -```{python} -# Extract the data from the step report -failing_ratios = validation_complex.get_data_extracts(i=2) - -failing_ratios -``` - -This extracts the failing rows from the validation step, which you can then further analyze or fix -as needed. Note that the parameter `i=2` corresponds directly to the step number shown in the -validation report; it's the same numbering system used for `~~Validate.get_step_report()`. 
- -These extracts are particularly valuable for analysts who need to: - -- perform additional calculations on problematic data -- feed failing records into correction pipelines -- create visualizations of data patterns that led to validation failures -- export problem records to share with data owners - -It's worth noting that the validation report itself includes export buttons on the far right of each -row that allow you to download CSV files of the failing data directly. This serves as a convenient -delivery mechanism for sharing extracts with colleagues who may not be working in Python, making the -validation report not just a visual tool but also a practical means of distributing problematic data -for further investigation. - -## Step Reports with Segmented Data - -When working with segmented validation, step reports become even more valuable as they allow you to -investigate issues within specific segments: - -```{python} -# Create data with different regions -segmented_data = pl.DataFrame({ - "id": range(1, 10), - "value": [10, 20, 3, 40, 50, 2, 6, 8, 60], - "region": ["North", "North", "South", "South", "East", "East", "West", "West", "West"] -}) - -# Create a validation with segments -segmented_validation = ( - pb.Validate(data=segmented_data, tbl_name="regional_data") - .col_vals_gt( - columns="value", - value=10, - segments="region" # Segment by region - ) - .interrogate() -) - -# Get step report for a specific segment (the 'West' region) -# For segmented validations, each segment gets its own step number -north_report = segmented_validation.get_step_report(i=4) - -north_report -``` - -For segmented validations, each segment is treated as a separate validation step with its own step -number. This allows you to investigate issues specific to each data segment using the appropriate -step number from the validation report. 
- -## Best Practices for Using Step Reports - -Here are some guidelines for effectively using step reports in your data validation workflow: - -1. Generate step reports selectively: create reports only for steps that require detailed -investigation rather than for all steps - -2. Use the `limit=` parameter for large datasets: when working with large datasets, focus only -on a subset of failing rows to avoid information overload - -3. Share specific step reports with stakeholders: when collaborating with domain experts, share -relevant step reports to help them understand and address specific data quality issues (and -customize the header to improve clarity) - -4. Combine with extracts for deeper analysis: use the `~~Validate.get_data_extracts()` method to -extract the failing rows for further analysis or correction - -5. Document findings from step reports: when you discover patterns or insights from step reports, -document them to inform future data quality improvements - -Remember that step reports are most valuable when used strategically as part of a broader data -quality framework. By following these best practices, you can use step reports not just for -troubleshooting, but to develop a deeper understanding of your data's characteristics and quality -patterns over time. This approach transforms step reports from simple debugging tools into strategic -assets for continuous data quality improvement. - -## Conclusion - -Step reports provide a focused lens into specific validation steps, allowing you to investigate data -quality issues in detail. 
By generating targeted reports for specific validation steps, you can: - -- pinpoint exactly which data points are causing validation failures -- communicate specific issues to relevant stakeholders -- gather insights that might be missed in the aggregate validation report -- track improvements in specific aspects of data quality over time - -Whether you're debugging validation failures, investigating edge cases, or communicating specific -data quality issues to colleagues, step reports can give you the detailed information you need to -understand and resolve data quality problems effectively. diff --git a/docs/user-guide/sundering.qmd b/docs/user-guide/sundering.qmd deleted file mode 100644 index aa200e2fcb..0000000000 --- a/docs/user-guide/sundering.qmd +++ /dev/null @@ -1,296 +0,0 @@ ---- -title: Sundering Validated Data -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_header=False, report_incl_footer_timings=False) -``` - -Sundering data? First off, let's get the correct meaning across here. Sundering is really just -splitting, dividing, cutting into two pieces. And it's a useful thing we can do in Pointblank to any -data that we are validating. When you interrogate the data, you learn about which rows have test -failures within them. With more validation steps, we get an even better picture of this simply by -virtue of more testing. - -The power of sundering lies in its ability to separate your data into two distinct categories: - -1. rows that pass all validation checks (clean data) -2. 
rows that fail one or more validation checks (problematic data) - -This approach allows you to: - -- focus your analysis on clean, reliable data -- isolate problematic records for investigation or correction -- create pipelines that handle good and bad data differently - -Let's use the `small_table` in our examples to show just how sundering is done. Here's that table: - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table"), n_head=20, n_tail=20) -``` - -## A Simple Example Where Data is Torn Asunder - -We'll begin with a very simple validation plan, having only a single step. There *will be* failing -test units here. - -```{python} -import pointblank as pb - -validation = ( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_ge(columns="d", value=1000) - .interrogate() -) - -validation -``` - -We see six failing test units in the `FAIL` column of the above validation report table. There is a data -extract (collection of failing rows) available. Let's use the `~~Validate.get_data_extracts()` -method to have a look at it. - -```{python} -validation.get_data_extracts(i=1, frame=True) -``` - -This is six rows of data that had failing test units in column `d`. Indeed we can see that all -values in that column are less than `1000` (and we asserted that values should be greater than or -equal to `1000`). This is the 'bad' data, if you will. Using the `~~Validate.get_sundered_data()` -method, we get the 'good' part: - -```{python} -validation.get_sundered_data() -``` - -This is a Polars DataFrame of seven rows. All values in `d` were passing test units (i.e., fulfilled -the expectation outlined in the validation step) and, in many ways, this is like a 'good extract'. - -You can always collect the failing rows with `~~Validate.get_sundered_data()` by using the -`type="fail"` option. 
Let's try that here: - -```{python} -validation.get_sundered_data(type="fail") -``` - -It gives us the same rows as in the DataFrame obtained from using -`validation.get_data_extracts(i=1, frame=True)`. Two important things to know about -`~~Validate.get_sundered_data()` are that the table rows returned from `type=pass` (the default) and -`type=fail` are: - -- the sum of rows across these returned tables will be equal to that of the original table -- the rows in each split table are mutually exclusive (i.e., you won't find the same row in both) - -You can think of sundered data as a filtered version of the original dataset based on validation -results. While the simple example illustrates how this process works on a basic level, the value of -the method is better seen in a slightly more complex example. - -## Using `get_sundered_data()` with a More Comprehensive Validation - -The previous example used exactly one validation step. You're likely to use more than that in -standard practice so let's see how `~~Validate.get_sundered_data()` works in those common -situations. Here's a validation with three steps: - -```{python} -validation_2 = ( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_ge( - columns="d", - value=1000 - ) - .col_vals_not_null(columns="c") - .col_vals_gt( - columns="a", - value=2 - ) - .interrogate() -) - -validation_2 -``` - -There are quite a few failures here across the three validation steps. In the `FAIL` column of the -validation report table, there are 12 failing test units if we were to tally them up. So if the -input table has 13 rows in total, does this mean there would be one row in the table returned by -`~~Validate.get_sundered_data()`? Not so: - -```{python} -validation_2.get_sundered_data() -``` - -There are four rows. This is because the different validation steps tested values in different -columns of the table. Some of the failing test units had to have occurred more than once in -certain rows. 
The rows that didn't have any failing test units across the three different tests -(in three different columns) are the ones seen above. This brings us to the third important thing -about the sundering process: - -- the absence of test-unit failures in a row across all validation steps means those rows are -returned as the 'passing' set, all others are placed in the 'failing' set - -In validations where many validation steps are used, we can be more confident about the level of -data quality for those rows returned in the passing set. But not every type of validation step is -considered within this splitting procedure. The next section will explain the rules on that. - -## The Validation Methods Considered When Sundering - -The sundering procedure relies on row-level validation types to be used. This makes sense as it's -impossible to judge the quality of a row when using the -[`col_exists()`](https://posit-dev.github.io/pointblank/reference/Validate.col_exists.html) -validation method, for example. Luckily, we have many row-level validation methods; here's a list: - -- `~~Validate.col_vals_gt()` -- `~~Validate.col_vals_lt()` -- `~~Validate.col_vals_ge()` -- `~~Validate.col_vals_le()` -- `~~Validate.col_vals_eq()` -- `~~Validate.col_vals_ne()` -- `~~Validate.col_vals_between()` -- `~~Validate.col_vals_outside()` -- `~~Validate.col_vals_in_set()` -- `~~Validate.col_vals_not_in_set()` -- `~~Validate.col_vals_null()` -- `~~Validate.col_vals_not_null()` -- `~~Validate.col_vals_regex()` -- `~~Validate.col_vals_expr()` -- `~~Validate.rows_distinct()` -- `~~Validate.rows_complete()` -- `~~Validate.conjointly()` - -This is the same list of validation methods that are considered when creating data extracts. - -There are some additional caveats though. 
Even if using a validation method drawn from the set -above, the validation step won't be used for sundering if: - -- the `active=` parameter for that step has been set to `False` -- the `pre=` parameter has been used - -The first one makes intuitive sense (you decided to skip this validation step entirely), the second -one requires some explanation. Using `pre=` allows you to modify the target table, and there's no easy -or practical way to compare rows in a mutated table with those in the original table (e.g., a -mutation may drastically reduce the number of rows). - -## Practical Applications of Sundering - -### 1. Creating Clean Datasets for Analysis - -One of the most common use cases for sundering is preparing validated data for downstream analysis: - -```{python} -# Comprehensive validation for analysis-ready data -analysis_validation = ( - pb.Validate(data=pb.load_dataset(dataset="small_table")) - .col_vals_not_null(columns=["a", "b", "c", "d", "e", "f"]) # No missing values - .col_vals_gt(columns="a", value=0) # Positive values only - .col_vals_lt(columns="d", value=10000) # No extreme outliers - .interrogate() -) - -# Extract only the clean data that passed all checks -clean_data = analysis_validation.get_sundered_data(type="pass") - -# Use the clean data for your analysis -pb.preview(clean_data) -``` - -This approach ensures that any subsequent analysis is based on data that meets your quality -standards, reducing the risk of misleading results or spurious conclusions due to problematic -records. By making validation an explicit step in your analytical workflow, you create a natural -quality gate that prevents invalid data from influencing your findings. - -### 2. 
Creating Parallel Workflows for Clean and Problematic Data - -You can use sundering to create parallel processing paths: - -```{python} -# Get both clean and problematic data -clean_data = analysis_validation.get_sundered_data(type="pass") -problem_data = analysis_validation.get_sundered_data(type="fail") - -# Process clean data (in real applications, you'd do more here) -print(f"Clean data size: {len(clean_data)} rows") - -# Log problematic data for investigation -print(f"Problematic data size: {len(problem_data)} rows") -``` - -This approach enables you to build robust data processing pathways with separate handling for clean -and problematic data. In production environments, you could save problematic records to a separate -location for further investigation, generate detailed logs of validation failures, and trigger -automated notifications to data stewards when issues arise. By establishing clear protocols for -handling both data streams, you create a systematic approach to data quality that balances immediate -analytical needs with longer-term data improvement goals. - -### 3. Data Quality Monitoring and Improvement - -Tracking the ratio of passing to failing rows over time can help monitor data quality trends: - -```{python} -# Calculate data quality metrics -total_rows = len(pb.load_dataset(dataset="small_table")) -passing_rows = len(clean_data) -quality_score = passing_rows / total_rows - -print(f"Data quality score: {quality_score:.2%}") -print(f"Passing rows: {passing_rows} out of {total_rows}") -``` - -By tracking these metrics over time, you can measure the impact of your data quality improvement -efforts and communicate progress to stakeholders. This approach transforms sundering from a one-time -filtering tool into an ongoing data quality management system, where improving the ratio of passing -rows becomes a measurable business objective aligned with broader data governance goals. 
- -## Best Practices for Using Sundered Data - -When incorporating data sundering into your workflow, consider these best practices: - -1. Be comprehensive in your validation: the more validation steps you include (assuming they're -meaningful), the more confidence you can have in your passing dataset - -2. Document your validation criteria: when sharing sundered data with others, always document the -criteria used to determine passing rows - -3. Consider traceability: for audit purposes, it may be valuable to add a column indicating whether -a record was originally in the passing or failing set - -4. Balance strictness and practicality: if you're too strict with validation rules, you might end up -with very few passing rows; consider the appropriate level of strictness for your use case - -5. Use sundering as part of a pipeline: automate the process of validation, sundering, and -subsequent handling of the two resulting datasets - -6. Continually refine validation rules: as you learn more about your data and domain, update your -validation rules to improve the accuracy of your sundering process - -By following these best practices, data scientists and engineers can transform sundering from a -simple utility into a strategic component of their data quality framework. When implemented -thoughtfully, sundering enables a shift from reactive data cleaning to proactive quality management, -where validation criteria evolve alongside your understanding of the data. - -The ultimate goal isn't just to separate good data from bad, but to gradually improve your entire -dataset over time by addressing the root causes of validation failures that appear in the failing -set. This approach turns data validation from a gatekeeper function into a continuous improvement -process. - -## Conclusion - -Data sundering provides a powerful way to separate your data based on validation results. 
While -the concept is simple (splitting data into passing and failing sets) the feature can be very useful in -many data workflows. By integrating sundering into your data pipeline, you can: - -- ensure that downstream analysis only works with validated data -- create focused datasets for different purposes -- improve overall data quality through systematic identification and isolation of problematic -records -- build more robust data pipelines that explicitly handle data quality issues - -So long as you're aware of the rules and limitations of sundering, you're likely to find it to be a -simple and useful way to filter your input table on the basis of a validation plan, turning data -validation from a passive reporting tool into an active component of your data processing workflow. diff --git a/docs/user-guide/test-data-generation.qmd b/docs/user-guide/test-data-generation.qmd deleted file mode 100644 index a8075f4fc8..0000000000 --- a/docs/user-guide/test-data-generation.qmd +++ /dev/null @@ -1,849 +0,0 @@ ---- -title: Test Data Generation -jupyter: python3 -toc-expand: 2 -html-table-processing: none ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -Pointblank provides a built-in test data generation system that creates realistic, locale-aware -synthetic data based on schema definitions. This is useful for testing validation rules, creating -sample datasets, and generating fixture data for development. - -::: {.callout-note} -Throughout this guide, we use `pb.preview()` to display generated datasets with nice HTML -formatting. This is optional: `pb.generate_dataset()` returns a standard DataFrame that you can -display or manipulate however you prefer. 
-::: - -## Quick Start - -Generate test data using a schema with field constraints: - -```{python} -import pointblank as pb - -# Define a schema with typed field specifications -schema = pb.Schema( - user_id=pb.int_field(min_val=1, unique=True), - name=pb.string_field(preset="name"), - email=pb.string_field(preset="email"), - age=pb.int_field(min_val=18, max_val=80), - status=pb.string_field(allowed=["active", "pending", "inactive"]), -) - -# Generate 100 rows of test data (seed ensures reproducibility) -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -## Field Types - -Pointblank provides helper functions for defining typed columns with constraints: - -| Function | Description | Key Parameters | -|----------|-------------|----------------| -| `int_field()` | Integer columns | `min_val`, `max_val`, `allowed`, `unique` | -| `float_field()` | Float columns | `min_val`, `max_val`, `allowed` | -| `string_field()` | String columns | `preset`, `pattern`, `allowed`, `unique` | -| `bool_field()` | Boolean columns | `p_true` (probability of True) | -| `date_field()` | Date columns | `min_date`, `max_date` | -| `datetime_field()` | Datetime columns | `min_date`, `max_date` | -| `time_field()` | Time columns | `min_val`, `max_val` | -| `duration_field()` | Duration columns | `min_val`, `max_val` | -| `profile_fields()` | Bundled person-profile fields | `set`, `split_name`, `include`, `exclude`, `prefix` | - -### Integer Fields - -Integer fields support range constraints with `min_val` and `max_val`, discrete allowed values with -`allowed`, and uniqueness enforcement with `unique=True`: - -```{python} -schema = pb.Schema( - id=pb.int_field(min_val=1000, max_val=9999, unique=True), - quantity=pb.int_field(min_val=1, max_val=100), - rating=pb.int_field(allowed=[1, 2, 3, 4, 5]), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -The `unique=True` constraint ensures no duplicate values appear in that column, which is useful for -generating primary 
keys or identifiers. - -### Float Fields - -Float fields work similarly to integers, with `min_val` and `max_val` defining the range of -generated values: - -```{python} -schema = pb.Schema( - price=pb.float_field(min_val=0.0, max_val=1000.0), - discount=pb.float_field(min_val=0.0, max_val=0.5), - temperature=pb.float_field(min_val=-40.0, max_val=50.0), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -Values are uniformly distributed across the specified range, making this useful for simulating -measurements, prices, or any continuous numeric data. - -### String Fields with Presets - -Presets generate realistic data like names, emails, and addresses. When you include related -fields like `name` and `email` in the same schema, Pointblank ensures **coherence** (e.g., the email -address will be derived from the person's name), making the generated data more realistic: - -```{python} -schema = pb.Schema( - full_name=pb.string_field(preset="name"), - email=pb.string_field(preset="email"), - company=pb.string_field(preset="company"), - city=pb.string_field(preset="city"), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -This coherence extends to other related fields like `user_name`, which will also reflect the -person's name when included alongside name and email fields. - -### String Fields with Patterns - -Use regex patterns to generate strings matching specific formats: - -```{python} -schema = pb.Schema( - product_code=pb.string_field(pattern=r"[A-Z]{3}-[0-9]{4}"), - phone=pb.string_field(pattern=r"\([0-9]{3}\) [0-9]{3}-[0-9]{4}"), - hex_color=pb.string_field(pattern=r"#[0-9A-F]{6}"), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -Patterns support standard regex character classes and quantifiers, giving you flexibility to -generate data matching virtually any format specification. 
- -### Boolean Fields - -Control the probability of `True` values: - -```{python} -schema = pb.Schema( - is_active=pb.bool_field(p_true=0.8), # 80% True - is_premium=pb.bool_field(p_true=0.2), # 20% True - is_verified=pb.bool_field(), # 50% True (default) -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -This probabilistic control is helpful when you need to simulate real-world distributions where -certain states are more common than others. - -### Date and Datetime Fields - -Temporal fields accept Python `date` and `datetime` objects for their range boundaries, generating -values uniformly distributed within the specified period: - -```{python} -from datetime import date, datetime - -schema = pb.Schema( - birth_date=pb.date_field( - min_date=date(1960, 1, 1), - max_date=date(2005, 12, 31) - ), - created_at=pb.datetime_field( - min_date=datetime(2024, 1, 1), - max_date=datetime(2024, 12, 31) - ), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -The same pattern applies to `time_field()` and `duration_field()`, allowing you to generate -realistic temporal data for any use case. - -## Available Presets - -The `preset=` parameter in `string_field()` supports many data types: - -**Personal Data:** - -- `name`: full name (first + last) -- `name_full`: full name with optional prefix/suffix (e.g., "Dr. Ana Sousa", "Prof. 
Tanaka Yuki") -- `first_name`: first name only -- `last_name`: last name only -- `gender`: person's gender (`"male"` or `"female"`), coherent with name fields -- `email`: email address -- `phone_number`: phone number in country-specific format - -**Location Data:** - -- `address`: full street address -- `city`: city name -- `state`: state/province name -- `country`: country name -- `country_code_2`: ISO 3166-1 alpha-2 country code (e.g., `"US"`) -- `country_code_3`: ISO 3166-1 alpha-3 country code (e.g., `"USA"`) -- `postcode`: postal/ZIP code -- `latitude`: latitude coordinate -- `longitude`: longitude coordinate - -**Business Data:** - -- `company`: company name -- `job`: job title -- `catch_phrase`: business catch phrase - -**Internet Data:** - -- `url`: website URL -- `domain_name`: domain name -- `ipv4`: IPv4 address -- `ipv6`: IPv6 address -- `user_name`: username -- `password`: password - -**Financial Data:** - -- `credit_card_number`: credit card number -- `credit_card_provider`: card network name (Visa, Mastercard, American Express, or Discover); coherent with `credit_card_number` -- `iban`: International Bank Account Number -- `currency_code`: currency code (USD, EUR, etc.) 
- -**Identifiers:** - -- `uuid4`: UUID version 4 -- `md5`: MD5 hash (32 hex characters) -- `sha1`: SHA-1 hash (40 hex characters) -- `sha256`: SHA-256 hash (64 hex characters) -- `ssn`: Social Security Number (country-specific format) -- `license_plate`: vehicle license plate (location-aware for CA, US, DE, AU, GB) - -**Barcodes:** - -- `ean8`: EAN-8 barcode with valid check digit -- `ean13`: EAN-13 barcode with valid check digit - -**Date/Time:** - -- `date_this_year`: a date within the current year -- `date_this_decade`: a date within the current decade -- `date_between`: a random date between 2000 and 2025 -- `date_range`: two dates joined with an en-dash (e.g., `"2012-05-12 – 2015-11-22"`) -- `future_date`: a date up to 1 year in the future -- `past_date`: a date up to 10 years in the past -- `time`: a time value - -**Text:** - -- `word`: single word -- `sentence`: full sentence -- `paragraph`: paragraph of text -- `text`: multiple paragraphs - -**Miscellaneous:** - -- `color_name`: color name -- `file_name`: file name -- `file_extension`: file extension -- `mime_type`: MIME type -- `user_agent`: browser user agent string (country-weighted) -- `locale_code`: locale identifier (e.g., `"en_US"`, `"de_DE"`; multilingual countries return a random official locale) - -## Profile Fields - -When generating person-profile data, you often need several related presets together: a name, an -email derived from that name, an address, a phone number, and so on. Rather than wiring up each -column individually, the `profile_fields()` helper returns a ready-made dictionary of -`StringField` objects that you can unpack directly into a `Schema()`. - -### Basic Usage - -With no arguments, `profile_fields()` returns the **standard** set of seven columns: -`first_name`, `last_name`, `email`, `city`, `state`, `postcode`, and `phone_number`. All -coherence rules apply automatically: emails are derived from names, and city/state/postcode/phone -are internally consistent. 
- -```{python} -schema = pb.Schema( - user_id=pb.int_field(unique=True, min_val=1), - **pb.profile_fields(), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -The `**` operator unpacks the dictionary into keyword arguments, as if you had written each -`string_field(preset=...)` call by hand. - -### Choosing a Set - -Three built-in sets control how many columns are generated: - -| Set | Columns | -|-----|------| -| `"minimal"` | `first_name`, `last_name`, `email`, `phone_number` | -| `"standard"` | `first_name`, `last_name`, `email`, `city`, `state`, `postcode`, `phone_number` | -| `"full"` | `first_name`, `last_name`, `email`, `address`, `city`, `state`, `postcode`, `phone_number`, `company`, `job` | - -```{python} -# Minimal profile: just name, email, and phone -pb.preview( - pb.generate_dataset( - pb.Schema(**pb.profile_fields(set="minimal")), - n=100, seed=23, - ) -) -``` - -```{python} -# Full profile: includes address, company, and job title -pb.preview( - pb.generate_dataset( - pb.Schema(**pb.profile_fields(set="full")), - n=100, seed=23, - ) -) -``` - -### Combined vs. Split Names - -By default, names are split into `first_name` and `last_name` columns. Set `split_name=False` to -get a single `name` column instead: - -```{python} -pb.preview( - pb.generate_dataset( - pb.Schema(**pb.profile_fields(set="minimal", split_name=False)), - n=100, seed=23, - ) -) -``` - -### Adding and Removing Columns - -Use `include=` to add presets to the base set and `exclude=` to remove them. Both accept lists of -preset names. The available profile presets are: `first_name`, `last_name`, `name`, `email`, -`address`, `city`, `state`, `postcode`, `phone_number`, `company`, and `job`. 
- -```{python} -# Standard set + company column -pb.preview( - pb.generate_dataset( - pb.Schema(**pb.profile_fields(include=["company"])), - n=100, seed=23, - ) -) -``` - -```{python} -# Standard set without city and state -pb.preview( - pb.generate_dataset( - pb.Schema(**pb.profile_fields(exclude=["city", "state"])), - n=100, seed=23, - ) -) -``` - -You can combine `include=` and `exclude=` in the same call, as long as the same preset does not -appear in both. - -### Column Prefixes - -The `prefix=` parameter prepends a string to every column name. This is especially useful when a -schema needs two independent profiles (e.g., sender and recipient): - -```{python} -schema = pb.Schema( - **pb.profile_fields(set="minimal", prefix="sender_"), - **pb.profile_fields(set="minimal", prefix="recipient_"), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23)) -``` - -Each prefixed group maintains its own coherence: the sender's email is derived from the sender's -name, and the recipient's email from the recipient's name. - -### Combining with Other Field Types - -Since `profile_fields()` returns a plain dictionary, it composes naturally with any other field -types: - -```{python} -schema = pb.Schema( - id=pb.int_field(unique=True, min_val=1000), - **pb.profile_fields(), - active=pb.bool_field(p_true=0.8), - signup_date=pb.date_field( - min_date="2024-01-01", - max_date="2025-12-31", - ), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23, country="DE")) -``` - -## Country-Specific Data - -One of the most powerful features is generating locale-aware data. Use the `country=` parameter -to generate data specific to a country. This affects names, cities, addresses, and other -locale-sensitive presets. - -Let's create a schema that includes several location-related fields. When generating data for a -specific country, Pointblank ensures *consistency across related fields*. 
The city, address, -postcode, and coordinates will all correspond to the same location: - -```{python} -# Schema with linked location fields -schema = pb.Schema( - name=pb.string_field(preset="name"), - city=pb.string_field(preset="city"), - address=pb.string_field(preset="address"), - postcode=pb.string_field(preset="postcode"), - latitude=pb.string_field(preset="latitude"), - longitude=pb.string_field(preset="longitude"), -) -``` - -Here's German data with authentic names and addresses from cities like Berlin, Munich, and Hamburg. -Notice how the latitude/longitude coordinates match real locations in Germany: - -```{python} -pb.preview(pb.generate_dataset(schema, n=200, seed=23, country="DE")) -``` - -Japanese data includes names in romanized form and addresses from cities like Tokyo, Osaka, and -Kyoto. The coordinates fall within Japan's geographic boundaries: - -```{python} -pb.preview(pb.generate_dataset(schema, n=200, seed=23, country="JP")) -``` - -Brazilian data features Portuguese names and addresses from cities like São Paulo, Rio de Janeiro, -and Brasília. The postal codes follow Brazil's CEP format: - -```{python} -pb.preview(pb.generate_dataset(schema, n=200, seed=23, country="BR")) -``` - -This location coherence is valuable when testing geospatial applications, address validation -systems, or any scenario where realistic, internally-consistent location data matters. - -### Data Coherence - -Pointblank automatically links related columns to produce realistic rows. There are three -coherence systems that activate based on which presets appear together in a schema: - -**Address coherence** activates when *any* address-related preset is present (`address`, `city`, -`state`, `postcode`, `latitude`, `longitude`, `phone_number`, `license_plate`). All of these -fields will refer to the same location within each row. 
- -**Person coherence** activates when *any* person-related preset is present (`name`, `name_full`, -`first_name`, `last_name`, `email`, `user_name`). The email and username are derived from the -person's name. - -**Business coherence** activates when *both* `job` and `company` are present. When active: - -- the company and job title are drawn from the same industry (e.g., a nurse will work at a -hospital, not a law firm). -- `name_full` gains profession-matched titles: a doctor may appear as "Dr. Ana Sousa" and a -professor as "Prof. Tanaka Yuki". For German-speaking countries (DE, AT, CH), the honorific -stacks before the professional title (e.g., "Herr Dr. med. Klaus Weber"). -- integer columns whose name contains `age` (e.g., `age`, `person_age`) are automatically -constrained to working-age range (22–65). - -Here's an example showing all three coherence systems working together: - -```{python} -schema = pb.Schema( - name=pb.string_field(preset="name_full"), - email=pb.string_field(preset="email"), - company=pb.string_field(preset="company"), - job=pb.string_field(preset="job"), - city=pb.string_field(preset="city"), - state=pb.string_field(preset="state"), - license_plate=pb.string_field(preset="license_plate"), - age=pb.int_field(), -) - -pb.preview(pb.generate_dataset(schema, n=100, seed=23, country="DE")) -``` - -**License plate coherence** is part of address coherence. For `CA`, `US`, `DE`, `AU`, and `GB`, -license plates follow real subregion-specific formats when location fields are present. For example, -an Ontario row produces plates like `"CABC 123"` while a British Columbia row produces `"AB1 23C"`. -Letters I, O, Q, and U are excluded from plate generation, matching real-world restrictions. - -### Supported Countries - -Pointblank currently supports 100 countries with full locale data for realistic test data generation. -You can use either ISO 3166-1 alpha-2 codes (e.g., `"US"`) or alpha-3 codes (e.g., `"USA"`). 
- -**Europe (38 countries):** - -- Armenia (`AM`), Austria (`AT`), Azerbaijan (`AZ`), Belgium (`BE`), Bulgaria (`BG`), Croatia (`HR`), Cyprus (`CY`), Czech Republic (`CZ`), Denmark (`DK`), Estonia (`EE`), Finland (`FI`), France (`FR`), Georgia (`GE`), Germany (`DE`), Greece (`GR`), Hungary (`HU`), Iceland (`IS`), Ireland (`IE`), Italy (`IT`), Latvia (`LV`), Lithuania (`LT`), Luxembourg (`LU`), Malta (`MT`), Moldova (`MD`), Netherlands (`NL`), Norway (`NO`), Poland (`PL`), Portugal (`PT`), Romania (`RO`), Russia (`RU`), Serbia (`RS`), Slovakia (`SK`), Slovenia (`SI`), Spain (`ES`), Sweden (`SE`), Switzerland (`CH`), Ukraine (`UA`), United Kingdom (`GB`) - -**Americas (19 countries):** - -- Argentina (`AR`), Bolivia (`BO`), Brazil (`BR`), Canada (`CA`), Chile (`CL`), Colombia (`CO`), Costa Rica (`CR`), Dominican Republic (`DO`), Ecuador (`EC`), El Salvador (`SV`), Guatemala (`GT`), Honduras (`HN`), Jamaica (`JM`), Mexico (`MX`), Panama (`PA`), Paraguay (`PY`), Peru (`PE`), United States (`US`), Uruguay (`UY`) - -**Asia-Pacific (22 countries):** - -- Australia (`AU`), Bangladesh (`BD`), Cambodia (`KH`), China (`CN`), Hong Kong (`HK`), India (`IN`), Indonesia (`ID`), Japan (`JP`), Kazakhstan (`KZ`), Malaysia (`MY`), Myanmar (`MM`), Nepal (`NP`), New Zealand (`NZ`), Pakistan (`PK`), Philippines (`PH`), Singapore (`SG`), South Korea (`KR`), Sri Lanka (`LK`), Taiwan (`TW`), Thailand (`TH`), Uzbekistan (`UZ`), Vietnam (`VN`) - -**Middle East & Africa (21 countries):** - -- Algeria (`DZ`), Cameroon (`CM`), Egypt (`EG`), Ethiopia (`ET`), Ghana (`GH`), Israel (`IL`), Jordan (`JO`), Kenya (`KE`), Lebanon (`LB`), Morocco (`MA`), Mozambique (`MZ`), Nigeria (`NG`), Rwanda (`RW`), Saudi Arabia (`SA`), Senegal (`SN`), South Africa (`ZA`), Tanzania (`TZ`), Tunisia (`TN`), Turkey (`TR`), Uganda (`UG`), United Arab Emirates (`AE`) - -Additional countries and expanded coverage are planned for future releases. 
- -### Mixing Multiple Countries - -When you need test data that spans multiple locales (e.g., simulating an international customer -base), you can pass a list or dict to the `country=` parameter instead of a single string. - -Passing a list of country codes splits rows equally across those countries. Here, 200 rows are -divided evenly among the US, Germany, and Japan (~67 each): - -```{python} -schema = pb.Schema( - name=pb.string_field(preset="name"), - city=pb.string_field(preset="city"), - postcode=pb.string_field(preset="postcode"), -) - -pb.preview(pb.generate_dataset(schema, n=200, seed=23, country=["US", "DE", "JP"])) -``` - -To control the proportion of rows per country, pass a dict mapping country codes to weights. The -following generates 200 rows with 70% from the US, 20% from Germany, and 10% from France: - -```{python} -pb.preview( - pb.generate_dataset( - schema, n=200, seed=23, - country={"US": 0.7, "DE": 0.2, "FR": 0.1}, - ) -) -``` - -Weights are auto-normalized, so `{"US": 7, "DE": 2, "FR": 1}` is equivalent to the example above. -Row counts are allocated using largest-remainder apportionment, ensuring they always sum to exactly -`n`. - -By default, rows from different countries are interleaved randomly (`shuffle=True`). Set -`shuffle=False` to keep rows grouped by country in the order the countries are listed: - -```{python} -pb.preview( - pb.generate_dataset( - schema, n=120, seed=23, - country=["US", "DE", "JP"], shuffle=False, - ) -) -``` - -All coherence systems (address, person, business) work correctly within each country's batch of -rows. A French row will have a French name with a matching French email; a Japanese row will have a -Japanese name with a matching Japanese email. Non-preset columns (integers, floats, booleans, dates) -are generated independently for each batch but still respect their field constraints. 
- -### Frequency-Weighted Sampling - -By default, names and cities are sampled uniformly at random from the locale data, giving every -entry the same probability of being selected. Real-world distributions are far from uniform though: -"James" and "Maria" appear orders of magnitude more often than "Thaddeus" or "Xiomara", and more -people live in New York City than in Flagstaff. The `weighted=True` parameter makes generated data -reflect this natural skew. - -```{python} -schema = pb.Schema( - name=pb.string_field(preset="name"), - city=pb.string_field(preset="city"), -) - -pb.preview(pb.generate_dataset(schema, n=200, seed=23, country="US", weighted=True)) -``` - -With weighting enabled you will see popular names like James, John, Mary, and Patricia appear more -frequently, while unusual names surface only occasionally. Similarly, cities like New York, Los -Angeles, and Chicago dominate the output while smaller cities appear less often. - -The feature works by organizing locale data into four frequency tiers. Each tier has a sampling -probability that determines how likely its members are to be selected: - -| Tier | Probability | Contents | -|------|-------------|----------| -| very_common | 45% | The top ~10% of entries by real-world frequency | -| common | 30% | The next ~20% of entries | -| uncommon | 20% | The next ~30% of entries | -| rare | 5% | The remaining ~40% of entries | - -When a value is needed, a tier is first chosen according to these probabilities and then a single -entry is picked uniformly at random within that tier. This two-step approach keeps sampling fast -while producing a realistic long-tail distribution. Setting `weighted=False` pools all entries -across every tier and samples them uniformly, which can be useful when you want an even spread -rather than a realistic distribution. - -Weighted sampling combines seamlessly with multi-country mixing. 
Each country's batch uses its own -tiered data independently, so a mixed dataset will have weighted US names alongside weighted German -names: - -```{python} -pb.preview( - pb.generate_dataset( - schema, - n=200, - seed=23, - country={"US": 0.6, "DE": 0.4}, - weighted=True, - ) -) -``` - -All 100 supported country locales have tiered name and location data, so `weighted=True` produces -realistic frequency distributions for every country. - -## Output Formats - -The `generate_dataset()` function supports multiple output formats via the `output=` parameter, -making it easy to integrate with your preferred data processing library. - -```{python} -schema = pb.Schema( - id=pb.int_field(min_val=1), - name=pb.string_field(preset="name"), -) -``` - -The default output is a Polars DataFrame, which offers excellent performance and a modern API for -data manipulation: - -```{python} -polars_df = pb.generate_dataset(schema, n=100, seed=23, output="polars") - -pb.preview(polars_df) -``` - -If your workflow uses Pandas, simply specify `output="pandas"` to get a **Pandas DataFrame**: - -```{python} -pandas_df = pb.generate_dataset(schema, n=100, seed=23, output="pandas") - -pb.preview(pandas_df) -``` - -Both formats work seamlessly with Pointblank's validation functions, so you can choose whichever -fits best with your existing data pipeline. 
- -## Using Generated Data for Validation Testing - -A common use case is generating test data to validate your validation rules: - -```{python} -# Define a schema with constraints -schema = pb.Schema( - user_id=pb.int_field(min_val=1, unique=True), - email=pb.string_field(preset="email"), - age=pb.int_field(min_val=18, max_val=100), - status=pb.string_field(allowed=["active", "pending", "inactive"]), -) - -# Generate test data -test_data = pb.generate_dataset(schema, n=100, seed=23) - -# Validate the generated data (it should pass all checks) -validation = ( - pb.Validate(test_data) - .col_vals_gt("user_id", 0) - .col_vals_regex("email", r".+@.+\..+") - .col_vals_between("age", 18, 100) - .col_vals_in_set("status", ["active", "pending", "inactive"]) - .interrogate() -) - -validation -``` - -Since the generated data respects the constraints defined in the schema, it should pass all -validation checks. This workflow is particularly useful for testing validation logic before -applying it to production data, or for creating reproducible test fixtures in your CI/CD pipeline. - -## Pytest Fixture - -When Pointblank is installed, a `generate_dataset` **pytest fixture** is automatically available -in all your test files. There is no need to import anything or add configuration to `conftest.py`: -the fixture is registered via pytest's plugin system. - -The fixture works identically to `pb.generate_dataset()`, but with one key difference: when you -don't supply a `seed=` parameter, a deterministic seed is automatically derived from the test's -fully-qualified name. This means: - -- the **same test** always produces the **same data**: no manual seed management required. -- *different tests* get different seeds, so they exercise different datasets. -- you can still pass an explicit `seed=` to override the automatic seed when needed. 
- -### Basic Usage - -Use it by adding `generate_dataset` to your test function's parameter list: - -```{.python filename="test_pipeline.py"} -import pointblank as pb - -def test_etl_handles_nulls(generate_dataset): - schema = pb.Schema( - user_id=pb.int_field(unique=True), - email=pb.string_field(preset="email", nullable=True, null_probability=0.3), - age=pb.int_field(min_val=0, max_val=120), - ) - - df = generate_dataset(schema, n=500) - result = my_etl_pipeline(df) - assert result.filter(pl.col("email").is_null()).shape[0] == 0 -``` - -All parameters from `generate_dataset()` are supported: `n=`, `seed=`, `output=`, and `country=`: - -```python -def test_german_data(generate_dataset): - schema = pb.Schema( - name=pb.string_field(preset="name"), - city=pb.string_field(preset="city"), - ) - - df = generate_dataset(schema, n=200, country="DE", output="pandas") - assert len(df) == 200 -``` - -### Multiple Datasets in One Test - -Calling the fixture multiple times within the same test produces different (but still deterministic) data on each call: - -```python -def test_merge_pipeline(generate_dataset): - customers = generate_dataset(customer_schema, n=1000, country="US") - orders = generate_dataset(order_schema, n=5000) - - # Each call gets a unique seed derived from the test name + call index, - # so both DataFrames are deterministic and different from each other. 
- result = merge_pipeline(customers, orders) - assert result.shape[0] > 0 -``` - -### Testing Across Locales - -The fixture makes locale testing particularly concise when combined with `pytest.mark.parametrize`: - -```python -import pytest -import pointblank as pb - -@pytest.mark.parametrize("country", ["US", "DE", "JP", "BR"]) -def test_name_normalizer(generate_dataset, country): - schema = pb.Schema(name=pb.string_field(preset="name_full")) - df = generate_dataset(schema, n=100, country=country) - result = normalize_names(df) - assert result["name"].str.len_chars().min() > 0 -``` - -### Sharing Schemas Across Tests - -Define schemas as fixtures in `conftest.py` and compose them with `generate_dataset`: - -```{.python filename="conftest.py"} -import pytest -import pointblank as pb - -@pytest.fixture -def customer_schema(): - return pb.Schema( - id=pb.int_field(unique=True), - name=pb.string_field(preset="name"), - email=pb.string_field(preset="email"), - city=pb.string_field(preset="city"), - ) -``` - -```{.python filename="test_validation.py"} -def test_customer_validation(generate_dataset, customer_schema): - df = generate_dataset(customer_schema, n=200, country="DE") - validation = pb.Validate(df).col_vals_not_null(columns="email").interrogate() - assert validation.all_passed() -``` - -```{.python filename="test_export.py"} -def test_customer_export(generate_dataset, customer_schema): - df = generate_dataset(customer_schema, n=50, country="JP") - exported = export_to_parquet(df) - assert exported.exists() -``` - -### Debugging with Seed Introspection - -The fixture callable exposes two attributes that make debugging failed tests straightforward: - -- `generate_dataset.default_seed`: the base seed derived from the test name (available before any call) -- `generate_dataset.last_seed`: the seed actually used for the most recent call (accounts for the call counter and explicit overrides) - -Include `.last_seed` in assertion messages so failures are immediately 
reproducible: - -```python -def test_age_range(generate_dataset): - schema = pb.Schema(age=pb.int_field(min_val=18, max_val=100)) - df = generate_dataset(schema, n=500) - min_age = df["age"].min() - assert min_age >= 18, ( - f"Expected min age >= 18, got {min_age} (seed={generate_dataset.last_seed})" - ) -``` - -You can also use `.default_seed` to reproduce the exact dataset outside of pytest: - -```python -# In a REPL or notebook, reproduce the data from a failed test: -import pointblank as pb -df = pb.generate_dataset(schema, n=500, seed=DEFAULT_SEED)  # DEFAULT_SEED: the value of generate_dataset.default_seed from the failed test -``` - -### Seed Stability - -A given seed (whether explicit or auto-derived) is guaranteed to produce identical output **within -the same Pointblank version**. Across versions, changes to country data files or generator logic -may alter the output for a given seed. - -For CI pipelines that require bit-exact data across library upgrades, we recommend saving -generated DataFrames as Parquet or CSV snapshot files rather than relying on cross-version seed -stability. This is the same approach used by snapshot-testing tools like `pytest-snapshot` and -`syrupy`. - -## Conclusion - -Test data generation provides a convenient way to create realistic synthetic datasets directly from -schema definitions. While the concept is straightforward (defining field types and constraints, then -generating matching data), the feature can be invaluable in many development and testing workflows. 
-By incorporating test data generation into your process, you can: - -- quickly prototype validation rules before working with production data -- create reproducible test fixtures for automated testing and CI/CD pipelines -- generate locale-specific data for internationalization testing across 100 countries -- ensure coherent relationships between related fields like names, emails, addresses, jobs, and -license plates -- produce datasets of any size with consistent, realistic values - -Whether you're building validation logic, testing data pipelines, or simply need sample data for -development, the schema-based generation approach gives you precise control over data -characteristics while maintaining the realism needed to uncover edge cases and validate your -assumptions about data quality. diff --git a/docs/user-guide/thresholds.qmd b/docs/user-guide/thresholds.qmd deleted file mode 100644 index a398f82eaf..0000000000 --- a/docs/user-guide/thresholds.qmd +++ /dev/null @@ -1,328 +0,0 @@ ---- -title: Thresholds -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -Thresholds are a key concept in Pointblank that allow you to define acceptable limits for failing -validation tests. Rather than a simple pass/fail model, thresholds enable you to signal failure at -different severity levels ('warning', 'error', and 'critical'), giving you fine-grained control over -how data quality issues are reported and handled. - -When used with actions (covered in the next section), thresholds create a robust system for -responding to data quality issues based on their severity. 
This approach allows you to: - -- set different tolerance levels for different types of validation checks -- escalate responses based on the severity of data quality issues -- configure different notification strategies for different threshold levels -- create a more nuanced data validation workflow than simple pass/fail tests - -## A Simple Example - -Let's start with a basic example that demonstrates how thresholds work in practice: - -```{python} -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_not_null( - columns="c", - - # Set thresholds for the validation step --- - thresholds=pb.Thresholds(warning=1, error=0.2) - ) - .interrogate() -) -``` - -In this example, we're validating that column `c` contains no Null values. We've set: - -- A 'warning' threshold of `1` (triggers when 1 or more values are Null) -- An 'error' threshold of `0.2` (triggers when 20% or more values are Null) - -Looking at the results: - -- the `FAIL` column shows that 2 test units have failed -- the `W` column (for 'warning') shows a filled gray circle, indicating the warning threshold has -been exceeded -- the `E` column (for 'error') shows an open yellow circle, indicating the error threshold has not -been exceeded -- the `C` column (for 'critical') shows a dash since we didn't set a critical threshold - -## Types of Threshold Values - -Thresholds in Pointblank can be specified in two different ways: - -### Absolute Thresholds - -Absolute thresholds are specified as integers and represent a fixed number of failing test units: - -```python -# Warning threshold of exactly 5 failing test units -thresholds_absolute = pb.Thresholds(warning=5) -``` - -With this configuration, the 'warning' threshold would be triggered if 5 or more test units fail. 
- -### Proportional Thresholds - -Proportional thresholds are specified as decimals between 0 and 1, representing a percentage of the -total test units: - -```python -# Error threshold of 10% of test units failing -thresholds_proportional = pb.Thresholds(error=0.1) -``` - -With this configuration, the 'error' threshold would be triggered if 10% or more of the test units -fail. - -### Boolean Shorthand - -For cases where you want a threshold to trigger as soon as 1 test unit fails, you can use `True` as a -convenient shorthand: - -```python -# Critical threshold of 1 failing test unit -thresholds_boolean = pb.Thresholds(critical=True) -``` - -This is equivalent to setting `critical=1` but provides a more intuitive way to express "no failures -are tolerated". Because thresholds are inclusive, this shorthand is particularly useful for strict -validations where even a single failure should trigger immediate attention. - -## Understanding Severity Levels - -The three threshold levels in Pointblank ('warning', 'error', and 'critical') are inspired by -traditional logging levels used in software development. These names suggest a progression of -severity: - -- **'warning'** (level `30`): indicates potential issues that don't necessarily prevent normal -operation -- **'error'** (level `40`): suggests more serious problems that might impact data quality -- **'critical'** (level `50`): represents the most severe issues that likely require immediate -attention - -These numerical values (`30`, `40`, `50`) are used internally by Pointblank when determining -threshold hierarchy and can be accessed through the `{level_num}` field in action metadata (covered -in the next **User Guide** article). - -While these names imply certain severity levels, they're ultimately just convenient labels for -different thresholds.
You have complete flexibility in how you use them: - -- you could use 'warning' for issues that should block a pipeline -- you might configure 'critical' for minor issues that just need documentation -- the 'error' level could trigger informational emails rather than actual error handling - -The naming is primarily a suggestion to help organize your validation strategy. What matters most is -how you configure actions for each threshold level to suit your specific data quality requirements. - -## Threshold Behavior - -It's important to understand a few key behaviors of thresholds: - -- thresholds are **inclusive**: a value equal to or exceeding the threshold will trigger the -associated level -- thresholds can be **mixed**: you can use absolute values for some levels and proportional for -others -- threshold levels are **hierarchical**: 'critical' is more severe than 'error', which is more -severe than 'warning' -- when a test fails, **all** applicable threshold levels are marked in the report (though actions -may only execute for the highest level by default) - -## Setting Global Thresholds - -You can set thresholds globally for all validation steps in a workflow using the `thresholds=` -parameter in `Validate`: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - - # Setting thresholds for all validation steps --- - thresholds=pb.Thresholds(warning=1, error=0.1) - ) - .col_vals_not_null(columns="a") - .col_vals_gt(columns="a", value=2) - .interrogate() -) -``` - -With this approach, the same thresholds are applied to every validation step in the workflow. 
- -## Overriding Thresholds for Specific Steps - -You can override global thresholds for specific validation steps by providing the `thresholds=` -parameter in individual validation methods: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - - # Setting global thresholds --- - thresholds=pb.Thresholds(warning=1, error=0.1) - ) - .col_vals_not_null(columns="a") - .col_vals_gt( - columns="a", value=2, - - # Step-specific threshold that overrides global --- - thresholds=pb.Thresholds(warning=3) - ) - .interrogate() -) -``` - -In this example, the second validation step uses its own 'warning' threshold of `3`, overriding the -global setting of `1`. - -## Ways to Define Thresholds - -Pointblank offers multiple ways to define thresholds to accommodate different coding styles and -requirements. - -### 1. Using the `Thresholds` Class (Recommended) - -The most explicit and flexible approach is using the `Thresholds` class: - -```{python} -# Set individual thresholds for different levels -thresholds_all_levels = pb.Thresholds(warning=0.05, error=0.1, critical=0.25) - -# Set only specific levels -thresholds_error_only = pb.Thresholds(error=0.15) -``` - -This approach allows you to: - -- set any combination of threshold levels -- use descriptive parameter names for clarity -- skip levels you don't need to set - -### 2. Using a Tuple - -For concise code, you can use a tuple where positions represent 'warning', 'error', and 'critical' -levels in that order: - -```{python} -# (warning, error, critical) -thresholds_tuple = (1, 0.1, 0.25) - -# Shorter tuples are also allowed -thresholds_tuple_warning = (3,) # Only the 'warning' threshold -thresholds_tuple_warning_error = (3, 0.2) # Both 'warning' and 'error' thresholds -``` - -While concise, this approach requires you to start with the 'warning' level and add levels in order. - -### 3. 
Using a Dictionary - -You can also use a dictionary with keys that match the threshold level names: - -```{python} -# Can use any combination of threshold levels -thresholds_dict = {"warning": 1, "critical": 0.15} -``` - -The dictionary must use the exact keys `"warning"`, `"error"`, and/or `"critical"`. - -### 4. Using a Single Value - -The simplest approach is using a single numeric value, which sets just the 'warning' threshold: - -```{python} -# Sets 'warning' threshold to `5` -thresholds_single = 5 -``` - -This is equivalent to `pb.Thresholds(warning=5)`. - -## Thresholds and Validation Steps - -Let's look at a more complete validation workflow that demonstrates different threshold -configurations: - -```{python} -# Create a validation workflow with global and step-specific thresholds -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - - # Global thresholds applied to all steps unless overridden --- - thresholds=pb.Thresholds(warning=0.05, error=0.1, critical=0.2) - ) - - # Step 1: Uses global thresholds --- - .col_vals_not_null(columns="b") - - # Step 2: Overrides with step-specific thresholds --- - .col_vals_gt( - columns="a", value=2, - thresholds=pb.Thresholds(warning=1, critical=0.3) # No 'error' threshold - ) - - # Step 3: Uses a simplified tuple notation --- - .col_vals_not_null(columns="c", thresholds=(2, 0.15)) - - .interrogate() -) -``` - -## Thresholds and Actions - -While thresholds by themselves provide visual indicators of validation severity in reports, their -real power emerges when combined with Actions. The Actions system (covered in the next article) -allows you to specify what happens when a threshold is exceeded. 
- -For example, you might configure: - -- A 'warning' threshold that logs a message -- An 'error' threshold that sends an email notification -- A 'critical' threshold that blocks a data pipeline - -Here's a simple preview of how thresholds and actions work together: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - - # Define thresholds for all three severity levels --- - thresholds=pb.Thresholds(warning=1, error=2, critical=3), - - # Define actions for different threshold levels --- - actions=pb.Actions( - warning="Warning: {step} has {FAIL} failing values", - error="ERROR: Step {step} exceeded the 'error' threshold", - critical="CRITICAL: Data quality issue in column {col}" - ) - ) - .col_vals_not_null(columns="c") - .interrogate() -) -``` - -## Conclusion - -Thresholds are a powerful feature that transform Pointblank from a simple validation tool into a -sophisticated data quality monitoring system. By setting appropriate thresholds, you can: - -1. Define different severity levels for data quality issues -2. Customize tolerance levels for different types of validation checks -3. Create a more nuanced approach to data validation than binary pass/fail -4. Enable targeted actions based on the severity of issues detected - -In the next article, we'll explore the Actions system in depth, showing you how to define automatic -responses when thresholds are exceeded. 
diff --git a/docs/user-guide/validation-methods.qmd b/docs/user-guide/validation-methods.qmd deleted file mode 100644 index 880e9d6edf..0000000000 --- a/docs/user-guide/validation-methods.qmd +++ /dev/null @@ -1,985 +0,0 @@ ---- -title: Validation Methods -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_header=False, report_incl_footer_timings=False) -``` - -Pointblank provides a comprehensive suite of validation methods to verify different aspects of your -data. Each method creates a validation step that becomes part of your validation plan. - -These validation methods cover everything from checking column values against thresholds to -validating the table structure and detecting duplicates. Combined into validation steps, they form -the foundation of your data quality workflow. - -Pointblank provides [over 40 validation methods](https://posit-dev.github.io/pointblank/reference/#validation-steps) -to handle diverse data quality requirements. These are grouped into five main categories: - -1. Column Value Validations -2. Row-based Validations -3. Table Structure Validations -4. AI-Powered Validations -5. Aggregate Validations - -Within each of these categories, we'll walk through several examples showing how each validation -method creates steps in your validation plan. - -And we'll use the `small_table` dataset for all of our examples. Here's a preview of it: - -```{python} -# | echo: false -pb.preview(pb.load_dataset(dataset="small_table"), n_head=20, n_tail=20) -``` - -## Validation Methods to Validation Steps - -In Pointblank, validation *methods* become validation *steps* when you add them to a validation -plan. Each method creates a distinct step that performs a specific check on your data. 
- -Here's a simple example showing how three validation methods create three validation steps: - -```{python} -import pointblank as pb - -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - - # Step 1: Check that values in column `a` are greater than 2 --- - .col_vals_gt(columns="a", value=2, brief="Values in 'a' must exceed 2.") - - # Step 2: Check that column 'date' exists in the table --- - .col_exists(columns="date", brief="Column 'date' must exist.") - - # Step 3: Check that the table has exactly 13 rows --- - .row_count_match(count=13, brief="Table should have exactly 13 rows.") - .interrogate() -) -``` - -Each validation method produces one step in the validation report above. When combined, these steps -form a complete validation plan that systematically checks different aspects of your data quality. - -## Common Arguments - -Most validation methods in Pointblank share a set of common arguments that provide consistency and -flexibility across different validation types: - -- `columns=`: specifies which column(s) to validate (used in column-based validations) -- `pre=`: allows data transformation before validation -- `segments=`: enables validation across different data subsets -- `thresholds=`: sets acceptable failure thresholds -- `actions=`: defines actions to take when validations fail -- `brief=`: provides a description of what the validation is checking -- `active=`: determines if the validation step should be executed (default is `True`) -- `na_pass=`: controls how missing values are handled (only for column value validation methods) - -For column validation methods, the `na_pass=` parameter determines whether missing values -(Null/None/NA) should pass validation (this parameter is covered in a later section). - -These arguments follow a consistent pattern across validation methods, so you don't need to memorize -different parameter sets for each function. 
This systematic approach makes Pointblank more intuitive -to work with as you build increasingly complex validation plans. - -We'll cover most of these common arguments in their own dedicated sections later in the -**User Guide**, as some of them represent a deeper topic worthy of focused attention. - -## 1. Column Value Validations - -These methods check individual values within columns against specific criteria: - -- **Comparison checks** (`~~Validate.col_vals_gt()`, `~~Validate.col_vals_lt()`, etc.) for comparing -values to thresholds or other columns - -- **Range checks** (`~~Validate.col_vals_between()`, `~~Validate.col_vals_outside()`) for verifying -that values fall within or outside specific ranges - -- **Set membership checks** (`~~Validate.col_vals_in_set()`, `~~Validate.col_vals_not_in_set()`) for -validating values against predefined sets - -- **Null value checks** (`~~Validate.col_vals_null()`, `~~Validate.col_vals_not_null()`) for testing -presence or absence of null values - -- **Pattern matching checks** (`~~Validate.col_vals_regex()`, `~~Validate.col_vals_within_spec()`) -for validating text patterns with regular expressions or against standard specifications - -- **Trending value checks** (`~~Validate.col_vals_increasing()`, `~~Validate.col_vals_decreasing()`) -for verifying that values increase or decrease as you move down the rows - -- **Custom expression checks** (`~~Validate.col_vals_expr()`) for complex validations using custom -expressions - -Now let's look at some key examples from select categories of column value validations. - -### Comparison Checks - -Let's start with a simple example of how `~~Validate.col_vals_gt()` might be used to check if the -values in a column are greater than a specified value. 
- -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_gt(columns="a", value=5) - .interrogate() -) -``` - -If you're checking data in a column that contains Null/`None`/`NA` values and you'd like to -disregard those values (i.e., let them pass validation), you can use `na_pass=True`. The following -example checks values in column `c` of `small_table`, which contains two `None` values: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_le(columns="c", value=10, na_pass=True) - .interrogate() -) -``` - -In the above validation table, we see that all test units passed. If we didn't use `na_pass=True` -there would be 2 failing test units, one for each `None` value in the `c` column. - -It's possible to check column values against values in an adjacent column. To do this, -supply the `value=` argument with the column name within the `col()` helper function. Here's an -example of that: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_lt(columns="a", value=pb.col("c")) - .interrogate() -) -``` - -This validation checks that values in column `a` are less than values in column `c`. - -### Checking of Missing Values - -A very common thing to validate is that there are no Null/NA/missing values in a column. The -`~~Validate.col_vals_not_null()` method checks that a column contains no missing values: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_not_null(columns="a") - .interrogate() -) -``` - -Column `a` has no missing values and the above validation proves this. - -### Checking Percentage of Missing Values - -While `~~Validate.col_vals_not_null()` ensures there are no missing values at all, sometimes you -need to validate that missing values match a specific percentage.
The `~~Validate.col_pct_null()` -method checks whether the percentage of missing values in a column matches an expected value: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_pct_null(columns="c", p=0.15, tol=0.05) # Expect ~15% missing values (±5%) - .interrogate() -) -``` - -This validation checks that approximately 15% of values in column `c` are missing, allowing a -tolerance of ±5% (so the acceptable range is 10-20%). The `tol=` parameter can accept various -formats including absolute counts or percentage ranges: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_pct_null(columns="c", p=0.15, tol=(0.05, 0.10)) # Asymmetric tolerance: -5%/+10% - .interrogate() -) -``` - -### Checking Strings with Regexes - -A regular expression (regex) validation via the `~~Validate.col_vals_regex()` validation method -checks if values in a column match a specified pattern. Here's an example with two validation steps, -each checking text values in a column: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_regex(columns="b", pattern=r"^\d-[a-z]{3}-\d{3}$") - .col_vals_regex(columns="f", pattern=r"high|low|mid") - .interrogate() -) -``` - -### Checking Strings Against Specifications - -The `~~Validate.col_vals_within_spec()` method validates column values against common data -specifications like email addresses, URLs, postal codes, credit card numbers, ISBNs, VINs, and -IBANs. 
This is particularly useful when you need to validate that text data conforms to standard -formats: - -```{python} -import polars as pl - -# Create a sample table with various data types -sample_data = pl.DataFrame({ - "isbn": ["978-0-306-40615-7", "0-306-40615-2", "invalid"], - "email": ["test@example.com", "user@domain.co.uk", "not-an-email"], - "zip": ["12345", "90210", "invalid"] -}) - -( - pb.Validate(data=sample_data) - .col_vals_within_spec(columns="isbn", spec="isbn") - .col_vals_within_spec(columns="email", spec="email") - .col_vals_within_spec(columns="zip", spec="postal_code[US]") - .interrogate() -) -``` - -### Checking for Trending Values - -The `~~Validate.col_vals_increasing()` and `~~Validate.col_vals_decreasing()` validation methods -check whether column values are increasing or decreasing as you move down the rows. These are useful -for validating time series data, sequential identifiers, or any data where you expect monotonic -trends: - -```{python} -import polars as pl - -# Create a sample table with increasing and decreasing values -trend_data = pl.DataFrame({ - "id": [1, 2, 3, 4, 5], - "temperature": [20, 22, 25, 28, 30], - "countdown": [100, 80, 60, 40, 20] -}) - -( - pb.Validate(data=trend_data) - .col_vals_increasing(columns="id") - .col_vals_increasing(columns="temperature") - .col_vals_decreasing(columns="countdown") - .interrogate() -) -``` - -The `allow_stationary=` parameter lets you control whether consecutive identical values should pass -validation. By default, stationary values (e.g., `[1, 2, 2, 3]`) will fail the increasing check, -but setting `allow_stationary=True` will allow them to pass. 
- -### Handling Missing Values with `na_pass=` - -When validating columns containing Null/None/NA values, you can control how these missing values are -treated with the `na_pass=` parameter: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_vals_le(columns="c", value=10, na_pass=True) - .interrogate() -) -``` - -In the above example, column `c` contains two `None` values, but all test units pass because we set -`na_pass=True`. Without this setting, those two values would fail the validation. - -In summary, `na_pass=` works like this: - -- `na_pass=True`: missing values pass validation regardless of the condition being tested -- `na_pass=False` (the default): missing values fail validation - -## 2. Row-based Validations - -Row-based validations focus on examining properties that span across entire rows rather than -individual columns. These are essential for detecting issues that can't be found by looking at -columns in isolation: - -- `~~Validate.rows_distinct()`: ensures no duplicate rows exist in the table -- `~~Validate.rows_complete()`: verifies that no rows contain any missing values - -These row-level validations are particularly valuable for ensuring data integrity and completeness -at the record level, which is crucial for many analytical and operational data applications. - -### Checking Row Distinctness - -Here's an example where we check for duplicate rows with `~~Validate.rows_distinct()`: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .rows_distinct() - .interrogate() -) -``` - -We can also adapt the `~~Validate.rows_distinct()` check to use a single column or a subset of -columns. To do that, we need to use the `columns_subset=` parameter. 
Here's an example of that: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .rows_distinct(columns_subset="b") - .interrogate() -) -``` - -### Checking Row Completeness - -Another important validation is checking for complete rows: rows that have no missing values across -all columns or a specified subset of columns. The `~~Validate.rows_complete()` validation method -performs this check. - -Here's an example checking if all rows in the table are complete (have no missing values in any -column): - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .rows_complete() - .interrogate() -) -``` - -As the report indicates, there are some incomplete rows in the table. - -## 3. Table Structure Validations - -Table structure validations ensure that the overall architecture of your data meets expectations. -These structural checks form a foundation for more detailed data quality assessments: - -- `~~Validate.col_exists()`: verifies a column exists in the table -- `~~Validate.col_schema_match()`: ensures table matches a defined schema -- `~~Validate.col_count_match()`: confirms the table has the expected number of columns -- `~~Validate.row_count_match()`: verifies the table has the expected number of rows -- `~~Validate.tbl_match()`: validates that the target table matches a comparison table -- `~~Validate.data_freshness()`: checks that data is recent and not stale - -These structural validations provide essential checks on the fundamental organization of your data -tables, ensuring they have the expected dimensions and components needed for reliable data analysis. - -### Checking Column Presence - -If you need to check for the presence of individual columns, the `Validate.col_exists()` validation -method is useful. 
In this example, we check whether the `date` column is present in the table: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_exists(columns="date") - .interrogate() -) -``` - -That column is present, so the single test unit of this validation step is a passing one. - -### Checking the Table Schema - -For deeper checks of table structure, a schema validation can be performed with the -`~~Validate.col_schema_match()` validation method, where the goal is to check whether the structure -of a table matches an expected schema. To define an expected table schema, we need to use the -`Schema` class. Here is a simple example that (1) prepares a schema consisting of column names, (2) -uses that `schema` object in a `~~Validate.col_schema_match()` validation step: - -```{python} -schema = pb.Schema(columns=["date_time", "date", "a", "b", "c", "d", "e", "f"]) - -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_schema_match(schema=schema) - .interrogate() -) -``` - -The `~~Validate.col_schema_match()` validation step will only have a single test unit (signifying -pass or fail). We can see in the above validation report that the column schema validation passed. - -More often, a schema will be defined using column names and column types. We can do that by using a -list of tuples in the `columns=` parameter of `Schema`. 
Here's an example of that approach in -action: - -```{python} -schema = pb.Schema( - columns=[ - ("date_time", "Datetime(time_unit='us', time_zone=None)"), - ("date", "Date"), - ("a", "Int64"), - ("b", "String"), - ("c", "Int64"), - ("d", "Float64"), - ("e", "Boolean"), - ("f", "String"), - ] -) - -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_schema_match(schema=schema) - .interrogate() -) -``` - -The `~~Validate.col_schema_match()` validation method has several boolean parameters for making the -checks less stringent: - -- `complete=`: requires exact column matching (all expected columns must exist, no extra columns -allowed) -- `in_order=`: enforces that columns appear in the same order as defined in the schema -- `case_sensitive_colnames=`: column names must match with exact letter case -- `case_sensitive_dtypes=`: data type strings must match with exact letter case - -These parameters all default to `True`, providing strict schema validation. Setting any to `False` -relaxes the validation requirements, making the checks more flexible when exact matching isn't -necessary or practical for your use case. - -### Comparing Tables with `tbl_match()` - -The `~~Validate.tbl_match()` validation method provides a comprehensive way to verify that two -tables are identical. It performs a progressive series of checks, from least to most stringent: - -1. Column count match -2. Row count match -3. Schema match (loose - case-insensitive, any order) -4. Schema match (order - columns in correct order) -5. Schema match (exact - case-sensitive, correct order) -6. Data match (cell-by-cell comparison) - -This progressive approach helps identify exactly where tables differ. 
Here's an example comparing -the `small_table` dataset with itself: - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .tbl_match(tbl_compare=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .interrogate() -) -``` - -This validation method is especially useful for: - -- Verifying that data transformations preserve expected properties -- Comparing production data against a golden dataset -- Ensuring data consistency across different environments -- Validating that imported data matches source data - -### Checking Counts of Row and Columns - -Row and column count validations check the number of rows and columns in a table. - -Using `~~Validate.row_count_match()` checks whether the number of rows in a table matches a -specified count. - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .row_count_match(count=13) - .interrogate() -) -``` - -The `~~Validate.col_count_match()` validation method checks if the number of columns in a table -matches a specified count. - -```{python} -( - pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars")) - .col_count_match(count=8) - .interrogate() -) -``` - -Expectations on column and row counts can be useful in certain situations and they align nicely with -schema checks. - -### Validating Data Freshness - -Late or missing data is one of the most common (and costly) data quality issues in production -systems. When data pipelines fail silently or experience delays, downstream analytics and ML models -can produce stale or misleading results. The `~~Validate.data_freshness()` validation method helps -catch these issues early by verifying that your data contains recent records. - -Data freshness validation works by checking a datetime column against a maximum allowed age. If the -most recent timestamp in that column is older than the specified threshold, the validation fails. 
-This simple check can prevent major downstream problems caused by stale data. - -Here's an example that validates data is no older than 2 days: - -```{python} -import polars as pl -from datetime import datetime, timedelta - -# Simulate a data feed that should be updated daily -recent_data = pl.DataFrame({ - "event": ["login", "purchase", "logout", "signup"], - "event_time": [ - datetime.now() - timedelta(hours=1), - datetime.now() - timedelta(hours=6), - datetime.now() - timedelta(hours=12), - datetime.now() - timedelta(hours=18), - ], - "user_id": [101, 102, 103, 104] -}) - -( - pb.Validate(data=recent_data) - .data_freshness(column="event_time", max_age="2d") - .interrogate() -) -``` - -The `max_age=` parameter accepts a flexible string format: `"30m"` for 30 minutes, `"6h"` for 6 -hours, `"2d"` for 2 days, or `"1w"` for 1 week. You can also combine units: `"1d 12h"` for 1.5 days. - -When validation succeeds, the report includes details about the data's age in the footer. When it -fails, you'll see exactly how old the most recent data is and what threshold was exceeded. This -context helps quickly diagnose whether you're dealing with a minor delay or a major pipeline -failure. - -Data freshness validation is particularly valuable for: - -- monitoring ETL pipelines to catch failures before they cascade to reports and dashboards -- validating data feeds to ensure third-party data sources are delivering as expected -- including freshness checks in automated data quality tests as part of continuous integration -- building alerting systems that trigger notifications when critical data becomes stale - -You might wonder why not just use `~~Validate.col_vals_gt()` with a datetime threshold. 
While that -approach works, `~~Validate.data_freshness()` offers several advantages: the method name clearly -communicates your intent, the `max_age=` string format (e.g., `"2d"`) is more readable than datetime -arithmetic, it auto-generates meaningful validation briefs, the report footer shows helpful context -about actual data age and thresholds, and timezone mismatches between your data and comparison time -are handled gracefully with informative warnings. - -::: {.callout-note} -When comparing timezone-aware and timezone-naive datetimes, Pointblank will include a warning in the -validation report. For consistent results, ensure your data and comparison times use compatible -timezone settings. -::: - -## 4. AI-Powered Validations - -AI-powered validations use Large Language Models (LLMs) to validate data based on natural language -criteria. This opens up new possibilities for complex validation rules that are difficult to express -with traditional programmatic methods. - -### Validating with Natural Language Prompts - -The `~~Validate.prompt()` validation method allows you to describe validation criteria in plain -language. The LLM interprets your prompt and evaluates each row, producing pass/fail results just -like other Pointblank validation methods. 
- -This is particularly useful for: - -- Semantic checks (e.g., "descriptions should mention a product name") -- Context-dependent validation (e.g., "prices should be reasonable for the product category") -- Subjective quality assessments (e.g., "comments should be professional and constructive") -- Complex rules that would require extensive regex patterns or custom functions - -Here's a simple example that validates whether text descriptions contain specific information: - -```{python} -#| eval: false -import polars as pl - -# Create sample data with product descriptions -products = pl.DataFrame({ - "product": ["Widget A", "Gadget B", "Tool C"], - "description": [ - "High-quality widget made in USA", - "Innovative gadget with warranty", - "Professional tool" - ], - "price": [29.99, 49.99, 19.99] -}) - -# Validate that descriptions mention quality or features -( - pb.Validate(data=products) - .prompt( - prompt="Each description should mention either quality, features, or warranty", - columns_subset=["description"], - model="anthropic:claude-sonnet-4-5" - ) - .interrogate() -) -``` - -The `columns_subset=` parameter lets you specify which columns to include in the validation, -improving performance and reducing API costs by only sending relevant data to the LLM. - -**Note:** To use `~~Validate.prompt()`, you need to have the appropriate API credentials configured -for your chosen LLM provider (Anthropic, OpenAI, Ollama, or AWS Bedrock). - -## 5. Aggregate Validations - -Aggregate validations operate on column-level statistics rather than individual row values. These -methods compute an aggregate value (such as sum, average, or standard deviation) from a column and -compare it against an expected value. Unlike row-level validations where each row is a test unit, -aggregate validations treat the entire column as a single test unit that either passes or fails. 
- -Pointblank provides three families of aggregate validation methods: - -- **Sum validations** (`~~Validate.col_sum_eq()`, `~~Validate.col_sum_gt()`, `~~Validate.col_sum_lt()`, - `~~Validate.col_sum_ge()`, `~~Validate.col_sum_le()`) for validating the sum of column values - -- **Average validations** (`~~Validate.col_avg_eq()`, `~~Validate.col_avg_gt()`, `~~Validate.col_avg_lt()`, - `~~Validate.col_avg_ge()`, `~~Validate.col_avg_le()`) for validating the mean of column values - -- **Standard deviation validations** (`~~Validate.col_sd_eq()`, `~~Validate.col_sd_gt()`, - `~~Validate.col_sd_lt()`, `~~Validate.col_sd_ge()`, `~~Validate.col_sd_le()`) for validating the - standard deviation of column values - -Each family supports the five comparison operators: equal to (`_eq`), greater than (`_gt`), less -than (`_lt`), greater than or equal to (`_ge`), and less than or equal to (`_le`). - -### Validating Column Sums - -Here's an example validating that the sum of column `a` equals 55: - -```{python} -import polars as pl - -agg_data = pl.DataFrame({ - "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - "b": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], -}) - -( - pb.Validate(data=agg_data) - .col_sum_eq(columns="a", value=55) - .col_sum_gt(columns="b", value=500) - .interrogate() -) -``` - -### Validating Column Averages - -Average validations are useful for ensuring that typical values remain within expected bounds: - -```{python} -( - pb.Validate(data=agg_data) - .col_avg_eq(columns="a", value=5.5) - .col_avg_ge(columns="b", value=50) - .interrogate() -) -``` - -### Validating Standard Deviations - -Standard deviation validations help ensure data variability is within expected ranges: - -```{python} -( - pb.Validate(data=agg_data) - .col_sd_gt(columns="a", value=2) - .col_sd_lt(columns="b", value=35) - .interrogate() -) -``` - -### Using Tolerance for Fuzzy Comparisons - -Floating-point arithmetic can introduce small precision errors, making exact equality comparisons -unreliable. 
The `tol=` parameter allows for fuzzy comparisons by specifying an acceptable tolerance: - -```{python} -( - pb.Validate(data=agg_data) - .col_avg_eq(columns="a", value=5.5, tol=0.01) # Pass if average is within ±0.01 of 5.5 - .col_sum_eq(columns="b", value=550, tol=1) # Pass if sum is within ±1 of 550 - .interrogate() -) -``` - -For equality comparisons, the tolerance creates a range `[value - tol, value + tol]` within which -the aggregate is considered valid. - -### Comparing Against Reference Data - -Aggregate validations shine when comparing current data against a baseline or reference dataset. -This is invaluable for detecting drift in data properties over time: - -```{python} -# Current data -current_data = pl.DataFrame({"revenue": [100, 200, 150, 175, 125]}) - -# Historical baseline -baseline_data = pl.DataFrame({"revenue": [95, 205, 145, 180, 130]}) - -( - pb.Validate(data=current_data, reference=baseline_data) - .col_sum_eq(columns="revenue", tol=50) # Compare sums with tolerance - .col_avg_eq(columns="revenue", tol=5) # Compare averages with tolerance - .interrogate() -) -``` - -When `value=None` (the default) and reference data is set, aggregate methods automatically compare -against the same column in the reference data. - -## 6. Custom Validations with `specially()` - -While Pointblank provides over 40 built-in validation methods, there are times when you need to -implement custom validation logic that goes beyond these standard checks. The `~~Validate.specially()` -method gives you complete flexibility to create bespoke validations for domain-specific business -rules, complex multi-column checks, or cross-dataset referential integrity constraints. - -### Basic Custom Validations - -The `specially()` method accepts a callable function that performs your custom validation logic. 
The -function should return boolean values indicating whether each test unit passes: - -```{python} -import polars as pl - -simple_tbl = pl.DataFrame({ - "a": [5, 7, 1, 3, 9, 4], - "b": [6, 3, 0, 5, 8, 2] -}) - -# Custom validation: sum of two columns must be positive -def validate_sum_positive(data): - return data.select(pl.col("a") + pl.col("b") > 0) - -( - pb.Validate(data=simple_tbl) - .specially( - expr=validate_sum_positive, - brief="Sum of columns 'a' and 'b' must be positive" - ) - .interrogate() -) -``` - -This validation passes because all rows have a positive sum for columns `a` and `b`. The -`specially()` method provides the flexibility to implement any validation logic you can express in -Python, making it a powerful tool for custom data quality checks. - -### Cross-Dataset Referential Integrity - -One powerful use case for `specially()` is validating relationships between multiple datasets. This -is particularly valuable for checking foreign key constraints, conditional existence rules, and -cardinality relationships that span multiple tables. 
- -#### Foreign Key Validation - -Verify that all keys in one dataset exist in another: - -```{python} -# Create related datasets: Orders and OrderDetails -orders = pl.DataFrame({ - "order_id": [1, 2, 3, 4, 5], - "customer_id": ["A", "B", "A", "C", "B"], - "status": ["completed", "pending", "completed", "cancelled", "completed"] -}) - -order_details = pl.DataFrame({ - "detail_id": [101, 102, 103, 104, 105, 106, 107, 108, 109], - "order_id": [1, 1, 1, 2, 3, 3, 4, 5, 5], - "product_id": ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9"], - "quantity": [2, 1, 3, 1, 2, 1, 1, 2, 1] -}) - -# Validate foreign key constraint -def check_foreign_key(df): - """Check if all order_ids in order_details exist in orders table""" - valid_order_ids = orders.select("order_id") - # Semi join returns only rows with matching keys - return df.join(valid_order_ids, on="order_id", how="semi").height == df.height - -( - pb.Validate(data=order_details, tbl_name="order_details") - .specially( - expr=check_foreign_key, - brief="All order_ids must exist in orders table" - ) - .interrogate() -) -``` - -This validation ensures referential integrity by confirming that every `order_id` in the -`order_details` table has a corresponding record in the `orders` table. The use of a semi-join makes -this check efficient, as it only verifies the existence of matching keys without returning full -joined data. 
- -#### Conditional Existence Checks - -Implement "if X then Y must exist" logic across datasets: - -```{python} -def check_completed_orders_have_details(df): - """Completed orders must have at least one detail record""" - completed_orders = df.filter(pl.col("status") == "completed") - order_ids_with_details = order_details.select("order_id").unique() - - # Check each completed order has matching details - return completed_orders.join( - order_ids_with_details, - on="order_id", - how="left" - ).with_columns( - pl.col("order_id").is_not_null().alias("has_details") - ).select("has_details") - -( - pb.Validate(data=orders, tbl_name="orders") - .specially( - expr=check_completed_orders_have_details, - brief="Completed orders must have detail records" - ) - .interrogate() -) -``` - -This validation implements conditional business logic: only orders with a `completed` status are -required to have detail records. This pattern is common in real-world scenarios where certain -records trigger mandatory relationships while others don't. The validation returns a boolean for -each completed order, allowing you to see exactly which records pass or fail. 
- -#### Cardinality Constraints - -Validate that relationships between datasets follow specific cardinality rules: - -```{python} -def check_quantity_ratio(df): - """Each order should have exactly 3x quantity units in details""" - order_counts = orders.group_by("order_id").agg(pl.lit(1).alias("order_count")) - - detail_quantities = order_details.group_by("order_id").agg( - pl.col("quantity").sum().alias("total_quantity") - ) - - joined = order_counts.join(detail_quantities, on="order_id", how="left") - - return joined.with_columns( - (pl.col("total_quantity") == pl.col("order_count") * 3).alias("valid_ratio") - ).select("valid_ratio") - - -( - pb.Validate(data=orders, tbl_name="orders") - .specially( - expr=check_quantity_ratio, - brief="Each order should have 3x quantity units in details", - thresholds=(0.4, 0.7), # Allow some flexibility - ) - .interrogate() -) -``` - -Cardinality constraints like this validate that the relationship between datasets follows expected -patterns. In this example, we expect each order to have a specific quantity ratio in the detail -records. Note the use of `thresholds=` to allow some flexibility (not every order needs to meet this -requirement perfectly, but too many violations would indicate a data quality issue). 
- -#### Composite Keys with Business Logic - -Validate complex relationships involving multiple columns and conditional logic: - -```{python} -# More complex scenario with composite keys -employees = pl.DataFrame({ - "dept_id": ["D1", "D1", "D2", "D2", "D3"], - "emp_id": ["E001", "E002", "E003", "E004", "E005"], - "emp_name": ["Alice", "Bob", "Charlie", "Diana", "Eve"], - "is_manager": [True, False, True, False, False] -}) - -projects = pl.DataFrame({ - "project_id": ["P1", "P2", "P3", "P4"], - "dept_id": ["D1", "D2", "D1", "D3"], - "manager_emp_id": ["E001", "E003", "E001", "E005"] -}) - -def check_project_manager_validity(df): - """Project managers must be valid managers in their department""" - validation_result = df.join( - employees, - left_on=["dept_id", "manager_emp_id"], - right_on=["dept_id", "emp_id"], - how="left" - ).with_columns( - # Manager must exist in dept AND have manager status - ((pl.col("emp_name").is_not_null()) & (pl.col("is_manager") == True)).alias("valid_manager") - ).select("valid_manager") - - return validation_result - -( - pb.Validate(data=projects, tbl_name="projects") - .specially( - expr=check_project_manager_validity, - brief="Project managers must be valid managers in their department" - ) - .interrogate() -) -``` - -This example demonstrates validation using composite keys (both `dept_id` and `emp_id`) combined -with conditional business logic (checking the `is_manager` flag). Such validations are common in -enterprise systems where relationships must satisfy multiple constraints simultaneously. The -validation reveals that one project (`P4`) fails because employee `E005` is not a manager, even -though they exist in the same department. 
- -### Reusable Validation Factories - -For validations you'll use repeatedly, create factory functions that generate customized validators: - -```{python} -def make_foreign_key_validator(reference_table, key_columns): - """Factory function to create reusable foreign key validators""" - def validate_fk(df): - if isinstance(key_columns, str): - keys = [key_columns] - else: - keys = key_columns - - ref_keys = reference_table.select(keys).unique() - matched = df.join(ref_keys, on=keys, how="semi") - return matched.height == df.height - - return validate_fk - -# Use the factory across multiple validations -( - pb.Validate(data=order_details, tbl_name="order_details") - .specially( - expr=make_foreign_key_validator(orders, "order_id"), - brief="FK constraint: order_id → orders" - ) - .interrogate() -) -``` - -Factory functions like `make_foreign_key_validator()` make your validation code more maintainable -and reusable. Once defined, you can use the same factory to validate different foreign key -relationships across your entire data pipeline, ensuring consistency in how these constraints are -checked. This pattern is particularly valuable in production environments where you validate -multiple related tables. - -### When to Use `specially()` - -The `specially()` method is ideal for: - -- cross-dataset validations: foreign keys, referential integrity, conditional existence -- complex business rules: multi-column checks, conditional logic, domain-specific constraints -- custom statistical tests: validations requiring calculations not covered by built-in methods -- SQL-style checks: converting complex SQL queries into validation steps -- prototype validations: testing new validation patterns before implementing them as dedicated methods - -By combining `specially()` with Pointblank's built-in validation methods, you can create -comprehensive data quality checks that address both standard and highly specific validation -requirements. 
- -## Conclusion - -In this article, we've explored the various types of validation methods that Pointblank offers for -ensuring data quality. These methods provide a framework for validating column values, checking row -properties, verifying table structures, using AI for complex semantic validations, and validating -aggregate statistics across columns. By combining these validation methods into comprehensive plans, -you can systematically test your data against business rules and quality expectations. All of this -helps to ensure your data remains reliable and trustworthy. diff --git a/docs/user-guide/validation-overview.qmd b/docs/user-guide/validation-overview.qmd deleted file mode 100644 index 00aa55248e..0000000000 --- a/docs/user-guide/validation-overview.qmd +++ /dev/null @@ -1,296 +0,0 @@ ---- -title: Overview -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -pb.config(report_incl_footer_timings=False) -``` - -This article provides a quick overview of the data validation features in Pointblank. It introduces -the key concepts and shows examples of the main functionality, giving you a foundation for using the -library effectively. - -Later articles in the **User Guide** will expand on each section covered here, providing more -explanations and examples. - -## Validation Methods - -Pointblank's core functionality revolves around validation steps, which are individual checks that -verify different aspects of your data. These steps are created by calling validation methods from -the `Validate` class. When combined, they create a comprehensive validation plan for your data.
- -Here's an example of a validation that incorporates three different validation methods: - -```{python} -import pointblank as pb -import polars as pl - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Three different validation methods." - ) - .col_vals_gt(columns="a", value=0) - .rows_distinct() - .col_exists(columns="date") - .interrogate() -) -``` - -This example showcases how you can combine different types of validations in a single validation -plan: - -- a column value validation with `Validate.col_vals_gt()` -- a row-based validation with `Validate.rows_distinct()` -- a table structure validation with `Validate.col_exists()` - -Most validation methods share common parameters that enhance their flexibility and power. These -shared parameters (overviewed in the next few sections) create a consistent interface across all -validation steps while allowing you to customize validation behavior for specific needs. - -## Column Selection Patterns - -You can apply the same validation logic to multiple columns at once through use of column selection -patterns (used in the `columns=` parameter). This reduces repetitive code and makes your validation -plans more maintainable: - -```{python} -import narwhals.selectors as nws - -# Map validations across multiple columns -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Applying column mapping in `columns`." - ) - - # Apply validation rules to multiple columns --- - .col_vals_not_null( - columns=["a", "b", "c"] - ) - - # Apply to numeric columns only with a Narwhals selector --- - .col_vals_gt( - columns=nws.numeric(), - value=0 - ) - .interrogate() -) -``` - -This technique is particularly valuable when working with wide datasets containing many -similarly-structured columns or when applying standard quality checks across an entire table. It -also ensures consistency in how validation rules are applied across related data columns. 
- -## Preprocessing - -Preprocessing (with the `pre=` parameter) allows you to transform or modify your data before -applying validation checks, enabling you to validate derived or modified data without altering the -original dataset: - -```{python} -import polars as pl - -# Define preprocessing functions for `pre=` parameters -def double_column_a(df): - return df.with_columns(pl.col("a") * 2) - -def square_column_c(df): - return df.with_columns(pl.col("c").pow(2)) - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Preprocessing validation steps via `pre=`." - ) - .col_vals_gt( - columns="a", value=5, - - # Apply transformation before validation --- - pre=double_column_a # Double values before checking - ) - .col_vals_lt( - columns="c", value=100, - - # Apply more complex transformation --- - pre=square_column_c # Square values before checking - ) - .interrogate() -) -``` - -Preprocessing enables validation of transformed data without modifying your original dataset, making -it ideal for checking derived metrics, or validating normalized values. This approach keeps your -validation code clean while allowing for sophisticated data quality checks on calculated results. - -## Segmentation - -Segmentation (through the `segments=` parameter) allows you to validate data across different -groups, enabling you to identify segment-specific quality issues that might be hidden in aggregate -analyses: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Segmenting validation steps via `segments=`." - ) - .col_vals_gt( - columns="c", value=3, - - # Split into steps by categorical values in column 'f' --- - segments="f" - ) - .interrogate() -) -``` - -Segmentation is powerful for detecting patterns of quality issues that may exist only in specific -data subsets, such as certain time periods, categories, or geographical regions. 
It helps ensure -that all significant segments of your data meet quality standards, not just the data as a whole. - -## Thresholds - -Thresholds (set through the `thresholds=` parameter) let you set acceptable levels of failure before -triggering warnings, errors, or critical notifications for individual validation steps: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Using thresholds." - ) - - # Add validation steps with different thresholds --- - .col_vals_gt( - columns="a", value=1, - thresholds=pb.Thresholds(warning=0.1, error=0.2, critical=0.3) - ) - - # Add another step with stricter thresholds --- - .col_vals_lt( - columns="c", value=10, - thresholds=pb.Thresholds(warning=0.05, error=0.1) - ) - .interrogate() -) -``` - -Thresholds provide a nuanced way to monitor data quality, allowing you to set different severity -levels based on the importance of each validation and your organization's tolerance for specific -types of data issues. - -## Actions - -Actions (which can be configured in the `actions=` parameter) allow you to define specific responses -when validation thresholds are crossed. You can use simple string messages or custom functions for -more complex behavior: - -```{python} -# Example 1: Action with a string message --- - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Using actions with a string message." - ) - .col_vals_gt( - columns="c", value=2, - thresholds=pb.Thresholds(warning=0.1, error=0.2), - - # Add a print-to-console action for the 'warning' threshold --- - actions=pb.Actions( - warning="WARNING: Values below `{value}` detected in column 'c'." 
- ) - ) - .interrogate() -) -``` - -```{python} -# Example 2: Action with a callable function --- - -def custom_action(): - from datetime import datetime - print(f"Data quality issue found ({datetime.now()}).") - -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Using actions with a callable function." - ) - .col_vals_gt( - columns="a", value=5, - thresholds=pb.Thresholds(warning=0.1, error=0.2), - - # Apply the function to the 'error' threshold --- - actions=pb.Actions(error=custom_action) - ) - .interrogate() -) -``` - -With custom action functions, you can implement sophisticated responses like sending notifications -or logging to external systems. - -## Briefs - -Briefs (which can be set through the `brief=` parameter) allow you to customize descriptions -associated with validation steps, making validation results more understandable to stakeholders. -Briefs can be either automatically generated by setting `brief=True` or defined as custom messages -for more specific explanations: - -```{python} -( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - label="Using `brief=` for displaying brief messages." - ) - .col_vals_gt( - columns="a", value=0, - - # Use `True` for automatic generation of briefs --- - brief=True - ) - .col_exists( - columns=["date", "date_time"], - - # Add a custom brief for this validation step --- - brief="Verify required date columns exist for time-series analysis" - ) - .interrogate() -) -``` - -Briefs make validation results more meaningful by providing context about why each check matters. -They're particularly valuable in shared reports where stakeholders from various disciplines need to -understand validation results in domain-specific terms. - -## Getting More Information - -Each validation step can be further customized and has additional options. 
See these pages for more -information: - -- [Validation Methods](validation-methods.qmd): A closer look at the more common validation methods -- [Column Selection Patterns](column-selection-patterns.qmd): Techniques for targeting specific columns -- [Preprocessing](preprocessing.qmd): Transform data before validation -- [Segmentation](segmentation.qmd): Apply validations to specific segments of your data -- [Thresholds](thresholds.qmd): Set quality standards and trigger severity levels -- [Actions](actions.qmd): Respond to threshold exceedances with notifications or custom functions -- [Briefs](briefs.qmd): Add context to validation steps - -## Conclusion - -Validation steps are the building blocks of data validation in Pointblank. By combining steps from -different categories and leveraging common features like thresholds, actions, and preprocessing, you -can create comprehensive data quality checks tailored to your specific needs. - -The next sections of this guide will dive deeper into each of these topics, providing detailed -explanations and examples. diff --git a/docs/user-guide/validation-reports.qmd b/docs/user-guide/validation-reports.qmd deleted file mode 100644 index 25ddef72d6..0000000000 --- a/docs/user-guide/validation-reports.qmd +++ /dev/null @@ -1,617 +0,0 @@ ---- -title: "Validation Reports" -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -```{python} -#| echo: false -#| output: false -import pointblank as pb -``` - -After interrogating your data with a validation plan, Pointblank automatically generates a -*validation report*. That tabular report comprehensively summarizes the results of all validation -steps. It'll be your primary tool for understanding data quality at a glance, identifying issues, -and communicating results to stakeholders. - -Validation reports are [Great Tables](https://github.com/posit-dev/great-tables) objects that -provide rich information about each validation step. 
It includes: identifying information for the -step, pass/fail statistics, threshold exceedances, and visual status indicators. The report makes it -easy to quickly assess overall data quality and pinpoint specific areas that need attention. - -## Viewing the Validation Report - -The most straightforward way to view a validation report is to simply print the `Validate` object -after calling `interrogate()`: - -```{python} -import pointblank as pb -import polars as pl - -# Sample data -data = pl.DataFrame({ - "id": range(1, 11), - "value": [120, 85, 47, 210, 30, 155, 175, 95, 205, 140], - "category": ["A", "B", "C", "A", "D", "B", "A", "E", "A", "C"], - "ratio": [0.5, 0.7, 0.3, 1.2, 0.8, 0.9, 0.4, 1.5, 0.6, 0.2], -}) - -# Create and interrogate a validation -validation = ( - pb.Validate(data=data, tbl_name="sales_data") - .col_vals_gt(columns="value", value=50, brief=True) - .col_vals_in_set(columns="category", set=["A", "B", "C"], brief=True) - .col_exists(columns=["id", "value"], brief=True) - .interrogate() -) - -# Display the validation report -validation -``` - -In a notebook or interactive environment, simply typing the validation object name displays the -report automatically. In a script or REPL, you might need to explicitly call -`validation.get_tabular_report().show()` to display the table. - -::: {.callout-note} -You can display a validation report even before calling `interrogate()`. The report will show your -validation plan with all the steps you've defined, but it won't contain any interrogation results. -Additionally, validation steps that use column selection patterns (like validating multiple columns -at once) won't be expanded into individual rows yet, as that expansion happens during interrogation. 
-::: - -## Understanding Report Components - -The validation report table consists of several key components that work together to provide a -complete picture of your data quality: - -#### Report Header - -The report header (title and subtitle area) contains important metadata about the validation: - -- **Title**: by default, shows "Pointblank Validation" but can be customized -- **Label**: your custom label for the validation (if provided via the `label=` parameter) -- **Table Information**: the table name and type (Polars, Pandas, DuckDB, etc.) -- **Thresholds**: the warning, error, and critical threshold values used - -This header information provides essential context for interpreting the validation results, -especially when sharing reports with stakeholders or reviewing historical validations. - -#### Report Footer - -The report footer contains several pieces of information that provide context and traceability: - -**Timestamps**: The footer shows when the interrogation was performed, including the start time, -duration, and end time. This helps track when data quality checks were executed, which is especially -useful when archiving reports or monitoring data quality over time. - -**Governance Metadata**: When you provide governance parameters to `Validate`, they are displayed -in the footer as well. 
This metadata helps document data ownership and dependencies: - -- **Owner**: who is responsible for the data quality (e.g., `"data-platform-team"`) -- **Consumers**: who depends on this data (e.g., `["ml-team", "analytics"]`) -- **Version**: the version of the validation plan or data contract (e.g., `"2.1.0"`) - -Here's an example showing governance metadata in a validation report: - -```{python} -# Example with governance metadata -governance_validation = ( - pb.Validate( - data=data, - tbl_name="sales_data", - label="Sales data validation", - owner="data-platform-team", - consumers=["ml-team", "analytics", "finance"], - version="1.2.0", - ) - .col_vals_gt(columns="value", value=0) - .col_vals_in_set(columns="category", set=["A", "B", "C", "D", "E"]) - .interrogate() -) - -governance_validation -``` - -The governance metadata appears below the timestamps in the footer, making it easy to identify who -owns the data, who depends on it, and which version of the validation rules is being applied. - -::: {.callout-tip} -Governance metadata is particularly useful in enterprise environments where data lineage and -accountability are important. By including `owner`, `consumers`, and `version` in your validations, -you create self-documenting reports that can be easily understood by anyone reviewing them. -::: - -::: {.callout-note} -Throughout this documentation, the footer is hidden in example reports for brevity. This is -controlled through a global option (see the section on controlling header and footer display later -in this guide). In practice, including the footer provides valuable timestamp information for -tracking when validations were executed.
-::: - -### Report Columns - -The validation report table includes the following columns, each providing specific information -about the validation steps: - -#### Status Indicator (first column, unlabeled) - -The first column is an unlabeled vertical colored bar that provides instant visual feedback about -each step's status: - -- **Green**: all test units passed the validation -- **Light green (semi-transparent)**: some test units failed but no thresholds were exceeded -- **Gray**: the 'warning' threshold was exceeded -- **Yellow**: the 'error' threshold was exceeded -- **Red**: the 'critical' threshold was exceeded - -This visual indicator allows you to quickly scan the report and identify problem areas. - -#### Step Number (second column, unlabeled) - -The second column is unlabeled and contains the sequential step number, starting from 1. This number -is used when referencing specific steps in other methods like `get_step_report(i=2)` or when -extracting data from specific validation steps. - -#### TYPE - -The TYPE column displays the validation method name along with an icon that visually represents the -type of validation being performed. The validation method indicates what aspect of data quality is -being checked, such as: - -- `col_vals_gt()`: column values greater than -- `col_vals_in_set()`: column values in a set -- `col_exists()`: column existence check -- `rows_distinct()`: row uniqueness check -- and many others... - -When you provide a brief message (via `brief=True` for auto-generated briefs or `brief="custom -text"` for custom messages), it appears within the TYPE column below the validation method name. -These briefs provide human-readable explanations of what each validation step is checking, making -the report more accessible to non-technical stakeholders. 
- -```{python} -# Example showing brief messages in the TYPE column -validation_with_briefs = ( - pb.Validate(data=data, tbl_name="sales_data") - .col_vals_gt( - columns="value", - value=50, - brief="Sales values should always exceed the $50 threshold" - ) - .col_vals_in_set( - columns="category", - set=["A", "B", "C"], - brief=True # Auto-generated brief - ) - .interrogate() -) - -validation_with_briefs -``` - -In the above report, you'll see the custom brief message appear below the `col_vals_gt` method name -in the first step, and an automatically generated brief below `col_vals_in_set` in the second step. - -#### COLUMNS - -The column(s) being validated in this step. For validation methods that don't target specific -columns (like `row_count_match`), this will show an em dash (—). - -#### VALUES - -The comparison value(s) or criteria used in the validation. For example: - -- for `col_vals_gt(value=100)`, this shows `100` -- for `col_vals_in_set(set=["A", "B", "C"])`, this shows `A | B | C` -- for existence checks, this shows an em dash (—) - -#### TBL - -Icons indicating whether any preprocessing or segmentation was applied: - -- **Table icon**: standard validation on the original data -- **Transformation icon**: preprocessing function was applied via `pre=` -- **Segmentation icon**: data was segmented via `segments=` - -These icons help you understand if you're validating transformed or segmented data. - -#### EVAL - -Indicates whether the validation step was evaluated: - -- **Checkmark**: step was successfully evaluated -- **Error icon**: an evaluation error occurred (e.g., column not found) -- **Inactive icon**: step was marked as inactive - -This column is crucial for identifying validation steps that couldn't be executed properly. - -#### UNITS - -The number of units tested in this validation step. 
A 'test unit' is the atomic unit being -validated, which varies by validation type: - -- for column value checks: each cell in the target column(s) -- for row checks: each row -- for table checks: typically 1 (the table itself) - -This number is formatted with locale-appropriate thousand separators for readability. Also, since -space is limited, values are often abbreviated so a figure like 43,534 will appear as `43.5K`. - -#### PASS - -The number and fraction of test units that passed the validation, displayed as: - -``` -n_passed -f_passed -``` - -For example, the cell with - -``` -8 -0.80 -``` - -means 8 test units passed out of the total, representing an 80% success rate (though `f_passed` is -always expressed as a fractional value from `0` to `1`). - -#### FAIL - -The number and fraction of test units that failed the validation, displayed similarly to PASS: - -``` -n_failed -f_failed -``` - -For example, the cell with - -``` -2 -0.20 -``` - -means 2 test units failed, representing a 20% failure rate from a fractional value of `0.20`. Note -that this fractional `f_failed` value is what's used to set failure thresholds for 'warning', -'error', and 'critical' states. - -#### W, E, C (Warning, Error, Critical) - -Three columns showing whether each threshold level was exceeded for the three different states. - -- **Long dash**: threshold wasn't set for a state -- **Empty colored circle**: threshold was set but wasn't exceeded for a given state -- **Filled colored circle**: threshold was set and exceeded - -In terms of colors, the 'warning' state is gray, the 'error' state is yellow, and the 'critical' -state is red. - -Having visual indicators makes it easy to identify which validation steps have crossed into warning, -error, or critical territory. 
- -#### EXT - -Indicates whether failing row data was extracted for this step: - -- **Em dash (—)**: no extract available -- **Download button**: click to download failing rows as CSV - -When extracts are available, you can download them directly from the report for further analysis or -to share with data stewards who need to fix the issues. - -## Understanding Validation Status - -The validation report helps you quickly understand the overall status of your data: - -- **All green status indicators**: all validations passed completely -- **Light green indicators**: minor failures below warning threshold -- **Gray, yellow, or red indicators**: threshold exceedances requiring attention -- **Error icons in EVAL column**: validation steps that couldn't be evaluated - -By scanning the status indicators column, you can immediately identify which validation steps need -attention and prioritize your data quality efforts accordingly. - -## Customizing the Report Title - -You can customize the validation report's title using the `title=` parameter in -`get_tabular_report()`. 
This is particularly useful when generating multiple reports or when you -want to provide more context: - -```{python} -# Default title -validation.get_tabular_report() -``` - -```{python} -# Use the table name as the title -validation.get_tabular_report(title=":tbl_name:") -``` - -```{python} -# Provide a custom title (supports Markdown) -validation.get_tabular_report(title="**Sales Data** Quality Report") -``` - -```{python} -# No title -validation.get_tabular_report(title=":none:") -``` - -The title customization options are: - -- `":default:"` (default): shows `"Pointblank Validation"` -- `":tbl_name:"`: uses the table name from the `tbl_name=` parameter -- `":none:"`: hides the title completely -- Any string: custom title text (Markdown is supported) - -## Customizing with Great Tables - -Since the validation report is a Great Tables object, you can leverage the full power of Great -Tables to customize its appearance. This allows you to match your organization's branding, highlight -specific information, or adjust the presentation for different audiences. - -### Guide to Internal Column Names - -When working with Great Tables methods to customize the validation report, you'll need to use the -*internal column names* rather than the display labels you see in the rendered table. This is -because Great Tables operates on the underlying data table structure, where columns have technical -names that differ from their user-facing labels. - -For example, the unlabeled step number column is stored internally as `"i"`, and -the `"TYPE"` column is internally named `"type_upd"`. Most Great Tables methods that target specific -columns (like `tab_style()`, `cols_width()`, `cols_hide()`, etc.) require these internal names. - -Here's the complete mapping from display labels to internal column names: - -1. Status indicator (no label): `"status_color"` -2. Step number (no label): `"i"` -3. `TYPE`: `"type_upd"` -4. `COLUMNS`: `"columns_upd"` -5. 
`VALUES`: `"values_upd"` -6. `TBL`: `"tbl"` -7. `EVAL`: `"eval"` -8. `UNITS`: `"test_units"` -9. `PASS`: `"pass"` -10. `FAIL`: `"fail"` -11. `W`: `"w_upd"` -12. `E`: `"e_upd"` -13. `C`: `"c_upd"` -14. `EXT`: `"extract_upd"` - -Always use these internal names when calling Great Tables methods. Using the display labels (like -`"COLUMNS"` or `"VALUES"`) will result in errors since these labels only exist in the rendered output, -not in the underlying data structure. - -In the examples that follow, you'll see how to use these internal column names to customize various -aspects of the validation report. - -### Adding Custom Styling - -You can apply custom styles to the report table: - -```{python} -from great_tables import style, loc - -# Get the report as a Great Tables object -report = validation.get_tabular_report() - -# Add custom styling using internal column names -report = ( - report - .tab_style( - style=style.fill(color="#F0F8FF"), - locations=loc.body(columns="i") # Internal name for step number - ) - .tab_style( - style=style.text(weight="bold"), - locations=loc.body(columns="type_upd") # Internal name for TYPE - ) -) - -report -``` - -### Modifying Column Widths - -Adjust column widths to optimize the layout: - -```{python} -report = ( - validation - .get_tabular_report() - .cols_width( - cases={ - "status_color": "20px", # Status indicator column - "i": "40px", # Step number column - "type_upd": "170px", # TYPE column - "columns_upd": "100px", # COLUMNS column - } - ) -) - -report -``` - -### Hiding Columns - -Hide specific columns that aren't relevant for your audience: - -```{python} -# Hide the TBL and EVAL columns for a cleaner presentation (using internal names) -report = ( - validation - .get_tabular_report() - .cols_hide(columns=["tbl", "eval"]) # Use internal column names -) - -report -``` - -### Adding a Source Note - -Add information about data source or validation context: - -```{python} -report = ( - validation - .get_tabular_report() - 
.tab_source_note( - source_note="Data validated on 2025-10-10 | Production database snapshot" - ) -) - -report -``` - -## Exporting the Report - -Great Tables provides multiple export options for sharing validation reports: - -```python -# Save as a standalone HTML file -validation.get_tabular_report().write_raw_html("validation_report.html") - -# Save as a PNG image -validation.get_tabular_report().save("validation_report.png") - -# Open in browser -validation.get_tabular_report().show("browser") -``` - -## Controlling Header and Footer Display - -You can control whether the header and footer appear in the validation report: - -```{python} -# Hide the footer -validation.get_tabular_report(incl_footer=False) -``` - -```{python} -# Hide the header -validation.get_tabular_report(incl_header=False) -``` - -```{python} -# Hide both -validation.get_tabular_report(incl_header=False, incl_footer=False) -``` - -You can also set these preferences globally using `pb.config()`: - -```python -# Set global preferences -pb.config(report_incl_header=True, report_incl_footer_timings=False) -``` - -## Best Practices for Validation Reports - -Here are some guidelines for creating effective validation reports: - -#### 1. Use Descriptive Table Names and Labels - -Provide meaningful names and labels to make reports self-documenting: - -```python -validation = pb.Validate( - data=sales_df, - tbl_name="Q3_2025_sales", - label="Quarterly sales data validation for financial reporting" -) -``` - -#### 2. 
Include Governance Metadata for Accountability - -Add ownership and dependency information for enterprise data governance: - -```python -validation = pb.Validate( - data=sales_df, - tbl_name="Q3_2025_sales", - label="Quarterly sales data validation", - owner="data-platform-team", - consumers=["ml-team", "analytics", "finance"], - version="2.1.0" -) -``` - -This creates a clear record of who is responsible for the data, who depends on it, and which version -of the validation rules is being applied. - -#### 3. Add Brief Messages for Stakeholder Reports - -When sharing reports with non-technical stakeholders, always include briefs: - -```python -.col_vals_between( - columns="price", - left=0, right=10000, - brief="Product prices must be between $0 and $10,000" -) -``` - -#### 4. Set Appropriate Thresholds - -Configure thresholds that align with your data quality requirements: - -```python -validation = pb.Validate( - data=data, - tbl_name="customer_data", - thresholds=pb.Thresholds( - warning=0.01, # 1% failure triggers warning - error=0.05, # 5% failure triggers error - critical=0.10 # 10% failure triggers critical - ) -) -``` - -#### 5. Customize for Your Audience - -Tailor the report presentation to your audience: - -- **Technical teams**: include all columns, show preprocessing indicators -- **Management**: hide technical columns, emphasize status indicators -- **Data stewards**: include extract download buttons, detailed briefs - -#### 6. Combine with Other Reporting Tools - -Use validation reports alongside other Pointblank features: - -- **Step reports**: drill down into specific failing steps with `get_step_report()` -- **Extracts**: use `get_data_extracts()` to get all failing data for analysis -- **Sundered data**: use `get_sundered_data()` to split data into passing/failing sets - -#### 7. 
Archive Reports for Trend Analysis - -Save validation reports over time to track data quality trends: - -```python -from datetime import datetime - -# Save with timestamp -timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") -validation.get_tabular_report().write_raw_html(f"validation_report_{timestamp}.html") -``` - -## Conclusion - -The validation report is your primary interface for understanding data quality after running a -validation. By providing a comprehensive overview of all validation steps, visual status indicators, -and detailed statistics, it enables you to: - -- quickly assess overall data quality across multiple dimensions -- identify specific validation steps that need attention -- communicate data quality status to technical and non-technical stakeholders -- track threshold exceedances and their severity levels -- access failing data through extract downloads - -Combined with customization options from Great Tables, you can create reports that perfectly match -your organization's needs and workflows. Whether you're validating data in an interactive notebook, -generating automated quality reports, or presenting findings to stakeholders, the validation report -provides the clarity and detail you need to maintain high data quality standards. diff --git a/docs/user-guide/yaml-reference.qmd b/docs/user-guide/yaml-reference.qmd deleted file mode 100644 index cbe3341b2f..0000000000 --- a/docs/user-guide/yaml-reference.qmd +++ /dev/null @@ -1,1107 +0,0 @@ ---- -title: YAML Reference -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -This reference provides a comprehensive guide to all YAML keys and parameters supported by -Pointblank's YAML validation workflows. Use this document as a quick lookup when building validation -configurations. 
- -## Global Configuration Keys - -### Top-level Structure - -```yaml -tbl: data_source # REQUIRED: Data source specification -df_library: "polars" # OPTIONAL: DataFrame library ("polars", "pandas", "duckdb") -tbl_name: "Custom Table Name" # OPTIONAL: Human-readable table name -label: "Validation Description" # OPTIONAL: Description for the validation workflow -lang: "en" # OPTIONAL: Language code (default: "en") -locale: "en" # OPTIONAL: Locale setting (default: "en") -brief: "Global brief: {auto}" # OPTIONAL: Global brief template -owner: "Data Engineering" # OPTIONAL: Data owner (governance metadata) -consumers: [Analytics, Finance] # OPTIONAL: Data consumers (governance metadata) -version: "1.0.0" # OPTIONAL: Validation version (governance metadata) -reference: # OPTIONAL: Reference table for comparison validations - python: | - pb.load_dataset("ref_table") -thresholds: # OPTIONAL: Global failure thresholds - warning: 0.1 - error: 0.2 - critical: 0.3 -actions: # OPTIONAL: Global failure actions - warning: "Warning message template" - error: "Error message template" - critical: "Critical message template" - highest_only: false -final_actions: # OPTIONAL: Actions triggered after all steps complete - warning: "Post-validation warning" - error: "Post-validation error" -steps: # REQUIRED: List of validation steps - - validation_method_name - - validation_method_name: - parameter: value -``` - -### Data Source (`tbl`) - -The `tbl` key specifies the data source and supports multiple formats: - -```yaml -# File paths -tbl: "data/file.csv" -tbl: "data/file.parquet" - -# Built-in datasets -tbl: small_table -tbl: game_revenue -tbl: nycflights - -# Python expressions for complex data loading -tbl: - python: | - pl.scan_csv("data.csv").filter(pl.col("date") >= "2024-01-01") -``` - -#### Using Templates with `set_tbl=` - -For reusable validation templates that will always use a custom data source via the `set_tbl=` -parameter in `yaml_interrogate()`, the `tbl` field is still 
required but its value doesn't matter -since it will be overridden. Recommended approaches: - -```yaml -# Option 1: Use a valid dataset name (gets overridden anyway) -tbl: small_table # Will be ignored when `set_tbl=` is used - -# Option 2: Use YAML null (clearest semantic intent) -tbl: null # Indicates table will be provided via `set_tbl=` -``` - -When using `yaml_interrogate()` with `set_tbl=`, the validation template becomes fully reusable: - -```python -# Define reusable template -template = """ -tbl: null # Will be overridden -tbl_name: "Sales Validation" -steps: - - col_exists: - columns: [customer_id, revenue, region] - - col_vals_gt: - columns: [revenue] - value: 0 -""" - -# Apply to different datasets -q1_result = pb.yaml_interrogate(template, set_tbl=q1_data) -q2_result = pb.yaml_interrogate(template, set_tbl=q2_data) -``` - -### DataFrame Library (`df_library`) - -The `df_library` key controls which DataFrame library is used to load data sources. This parameter -affects both built-in datasets and file loading: - -```yaml -# Use Polars DataFrames (default) -df_library: polars - -# Use Pandas DataFrames -df_library: pandas - -# Use DuckDB tables (via Ibis) -df_library: duckdb -``` - -Examples with different libraries: - -```yaml -# Load built-in dataset as Pandas DataFrame -tbl: small_table -df_library: pandas -steps: - - specially: - expr: "lambda df: df.assign(validation_result=df['a'] > 0)" - -# Load CSV file as Polars DataFrame -tbl: "data/sales.csv" -df_library: polars -steps: - - col_vals_gt: - columns: amount - value: 0 - -# Load dataset as DuckDB table -tbl: nycflights -df_library: duckdb -steps: - - row_count_match: - count: 336776 -``` - -The `df_library` parameter is particularly useful when: - -- using validation expressions that require specific DataFrame APIs (e.g., Pandas `.assign()`, -Polars `.select()`) -- integrating with existing pipelines that use a specific DataFrame library -- optimizing performance for different data sizes and 
operations -- ensuring compatibility with downstream processing steps - -### Global Thresholds - -Thresholds define when validation failures trigger different severity levels: - -```yaml -thresholds: - warning: 0.05 # 5% failure rate triggers warning - error: 0.10 # 10% failure rate triggers error - critical: 0.15 # 15% failure rate triggers critical -``` - -- values: fractions between `0` and `1` (proportion of failing test units) or integers (count of failing test units) -- levels: `warning`, `error`, `critical` - -### Global Actions - -Actions define responses when thresholds are exceeded. When supplying a string to a severity level -('warning', 'error', 'critical'), you can use template variables that will be automatically -substituted with contextual information: - -```yaml -actions: - warning: "Warning: {n_failed} failures in step {step}" - error: - python: | - lambda: print("Error detected!") - critical: "Critical failure at {time}" - highest_only: false # Execute all applicable actions vs. only highest severity -``` - -Template variables available for action strings: - -- `{step}`: current validation step number -- `{col}`: column name(s) being validated -- `{val}`: validation value or threshold -- `{n_failed}`: number of failing records -- `{n}`: total number of records -- `{type}`: validation method type -- `{level}`: severity level ('warning'/'error'/'critical') -- `{time}`: timestamp of validation - -## Validation Methods Reference - -### Column Value Validations - -#### Comparison Methods - -`col_vals_gt`: are column data greater than a fixed value or data in another column? 
- -```yaml -- col_vals_gt: - columns: [column_name] # REQUIRED: Column(s) to validate - value: 100 # REQUIRED: Comparison value - na_pass: true # OPTIONAL: Pass NULL values (default: false) - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values must be > 100" # OPTIONAL: Step description -``` - -`col_vals_lt`: are column data less than a fixed value or data in another column? - -```yaml -- col_vals_lt: - columns: [column_name] - value: 100 - na_pass: true - # ... (same parameters as col_vals_gt) -``` - -`col_vals_ge`: are column data greater than or equal to a fixed value or data in another column? - -```yaml -- col_vals_ge: - columns: [column_name] - value: 100 - na_pass: true - # ... (same parameters as col_vals_gt) -``` - -`col_vals_le`: are column data less than or equal to a fixed value or data in another column? - -```yaml -- col_vals_le: - columns: [column_name] - value: 100 - na_pass: true - # ... (same parameters as col_vals_gt) -``` - -`col_vals_eq`: are column data equal to a fixed value or data in another column? - -```yaml -- col_vals_eq: - columns: [column_name] - value: "expected_value" - na_pass: true - # ... (same parameters as col_vals_gt) -``` - -`col_vals_ne`: are column data not equal to a fixed value or data in another column? - -```yaml -- col_vals_ne: - columns: [column_name] - value: "forbidden_value" - na_pass: true - # ... (same parameters as col_vals_gt) -``` - -#### Range Methods - -`col_vals_between`: are column data between two specified values (inclusive)? 
- -```yaml -- col_vals_between: - columns: [column_name] # REQUIRED: Column(s) to validate - left: 0 # REQUIRED: Lower bound - right: 100 # REQUIRED: Upper bound - inclusive: [true, true] # OPTIONAL: Include bounds [left, right] - na_pass: false # OPTIONAL: Pass NULL values - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values between 0 and 100" # OPTIONAL: Step description -``` - -`col_vals_outside`: are column data outside of two specified values? - -```yaml -- col_vals_outside: - columns: [column_name] - left: 0 - right: 100 - inclusive: [false, false] # OPTIONAL: Exclude bounds [left, right] - na_pass: false - # ... (same parameters as col_vals_between) -``` - -#### Set Membership Methods - -`col_vals_in_set`: are column data part of a specified set of values? - -```yaml -- col_vals_in_set: - columns: [column_name] # REQUIRED: Column(s) to validate - set: [value1, value2, value3] # REQUIRED: Allowed values - na_pass: false # OPTIONAL: Pass NULL values - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values in allowed set" # OPTIONAL: Step description -``` - -`col_vals_not_in_set`: are column data not part of a specified set of values? - -```yaml -- col_vals_not_in_set: - columns: [column_name] - set: [forbidden1, forbidden2] # REQUIRED: Forbidden values - na_pass: false - # ... (same parameters as col_vals_in_set) -``` - -#### NULL Value Methods - -`col_vals_null`: are column data null (missing)? 
- -```yaml -- col_vals_null: - columns: [column_name] # REQUIRED: Column(s) to validate - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values must be NULL" # OPTIONAL: Step description -``` - -`col_vals_not_null`: are column data not null (not missing)? - -```yaml -- col_vals_not_null: - columns: [column_name] - # ... (same parameters as col_vals_null) -``` - -#### Pattern Matching Methods - -`col_vals_regex`: do string-based column data match a regular expression? - -```yaml -- col_vals_regex: - columns: [column_name] # REQUIRED: Column(s) to validate - pattern: "^[A-Z]{2,3}$" # REQUIRED: Regular expression pattern - na_pass: false # OPTIONAL: Pass NULL values - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values match pattern" # OPTIONAL: Step description -``` - -`col_vals_within_spec`: do column data conform to a specification (email, URL, postal codes, etc.)? 
- -```yaml -- col_vals_within_spec: - columns: [column_name] # REQUIRED: Column(s) to validate - spec: "email" # REQUIRED: Specification type - na_pass: false # OPTIONAL: Pass NULL values - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values match spec" # OPTIONAL: Step description -``` - -Available specification types: - -- `"email"` - Email addresses -- `"url"` - Internet URLs -- `"phone"` - Phone numbers -- `"ipv4"` - IPv4 addresses -- `"ipv6"` - IPv6 addresses -- `"mac"` - MAC addresses -- `"isbn"` - International Standard Book Numbers (10 or 13 digit) -- `"vin"` - Vehicle Identification Numbers -- `"credit_card"` - Credit card numbers (uses Luhn algorithm) -- `"swift"` - Business Identifier Codes (SWIFT-BIC) -- `"postal_code[]"` - Postal codes for specific countries (e.g., `"postal_code[US]"`, `"postal_code[CA]"`) -- `"zip"` - Alias for US ZIP codes (`"postal_code[US]"`) -- `"iban[]"` - International Bank Account Numbers (e.g., `"iban[DE]"`, `"iban[FR]"`) - -Examples: - -```yaml -# Email validation -- col_vals_within_spec: - columns: user_email - spec: "email" - -# US postal codes -- col_vals_within_spec: - columns: zip_code - spec: "postal_code[US]" - -# German IBAN -- col_vals_within_spec: - columns: account_number - spec: "iban[DE]" -``` - -#### Custom Expression Methods - -`col_vals_expr`: do column data agree with a predicate expression? 
- -```yaml -- col_vals_expr: - expr: # REQUIRED: Custom validation expression - python: | - pl.when(pl.col("status") == "active") - .then(pl.col("value") > 0) - .otherwise(pl.lit(True)) - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Custom validation rule" # OPTIONAL: Step description -``` - -#### Trend Validation Methods - -`col_vals_increasing`: are column data increasing row-by-row? - -```yaml -- col_vals_increasing: - columns: [column_name] # REQUIRED: Column(s) to validate - allow_stationary: false # OPTIONAL: Allow consecutive equal values (default: false) - decreasing_tol: 0.5 # OPTIONAL: Tolerance for negative movement (default: null) - na_pass: false # OPTIONAL: Pass NULL values - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values must increase" # OPTIONAL: Step description -``` - -This validation checks whether values in a column increase as you move down the rows. Useful for -validating time-series data, sequence numbers, or any monotonically increasing values. - -Parameters: - -- `allow_stationary`: If `true`, allows consecutive values to be equal (stationary phases). For -example, `[1, 2, 2, 3]` would pass when `true` but fail at the third value when `false`. -- `decreasing_tol`: Absolute tolerance for negative movement. Setting this to `0.5` means values can -decrease by up to 0.5 units and still pass. Setting any value also sets `allow_stationary` to `true`. 
- -Examples: - -```yaml -# Strict increasing validation -- col_vals_increasing: - columns: timestamp_seconds - brief: "Timestamps must strictly increase" - -# Allow stationary values -- col_vals_increasing: - columns: version_number - allow_stationary: true - brief: "Version numbers should increase (ties allowed)" - -# With tolerance for small decreases -- col_vals_increasing: - columns: temperature - decreasing_tol: 0.1 - brief: "Temperature trend (small drops allowed)" -``` - -`col_vals_decreasing`: are column data decreasing row-by-row? - -```yaml -- col_vals_decreasing: - columns: [column_name] # REQUIRED: Column(s) to validate - allow_stationary: false # OPTIONAL: Allow consecutive equal values (default: false) - increasing_tol: 0.5 # OPTIONAL: Tolerance for positive movement (default: null) - na_pass: false # OPTIONAL: Pass NULL values - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Values must decrease" # OPTIONAL: Step description -``` - -This validation checks whether values in a column decrease as you move down the rows. Useful for -countdown timers, inventory depletion, or any monotonically decreasing values. - -Parameters: - -- `allow_stationary`: If `true`, allows consecutive values to be equal (stationary phases). For -example, `[10, 8, 8, 5]` would pass when `true` but fail at the third value when `false`. -- `increasing_tol`: Absolute tolerance for positive movement. Setting this to `0.5` means values can -increase by up to 0.5 units and still pass. Setting any value also sets `allow_stationary` to `true`. 
- -Examples: - -```yaml -# Strict decreasing validation -- col_vals_decreasing: - columns: countdown_timer - brief: "Timer must strictly decrease" - -# Allow stationary values -- col_vals_decreasing: - columns: priority_score - allow_stationary: true - brief: "Priority scores should decrease (ties allowed)" - -# With tolerance for small increases -- col_vals_decreasing: - columns: stock_level - increasing_tol: 5 - brief: "Stock levels decrease (small restocks allowed)" -``` - -### Row-based Validations - -`rows_distinct`: are row data distinct? - -```yaml -- rows_distinct # Simple form - -- rows_distinct: # With parameters - columns_subset: [col1, col2] # OPTIONAL: Check subset of columns - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "No duplicate rows" # OPTIONAL: Step description -``` - -`rows_complete`: are row data complete? - -```yaml -- rows_complete # Simple form - -- rows_complete: # With parameters - columns_subset: [col1, col2] # OPTIONAL: Check subset of columns - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Complete rows only" # OPTIONAL: Step description -``` - -### Structure Validations - -`col_exists`: does column exist in the table? - -```yaml -- col_exists: - columns: [col1, col2, col3] # REQUIRED: Column(s) that must exist - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Required columns exist" # OPTIONAL: Step description -``` - -`col_schema_match`: does the table have expected column names and data types? 
- -```yaml -- col_schema_match: - schema: # REQUIRED: Expected schema - columns: - - [column_name, "data_type"] # Column with type validation - - column_name # Column name only (no type check) - - [column_name] # Alternative syntax - complete: true # OPTIONAL: Require exact column set - in_order: true # OPTIONAL: Require exact column order - case_sensitive_colnames: true # OPTIONAL: Case-sensitive column names - case_sensitive_dtypes: true # OPTIONAL: Case-sensitive data types - full_match_dtypes: true # OPTIONAL: Exact type matching - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Schema validation" # OPTIONAL: Step description -``` - -`row_count_match`: does the table have n rows? - -```yaml -- row_count_match: - count: 1000 # REQUIRED: Expected row count - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Expected row count" # OPTIONAL: Step description -``` - -`col_count_match`: does the table have n columns? - -```yaml -- col_count_match: - count: 10 # REQUIRED: Expected column count - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Expected column count" # OPTIONAL: Step description -``` - -`tbl_match`: does the table match a comparison table? 
- -```yaml -- tbl_match: - tbl_compare: # REQUIRED: Comparison table - python: | - pb.load_dataset("reference_table", tbl_type="polars") - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.0 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Table structure matches" # OPTIONAL: Step description -``` - -This validation performs a comprehensive comparison between the target table and a comparison table, -using progressively stricter checks: - -1. **Column count match**: both tables have the same number of columns -2. **Row count match**: both tables have the same number of rows -3. **Schema match (loose)**: column names and dtypes match (case-insensitive, any order) -4. **Schema match (order)**: columns in correct order (case-insensitive names) -5. **Schema match (exact)**: column names match exactly (case-sensitive, correct order) -6. **Data match**: values in corresponding cells are identical - -The validation fails at the first check that doesn't pass, making it easy to diagnose mismatches. -This operates over a single test unit (pass/fail for complete table match). - -**Cross-backend validation**: `tbl_match()` supports automatic backend coercion when comparing tables -from different backends (e.g., Polars vs. Pandas, DuckDB vs. SQLite). The comparison table is -automatically converted to match the target table's backend. 
- -Examples: - -```yaml -# Compare against reference dataset -- tbl_match: - tbl_compare: - python: | - pb.load_dataset("expected_output", tbl_type="polars") - brief: "Output matches expected results" - -# Compare against CSV file -- tbl_match: - tbl_compare: - python: | - pl.read_csv("reference_data.csv") - brief: "Matches reference CSV" - -# Compare with preprocessing on target table only -- tbl_match: - tbl_compare: - python: | - pb.load_dataset("reference_table", tbl_type="polars") - pre: | - lambda df: df.select(["id", "name", "value"]) - brief: "Selected columns match reference" -``` - -### Special Validation Methods - -`conjointly`: are multiple validations having a joint dependency? - -```yaml -- conjointly: - expressions: # REQUIRED: List of lambda expressions - - "lambda df: df['d'] > df['a']" - - "lambda df: df['a'] > 0" - - "lambda df: df['a'] + df['d'] < 12000" - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "All conditions must pass" # OPTIONAL: Step description -``` - -`specially`: do table data pass a custom validation function? 
- -```yaml -- specially: - expr: # REQUIRED: Custom validation function - "lambda df: df.select(pl.col('a') + pl.col('d') > 0)" - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Custom validation" # OPTIONAL: Step description -``` - -Alternative syntax with Python expressions: - -```yaml -- specially: - expr: - python: | - lambda df: df.select(pl.col('amount') > 0) -``` - -For Pandas DataFrames (when using `df_library: pandas`): - -```yaml -- specially: - expr: "lambda df: df.assign(is_valid=df['a'] + df['d'] > 0)" -``` - -### AI-Powered Validation - -`prompt`: validate rows using AI/LLM-powered analysis - -```yaml -- prompt: - prompt: "Values should be positive and realistic" # REQUIRED: Natural language criteria - model: "anthropic:claude-sonnet-4" # REQUIRED: Model identifier - columns_subset: [column1, column2] # OPTIONAL: Columns to validate - batch_size: 1000 # OPTIONAL: Rows per batch (default: 1000) - max_concurrent: 3 # OPTIONAL: Concurrent API requests (default: 3) - pre: | # OPTIONAL: Data preprocessing - lambda df: df.filter(condition) - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "AI validation" # OPTIONAL: Step description -``` - -This validation method uses Large Language Models (LLMs) to validate rows of data based on natural -language criteria. Each row becomes a test unit that either passes or fails the validation criteria, -producing binary True/False results that integrate with standard Pointblank reporting. 
- -**Supported models:** - -- **Anthropic**: `"anthropic:claude-sonnet-4"`, `"anthropic:claude-opus-4"` -- **OpenAI**: `"openai:gpt-4"`, `"openai:gpt-4-turbo"`, `"openai:gpt-3.5-turbo"` -- **Ollama**: `"ollama:<model-name>"` (e.g., `"ollama:llama3"`) -- **Bedrock**: `"bedrock:<model-name>"` - -**Authentication**: API keys are automatically loaded from environment variables or `.env` files: - -- **OpenAI**: Set `OPENAI_API_KEY` environment variable or add to `.env` file -- **Anthropic**: Set `ANTHROPIC_API_KEY` environment variable or add to `.env` file -- **Ollama**: No API key required (runs locally) -- **Bedrock**: Configure AWS credentials through standard AWS methods - -Example `.env` file: - -```plaintext -ANTHROPIC_API_KEY="your_anthropic_api_key_here" -OPENAI_API_KEY="your_openai_api_key_here" -``` - -**Performance optimization**: The validation process uses row signature memoization to avoid -redundant LLM calls. When multiple rows have identical values in the selected columns, only one -representative row is validated, and the result is applied to all matching rows. This dramatically -reduces API costs and processing time for datasets with repetitive patterns. 
- -Examples: - -```yaml -# Basic AI validation -- prompt: - prompt: "Email addresses should look realistic and professional" - model: "anthropic:claude-sonnet-4" - columns_subset: [email] - -# Complex semantic validation -- prompt: - prompt: "Product descriptions should mention the product category and include at least one benefit" - model: "openai:gpt-4" - columns_subset: [product_name, description, category] - batch_size: 500 - max_concurrent: 5 - -# Sentiment analysis -- prompt: - prompt: "Customer feedback should express positive sentiment" - model: "anthropic:claude-sonnet-4" - columns_subset: [feedback_text, rating] - -# Context-dependent validation -- prompt: - prompt: "For high-value transactions (amount > 1000), a detailed justification should be provided" - model: "openai:gpt-4" - columns_subset: [amount, justification, approver] - thresholds: - warning: 0.05 - error: 0.15 - -# Local model with Ollama -- prompt: - prompt: "Transaction descriptions should be clear and professional" - model: "ollama:llama3" - columns_subset: [description] -``` - -**Best practices for AI validation:** - -- Be specific and clear in your prompt criteria -- Include only necessary columns in `columns_subset` to reduce API costs -- Start with smaller `batch_size` for testing, increase for production -- Adjust `max_concurrent` based on API rate limits -- Use thresholds appropriate for probabilistic validation results -- Consider cost implications for large datasets -- Test prompts on sample data before full deployment - -**When to use AI validation:** - -- Semantic checks (e.g., "does the description match the category?") -- Context-dependent validation (e.g., "is the justification appropriate for the amount?") -- Subjective quality assessment (e.g., "is the text professional?") -- Pattern recognition that's hard to express programmatically -- Natural language understanding tasks - -**When NOT to use AI validation:** - -- Simple numeric comparisons (use `col_vals_gt`, 
`col_vals_lt`, etc.) -- Exact pattern matching (use `col_vals_regex`) -- Schema validation (use `col_schema_match`) -- Performance-critical validations with large datasets -- When deterministic results are required - -### Data Quality Methods - -`col_pct_null`: is the percentage of null values in a column within bounds? - -```yaml -- col_pct_null: - columns: [column_name] # REQUIRED: Column(s) to validate - value: 0.05 # REQUIRED: Maximum allowed null fraction - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Null rate check" # OPTIONAL: Step description -``` - -`data_freshness`: is the data in a date/datetime column recent? - -```yaml -- data_freshness: - columns: [date_column] # REQUIRED: Date/datetime column - freshness: "24h" # REQUIRED: Maximum age of data - thresholds: # OPTIONAL: Step-level thresholds - warning: 0.1 - actions: # OPTIONAL: Step-level actions - warning: "Custom message" - brief: "Data is recent" # OPTIONAL: Step description -``` - -### Aggregate Validations - -Aggregate methods validate column-level statistics (sum, average, standard deviation) against a -threshold. 
They follow the pattern `col_{stat}_{comparator}`: - -```yaml -# Sum validations -- col_sum_gt: - columns: [revenue] - value: 0 - brief: "Total revenue is positive" - -# Average validations -- col_avg_le: - columns: [rating] - value: 5 - brief: "Average rating at most 5" - -# Standard deviation validations -- col_sd_lt: - columns: [temperature] - value: 10 - brief: "Temperature variation is bounded" -``` - -Available aggregate methods: - -- **Sum**: `col_sum_gt`, `col_sum_lt`, `col_sum_ge`, `col_sum_le`, `col_sum_eq` -- **Average**: `col_avg_gt`, `col_avg_lt`, `col_avg_ge`, `col_avg_le`, `col_avg_eq` -- **Standard deviation**: `col_sd_gt`, `col_sd_lt`, `col_sd_ge`, `col_sd_le`, `col_sd_eq` - -All aggregate methods accept these common parameters: `columns`, `value`, `thresholds`, `actions`, -`brief`, `active`, and `pre`. - -## Column Selection Patterns - -All validation methods that accept a `columns` parameter support these selection patterns: - -```yaml -# Single column -columns: column_name - -# Multiple columns as list -columns: [col1, col2, col3] - -# Column selector functions (when used in Python expressions) -columns: - python: | - starts_with("prefix_") - -# Examples of common patterns -columns: [customer_id, order_id] # Specific columns -columns: user_email # Single column -``` - -## Parameter Details - -### Common Parameters - -These parameters are available for most validation methods: - -- `columns`: column selection (string, list, or selector expression) -- `na_pass`: whether to pass NULL/missing values (boolean, default: false) -- `pre`: data preprocessing function (Python lambda expression) -- `thresholds`: step-level failure thresholds (dict) -- `actions`: step-level failure actions (dict) -- `brief`: step description (string, boolean, or template) -- `active`: whether the step is active (boolean, default: true) - -### Active Parameter - -The `active` parameter controls whether a validation step runs. 
It defaults to `true`; set it to -`false` to skip a step without removing it from the configuration: - -```yaml -steps: - # This step will be skipped - - col_vals_gt: - columns: [amount] - value: 0 - active: false - - # This step runs normally (default active: true) - - col_vals_not_null: - columns: [customer_id] -``` - -### Brief Parameter Options - -The `brief` parameter supports several formats: - -```yaml -brief: "Custom description" # Custom text -brief: true # Auto-generated description -brief: false # No description -brief: "Step {step}: {auto}" # Template with auto-generated text -brief: "Column '{col}' validation" # Template with variables -``` - -template variables: `{step}`, `{col}`, `{value}`, `{set}`, `{pattern}`, `{auto}` - -### Python Expressions - -Several parameters support Python expressions using the `python:` block syntax: - -```yaml -# Data source loading -tbl: - python: | - pl.scan_csv("data.csv").filter(pl.col("active") == True) - -# Preprocessing -pre: - python: | - lambda df: df.filter(pl.col("date") >= "2024-01-01") - -# Custom expressions -expr: - python: | - pl.col("value").is_between(0, 100) - -# Callable actions -actions: - error: - python: | - lambda: print("VALIDATION ERROR: Critical data quality issue detected!") -``` - -Note: The Python environment in YAML is restricted for security. Only built-in functions (`print`, -`len`, `str`, etc.), `Path` from pathlib, and available DataFrame libraries (`pl`, `pd`) are -accessible. You cannot import additional modules like `requests`, `logging`, or custom libraries. - -You can also use the shortcut syntax for lambda expressions: - -```yaml -# Shortcut syntax (equivalent to python: block) -pre: | - lambda df: df.filter(pl.col("status") == "active") -``` - -### Restricted Python Environment - -For security reasons, the Python environment in YAML configurations is restricted to a safe subset -of functionality. 
The available namespace includes: - -Built-in functions: - -- basic types: `str`, `int`, `float`, `bool`, `list`, `dict`, `tuple`, `set` -- math functions: `sum`, `min`, `max`, `abs`, `round`, `len` -- iteration: `range`, `enumerate`, `zip` -- output: `print` - -Available modules: - -- `Path` from pathlib for file path operations -- `pb` (`pointblank`) for dataset loading and validation functions -- `pl` (`polars`) if available on the system -- `pd` (`pandas`) if available on the system - -Restrictions: - -- cannot import external libraries (`requests`, `logging`, `os`, `sys`, etc.) -- cannot use `__import__`, `exec`, `eval`, or other dynamic execution functions -- file operations are limited to `Path` functionality - -**Examples of valid callable actions:** - -```yaml -# Simple output with built-in functions -actions: - warning: - python: | - lambda: print(f"WARNING: {sum([1, 2, 3])} validation issues detected") - -# Using available variables and string formatting -actions: - error: - python: | - lambda: print("ERROR: Data validation failed at " + str(len("validation"))) - -# Multiple statements in lambda (using parentheses) -actions: - critical: - python: | - lambda: ( - print("CRITICAL ALERT:"), - print("Immediate attention required"), - print("Contact data team") - )[-1] # Return the last value -``` - -For complex alerting, logging, or external system integration, use string template actions instead -of callable actions, and handle the external communication in your application code after validation -completes. - -## Best Practices - -### Organization - -- use descriptive `tbl_name` and `label` values -- add `brief` descriptions for complex validations -- group related validations logically -- use consistent indentation and formatting - -### Performance - -- apply `pre` filters early to reduce data volume -- order validations from fast to slow -- use `columns_subset` for row-based validations when appropriate -- consider data source location (local vs. 
remote) -- choose `df_library` based on data size and operations: - - `polars`: fastest for large datasets and analytical operations - - `pandas`: best for complex transformations and data science workflows - - `duckdb`: optimal for analytical queries on very large datasets - -### Maintainability - -- store YAML files in version control -- use template variables in actions and briefs -- document expected failures with comments -- test configurations with `validate_yaml()` before deployment -- specify `df_library` explicitly when using library-specific validation expressions -- keep DataFrame library choice consistent within related validation workflows - -### Error Handling - -- set appropriate thresholds based on data patterns -- use actions for monitoring and alerting -- start with conservative thresholds and adjust -- consider using `highest_only: false` for comprehensive reporting diff --git a/docs/user-guide/yaml-validation-workflows.qmd b/docs/user-guide/yaml-validation-workflows.qmd deleted file mode 100644 index c0e25262ea..0000000000 --- a/docs/user-guide/yaml-validation-workflows.qmd +++ /dev/null @@ -1,1041 +0,0 @@ ---- -title: YAML Validation Workflows -jupyter: python3 -toc-expand: 2 -html-table-processing: none -bread-crumbs: true ---- - -Pointblank supports defining validation workflows using YAML configuration files, providing a -declarative, readable, and maintainable approach to data validation. YAML workflows are particularly -useful for teams, version control, automation pipelines, and scenarios where you want to separate -validation logic from application code. - -YAML validation workflows offer several advantages: they're easy to read and write, can be version -controlled alongside your data processing code, enable non-programmers to contribute to data quality -definitions, and provide a clear separation between validation logic and execution code. 
- -The YAML approach complements Pointblank's Python API, giving you flexibility to choose the right -tool for each situation. Simple, repetitive validations work well in YAML, while complex logic with -custom functions might be better suited for the Python API. - -## Basic YAML Validation Structure - -A YAML validation workflow consists of a few key components: - -- `tbl`: specifies the data source (file path, dataset name, or Python expression) -- `steps`: defines the validation checks to perform -- Optional metadata: table name, label, thresholds, actions, and other configuration - -Here's a simple example validating the built-in `small_table` dataset: - -```yaml -tbl: small_table -df_library: polars # Optional: specify DataFrame library -tbl_name: "Small Table Validation" -label: "Basic data quality checks" -steps: - - rows_distinct - - col_exists: - columns: [a, b, c, d] - - col_vals_not_null: - columns: [a, b] -``` - -You can save this configuration to a .yaml file and execute it using the `yaml_interrogate()` -function: - -```{python} -import pointblank as pb -from pathlib import Path - -# Save the YAML configuration to a file -yaml_content = """ -tbl: small_table -df_library: polars -tbl_name: "Small Table Validation" -label: "Basic data quality checks" -steps: - - rows_distinct - - col_exists: - columns: [a, b, c, d] - - col_vals_not_null: - columns: [a, b] -""" - -yaml_file = Path("basic_validation.yaml") -yaml_file.write_text(yaml_content) - -# Execute the validation from the file -result = pb.yaml_interrogate(yaml_file) -result -``` - -The validation table shows the results of each step, just as if you had written the equivalent -Python code. You can also pass YAML content directly as a string for quick testing, but working -with files is the recommended approach for production workflows. - -## Data Sources in YAML - -The `tbl` field supports various data source types, making it easy to work with different kinds of -data. 
You can also control the DataFrame library used for loading data with the `df_library` -parameter. - -### DataFrame Library Selection - -By default, Pointblank loads data as Polars DataFrames, but you can specify alternative libraries: - -```yaml -# Load as Polars DataFrame (default) -tbl: small_table -df_library: polars - -# Load as Pandas DataFrame -tbl: small_table -df_library: pandas - -# Load as DuckDB table (via Ibis) -tbl: small_table -df_library: duckdb -``` - -This is particularly useful when using validation expressions that require specific DataFrame APIs: - -```yaml -# Using Pandas-specific operations -tbl: small_table -df_library: pandas -steps: - - specially: - expr: "lambda df: df.assign(total=df['a'] + df['d'])" - -# Using Polars-specific operations -tbl: small_table -df_library: polars -steps: - - specially: - expr: "lambda df: df.select(pl.col('a') + pl.col('d') > 0)" -``` - -### File-based Sources - -```yaml -# CSV files (respects df_library setting) -tbl: "data/customers.csv" -df_library: pandas - -# Parquet files -tbl: "warehouse/sales.parquet" -df_library: polars - -# Multiple files with patterns -tbl: "logs/*.parquet" -``` - -### Built-in Datasets - -```yaml -# Use Pointblank's built-in datasets -tbl: small_table -tbl: game_revenue -tbl: nycflights -``` - -### Python Expressions for Complex Sources - -For more complex data loading, use the `python:` block syntax. 
This syntax can be used with several -parameters throughout your YAML configuration: - -- `tbl`: For complex data source loading (as shown below) -- `expr`: For custom validation expressions in `col_vals_expr` -- `pre`: For data preprocessing before validation steps -- `actions`: For callable action functions (`warning`, `error`, `critical`, and `default`) - -```yaml -# Load data with custom Polars operations -tbl: - python: | - pl.scan_csv("sales_data.csv") - .filter(pl.col("date") >= "2024-01-01") - .head(1000) - -# Load from a database connection -tbl: - python: | - pl.read_database( - query="SELECT * FROM customers WHERE active = true", - connection="postgresql://user:pass@localhost/db" - ) -``` - -## Reusable Templates with `set_tbl=` - -One of the most powerful features of YAML validation workflows is the ability to create reusable -templates that can be applied to different datasets. Using the `set_tbl=` parameter with -`yaml_interrogate()`, you can define validation logic once and apply it to multiple data sources. - -### Creating Validation Templates - -When creating templates for use with `set_tbl=`, the `tbl` field is still required but its value -will be overridden. 
The recommended approach is to use `tbl: null`: - -```yaml -tbl: null -tbl_name: "Sales Data Validation Template" -label: "Standard validation checks for sales data" -steps: - - col_exists: - columns: [customer_id, revenue, region, date] - - col_vals_not_null: - columns: [customer_id, revenue] - - col_vals_gt: - columns: [revenue] - value: 0 - - col_vals_in_set: - columns: [region] - set: [North, South, East, West] -``` - -### Applying Templates to Multiple Datasets - -Here's a practical example showing how to apply the same validation template to multiple quarterly -datasets, demonstrating the power of reusable YAML configurations: - -```{python} -import pointblank as pb -import polars as pl - -# Define the template once -sales_template = """ -tbl: null # Will be overridden -tbl_name: "Sales Data Validation" -label: "Standard sales validation checks" -thresholds: - warning: 0.05 - error: 0.1 -steps: - - col_exists: - columns: [customer_id, revenue, region] - - col_vals_not_null: - columns: [customer_id, revenue] - - col_vals_gt: - columns: [revenue] - value: 0 - - col_vals_in_set: - columns: [region] - set: [North, South, East, West] -""" - -# Create different datasets -q1_data = pl.DataFrame({ - "customer_id": [1, 2, 3, 4], - "revenue": [100, 200, 150, 300], - "region": ["North", "South", "East", "West"] -}) - -q2_data = pl.DataFrame({ - "customer_id": [5, 6, 7, 8], - "revenue": [250, 180, 220, 350], - "region": ["South", "North", "West", "East"] -}) - -# Apply the same template to both datasets -q1_result = pb.yaml_interrogate(sales_template, set_tbl=q1_data) -q2_result = pb.yaml_interrogate(sales_template, set_tbl=q2_data) - -print(f"Q1 validation: {all(v.all_passed for v in q1_result.validation_info)}") -print(f"Q2 validation: {all(v.all_passed for v in q2_result.validation_info)}") -``` - -### Template Best Practices - -1. **Use `tbl: null`**: this clearly indicates the template expects a data source to be provided -2. 
**Include comprehensive metadata**: use `tbl_name`, `label`, and `brief` to make results -self-documenting -3. **Set appropriate thresholds**: define warning/error levels that make sense for your use case -4. **Version control templates**: store templates in your repository alongside your data processing -code -5. **Test with sample data**: validate your templates work with representative datasets - -### Common Template Patterns - -For API response validation, you can ensure that responses have the expected structure and valid -status codes: - -```yaml -tbl: null -tbl_name: "API Response Validation" -brief: "Standard checks for API response data" -steps: - - col_exists: - columns: [user_id, status, timestamp] - - col_vals_in_set: - columns: [status] - set: [success, error, pending] - - col_vals_not_null: - columns: [user_id, timestamp] -``` - -For file upload validation, you can check file sizes and formats to ensure they meet your -requirements: - -```yaml -tbl: null -tbl_name: "File Upload Validation" -steps: - - col_vals_gt: - columns: [file_size] - value: 0 - - col_vals_lt: - columns: [file_size] - value: 10485760 # 10MB limit - - col_vals_in_set: - columns: [file_type] - set: [csv, json, xlsx, parquet] -``` - -This template approach is particularly valuable in data pipelines, ETL processes, and automated -testing scenarios where you need to apply consistent validation logic across multiple similar -datasets. - -## Validation Steps - -YAML supports all of Pointblank's validation methods. 
Here are some common patterns: - -### Column-based Validations - -```yaml -tbl: worldcities.csv -steps: - # Check for missing values - - col_vals_not_null: - columns: [city_name, country] - - # Validate value ranges - - col_vals_between: - columns: latitude - left: -90 - right: 90 - - # Check set membership - - col_vals_in_set: - columns: country_code - set: [US, CA, MX, UK, DE, FR] - - # Regular expression validation - - col_vals_regex: - columns: postal_code - pattern: "^[0-9]{5}(-[0-9]{4})?$" -``` - -### Row-based Validations - -```yaml -tbl: sales_data.csv -steps: - # Check for duplicate rows - - rows_distinct - - # Ensure complete rows (no missing values) - - rows_complete - - # Check row count - - row_count_match: - count: 1000 -``` - -### Schema Validations - -Schema validation ensures your data has the expected structure and column types. The -`col_schema_match` validation method uses a `schema` key that contains a `columns` list, where each -item in the list can specify a column name alone or a column name with its expected data type. 
- -Each `column` entry can be specified as: - -- `column_name`: column name as a scalar string (structure validation, no type checking) -- `[column_name, "data_type"]`: column name with type validation (as a list with two elements) -- `[column_name]`: column name in a single-item list (equivalent to scalar, for consistency) - -```yaml -tbl: customer_data.csv -steps: - # Complete schema validation (structure and types) - - col_schema_match: - schema: - columns: - - [customer_id, "int64"] - - [name, "object"] - - [email, "object"] - - [signup_date, "datetime64[ns]"] - - # Structure-only validation (column names without types) - - col_schema_match: - schema: - columns: - - customer_id - - name - - email - complete: false - brief: "Check that core columns exist" -``` - -#### Schema Validation Options - -Schema validations support the full range of validation options: - -```yaml -tbl: data_file.csv -steps: - - col_schema_match: - schema: - columns: - - [id, "int64"] - - name - complete: false # Allow extra columns - in_order: false # Column order doesn't matter - case_sensitive_colnames: false # Case-insensitive column names - case_sensitive_dtypes: false # Case-insensitive type names - full_match_dtypes: false # Allow partial type matching - brief: "Flexible schema validation" -``` - -#### Other Structure Validations - -```yaml -tbl: customer_data.csv -steps: - # Check column count - - col_count_match: - count: 4 -``` - -### Trend Validations - -Validate that values follow increasing or decreasing patterns across rows: - -```yaml -tbl: time_series_data.csv -steps: - # Ensure timestamp values increase - - col_vals_increasing: - columns: timestamp - brief: "Timestamps must be in chronological order" - - # Validate countdown timer decreases - - col_vals_decreasing: - columns: countdown - allow_stationary: true - brief: "Countdown values should decrease (ties allowed)" - - # Check trend with tolerance - - col_vals_increasing: - columns: temperature - decreasing_tol: 0.5 - 
brief: "Temperature trends upward (small drops < 0.5°C allowed)" -``` - -### Specification-based Validations - -Validate values against common data specifications like email addresses, URLs, postal codes, and -more: - -```yaml -tbl: user_contact_info.csv -steps: - # Validate email addresses - - col_vals_within_spec: - columns: email - spec: "email" - - # Validate US ZIP codes - - col_vals_within_spec: - columns: zip_code - spec: "postal_code[US]" - - # Validate URLs - - col_vals_within_spec: - columns: website - spec: "url" - na_pass: true -``` - -Available specifications include: `"email"`, `"url"`, `"phone"`, `"ipv4"`, `"ipv6"`, `"mac"`, -`"isbn"`, `"vin"`, `"credit_card"`, `"swift"`, `"postal_code[<country_code>]"`, `"iban[<country_code>]"`. - -### Table Comparison - -Validate that an entire table matches a reference table: - -```yaml -tbl: processed_output.csv -steps: - # Compare against expected output - - tbl_match: - tbl_compare: - python: | - pb.load_dataset("expected_output", tbl_type="polars") - brief: "Output matches expected results" -``` - -The `tbl_match()` validation performs comprehensive comparison including column count, row count, -schema, and data values. It supports cross-backend validation (e.g., comparing Polars vs. Pandas -DataFrames). 
- -### AI-Powered Validation - -Use Large Language Models to validate data based on natural language criteria: - -```yaml -tbl: customer_feedback.csv -steps: - # Validate sentiment - - prompt: - prompt: "Customer feedback should express positive sentiment" - model: "anthropic:claude-sonnet-4" - columns_subset: [feedback_text, rating] - batch_size: 500 - thresholds: - warning: 0.1 - - # Validate semantic correctness - - prompt: - prompt: "Product descriptions should mention the product category and at least one benefit" - model: "openai:gpt-4" - columns_subset: [product_name, description, category] -``` - -**Note**: AI validations require API keys to be set as environment variables (e.g., -`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`) or in a `.env` file. These validations are best suited for -semantic, context-dependent, or subjective quality checks rather than simple numeric comparisons. - -## Thresholds and Severity Levels - -Thresholds determine when validation failures trigger different severity levels. You can set global -thresholds for the entire workflow: - -```yaml -tbl: sales_data.csv -tbl_name: "Sales Data Quality Check" -thresholds: - warning: 0.05 # 5% failure rate triggers warning - error: 0.10 # 10% failure rate triggers error - critical: 0.15 # 15% failure rate triggers critical -steps: - - col_vals_not_null: - columns: [customer_id, amount] - - col_vals_gt: - columns: amount - value: 0 -``` - -You can also set thresholds for individual validation steps: - -```yaml -tbl: user_data.csv -steps: - - col_vals_not_null: - columns: email - thresholds: - warning: 1 # Any missing email is a warning - error: 0.01 # 1% missing emails is an error - - - col_vals_regex: - columns: email - pattern: "^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$" - thresholds: - error: 1 # Any invalid email format is an error -``` - -## Actions: Responding to Validation Failures - -Actions define what happens when validation thresholds are exceeded. 
You can use string templates -with placeholder variables or callable functions. - -### String Template Actions - -```yaml -tbl: orders.csv -thresholds: - warning: 0.02 - error: 0.05 -actions: - warning: "Warning: Step {step} found {n_failed} failures in {col} column" - error: "Error in {TYPE} validation: {n_failed}/{n} rows failed (Step {step})" - critical: "Critical failure detected at {time}" -steps: - - col_vals_not_null: - columns: [order_id, customer_id] -``` - -Available template variables include: - -- `{step}`: validation step number -- `{col}`: column name being validated -- `{val}`: specific failing value (when applicable) -- `{n_failed}`: number of failing rows -- `{n}`: total number of rows checked -- `{TYPE}`: validation method name (e.g., "COL_VALS_NOT_NULL") -- `{LEVEL}`: severity level ("WARNING", "ERROR", "CRITICAL") -- `{time}`: timestamp of the validation - -### Callable Actions - -For more complex responses, use Python callable functions: - -```yaml -tbl: critical_data.csv -thresholds: - error: 1 -actions: - error: - python: | - lambda: print("ALERT: Critical data validation failed!") - critical: - python: | - lambda: print("CRITICAL: Validation failure - manual intervention required!") -steps: - - col_vals_not_null: - columns: [transaction_id, amount] -``` - -Note: The Python environment in YAML actions is restricted for security. You can use built-in -functions like `print()`, basic operations, and available DataFrame libraries, but cannot import -external modules like `requests` or `logging`. For external notifications, consider using string -template actions or handling alerts in your application code after the validation completes. - -### Step-level Actions - -You can also define actions for individual validation steps: - -```yaml -tbl: financial_data.csv -steps: - - col_vals_not_null: - columns: account_balance - thresholds: - error: 1 - actions: - error: "Missing account balance detected in step {step}." 
- - - col_vals_gt: - columns: account_balance - value: 0 - actions: - warning: - python: | - lambda: print("Negative balance warning triggered.") -``` - -## Advanced Features - -### Pre-processing with the `pre` Parameter - -You can apply data transformations before validation using the `pre` parameter: - -```yaml -tbl: transactions.csv -steps: - # Validate only recent transactions - - col_vals_gt: - columns: amount - value: 0 - pre: - python: | - lambda df: df.filter( - pl.col("transaction_date") >= "2024-01-01" - ) - - # Check completeness for active customers only - - col_vals_not_null: - columns: [email, phone] - pre: | - lambda df: df.filter(pl.col("status") == "active") -``` - -Note that you can use either the explicit `python:` block syntax or the shortcut syntax (just -`pre: |`) for the lambda expressions. - -### Complex Expressions - -For advanced validation logic, use a `col_vals_expr` step with custom expressions: - -```yaml -tbl: sales_data.csv -steps: - # Custom business logic validation - - col_vals_expr: - expr: - python: | - ( - pl.when(pl.col("product_type") == "premium") - .then(pl.col("price") >= 100) - .when(pl.col("product_type") == "standard") - .then(pl.col("price").is_between(20, 99)) - .otherwise(pl.col("price") <= 19) - ) -``` - -### Brief Descriptions - -Add human-readable descriptions to validation steps. 
The `brief` parameter supports string -templating and automatic generation: - -```yaml -tbl: customer_data.csv -brief: "Customer data quality validation for {auto}" -steps: - - col_vals_not_null: - columns: customer_id - brief: "Ensure all customers have valid IDs" - - - col_vals_regex: - columns: email - pattern: "^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$" - brief: "Validate email format compliance" - - - col_vals_between: - columns: age - left: 13 - right: 120 - brief: "Check reasonable age ranges" - - # Use automatic brief generation - - col_vals_not_null: - columns: phone_number - brief: true - - # Template variables in briefs - - col_vals_in_set: - columns: status - set: [active, inactive, pending] - brief: "Column '{col}' must be one of: {set}" -``` - -Brief Templating Options: - -- custom strings: Write your own descriptive text -- `true`: Automatically generates a brief based on the validation method and parameters -- `{auto}`: Placeholder for auto-generated text within custom strings -- template variables: Use the same variables available in actions: - - `{col}`: column name(s) being validated - - `{step}`: the step number in the validation plan - - `{value}`: the comparison value used in the validation (for single-value comparisons) - - `{pattern}`: for regex validations, the pattern being matched - -### Governance Metadata - -YAML workflows support governance metadata that identifies ownership and usage of validation -workflows. These fields are embedded in the validation report: - -```yaml -tbl: sales_data.csv -tbl_name: "Sales Pipeline" -owner: "Data Engineering" -consumers: [Analytics Team, Finance, Compliance] -version: "2.1.0" -steps: - - col_vals_not_null: - columns: [customer_id, revenue] - - col_vals_gt: - columns: [revenue] - value: 0 -``` - -The `owner`, `consumers`, and `version` fields are forwarded to the `Validate` constructor and -appear in the validation report header. These fields are optional and do not affect validation -behavior. 
- -### Data Freshness and Null Percentage - -Two additional validation methods support common data quality checks: - -**`data_freshness`**: Validate that a date/datetime column has recent data: - -```yaml -steps: - - data_freshness: - columns: event_date - freshness: "24h" -``` - -**`col_pct_null`**: Validate that the percentage of null values is within bounds: - -```yaml -steps: - - col_pct_null: - columns: [email, phone] - value: 0.05 -``` - -### Aggregate Validations - -Aggregate methods validate column-level statistics like sum, average, and standard deviation: - -```yaml -steps: - # Check that total revenue is positive - - col_sum_gt: - columns: [revenue] - value: 0 - - # Validate average rating is at most 5 - - col_avg_le: - columns: [rating] - value: 5 - - # Ensure temperature variation is bounded - - col_sd_lt: - columns: [temperature] - value: 10 -``` - -Available methods follow the `col_{stat}_{comparator}` pattern where `{stat}` is `sum`, `avg`, or -`sd`, and `{comparator}` is `gt`, `lt`, `ge`, `le`, `eq`, `between`, or `outside`. - -### Step Activation Control - -The `active` parameter allows you to temporarily disable validation steps without removing them -from the configuration: - -```yaml -steps: - # This step is disabled - - col_vals_gt: - columns: [amount] - value: 0 - active: false - - # This step runs normally (active: true is the default) - - col_vals_not_null: - columns: [customer_id] -``` - -This is useful for debugging, phased rollouts, or temporarily skipping steps that are known to fail. 
- -### Reference Tables - -The `reference` top-level key specifies a reference table for comparison-based validations: - -```yaml -tbl: current_data.csv -reference: - python: | - pb.load_dataset("baseline_data", tbl_type="polars") -steps: - - tbl_match: - tbl_compare: - python: | - pb.load_dataset("baseline_data", tbl_type="polars") -``` - -## Working with YAML Files - -### Loading from Files - -You can save your YAML configuration to files and load them: - -```{python} -# Create a YAML file -yaml_content = """ -tbl: small_table -tbl_name: "File-based Validation" -steps: - - col_vals_between: - columns: c - left: 1 - right: 10 - - col_vals_in_set: - columns: f - set: [low, mid, high] -""" - -# Save to file -from pathlib import Path -yaml_file = Path("validation_config.yaml") -yaml_file.write_text(yaml_content) - -# Load and execute -result = pb.yaml_interrogate(yaml_file) -result -``` - -### Converting YAML to Python - -Use `yaml_to_python()` to generate equivalent Python code from your YAML configuration: - -```{python} -yaml_config = """ -tbl: small_table -tbl_name: "Example Validation" -thresholds: - warning: 0.1 - error: 0.2 -actions: - warning: "Warning: {TYPE} validation failed" -steps: - - col_vals_gt: - columns: a - value: 0 - - col_vals_in_set: - columns: f - set: [low, mid, high] -""" - -# Generate Python code -python_code = pb.yaml_to_python(yaml_config) -print(python_code) -``` - -This is useful for: - -- learning how YAML maps to Python API calls -- transitioning from YAML to code-based workflows -- generating documentation that shows both approaches -- debugging YAML configurations - -## Practical Examples - -### Data Pipeline Validation - -Here's a comprehensive example for validating data in a processing pipeline: - -```yaml -tbl: - python: | - ( - pl.scan_csv("raw_data/customer_events.csv") - .filter(pl.col("event_date") >= "2024-01-01") - ) - -tbl_name: "Customer Events Pipeline Validation" -label: "Daily data quality check for customer events" - 
-thresholds: - warning: 0.01 # 1% failure rate - error: 0.05 # 5% failure rate - -actions: - warning: "Pipeline warning: {TYPE} validation found {n_failed} issues" - error: - python: | - lambda: print("ERROR: Pipeline validation failed - manual review required") - -steps: - # Schema validation - - col_schema_match: - schema: - columns: - - [customer_id, "int64"] - - [event_type, "object"] - - [event_date, "object"] - - [revenue, "float64"] - brief: "Validate table structure matches expected schema" - - # Data completeness - - col_vals_not_null: - columns: [customer_id, event_type, event_date] - brief: "Critical fields must be complete" - - # Business logic validation - - col_vals_in_set: - columns: event_type - set: [signup, purchase, cancellation, upgrade] - brief: "Event types must be from approved list" - - # Data quality checks - - col_vals_gt: - columns: revenue - value: 0 - na_pass: true - brief: "Revenue values must be positive when present" - - # Temporal validation - - col_vals_expr: - expr: - python: | - pl.col("event_date").str.strptime(pl.Date, "%Y-%m-%d").is_not_null() - brief: "Event dates must be valid YYYY-MM-DD format" -``` - -### Quality Monitoring Dashboard - -For ongoing data quality monitoring: - -```yaml -tbl: warehouse/daily_metrics.parquet -tbl_name: "Daily Metrics Quality Check" - -thresholds: - warning: 5 # 5 failing rows - error: 50 # 50 failing rows - critical: 100 # 100 failing rows - -actions: - warning: "Quality check warning: {n_failed} rows failed {TYPE} validation" - error: "Quality degradation detected: Step {step} failed for {n_failed}/{n} rows" - critical: - python: | - lambda: print("CRITICAL: Data quality failure detected - immediate attention required") - highest_only: false - -steps: - - row_count_match: - count: 10000 - brief: "Verify expected daily record count" - - - col_vals_not_null: - columns: [date, metric_value, source_system] - brief: "Core fields must be complete" - - - col_vals_between: - columns: metric_value - 
left: 0 - right: 1000000 - brief: "Metric values within reasonable range" - - - rows_distinct: - columns_subset: [date, metric_name, source_system] - brief: "No duplicate metric records per day" -``` - -## Best Practices - -### Organization and Structure - -1. use descriptive names: give your validations clear `tbl_name` and `label` values -2. add brief descriptions: document what each validation step checks -3. group related validations: organize steps logically (schema, completeness, business rules) -4. version control: store YAML files in git alongside your data processing code - -### Error Handling and Monitoring - -1. set appropriate thresholds: start conservative and adjust based on your data patterns -2. use actions for alerting: set up notifications for critical failures -3. document expected failures: some data quality issues might be acceptable -4. monitor validation results: track validation performance over time - -### Performance Considerations - -1. use the `pre` parameter efficiently: apply filters early to reduce data volume -2. order validations strategically: put fast, likely-to-fail checks first -3. consider data source location: local files are faster than remote sources -4. use appropriate column selections: only validate the columns you need - -## Wrapping Up - -YAML validation workflows provide a powerful, declarative approach to data validation in Pointblank. -Such workflows are great at expressing common validation patterns in a readable format that can be -easily shared, version controlled, and maintained by teams. 
- -Key advantages of YAML workflows: - -- readable: non-programmers can understand and contribute to validation logic -- maintainable: easy to modify validation rules without changing application code -- portable: YAML files can be shared between projects and teams -- version controlled: track changes to validation logic over time -- flexible: support for simple checks and complex custom logic - -Use YAML workflows when you want declarative, maintainable validation definitions, and fall back to -the Python API when you need complex programmatic logic or tight integration with application code. -The two approaches complement each other well and can be used together as your validation needs -evolve. diff --git a/pointblank/_utils_ai.py b/pointblank/_utils_ai.py index 35df2f9a8f..d853427ea0 100644 --- a/pointblank/_utils_ai.py +++ b/pointblank/_utils_ai.py @@ -27,7 +27,7 @@ class _LLMConfig: provider LLM provider name (e.g., 'anthropic', 'openai', 'ollama', 'bedrock'). model - Model name (e.g., 'claude-sonnet-4-5', 'gpt-4'). + Model name (e.g., 'claude-opus-4-6', 'gpt-4'). api_key API key for the provider. If None, will be read from environment. verify_ssl diff --git a/pointblank/_utils_llms_txt.py b/pointblank/_utils_llms_txt.py index 3d42ad8d95..f4fdc1465a 100644 --- a/pointblank/_utils_llms_txt.py +++ b/pointblank/_utils_llms_txt.py @@ -1,15 +1,6 @@ import inspect import re from pathlib import Path -from typing import Optional -from urllib.parse import urljoin - -try: - import requests - - SCRAPING_AVAILABLE = True -except ImportError: - SCRAPING_AVAILABLE = False def get_api_details(module, exported_list) -> str: @@ -335,7 +326,7 @@ def _get_api_text() -> str: def _get_examples_text() -> str: """ Get the examples for the Pointblank library. These examples are extracted from the Quarto - documents in the `docs/demos` directory. + documents in the `examples` directory. 
Returns ------- @@ -349,65 +340,44 @@ def _get_examples_text() -> str: f"{sep_line}\nThis is a set of examples for the Pointblank library.\n{sep_line}\n\n" ) - # A large set of examples is available in the docs/demos directory, and each of the - # subdirectories contains a different example (in the form of a Quarto document) - - example_dirs = [ - "01-starter", - "02-advanced", - "03-data-extracts", - "04-sundered-data", - "05-step-report-column-check", - "06-step-report-schema-check", - "apply-checks-to-several-columns", - "check-row-column-counts", - "checks-for-missing", - "col-vals-custom-expr", - "column-selector-functions", - "comparisons-across-columns", - "expect-no-duplicate-rows", - "expect-no-duplicate-values", - "expect-text-pattern", - "failure-thresholds", - "mutate-table-in-step", - "numeric-comparisons", - "schema-check", - "set-membership", - "using-parquet-data", - ] + # Examples are organized in the examples/ directory under category subdirectories, + # each containing Quarto documents with title and description in YAML front matter - for example_dir in example_dirs: - link = f"https://posit-dev.github.io/pointblank/demos/{example_dir}/" + examples_dir = Path("examples") - # Read in the index.qmd file for each example - with open(f"docs/demos/{example_dir}/index.qmd", "r") as f: - example_text = f.read() + # Collect all .qmd files from category subdirectories, sorted for deterministic order + example_files = sorted(examples_dir.glob("*/*.qmd")) + + for example_file in example_files: + # Build the link URL from the file path (e.g., examples/01-getting-started/starter.html) + link = ( + f"https://posit-dev.github.io/pointblank/" + f"{example_file.parent.name}/{example_file.stem}.html" + ) - # Remove the first eight lines of the example text (contains the YAML front matter) - example_text = "\n".join(example_text.split("\n")[8:]) + example_text = example_file.read_text() - # Extract the title of the example (the line beginning with `###`) - 
title_match = re.search(r"### (.*)", example_text) - assert title_match is not None - title = title_match.group(1) + # Extract title and description from YAML front matter + title_match = re.search(r'^title:\s*"(.+?)"', example_text, re.MULTILINE) + desc_match = re.search(r'^description:\s*"(.+?)"', example_text, re.MULTILINE) - # The next line with text is the short description of the example - desc_match = re.search(r"(.*)\.", example_text) - assert desc_match is not None - desc = desc_match.group(1) + if not title_match or not desc_match: + continue - # Get all of the Python code blocks in the example - # these can be identified as starting with ```python and ending with ``` - code_blocks = re.findall(r"```python\n(.*?)```", example_text, re.DOTALL) + title = title_match.group(1) + desc = desc_match.group(1) - # Wrap each code block with a leading ```python and trailing ``` - code_blocks = [f"```python\n{code}```" for code in code_blocks] + # Get the plain ```python code blocks (not ```{python} executable blocks) + code_blocks = re.findall(r"```python\n(.*?)```", example_text, re.DOTALL) - # Collapse all code blocks into a single string - code_text = "\n\n".join(code_blocks) + # Wrap each code block with a leading ```python and trailing ``` + code_blocks = [f"```python\n{code}```" for code in code_blocks] - # Add the example title, description, and code to the examples text - examples_text += f"### {title} ({link})\n\n{desc}\n\n{code_text}\n\n" + # Collapse all code blocks into a single string + code_text = "\n\n".join(code_blocks) + + # Add the example title, description, and code to the examples text + examples_text += f"### {title} ({link})\n\n{desc}\n\n{code_text}\n\n" return examples_text @@ -426,295 +396,3 @@ def _get_api_and_examples_text() -> str: examples_text = _get_examples_text() return f"{api_text}\n\n{examples_text}" - - -def scrape_examples_index(base_url: str = "https://posit-dev.github.io/pointblank/") -> list[dict]: - """ - Parse the examples 
index page from local .qmd file to extract demo titles and descriptions. - - Parameters - ---------- - base_url : str - The base URL of the Pointblank documentation site. - - Returns - ------- - list[dict] - A list of dictionaries with 'title', 'description', and 'url' keys. - """ - examples = [] - - # Read from local file - qmd_path = Path(__file__).parent.parent / "docs" / "demos" / "index.qmd" - - if not qmd_path.exists(): - # Fallback to web scraping if local file doesn't exist - if not SCRAPING_AVAILABLE: - raise ImportError( - "requests is required for web scraping. Install it with: pip install requests" - ) - demos_url = urljoin(base_url, "demos/") - response = requests.get(demos_url) - response.raise_for_status() - content = response.text - else: - with open(qmd_path, "r") as f: - content = f.read() - - # Pattern to match the example structure in the .qmd file: - # [Title](./path/index.qmd) - # ... potentially an image ... - #

Description

- - # First, get the grid-based examples with images - grid_pattern = r"\[([^\]]+)\]\(\./([^)]+)/index\.qmd\).*?]*>(.*?)

" - matches = re.findall(grid_pattern, content, re.DOTALL) - - for title, path, description in matches: - url = urljoin(base_url, f"demos/{path}/") - # Clean up description - desc_clean = re.sub(r"<[^>]+>", "", description).strip() - examples.append({"title": title.strip(), "description": desc_clean, "url": url}) - - # Also get the list-style examples (after the
) - list_pattern = r"\[([^\]]+)\]\(\./([^)]+)/index\.qmd\)
\s*([^\n]+)" - list_matches = re.findall(list_pattern, content) - - for title, path, description in list_matches: - url = urljoin(base_url, f"demos/{path}/") - examples.append({"title": title.strip(), "description": description.strip(), "url": url}) - - return examples - - -def scrape_api_reference_index( - base_url: str = "https://posit-dev.github.io/pointblank/", -) -> list[dict]: - """ - Parse the API reference index page from local .qmd file to extract function/class names and descriptions. - - Parameters - ---------- - base_url : str - The base URL of the Pointblank documentation site. - - Returns - ------- - list[dict] - A list of dictionaries with 'title', 'description', and 'url' keys. - """ - api_items = [] - - # Read from local file - qmd_path = Path(__file__).parent.parent / "docs" / "reference" / "index.qmd" - - if not qmd_path.exists(): - # Fallback to web scraping if local file doesn't exist - if not SCRAPING_AVAILABLE: - raise ImportError( - "requests is required for web scraping. 
Install it with: pip install requests" - ) - reference_url = urljoin(base_url, "reference/") - response = requests.get(reference_url) - response.raise_for_status() - content = response.text - else: - with open(qmd_path, "r") as f: - content = f.read() - - # Pattern to match the API reference structure in the .qmd file: - # | [Function](path.qmd#anchor) | Description | - - table_row_pattern = r"\| \[([^\]]+)\]\(([^)]+)\) \| ([^\|]+) \|" - matches = re.findall(table_row_pattern, content) - - for title, path, description in matches: - # Extract just the filename without the anchor and change .qmd to .html - file_path = path.split("#")[0] - if file_path.endswith(".qmd"): - file_path = file_path[:-4] + ".html" - url = urljoin(base_url, f"reference/{file_path}") - - api_items.append({"title": title.strip(), "description": description.strip(), "url": url}) - - return api_items - - -def generate_llms_txt( - base_url: str = "https://posit-dev.github.io/pointblank/", - include_user_guide: bool = True, -) -> str: - """ - Generate the llms.txt content for the Pointblank project. - - Parameters - ---------- - base_url : str - The base URL of the Pointblank documentation site. - include_user_guide : bool - Whether to include user guide pages in the output. - - Returns - ------- - str - The llms.txt formatted content. - """ - if not SCRAPING_AVAILABLE: - raise ImportError( - "requests is required for web scraping. 
Install it with: pip install requests" - ) - - lines = ["# Pointblank", "", "## Docs", ""] - - # Add examples section - try: - examples = scrape_examples_index(base_url) - if examples: - lines.append("### Examples") - lines.append("") - for ex in examples: - desc = f": {ex['description']}" if ex["description"] else "" - lines.append(f"- [{ex['title']}]({ex['url']}){desc}") - lines.append("") - except Exception as e: - print(f"Warning: Failed to scrape examples index: {e}") - - # Add API reference section - try: - api_items = scrape_api_reference_index(base_url) - if api_items: - lines.append("### API Reference") - lines.append("") - for item in api_items: - desc = f": {item['description']}" if item["description"] else "" - lines.append(f"- [{item['title']}]({item['url']}){desc}") - lines.append("") - except Exception as e: - print(f"Warning: Failed to scrape API reference: {e}") - - # If user guide is requested, scrape it too - if include_user_guide: - try: - user_guide_items = scrape_user_guide_index(base_url) - if user_guide_items: - lines.append("### User Guide") - lines.append("") - for item in user_guide_items: - desc = f": {item['description']}" if item["description"] else "" - lines.append(f"- [{item['title']}]({item['url']}){desc}") - except Exception as e: - print(f"Warning: Failed to scrape user guide: {e}") - - return "\n".join(lines) - - -def scrape_user_guide_index( - base_url: str = "https://posit-dev.github.io/pointblank/", -) -> list[dict]: - """ - Get the user guide pages from local directory listing. - - Parameters - ---------- - base_url : str - The base URL of the Pointblank documentation site. - - Returns - ------- - list[dict] - A list of dictionaries with 'title', 'description', and 'url' keys. 
- """ - guide_items = [] - - # Read from local directory - user_guide_dir = Path(__file__).parent.parent / "docs" / "user-guide" - - if not user_guide_dir.exists(): - return guide_items - - # Get all .qmd files (excluding index.qmd) - qmd_files = sorted([f for f in user_guide_dir.glob("*.qmd") if f.name != "index.qmd"]) - - for qmd_file in qmd_files: - # Read the file to extract title - with open(qmd_file, "r") as f: - content = f.read() - - # Try to extract title from YAML frontmatter - title_match = re.search(r'^title:\s*["\']?([^"\'\n]+)["\']?', content, re.MULTILINE) - if title_match: - title = title_match.group(1).strip() - else: - # Fallback to filename - title = qmd_file.stem.replace("-", " ").title() - - # Try to extract first paragraph as description (optional) - # Skip code blocks and look for first real content - description = "" - - url = urljoin(base_url, f"user-guide/{qmd_file.stem}.html") - - guide_items.append({"title": title, "description": description, "url": url}) - - return guide_items - - -def generate_llms_full_txt(output_path: Optional[str] = None) -> str: - """ - Generate the llms-full.txt content using the existing api-docs.txt file or by generating - the API and examples text. - - Parameters - ---------- - output_path : str, optional - Path to save the generated content. If None, content is returned but not saved. - - Returns - ------- - str - The llms-full.txt formatted content. - """ - # Try to use existing api-docs.txt first - api_docs_path = Path(__file__).parent / "data" / "api-docs.txt" - - if api_docs_path.exists(): - with open(api_docs_path, "r") as f: - content = f.read() - else: - # Generate the content - content = _get_api_and_examples_text() - - if output_path: - with open(output_path, "w") as f: - f.write(content) - - return content - - -def main() -> None: - """ - Main function to generate both llms.txt and llms-full.txt files. 
- """ - # Generate llms.txt - print("Generating llms.txt...") - try: - llms_content = generate_llms_txt() - llms_path = Path(__file__).parent.parent / "docs" / "llms.txt" - with open(llms_path, "w") as f: - f.write(llms_content) - print(f"✓ Generated {llms_path}") - except Exception as e: - print(f"✗ Failed to generate llms.txt: {e}") - - # Generate llms-full.txt - print("\nGenerating llms-full.txt...") - try: - llms_full_path = Path(__file__).parent.parent / "docs" / "llms-full.txt" - generate_llms_full_txt(str(llms_full_path)) - print(f"✓ Generated {llms_full_path}") - except Exception as e: - print(f"✗ Failed to generate llms-full.txt: {e}") - - -if __name__ == "__main__": - main() diff --git a/pointblank/assistant.py b/pointblank/assistant.py index e4fb1a73a2..fea2028ce0 100644 --- a/pointblank/assistant.py +++ b/pointblank/assistant.py @@ -54,7 +54,7 @@ def assistant( ---------- model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-opus-4-6"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. data An optional data table to focus on during discussion with the PbA, which could be a diff --git a/pointblank/data/api-docs.txt b/pointblank/data/api-docs.txt index 139fa3f825..58d73b5e11 100644 --- a/pointblank/data/api-docs.txt +++ b/pointblank/data/api-docs.txt @@ -827,7 +827,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: ' retrieve metadata about the step where the action is executed. -FinalActions(*args) +FinalActions(*args) -> 'None' Define actions to be taken after validation is complete. @@ -928,7 +928,7 @@ FinalActions(*args) used to retrieve the summary of the validation results. 
-Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'Any | None' = None, **kwargs) +Schema(columns: 'str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None' = None, tbl: 'Any | None' = None, **kwargs) -> 'None' Definition of a schema object. The schema object defines the structure of a table. Once it is defined, the object can be used @@ -1202,7 +1202,7 @@ DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ The data to be used for drafting a validation plan. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-opus-4-6"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. api_key The API key to be used for the model. @@ -1268,7 +1268,7 @@ DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ # Disable SSL verification for networks with self-signed certificates pb.DraftValidation( data=data, - model="anthropic:claude-sonnet-4-5", + model="anthropic:claude-opus-4-6", verify_ssl=False ) ``` @@ -1305,7 +1305,7 @@ DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ Let's look at how the `DraftValidation` class can be used to draft a validation plan for a table. The table to be used is `"nycflights"`, which is available here via the [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is - `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The + `"anthropic:claude-opus-4-6"` (which performs very well compared to other LLMs). The example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`. 
```python @@ -1315,7 +1315,7 @@ DraftValidation(data: 'Any', model: 'str', api_key: 'str | None' = None, verify_ data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") # Draft a validation plan for the "nycflights" table - pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") + pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` The output will be a drafted validation plan for the `"nycflights"` table and this will appear @@ -9971,7 +9971,7 @@ prompt(self, prompt: 'str', model: 'str', columns_subset: 'str | list[str] | Non so try to include only the columns necessary for the validation. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-opus-4-6"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from the provider. Model names are subject to change so consult the provider's documentation for the most up-to-date model names. @@ -12353,7 +12353,7 @@ get_json_report(self, use_fields: 'list[str] | None' = None, exclude_fields: 'li failed validation -get_sundered_data(self, type='pass') -> 'Any' +get_sundered_data(self, type: 'str' = 'pass') -> 'Any' Get the data that passed or failed the validation steps. @@ -14096,7 +14096,7 @@ assistant(model: 'str', data: 'Any' = None, tbl_name: 'str | None' = None, api_k ---------- model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-opus-4-6"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. 
data An optional data table to focus on during discussion with the PbA, which could be a @@ -15962,6 +15962,9 @@ generate_dataset(schema: 'Schema', n: 'int' = 100, seed: 'int | None' = None, ou integer columns named `age` are automatically constrained to working-age range (22--65). - **Financial presets** (`"iban"`, `"ssn"`, `"license_plate"`): produce identifiers in the format used by the specified country. + - **Locale preset** (`"locale_code"`): returns a locale identifier (e.g., `"en_US"`, + `"de_DE"`) derived from the country. Multilingual countries randomly select among their + official locale codes (e.g., `"CH"` yields `"de_CH"`, `"fr_CH"`, or `"it_CH"`). When multiple columns in the same schema use related presets, the generated data is automatically coherent across those columns within each row. Person-related presets will share @@ -16423,8 +16426,9 @@ string_field(min_length: 'int | None' = None, max_length: 'int | None' = None, p for region-specific formatting (e.g., address formats, phone number patterns). 
**Personal:** `"name"` (first + last name), `"name_full"` (full name with possible prefix - or suffix), `"first_name"`, `"last_name"`, `"email"` (realistic email address), - `"phone_number"`, `"address"` (full street address), `"city"`, `"state"`, `"country"`, + or suffix), `"first_name"`, `"last_name"`, `"gender"` (person's gender, coherent with + name), `"email"` (realistic email address), `"phone_number"`, `"address"` (full street + address), `"city"`, `"state"`, `"country"`, `"country_code_2"` (ISO 3166-1 alpha-2 code, e.g., `"US"`), `"country_code_3"` (ISO 3166-1 alpha-3 code, e.g., `"USA"`), `"postcode"`, `"latitude"`, `"longitude"` @@ -16434,7 +16438,8 @@ string_field(min_length: 'int | None' = None, max_length: 'int | None' = None, p **Text:** `"text"` (paragraph of text), `"sentence"`, `"paragraph"`, `"word"` - **Financial:** `"credit_card_number"`, `"iban"`, `"currency_code"` + **Financial:** `"credit_card_number"`, `"credit_card_provider"` (Visa, Mastercard, + American Express, or Discover), `"iban"`, `"currency_code"` **Identifiers:** `"uuid4"`, `"md5"` (MD5 hash, 32 hex chars), `"sha1"` (SHA-1 hash, 40 hex chars), `"sha256"` (SHA-256 hash, 64 hex chars), `"ssn"` (social security number), @@ -16449,7 +16454,9 @@ string_field(min_length: 'int | None' = None, max_length: 'int | None' = None, p (up to 10 years back), `"time"` **Miscellaneous:** `"color_name"`, `"file_name"`, `"file_extension"`, `"mime_type"`, - `"user_agent"` (browser user agent string with country-specific browser weighting) + `"user_agent"` (browser user agent string with country-specific browser weighting), + `"locale_code"` (locale identifier like `"en_US"`, `"de_DE"`; multilingual countries + return a random official locale) Coherent Data Generation ------------------------ @@ -16457,10 +16464,13 @@ string_field(min_length: 'int | None' = None, max_length: 'int | None' = None, p coherent across those columns within each row. 
Specifically: - **Person-related presets** (`"name"`, `"name_full"`, `"first_name"`, `"last_name"`, - `"email"`, `"user_name"`): the email and username will be derived from the person's name. + `"gender"`, `"email"`, `"user_name"`): the email and username will be derived from the + person's name, and `"gender"` will match the person's first name. - **Address-related presets** (`"address"`, `"city"`, `"state"`, `"postcode"`, `"phone_number"`, `"latitude"`, `"longitude"`): the city, state, and postcode will correspond to the same location within the address. + - **Credit card presets** (`"credit_card_number"`, `"credit_card_provider"`): the card + number prefix and provider name will be consistent (e.g., "Visa" with a "4"-prefixed number). This coherence is automatic and requires no additional configuration. @@ -17434,31 +17444,9 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None' This is a set of examples for the Pointblank library. ---------------------------------------------------------------------- -### Starter Validation (https://posit-dev.github.io/pointblank/demos/01-starter/) +### Advanced Validation (https://posit-dev.github.io/pointblank/01-getting-started/advanced-validation.html) -A validation with the basics - -```python -import pointblank as pb - -validation = ( - pb.Validate( # Use pb.Validate to start - data=pb.load_dataset(dataset="small_table", tbl_type="polars"), - tbl_name="small_table", - label="A starter validation" - ) - .col_vals_gt(columns="d", value=1000) # STEP 1 | - .col_vals_le(columns="c", value=5) # STEP 2 | <-- Build up a validation plan - .col_exists(columns=["date", "date_time"]) # STEP 3 | - .interrogate() # This will execute all validation steps and collect intel -) - -validation -``` - -### Advanced Validation (https://posit-dev.github.io/pointblank/demos/02-advanced/) - -A validation with a comprehensive set of rules +A validation with a comprehensive set of rules. 
```python import pointblank as pb @@ -17496,58 +17484,54 @@ validation = ( validation ``` -### Data Extracts (https://posit-dev.github.io/pointblank/demos/03-data-extracts/) +### Starter Validation (https://posit-dev.github.io/pointblank/01-getting-started/starter-validation.html) -Pulling out data extracts that highlight rows with validation failures +A validation with the basics. ```python import pointblank as pb validation = ( - pb.Validate( - data=pb.load_dataset(dataset="game_revenue"), - tbl_name="game_revenue", - label="Validation with test unit failures available as an extract" + pb.Validate( # Use pb.Validate to start + data=pb.load_dataset(dataset="small_table", tbl_type="polars"), + tbl_name="small_table", + label="A starter validation" ) - .col_vals_gt(columns="item_revenue", value=0) # STEP 1: no test unit failures - .col_vals_ge(columns="session_duration", value=5) # STEP 2: 14 test unit failures -> extract - .interrogate() + .col_vals_gt(columns="d", value=1000) # STEP 1 | + .col_vals_le(columns="c", value=5) # STEP 2 | <-- Build up a validation plan + .col_exists(columns=["date", "date_time"]) # STEP 3 | + .interrogate() # This will execute all validation steps and collect intel ) -``` -```python -pb.preview(validation.get_data_extracts(i=2, frame=True), n_head=20, n_tail=20) +validation ``` -### Sundered Data (https://posit-dev.github.io/pointblank/demos/04-sundered-data/) +### Data Extracts (https://posit-dev.github.io/pointblank/02-results-and-reporting/data-extracts.html) -Splitting your data into 'pass' and 'fail' subsets +Pulling out data extracts that highlight rows with validation failures. 
```python import pointblank as pb -import polars as pl validation = ( pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), - tbl_name="small_table", - label="Sundering Data" + data=pb.load_dataset(dataset="game_revenue"), + tbl_name="game_revenue", + label="Validation with test unit failures available as an extract" ) - .col_vals_gt(columns="d", value=1000) - .col_vals_le(columns="c", value=5) + .col_vals_gt(columns="item_revenue", value=0) # STEP 1: no test unit failures + .col_vals_ge(columns="session_duration", value=5) # STEP 2: 14 test unit failures -> extract .interrogate() ) - -validation ``` ```python -pb.preview(validation.get_sundered_data(type="pass")) +pb.preview(validation.get_data_extracts(i=2, frame=True), n_head=20, n_tail=20) ``` -### Step Report: Column Data Checks (https://posit-dev.github.io/pointblank/demos/05-step-report-column-check/) +### Step Reports for Column Data Checks (https://posit-dev.github.io/pointblank/02-results-and-reporting/step-report-column-check.html) -A step report for column checks shows what went wrong +A step report for column checks shows what went wrong. ```python import pointblank as pb @@ -17574,9 +17558,9 @@ validation.get_step_report(i=1) validation.get_step_report(i=2) ``` -### Step Report: Schema Check (https://posit-dev.github.io/pointblank/demos/06-step-report-schema-check/) +### Step Report for a Schema Check (https://posit-dev.github.io/pointblank/02-results-and-reporting/step-report-schema-check.html) -When a schema doesn't match, a step report gives you the details +When a schema doesn't match, a step report gives you the details. 
```python import pointblank as pb @@ -17613,53 +17597,184 @@ validation validation.get_step_report(i=1) ``` -### Apply Validation Rules to Multiple Columns (https://posit-dev.github.io/pointblank/demos/apply-checks-to-several-columns/) +### Sundered Data (https://posit-dev.github.io/pointblank/02-results-and-reporting/sundered-data.html) -Create multiple validation steps by using a list of column names with `columns=` +Splitting your data into 'pass' and 'fail' subsets. ```python import pointblank as pb +import polars as pl validation = ( pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") + data=pb.load_dataset(dataset="small_table", tbl_type="pandas"), + tbl_name="small_table", + label="Sundering Data" ) - .col_vals_ge(columns=["a", "c", "d"], value=0) # check values in 'a', 'c', and 'd' - .col_exists(columns=["date_time", "date"]) # check for the existence of two columns + .col_vals_gt(columns="d", value=1000) + .col_vals_le(columns="c", value=5) .interrogate() ) validation ``` -### Verifying Row and Column Counts (https://posit-dev.github.io/pointblank/demos/check-row-column-counts/) +```python +pb.preview(validation.get_sundered_data(type="pass")) +``` + +### Set Failure Threshold Levels (https://posit-dev.github.io/pointblank/03-actions-and-thresholds/failure-thresholds.html) -Check the dimensions of the table with the `*_count_match()` validation methods +Set threshold levels to better gauge adverse data quality. 
```python import pointblank as pb validation = ( pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") + data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), + thresholds=pb.Thresholds( # setting relative threshold defaults for all steps + warning=0.05, # 5% failing test units: warning threshold (gray) + error=0.10, # 10% failed test units: error threshold (yellow) + critical=0.15 # 15% failed test units: critical threshold (red) + ), ) - .col_count_match(count=11) # expect 11 columns in the table - .row_count_match(count=2000) # expect 2,000 rows in the table - .row_count_match(count=0, inverse=True) # expect that the table has rows - .col_count_match( # compare column count against - count=pb.load_dataset( # that of another table - dataset="game_revenue", tbl_type="pandas" + .col_vals_in_set(columns="item_type", set=["iap", "ad"]) + .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") + .col_vals_gt(columns="item_revenue", value=0.05) + .col_vals_gt( + columns="session_duration", + value=4, + thresholds=(5, 10, 20) # setting absolute thresholds for *this* step (W, E, C) + ) + .col_exists(columns="end_day") + .interrogate() +) + +validation +``` + +### Step-Level Actions (https://posit-dev.github.io/pointblank/03-actions-and-thresholds/validation-with-actions.html) + +Configure actions to trigger when validation thresholds are exceeded, such as logging warnings or errors. 
+ +```python +import pointblank as pb + +def log_warning(): + """Custom action to log validation warnings""" + metadata = pb.get_action_metadata() + print(f"⚠️ WARNING: Validation step '{metadata['step']}' exceeded threshold!") + +def log_error(): + """Custom action to log validation errors""" + metadata = pb.get_action_metadata() + print(f"❌ ERROR: Critical validation failure in step '{metadata['step']}'!") + print(f" This requires immediate attention.") + +validation = ( + pb.Validate( + data=pb.load_dataset(dataset="nycflights", tbl_type="polars"), + label="Validation with actions" + ) + .col_vals_between( + columns="distance", + left=100, right=2000, + thresholds=pb.Thresholds(warning=0.1), # Allow 10% failures before warning + actions=pb.Actions(warning=log_warning), + brief="Column 'distance' range check." + ) + .col_vals_gt( + columns="air_time", + value=25, + na_pass=True, + thresholds=pb.Thresholds(error=200), # Allow only 200 failures before error + actions=pb.Actions(error=log_error), + brief="Column 'air_time' check for minimum value." + ) + .col_vals_not_null( + columns="carrier", + thresholds=(1, 0.05), # No tolerance for null values + actions=pb.Actions(warning=log_warning, error=log_error), + brief="Column 'carrier' completeness check." + ) + .interrogate() +) + +validation +``` + +### Final Actions (https://posit-dev.github.io/pointblank/03-actions-and-thresholds/validation-with-final-actions.html) + +Execute actions after validation completes, such as sending alerts or generating summary reports. 
+ +```python +import pointblank as pb + +def send_alert(): + """Check validation summary and send alert if critical failures found""" + summary = pb.get_validation_summary() + if summary and summary.get("highest_severity") == "critical": + print(f"🚨 ALERT: Critical validation failures found!") + print(f" Failed steps: {summary['n_failing_steps']}") + elif summary and summary.get("highest_severity") == "error": + print(f"⚠️ WARNING: Error-level validation failures detected.") + else: + print("✅ All validation checks passed successfully!") + +def generate_summary_report(): + """Generate a summary report of validation results""" + summary = pb.get_validation_summary() + if summary: + print("\n--- Validation Summary Report ---") + print(f"Total validation steps: {summary['n_steps']}") + print(f"Passed steps: {summary['n_passing_steps']}") + print(f"Failed steps: {summary['n_failing_steps']}") + print(f"Highest severity: {summary['highest_severity']}") + print("--- End of Report ---") + +validation = ( + pb.Validate( + data=pb.load_dataset(dataset="game_revenue", tbl_type="polars"), + label="Validation with final actions", + thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15), + final_actions=pb.FinalActions( + "Validation workflow completed.", # String message + send_alert, # Alert function + generate_summary_report # Report function ) ) + .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}") + .col_vals_gt(columns="item_revenue", value=0.05) + .col_vals_gt(columns="session_duration", value=15) + .interrogate() +) + +validation +``` + +### Apply Validation Rules to Multiple Columns (https://posit-dev.github.io/pointblank/04-column-value-checks/apply-checks-to-several-columns.html) + +Create multiple validation steps by using a list of column names with columns=. 
+ +```python +import pointblank as pb + +validation = ( + pb.Validate( + data=pb.load_dataset(dataset="small_table", tbl_type="polars") + ) + .col_vals_ge(columns=["a", "c", "d"], value=0) # check values in 'a', 'c', and 'd' + .col_exists(columns=["date_time", "date"]) # check for the existence of two columns .interrogate() ) validation ``` -### Checks for Missing Values (https://posit-dev.github.io/pointblank/demos/checks-for-missing/) +### Checks for Missing Values (https://posit-dev.github.io/pointblank/04-column-value-checks/checks-for-missing.html) -Perform validations that check whether missing/NA/Null values are present +Perform validations that check whether missing/NA/Null values are present. ```python import pointblank as pb @@ -17679,9 +17794,9 @@ validation = ( validation ``` -### Custom Expression for Checking Column Values (https://posit-dev.github.io/pointblank/demos/col-vals-custom-expr/) +### Custom Expression for Checking Column Values (https://posit-dev.github.io/pointblank/04-column-value-checks/col-vals-custom-expr.html) -A column expression can be used to check column values. Just use `col_vals_expr()` for this +A column expression can be used to check column values. Just use col_vals_expr() for this. ```python import pointblank as pb @@ -17697,32 +17812,23 @@ validation = ( validation ``` -### Column Selector Functions: Easily Pick Columns (https://posit-dev.github.io/pointblank/demos/column-selector-functions/) +### Comparison Checks Across Columns (https://posit-dev.github.io/pointblank/04-column-value-checks/comparisons-across-columns.html) -Use column selector functions in the `columns=` argument to conveniently choose columns +Perform comparisons of values in columns to values in other columns. 
```python import pointblank as pb -import narwhals.selectors as ncs validation = ( pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") - ) - .col_vals_ge( - columns=pb.matches("rev|dur"), # check values in columns having 'rev' or 'dur' in name - value=0 - ) - .col_vals_regex( - columns=pb.ends_with("_id"), # check values in columns with names ending in '_id' - pattern=r"^[A-Z]{12}\d{3}" - ) - .col_vals_not_null( - columns=pb.last_n(2) # check that the last two columns don't have Null values + data=pb.load_dataset(dataset="small_table", tbl_type="polars") ) - .col_vals_regex( - columns=ncs.string(), # check that all string columns are non-empty strings - pattern=r"(.|\s)*\S(.|\s)*" + .col_vals_lt(columns="a", value=pb.col("c")) # values in 'a' < values in 'c' + .col_vals_between( + columns="d", # values in 'd' are between values + left=pb.col("c"), # in 'c' and the fixed value of 12,000; + right=12000, # any missing values encountered result + na_pass=True # in a passing test unit ) .interrogate() ) @@ -17730,23 +17836,38 @@ validation = ( validation ``` -### Comparison Checks Across Columns (https://posit-dev.github.io/pointblank/demos/comparisons-across-columns/) +### Custom Validation with specially() (https://posit-dev.github.io/pointblank/04-column-value-checks/custom-validation-specially.html) -Perform comparisons of values in columns to values in other columns +Create bespoke validations using specially() to implement domain-specific business rules. 
```python import pointblank as pb +import polars as pl + +def within_std_deviations(df, column, n_std=2): + """Check if all values are within n standard deviations of the mean""" + mean_val = df[column].mean() + std_val = df[column].std() + + lower_bound = mean_val - (n_std * std_val) + upper_bound = mean_val + (n_std * std_val) + + # Add a boolean column and return the modified DataFrame + return df.with_columns( + pl.col(column).is_between(lower_bound, upper_bound, closed="both").alias("validation_result") + ) validation = ( pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") + data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") ) - .col_vals_lt(columns="a", value=pb.col("c")) # values in 'a' > values in 'c' - .col_vals_between( - columns="d", # values in 'd' are between values - left=pb.col("c"), # in 'c' and the fixed value of 12,000; - right=12000, # any missing values encountered result - na_pass=True # in a passing test unit + .specially( + expr=lambda df: within_std_deviations(df, column="session_duration", n_std=2), + brief="All values in column 'session_duration' should be within 2 std devs of mean" + ) + .specially( + expr=lambda df: within_std_deviations(df, column="session_duration", n_std=3), + brief="All values in column 'session_duration' should be within 3 std devs of mean" ) .interrogate() ) @@ -17754,9 +17875,9 @@ validation = ( validation ``` -### Expect No Duplicate Rows (https://posit-dev.github.io/pointblank/demos/expect-no-duplicate-rows/) +### Expect No Duplicate Rows (https://posit-dev.github.io/pointblank/04-column-value-checks/expect-no-duplicate-rows.html) -We can check for duplicate rows in the table with `rows_distinct()` +We can check for duplicate rows in the table with rows_distinct(). 
```python import pointblank as pb @@ -17772,9 +17893,9 @@ validation = ( validation ``` -### Checking for Duplicate Values (https://posit-dev.github.io/pointblank/demos/expect-no-duplicate-values/) +### Checking for Duplicate Values (https://posit-dev.github.io/pointblank/04-column-value-checks/expect-no-duplicate-values.html) -To check for duplicate values down a column, use `rows_distinct()` with a `columns_subset=` value +To check for duplicate values down a column, use rows_distinct() with a columns_subset= value. ```python import pointblank as pb @@ -17790,9 +17911,9 @@ validation = ( validation ``` -### Expectations with a Text Pattern (https://posit-dev.github.io/pointblank/demos/expect-text-pattern/) +### Expectations with a Text Pattern (https://posit-dev.github.io/pointblank/04-column-value-checks/expect-text-pattern.html) -With the `col_vals_regex()`, check for conformance to a regular expression +With col_vals_regex(), check for conformance to a regular expression. ```python import pointblank as pb @@ -17809,40 +17930,240 @@ validation = ( validation ``` -### Set Failure Threshold Levels (https://posit-dev.github.io/pointblank/demos/failure-thresholds/) +### Numeric Comparisons (https://posit-dev.github.io/pointblank/04-column-value-checks/numeric-comparisons.html) -Set threshold levels to better gauge adverse data quality +Perform comparisons of values in columns to fixed values. 
```python import pointblank as pb validation = ( pb.Validate( - data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"), - thresholds=pb.Thresholds( # setting relative threshold defaults for all steps - warning=0.05, # 5% failing test units: warning threshold (gray) - error=0.10, # 10% failed test units: error threshold (yellow) - critical=0.15 # 15% failed test units: critical threshold (red) + data=pb.load_dataset(dataset="small_table", tbl_type="polars") + ) + .col_vals_gt(columns="d", value=1000) # values in 'd' > 1000 + .col_vals_lt(columns="d", value=10000) # values in 'd' < 10000 + .col_vals_ge(columns="a", value=1) # values in 'a' >= 1 + .col_vals_le(columns="c", value=5) # values in 'c' <= 5 + .col_vals_ne(columns="a", value=7) # values in 'a' not equal to 7 + .col_vals_between(columns="c", left=0, right=15) # 0 <= 'c' values <= 15 + .interrogate() +) + +validation +``` + +### Set Membership (https://posit-dev.github.io/pointblank/04-column-value-checks/set-membership.html) + +Perform validations that check whether values are part of a set (or not part of one). + +```python +import pointblank as pb + +validation = ( + pb.Validate( + data=pb.load_dataset(dataset="small_table", tbl_type="polars") + ) + .col_vals_in_set(columns="f", set=["low", "mid", "high"]) # part of this set + .col_vals_not_in_set(columns="f", set=["zero", "infinity"]) # not part of this set + .interrogate() +) + +validation +``` + +### Validating Data Freshness (https://posit-dev.github.io/pointblank/05-advanced-topics/check-for-freshness.html) + +Use date-based validations to ensure your data is current and recent. 
+ +```python +import pointblank as pb +import polars as pl +from datetime import date, datetime, timedelta + +# Create sample data with mixed freshness levels +freshness_data = pl.DataFrame({ + "data_timestamp": [ + datetime(2023, 12, 28, 10, 30), # 3 days ago from Dec 31 + datetime(2023, 12, 29, 14, 15), # 2 days ago + datetime(2023, 12, 30, 9, 45), # 1 day ago + datetime(2023, 12, 31, 16, 20), # Today + ], + "sensor_id": ["TEMP_01", "TEMP_02", "TEMP_01", "TEMP_03"], + "reading": [22.5, 21.8, 23.1, 22.9], + "quality_score": [0.95, 0.88, 0.92, 0.97] +}) + +# Assuming today is 2023-12-31, check for data freshness +current_date = date(2023, 12, 31) +freshness_cutoff = current_date - timedelta(days=2) # Data should be within 2 days + +validation = ( + pb.Validate(freshness_data) + .specially( + expr=lambda df: df.filter( + pl.col("data_timestamp").dt.date() >= freshness_cutoff + ).height > 0, + brief=f"Recent data available (within 2 days of {current_date})" + ) + .col_vals_ge( + columns="data_timestamp", + value=current_date - timedelta(days=7), # Within last week + brief="All data points are from the last week" + ) + .specially( + expr=lambda df: ( + df.select(pl.col("data_timestamp").max()).item().date() >= current_date ), + brief="Most recent data is from today" ) - .col_vals_in_set(columns="item_type", set=["iap", "ad"]) - .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}\d{3}") - .col_vals_gt(columns="item_revenue", value=0.05) - .col_vals_gt( - columns="session_duration", - value=4, - thresholds=(5, 10, 20) # setting absolute thresholds for *this* step (W, E, C) + .col_vals_not_null( + columns="data_timestamp", + brief="No missing timestamps" + ) + .interrogate() +) + +validation +``` + +### Verifying Row and Column Counts (https://posit-dev.github.io/pointblank/05-advanced-topics/check-row-column-counts.html) + +Check the dimensions of the table with the *_count_match() validation methods. 
+ +```python +import pointblank as pb + +validation = ( + pb.Validate( + data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb") + ) + .col_count_match(count=11) # expect 11 columns in the table + .row_count_match(count=2000) # expect 2,000 rows in the table + .row_count_match(count=0, inverse=True) # expect that the table has rows + .col_count_match( # compare column count against + count=pb.load_dataset( # that of another table + dataset="game_revenue", tbl_type="pandas" + ) ) - .col_exists(columns="end_day") .interrogate() ) validation ``` -### Mutate the Table in a Validation Step (https://posit-dev.github.io/pointblank/demos/mutate-table-in-step/) +### CLI Interactive Demos (https://posit-dev.github.io/pointblank/05-advanced-topics/cli-interactive.html) + +These CLI demos showcase practical data quality workflows that you can use! + -For far more specialized validations, modify the table with the `pre=` argument before checking it + +### Column Selector Functions: Easily Pick Columns (https://posit-dev.github.io/pointblank/05-advanced-topics/column-selector-functions.html) + +Use column selector functions in the columns= argument to conveniently choose columns. 
+ +```python +import pointblank as pb +import narwhals.selectors as ncs + +validation = ( + pb.Validate( + data=pb.load_dataset(dataset="game_revenue", tbl_type="polars") + ) + .col_vals_ge( + columns=pb.matches("rev|dur"), # check values in columns having 'rev' or 'dur' in name + value=0 + ) + .col_vals_regex( + columns=pb.ends_with("_id"), # check values in columns with names ending in '_id' + pattern=r"^[A-Z]{12}\d{3}" + ) + .col_vals_not_null( + columns=pb.last_n(2) # check that the last two columns don't have Null values + ) + .col_vals_regex( + columns=ncs.string(), # check that all string columns are non-empty strings + pattern=r"(.|\s)*\S(.|\s)*" + ) + .interrogate() +) + +validation +``` + +### Date and Datetime Validations (https://posit-dev.github.io/pointblank/05-advanced-topics/datetime-validations.html) + +Comprehensive examples of date, datetime, and timezone-aware datetime comparisons. + +```python +import pointblank as pb +import polars as pl +from datetime import date, datetime +from zoneinfo import ZoneInfo + +# Create sample data with various temporal data types +temporal_data = pl.DataFrame({ + "order_date": [ + date(2023, 1, 15), + date(2023, 6, 10), + date(2023, 12, 5), + date(2024, 3, 20) + ], + "created_at": [ + datetime(2023, 1, 15, 9, 30, 0), + datetime(2023, 6, 10, 14, 45, 30), + datetime(2023, 12, 5, 8, 15, 0), + datetime(2024, 3, 20, 17, 22, 45) + ], + "event_time_tz": [ + datetime(2023, 1, 15, 9, 0, tzinfo=ZoneInfo("America/New_York")), + datetime(2023, 6, 10, 12, 30, tzinfo=ZoneInfo("America/New_York")), + datetime(2023, 12, 5, 15, 45, tzinfo=ZoneInfo("America/New_York")), + datetime(2024, 3, 20, 18, 15, tzinfo=ZoneInfo("America/New_York")) + ], + "order_id": [1001, 1002, 1003, 1004], + "amount": [150.0, 275.5, 89.99, 420.00] +}) + +validation = ( + pb.Validate(temporal_data) + .col_vals_ge( + columns="order_date", + value=date(2023, 1, 1), + brief="Orders are from 2023 or later" + ) + .col_vals_between( + columns="created_at", + 
left=datetime(2023, 1, 1, 0, 0, 0), + right=datetime(2024, 12, 31, 23, 59, 59), + brief="Creation timestamps within expected range" + ) + .col_vals_ge( + columns="event_time_tz", + value=datetime(2023, 1, 1, 8, 0, tzinfo=ZoneInfo("America/New_York")), + brief="Timezone-aware events after 8 AM Eastern" + ) + .col_schema_match( + pb.Schema( + columns=[ + ("order_date", "Date"), + ("created_at", "Datetime(time_unit='us', time_zone=None)"), + ("event_time_tz", "Datetime(time_unit='us', time_zone='America/New_York')"), + ("order_id", "Int64"), + ("amount", "Float64") + ] + ), + brief="Schema includes proper date/datetime types" + ) + .interrogate() +) + +validation +``` + +### Mutate the Table in a Validation Step (https://posit-dev.github.io/pointblank/05-advanced-topics/mutate-table-in-step.html) + +For far more specialized validations, modify the table with the pre= argument before checking it. ```python import pointblank as pb @@ -17881,32 +18202,9 @@ validation = ( validation ``` -### Numeric Comparisons (https://posit-dev.github.io/pointblank/demos/numeric-comparisons/) - -Perform comparisons of values in columns to fixed values - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_gt(columns="d", value=1000) # values in 'd' > 1000 - .col_vals_lt(columns="d", value=10000) # values in 'd' < 10000 - .col_vals_ge(columns="a", value=1) # values in 'a' >= 1 - .col_vals_le(columns="c", value=5) # values in 'c' <= 5 - .col_vals_ne(columns="a", value=7) # values in 'a' not equal to 7 - .col_vals_between(columns="c", left=0, right=15) # 0 <= 'c' values <= 15 - .interrogate() -) - -validation -``` - -### Check the Schema of a Table (https://posit-dev.github.io/pointblank/demos/schema-check/) +### Check the Schema of a Table (https://posit-dev.github.io/pointblank/05-advanced-topics/schema-check.html) -The schema of a table can be flexibly defined with `Schema` and verified with 
`col_schema_match()` +The schema of a table can be flexibly defined with Schema and verified with col_schema_match(). ```python import pointblank as pb @@ -17939,28 +18237,9 @@ validation = ( validation ``` -### Set Membership (https://posit-dev.github.io/pointblank/demos/set-membership/) - -Perform validations that check whether values are part of a set (or *not* part of one) - -```python -import pointblank as pb - -validation = ( - pb.Validate( - data=pb.load_dataset(dataset="small_table", tbl_type="polars") - ) - .col_vals_in_set(columns="f", set=["low", "mid", "high"]) # part of this set - .col_vals_not_in_set(columns="f", set=["zero", "infinity"]) # not part of this set - .interrogate() -) - -validation -``` - -### Using Parquet Data (https://posit-dev.github.io/pointblank/demos/using-parquet-data/) +### Using Parquet Data (https://posit-dev.github.io/pointblank/05-advanced-topics/using-parquet-data.html) -A Parquet dataset can be used for data validation, thanks to Ibis +A Parquet dataset can be used for data validation, thanks to Ibis. ```python import pointblank as pb diff --git a/pointblank/draft.py b/pointblank/draft.py index f62f6104c6..3e6608e6d2 100644 --- a/pointblank/draft.py +++ b/pointblank/draft.py @@ -37,7 +37,7 @@ class DraftValidation: The data to be used for drafting a validation plan. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-opus-4-6"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. api_key The API key to be used for the model. 
@@ -103,7 +103,7 @@ class DraftValidation: # Disable SSL verification for networks with self-signed certificates pb.DraftValidation( data=data, - model="anthropic:claude-sonnet-4-5", + model="anthropic:claude-opus-4-6", verify_ssl=False ) ``` @@ -140,7 +140,7 @@ class DraftValidation: Let's look at how the `DraftValidation` class can be used to draft a validation plan for a table. The table to be used is `"nycflights"`, which is available here via the [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is - `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The + `"anthropic:claude-opus-4-6"` (which performs very well compared to other LLMs). The example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`. ```python @@ -150,7 +150,7 @@ class DraftValidation: data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb") # Draft a validation plan for the "nycflights" table - pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") + pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` The output will be a drafted validation plan for the `"nycflights"` table and this will appear diff --git a/pointblank/validate.py b/pointblank/validate.py index 68b7cec026..8b3b99e495 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -10928,7 +10928,7 @@ def prompt( so try to include only the columns necessary for the validation. model The model to be used. This should be in the form of `provider:model` (e.g., - `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`, + `"anthropic:claude-opus-4-6"`). Supported providers are `"anthropic"`, `"openai"`, `"ollama"`, and `"bedrock"`. The model name should be the specific model to be used from the provider. Model names are subject to change so consult the provider's documentation for the most up-to-date model names. 
diff --git a/pyproject.toml b/pyproject.toml index c1d42d0985..526088f775 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,8 +77,6 @@ docs = [ "jupyter", "nbclient>=0.10.0", "nbformat>=5.10.4", - "quartodoc>=0.8.1; python_version >= '3.9'", - "griffe>=0.38.1,<2", # Pin below griffe 2.0 which removed parse_numpy(options) "pandas>=2.2.3", "polars>=1.17.1", "pyspark==3.5.6", @@ -90,7 +88,6 @@ docs = [ dev = [ "chatlas>=0.6.1", "duckdb>=1.2.0,<1.3.3", # Pin to stable versions avoiding 1.4.0+ RecordBatchReader issues - "griffe==0.38.1", "hypothesis>=6.129.2", "ibis-framework[duckdb,mysql,postgres,sqlite]>=9.5.0", "jupyter", @@ -110,7 +107,6 @@ dev = [ "pytest-snapshot", "pytest-xdist>=3.6.1", "pytz>=2025.2", - "quartodoc>=0.8.1; python_version >= '3.9'", "ruff==0.14.10", # NOTE: must match rev in .pre-commit-config.yaml "shiny>=1.4.0", "openpyxl>=3.0.0", diff --git a/scripts/generate_api_docs.py b/scripts/generate_api_docs.py new file mode 100644 index 0000000000..19a4a2b7e7 --- /dev/null +++ b/scripts/generate_api_docs.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +Script to regenerate the api-docs.txt file used by DraftValidation and assistant(). + +The llms.txt and llms-full.txt files are now generated natively by Great Docs during +the site build process. 
+""" + +import sys +from pathlib import Path + +# Add the parent directory to the path so we can import from pointblank +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from pointblank._utils_llms_txt import _get_api_and_examples_text + + +def main(): + """Regenerate the api-docs.txt file.""" + base_dir = Path(__file__).parent.parent + data_dir = base_dir / "pointblank" / "data" + + # Ensure directory exists + data_dir.mkdir(exist_ok=True) + + # Regenerate the api-docs.txt file (used by DraftValidation and assistant()) + print("Regenerating api-docs.txt...") + try: + api_docs_content = _get_api_and_examples_text() + api_docs_path = data_dir / "api-docs.txt" + with open(api_docs_path, "w") as f: + f.write(api_docs_content) + print(f"✓ Generated {api_docs_path}") + except Exception as e: + print(f"✗ Failed to generate api-docs.txt: {e}") + import traceback + + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/scripts/generate_llms_txt.py b/scripts/generate_llms_txt.py deleted file mode 100644 index 4d8950dc5c..0000000000 --- a/scripts/generate_llms_txt.py +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to generate llms.txt and llms-full.txt files for the Pointblank documentation. - -This script can be run standalone without importing the full pointblank package. 
-""" - -import sys -from pathlib import Path - -# Add the parent directory to the path so we can import from pointblank -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from pointblank._utils_llms_txt import ( - _get_api_and_examples_text, - generate_llms_full_txt, - generate_llms_txt, -) - - -def main(): - """Generate both llms.txt and llms-full.txt files.""" - base_dir = Path(__file__).parent.parent - docs_dir = base_dir / "docs" - data_dir = base_dir / "pointblank" / "data" - - # Ensure directories exist - docs_dir.mkdir(exist_ok=True) - data_dir.mkdir(exist_ok=True) - - # First, regenerate the api-docs.txt file (used by assistant() and as cache for llms-full.txt) - print("Regenerating api-docs.txt...") - try: - api_docs_content = _get_api_and_examples_text() - api_docs_path = data_dir / "api-docs.txt" - with open(api_docs_path, "w") as f: - f.write(api_docs_content) - print(f"✓ Generated {api_docs_path}") - except Exception as e: - print(f"✗ Failed to generate api-docs.txt: {e}") - import traceback - - traceback.print_exc() - - # Generate llms.txt - print("\nGenerating llms.txt...") - try: - llms_content = generate_llms_txt() - llms_path = docs_dir / "llms.txt" - with open(llms_path, "w") as f: - f.write(llms_content) - print(f"✓ Generated {llms_path}") - except Exception as e: - print(f"✗ Failed to generate llms.txt: {e}") - import traceback - - traceback.print_exc() - - # Generate llms-full.txt - print("\nGenerating llms-full.txt...") - try: - llms_full_path = docs_dir / "llms-full.txt" - generate_llms_full_txt(str(llms_full_path)) - print(f"✓ Generated {llms_full_path}") - except Exception as e: - print(f"✗ Failed to generate llms-full.txt: {e}") - import traceback - - traceback.print_exc() - - -if __name__ == "__main__": - main() diff --git a/translations/README.ar.md b/translations/README.ar.md index 64f7bc3096..7ff72a742f 100644 --- a/translations/README.ar.md +++ b/translations/README.ar.md @@ -52,7 +52,7 @@ import pointblank as pb data = 
pb.load_dataset("game_revenue") # مجموعة بيانات عينة # استخدم DraftValidation لإنشاء خطة تحقق -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` الناتج هو خطة تحقق كاملة مع اقتراحات ذكية مبنية على بياناتك: diff --git a/translations/README.de.md b/translations/README.de.md index 873e826dd8..bc79489d45 100644 --- a/translations/README.de.md +++ b/translations/README.de.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # Ein Beispieldatensatz # Verwenden Sie DraftValidation, um einen Validierungsplan zu generieren -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` Die Ausgabe ist ein vollständiger Validierungsplan mit intelligenten Vorschlägen basierend auf Ihren Daten: diff --git a/translations/README.es.md b/translations/README.es.md index c694e437ec..3f913405c7 100644 --- a/translations/README.es.md +++ b/translations/README.es.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # Un conjunto de datos de ejemplo # Usa DraftValidation para generar un plan de validación -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` La salida es un plan de validación completo con sugerencias inteligentes basadas en tus datos: diff --git a/translations/README.fr.md b/translations/README.fr.md index 9dbc6aa169..f139daea39 100644 --- a/translations/README.fr.md +++ b/translations/README.fr.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # Un jeu de données d'exemple # Utilisez DraftValidation pour générer un plan de validation -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` La sortie est un plan de validation complet avec des suggestions 
intelligentes basées sur vos données : diff --git a/translations/README.hi.md b/translations/README.hi.md index 72a7002ab6..9f3cacf7f7 100644 --- a/translations/README.hi.md +++ b/translations/README.hi.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # एक नमूना डेटासेट # वैलिडेशन योजना गेनरेट करने के लिए DraftValidation का उपयोग करें -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` आउटपुट आपके डेटा के आधार पर बुद्धिमान सुझावों के साथ एक पूर्ण वैलिडेशन योजना है: diff --git a/translations/README.it.md b/translations/README.it.md index d649aefded..3f94f97957 100644 --- a/translations/README.it.md +++ b/translations/README.it.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # Un dataset di esempio # Usa DraftValidation per generare un piano di validazione -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` L'output è un piano di validazione completo con suggerimenti intelligenti basati sui tuoi dati: diff --git a/translations/README.ja.md b/translations/README.ja.md index 7e7d029022..e82cf7af62 100644 --- a/translations/README.ja.md +++ b/translations/README.ja.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # サンプルデータセット # DraftValidation を使用して検証プランを生成 -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` 出力は、データに基づいた知的な提案を含む完全な検証プランです: diff --git a/translations/README.ko.md b/translations/README.ko.md index 1fcb1b7234..f0de496b10 100644 --- a/translations/README.ko.md +++ b/translations/README.ko.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # 예제 데이터셋 # DraftValidation을 사용하여 검증 계획 생성 -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") 
+pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` 결과는 데이터에 기반한 지능적 제안이 포함된 완전한 검증 계획입니다: diff --git a/translations/README.nl.md b/translations/README.nl.md index cdaeff5de6..55303c5da7 100644 --- a/translations/README.nl.md +++ b/translations/README.nl.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # Een voorbeeld dataset # Gebruik DraftValidation om een validatieplan te genereren -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` De uitvoer is een volledig validatieplan met intelligente suggesties gebaseerd op je data: diff --git a/translations/README.pt-BR.md b/translations/README.pt-BR.md index 16582023a1..f60876bc64 100644 --- a/translations/README.pt-BR.md +++ b/translations/README.pt-BR.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # Um conjunto de dados de exemplo # Use DraftValidation para gerar um plano de validação -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` A saída é um plano de validação completo com sugestões inteligentes baseadas em seus dados: diff --git a/translations/README.zh-CN.md b/translations/README.zh-CN.md index f5746925ba..59301e5973 100644 --- a/translations/README.zh-CN.md +++ b/translations/README.zh-CN.md @@ -52,7 +52,7 @@ import pointblank as pb data = pb.load_dataset("game_revenue") # 示例数据集 # 使用 DraftValidation 生成验证计划 -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") ``` 输出是基于您的数据的具有智能建议的完整验证计划: diff --git a/user_guide/01-validation-plan/02-validation-methods.qmd b/user_guide/01-validation-plan/02-validation-methods.qmd index aba6e4ff29..eb76b2fd0c 100644 --- a/user_guide/01-validation-plan/02-validation-methods.qmd +++ b/user_guide/01-validation-plan/02-validation-methods.qmd @@ 
-601,7 +601,7 @@ products = pl.DataFrame({ .prompt( prompt="Each description should mention either quality, features, or warranty", columns_subset=["description"], - model="anthropic:claude-sonnet-4-5" + model="anthropic:claude-opus-4-6" ) .interrogate() ) diff --git a/user_guide/02-advanced-validation/04-draft-validation.qmd b/user_guide/02-advanced-validation/04-draft-validation.qmd index 98dbeb3c07..cda5892dd0 100644 --- a/user_guide/02-advanced-validation/04-draft-validation.qmd +++ b/user_guide/02-advanced-validation/04-draft-validation.qmd @@ -67,7 +67,7 @@ data = pb.load_dataset(dataset="global_sales", tbl_type="polars") # Generate a validation plan pb.DraftValidation( data=data, - model="anthropic:claude-sonnet-4-5", + model="anthropic:claude-opus-4-6", api_key="your_api_key_here" # Replace with your actual API key ) ``` @@ -160,7 +160,7 @@ api_key = os.getenv("ANTHROPIC_API_KEY") draft_validation = pb.DraftValidation( data=data, - model="anthropic:claude-sonnet-4-5", + model="anthropic:claude-opus-4-6", api_key=api_key ) ``` @@ -180,7 +180,7 @@ If your API keys have standard names (like `ANTHROPIC_API_KEY` or `OPENAI_API_KE # No API key needed if stored in .env with standard names draft_validation = pb.DraftValidation( data=data, - model="anthropic:claude-sonnet-4-5" + model="anthropic:claude-opus-4-6" ) ``` @@ -192,7 +192,7 @@ Here's an example of a validation plan that might be generated by `DraftValidati ```python pb.DraftValidation( pb.load_dataset(dataset="nycflights", tbl_type="duckdb", - model="anthropic:claude-sonnet-4-5" + model="anthropic:claude-opus-4-6" ) ``` @@ -270,7 +270,7 @@ When using `DraftValidation`, you specify the model in the format `"provider:mod ```python # Using Anthropic's Claude model -pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5") +pb.DraftValidation(data=data, model="anthropic:claude-opus-4-6") # Using OpenAI's GPT model pb.DraftValidation(data=data, model="openai:gpt-4-turbo") diff --git 
a/user_guide/index.qmd b/user_guide/index.qmd index b7870f3b77..f33ec54d46 100644 --- a/user_guide/index.qmd +++ b/user_guide/index.qmd @@ -171,13 +171,6 @@ pip install "pointblank[postgres]" # PostgreSQL support See the [Installation guide](user-guide/installation.qmd) for more details. -## Text Formats - -The docs are also available in `llms.txt` format: - -- [`llms.txt`](llms.txt): a sitemap listing all documentation pages -- [`llms-full.txt`](llms-full.txt): all the documentation in one file - ## Join the Community We'd love to hear from you! Connect with us: