Skip to content

Commit fd90887

Browse files
authored
🎯 feat: add data quality and write report file after validation (#4)
* 📄 docs: update readme for usage of data-quality. * ⚙️ fixed: remove usage pattern of data-quality. * ⚙️ fixed: change way to get dq. * 🎯 feat: add template for quality check.
1 parent 5d4389b commit fd90887

File tree

6 files changed

+141
-12
lines changed

6 files changed

+141
-12
lines changed

README.md

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ pip install -U sqlplate
4040

4141
## :fork_and_knife: Usage
4242

43+
### Generate SQL template
44+
4345
Start by passing option parameters before generating the Delta ETL SQL statement that
4446
will be used on the Azure Databricks service.
4547

@@ -67,7 +69,7 @@ print(statement.strip().strip('\n'))
6769

6870
The result SQL statement:
6971

70-
```text
72+
```sql
7173
MERGE INTO catalog-name.schema-name.table-name AS target
7274
USING (
7375
WITH change_query AS (
@@ -108,6 +110,55 @@ WHEN NOT MATCHED THEN INSERT
108110
;
109111
```
110112

113+
### Data Quality
114+
115+
This package only handles generating SQL statements. For the data quality part, you can
116+
use the quality template.
117+
118+
> [!IMPORTANT]
119+
> This feature is not supported yet!
120+
121+
```python
122+
from sqlplate import SQLPlate
123+
124+
statement: str = (
125+
SQLPlate.format('databricks')
126+
.template('quality.check')
127+
.option('catalog', 'catalog-name')
128+
.option('schema', 'schema-name')
129+
.option('table', 'table-name')
130+
.option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')")
131+
.option('unique', ['pk_col'])
132+
.option('notnull', ['col01', 'col02'])
133+
.option("contain", [("col01", ["A", "B", "C"])])
134+
.option("validate", [("col03", "> 10000")])
135+
.load()
136+
)
137+
print(statement.strip().strip('\n'))
138+
```
139+
140+
The result SQL statement:
141+
142+
```sql
143+
WITH source AS (
144+
SELECT
145+
*
146+
FROM
147+
catalog-name.schema-name.table-name
148+
WHERE load_date >= to_timestamp('20250201', 'yyyyMMdd')
149+
)
150+
, records AS (
151+
SELECT COUNT(1) AS table_records
152+
FROM source
153+
)
154+
SELECT
155+
(SELECT table_records FROM records) AS table_records
156+
, ((SELECT COUNT( DISTINCT pk_col ) FROM source) = (SELECT table_records FROM records)) AS unique_pk_col
157+
, (SELECT COUNT(1) FROM source WHERE pk_col IS NULL) = 0 AS notnull_pk_col
158+
, (SELECT COUNT(1) FROM source WHERE col01 NOT IN ['A', 'B', 'C']) = 0 AS contain_col01
159+
, ((SELECT COUNT(1) FROM source WHERE col03 > 10000) = (SELECT table_records FROM records)) AS validate_col03
160+
```
161+
111162
## :chains: Support Systems
112163

113164
| System | Progress Status | System Integration Test | Remark |

src/sqlplate/sqlity.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/sqlplate/sqlplate.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from __future__ import annotations
77

88
from pathlib import Path
9-
from typing import Any, Iterator, Optional, Callable
9+
from typing import Any, Iterator, Optional, Callable, Literal
1010

1111
from jinja2 import Template
1212

@@ -83,6 +83,9 @@ def template(self, name: str) -> 'SQLPlate':
8383
)
8484
return self
8585

86+
def quality(self, mode: Literal["pushdown", "memory"]) -> 'SQLPlate':
87+
return self
88+
8689
def option(self, key: str, value: Any) -> 'SQLPlate':
8790
"""Pass an option key-value pair before generate template."""
8891
self._option[key] = value
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{% extends "base.jinja" %}
2+
3+
{% block statement %}
4+
WITH source AS (
5+
SELECT
6+
*
7+
FROM
8+
{{ catalog }}.{{ schema }}.{{ table }}
9+
{%+ if filter %}WHERE {{ filter }}{% endif +%}
10+
)
11+
, records AS (
12+
SELECT COUNT(1) AS table_records
13+
FROM source
14+
)
15+
SELECT
16+
(SELECT table_records FROM records) AS table_records
17+
{%+ if unique -%}
18+
{%- for col in unique -%}
19+
, ((SELECT COUNT( DISTINCT {{ col }} ) FROM source) = (SELECT table_records FROM records)) AS unique_{{ col }}
20+
{%- endfor -%}
21+
{%- endif +%}
22+
{%+ if notnull -%}
23+
{%- for col in notnull -%}
24+
, (SELECT COUNT(1) FROM source WHERE {{ col }} IS NULL) = 0 AS notnull_{{ col }}
25+
{%- endfor -%}
26+
{%- endif +%}
27+
{%+ if contain -%}
28+
{%- for col in contain -%}
29+
, (SELECT COUNT(1) FROM source WHERE {{ col[0] }} NOT IN {{ col[1] }}) = 0 AS contain_{{ col[0] }}
30+
{%- endfor -%}
31+
{%- endif +%}
32+
{%+ if validate -%}
33+
{%- for col in validate -%}
34+
, ((SELECT COUNT(1) FROM source WHERE {{ col[0] }} {{ col[1] }}) = (SELECT table_records FROM records)) AS validate_{{ col[0] }}
35+
{%- endfor -%}
36+
{%- endif +%}
37+
{% endblock statement %}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{% extends "base.jinja" %}
2+
3+
{% block statement %}
4+
WITH source AS (
5+
SELECT
6+
*
7+
FROM {{ catalog }}.{{ schema }}.{{ table }}
8+
{%+ if filter %}WHERE {{ filter }}{% endif +%}
9+
)
10+
SELECT
11+
{% endblock statement %}

tests/test_databricks.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,3 +277,40 @@ def test_sql_full_dump(template_path):
277277
FROM ( SELECT * FROM catalog-name.schema-name.source-name ) AS sub_query
278278
;
279279
""").strip('\n')
280+
281+
282+
def test_quality_check(template_path):
283+
statement: SQLPlate = (
284+
SQLPlate.format('databricks', path=template_path)
285+
.template('quality.check')
286+
.option('catalog', 'catalog-name')
287+
.option('schema', 'schema-name')
288+
.option('table', 'table-name')
289+
.option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')")
290+
.option('unique', ['pk_col'])
291+
.option('notnull', ['col01', 'col02'])
292+
.option(
293+
"contain",
294+
[("col01", ["A", "B", "C"])],
295+
)
296+
.option(
297+
"validate",
298+
[("col03", "> 10000")],
299+
)
300+
.load()
301+
)
302+
print(statement)
303+
304+
305+
def test_quality_metrix(template_path):
306+
statement: SQLPlate = (
307+
SQLPlate.format('databricks', path=template_path)
308+
.template('quality.metrix')
309+
.option('catalog', 'catalog-name')
310+
.option('schema', 'schema-name')
311+
.option('table', 'table-name')
312+
.option('filter', "load_date >= to_timestamp('20250201', 'yyyyMMdd')")
313+
.option("metrix", ["col1", "col2", "col3"])
314+
.load()
315+
)
316+
print(statement)

0 commit comments

Comments
 (0)