Commit 3a32a50

Fix message for invalid geometries
1 parent 8ae6976 commit 3a32a50

2 files changed (+74 −74 lines)


src/databricks/labs/dqx/geo/check_funcs.py

Lines changed: 30 additions & 22 deletions
@@ -472,7 +472,7 @@ def has_area_less_than(column: str | Column, limit: int | float | str | Column)
     return _compare_area(
         column,
         limit,
-        compare_op=py_operator.eq,
+        compare_op=py_operator.gt,
         compare_op_label="greater than",
         compare_op_name="greater_than",
     )
@@ -496,7 +496,7 @@ def has_area_greater_than(column: str | Column, limit: int | float | str | Colum
     return _compare_area(
         column,
         limit,
-        compare_op=py_operator.eq,
+        compare_op=py_operator.lt,
         compare_op_label="less than",
         compare_op_name="less_than",
     )
@@ -521,7 +521,7 @@ def has_num_points_less_than(column: str | Column, limit: int | float | str | Co
     return _compare_num_points(
         column,
         limit,
-        compare_op=py_operator.eq,
+        compare_op=py_operator.gt,
         compare_op_label="greater than",
         compare_op_name="greater_than",
     )
@@ -546,7 +546,7 @@ def has_num_points_greater_than(column: str | Column, limit: int | float | str |
     return _compare_num_points(
         column,
         limit,
-        compare_op=py_operator.eq,
+        compare_op=py_operator.lt,
         compare_op_label="less than",
         compare_op_name="less_than",
     )
@@ -580,18 +580,22 @@ def _compare_area(
     # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in
     # Databricks SQL, due to the use of the `try_to_geometry` and `st_area` functions.
     geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL")
-    geom_type_cond = compare_op(F.expr(f"st_area(try_to_geometry({col_str_norm}))"), limit_expr)
-    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond)
+    geom_cond_message = F.concat_ws(
+        "", F.lit("value `"), col_expr.cast("string"), F.lit(f"` in column `{col_expr_str}` is not a valid geometry")
+    )
+    geom_area_cond = compare_op(F.expr(f"st_area(try_to_geometry({col_str_norm}))"), limit_expr)
+    geom_area_message = F.concat_ws(
+        "",
+        F.lit("value `"),
+        col_expr.cast("string"),
+        F.lit(f"` in column `{col_expr_str}` has area {compare_op_label} limit: "),
+        limit_expr.cast("string"),
+    )
+    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_area_cond)

     return make_condition(
         condition,
-        F.concat_ws(
-            "",
-            F.lit("value `"),
-            col_expr.cast("string"),
-            F.lit(f"` in column `{col_expr_str}` has area {compare_op_label} limit: "),
-            limit_expr.cast("string"),
-        ),
+        F.when(geom_area_cond, geom_area_message).otherwise(geom_cond_message),
         f"{col_str_norm}_area_{compare_op_name}_limit",
     )

@@ -625,17 +629,21 @@ def _compare_num_points(
     # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in
     # Databricks SQL, due to the use of the `try_to_geometry` and `st_area` functions.
     geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL")
-    geom_type_cond = compare_op(F.expr(f"st_npoints(try_to_geometry({col_str_norm}))"), limit_expr)
-    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond)
+    geom_cond_message = F.concat_ws(
+        "", F.lit("value `"), col_expr.cast("string"), F.lit(f"` in column `{col_expr_str}` is not a valid geometry")
+    )
+    geom_num_points_cond = compare_op(F.expr(f"st_npoints(try_to_geometry({col_str_norm}))"), limit_expr)
+    geom_num_points_message = F.concat_ws(
+        "",
+        F.lit("value `"),
+        col_expr.cast("string"),
+        F.lit(f"` in column `{col_expr_str}` has number of coordinates {compare_op_label} limit: "),
+        limit_expr.cast("string"),
+    )
+    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_num_points_cond)

     return make_condition(
         condition,
-        F.concat_ws(
-            "",
-            F.lit("value `"),
-            col_expr.cast("string"),
-            F.lit(f"` in column `{col_expr_str}` has number of coordinates {compare_op_label} limit: "),
-            limit_expr.cast("string"),
-        ),
+        F.when(geom_num_points_cond, geom_num_points_message).otherwise(geom_cond_message),
         f"{col_str_norm}_num_points_{compare_op_name}_limit",
     )
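
For reference, a minimal standalone sketch of the message-selection pattern the new `_compare_area` code uses: the error condition covers both an unparseable geometry and an over-limit area, while the message column picks whichever text matches the actual failure. The column name `geom` and the 2.0 limit are illustrative assumptions, not part of the commit; `try_to_geometry` and `st_area` require Databricks Runtime 17.1+ or Databricks SQL, as noted in the diff.

    # Sketch only: mirrors the new _compare_area logic with hard-coded names.
    from pyspark.sql import functions as F

    col_name = "geom"          # assumed column name
    limit_expr = F.lit(2.0)    # assumed area limit
    col_expr = F.col(col_name)

    invalid_geom_cond = F.expr(f"try_to_geometry({col_name}) IS NULL")
    area_cond = F.expr(f"st_area(try_to_geometry({col_name}))") > limit_expr

    invalid_geom_msg = F.concat_ws(
        "", F.lit("value `"), col_expr.cast("string"),
        F.lit(f"` in column `{col_name}` is not a valid geometry"),
    )
    area_msg = F.concat_ws(
        "", F.lit("value `"), col_expr.cast("string"),
        F.lit(f"` in column `{col_name}` has area greater than limit: "),
        limit_expr.cast("string"),
    )

    # NULL inputs produce no error; otherwise the check fires on either failure,
    # and the message reflects the branch that actually failed instead of always
    # reporting an area violation (the behavior this commit fixes).
    condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(invalid_geom_cond | area_cond)
    message = F.when(area_cond, area_msg).otherwise(invalid_geom_msg)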

tests/integration/test_row_checks_geo.py

Lines changed: 44 additions & 52 deletions
@@ -2,6 +2,8 @@
 from databricks.labs.dqx.geo.check_funcs import (
     has_area_greater_than,
     has_area_less_than,
+    has_num_points_greater_than,
+    has_num_points_less_than,
     has_dimension,
     has_x_coordinate_between,
     has_y_coordinate_between,
@@ -18,8 +20,6 @@
     is_point,
     is_polygon,
     is_ogc_valid,
-    has_num_points_greater_than,
-    has_num_points_less_than,
 )


@@ -404,133 +404,125 @@ def test_has_y_coordinate_between(skip_if_runtime_not_geo_compatible, spark):
     assert_df_equality(actual, expected, ignore_nullable=True)


-def test_area_not_greater_than(skip_if_runtime_not_geo_compatible, spark):
+def test_has_area_less_than(skip_if_runtime_not_geo_compatible, spark):
     test_df = spark.sql(
         """
         SELECT geom FROM VALUES
         ('POINT(0 0)'),  -- Point has area 0
-        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- Unit square has area 1
-        ('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'),  -- 2x2 square has area 4
+        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- Unit square has area 1
+        ('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'),  -- 2x2 square has area 4
         ('invalid-geometry'),  -- Invalid geometry
         (null)  -- Null geometry
         AS data(geom)
-    """
+        """
     )

-    actual = test_df.select(has_area_greater_than("geom", 2.0))
+    actual = test_df.select(has_area_less_than("geom", 2.0))

     checked_schema = "geom_area_greater_than_limit: string"
     expected = spark.createDataFrame(
         [
-            [None],  # Point area (0) <= 2.0, so no error
-            [None],  # Square area (1) <= 2.0, so no error
-            [
-                "value `POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))` in column `geom` has area greater than limit: 2.0"
-            ],  # Area (4) > 2.0
-            ["value `invalid-geometry` in column `geom` has area greater than limit: 2.0"],  # Invalid geometry
-            [None],  # Null geometry
+            [None],
+            [None],
+            ["value `POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))` in column `geom` has area greater than limit: 2.0"],
+            ["value `invalid-geometry` in column `geom` is not a valid geometry"],
+            [None],
         ],
         checked_schema,
     )

     assert_df_equality(actual, expected, ignore_nullable=True)


-def test_area_not_less_than(skip_if_runtime_not_geo_compatible, spark):
+def test_has_area_greater_than(skip_if_runtime_not_geo_compatible, spark):
     test_df = spark.sql(
         """
         SELECT geom FROM VALUES
         ('POINT(0 0)'),  -- Point has area 0
-        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- Unit square has area 1
-        ('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'),  -- 2x2 square has area 4
+        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- Unit square has area 1
+        ('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'),  -- 2x2 square has area 4
         ('invalid-geometry'),  -- Invalid geometry
         (null)  -- Null geometry
         AS data(geom)
-    """
+        """
     )

-    actual = test_df.select(has_area_less_than("geom", 1.0))
+    actual = test_df.select(has_area_greater_than("geom", 1.0))

     checked_schema = "geom_area_less_than_limit: string"
     expected = spark.createDataFrame(
         [
-            ["value `POINT(0 0)` in column `geom` has area less than limit: 1.0"],  # Point area (0) < 1.0
-            [None],  # Square area (1) >= 1.0, so no error
-            [None],  # Square area (4) >= 1.0, so no error
-            ["value `invalid-geometry` in column `geom` has area less than limit: 1.0"],  # Invalid geometry
-            [None],  # Null geometry
+            ["value `POINT(0 0)` in column `geom` has area less than limit: 1.0"],
+            [None],
+            [None],
+            ["value `invalid-geometry` in column `geom` is not a valid geometry"],
+            [None],
         ],
         checked_schema,
     )

     assert_df_equality(actual, expected, ignore_nullable=True)


-def test_num_points_not_greater_than(skip_if_runtime_not_geo_compatible, spark):
+def test_has_num_points_less_than(skip_if_runtime_not_geo_compatible, spark):
     test_df = spark.sql(
         """
         SELECT geom FROM VALUES
         ('POINT(0 0)'),  -- 1 point
-        ('LINESTRING(0 0, 1 1)'),  -- 2 points
-        ('LINESTRING(0 0, 1 1, 2 2)'),  -- 3 points
-        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- 5 points (including closing point)
+        ('LINESTRING(0 0, 1 1)'),  -- 2 points
+        ('LINESTRING(0 0, 1 1, 2 2)'),  -- 3 points
+        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- 5 points (including closing point)
         ('invalid-geometry'),  -- Invalid geometry
         (null)  -- Null geometry
         AS data(geom)
-    """
+        """
     )

     actual = test_df.select(has_num_points_less_than("geom", 3))

     checked_schema = "geom_num_points_greater_than_limit: string"
     expected = spark.createDataFrame(
         [
-            [None],  # Point (1 point) <= 3, so no error
-            [None],  # LineString (2 points) <= 3, so no error
-            [None],  # LineString (3 points) <= 3, so no error
+            [None],
+            [None],
+            [None],
             [
                 "value `POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))` in column `geom` has number of coordinates greater than limit: 3"
-            ],  # Polygon (5 points) > 3
-            [
-                "value `invalid-geometry` in column `geom` has number of coordinates greater than limit: 3"
-            ],  # Invalid geometry
-            [None],  # Null geometry
+            ],
+            ["value `invalid-geometry` in column `geom` is not a valid geometry"],
+            [None],
         ],
         checked_schema,
     )

     assert_df_equality(actual, expected, ignore_nullable=True)


-def test_num_points_not_less_than(skip_if_runtime_not_geo_compatible, spark):
+def test_has_num_points_greater_than(skip_if_runtime_not_geo_compatible, spark):
     test_df = spark.sql(
         """
         SELECT geom FROM VALUES
         ('POINT(0 0)'),  -- 1 point
-        ('LINESTRING(0 0, 1 1)'),  -- 2 points
-        ('LINESTRING(0 0, 1 1, 2 2)'),  -- 3 points
-        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- 5 points (including closing point)
+        ('LINESTRING(0 0, 1 1)'),  -- 2 points
+        ('LINESTRING(0 0, 1 1, 2 2)'),  -- 3 points
+        ('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'),  -- 5 points (including closing point)
         ('invalid-geometry'),  -- Invalid geometry
         (null)  -- Null geometry
         AS data(geom)
-    """
+        """
     )

     actual = test_df.select(has_num_points_greater_than("geom", 3))

     checked_schema = "geom_num_points_less_than_limit: string"
     expected = spark.createDataFrame(
         [
-            ["value `POINT(0 0)` in column `geom` has number of coordinates less than limit: 3"],  # Point (1 point) < 3
-            [
-                "value `LINESTRING(0 0, 1 1)` in column `geom` has number of coordinates less than limit: 3"
-            ],  # LineString (2 points) < 3
-            [None],  # LineString (3 points) >= 3, so no error
-            [None],  # Polygon (5 points) >= 3, so no error
-            [
-                "value `invalid-geometry` in column `geom` has number of coordinates less than limit: 3"
-            ],  # Invalid geometry
-            [None],  # Null geometry
+            ["value `POINT(0 0)` in column `geom` has number of coordinates less than limit: 3"],
+            ["value `LINESTRING(0 0, 1 1)` in column `geom` has number of coordinates less than limit: 3"],
+            [None],
+            [None],
+            ["value `invalid-geometry` in column `geom` is not a valid geometry"],
+            [None],
         ],
         checked_schema,
    )
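
A minimal usage sketch mirroring the updated tests (assuming an active `spark` session on a geo-compatible runtime, i.e. Databricks Runtime 17.1+ or Databricks SQL): an out-of-range polygon still reports the area message, while an unparseable value now reports that it is not a valid geometry.

    from databricks.labs.dqx.geo.check_funcs import has_area_less_than

    df = spark.sql(
        """
        SELECT geom FROM VALUES
        ('POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))'),  -- area 4, exceeds the limit
        ('invalid-geometry')                     -- not parseable as a geometry
        AS data(geom)
        """
    )

    df.select(has_area_less_than("geom", 2.0)).show(truncate=False)
    # Expected messages, per the tests above:
    #   value `POLYGON((0 0, 2 0, 2 2, 0 2, 0 0))` in column `geom` has area greater than limit: 2.0
    #   value `invalid-geometry` in column `geom` is not a valid geometry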
