-
Notifications
You must be signed in to change notification settings - Fork 3.5k
fix(redshift): use boundary-aware segment stitching for query reconstruction #16253
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
347a23b
c1dcc44
d156355
1a1d35e
1b5c3b0
1ffff9a
e27b075
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Basically, we are just doing string contains validations So I would remove the focus on "stitching" and make this just a generic test for redshift queries
So we can add more validations in the future
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Revised. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| """Tests for Redshift query generation and validation. | ||
|
|
||
| Covers query patterns used across RedshiftProvisionedQuery and RedshiftServerlessQuery, | ||
| including segment stitching strategies that preserve word boundaries when reconstructing | ||
| queries from fixed-width character segments (200 bytes provisioned, 4000 bytes serverless). | ||
| """ | ||
|
|
||
| from datetime import datetime | ||
|
|
||
| from datahub.ingestion.source.redshift.query import ( | ||
| RedshiftProvisionedQuery, | ||
| RedshiftServerlessQuery, | ||
| ) | ||
|
|
||
| START_TIME = datetime(2024, 1, 1, 12, 0, 0) | ||
| END_TIME = datetime(2024, 1, 10, 12, 0, 0) | ||
|
|
||
| # The boundary-aware LISTAGG pattern for 200-byte segments (provisioned). | ||
| # Appends a space when the trimmed segment is shorter than the segment size, | ||
| # indicating a word boundary was at the segment edge. | ||
| PROVISIONED_LISTAGG_PATTERN = ( | ||
| "RTRIM(LISTAGG(RTRIM(text) " | ||
| "|| CASE WHEN LEN(RTRIM(text)) < 200 THEN ' ' ELSE '' END, '')" | ||
| ) | ||
|
|
||
| # The boundary-aware LISTAGG pattern for 4000-byte segments (serverless). | ||
| SERVERLESS_LISTAGG_PATTERN_TEXT = ( | ||
| 'RTRIM(LISTAGG(RTRIM(qt."text") ' | ||
| "|| CASE WHEN LEN(RTRIM(qt.\"text\")) < 4000 THEN ' ' ELSE '' END, '')" | ||
| ) | ||
|
|
||
| SERVERLESS_LISTAGG_PATTERN_QUERYTXT = ( | ||
| "RTRIM(LISTAGG(RTRIM(querytxt) " | ||
| "|| CASE WHEN LEN(RTRIM(querytxt)) < 4000 THEN ' ' ELSE '' END, '')" | ||
| ) | ||
|
|
||
|
|
||
| class TestProvisionedQueries: | ||
| def test_list_insert_create_queries_uses_boundary_aware_listagg(self): | ||
| sql = RedshiftProvisionedQuery.list_insert_create_queries_sql( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert PROVISIONED_LISTAGG_PATTERN in sql | ||
|
|
||
| def test_temp_table_ddl_query_uses_boundary_aware_listagg(self): | ||
| sql = RedshiftProvisionedQuery.temp_table_ddl_query( | ||
| start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert PROVISIONED_LISTAGG_PATTERN in sql | ||
|
|
||
| def test_stl_scan_based_lineage_uses_boundary_aware_listagg(self): | ||
| sql = RedshiftProvisionedQuery.stl_scan_based_lineage_query( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert PROVISIONED_LISTAGG_PATTERN in sql | ||
|
|
||
| def test_stl_scan_based_lineage_uses_cte_not_stl_query(self): | ||
| """The provisioned scan lineage query should use a CTE from STL_QUERYTEXT | ||
| instead of stl_query.querytxt (which is truncated to 4000 chars).""" | ||
| sql = RedshiftProvisionedQuery.stl_scan_based_lineage_query( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert "WITH query_txt AS" in sql | ||
| assert "STL_QUERYTEXT" in sql | ||
| # Should join query_txt CTE (not stl_query table) for querytxt | ||
| assert "join query_txt sq" in sql.lower() | ||
| # Should NOT join stl_query table directly (only stl_querytext via CTE) | ||
| assert "join stl_query " not in sql.lower() | ||
|
|
||
| def test_no_old_listagg_pattern_provisioned(self): | ||
| """Ensure the old LISTAGG pattern with LEN(RTRIM(text)) = 0 is gone.""" | ||
| for sql in [ | ||
| RedshiftProvisionedQuery.list_insert_create_queries_sql( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ), | ||
| RedshiftProvisionedQuery.temp_table_ddl_query( | ||
| start_time=START_TIME, end_time=END_TIME | ||
| ), | ||
| ]: | ||
| assert "LEN(RTRIM(text)) = 0" not in sql | ||
|
|
||
|
|
||
| class TestServerlessQueries: | ||
| def test_stl_scan_based_lineage_uses_boundary_aware_listagg(self): | ||
| sql = RedshiftServerlessQuery.stl_scan_based_lineage_query( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert SERVERLESS_LISTAGG_PATTERN_TEXT in sql | ||
|
|
||
| def test_list_insert_create_queries_uses_boundary_aware_listagg(self): | ||
| sql = RedshiftServerlessQuery.list_insert_create_queries_sql( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert SERVERLESS_LISTAGG_PATTERN_QUERYTXT in sql | ||
|
|
||
| def test_temp_table_ddl_query_uses_boundary_aware_listagg(self): | ||
| sql = RedshiftServerlessQuery.temp_table_ddl_query( | ||
| start_time=START_TIME, end_time=END_TIME | ||
| ) | ||
| assert SERVERLESS_LISTAGG_PATTERN_TEXT in sql | ||
|
|
||
| def test_no_old_listagg_pattern_serverless(self): | ||
| """Ensure the old bare LISTAGG(qt."text") pattern is gone for serverless.""" | ||
| for sql in [ | ||
| RedshiftServerlessQuery.stl_scan_based_lineage_query( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ), | ||
| RedshiftServerlessQuery.list_insert_create_queries_sql( | ||
| db_name="test_db", start_time=START_TIME, end_time=END_TIME | ||
| ), | ||
| RedshiftServerlessQuery.temp_table_ddl_query( | ||
| start_time=START_TIME, end_time=END_TIME | ||
| ), | ||
| ]: | ||
| # Should not have bare LISTAGG without RTRIM wrapper | ||
| assert 'LISTAGG(qt."text")' not in sql | ||
| assert "LEN(RTRIM(querytxt)) = 0" not in sql |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Just wondering how is the implementation in PR is slightly different to the one suggested here
https://docs.aws.amazon.com/redshift/latest/dg/r_SVL_STATEMENTTEXT.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AWS's approach doesn't fix the keyword merge problem. Their condition LEN(RTRIM(text)) = 0 only preserves completely empty segments. A segment like "GROUP " becomes "GROUP" (length 5, not 0), which merges with the next "BY" segment -> "GROUPBY"
Our approach uses segment size as a boundary detector: if LEN(RTRIM(text)) < segment_size, we add a space back because the segment likely reached a natural boundary (not just had trailing spaces stripped).