Skip to content

Commit 6640a06

Browse files
authored
fix(athena): escape single quotes in iceberg DDL splices (#3372)
`_create_iceberg_table` interpolates `columns_comments` values and `additional_table_properties` keys/values into a generated CREATE TABLE DDL with no escaping. A value containing a single quote can close the surrounding string literal and append additional clauses (LOCATION, TBLPROPERTIES, etc.). Add `_escape_athena_string_literal` that doubles single quotes per the Athena/Trino string-literal escape rule, and apply it at both splice sites. Both keys and values in `additional_table_properties` are escaped for symmetry — the cost is one function call. Add a unit test in `test_moto.py` that exercises both splice sites with a representative payload and asserts the structure of the generated DDL is preserved.
1 parent e71f775 commit 6640a06

2 files changed

Lines changed: 65 additions & 2 deletions

File tree

awswrangler/athena/_write_iceberg.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,21 @@
2424
_logger: logging.Logger = logging.getLogger(__name__)
2525

2626

27+
def _escape_athena_string_literal(value: Any) -> str:
28+
# Used for caller-supplied values spliced inside SQL string literals, e.g.
29+
# COMMENT '<value>' or TBLPROPERTIES ('<key>'='<value>').
30+
#
31+
# The wrapping single quotes around the splice site are *delimiters* — they tell
32+
# Athena "this is a string literal" and are part of the DDL grammar, not escaping.
33+
# A naive splice f"'{value}'" lets a value containing ' close the literal and
34+
# append arbitrary DDL (e.g. LOCATION '...').
35+
#
36+
# Athena/Trino string literals have exactly one escape mechanism: a single quote
37+
# inside the literal must be doubled. Pre-doubling caller quotes here means the
38+
# whole value parses as one literal regardless of what it contains.
39+
return str(value).replace("'", "''")
40+
41+
2742
def _create_iceberg_table(
2843
df: pd.DataFrame,
2944
database: str,
@@ -50,13 +65,19 @@ def _create_iceberg_table(
5065
[
5166
f"{k} {v}"
5267
if (columns_comments is None or columns_comments.get(k) is None)
53-
else f"{k} {v} COMMENT '{columns_comments[k]}'"
68+
else f"{k} {v} COMMENT '{_escape_athena_string_literal(columns_comments[k])}'"
5469
for k, v in columns_types.items()
5570
]
5671
)
5772
partition_cols_str: str = f"PARTITIONED BY ({', '.join([col for col in partition_cols])})" if partition_cols else ""
5873
table_properties_str: str = (
59-
", " + ", ".join([f"'{key}'='{value}'" for key, value in additional_table_properties.items()])
74+
", "
75+
+ ", ".join(
76+
[
77+
f"'{_escape_athena_string_literal(key)}'='{_escape_athena_string_literal(value)}'"
78+
for key, value in additional_table_properties.items()
79+
]
80+
)
6081
if additional_table_properties
6182
else ""
6283
)

tests/unit/test_moto.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,3 +802,45 @@ def mock_make_api_call(self, operation_name, kwarg):
802802
allow_full_scan=True,
803803
)
804804
assert describe_table_calls == 1
805+
806+
807+
def test_create_iceberg_table_escapes_single_quotes_in_columns_comments() -> None:
808+
# Single quotes in caller-supplied columns_comments / additional_table_properties
809+
# values must be doubled so they cannot terminate the surrounding 'literal' and
810+
# change the structure of the generated DDL.
811+
from awswrangler.athena import _write_iceberg
812+
813+
captured: list[str] = []
814+
815+
def fake_start(*, sql: str, **_) -> str:
816+
captured.append(sql)
817+
return "qid"
818+
819+
df = pd.DataFrame({"id": pd.Series(dtype="int64"), "user_name": pd.Series(dtype="string")})
820+
wg_config = mock.MagicMock()
821+
wg_config.enforce_workgroup_location = False
822+
823+
with mock.patch.object(_write_iceberg, "_start_query_execution", side_effect=fake_start), mock.patch.object(
824+
_write_iceberg, "wait_query"
825+
):
826+
_write_iceberg._create_iceberg_table(
827+
df=df,
828+
database="db",
829+
table="t",
830+
path="s3://intended/output/",
831+
wg_config=wg_config,
832+
partition_cols=None,
833+
additional_table_properties={"prop": "val') LOCATION 's3://other/' --"},
834+
index=False,
835+
boto3_session=mock.MagicMock(),
836+
columns_comments={"user_name": "') LOCATION 's3://other/' TBLPROPERTIES ('x'='y"},
837+
)
838+
839+
sql = captured[0]
840+
# Quotes were doubled in both splices, so unescaped caller content stays inside the
841+
# COMMENT / TBLPROPERTIES string literals and does not open a new DDL clause.
842+
assert "COMMENT ''') LOCATION ''s3://other/'' TBLPROPERTIES (''x''=''y'" in sql
843+
assert "'prop'='val'') LOCATION ''s3://other/'' --'" in sql
844+
# The intended LOCATION (un-doubled quotes) is the only top-level clause.
845+
assert "LOCATION 's3://intended/output/'" in sql
846+
assert "LOCATION 's3://other/'" not in sql

0 commit comments

Comments
 (0)