Skip to content

Commit a3a392d

Browse files
authored
Merge branch 'main' into dependabot/uv/production-dependencies-51bc56ea16
2 parents b987f45 + a4e72e4 commit a3a392d

2 files changed

Lines changed: 154 additions & 17 deletions

File tree

awswrangler/neptune/_neptune.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def to_property_graph(
170170
... )
171171
"""
172172
# check if ~id and ~label column exist and if not throw error
173-
g = gremlin.traversal().withGraph(gremlin.Graph())
173+
g = gremlin.Graph().traversal()
174174
is_edge_df = False
175175
is_update_df = True
176176
if "~id" in df.columns:
@@ -203,6 +203,24 @@ def to_property_graph(
203203
return _run_gremlin_insert(client, g)
204204

205205

206+
# SPARQL 1.1 IRIREF grammar: '<' ([^<>"{}|^`\]-[#x00-#x20])* '>'
207+
# A cell value spliced between '<' and '>' must contain only the characters allowed
208+
# inside the IRIREF token. Anything else can close the token and inject arbitrary
209+
# SPARQL UPDATE syntax (DELETE / DROP / LOAD / ...).
210+
_IRIREF_INNER_RE = re.compile(r"^[^\x00-\x20<>\"{}|^`\\]*$")
211+
212+
213+
def _validate_iriref_cell(value: Any, column: str, row_index: int) -> str:
214+
text = str(value)
215+
if not _IRIREF_INNER_RE.match(text):
216+
raise exceptions.InvalidArgumentValue(
217+
f"Value in column {column!r} at row index {row_index} is not a valid IRI: "
218+
f"{text!r}. Cells written by `to_rdf_graph` must conform to the SPARQL "
219+
'IRIREF grammar (no whitespace, control characters, or any of <>"{}|^`\\).'
220+
)
221+
return text
222+
223+
206224
@_utils.check_optional_dependency(sparql, "SPARQLWrapper")
207225
def to_rdf_graph(
208226
client: NeptuneClient,
@@ -267,14 +285,18 @@ def to_rdf_graph(
267285
query = ""
268286
# Loop through items in the DF
269287
for i, (_, row) in enumerate(df.iterrows()):
288+
subject = _validate_iriref_cell(row[subject_column], subject_column, i)
289+
predicate = _validate_iriref_cell(row[predicate_column], predicate_column, i)
290+
obj = _validate_iriref_cell(row[object_column], object_column, i)
270291
# build up a query
271292
if is_quads:
272-
insert = f"""INSERT DATA {{ GRAPH <{row[graph_column]}> {{<{row[subject_column]}>
273-
<{str(row[predicate_column])}> <{row[object_column]}> . }} }}; """
293+
graph = _validate_iriref_cell(row[graph_column], graph_column, i)
294+
insert = f"""INSERT DATA {{ GRAPH <{graph}> {{<{subject}>
295+
<{predicate}> <{obj}> . }} }}; """
274296
query = query + insert
275297
else:
276-
insert = f"""INSERT DATA {{ <{row[subject_column]}> <{str(row[predicate_column])}>
277-
<{row[object_column]}> . }}; """
298+
insert = f"""INSERT DATA {{ <{subject}> <{predicate}>
299+
<{obj}> . }}; """
278300
query = query + insert
279301
# run the query
280302
if i > 0 and i % batch_size == 0:

tests/unit/test_neptune_parsing.py

Lines changed: 127 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import logging
2+
from unittest.mock import MagicMock
23

34
import pytest # type: ignore
45
from gremlin_python.process.traversal import T
56
from gremlin_python.structure.graph import Edge, Path, Property, Vertex, VertexProperty
67

78
import awswrangler as wr
89
import awswrangler.pandas as pd
10+
from awswrangler import exceptions
11+
from awswrangler.neptune._client import NeptuneClient
912

1013
logging.getLogger("awswrangler").setLevel(logging.DEBUG)
1114

@@ -29,7 +32,7 @@ def test_parse_gremlin_vertex_elements(gremlin_parser):
2932
assert df.shape == (1, 3)
3033
assert row["id"] == "foo"
3134
assert row["label"] == "vertex"
32-
assert row["properties"] is None
35+
assert not row["properties"] # gremlinpython <3.8 returns None, >=3.8 returns []
3336

3437
# parse multiple vertex elements
3538
v1 = Vertex("bar")
@@ -40,7 +43,7 @@ def test_parse_gremlin_vertex_elements(gremlin_parser):
4043
assert df.shape == (2, 3)
4144
assert row["id"] == "bar"
4245
assert row["label"] == "vertex"
43-
assert row["properties"] is None
46+
assert not row["properties"] # gremlinpython <3.8 returns None, >=3.8 returns []
4447

4548

4649
# parse Edge elements
@@ -56,7 +59,7 @@ def test_parse_gremlin_edge_elements(gremlin_parser):
5659
assert row["outV"] == "out1"
5760
assert row["label"] == "label"
5861
assert row["inV"] == "in1"
59-
assert row["properties"] is None
62+
assert not row["properties"] # gremlinpython <3.8 returns None, >=3.8 returns []
6063

6164
# parse multiple edge elements
6265
v1 = Edge("bar", "out1", "label", "in2")
@@ -69,7 +72,7 @@ def test_parse_gremlin_edge_elements(gremlin_parser):
6972
assert row["outV"] == "out1"
7073
assert row["label"] == "label"
7174
assert row["inV"] == "in2"
72-
assert row["properties"] is None
75+
assert not row["properties"] # gremlinpython <3.8 returns None, >=3.8 returns []
7376

7477

7578
# parse Property elements
@@ -86,7 +89,7 @@ def test_parse_gremlin_property_elements(gremlin_parser):
8689
assert row["value"] == "bar"
8790
assert row["key"] == "name"
8891
assert row["vertex"] == "v1"
89-
assert row["properties"] is None
92+
assert not row["properties"] # gremlinpython <3.8 returns None, >=3.8 returns []
9093

9194
v = Property("foo", "name", "bar")
9295
input = [v]
@@ -100,6 +103,11 @@ def test_parse_gremlin_property_elements(gremlin_parser):
100103

101104

102105
# parse Path elements
106+
def _normalize_properties(d: dict) -> dict:
107+
# gremlinpython <3.8 returns properties=None, >=3.8 returns properties=[].
108+
return {k: (None if k == "properties" and not v else v) for k, v in d.items()}
109+
110+
103111
def test_parse_gremlin_path_elements(gremlin_parser):
104112
# parse path with elements
105113
v = Vertex("foo")
@@ -110,9 +118,15 @@ def test_parse_gremlin_path_elements(gremlin_parser):
110118
df = pd.DataFrame.from_records(out)
111119
row = df.iloc[0]
112120
assert df.shape == (1, 3)
113-
assert row[0] == {"id": "foo", "label": "vertex", "properties": None}
114-
assert row[1] == {"id": "e1", "label": "label", "outV": "foo", "inV": "bar", "properties": None}
115-
assert row[2] == {"id": "bar", "label": "vertex", "properties": None}
121+
assert _normalize_properties(row[0]) == {"id": "foo", "label": "vertex", "properties": None}
122+
assert _normalize_properties(row[1]) == {
123+
"id": "e1",
124+
"label": "label",
125+
"outV": "foo",
126+
"inV": "bar",
127+
"properties": None,
128+
}
129+
assert _normalize_properties(row[2]) == {"id": "bar", "label": "vertex", "properties": None}
116130

117131
# parse path with multiple elements
118132
e2 = Edge("bar", "out1", "label", "in2")
@@ -122,9 +136,15 @@ def test_parse_gremlin_path_elements(gremlin_parser):
122136
df = pd.DataFrame.from_records(out)
123137
row = df.iloc[1]
124138
assert df.shape == (2, 3)
125-
assert row[0] == {"id": "bar", "label": "vertex", "properties": None}
126-
assert row[1] == {"id": "bar", "label": "label", "outV": "out1", "inV": "in2", "properties": None}
127-
assert row[2] == {"id": "in2", "label": "vertex", "properties": None}
139+
assert _normalize_properties(row[0]) == {"id": "bar", "label": "vertex", "properties": None}
140+
assert _normalize_properties(row[1]) == {
141+
"id": "bar",
142+
"label": "label",
143+
"outV": "out1",
144+
"inV": "in2",
145+
"properties": None,
146+
}
147+
assert _normalize_properties(row[2]) == {"id": "in2", "label": "vertex", "properties": None}
128148

129149
# parse path with maps
130150
p = Path(
@@ -152,7 +172,13 @@ def test_parse_gremlin_path_elements(gremlin_parser):
152172
assert df.shape == (1, 3)
153173
assert row[0]["name"] == "foo"
154174
assert row[0]["age"] == 29
155-
assert row[1] == {"id": "bar", "label": "label", "outV": "out1", "inV": "in2", "properties": None}
175+
assert _normalize_properties(row[1]) == {
176+
"id": "bar",
177+
"label": "label",
178+
"outV": "out1",
179+
"inV": "in2",
180+
"properties": None,
181+
}
156182
assert row[2]["name"] == "bar"
157183
assert row[2]["age"] == 40
158184

@@ -216,3 +242,92 @@ def test_parse_gremlin_subgraph(gremlin_parser):
216242
assert df.shape == (1, 2)
217243
assert row["@type"] == "tinker:graph"
218244
assert row["@value"] == {"vertices": ["v[45]", "v[9]"], "edges": ["e[3990][9-route->45]"]}
245+
246+
247+
# to_rdf_graph IRIREF validation: caller-supplied DataFrame cells must conform to
248+
# the SPARQL IRIREF grammar so they cannot close the <...> token and inject
249+
# arbitrary SPARQL UPDATE syntax (DELETE / DROP / LOAD / ...).
250+
251+
252+
def _rdf_triples_df() -> pd.DataFrame:
253+
return pd.DataFrame(
254+
{
255+
"s": ["http://example.org/alice", "http://example.org/bob"],
256+
"p": ["http://xmlns.com/foaf/0.1/name", "http://xmlns.com/foaf/0.1/name"],
257+
"o": ["http://example.org/AliceName", "http://example.org/BobName"],
258+
}
259+
)
260+
261+
262+
def _mock_neptune_client() -> MagicMock:
    """Return a mock spec'd on ``NeptuneClient`` whose ``write_sparql`` returns ``True``."""
    mock_client = MagicMock(spec=NeptuneClient)
    mock_client.write_sparql.return_value = True
    return mock_client
266+
267+
268+
def test_to_rdf_graph_accepts_well_formed_iris():
    """Well-formed IRIs pass validation and appear in the single SPARQL write."""
    neptune = _mock_neptune_client()
    triples = _rdf_triples_df()

    outcome = wr.neptune.to_rdf_graph(neptune, triples)

    assert outcome is True
    neptune.write_sparql.assert_called_once()
    sent_query = neptune.write_sparql.call_args.args[0]
    assert "<http://example.org/alice>" in sent_query
    assert "<http://example.org/bob>" in sent_query
277+
278+
279+
# (cell value, column it is written into) pairs that must be rejected.
_MALICIOUS_CELLS = [
    # Bug-bounty PoC payload: closes the IRI, runs DELETE WHERE, reopens INSERT.
    (
        "> . }; DELETE WHERE { ?s ?p ?o }; "
        "INSERT DATA { <http://evil.com/x> <http://evil.com/y> <http://evil.com/z",
        "o",
    ),
    # DROP ALL smuggled through the subject slot.
    ("http://x.com/s> <http://x.com/p> <http://x.com/o> . }; DROP ALL ; INSERT DATA { <a", "s"),
    # LOAD smuggled through the predicate slot.
    (
        "http://x.com/p> <http://x.com/o> . }; LOAD <http://evil.com/payload.ttl> ; "
        "INSERT DATA { <http://x.com/s> <http://x.com/p2",
        "p",
    ),
    # Whitespace or a stray angle bracket alone already breaks the IRIREF token.
    ("http://example.org/ has space", "o"),
    ("http://example.org/a\n<http://x>", "o"),
    ("http://example.org/<inner>", "s"),
]


@pytest.mark.parametrize("malicious_cell, column", _MALICIOUS_CELLS)
def test_to_rdf_graph_rejects_malicious_cells(malicious_cell, column):
    """A cell that escapes the IRIREF token raises before anything is sent."""
    neptune = _mock_neptune_client()
    frame = _rdf_triples_df()
    frame.loc[0, column] = malicious_cell

    with pytest.raises(exceptions.InvalidArgumentValue, match="not a valid IRI"):
        wr.neptune.to_rdf_graph(neptune, frame)

    # The offending row is the first one, so no write may have happened yet.
    neptune.write_sparql.assert_not_called()
311+
312+
313+
def test_to_rdf_graph_rejects_malicious_graph_column_for_quads():
    """The graph column of a quads DataFrame goes through the same IRI validation."""
    neptune = _mock_neptune_client()
    quads = _rdf_triples_df()
    quads["g"] = ["http://example.org/g1", "http://example.org/g2"]
    # Overwrite the first graph IRI with a payload that closes the GRAPH block.
    injected_graph = "http://x> {} }; DROP ALL ; INSERT DATA { GRAPH <http://x> { <a"
    quads.loc[0, "g"] = injected_graph

    with pytest.raises(exceptions.InvalidArgumentValue, match="'g'"):
        wr.neptune.to_rdf_graph(neptune, quads)

    neptune.write_sparql.assert_not_called()
322+
323+
324+
def test_to_rdf_graph_error_identifies_row_and_column():
    """The validation error names both the offending column and the row index."""
    neptune = _mock_neptune_client()
    frame = _rdf_triples_df()
    frame.loc[1, "o"] = "http://example.org/bad value"

    with pytest.raises(exceptions.InvalidArgumentValue) as raised:
        wr.neptune.to_rdf_graph(neptune, frame)

    error_text = str(raised.value)
    assert "'o'" in error_text
    assert "row index 1" in error_text

0 commit comments

Comments
 (0)