11import logging
2+ from unittest .mock import MagicMock
23
34import pytest # type: ignore
45from gremlin_python .process .traversal import T
56from gremlin_python .structure .graph import Edge , Path , Property , Vertex , VertexProperty
67
78import awswrangler as wr
89import awswrangler .pandas as pd
10+ from awswrangler import exceptions
11+ from awswrangler .neptune ._client import NeptuneClient
912
1013logging .getLogger ("awswrangler" ).setLevel (logging .DEBUG )
1114
@@ -29,7 +32,7 @@ def test_parse_gremlin_vertex_elements(gremlin_parser):
2932 assert df .shape == (1 , 3 )
3033 assert row ["id" ] == "foo"
3134 assert row ["label" ] == "vertex"
32- assert row ["properties" ] is None
35+ assert not row ["properties" ] # gremlinpython <3.8 returns None, >=3.8 returns []
3336
3437 # parse multiple vertex elements
3538 v1 = Vertex ("bar" )
@@ -40,7 +43,7 @@ def test_parse_gremlin_vertex_elements(gremlin_parser):
4043 assert df .shape == (2 , 3 )
4144 assert row ["id" ] == "bar"
4245 assert row ["label" ] == "vertex"
43- assert row ["properties" ] is None
46+ assert not row ["properties" ] # gremlinpython <3.8 returns None, >=3.8 returns []
4447
4548
4649# parse Edge elements
@@ -56,7 +59,7 @@ def test_parse_gremlin_edge_elements(gremlin_parser):
5659 assert row ["outV" ] == "out1"
5760 assert row ["label" ] == "label"
5861 assert row ["inV" ] == "in1"
59- assert row ["properties" ] is None
62+ assert not row ["properties" ] # gremlinpython <3.8 returns None, >=3.8 returns []
6063
6164 # parse multiple edge elements
6265 v1 = Edge ("bar" , "out1" , "label" , "in2" )
@@ -69,7 +72,7 @@ def test_parse_gremlin_edge_elements(gremlin_parser):
6972 assert row ["outV" ] == "out1"
7073 assert row ["label" ] == "label"
7174 assert row ["inV" ] == "in2"
72- assert row ["properties" ] is None
75+ assert not row ["properties" ] # gremlinpython <3.8 returns None, >=3.8 returns []
7376
7477
7578# parse Property elements
@@ -86,7 +89,7 @@ def test_parse_gremlin_property_elements(gremlin_parser):
8689 assert row ["value" ] == "bar"
8790 assert row ["key" ] == "name"
8891 assert row ["vertex" ] == "v1"
89- assert row ["properties" ] is None
92+ assert not row ["properties" ] # gremlinpython <3.8 returns None, >=3.8 returns []
9093
9194 v = Property ("foo" , "name" , "bar" )
9295 input = [v ]
@@ -100,6 +103,11 @@ def test_parse_gremlin_property_elements(gremlin_parser):
100103
101104
102105# parse Path elements
106+ def _normalize_properties (d : dict ) -> dict :
107+ # gremlinpython <3.8 returns properties=None, >=3.8 returns properties=[].
108+ return {k : (None if k == "properties" and not v else v ) for k , v in d .items ()}
109+
110+
103111def test_parse_gremlin_path_elements (gremlin_parser ):
104112 # parse path with elements
105113 v = Vertex ("foo" )
@@ -110,9 +118,15 @@ def test_parse_gremlin_path_elements(gremlin_parser):
110118 df = pd .DataFrame .from_records (out )
111119 row = df .iloc [0 ]
112120 assert df .shape == (1 , 3 )
113- assert row [0 ] == {"id" : "foo" , "label" : "vertex" , "properties" : None }
114- assert row [1 ] == {"id" : "e1" , "label" : "label" , "outV" : "foo" , "inV" : "bar" , "properties" : None }
115- assert row [2 ] == {"id" : "bar" , "label" : "vertex" , "properties" : None }
121+ assert _normalize_properties (row [0 ]) == {"id" : "foo" , "label" : "vertex" , "properties" : None }
122+ assert _normalize_properties (row [1 ]) == {
123+ "id" : "e1" ,
124+ "label" : "label" ,
125+ "outV" : "foo" ,
126+ "inV" : "bar" ,
127+ "properties" : None ,
128+ }
129+ assert _normalize_properties (row [2 ]) == {"id" : "bar" , "label" : "vertex" , "properties" : None }
116130
117131 # parse path with multiple elements
118132 e2 = Edge ("bar" , "out1" , "label" , "in2" )
@@ -122,9 +136,15 @@ def test_parse_gremlin_path_elements(gremlin_parser):
122136 df = pd .DataFrame .from_records (out )
123137 row = df .iloc [1 ]
124138 assert df .shape == (2 , 3 )
125- assert row [0 ] == {"id" : "bar" , "label" : "vertex" , "properties" : None }
126- assert row [1 ] == {"id" : "bar" , "label" : "label" , "outV" : "out1" , "inV" : "in2" , "properties" : None }
127- assert row [2 ] == {"id" : "in2" , "label" : "vertex" , "properties" : None }
139+ assert _normalize_properties (row [0 ]) == {"id" : "bar" , "label" : "vertex" , "properties" : None }
140+ assert _normalize_properties (row [1 ]) == {
141+ "id" : "bar" ,
142+ "label" : "label" ,
143+ "outV" : "out1" ,
144+ "inV" : "in2" ,
145+ "properties" : None ,
146+ }
147+ assert _normalize_properties (row [2 ]) == {"id" : "in2" , "label" : "vertex" , "properties" : None }
128148
129149 # parse path with maps
130150 p = Path (
@@ -152,7 +172,13 @@ def test_parse_gremlin_path_elements(gremlin_parser):
152172 assert df .shape == (1 , 3 )
153173 assert row [0 ]["name" ] == "foo"
154174 assert row [0 ]["age" ] == 29
155- assert row [1 ] == {"id" : "bar" , "label" : "label" , "outV" : "out1" , "inV" : "in2" , "properties" : None }
175+ assert _normalize_properties (row [1 ]) == {
176+ "id" : "bar" ,
177+ "label" : "label" ,
178+ "outV" : "out1" ,
179+ "inV" : "in2" ,
180+ "properties" : None ,
181+ }
156182 assert row [2 ]["name" ] == "bar"
157183 assert row [2 ]["age" ] == 40
158184
@@ -216,3 +242,92 @@ def test_parse_gremlin_subgraph(gremlin_parser):
216242 assert df .shape == (1 , 2 )
217243 assert row ["@type" ] == "tinker:graph"
218244 assert row ["@value" ] == {"vertices" : ["v[45]" , "v[9]" ], "edges" : ["e[3990][9-route->45]" ]}
245+
246+
247+ # to_rdf_graph IRIREF validation: caller-supplied DataFrame cells must conform to
248+ # the SPARQL IRIREF grammar so they cannot close the <...> token and inject
249+ # arbitrary SPARQL UPDATE syntax (DELETE / DROP / LOAD / ...).
250+
251+
252+ def _rdf_triples_df () -> pd .DataFrame :
253+ return pd .DataFrame (
254+ {
255+ "s" : ["http://example.org/alice" , "http://example.org/bob" ],
256+ "p" : ["http://xmlns.com/foaf/0.1/name" , "http://xmlns.com/foaf/0.1/name" ],
257+ "o" : ["http://example.org/AliceName" , "http://example.org/BobName" ],
258+ }
259+ )
260+
261+
def _mock_neptune_client() -> MagicMock:
    """Return a ``NeptuneClient``-spec'd mock whose ``write_sparql`` returns ``True``.

    ``spec=NeptuneClient`` makes access to attributes the real client does
    not define raise ``AttributeError``, so the tests cannot silently call a
    method that does not exist on the real class.
    """
    client = MagicMock(spec=NeptuneClient)
    client.write_sparql.return_value = True
    return client
266+
267+
def test_to_rdf_graph_accepts_well_formed_iris():
    """Well-formed IRIs pass validation and reach ``write_sparql`` wrapped in ``<...>``."""
    client = _mock_neptune_client()
    df = _rdf_triples_df()

    assert wr.neptune.to_rdf_graph(client, df) is True
    client.write_sparql.assert_called_once()
    # The generated SPARQL text is the first positional argument of the call.
    query = client.write_sparql.call_args.args[0]
    assert "<http://example.org/alice>" in query
    assert "<http://example.org/bob>" in query
277+
278+
@pytest.mark.parametrize(
    "malicious_cell, column",
    [
        # Bug-bounty PoC payload: closes IRI, runs DELETE WHERE, reopens INSERT.
        (
            "> . }; DELETE WHERE { ?s ?p ?o }; "
            "INSERT DATA { <http://evil.com/x> <http://evil.com/y> <http://evil.com/z",
            "o",
        ),
        # DROP ALL via the subject slot.
        ("http://x.com/s> <http://x.com/p> <http://x.com/o> . }; DROP ALL ; INSERT DATA { <a", "s"),
        # LOAD via the predicate slot.
        (
            "http://x.com/p> <http://x.com/o> . }; LOAD <http://evil.com/payload.ttl> ; "
            "INSERT DATA { <http://x.com/s> <http://x.com/p2",
            "p",
        ),
        # Whitespace alone is enough to break the IRIREF token.
        ("http://example.org/ has space", "o"),
        ("http://example.org/a\n <http://x>", "o"),
        # Angle brackets are not permitted inside an IRIREF either.
        ("http://example.org/<inner>", "s"),
    ],
)
def test_to_rdf_graph_rejects_malicious_cells(malicious_cell, column):
    """A cell that is not a valid IRI is rejected before anything is written.

    Each case overwrites one cell of an otherwise valid frame and expects
    ``InvalidArgumentValue`` with a "not a valid IRI" message.
    """
    client = _mock_neptune_client()
    df = _rdf_triples_df()
    df.loc[0, column] = malicious_cell

    with pytest.raises(exceptions.InvalidArgumentValue, match="not a valid IRI"):
        wr.neptune.to_rdf_graph(client, df)
    # Validation must run before any network call.
    client.write_sparql.assert_not_called()
311+
312+
def test_to_rdf_graph_rejects_malicious_graph_column_for_quads():
    """A malformed value in the ``g`` column is rejected and the error names ``'g'``."""
    client = _mock_neptune_client()
    df = _rdf_triples_df()
    # Add a ``g`` column to the triples frame, then corrupt its first cell.
    df["g"] = ["http://example.org/g1", "http://example.org/g2"]
    df.loc[0, "g"] = "http://x> {} }; DROP ALL ; INSERT DATA { GRAPH <http://x> { <a"

    with pytest.raises(exceptions.InvalidArgumentValue, match="'g'"):
        wr.neptune.to_rdf_graph(client, df)
    # The rejected frame must never reach the client.
    client.write_sparql.assert_not_called()
322+
323+
def test_to_rdf_graph_error_identifies_row_and_column():
    """The validation error message names the offending column and row index."""
    client = _mock_neptune_client()
    df = _rdf_triples_df()
    # Corrupt the second row (index 1) so the reported index is distinguishable from 0.
    df.loc[1, "o"] = "http://example.org/bad value"

    with pytest.raises(exceptions.InvalidArgumentValue) as exc_info:
        wr.neptune.to_rdf_graph(client, df)
    message = str(exc_info.value)
    assert "'o'" in message
    assert "row index 1" in message
0 commit comments