@@ -46,6 +46,21 @@ def _create_dataset(
4646
4747# Pytest parameterization for all dataset creation formats
4848DATASET_FORMATS = ["pandas" , "arrow" ]
# Map-typed columns are exercised only via the Arrow path; presumably the
# pandas ingestion route has no native representation for pa.map_ columns —
# TODO(review): confirm and extend if pandas gains map support.
MAP_DATASET_FORMATS = ["arrow"]
50+
51+
def _create_map_dataset(dataset_format: str):
    """Create a dataset backed by an Arrow MapArray column.

    The dataset has a single ``attrs`` column of type
    ``map<string, string>`` with two rows.
    """
    rows = [
        {"attrs": {"color": "red", "size": "M"}},
        {"attrs": {"brand": "Ray"}},
    ]
    # Build the map-typed column explicitly so Arrow infers nothing.
    attrs_column = pa.array(
        [entry["attrs"] for entry in rows],
        type=pa.map_(pa.string(), pa.string()),
    )
    return _create_dataset(rows, dataset_format, pa.table({"attrs": attrs_column}))
4964
5065
5166# ──────────────────────────────────────
@@ -530,6 +545,112 @@ def test_struct_nested_bracket(self, dataset_format):
530545 assert rows_same (result , expected )
531546
532547
548+ # ──────────────────────────────────────
549+ # Map Namespace Tests
550+ # ──────────────────────────────────────
551+
552+
@pytest.mark.parametrize("dataset_format", MAP_DATASET_FORMATS)
class TestMapNamespace:
    """Tests for map namespace operations."""

    def test_map_keys(self, dataset_format):
        """map.keys() yields each row's keys as a list, in map order."""
        ds = _create_map_dataset(dataset_format)

        frame = (
            ds.with_column("keys", col("attrs").map.keys())
            .to_pandas()
            .drop(columns=["attrs"])
        )

        assert rows_same(frame, pd.DataFrame({"keys": [["color", "size"], ["brand"]]}))

    def test_map_values(self, dataset_format):
        """map.values() yields each row's values as a list, in map order."""
        ds = _create_map_dataset(dataset_format)

        frame = (
            ds.with_column("values", col("attrs").map.values())
            .to_pandas()
            .drop(columns=["attrs"])
        )

        assert rows_same(frame, pd.DataFrame({"values": [["red", "M"], ["Ray"]]}))

    def test_physical_map_extraction(self, dataset_format):
        """Test extraction works on List<Struct> (Physical Maps)."""
        # Build a List<Struct<k, v>> column — the "physical" map layout.
        kv_struct = pa.struct([pa.field("k", pa.string()), pa.field("v", pa.int64())])
        rows_py = [[{"k": "a", "v": 1}], [{"k": "b", "v": 2}]]
        arrow_table = pa.Table.from_arrays(
            [pa.array(rows_py, type=pa.list_(kv_struct))], names=["data"]
        )
        ds = _create_dataset(
            [{"data": entry} for entry in rows_py], dataset_format, arrow_table
        )

        frame = (
            ds.with_column("keys", col("data").map.keys())
            .with_column("values", col("data").map.values())
            .to_pandas()
        )

        expected = pd.DataFrame(
            {
                "data": rows_py,
                "keys": [["a"], ["b"]],
                "values": [[1], [2]],
            }
        )
        assert rows_same(frame, expected)

    def test_map_sliced_offsets(self, dataset_format):
        """Test extraction works correctly on sliced Arrow arrays (offset > 0)."""
        source_rows = [{"m": {"id": i}} for i in range(10)]
        column = pa.array(
            [entry["m"] for entry in source_rows],
            type=pa.map_(pa.string(), pa.int64()),
        )
        table = pa.Table.from_arrays([column], names=["m"])

        # Slicing before ingestion forces a nonzero Arrow buffer offset.
        ds = ray.data.from_arrow(table.slice(offset=7, length=3))

        frame = (
            ds.with_column("vals", col("m").map.values())
            .to_pandas()
            .drop(columns=["m"])
        )

        assert rows_same(frame, pd.DataFrame({"vals": [[7], [8], [9]]}))

    def test_map_nulls_and_empty(self, dataset_format):
        """Test handling of null maps and empty maps."""
        source_rows = [{"m": {"a": 1}}, {"m": {}}, {"m": None}]
        column = pa.array(
            [entry["m"] for entry in source_rows],
            type=pa.map_(pa.string(), pa.int64()),
        )
        ds = _create_dataset(
            source_rows, dataset_format, pa.Table.from_arrays([column], names=["m"])
        )

        # Use take_all() to avoid pandas casting errors with mixed None/list types
        out = (
            ds.with_column("keys", col("m").map.keys())
            .with_column("values", col("m").map.values())
            .take_all()
        )

        # Populated map → its keys/values; empty map → empty lists.
        assert list(out[0]["keys"]) == ["a"]
        assert list(out[0]["values"]) == [1]
        assert len(out[1]["keys"]) == 0
        assert len(out[1]["values"]) == 0
        # Null map propagates as null, not as an empty list.
        assert out[2]["keys"] is None
        assert out[2]["values"] is None

    def test_map_chaining(self, dataset_format):
        """map.keys() returns a list, so .list.len() should apply on top of it."""
        ds = _create_map_dataset(dataset_format)

        frame = (
            ds.with_column("num_keys", col("attrs").map.keys().list.len())
            .to_pandas()
            .drop(columns=["attrs"])
        )

        assert rows_same(frame, pd.DataFrame({"num_keys": [2, 1]}))
653+
533654# ──────────────────────────────────────
534655# Datetime Namespace Tests
535656# ──────────────────────────────────────
0 commit comments