@@ -493,21 +493,93 @@ def test_empty_data_raises(self):
493493 tu .upsert_by_arrow_with_key (data , upsert_keys = ['id' ])
494494 self .assertIn ('empty' , str (ctx .exception ))
495495
def test_duplicate_keys_in_input_keeps_last(self):
    """Duplicate keys in input data should keep the last occurrence.

    Writes rows for id=1 and id=2 through the fixture's ``_write`` helper,
    then upserts a batch keyed on ``id`` in which id=1 appears twice.
    The later of the two input rows is expected to win, id=2 is expected
    to be unchanged, and the table is expected to still hold two rows.
    """
    table = self._create_table()
    self._write(table, pa.Table.from_pydict({
        'id': [1, 2],
        'name': ['Alice', 'Bob'],
        'age': [25, 30],
        'city': ['NYC', 'LA'],
    }, schema=self.pa_schema))

    # id=1 appears twice; the second row (name='A_last') should win
    data = pa.Table.from_pydict({
        'id': [1, 1],
        'name': ['A_first', 'A_last'],
        'age': [90, 91],
        'city': ['X', 'Y'],
    }, schema=self.pa_schema)
    self._upsert(table, data, upsert_keys=['id'])

    result = self._read_all(table)
    # Check the row count before folding into a dict: the dict keeps only
    # one entry per id, so a leftover duplicate row would otherwise be
    # silently collapsed and the assertions below could still pass.
    self.assertEqual(result.num_rows, 2)
    rows = {row_id: (name, age, city) for row_id, name, age, city in zip(
        result['id'].to_pylist(),
        result['name'].to_pylist(),
        result['age'].to_pylist(),
        result['city'].to_pylist(),
    )}
    # id=1 updated with last duplicate row
    self.assertEqual(rows[1], ('A_last', 91, 'Y'))
    # id=2 unchanged
    self.assertEqual(rows[2], ('Bob', 30, 'LA'))
526+
def test_duplicate_keys_all_new_keeps_last(self):
    """Repeated input keys with no prior writes keep the last occurrence.

    Nothing is written to the freshly created table before the upsert.
    The batch carries id=1 three times and id=2 once; the read-back is
    expected to contain two rows, with id=1 taking the third input row.
    """
    table = self._create_table()

    # id=1 is listed three times; the final listing ('A3') should survive
    columns = {
        'id': [1, 1, 1, 2],
        'name': ['A1', 'A2', 'A3', 'B'],
        'age': [10, 20, 30, 40],
        'city': ['X1', 'X2', 'X3', 'Y'],
    }
    data = pa.Table.from_pydict(columns, schema=self.pa_schema)
    self._upsert(table, data, upsert_keys=['id'])

    result = self._read_all(table)
    self.assertEqual(result.num_rows, 2)

    # Index the read-back rows by id so each key can be checked directly.
    ids = result['id'].to_pylist()
    names = result['name'].to_pylist()
    ages = result['age'].to_pylist()
    cities = result['city'].to_pylist()
    by_id = dict(zip(ids, zip(names, ages, cities)))

    self.assertEqual(by_id[1], ('A3', 30, 'X3'))
    self.assertEqual(by_id[2], ('B', 40, 'Y'))
550+
def test_duplicate_keys_partitioned_keeps_last(self):
    """Repeated keys in a partitioned table keep the last row per partition.

    The table is created with ``region`` as its partition key and written
    with one row per region. The upsert batch then repeats id=1 within
    'US' and id=2 within 'EU'; each key is expected to end up with the
    later of its two input rows, for two rows in total.
    """
    table = self._create_table(
        pa_schema=self.partitioned_pa_schema,
        partition_keys=['region'],
    )
    initial_rows = pa.Table.from_pydict({
        'id': [1, 2],
        'name': ['Alice', 'Bob'],
        'age': [25, 30],
        'region': ['US', 'EU'],
    }, schema=self.partitioned_pa_schema)
    self._write(table, initial_rows)

    # Two rows for id=1 in the US partition, two for id=2 in the EU one
    columns = {
        'id': [1, 1, 2, 2],
        'name': ['A_first', 'A_last', 'B_first', 'B_last'],
        'age': [50, 51, 60, 61],
        'region': ['US', 'US', 'EU', 'EU'],
    }
    data = pa.Table.from_pydict(columns, schema=self.partitioned_pa_schema)
    self._upsert(table, data, upsert_keys=['id'])

    result = self._read_all(table)
    self.assertEqual(result.num_rows, 2)

    # Index the read-back rows by (id, region) for direct lookup.
    ids = result['id'].to_pylist()
    names = result['name'].to_pylist()
    ages = result['age'].to_pylist()
    regions = result['region'].to_pylist()
    by_key = dict(zip(zip(ids, regions), zip(names, ages)))

    self.assertEqual(by_key[(1, 'US')], ('A_last', 51))
    self.assertEqual(by_key[(2, 'EU')], ('B_last', 61))
511583
512584 def test_partitioned_table_missing_partition_col_in_data_raises (self ):
513585 """Input data missing partition column should raise ValueError."""
0 commit comments