feat(pyspark)!: forward kwargs in create_table to pyspark methods #11120

Open · wants to merge 5 commits into main
25 changes: 13 additions & 12 deletions ibis/backends/pyspark/__init__.py
@@ -604,8 +604,7 @@ def create_table(
         database: str | None = None,
         temp: bool | None = None,
         overwrite: bool = False,
-        format: str = "parquet",
-        partition_by: str | list[str] | None = None,
+        **kwargs: Any,
     ) -> ir.Table:
         """Create a new table in Spark.

@@ -626,11 +625,10 @@ def create_table(
         temp
             Whether the new table is temporary (unsupported)
         overwrite
-            If `True`, overwrite existing data
-        format
-            Format of the table on disk
-        partition_by
-            Name(s) of partitioning column(s)
+            If `True`, overwrite existing data. If `mode` is passed as a kwarg, it will take precedence over this argument.
+        **kwargs
+            Additional keyword arguments passed to [pyspark.sql.DataFrameWriter.saveAsTable](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.saveAsTable.html#pyspark.sql.DataFrameWriter.saveAsTable)
+            if `obj` is passed or [pyspark.sql.Catalog.createTable](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Catalog.createTable.html#pyspark.sql.Catalog.createTable) if `schema` is passed.

         Returns
         -------
@@ -655,18 +653,21 @@ def create_table(
             else:
                 table = ibis.memtable(obj)
             query = self.compile(table)
-            mode = "overwrite" if overwrite else "error"
+            if "mode" in kwargs:
+                mode = kwargs["mode"]
+                del kwargs["mode"]
+            else:
+                mode = "overwrite" if overwrite else "error"

             with self._active_catalog_database(catalog, db):
                 self._run_pre_execute_hooks(table)
                 df = self._session.sql(query)
-                df.write.saveAsTable(
-                    name, format=format, mode=mode, partitionBy=partition_by
-                )
+                df.write.saveAsTable(name, mode=mode, **kwargs)
         elif schema is not None:
             schema = ibis.schema(schema)
             schema = PySparkSchema.from_ibis(schema)
             with self._active_catalog_database(catalog, db):
-                self._session.catalog.createTable(name, schema=schema, format=format)
+                self._session.catalog.createTable(name, schema=schema, **kwargs)
         else:
             raise com.IbisError("The schema or obj parameter is required")
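For orientation, here is a usage sketch of the changed API. This is not code from the PR: the connection setup, table name, and column values are illustrative, and it assumes an existing SparkSession bound to `spark`.

import ibis

# hypothetical setup: any PySpark backend connection works here
con = ibis.pyspark.connect(session=spark)

t = ibis.memtable({"epoch": [1712848119], "category1": ["A"]})

# format and partitionBy are no longer dedicated parameters; they travel
# through **kwargs to pyspark.sql.DataFrameWriter.saveAsTable, and a mode
# kwarg takes precedence over the overwrite flag.
con.create_table(
    "events",
    obj=t,
    format="parquet",
    partitionBy="category1",
    mode="append",
)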
219 changes: 162 additions & 57 deletions ibis/backends/pyspark/tests/test_client.py
@@ -1,6 +1,18 @@
 from __future__ import annotations

+import pandas as pd
 import pytest
+from pandas.testing import assert_frame_equal

+from ibis.conftest import IS_SPARK_REMOTE
+
+try:
+    if IS_SPARK_REMOTE:
+        from pyspark.errors.exceptions.connect import AnalysisException
+    else:
+        from pyspark.errors.exceptions.captured import AnalysisException
+except ImportError:
+    from pyspark.sql.utils import AnalysisException
+
 import ibis

@@ -60,8 +72,20 @@
     assert con.current_database != "default"


-@pytest.mark.xfail_version(pyspark=["pyspark<3.4"], reason="no catalog support")
-def test_create_table_with_partition_and_catalog(con):
+@pytest.mark.parametrize(
+    "database_param",
+    [
+        pytest.param(
+            ("spark_catalog", "ibis_testing"),
+            marks=pytest.mark.xfail_version(
+                pyspark=["pyspark<3.4"], reason="no catalog support"
+            ),
+            id="with_catalog",
+        ),
+        pytest.param(None, id="no_catalog"),
+    ],
+)
+def test_create_table_with_partitions(con, database_param):
     # Create a sample table with a partition column
     data = {
         "epoch": [1712848119, 1712848121, 1712848155, 1712848169],
@@ -71,49 +95,54 @@

     t = ibis.memtable(data)

+    db_ref = database_param
+    db_str = "spark_catalog.ibis_testing" if db_ref else None
+
     # 1D partition
     table_name = "pt1"

     con.create_table(
         table_name,
-        database=("spark_catalog", "default"),
+        database=db_ref,
         obj=t,
         overwrite=True,
-        partition_by="category1",
+        partitionBy="category1",
     )
-    assert table_name in con.list_tables(database="spark_catalog.default")
+    assert table_name in con.list_tables(database=db_str)

-    partitions = (
-        con.raw_sql(f"SHOW PARTITIONS spark_catalog.default.{table_name}")
-        .toPandas()
-        .to_dict()
+    loc = (
+        f"spark_catalog.ibis_testing.{table_name}"
+        if db_ref
+        else f"ibis_testing.{table_name}"
     )
+    partitions = con.raw_sql(f"SHOW PARTITIONS {loc}").toPandas().to_dict()
     expected_partitions = {
         "partition": {0: "category1=A", 1: "category1=B", 2: "category1=C"}
     }
     assert partitions == expected_partitions

     # Cleanup
-    con.drop_table(table_name, database="spark_catalog.default")
-    assert table_name not in con.list_tables(database="spark_catalog.default")
+    con.drop_table(table_name, database=db_str)
+    assert table_name not in con.list_tables(database=db_str)

     # 2D partition
     table_name = "pt2"

     con.create_table(
         table_name,
-        database=("spark_catalog", "default"),
+        database=db_ref,
         obj=t,
         overwrite=True,
-        partition_by=["category1", "category2"],
+        partitionBy=["category1", "category2"],
     )
-    assert table_name in con.list_tables(database="spark_catalog.default")
+    assert table_name in con.list_tables(database=db_str)

-    partitions = (
-        con.raw_sql(f"SHOW PARTITIONS spark_catalog.default.{table_name}")
-        .toPandas()
-        .to_dict()
+    loc = (
+        f"spark_catalog.ibis_testing.{table_name}"
+        if db_ref
+        else f"ibis_testing.{table_name}"
     )
+    partitions = con.raw_sql(f"SHOW PARTITIONS {loc}").toPandas().to_dict()
     expected_partitions = {
         "partition": {
             0: "category1=A/category2=G",
@@ -124,65 +153,141 @@
     assert partitions == expected_partitions

     # Cleanup
-    con.drop_table(table_name, database="spark_catalog.default")
-    assert table_name not in con.list_tables(database="spark_catalog.default")
+    con.drop_table(table_name, database=db_str)
+    assert table_name not in con.list_tables(database=db_str)


-def test_create_table_with_partition_no_catalog(con):
-    data = {
+@pytest.mark.parametrize(
+    "format",
+    [
+        pytest.param(
+            "delta",
+            marks=pytest.mark.xfail_version(
+                pyspark=["pyspark<3.4"], reason="no delta support"
+            ),
+            id="delta",
+        ),
+        "parquet",
+        "csv",
+    ],
+)
+@pytest.mark.parametrize(
+    "database_param",
+    [
+        pytest.param(
+            ("spark_catalog", "ibis_testing"),
+            marks=pytest.mark.xfail_version(
+                pyspark=["pyspark<3.4"], reason="no catalog support"
+            ),
+            id="with_catalog",
+        ),
+        pytest.param(None, id="no_catalog"),
+    ],
+)
+def test_create_table_kwargs(con, format, database_param):
+    def compare_tables(t_out, t_in):
+        cols = list(t_out.columns)
+        expected = t_out[cols].sort_values(cols).reset_index(drop=True)
+        result = t_in[cols].sort_values(cols).reset_index(drop=True)
+        assert_frame_equal(expected, result)
+
+    base_data = {
         "epoch": [1712848119, 1712848121, 1712848155, 1712848169],
         "category1": ["A", "B", "A", "C"],
         "category2": ["G", "J", "G", "H"],
     }
+    base_data_pd = pd.DataFrame(base_data)

-    t = ibis.memtable(data)
+    table_name = f"kwarg_test_{format}"
+    db_ref = database_param
+    db_str = "spark_catalog.ibis_testing" if db_ref else None

-    # 1D partition
-    table_name = "pt1"
+    # Helper to get table
+    def get_table():
+        loc = f"{db_str}.{table_name}" if db_ref else table_name
+        return con.raw_sql(f"SELECT * from {loc}").toPandas()

+    # 1. Create db table
+    t = ibis.memtable(base_data)
     con.create_table(
         table_name,
+        database=db_ref,
         obj=t,
         overwrite=True,
-        partition_by="category1",
+        partitionBy="category1",
+        format=format,
     )
-    assert table_name in con.list_tables()
+    assert table_name in con.list_tables(database=db_str)
+    compare_tables(base_data_pd, get_table())

-    partitions = (
-        con.raw_sql(f"SHOW PARTITIONS ibis_testing.{table_name}").toPandas().to_dict()
+    # 2. Append, same schema (mode & format kwargs) - this is similar behavior to the `insert` method
+    con.create_table(
+        table_name,
+        database=db_ref,
+        obj=t,
+        overwrite=True,  # Will get overwritten by mode
+        mode="append",
+        partitionBy="category1",
+        format=format,
     )
-    expected_partitions = {
-        "partition": {0: "category1=A", 1: "category1=B", 2: "category1=C"}
-    }
-    assert partitions == expected_partitions

-    # Cleanup
-    con.drop_table(table_name)
-    assert table_name not in con.list_tables()
+    assert table_name in con.list_tables(database=db_str)
+    expected_2x = pd.concat([base_data_pd] * 2, ignore_index=True)
+    compare_tables(expected_2x, get_table())

-    # 2D partition
-    table_name = "pt2"
+    # 3. Overwrite table & schema (mode, overwriteSchema, & format kwargs)
+    data2 = {
+        **base_data,
+        "category2": ["G", "J", "G", "H"],
+    }
+    data2_pd = pd.DataFrame(data2)
+    t2 = ibis.memtable(data2)

     con.create_table(
         table_name,
-        obj=t,
-        overwrite=True,
-        partition_by=["category1", "category2"],
+        database=db_ref,
+        obj=t2,
+        mode="overwrite",
+        overwriteSchema=True,
+        format=format,
     )
-    assert table_name in con.list_tables()

-    partitions = (
-        con.raw_sql(f"SHOW PARTITIONS ibis_testing.{table_name}").toPandas().to_dict()
-    )
-    expected_partitions = {
-        "partition": {
-            0: "category1=A/category2=G",
-            1: "category1=B/category2=J",
-            2: "category1=C/category2=H",
-        }
-    }
-    assert partitions == expected_partitions
+    assert table_name in con.list_tables(database=db_str)
+    compare_tables(data2_pd, get_table())

+    # 4. Append and merge schema (mode, mergeSchema, & format kwargs)
+    data_merge = {**data2, "category3": ["W", "Z", "Q", "X"]}
+    data_merge_pd = pd.DataFrame(data_merge)
+    t_merge = ibis.memtable(data_merge)
+
+    if format == "delta":
+        con.create_table(
+            table_name,
+            database=db_ref,
+            obj=t_merge,
+            mode="append",
+            mergeSchema=True,
+            format=format,
+        )
+
+        assert table_name in con.list_tables(database=db_str)
+        expected_merged = pd.concat(
+            [data2_pd, data_merge_pd], ignore_index=True
+        ).fillna(value=pd.NA)
+
+        compare_tables(expected_merged, get_table().fillna(value=pd.NA))
+    elif AnalysisException is None:
+        pass
+    else:
+        with pytest.raises(AnalysisException) as _:
+            con.create_table(
+                table_name,
+                database=db_ref,
+                obj=t_merge,
+                mode="append",
+                mergeSchema=True,
+                format=format,
+            )

     # Cleanup
-    con.drop_table(table_name)
+    con.drop_table(table_name, database=db_str)
     assert table_name not in con.list_tables()
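The precedence rule exercised in step 2 above can be summarized in a single call. A minimal sketch, not part of the PR, reusing the hypothetical con and t from the earlier example:

# mode wins over overwrite: this call appends rows even though
# overwrite=True, matching the behavior asserted in test_create_table_kwargs.
con.create_table("events", obj=t, overwrite=True, mode="append", format="parquet")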