create athena table kwargs

liquidcarbon · liquidcarbon · commit 3f611e6ce156 · 2024-12-12T16:57:47.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-**/__pycache__
+**/__pycache__
+uv.lock
diff --git a/affinity.py b/affinity.py
@@ -1,6 +1,5 @@
 __doc__ = """
 Module for creating well-documented datasets, with types and annotations.
-Version 0.8.1
 """
 
 from dataclasses import dataclass, field
@@ -235,6 +234,34 @@ def from_sql(cls, query: str, **kwargs):
         instance.origin["source"] += f"\nquery:\n{query}"
         return instance
 
+    @property
+    def athena_types(self):
+        """Convert pandas types to SQL types for loading into AWS Athena.
+
+        Returns:
+            A ``(columns_types, partition_types)`` tuple, passed through
+            unchanged from ``awswrangler.catalog.extract_athena_types`` called
+            on ``self.df`` with ``self.LOCATION.partition_by`` as the
+            partition columns.
+        """
+
+        # try_import is defined outside this hunk; it is used here to obtain
+        # the awswrangler module object.
+        # NOTE(review): presumably it fails when the package is missing - confirm.
+        wr = try_import("awswrangler")
+        columns_types, partition_types = wr.catalog.extract_athena_types(
+            df=self.df,
+            partition_cols=self.LOCATION.partition_by,
+        )
+        return columns_types, partition_types
+
+    def kwargs_for_create_athena_table(
+        self, db: str, table: str, compression: str | None = None, **kwargs
+    ):
+        """Arguments for creating AWS Athena tables.
+
+        Args:
+            db: Value returned under the ``database`` key.
+            table: Value returned under the ``table`` key.
+            compression: Value returned under the ``compression`` key
+                (``None`` by default).
+            **kwargs: Extra entries added to the returned dict as-is.
+
+        Returns:
+            A dict with the keys ``database``, ``table``, ``path``,
+            ``columns_types``, ``partitions_types``, ``compression``,
+            ``description`` and ``columns_comments``, plus any ``**kwargs``.
+
+        Raises:
+            TypeError: If ``**kwargs`` repeats one of the keys listed above
+                (``dict()`` rejects duplicate keyword arguments).
+        """
+        # NOTE(review): the key names look like they target awswrangler's
+        # catalog table-creation functions - confirm against the caller.
+        columns_types, partitions_types = self.athena_types
+        return dict(
+            database=db,
+            table=table,
+            path=self.LOCATION.folder,
+            columns_types=columns_types,
+            partitions_types=partitions_types,
+            compression=compression,
+            description=self.__doc__,  # docstring of the dataset class, not of this method
+            columns_comments=self.data_dict,
+            **kwargs,
+        )
+
 
 class Dataset(DatasetBase):
     """Base class for typed, annotated datasets."""
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "affinity"
-version = "0.8.7"
+version = "0.9.0"
 description = "Module for creating well-documented datasets, with types and annotations."
 authors = [
     { name = "Alex Kislukhin" }
@@ -13,6 +13,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 
 dependencies = [
+    "awswrangler>=3.10.1",
     "duckdb>=1",
     "pandas",
 ]
diff --git a/test_affinity.py b/test_affinity.py
@@ -101,7 +101,7 @@ class aDataset(af.Dataset):
 def test_wrong_dataset_declaration():
     class aDataset(af.Dataset):
         v: af.Vector(np.int8)  # type: ignore
-        # v = af.Vector(np.int8)  # the correct way
+        # v = af.Vector(np.int8)  # this is the correct way
 
     with pytest.raises(ValueError):
         aDataset()
@@ -329,6 +329,29 @@ class cDataset(af.Dataset):
         cDataset().sql("SELECT v2 FROM df")  # "df" != last test's data_a.df
 
 
+def test_kwargs_for_create_athena_table():
+    # Pins the exact dict returned by kwargs_for_create_athena_table for a
+    # two-column dataset that is partitioned by v1.
+    class aDataset(af.Dataset):
+        """Document me!"""
+
+        v1 = af.VectorI8("abc")
+        v2 = af.VectorString("xyz")
+        LOCATION = af.Location(folder=".", partition_by=["v1"])
+
+    # compression and extra **kwargs are left at their defaults here
+    create_athena_table_kwargs = aDataset().kwargs_for_create_athena_table(
+        db="bd", table="desk"
+    )
+    assert create_athena_table_kwargs == {
+        "database": "bd",
+        "table": "desk",
+        "path": ".",  # LOCATION.folder
+        # the partition column v1 is reported separately from regular columns
+        "columns_types": {"v2": "string"},
+        "partitions_types": {"v1": "tinyint"},
+        "compression": None,
+        "description": "Document me!",  # the class docstring above
+        "columns_comments": {"v1": "abc", "v2": "xyz"},  # strings passed to the vectors
+    }
+
+
 @pytest.mark.skipif(NO_PYARROW, reason="pyarrow is not installed")
 def test_objects_as_metadata():
     class aDataset(af.Dataset):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1,2 @@` |
| `1` |  | `-**/__pycache__` |
|  | `1` | `+**/__pycache__` |
|  | `2` | `+uv.lock` |