Skip to content

Commit

Permalink
create athena table kwargs
Browse files Browse the repository at this point in the history
  • Loading branch information
liquidcarbon committed Dec 12, 2024
1 parent 562ff47 commit 3f611e6
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
**/__pycache__
**/__pycache__
uv.lock
29 changes: 28 additions & 1 deletion affinity.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
__doc__ = """
Module for creating well-documented datasets, with types and annotations.
Version 0.8.1
"""

from dataclasses import dataclass, field
Expand Down Expand Up @@ -235,6 +234,34 @@ def from_sql(cls, query: str, **kwargs):
instance.origin["source"] += f"\nquery:\n{query}"
return instance

@property
def athena_types(self):
    """Return Athena column types and partition types for this dataset.

    The dataframe in ``self.df`` and the partition columns in
    ``self.LOCATION.partition_by`` are handed to awswrangler's
    ``catalog.extract_athena_types``; its two results are returned as a
    ``(columns_types, partition_types)`` tuple.
    """
    # awswrangler is resolved through the file's try_import helper.
    extract_types = try_import("awswrangler").catalog.extract_athena_types
    regular_columns, partition_columns = extract_types(
        df=self.df, partition_cols=self.LOCATION.partition_by
    )
    return regular_columns, partition_columns

def kwargs_for_create_athena_table(
self, db: str, table: str, compression: str | None = None, **kwargs
):
"""Arguments for creating AWS Athena tables."""
columns_types, partitions_types = self.athena_types
return dict(
database=db,
table=table,
path=self.LOCATION.folder,
columns_types=columns_types,
partitions_types=partitions_types,
compression=compression,
description=self.__doc__,
columns_comments=self.data_dict,
**kwargs,
)


class Dataset(DatasetBase):
"""Base class for typed, annotated datasets."""
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "affinity"
version = "0.8.7"
version = "0.9.0"
description = "Module for creating well-documented datasets, with types and annotations."
authors = [
{ name = "Alex Kislukhin" }
Expand All @@ -13,6 +13,7 @@ readme = "README.md"
requires-python = ">=3.11"

dependencies = [
"awswrangler>=3.10.1",
"duckdb>=1",
"pandas",
]
Expand Down
25 changes: 24 additions & 1 deletion test_affinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class aDataset(af.Dataset):
# Checks that instantiating a dataset whose vector is declared with an
# annotation (`v: ...`) rather than an assignment (`v = ...`) raises ValueError.
def test_wrong_dataset_declaration():
class aDataset(af.Dataset):
v: af.Vector(np.int8) # type: ignore
# v = af.Vector(np.int8) # the correct way
# v = af.Vector(np.int8) # this is the correct way

with pytest.raises(ValueError):
aDataset()
Expand Down Expand Up @@ -329,6 +329,29 @@ class cDataset(af.Dataset):
cDataset().sql("SELECT v2 FROM df") # "df" != last test's data_a.df


def test_kwargs_for_create_athena_table():
    """Check the kwargs built for an Athena table from a partitioned dataset."""

    class aDataset(af.Dataset):
        """Document me!"""

        v1 = af.VectorI8("abc")
        v2 = af.VectorString("xyz")
        LOCATION = af.Location(folder=".", partition_by=["v1"])

    # v1 is the partition column, so it is expected under partitions_types
    # while v2 stays under columns_types.
    expected = dict(
        database="bd",
        table="desk",
        path=".",
        columns_types={"v2": "string"},
        partitions_types={"v1": "tinyint"},
        compression=None,
        description="Document me!",
        columns_comments={"v1": "abc", "v2": "xyz"},
    )
    dataset = aDataset()
    actual = dataset.kwargs_for_create_athena_table(db="bd", table="desk")
    assert actual == expected


@pytest.mark.skipif(NO_PYARROW, reason="pyarrow is not installed")
def test_objects_as_metadata():
class aDataset(af.Dataset):
Expand Down

0 comments on commit 3f611e6

Please sign in to comment.