-
Notifications
You must be signed in to change notification settings - Fork 103
add feature to create tables from pyarrow objects #597
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
3a54ea9
4f0d9ce
1c4cf8b
8f8da0b
7773276
4da3ab0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,11 @@ | |
|
|
||
| from clickhouse_connect.datatypes.base import ClickHouseType | ||
|
|
||
| try: | ||
| import pyarrow as pa | ||
| except Exception: | ||
| pa = None | ||
|
|
||
|
|
||
| class TableColumnDef(NamedTuple): | ||
| """ | ||
|
|
@@ -26,3 +31,108 @@ def create_table(table_name: str, columns: Sequence[TableColumnDef], engine: str | |
| for key, value in engine_params.items(): | ||
| stmt += f' {key} {value}' | ||
| return stmt | ||
|
|
||
|
|
||
| def _arrow_type_to_ch(arrow_type: "pa.DataType") -> str: | ||
| """ | ||
| Best-effort mapping from common PyArrow types to ClickHouse type names. | ||
|
|
||
| Covers core scalar types. For anything unknown, we raise so the | ||
| caller is aware that the automatic mapping is not implemented for that Arrow type. | ||
| """ | ||
| if pa is None: | ||
| raise ImportError( | ||
| "PyArrow is required, but it is not installed." | ||
| ) | ||
|
||
|
|
||
| pat = pa.types | ||
|
|
||
| # Signed ints | ||
| if pat.is_int8(arrow_type): | ||
| return 'Int8' | ||
| if pat.is_int16(arrow_type): | ||
| return 'Int16' | ||
| if pat.is_int32(arrow_type): | ||
| return 'Int32' | ||
| if pat.is_int64(arrow_type): | ||
| return 'Int64' | ||
|
|
||
| # Unsigned ints | ||
| if pat.is_uint8(arrow_type): | ||
| return 'UInt8' | ||
| if pat.is_uint16(arrow_type): | ||
| return 'UInt16' | ||
| if pat.is_uint32(arrow_type): | ||
| return 'UInt32' | ||
| if pat.is_uint64(arrow_type): | ||
| return 'UInt64' | ||
|
|
||
| # Floats | ||
| if pat.is_float16(arrow_type) or pat.is_float32(arrow_type): | ||
| return 'Float32' | ||
| if pat.is_float64(arrow_type): | ||
| return 'Float64' | ||
|
|
||
| # Boolean | ||
| if pat.is_boolean(arrow_type): | ||
| return 'Bool' | ||
|
|
||
| # Strings (this covers pa.string(), pa.large_string()) | ||
| if pat.is_string(arrow_type) or pat.is_large_string(arrow_type): | ||
| return 'String' | ||
|
|
||
| # for any currently unsupported type, we raise so it’s clear that | ||
| # this Arrow type isn’t supported by the helper yet. | ||
| raise TypeError(f'Unsupported Arrow type for automatic mapping: {arrow_type!r}') | ||
|
|
||
|
|
||
| class _DDLType: | ||
| """ | ||
| Minimal helper used to satisfy TableColumnDef.ch_type. | ||
|
|
||
| create_table() only needs ch_type.name when building the DDL string, | ||
| so we'll wrap the ClickHouse type name in this tiny object instead of | ||
| constructing full ClickHouseType instances here. | ||
| """ | ||
| def __init__(self, name: str): | ||
| self.name = name | ||
|
|
||
|
|
||
| def arrow_schema_to_column_defs(schema: "pa.Schema") -> list[TableColumnDef]: | ||
| """ | ||
| Convert a PyArrow Schema into a list of TableColumnDef objects. | ||
| """ | ||
joe-clickhouse marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if pa is None: | ||
| raise ImportError( | ||
| "PyArrow is required, but it is not installed." | ||
| ) | ||
|
|
||
| if not isinstance(schema, pa.Schema): | ||
| raise TypeError(f'Expected pyarrow.Schema, got {type(schema)!r}') | ||
|
|
||
| col_defs: list[TableColumnDef] = [] | ||
| for field in schema: | ||
| ch_type_name = _arrow_type_to_ch(field.type) | ||
| col_defs.append( | ||
| TableColumnDef( | ||
| name=field.name, | ||
| ch_type=_DDLType(ch_type_name), | ||
| ) | ||
| ) | ||
| return col_defs | ||
|
|
||
|
|
||
| def create_table_from_arrow_schema( | ||
| table_name: str, | ||
| schema: "pa.Schema", | ||
| engine: str, | ||
| engine_params: dict, | ||
| ) -> str: | ||
| """ | ||
| Helper function to build a CREATE TABLE statement from a PyArrow Schema. | ||
|
|
||
| Internally: | ||
| schema -> arrow_schema_to_column_defs -> create_table(...) | ||
| """ | ||
| col_defs = arrow_schema_to_column_defs(schema) | ||
| return create_table(table_name, col_defs, engine, engine_params) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| import pytest | ||
|
|
||
| pytest.importorskip("pyarrow") | ||
|
|
||
| import pyarrow as pa | ||
|
|
||
| from clickhouse_connect.driver.ddl import ( | ||
| arrow_schema_to_column_defs, | ||
| create_table, | ||
| create_table_from_arrow_schema, | ||
| ) | ||
|
|
||
|
|
||
| def test_arrow_schema_to_column_defs_basic_mappings(): | ||
| schema = pa.schema( | ||
| [ | ||
| ("i8", pa.int8()), | ||
| ("i16", pa.int16()), | ||
| ("i32", pa.int32()), | ||
| ("i64", pa.int64()), | ||
| ("u8", pa.uint8()), | ||
| ("u16", pa.uint16()), | ||
| ("u32", pa.uint32()), | ||
| ("u64", pa.uint64()), | ||
| ("f16", pa.float16()), | ||
| ("f32", pa.float32()), | ||
| ("f64", pa.float64()), | ||
| ("s", pa.string()), | ||
| ("ls", pa.large_string()), | ||
| ("b", pa.bool_()), | ||
| ] | ||
| ) | ||
|
|
||
| col_defs = arrow_schema_to_column_defs(schema) | ||
|
|
||
| assert [c.name for c in col_defs] == [ | ||
| "i8", | ||
| "i16", | ||
| "i32", | ||
| "i64", | ||
| "u8", | ||
| "u16", | ||
| "u32", | ||
| "u64", | ||
| "f16", | ||
| "f32", | ||
| "f64", | ||
| "s", | ||
| "ls", | ||
| "b", | ||
| ] | ||
|
|
||
| type_names = [c.ch_type.name for c in col_defs] | ||
|
|
||
| assert type_names == [ | ||
| "Int8", | ||
| "Int16", | ||
| "Int32", | ||
| "Int64", | ||
| "UInt8", | ||
| "UInt16", | ||
| "UInt32", | ||
| "UInt64", | ||
| "Float32", | ||
| "Float32", | ||
| "Float64", | ||
| "String", | ||
| "String", | ||
| "Bool", | ||
| ] | ||
|
|
||
|
|
||
| def test_arrow_schema_to_column_defs_unsupported_type_raises(): | ||
| schema = pa.schema( | ||
| [ | ||
| ("ts", pa.timestamp("ms")), | ||
| ] | ||
| ) | ||
|
|
||
| with pytest.raises(TypeError, match="Unsupported Arrow type"): | ||
| arrow_schema_to_column_defs(schema) | ||
|
|
||
|
|
||
| def test_arrow_schema_to_column_defs_invalid_input_type(): | ||
| with pytest.raises(TypeError, match="Expected pyarrow.Schema"): | ||
| arrow_schema_to_column_defs("not a schema") | ||
|
|
||
|
|
||
| def test_create_table_from_arrow_schema_builds_expected_ddl(): | ||
| schema = pa.schema( | ||
| [ | ||
| ("id", pa.int64()), | ||
| ("name", pa.string()), | ||
| ("score", pa.float32()), | ||
| ("flag", pa.bool_()), | ||
| ] | ||
| ) | ||
|
|
||
| ddl = create_table_from_arrow_schema( | ||
| table_name="arrow_basic_test", | ||
| schema=schema, | ||
| engine="MergeTree", | ||
| engine_params={"ORDER BY": "id"}, | ||
| ) | ||
|
|
||
| assert ( | ||
| ddl | ||
| == "CREATE TABLE arrow_basic_test " | ||
| "(id Int64, name String, score Float32, flag Bool) " | ||
| "ENGINE MergeTree ORDER BY id" | ||
| ) | ||
|
|
||
|
|
||
| def test_create_table_from_arrow_schema_matches_manual_create_table(): | ||
| schema = pa.schema( | ||
| [ | ||
| ("id", pa.int64()), | ||
| ("name", pa.string()), | ||
| ] | ||
| ) | ||
|
|
||
| col_defs = arrow_schema_to_column_defs(schema) | ||
|
|
||
| ddl_manual = create_table( | ||
| table_name="arrow_compare_test", | ||
| columns=col_defs, | ||
| engine="MergeTree", | ||
| engine_params={"ORDER BY": "id"}, | ||
| ) | ||
|
|
||
| ddl_wrapper = create_table_from_arrow_schema( | ||
| table_name="arrow_compare_test", | ||
| schema=schema, | ||
| engine="MergeTree", | ||
| engine_params={"ORDER BY": "id"}, | ||
| ) | ||
|
|
||
| assert ddl_manual == ddl_wrapper |
Uh oh!
There was an error while loading. Please reload this page.