Skip to content

Add partitions to table index like PyHive does #231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions impala/sqlalchemy.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,27 @@ def get_foreign_keys(self, connection, table_name, schema=None, **kw):
return []

def get_indexes(self, connection, table_name, schema=None, **kw):
    """Report a table's partition columns as a single pseudo-index.

    Impala has no real indexes, so (as PyHive does) the partition columns
    are exposed as one non-unique index named ``partition``.

    :param connection: object whose ``execute(query)`` returns a cursor
        supporting ``fetchall()``.
    :param table_name: name of the table to inspect.
    :param schema: optional database name used to qualify ``table_name``.
    :param kw: accepted for interface compatibility; unused here.
    :returns: ``[]`` when no partition columns are found, otherwise
        ``[{'name': 'partition', 'column_names': [...], 'unique': False}]``.
    """
    name = table_name
    if schema is not None:
        name = '%s.%s' % (schema, name)
    query = 'DESCRIBE FORMATTED %s' % name
    rows = connection.execute(query).fetchall()

    col_names = []
    in_partition_section = False
    for row in rows:
        # Only the first field identifies a row; values come back padded,
        # and separator rows have an empty or NULL first field.
        first = row[0].strip() if row[0] else None
        if not first:
            continue
        if first == '# Partition Information':
            in_partition_section = True
            continue
        if not in_partition_section:
            continue
        if first == '# col_name':
            # Column-header row repeated at the top of the section.
            continue
        if first.startswith('#'):
            # Review feedback on this change: the partition section is not
            # guaranteed to be the last one in the output, so stop at the
            # next '# ...' section header (e.g. '# Detailed Table
            # Information') instead of treating every remaining row as a
            # partition column.
            break
        col_names.append(first)

    if col_names:
        return [{'name': 'partition', 'column_names': col_names, 'unique': False}]
    return []

def do_rollback(self, dbapi_connection):
# no transactions in impala
Expand Down
36 changes: 36 additions & 0 deletions impala/tests/test_sqlalchemy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,42 @@

from impala.sqlalchemy import STRING, INT, DOUBLE, TINYINT

def table_metadata_from_ddl_template(con, ddl, table_name):
    """
    Create a table from a DDL template and return its reflected metadata.

    ``ddl`` is formatted with ``table=table_name`` and executed on ``con``;
    the table is then autoloaded through an engine that reuses ``con``.
    """
    cursor = con.cursor()
    statement = ddl.format(table=table_name)
    cursor.execute(statement)
    cursor.close()

    def _reuse_connection(x):
        # Hand the already-open connection back to the engine.
        return con

    engine = create_engine('impala://', creator=_reuse_connection)
    return Table(table_name, MetaData(), autoload=True, autoload_with=engine)

def test_no_partitions_no_indexes(con):
    """
    A table created without partitions must reflect with no indexes.
    """
    reflected = table_metadata_from_ddl_template(
        con, 'CREATE TABLE {table} (a STRING)', 'no_partitions')
    assert not reflected.indexes

def test_one_partitions_indexes(con):
    """
    A table with one partition column must reflect with exactly one index
    covering that single column.
    """
    reflected = table_metadata_from_ddl_template(
        con,
        'CREATE TABLE {table} (a STRING) PARTITIONED BY (b INT);',
        'one_partition',
    )
    assert len(reflected.indexes) == 1
    partition_index = next(iter(reflected.indexes))
    assert str(partition_index.columns) == "['one_partition.b']"

def test_two_partitions_indexes(con):
    """
    A table with two partition columns must reflect with exactly one index
    covering both columns.
    """
    reflected = table_metadata_from_ddl_template(
        con,
        'CREATE TABLE {table} (a STRING) PARTITIONED BY (b INT, c INT);',
        'two_partitions',
    )
    assert len(reflected.indexes) == 1
    partition_index = next(iter(reflected.indexes))
    expected_columns = "['two_partitions.b', 'two_partitions.c']"
    assert str(partition_index.columns) == expected_columns

def test_sqlalchemy_compilation():
engine = create_engine('impala://localhost')
Expand Down