Feature/lazyness #229

Open · wants to merge 7 commits into base: develop
2 changes: 1 addition & 1 deletion rows/__init__.py
@@ -22,7 +22,7 @@
import rows.plugins as plugins

from rows.operations import join, transform, transpose # NOQA
from rows.table import Table, FlexibleTable # NOQA
from rows.table import FlexibleTable, LazyTable, Table # NOQA
from rows.localization import locale_context # NOQA


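For context, a minimal sketch of how the lazy path added in this branch is meant to be used from the public API (the `data.csv` filename is a placeholder; `lazy=True` travels through `**kwargs` into `create_table`):

```python
import rows

# With lazy=True, create_table() builds a LazyTable instead of a Table:
# only the type-detection samples are buffered; the rest of the file is
# read while iterating.
table = rows.import_from_csv('data.csv', lazy=True)

for row in table:
    print(row)  # each row is deserialized on demand
```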
16 changes: 10 additions & 6 deletions rows/cli.py
@@ -379,27 +379,30 @@ def query(input_encoding, output_encoding, input_locale, output_locale,
if input_locale is not None:
with rows.locale_context(input_locale):
table = import_from_source(source, DEFAULT_INPUT_ENCODING,
samples=samples)
lazy=True, samples=samples)
else:
table = import_from_source(source, DEFAULT_INPUT_ENCODING,
samples=samples)
lazy=True, samples=samples)

sqlite_connection = sqlite3.Connection(':memory:')
rows.export_to_sqlite(table,
sqlite_connection,
table_name='table1')
result = rows.import_from_sqlite(sqlite_connection, query=query)
result = rows.import_from_sqlite(sqlite_connection, query=query,
lazy=True, samples=samples)

else:
# TODO: if all sources are SQLite we can also optimize the import
if input_locale is not None:
with rows.locale_context(input_locale):
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl, samples=samples)
verify_ssl=verify_ssl, lazy=True,
samples=samples)
for source in sources]
else:
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl, samples=samples)
verify_ssl=verify_ssl, lazy=True,
samples=samples)
for source in sources]

sqlite_connection = sqlite3.Connection(':memory:')
@@ -408,7 +411,8 @@ def query(input_encoding, output_encoding, input_locale, output_locale,
sqlite_connection,
table_name='table{}'.format(index))

result = rows.import_from_sqlite(sqlite_connection, query=query)
result = rows.import_from_sqlite(sqlite_connection, query=query,
lazy=True, samples=samples)

# TODO: may use sys.stdout.encoding if output_file = '-'
output_encoding = output_encoding or sys.stdout.encoding or \
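The change above threads `lazy=True` through the whole `rows query` pipeline. Roughly the same flow in library code, as a sketch (file name and query string are placeholders):

```python
import sqlite3

import rows

# Stream the source into an in-memory SQLite database, then stream the
# query result back out; no full Table object is built in Python.
table = rows.import_from_csv('data.csv', lazy=True)
connection = sqlite3.Connection(':memory:')
rows.export_to_sqlite(table, connection, table_name='table1')
result = rows.import_from_sqlite(connection,
                                 query='SELECT * FROM table1',
                                 lazy=True)
```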
3 changes: 3 additions & 0 deletions rows/plugins/dicts.py
@@ -52,6 +52,9 @@ def import_from_dicts(data, samples=None, *args, **kwargs):
return create_table(chain([headers], data_rows), meta=meta, *args, **kwargs)


import_from_dicts.is_lazy = False


def export_to_dicts(table, *args, **kwargs):
"""Export a `rows.Table` to a list of dicts"""
field_names = table.field_names
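Each import function is now tagged with an `is_lazy` attribute, so calling code can discover whether a plugin streams its source or reads it up front. A small sketch of how the flag could be inspected (assuming these functions stay exposed on the top-level `rows` namespace):

```python
import rows

for name in ('import_from_csv', 'import_from_sqlite', 'import_from_dicts'):
    function = getattr(rows, name)
    # Plugins tagged is_lazy = False read the whole source before
    # create_table() runs, so lazy=True cannot avoid that upfront cost.
    print(name, 'lazy' if function.is_lazy else 'eager')
```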
5 changes: 5 additions & 0 deletions rows/plugins/ods.py
@@ -103,5 +103,10 @@ def import_from_ods(filename_or_fobj, index=0, *args, **kwargs):

max_length = max(len(row) for row in table_rows)
full_rows = complete_with_None(table_rows, max_length)

meta = {'imported_from': 'ods', 'filename': filename,}

return create_table(full_rows, meta=meta, *args, **kwargs)


import_from_ods.is_lazy = False
4 changes: 3 additions & 1 deletion rows/plugins/plugin_csv.py
@@ -118,6 +118,9 @@ def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
return create_table(reader, meta=meta, *args, **kwargs)


import_from_csv.is_lazy = True


def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
dialect=unicodecsv.excel, batch_size=100, callback=None,
*args, **kwargs):
@@ -130,7 +133,6 @@ def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
contents.
"""
# TODO: will work only if table.fields is OrderedDict
# TODO: should use fobj? What about creating a method like json.dumps?

if filename_or_fobj is not None:
_, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
4 changes: 4 additions & 0 deletions rows/plugins/plugin_html.py
@@ -97,6 +97,9 @@ def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
return create_table(table_rows, meta=meta, *args, **kwargs)


import_from_html.is_lazy = False


def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args,
**kwargs):
"""Export and return rows.Table data to HTML file."""
@@ -106,6 +109,7 @@ def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args,
header = [' <th> {} </th>\n'.format(field) for field in fields]
result.extend(header)
result.extend([' </tr>\n', ' </thead>\n', '\n', ' <tbody>\n', '\n'])
# TODO: could be lazy so we don't need to store the whole table in memory
for index, row in enumerate(serialized_table, start=1):
css_class = 'odd' if index % 2 == 1 else 'even'
result.append(' <tr class="{}">\n'.format(css_class))
6 changes: 6 additions & 0 deletions rows/plugins/plugin_json.py
@@ -35,6 +35,7 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
filename, fobj = get_filename_and_fobj(filename_or_fobj)

json_obj = json.load(fobj, encoding=encoding)
# TODO: may use import_from_dicts here
field_names = list(json_obj[0].keys())
table_rows = [[item[key] for key in field_names] for item in json_obj]

@@ -44,6 +45,9 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)


import_from_json.is_lazy = False


def _convert(value, field_type, *args, **kwargs):
if value is None or field_type in (
fields.BinaryField,
@@ -74,6 +78,8 @@ def export_to_json(table, filename_or_fobj=None, encoding='utf-8', indent=None,
fields = table.fields
prepared_table = prepare_to_export(table, *args, **kwargs)
field_names = next(prepared_table)

# TODO: could be lazy so we don't need to store the whole table in memory
data = [{field_name: _convert(value, fields[field_name], *args, **kwargs)
for field_name, value in zip(field_names, row)}
for row in prepared_table]
6 changes: 5 additions & 1 deletion rows/plugins/plugin_parquet.py
@@ -52,8 +52,12 @@ def import_from_parquet(filename_or_fobj, *args, **kwargs):
for schema in parquet._read_footer(fobj).schema
if schema.type is not None])
header = list(types.keys())
table_rows = list(parquet.reader(fobj)) # TODO: be lazy
# TODO: make it lazy
table_rows = list(parquet.reader(fobj))

meta = {'imported_from': 'parquet', 'filename': filename,}
return create_table([header] + table_rows, meta=meta, force_types=types,
*args, **kwargs)


import_from_parquet.is_lazy = False
15 changes: 10 additions & 5 deletions rows/plugins/sqlite.py
@@ -21,6 +21,8 @@
import sqlite3
import string

from itertools import chain

import six

import rows.fields as fields
@@ -29,6 +31,7 @@
prepare_to_export)

SQL_TABLE_NAMES = 'SELECT name FROM sqlite_master WHERE type="table"'
# TODO: may use query args instead of string formatting
SQL_CREATE_TABLE = 'CREATE TABLE IF NOT EXISTS "{table_name}" ({field_types})'
SQL_SELECT_ALL = 'SELECT * FROM "{table_name}"'
SQL_INSERT = 'INSERT INTO "{table_name}" ({field_names}) VALUES ({placeholders})'
@@ -122,13 +125,15 @@ def import_from_sqlite(filename_or_connection, table_name='table1', query=None,
if query_args is None:
query_args = tuple()

table_rows = list(cursor.execute(query, query_args)) # TODO: may be lazy
header = [six.text_type(info[0]) for info in cursor.description]
cursor.close()
# TODO: should close connection also?
cursor.execute(query, query_args)
data = chain([[six.text_type(info[0]) for info in cursor.description]],
cursor)

meta = {'imported_from': 'sqlite', 'filename': filename_or_connection, }
return create_table([header] + table_rows, meta=meta, *args, **kwargs)
return create_table(data, meta=meta, *args, **kwargs)


import_from_sqlite.is_lazy = True


def export_to_sqlite(table, filename_or_connection, table_name=None,
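The sqlite import now keeps the cursor open and chains the header onto it instead of calling `list()` on the result set. The same pattern in isolation, as a self-contained sketch:

```python
import sqlite3
from itertools import chain

connection = sqlite3.Connection(':memory:')
connection.execute('CREATE TABLE t (a INTEGER, b TEXT)')
connection.execute("INSERT INTO t VALUES (1, 'x'), (2, 'y')")
cursor = connection.cursor()
cursor.execute('SELECT * FROM t')

# Prepend the header row to the still-lazy cursor so consumers see one
# iterable: first the column names, then the data rows.
data = chain([[info[0] for info in cursor.description]], cursor)
print(list(data))  # [['a', 'b'], (1, 'x'), (2, 'y')]
```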
3 changes: 3 additions & 0 deletions rows/plugins/txt.py
@@ -175,6 +175,9 @@ def import_from_txt(filename_or_fobj, encoding='utf-8',
return create_table(table_rows, meta=meta, *args, **kwargs)


import_from_txt.is_lazy = False


def export_to_txt(table, filename_or_fobj=None, encoding=None,
frame_style="ASCII", safe_none_frame=True, *args, **kwargs):
"""Export a `rows.Table` to text.
58 changes: 46 additions & 12 deletions rows/plugins/utils.py
@@ -28,7 +28,7 @@
from collections.abc import Iterator

from rows.fields import detect_types
from rows.table import FlexibleTable, Table
from rows.table import FlexibleTable, Table, LazyTable

SLUG_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_'

@@ -134,10 +134,25 @@ def make_header(field_names, permit_not=False):
return result


def get_row_data(full_field_names, field_names):

field_indexes = [full_field_names.index(field_name)
for field_name in field_names]

def func(rows_data):
for row_data in rows_data:
yield [row_data[field_index] for field_index in field_indexes]

return func


def create_table(data, meta=None, fields=None, skip_header=True,
import_fields=None, samples=None, force_types=None,
*args, **kwargs):
lazy=False, *args, **kwargs):
# TODO: change samples to be a fixed number
# TODO: may change samples logic (`float('inf')` or `all`)
# TODO: add auto_detect_types=True parameter

table_rows = iter(data)
sample_rows = []

@@ -159,6 +174,9 @@ def create_table(data, meta=None, fields=None, skip_header=True,
if not isinstance(fields, OrderedDict):
raise ValueError('`fields` must be an `OrderedDict`')

# TODO: if `fields` is set, we're going to have the wrong order,
# compared to the first row (header).

if skip_header:
next(table_rows)

@@ -181,26 +199,38 @@ def create_table(data, meta=None, fields=None, skip_header=True,
new_fields[field_name] = fields[field_name]
fields = new_fields

table = Table(fields=fields, meta=meta)
# TODO: put this inside Table.__init__
for row in chain(sample_rows, table_rows):
table.append({field_name: value
for field_name, value in zip(header, row)})
if not lazy:
table = Table(fields=fields, meta=meta)

# TODO: put this inside Table.__init__
for row in chain(sample_rows, table_rows):
table.append({field_name: value
for field_name, value in zip(header, row)})

else:
data = chain(sample_rows, table_rows)
field_names = list(fields.keys())  # list, so the comparison below works on Python 3

if header != field_names:
rows_data = get_row_data(header, field_names)
data = chain(rows_data(sample_rows), rows_data(table_rows))

table = LazyTable(fields=fields, data=data, meta=meta)

return table


def prepare_to_export(table, export_fields=None, *args, **kwargs):
# TODO: optimize for more used cases (export_fields=None)

# TODO: may create `BaseTable` and use `isinstance` instead
table_type = type(table)
if table_type not in (FlexibleTable, Table):
if table_type not in (FlexibleTable, Table, LazyTable):
raise ValueError('Table type not recognized')

if export_fields is None:
# we use already slugged-fieldnames
if export_fields is None: # Table has slugged fieldnames already
export_fields = table.field_names
else:
# we need to slug all the field names
else: # Need to slug all the field names before exporting
export_fields = make_header(export_fields)

table_field_names = table.field_names
@@ -211,13 +241,17 @@ def prepare_to_export(table, export_fields=None, *args, **kwargs):

yield export_fields

# TODO: create a standard API on all `Table` classes
if table_type is Table:
field_indexes = list(map(table_field_names.index, export_fields))
for row in table._rows:
yield [row[field_index] for field_index in field_indexes]
elif table_type is FlexibleTable:
for row in table._rows:
yield [row[field_name] for field_name in export_fields]
elif table_type is LazyTable:
for row in table:
yield [getattr(row, field_name) for field_name in export_fields]


def serialize(table, *args, **kwargs):
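Of the helpers above, `get_row_data` builds a projector that filters and reorders each row to match a chosen subset of the header; it is what lets the lazy branch of `create_table` honor `import_fields` without materializing rows. A quick sketch of its behavior:

```python
from rows.plugins.utils import get_row_data  # added in this branch

full_header = ['id', 'name', 'email']
wanted = ['email', 'id']

project = get_row_data(full_header, wanted)
result = project([[1, 'ana', 'a@example.com'],
                  [2, 'bob', 'b@example.com']])
print(list(result))
# [['a@example.com', 1], ['b@example.com', 2]]
```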
3 changes: 3 additions & 0 deletions rows/plugins/xls.py
@@ -163,6 +163,9 @@ def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
return create_table(table_rows, meta=meta, *args, **kwargs)


import_from_xls.is_lazy = False


def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args,
**kwargs):
"""Export the rows.Table to XLS file and return the saved file."""
4 changes: 4 additions & 0 deletions rows/plugins/xlsx.py
@@ -79,12 +79,16 @@ def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
for row_index in range(start_row + 1, end_row + 2)]

filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)

metadata = {'imported_from': 'xlsx',
'filename': filename,
'sheet_name': sheet_name, }
return create_table(table_rows, meta=metadata, *args, **kwargs)


import_from_xlsx.is_lazy = False


FORMATTING_STYLES = {
fields.DateField: 'YYYY-MM-DD',
fields.DatetimeField: 'YYYY-MM-DD HH:MM:SS',
4 changes: 4 additions & 0 deletions rows/plugins/xpath.py
@@ -68,6 +68,7 @@ def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,

filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
xml = fobj.read().decode(encoding)
# TODO: make it lazy (is it possible with lxml?)
tree = tree_from_string(xml)
row_elements = tree.xpath(rows_xpath)

@@ -79,3 +80,6 @@
'filename': filename,
'encoding': encoding,}
return create_table([header] + result_rows, meta=meta, *args, **kwargs)


import_from_xpath.is_lazy = False
33 changes: 33 additions & 0 deletions rows/table.py
@@ -28,6 +28,39 @@
from collections.abc import MutableSequence, Sized


class LazyTable(object):

[Review comment] Isn't it a good idea to create a BaseTable class for all *Table classes to inherit from?


def __init__(self, fields, data, meta=None):
self.fields = OrderedDict(fields)

self.Row = namedtuple('Row', self.field_names)
self.meta = dict(meta) if meta is not None else {}
self._rows = data

@property
def field_names(self):
return list(self.fields.keys())

@property
def field_types(self):
return list(self.fields.values())

def __repr__(self):
imported = ''
if 'imported_from' in self.meta:
imported = ' (from {})'.format(self.meta['imported_from'])

return '<rows.LazyTable{}, {} fields>'.format(
imported, len(self.fields))

def __iter__(self):
fields = list(self.fields.items())
for row in self._rows:
yield self.Row(*[field_type.deserialize(value)
for value, (field_name, field_type) in
zip(row, fields)])


class Table(MutableSequence):

def __init__(self, fields, meta=None):
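To close, a self-contained sketch of the new class in use (field types and values chosen only for illustration):

```python
from collections import OrderedDict

from rows import fields
from rows.table import LazyTable  # added in this branch

table = LazyTable(
    fields=OrderedDict([('id', fields.IntegerField),
                        ('name', fields.TextField)]),
    data=iter([['1', 'ana'], ['2', 'bob']]),
    meta={'imported_from': 'sketch'},
)

# Values are deserialized while iterating; the underlying iterator is
# consumed exactly once, so a second pass yields nothing.
for row in table:
    print(row.id, row.name)  # -> 1 ana, then 2 bob
```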