Feature/lazyness #229

Open · wants to merge 7 commits into base: develop
2 changes: 1 addition & 1 deletion rows/__init__.py
@@ -22,7 +22,7 @@
import rows.plugins as plugins

from rows.operations import join, transform, transpose # NOQA
from rows.table import Table, FlexibleTable # NOQA
from rows.table import FlexibleTable, LazyTable, Table # NOQA
from rows.localization import locale_context # NOQA


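For context, a minimal sketch of how the lazy path added in this branch is meant to be used from the public API (the `data.csv` filename is a placeholder; `lazy=True` travels through `**kwargs` into `create_table`):

```python
import rows

# With lazy=True, create_table() builds a LazyTable instead of a Table:
# only the type-detection samples are buffered; the rest of the file is
# read while iterating.
table = rows.import_from_csv('data.csv', lazy=True)

for row in table:
    print(row)  # each row is deserialized on demand
```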
16 changes: 10 additions & 6 deletions rows/cli.py
@@ -379,27 +379,30 @@ def query(input_encoding, output_encoding, input_locale, output_locale,
if input_locale is not None:
with rows.locale_context(input_locale):
table = import_from_source(source, DEFAULT_INPUT_ENCODING,
samples=samples)
lazy=True, samples=samples)
else:
table = import_from_source(source, DEFAULT_INPUT_ENCODING,
samples=samples)
lazy=True, samples=samples)

sqlite_connection = sqlite3.Connection(':memory:')
rows.export_to_sqlite(table,
sqlite_connection,
table_name='table1')
result = rows.import_from_sqlite(sqlite_connection, query=query)
result = rows.import_from_sqlite(sqlite_connection, query=query,
lazy=True, samples=samples)

else:
# TODO: if all sources are SQLite we can also optimize the import
if input_locale is not None:
with rows.locale_context(input_locale):
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl, samples=samples)
verify_ssl=verify_ssl, lazy=True,
samples=samples)
for source in sources]
else:
tables = [_import_table(source, encoding=input_encoding,
verify_ssl=verify_ssl, samples=samples)
verify_ssl=verify_ssl, lazy=True,
samples=samples)
for source in sources]

sqlite_connection = sqlite3.Connection(':memory:')
@@ -408,7 +411,8 @@ def query(input_encoding, output_encoding, input_locale, output_locale,
sqlite_connection,
table_name='table{}'.format(index))

result = rows.import_from_sqlite(sqlite_connection, query=query)
result = rows.import_from_sqlite(sqlite_connection, query=query,
lazy=True, samples=samples)

# TODO: may use sys.stdout.encoding if output_file = '-'
output_encoding = output_encoding or sys.stdout.encoding or \
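The change above threads `lazy=True` through the whole `rows query` pipeline. Roughly the same flow in library code, as a sketch (file name and query string are placeholders):

```python
import sqlite3

import rows

# Stream the source into an in-memory SQLite database, then stream the
# query result back out; no full Table object is built in Python.
table = rows.import_from_csv('data.csv', lazy=True)
connection = sqlite3.Connection(':memory:')
rows.export_to_sqlite(table, connection, table_name='table1')
result = rows.import_from_sqlite(connection,
                                 query='SELECT * FROM table1',
                                 lazy=True)
```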
3 changes: 3 additions & 0 deletions rows/plugins/dicts.py
@@ -52,6 +52,9 @@ def import_from_dicts(data, samples=None, *args, **kwargs):
return create_table(chain([headers], data_rows), meta=meta, *args, **kwargs)


import_from_dicts.is_lazy = False


def export_to_dicts(table, *args, **kwargs):
"""Export a `rows.Table` to a list of dicts"""
field_names = table.field_names
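Each import function is now tagged with an `is_lazy` attribute, so calling code can discover whether a plugin streams its source or reads it up front. A small sketch of how the flag could be inspected (assuming these functions stay exposed on the top-level `rows` namespace):

```python
import rows

for name in ('import_from_csv', 'import_from_sqlite', 'import_from_dicts'):
    function = getattr(rows, name)
    # Plugins tagged is_lazy = False read the whole source before
    # create_table() runs, so lazy=True cannot avoid that upfront cost.
    print(name, 'lazy' if function.is_lazy else 'eager')
```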
5 changes: 5 additions & 0 deletions rows/plugins/ods.py
@@ -103,5 +103,10 @@ def import_from_ods(filename_or_fobj, index=0, *args, **kwargs):

max_length = max(len(row) for row in table_rows)
full_rows = complete_with_None(table_rows, max_length)

meta = {'imported_from': 'ods', 'filename': filename,}

return create_table(full_rows, meta=meta, *args, **kwargs)


import_from_ods.is_lazy = False
4 changes: 3 additions & 1 deletion rows/plugins/plugin_csv.py
@@ -118,6 +118,9 @@ def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
return create_table(reader, meta=meta, *args, **kwargs)


import_from_csv.is_lazy = True


def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
dialect=unicodecsv.excel, batch_size=100, callback=None,
*args, **kwargs):
@@ -130,7 +133,6 @@ def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
contents.
"""
# TODO: will work only if table.fields is OrderedDict
# TODO: should use fobj? What about creating a method like json.dumps?

if filename_or_fobj is not None:
_, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
4 changes: 4 additions & 0 deletions rows/plugins/plugin_html.py
@@ -97,6 +97,9 @@ def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
return create_table(table_rows, meta=meta, *args, **kwargs)


import_from_html.is_lazy = False


def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args,
**kwargs):
"""Export and return rows.Table data to HTML file."""
@@ -106,6 +109,7 @@ def export_to_html(table, filename_or_fobj=None, encoding='utf-8', *args,
header = [' <th> {} </th>\n'.format(field) for field in fields]
result.extend(header)
result.extend([' </tr>\n', ' </thead>\n', '\n', ' <tbody>\n', '\n'])
# TODO: could be lazy so we don't need to store the whole table in memory
for index, row in enumerate(serialized_table, start=1):
css_class = 'odd' if index % 2 == 1 else 'even'
result.append(' <tr class="{}">\n'.format(css_class))
6 changes: 6 additions & 0 deletions rows/plugins/plugin_json.py
@@ -35,6 +35,7 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
filename, fobj = get_filename_and_fobj(filename_or_fobj)

json_obj = json.load(fobj, encoding=encoding)
# TODO: may use import_from_dicts here
field_names = list(json_obj[0].keys())
table_rows = [[item[key] for key in field_names] for item in json_obj]

@@ -44,6 +45,9 @@ def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)


import_from_json.is_lazy = False


def _convert(value, field_type, *args, **kwargs):
if value is None or field_type in (
fields.BinaryField,
@@ -74,6 +78,8 @@ def export_to_json(table, filename_or_fobj=None, encoding='utf-8', indent=None,
fields = table.fields
prepared_table = prepare_to_export(table, *args, **kwargs)
field_names = next(prepared_table)

# TODO: could be lazy so we don't need to store the whole table in memory
data = [{field_name: _convert(value, fields[field_name], *args, **kwargs)
for field_name, value in zip(field_names, row)}
for row in prepared_table]
6 changes: 5 additions & 1 deletion rows/plugins/plugin_parquet.py
@@ -52,8 +52,12 @@ def import_from_parquet(filename_or_fobj, *args, **kwargs):
for schema in parquet._read_footer(fobj).schema
if schema.type is not None])
header = list(types.keys())
table_rows = list(parquet.reader(fobj)) # TODO: be lazy
# TODO: make it lazy
table_rows = list(parquet.reader(fobj))

meta = {'imported_from': 'parquet', 'filename': filename,}
return create_table([header] + table_rows, meta=meta, force_types=types,
*args, **kwargs)


import_from_parquet.is_lazy = False
15 changes: 10 additions & 5 deletions rows/plugins/sqlite.py
@@ -21,6 +21,8 @@
import sqlite3
import string

from itertools import chain

import six

import rows.fields as fields
@@ -29,6 +31,7 @@
prepare_to_export)

SQL_TABLE_NAMES = 'SELECT name FROM sqlite_master WHERE type="table"'
# TODO: may use query args instead of string formatting
SQL_CREATE_TABLE = 'CREATE TABLE IF NOT EXISTS "{table_name}" ({field_types})'
SQL_SELECT_ALL = 'SELECT * FROM "{table_name}"'
SQL_INSERT = 'INSERT INTO "{table_name}" ({field_names}) VALUES ({placeholders})'
@@ -122,13 +125,15 @@ def import_from_sqlite(filename_or_connection, table_name='table1', query=None,
if query_args is None:
query_args = tuple()

table_rows = list(cursor.execute(query, query_args)) # TODO: may be lazy
header = [six.text_type(info[0]) for info in cursor.description]
cursor.close()
# TODO: should close connection also?
cursor.execute(query, query_args)
data = chain([[six.text_type(info[0]) for info in cursor.description]],
cursor)

meta = {'imported_from': 'sqlite', 'filename': filename_or_connection, }
return create_table([header] + table_rows, meta=meta, *args, **kwargs)
return create_table(data, meta=meta, *args, **kwargs)


import_from_sqlite.is_lazy = True


def export_to_sqlite(table, filename_or_connection, table_name=None,
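The sqlite import now keeps the cursor open and chains the header onto it instead of calling `list()` on the result set. The same pattern in isolation, as a self-contained sketch:

```python
import sqlite3
from itertools import chain

connection = sqlite3.Connection(':memory:')
connection.execute('CREATE TABLE t (a INTEGER, b TEXT)')
connection.execute("INSERT INTO t VALUES (1, 'x'), (2, 'y')")
cursor = connection.cursor()
cursor.execute('SELECT * FROM t')

# Prepend the header row to the still-lazy cursor so consumers see one
# iterable: first the column names, then the data rows.
data = chain([[info[0] for info in cursor.description]], cursor)
print(list(data))  # [['a', 'b'], (1, 'x'), (2, 'y')]
```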
3 changes: 3 additions & 0 deletions rows/plugins/txt.py
@@ -175,6 +175,9 @@ def import_from_txt(filename_or_fobj, encoding='utf-8',
return create_table(table_rows, meta=meta, *args, **kwargs)


import_from_txt.is_lazy = False


def export_to_txt(table, filename_or_fobj=None, encoding=None,
frame_style="ASCII", safe_none_frame=True, *args, **kwargs):
"""Export a `rows.Table` to text.
58 changes: 46 additions & 12 deletions rows/plugins/utils.py
@@ -28,7 +28,7 @@
from collections.abc import Iterator

from rows.fields import detect_types
from rows.table import FlexibleTable, Table
from rows.table import FlexibleTable, Table, LazyTable

SLUG_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_'

@@ -134,10 +134,25 @@ def make_header(field_names, permit_not=False):
return result


def get_row_data(full_field_names, field_names):

field_indexes = [full_field_names.index(field_name)
for field_name in field_names]

def func(rows_data):
for row_data in rows_data:
yield [row_data[field_index] for field_index in field_indexes]

return func


def create_table(data, meta=None, fields=None, skip_header=True,
import_fields=None, samples=None, force_types=None,
*args, **kwargs):
lazy=False, *args, **kwargs):
# TODO: change samples to be a fixed number
# TODO: may change samples logic (`float('inf')` or `all`)
# TODO: add auto_detect_types=True parameter

table_rows = iter(data)
sample_rows = []

@@ -159,6 +174,9 @@ def create_table(data, meta=None, fields=None, skip_header=True,
if not isinstance(fields, OrderedDict):
raise ValueError('`fields` must be an `OrderedDict`')

# TODO: if `fields` is set, we're going to have the wrong order,
# compared to the first row (header).

if skip_header:
next(table_rows)

@@ -181,26 +199,38 @@ def create_table(data, meta=None, fields=None, skip_header=True,
new_fields[field_name] = fields[field_name]
fields = new_fields

table = Table(fields=fields, meta=meta)
# TODO: put this inside Table.__init__
for row in chain(sample_rows, table_rows):
table.append({field_name: value
for field_name, value in zip(header, row)})
if not lazy:
table = Table(fields=fields, meta=meta)

# TODO: put this inside Table.__init__
for row in chain(sample_rows, table_rows):
table.append({field_name: value
for field_name, value in zip(header, row)})

else:
data = chain(sample_rows, table_rows)
field_names = list(fields.keys())  # list, so the comparison below works on Python 3

if header != field_names:
rows_data = get_row_data(header, field_names)
data = chain(rows_data(sample_rows), rows_data(table_rows))

table = LazyTable(fields=fields, data=data, meta=meta)

return table


def prepare_to_export(table, export_fields=None, *args, **kwargs):
# TODO: optimize for more used cases (export_fields=None)

# TODO: may create `BaseTable` and use `isinstance` instead
table_type = type(table)
if table_type not in (FlexibleTable, Table):
if table_type not in (FlexibleTable, Table, LazyTable):
raise ValueError('Table type not recognized')

if export_fields is None:
# we use already slugged-fieldnames
if export_fields is None: # Table has slugged fieldnames already
export_fields = table.field_names
else:
# we need to slug all the field names
else: # Need to slug all the field names before exporting
export_fields = make_header(export_fields)

table_field_names = table.field_names
@@ -211,13 +241,17 @@ def prepare_to_export(table, export_fields=None, *args, **kwargs):

yield export_fields

# TODO: create a standard API on all `Table` classes
if table_type is Table:
field_indexes = list(map(table_field_names.index, export_fields))
for row in table._rows:
yield [row[field_index] for field_index in field_indexes]
elif table_type is FlexibleTable:
for row in table._rows:
yield [row[field_name] for field_name in export_fields]
elif table_type is LazyTable:
for row in table:
yield [getattr(row, field_name) for field_name in export_fields]


def serialize(table, *args, **kwargs):
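Of the helpers above, `get_row_data` builds a projector that filters and reorders each row to match a chosen subset of the header; it is what lets the lazy branch of `create_table` honor `import_fields` without materializing rows. A quick sketch of its behavior:

```python
from rows.plugins.utils import get_row_data  # added in this branch

full_header = ['id', 'name', 'email']
wanted = ['email', 'id']

project = get_row_data(full_header, wanted)
result = project([[1, 'ana', 'a@example.com'],
                  [2, 'bob', 'b@example.com']])
print(list(result))
# [['a@example.com', 1], ['b@example.com', 2]]
```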
3 changes: 3 additions & 0 deletions rows/plugins/xls.py
@@ -163,6 +163,9 @@ def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
return create_table(table_rows, meta=meta, *args, **kwargs)


import_from_xls.is_lazy = False


def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args,
**kwargs):
"""Export the rows.Table to XLS file and return the saved file."""
4 changes: 4 additions & 0 deletions rows/plugins/xlsx.py
@@ -79,12 +79,16 @@ def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
for row_index in range(start_row + 1, end_row + 2)]

filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)

metadata = {'imported_from': 'xlsx',
'filename': filename,
'sheet_name': sheet_name, }
return create_table(table_rows, meta=metadata, *args, **kwargs)


import_from_xlsx.is_lazy = False


FORMATTING_STYLES = {
fields.DateField: 'YYYY-MM-DD',
fields.DatetimeField: 'YYYY-MM-DD HH:MM:SS',
4 changes: 4 additions & 0 deletions rows/plugins/xpath.py
@@ -68,6 +68,7 @@ def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,

filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
xml = fobj.read().decode(encoding)
# TODO: make it lazy (is it possible with lxml?)
tree = tree_from_string(xml)
row_elements = tree.xpath(rows_xpath)

@@ -79,3 +80,6 @@
'filename': filename,
'encoding': encoding,}
return create_table([header] + result_rows, meta=meta, *args, **kwargs)


import_from_xpath.is_lazy = False
33 changes: 33 additions & 0 deletions rows/table.py
@@ -28,6 +28,39 @@
from collections.abc import MutableSequence, Sized


class LazyTable(object):

[Review comment] Isn't it a good idea to create a BaseTable class for all *Table classes to inherit from?


def __init__(self, fields, data, meta=None):
self.fields = OrderedDict(fields)

self.Row = namedtuple('Row', self.field_names)
self.meta = dict(meta) if meta is not None else {}
self._rows = data

@property
def field_names(self):
return list(self.fields.keys())

@property
def field_types(self):
return list(self.fields.values())

def __repr__(self):
imported = ''
if 'imported_from' in self.meta:
imported = ' (from {})'.format(self.meta['imported_from'])

return '<rows.LazyTable{}, {} fields>'.format(
imported, len(self.fields))

def __iter__(self):
fields = list(self.fields.items())
for row in self._rows:
yield self.Row(*[field_type.deserialize(value)
for value, (field_name, field_type) in
zip(row, fields)])


class Table(MutableSequence):

def __init__(self, fields, meta=None):
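To close, a self-contained sketch of the new class in use (field types and values chosen only for illustration):

```python
from collections import OrderedDict

from rows import fields
from rows.table import LazyTable  # added in this branch

table = LazyTable(
    fields=OrderedDict([('id', fields.IntegerField),
                        ('name', fields.TextField)]),
    data=iter([['1', 'ana'], ['2', 'bob']]),
    meta={'imported_from': 'sketch'},
)

# Values are deserialized while iterating; the underlying iterator is
# consumed exactly once, so a second pass yields nothing.
for row in table:
    print(row.id, row.name)  # -> 1 ana, then 2 bob
```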