Skip to content

Commit 7be997d

Browse files
authored
Merge pull request #263 from cgoldshtein/master - Implements chunked file processing
## Description

Implements chunked file processing to resolve system freezing when loading very large files (>2GB) into DataStore. Fixes #259.

## Problem
- XLoader would freeze or hang when processing files larger than 2GB.
- The entire file was loaded into memory, causing system unresponsiveness.
- There was no progress feedback during large-file processing.
- Memory was exhausted on very large datasets.

## Solution
- **Chunked processing**: split large files into configurable chunks (default: 1GB).
- **Progress logging**: log each chunk as it is processed.
- **Memory efficiency**: memory usage stays constant regardless of file size.
- **Configurable**: new `ckanext.xloader.copy_chunk_size` setting.

## Changes
- Add a `split_copy_by_size()` function for chunked file processing.
- Extract a `copy_file()` helper for PostgreSQL COPY operations.
- Add the configuration option `ckanext.xloader.copy_chunk_size` (default: 1GB).
- Update tests to use a smaller chunk size for testing.
- Maintain existing behaviour for smaller files.

## Configuration
```ini
# Optional: set chunk size in bytes (default: 1073741824 = 1GB)
ckanext.xloader.copy_chunk_size = 104857600  # 100MB chunks
```
2 parents 2582464 + 29a9980 commit 7be997d

File tree

4 files changed

+287
-45
lines changed

4 files changed

+287
-45
lines changed

ckanext/xloader/config_declaration.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,13 @@ groups:
206206
like database deadlocks or network timeouts. Set to 0 to disable retries.
207207
type: int
208208
required: false
209+
- key: ckanext.xloader.copy_chunk_size
210+
default: 1073741824
211+
example: 536870912
212+
description: |
213+
Maximum size in bytes for each chunk when processing files.
214+
Files are split into chunks to prevent memory exhaustion and
215+
system freezing. Default is 1GB (1073741824 bytes). Smaller values
216+
use less memory but create more chunks.
217+
type: int
218+
required: false

ckanext/xloader/loader.py

Lines changed: 107 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,109 @@ def _clear_datastore_resource(resource_id):
143143
conn.execute(sa.text('TRUNCATE TABLE "{}" RESTART IDENTITY'.format(resource_id)))
144144

145145

146+
def copy_file(csv_filepath, engine, logger, resource_id, headers, delimiter):
    """Stream a CSV file into the DataStore table via COPY FROM STDIN.

    Loading options considered for getting data into postgres:
      1. \\copy - a psql meta-command, not accessible via psycopg2.
      2. COPY from a server-side file - requires the db user to have
         superuser privileges, which is dangerous and unavailable on
         managed hosts such as AWS.
      3. pgloader - per its docs, while COPY is restricted to reading from
         standard input or a file on the server's file system, psql's \\copy
         streams a client-local file over the network using the same
         protocol pgloader uses.
      4. COPY FROM STDIN - not quite as fast as COPY from a file, but
         avoids the superuser issue.  <-- picked
    """
    # Table and column names cannot be bind parameters (params only
    # substitute single-quoted filter values), so the statement is
    # assembled by string formatting.
    column_names = ', '.join('"{}"'.format(h) for h in headers)
    copy_sql = (
        "COPY \"{resource_id}\" ({column_names}) "
        "FROM STDIN "
        "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
        " ENCODING '{encoding}');"
    ).format(
        resource_id=resource_id,
        column_names=column_names,
        delimiter=delimiter,
        encoding='UTF8',
    )

    with engine.begin() as conn:
        cur = conn.connection.cursor()
        # cur.execute('SET DATESTYLE TO "SQL , MDY"')
        try:
            with open(csv_filepath, 'rb') as f:
                try:
                    cur.copy_expert(copy_sql, f)
                except psycopg2.DataError as e:
                    # e is a str but may contain foreign chars, e.g.
                    # 'extra data: "paul,pa\xc3\xbcl"\n'; logging and
                    # exceptions need a normal (7 bit) str.
                    error_str = str(e)
                    logger.warning('%s: %s', resource_id, error_str)
                    raise LoaderError('Error during the load into PostgreSQL:'
                                      ' {}'.format(error_str))
        finally:
            cur.close()
194+
def split_copy_by_size(input_file, engine, logger, resource_id, headers,
                       delimiter=',', max_size=1024 ** 3, encoding='utf-8'):
    """
    Read a CSV file, split it into chunks of at most ``max_size`` bytes,
    and load each chunk into the DataStore table via ``copy_file``
    (PostgreSQL COPY).

    Chunks are split on row boundaries, so a chunk may exceed ``max_size``
    by up to one row. Every chunk after the first gets a copy of the header
    row, because ``copy_file`` issues COPY with ``HEADER 1`` and therefore
    skips the first line of each chunk.

    Args:
        input_file (str): Path to the input CSV file.
        engine: SQLAlchemy engine connected to the DataStore database.
        logger: Logger used for progress and debug messages.
        resource_id (str): Name of the target DataStore table.
        headers (list): Column names matching the CSV header.
        delimiter (str, optional): CSV delimiter character. Defaults to ','.
        max_size (int, optional): Maximum size in bytes of each chunk.
            Defaults to 1 Gigabyte.
        encoding (str, optional): Text encoding of the input file.
            Defaults to 'utf-8'.
    """
    chunk_count = 0
    file_size = os.path.getsize(input_file)
    logger.info('Starting chunked processing for file size: %s bytes '
                'with chunk size: %s bytes', file_size, max_size)

    output_filename = f'/tmp/output_{resource_id}.csv'
    current_file = None
    try:
        with open(input_file, 'r', encoding=encoding) as infile:
            needs_header = False
            for row in infile:
                if current_file is None or current_file.tell() >= max_size:
                    if current_file:
                        # Close (and thereby flush) the chunk BEFORE copying:
                        # copy_file reopens the file by name, so any data
                        # still in the write buffer would otherwise be lost.
                        current_file.close()
                        chunk_count += 1
                        logger.debug('Before copying chunk %s: %s',
                                     chunk_count, output_filename)
                        copy_file(output_filename, engine, logger,
                                  resource_id, headers, delimiter)
                        logger.debug('Copied chunk %s: %s',
                                     chunk_count, output_filename)
                        needs_header = True
                    # Write chunks with the same encoding the input is read
                    # with, so multi-byte characters survive the round trip
                    # regardless of the locale default.
                    current_file = open(output_filename, 'w', encoding=encoding)
                    if needs_header:
                        current_file.write(delimiter.join(headers) + '\n')
                current_file.write(row)

        if current_file is None:
            # Empty input file: nothing was written, so there is no chunk
            # file to copy or remove.
            logger.info('Input file is empty; no chunks to process')
            return

        # Close and copy the last (possibly only) chunk.
        current_file.close()
        chunk_count += 1
        logger.debug('Before copying final chunk %s: %s',
                     chunk_count, output_filename)
        copy_file(output_filename, engine, logger, resource_id, headers,
                  delimiter)
        logger.debug('Copied final chunk %s: %s', chunk_count, output_filename)
    finally:
        # Always release the handle and remove the temporary chunk file,
        # even if COPY raised part-way through.
        if current_file is not None and not current_file.closed:
            current_file.close()
        if os.path.exists(output_filename):
            os.remove(output_filename)

    logger.info('Completed chunked processing: %s chunks processed '
                'for file size %s bytes', chunk_count, file_size)
247+
248+
146249
def _read_metadata(table_filepath, mimetype, logger):
147250
# Determine the header row
148251
logger.info('Determining column names and types')
@@ -342,51 +445,10 @@ def strip_white_space_iter():
342445

343446
logger.info('Copying to database...')
344447

345-
# Options for loading into postgres:
346-
# 1. \copy - can't use as that is a psql meta-command and not accessible
347-
# via psycopg2
348-
# 2. COPY - requires the db user to have superuser privileges. This is
349-
# dangerous. It is also not available on AWS, for example.
350-
# 3. pgloader method? - as described in its docs:
351-
# Note that while the COPY command is restricted to read either from
352-
# its standard input or from a local file on the server's file system,
353-
# the command line tool psql implements a \copy command that knows
354-
# how to stream a file local to the client over the network and into
355-
# the PostgreSQL server, using the same protocol as pgloader uses.
356-
# 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
357-
# the superuser issue. <-- picked
358-
359-
with engine.begin() as conn:
360-
cur = conn.connection.cursor()
361-
try:
362-
with open(csv_filepath, 'rb') as f:
363-
# can't use :param for table name because params are only
364-
# for filter values that are single quoted.
365-
try:
366-
cur.copy_expert(
367-
"COPY \"{resource_id}\" ({column_names}) "
368-
"FROM STDIN "
369-
"WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
370-
" ENCODING '{encoding}');"
371-
.format(
372-
resource_id=resource_id,
373-
column_names=', '.join(['"{}"'.format(h)
374-
for h in headers]),
375-
delimiter=delimiter,
376-
encoding='UTF8',
377-
),
378-
f)
379-
except psycopg2.DataError as e:
380-
# e is a str but with foreign chars e.g.
381-
# 'extra data: "paul,pa\xc3\xbcl"\n'
382-
# but logging and exceptions need a normal (7 bit) str
383-
error_str = str(e)
384-
logger.warning(error_str)
385-
raise LoaderError('Error during the load into PostgreSQL:'
386-
' {}'.format(error_str))
387-
388-
finally:
389-
cur.close()
448+
# Copy file to datastore db, split to chunks.
449+
max_size = config.get('ckanext.xloader.copy_chunk_size', 1024**3)
450+
logger.debug('Using chunk size: %s bytes for resource %s', max_size, resource_id)
451+
split_copy_by_size(csv_filepath, engine, logger, resource_id, headers, delimiter, int(max_size))
390452

391453
logger.info('...copying done')
392454

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
# -*- coding: utf-8 -*-
2+
import os
3+
import pytest
4+
import tempfile
5+
import logging
6+
from typing import Callable, List, Tuple, Any
7+
from unittest.mock import patch, MagicMock
8+
import csv
9+
import sqlalchemy.orm as orm
10+
11+
from ckan.tests import factories
12+
from ckanext.xloader import loader
13+
from ckanext.xloader.loader import get_write_engine
14+
from ckanext.xloader.tests.test_loader import TestLoadBase, get_sample_filepath
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
@pytest.fixture()
def Session():
    """Yield a scoped SQLAlchemy session bound to the DataStore write engine."""
    write_engine = get_write_engine()
    session_factory = orm.sessionmaker(bind=write_engine)
    scoped = orm.scoped_session(session_factory)
    yield scoped
    scoped.close()
25+
26+
27+
@pytest.mark.usefixtures("full_reset", "with_plugins")
@pytest.mark.ckan_config("ckan.plugins", "datastore xloader")
class TestChunkedLoading(TestLoadBase):
    """End-to-end tests for chunked COPY loading of CSV files."""

    def _create_mock_split_copy(self, chunk_size: int) -> Callable:
        """Return a wrapper around split_copy_by_size that forces *chunk_size*."""
        real_split = loader.split_copy_by_size

        def _forced_chunk_split(input_file: Any, engine: Any, logger: Any,
                                resource_id: str, headers: List[str],
                                delimiter: str = ',',
                                max_size: int = 1024 ** 3) -> Any:
            return real_split(input_file, engine, logger, resource_id,
                              headers, delimiter, chunk_size)

        return _forced_chunk_split

    def _create_mock_copy_file(self, copy_calls_list: List[Tuple]) -> Callable:
        """Return a wrapper around copy_file that records each call's args."""
        real_copy = loader.copy_file

        def _recording_copy(*args: Any, **kwargs: Any) -> Any:
            copy_calls_list.append(args)
            return real_copy(*args, **kwargs)

        return _recording_copy

    def _generate_large_csv(self, filepath: str, num_rows: int = 100000,
                            row_size_kb: int = 1) -> Tuple[str, List[str], int]:
        """Write *num_rows* rows of roughly *row_size_kb* KB each to *filepath*."""
        headers = ['id', 'name', 'description', 'data']

        # Pad the last column so each row is about row_size_kb kilobytes,
        # accounting for the other columns' contribution.
        filler = 'x' * max(1, (row_size_kb * 1024) - 50)

        with open(filepath, 'w', newline='', encoding='utf-8') as out:
            writer = csv.writer(out)
            writer.writerow(headers)
            for idx in range(1, num_rows + 1):
                writer.writerow([
                    idx,
                    f'Name_{idx}',
                    f'Description for row {idx}',
                    filler,
                ])

        return filepath, headers, num_rows

    def test_chunked_processing_large_file(self, Session: Any) -> None:
        """Large files are split into chunks and all rows survive the load."""

        # Create a temporary large CSV file (~15MB to trigger chunking).
        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv',
                                         delete=False) as temp_file:
            temp_filepath = temp_file.name

        try:
            # Generate ~15MB of data (15000 rows * ~1KB each).
            csv_filepath, expected_headers, expected_rows = \
                self._generate_large_csv(temp_filepath, num_rows=15000,
                                         row_size_kb=1)

            # The file must be big enough to trigger chunking.
            file_size = os.path.getsize(csv_filepath)
            assert file_size > 10 * 1024 * 1024, f"File size {file_size} should be > 10MB"

            resource = factories.Resource()
            resource_id = resource['id']

            # Force a 10MB chunk size and record every copy_file call.
            copy_calls = []
            forced_split = self._create_mock_split_copy(10 * 1024 * 1024)
            recording_copy = self._create_mock_copy_file(copy_calls)

            with patch('ckanext.xloader.loader.split_copy_by_size',
                       side_effect=forced_split):
                with patch('ckanext.xloader.loader.copy_file',
                           side_effect=recording_copy):
                    fields = loader.load_csv(
                        csv_filepath,
                        resource_id=resource_id,
                        mimetype="text/csv",
                        logger=logger,
                    )

            # Chunking must have happened (more than one COPY).
            assert len(copy_calls) > 1, "Expected multiple chunks but file was not chunked"

            # Data integrity: every row made it into the DataStore.
            records = self._get_records(Session, resource_id)
            assert len(records) == expected_rows, f"Expected {expected_rows} records, got {len(records)}"

            # Column structure is preserved.
            column_names = self._get_column_names(Session, resource_id)
            assert column_names == ['_id', '_full_text'] + expected_headers

            # Sort by the 'id' column (index 1; index 0 is _id) so ordering
            # is deterministic before spot-checking first and last rows.
            # _get_records excludes _full_text, so the record layout is:
            # 0: _id, 1: id, 2: name, 3: description, 4: data
            ordered = sorted(records, key=lambda rec: int(rec[1]))

            assert ordered[0][1] == '1'
            assert ordered[0][2] == 'Name_1'
            assert ordered[-1][1] == str(expected_rows)
            assert ordered[-1][2] == f'Name_{expected_rows}'

        finally:
            # Clean up the temporary file.
            if os.path.exists(temp_filepath):
                os.unlink(temp_filepath)

    def test_small_file_no_chunking(self, Session: Any) -> None:
        """Small files produce exactly one COPY when the chunk size exceeds them."""

        # Use an existing small sample file.
        csv_filepath = get_sample_filepath("simple.csv")
        resource = factories.Resource()
        resource_id = resource['id']

        # A 10MB chunk size is far larger than the sample, so no chunking.
        copy_calls = []
        forced_split = self._create_mock_split_copy(10 * 1024 * 1024)
        recording_copy = self._create_mock_copy_file(copy_calls)

        with patch('ckanext.xloader.loader.split_copy_by_size',
                   side_effect=forced_split):
            with patch('ckanext.xloader.loader.copy_file',
                       side_effect=recording_copy):
                fields = loader.load_csv(
                    csv_filepath,
                    resource_id=resource_id,
                    mimetype="text/csv",
                    logger=logger,
                )

        # A single COPY call means the file was not chunked.
        assert len(copy_calls) == 1, f"Small file should not be chunked, got {len(copy_calls)} copy calls"

        # The data still loads correctly.
        records = self._get_records(Session, resource_id)
        assert len(records) == 6  # Known number of records in simple.csv

ckanext/xloader/tests/test_jobs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ def data(create_with_upload, apikey):
8383

8484
@pytest.mark.usefixtures("clean_db", "with_plugins")
8585
@pytest.mark.ckan_config("ckanext.xloader.job_timeout", 2)
86+
@pytest.mark.ckan_config("ckanext.xloader.copy_chunk_size", 5120)
8687
@pytest.mark.ckan_config("ckan.jobs.timeout", 2)
8788
class TestXLoaderJobs(helpers.FunctionalRQTestBase):
8889

0 commit comments

Comments (0)