feat: implement chunked file processing to prevent freezing on large files

Chava Goldshtein · Chava Goldshtein · commit e7ae0b1969ac · 2025-09-25T09:57:37.000+03:00
- Add split_copy_by_size() function to process files in configurable chunks - Extract copy_file() helper for PostgreSQL COPY operations - Add ckanext.xloader.copy_chunk_size config (default: 1GB) - Process large files (>2GB) without memory exhaustion or system freezing - Maintain progress logging for each chunk processed - Prevent system unresponsiveness during large file uploads Fixes ckan#259
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
@@ -123,6 +123,101 @@ def _clear_datastore_resource(resource_id):
         conn.execute(sa.text('TRUNCATE TABLE "{}" RESTART IDENTITY'.format(resource_id)))
 
 
+def copy_file(csv_filepath, engine, logger, resource_id, headers, delimiter):
+    # Options for loading into postgres:
+    # 1. \copy - can't use as that is a psql meta-command and not accessible
+    #    via psycopg2
+    # 2. COPY - requires the db user to have superuser privileges. This is
+    #    dangerous. It is also not available on AWS, for example.
+    # 3. pgloader method? - as described in its docs:
+    #
+    #    Note that while the COPY command is restricted to read either from
+    #    its standard input or from a local file on the server's file system,
+    #    the command line tool psql implements a \copy command that knows
+    #    how to stream a file local to the client over the network and into
+    #    the PostgreSQL server, using the same protocol as pgloader uses.
+    # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
+    #    the superuser issue. <-- picked
+
+    with engine.begin() as conn:
+        cur = conn.connection.cursor()
+        #cur.execute('SET DATESTYLE TO "SQL , MDY"')
+        try:
+            with open(csv_filepath, 'rb') as f:
+                # can't use :param for table name because params are only
+                # for filter values that are single quoted.
+                try:
+                    cur.copy_expert(
+                        "COPY \"{resource_id}\" ({column_names}) "
+                        "FROM STDIN "
+                        "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
+                        "      ENCODING '{encoding}');"
+                        .format(
+                            resource_id=resource_id,
+                            column_names=', '.join(['"{}"'.format(h)
+                                                    for h in headers]),
+                            delimiter=delimiter,
+                            encoding='UTF8',
+                        ),
+                        f)
+                except psycopg2.DataError as e:
+                    # e is a str but with foreign chars e.g.
+                    # 'extra data: "paul,pa\xc3\xbcl"\n'
+                    # but logging and exceptions need a normal (7 bit) str
+                    error_str = str(e)
+                    logger.warning('{id}: {error_str}'.format(id=resource_id, error_str=error_str))
+                    jobs.write_log('{id}: {error_str}'.format(id=resource_id, error_str=error_str))
+                    raise LoaderError('Error during the load into PostgreSQL:'
+                                        ' {}'.format(error_str))  
+        finally:
+            cur.close()
+
+            
+def split_copy_by_size(input_file, engine, logger,  resource_id, headers, delimiter = ',',  max_size=1024**3):  # 1 Gigabyte
+    """
+    Reads a CSV file, splits it into chunks of maximum size, and writes each chunk
+    to PostgreSQL COPY command to load the data into a table.
+
+    Args:
+        input_file (str): Path to the input CSV file.
+        max_size (int, optional): Maximum size (in bytes) of each output file. Defaults to 1 Gigabyte.
+        tablename (str, optional): Name of the target table in PostgreSQL for the COPY command. Defaults to None.
+        columns (list, optional): List of column names for the COPY command, matching the CSV header. Defaults to None.
+        connection (str, optional): Connection string for the PostgreSQL database. Defaults to an empty string.
+        delimiter (str, optional): Delimiter character used in the CSV file. Defaults to ','.
+    """
+
+    with open(input_file, 'r', encoding='utf-8') as infile:
+        current_file = None
+        output_filename = f'/tmp/output_{resource_id}.csv'
+        header = False
+        for row in infile:
+            if current_file is None or current_file.tell() >= max_size:
+                # Close previous file if necessary
+                if current_file:
+                    logger.info('Before copying file: {}'.format(output_filename))
+                    copy_file(output_filename, engine, logger, resource_id, headers, delimiter)
+                    logger.info('Copied file: {}'.format(output_filename))
+                    current_file.close()
+                    header = True
+
+                current_file = open(output_filename, 'w')
+                if header:
+                    current_file.write(delimiter.join(headers) + '\n')
+            current_file.write(row)
+            
+
+        # Close the last file if open
+        if current_file:
+            current_file.close()
+
+        # Copy the last file
+        copy_file(output_filename, engine, logger, resource_id, headers, delimiter)
+        os.remove(output_filename)
+    if infile:
+        infile.close()        
+
+        
 def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads a CSV into DataStore. Does not create the indexes.'''
 
@@ -289,51 +384,9 @@ def strip_white_space_iter():
 
         logger.info('Copying to database...')
 
-        # Options for loading into postgres:
-        # 1. \copy - can't use as that is a psql meta-command and not accessible
-        #    via psycopg2
-        # 2. COPY - requires the db user to have superuser privileges. This is
-        #    dangerous. It is also not available on AWS, for example.
-        # 3. pgloader method? - as described in its docs:
-        #    Note that while the COPY command is restricted to read either from
-        #    its standard input or from a local file on the server's file system,
-        #    the command line tool psql implements a \copy command that knows
-        #    how to stream a file local to the client over the network and into
-        #    the PostgreSQL server, using the same protocol as pgloader uses.
-        # 4. COPY FROM STDIN - not quite as fast as COPY from a file, but avoids
-        #    the superuser issue. <-- picked
-
-        with engine.begin() as conn:
-            cur = conn.connection.cursor()
-            try:
-                with open(csv_filepath, 'rb') as f:
-                    # can't use :param for table name because params are only
-                    # for filter values that are single quoted.
-                    try:
-                        cur.copy_expert(
-                            "COPY \"{resource_id}\" ({column_names}) "
-                            "FROM STDIN "
-                            "WITH (DELIMITER '{delimiter}', FORMAT csv, HEADER 1, "
-                            "      ENCODING '{encoding}');"
-                            .format(
-                                resource_id=resource_id,
-                                column_names=', '.join(['"{}"'.format(h)
-                                                        for h in headers]),
-                                delimiter=delimiter,
-                                encoding='UTF8',
-                            ),
-                            f)
-                    except psycopg2.DataError as e:
-                        # e is a str but with foreign chars e.g.
-                        # 'extra data: "paul,pa\xc3\xbcl"\n'
-                        # but logging and exceptions need a normal (7 bit) str
-                        error_str = str(e)
-                        logger.warning(error_str)
-                        raise LoaderError('Error during the load into PostgreSQL:'
-                                          ' {}'.format(error_str))
-
-            finally:
-                cur.close()
+        # Copy file to datastore db, split to chunks.
+        max_size = config.get('ckanext.xloader.copy_chunk_size', 1024**3)
+        split_copy_by_size(csv_filepath, engine, logger,  resource_id, headers, delimiter, int(max_size))
     finally:
         os.remove(csv_filepath)  # i.e. the tempfile
 
diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py
@@ -84,6 +84,7 @@ def data(create_with_upload, apikey):
 
 @pytest.mark.usefixtures("clean_db", "with_plugins")
 @pytest.mark.ckan_config("ckanext.xloader.job_timeout", 2)
+@pytest.mark.ckan_config("ckanext.xloader.copy_chunk_size", 5120)
 @pytest.mark.ckan_config("ckan.jobs.timeout", 2)
 class TestXLoaderJobs(helpers.FunctionalRQTestBase):