From 30657c56088e177a3b590b4f40334a81a70de48b Mon Sep 17 00:00:00 2001
From: "Colin (Wilkie) McLellan" <cmclel@amazon.com>
Date: Thu, 25 Jan 2024 15:42:32 +0000
Subject: [PATCH 01/10] Change list of match ids to set for json_handler.py

Increase the speed of the json_handler by migrating from a list to a set. Move from O(n) to O(1)
---
 backend/ecs_tasks/delete_files/json_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/ecs_tasks/delete_files/json_handler.py b/backend/ecs_tasks/delete_files/json_handler.py
index 03f4236d..f84ebe32 100644
--- a/backend/ecs_tasks/delete_files/json_handler.py
+++ b/backend/ecs_tasks/delete_files/json_handler.py
@@ -52,7 +52,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False):
             for column in to_delete:
                 if column["Type"] == "Simple":
                     record = get_value(column["Column"], parsed)
-                    if record and record in column["MatchIds"]:
+                    if record and record in set(column["MatchIds"]):
                         should_delete = True
                         break
                 else:

From ca23bac7ebebc904b21ebdde3b09f58f9ae79875 Mon Sep 17 00:00:00 2001
From: "Colin (Wilkie) McLellan" <cmclel@amazon.com>
Date: Fri, 26 Jan 2024 11:31:18 +0000
Subject: [PATCH 02/10] Update CHANGELOG.md

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6104c99a..6fc2378a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Change Log
 
+## v0.66
+
+- [#395](https://github.com/awslabs/amazon-s3-find-and-forget/issues/395):
+  Increase the speed of the json_handler by migrating from a list to a set.
+  Move from O(n) to O(1)
+
 ## v0.65
 
 - [#393](https://github.com/awslabs/amazon-s3-find-and-forget/issues/393): Fix

From 05be12930bbde60b9a366227e60240acb16eb612 Mon Sep 17 00:00:00 2001
From: "Colin (Wilkie) McLellan" <cmclel@amazon.com>
Date: Fri, 26 Jan 2024 12:39:53 +0000
Subject: [PATCH 03/10] Bump Version 0.65 -> 0.66

---
 templates/template.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/template.yaml b/templates/template.yaml
index 94e717b3..5e3d3f7b 100644
--- a/templates/template.yaml
+++ b/templates/template.yaml
@@ -1,6 +1,6 @@
 AWSTemplateFormatVersion: "2010-09-09"
 Transform: AWS::Serverless-2016-10-31
-Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.65) (tag:main)
+Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.66) (tag:main)
 
 Parameters:
   AccessControlAllowOriginOverride:
@@ -206,7 +206,7 @@ Conditions:
 Mappings:
   Solution:
     Constants:
-      Version: 'v0.65'
+      Version: 'v0.66'
 
 Resources:
   TempBucket:

From c497147dfc63733c312d59934846d52597f0009d Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Fri, 26 Jan 2024 14:26:30 +0000
Subject: [PATCH 04/10] Include optimisation for composite json matches

---
 backend/ecs_tasks/delete_files/json_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/ecs_tasks/delete_files/json_handler.py b/backend/ecs_tasks/delete_files/json_handler.py
index f84ebe32..188d8483 100644
--- a/backend/ecs_tasks/delete_files/json_handler.py
+++ b/backend/ecs_tasks/delete_files/json_handler.py
@@ -61,7 +61,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False):
                         record = get_value(col, parsed)
                         if record:
                             matched.append(record)
-                    if matched in column["MatchIds"]:
+                    if tuple(matched) in set(map(tuple, column["MatchIds"])):
                         should_delete = True
                         break
             if should_delete:

From f6defcbcf91b5442e787e861c6e31ff92e4f36c6 Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Tue, 30 Jan 2024 18:35:50 +0000
Subject: [PATCH 05/10] Improve JSON performance and include filesize in the
 logs

---
 CHANGELOG.md                                  |  9 ++-
 .../ecs_tasks/delete_files/json_handler.py    |  4 +-
 backend/ecs_tasks/delete_files/main.py        | 18 ++++--
 .../ecs_tasks/delete_files/parquet_handler.py | 16 ++---
 tests/unit/ecs_tasks/test_json.py             | 46 +++++++++-----
 tests/unit/ecs_tasks/test_main.py             | 46 +++++++++-----
 tests/unit/ecs_tasks/test_parquet.py          | 60 +++++++++++++------
 7 files changed, 134 insertions(+), 65 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6fc2378a..6f78653d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,15 @@
 # Change Log
 
+## v0.67
+
+- [#396](https://github.com/awslabs/amazon-s3-find-and-forget/issues/396):
+  Performance increase for JSON processing and log object size
+
 ## v0.66
 
 - [#395](https://github.com/awslabs/amazon-s3-find-and-forget/issues/395):
-  Increase the speed of the json_handler by migrating from a list to a set.
-  Move from O(n) to O(1)
+  Increase the speed of the json_handler by migrating from a list to a set. Move
+  from O(n) to O(1)
 
 ## v0.65
 
diff --git a/backend/ecs_tasks/delete_files/json_handler.py b/backend/ecs_tasks/delete_files/json_handler.py
index 188d8483..32557de0 100644
--- a/backend/ecs_tasks/delete_files/json_handler.py
+++ b/backend/ecs_tasks/delete_files/json_handler.py
@@ -52,7 +52,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False):
             for column in to_delete:
                 if column["Type"] == "Simple":
                     record = get_value(column["Column"], parsed)
-                    if record and record in set(column["MatchIds"]):
+                    if record and record in column["MatchIds"]:
                         should_delete = True
                         break
                 else:
@@ -61,7 +61,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False):
                         record = get_value(col, parsed)
                         if record:
                             matched.append(record)
-                    if tuple(matched) in set(map(tuple, column["MatchIds"])):
+                    if tuple(matched) in column["MatchIds"]:
                         should_delete = True
                         break
             if should_delete:
diff --git a/backend/ecs_tasks/delete_files/main.py b/backend/ecs_tasks/delete_files/main.py
index 15b2cb28..558b8700 100644
--- a/backend/ecs_tasks/delete_files/main.py
+++ b/backend/ecs_tasks/delete_files/main.py
@@ -101,17 +101,17 @@ def build_matches(cols, manifest_object):
     Input example:
     [{"Column":"customer_id", "Type":"Simple"}]
     Output example:
-    [{"Column":"customer_id", "Type":"Simple", "MatchIds":[123, 234]}]
+    [{"Column":"customer_id", "Type":"Simple", "MatchIds": {123, 234}}]
     """
     COMPOSITE_MATCH_TOKEN = "_S3F2COMP_"
     manifest = fetch_manifest(manifest_object)
     matches = {}
     for line in json_lines_iterator(manifest):
         if not line["QueryableColumns"] in matches:
-            matches[line["QueryableColumns"]] = []
+            matches[line["QueryableColumns"]] = set()
         is_simple = len(line["Columns"]) == 1
-        match = line["MatchId"][0] if is_simple else line["MatchId"]
-        matches[line["QueryableColumns"]].append(match)
+        match = line["MatchId"][0] if is_simple else tuple(line["MatchId"])
+        matches[line["QueryableColumns"]].add(match)
     return list(
         map(
             lambda c: {
@@ -158,8 +158,14 @@ def execute(queue_url, message_body, receipt_handle):
             "{}/{}".format(input_bucket, input_key),
             buffer_size=FIVE_MB,
         ) as f:
-            source_version = f.metadata()["VersionId"].decode("utf-8")
-            logger.info("Using object version %s as source", source_version)
+            source_metadata = f.metadata()
+            source_version = source_metadata["VersionId"].decode("utf-8")
+            source_size = source_metadata["Content-Length"].decode("utf-8")
+            logger.info(
+                "Download Complete. Using object version %s as source (object size: %s)",
+                source_version,
+                source_size,
+            )
             # Write new file in-memory
             compressed = object_path.endswith(".gz")
             object_info, _ = get_object_info(
diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py
index 99adfd82..1e36ea93 100644
--- a/backend/ecs_tasks/delete_files/parquet_handler.py
+++ b/backend/ecs_tasks/delete_files/parquet_handler.py
@@ -41,7 +41,6 @@ def get_row_indexes_to_delete_for_composite(table, identifiers, to_delete):
     """
     indexes = []
     data = {}
-    to_delete_set = set(map(tuple, to_delete))
     for identifier in identifiers:
         column_first_level = identifier.split(".")[0].lower()
         if not column_first_level in data:
@@ -60,7 +59,7 @@ def get_row_indexes_to_delete_for_composite(table, identifiers, to_delete):
                 )
                 current = current[next_segment]
             values_array.append(current)
-        indexes.append(tuple(values_array) in to_delete_set)
+        indexes.append(tuple(values_array) in to_delete)
     return np.array(indexes)
 
 
@@ -71,7 +70,6 @@ def get_row_indexes_to_delete(table, identifier, to_delete):
     can be simple like "customer_id" or complex like "user.info.id"
     """
     indexes = []
-    to_delete_set = set(to_delete)
     segments = identifier.split(".")
     column_identifier = case_insensitive_getter(table.column_names, segments[0])
     for obj in table.column(column_identifier).to_pylist():
@@ -79,7 +77,7 @@ def get_row_indexes_to_delete(table, identifier, to_delete):
         for i in range(1, len(segments)):
             next_segment = case_insensitive_getter(list(current.keys()), segments[i])
             current = current[next_segment]
-        indexes.append(current in to_delete_set)
+        indexes.append(current in to_delete)
     return np.array(indexes)
 
 
@@ -114,12 +112,16 @@ def cast_column_values(column, schema):
     """
     if column["Type"] == "Simple":
         if is_column_type_decimal(schema, column["Column"]):
-            column["MatchIds"] = [Decimal(m) for m in column["MatchIds"]]
+            column["MatchIds"] = set([Decimal(m) for m in column["MatchIds"]])
     else:
         for i in range(0, len(column["Columns"])):
             if is_column_type_decimal(schema, column["Columns"][i]):
-                for composite_match in column["MatchIds"]:
-                    composite_match[i] = Decimal(composite_match[i])
+                columns_copy = set()
+                for composite_match_tuple in column["MatchIds"]:
+                    match_array = [x for x in composite_match_tuple]
+                    match_array[i] = Decimal(match_array[i])
+                    columns_copy.add(tuple(match_array))
+                column["MatchIds"] = columns_copy
     return column
 
 
diff --git a/tests/unit/ecs_tasks/test_json.py b/tests/unit/ecs_tasks/test_json.py
index d47488ac..a9b819de 100644
--- a/tests/unit/ecs_tasks/test_json.py
+++ b/tests/unit/ecs_tasks/test_json.py
@@ -13,7 +13,9 @@
 
 def test_it_generates_new_json_file_without_matches():
     # Arrange
-    to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [
+        {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
+    ]
     data = (
         '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
         '{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n'
@@ -32,7 +34,9 @@ def test_it_generates_new_json_file_without_matches():
 
 def test_it_handles_json_with_gzip_compression():
     # Arrange
-    to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [
+        {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
+    ]
     data = (
         '{"customer_id": "12345", "x": 7, "d":"2001-01-01"}\n'
         '{"customer_id": "23456", "x": 8, "d":"2001-01-03"}\n'
@@ -51,7 +55,9 @@ def test_it_handles_json_with_gzip_compression():
 
 def test_delete_correct_rows_when_missing_newline_at_the_end():
     # Arrange
-    to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [
+        {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
+    ]
     data = (
         '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
         '{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n'
@@ -71,7 +77,9 @@ def test_delete_correct_rows_when_missing_newline_at_the_end():
 def test_delete_correct_rows_containing_newlines_as_content():
     # UNICODE_NEWLINE_SEP = '\u2028'
     # Arrange
-    to_delete = [{"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}]
+    to_delete = [
+        {"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"}
+    ]
     data = (
         '{"customer_id": "12345", "d": "foo"}\n'
         '{"customer_id": "23456", "d": "foo\u2028\\nbar"}\n'
@@ -90,7 +98,7 @@ def test_delete_correct_rows_containing_newlines_as_content():
 
 def test_delete_correct_rows_from_json_file_with_complex_types():
     # Arrange
-    to_delete = [{"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [{"Column": "user.id", "MatchIds": set(["23456"]), "Type": "Simple"}]
     data = (
         '{"user": {"id": "12345", "name": "John"}, "d":["2001-01-01"]}\n'
         '{"user": {"id": "23456", "name": "Jane"}, "d":[]}\n'
@@ -112,7 +120,9 @@ def test_delete_correct_rows_from_json_file_with_composite_types_tuple_col():
     to_delete = [
         {
             "Columns": ["first_name", "last_name"],
-            "MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Mary", "Doe"]],
+            "MatchIds": set(
+                [tuple(["John", "Doe"]), tuple(["Jane", "Doe"]), tuple(["Mary", "Doe"])]
+            ),
             "Type": "Composite",
         }
     ]
@@ -136,7 +146,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_single_col():
     to_delete = [
         {
             "Columns": ["last_name"],
-            "MatchIds": [["Doe"]],
+            "MatchIds": set([tuple(["Doe"])]),
             "Type": "Composite",
         }
     ]
@@ -160,7 +170,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_with_nullable_o
     to_delete = [
         {
             "Columns": ["user.name", "parents.mother"],
-            "MatchIds": [["John", "23456"]],
+            "MatchIds": set([tuple(["John", "23456"])]),
             "Type": "Composite",
         }
     ]
@@ -189,7 +199,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_multiple_types(
     to_delete = [
         {
             "Columns": ["age", "last_name"],
-            "MatchIds": [[12, "Doe"]],
+            "MatchIds": set([tuple([12, "Doe"])]),
             "Type": "Composite",
         }
     ]
@@ -212,10 +222,10 @@ def test_delete_correct_rows_from_json_file_with_composite_types_multiple_types(
 def test_delete_correct_rows_from_json_file_with_both_simple_and_composite_types():
     # Arrange
     to_delete = [
-        {"Column": "customer_id", "MatchIds": [12345], "Type": "Simple"},
+        {"Column": "customer_id", "MatchIds": set([12345]), "Type": "Simple"},
         {
             "Columns": ["first_name", "last_name"],
-            "MatchIds": [["Jane", "Doe"]],
+            "MatchIds": set([tuple(["Jane", "Doe"])]),
             "Type": "Composite",
         },
     ]
@@ -236,7 +246,9 @@ def test_delete_correct_rows_from_json_file_with_both_simple_and_composite_types
 
 def test_delete_correct_rows_from_json_file_with_nullable_or_undefined_identifiers():
     # Arrange
-    to_delete = [{"Column": "parents.mother", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [
+        {"Column": "parents.mother", "MatchIds": set(["23456"]), "Type": "Simple"}
+    ]
     data = (
         '{"user": {"id": "12345", "name": "John"}, "parents": {"mother": "23456"}}\n'
         '{"user": {"id": "23456", "name": "Jane"}, "parents": {"mother": null}}\n'
@@ -259,7 +271,7 @@ def test_delete_correct_rows_from_json_file_with_nullable_or_undefined_identifie
 
 def test_delete_correct_rows_from_json_file_with_lower_cased_column_id():
     # Arrange
-    to_delete = [{"Column": "userid", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [{"Column": "userid", "MatchIds": set(["23456"]), "Type": "Simple"}]
     data = (
         '{"userId": "12345", "fullName": "JohnDoe"}\n'
         '{"userId": "23456", "fullName": "JaneDoe"}\n'
@@ -279,8 +291,8 @@ def test_delete_correct_rows_from_json_file_with_lower_cased_column_id():
 def test_delete_correct_rows_from_json_file_with_multiple_identifiers():
     # Arrange
     to_delete = [
-        {"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"},
-        {"Column": "mother", "MatchIds": ["23456"], "Type": "Simple"},
+        {"Column": "user.id", "MatchIds": set(["23456"]), "Type": "Simple"},
+        {"Column": "mother", "MatchIds": set(["23456"]), "Type": "Simple"},
     ]
     data = (
         '{"user": {"id": "12345", "name": "John"}, "mother": "23456"}\n'
@@ -297,7 +309,9 @@ def test_delete_correct_rows_from_json_file_with_multiple_identifiers():
 
 def test_it_throws_meaningful_error_for_serialization_issues():
     # Arrange
-    to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}]
+    to_delete = [
+        {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}
+    ]
     data = (
         '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n'
         '{"customer_id": "23456", "x": 2.3, "d":"invalid\n'
diff --git a/tests/unit/ecs_tasks/test_main.py b/tests/unit/ecs_tasks/test_main.py
index 52c29f98..d976ec10 100644
--- a/tests/unit/ecs_tasks/test_main.py
+++ b/tests/unit/ecs_tasks/test_main.py
@@ -74,11 +74,14 @@ def test_happy_path_when_queue_not_empty(
     mock_verify_integrity,
     message_stub,
 ):
-    column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]}
+    column = {"Column": "customer_id", "MatchIds": set(["12345", "23456"])}
     mock_build_matches.return_value = [column]
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": {}}, None
     mock_save.return_value = "new_version123"
@@ -129,11 +132,14 @@ def test_happy_path_when_queue_not_empty_for_compressed_json(
     mock_verify_integrity,
     message_stub,
 ):
-    column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]}
+    column = {"Column": "customer_id", "MatchIds": set(["12345", "23456"])}
     mock_build_matches.return_value = [column]
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": {}}, None
     mock_save.return_value = "new_version123"
@@ -190,12 +196,15 @@ def test_cse_kms_encrypted(
     mock_verify_integrity,
     message_stub,
 ):
-    column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]}
+    column = {"Column": "customer_id", "MatchIds": set(["12345", "23456"])}
     metadata = {"x-amz-wrap-alg": "kms", "x-amz-key-v2": "key123"}
     mock_build_matches.return_value = [column]
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": metadata}, None
     mock_save.return_value = "new_version123"
@@ -300,7 +309,10 @@ def test_it_removes_old_versions(
 ):
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": {}}, None
     mock_save.return_value = "new_version123"
@@ -348,7 +360,10 @@ def test_it_handles_old_version_delete_failures(
 ):
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": {}}, None
     mock_save.return_value = "new_version123"
@@ -394,7 +409,10 @@ def test_it_handles_no_deletions(
 ):
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": {}}, None
     mock_delete.return_value = pa.BufferOutputStream(), {"DeletedRows": 0}
@@ -1130,7 +1148,7 @@ def test_it_builds_matches_grouped_by_column_simple(mock_fetch):
 
     matches = build_matches(cols, "s3://path-to-manifest.json")
     assert matches == [
-        {"Column": "customer_id", "MatchIds": ["12345", "23456"]},
+        {"Column": "customer_id", "MatchIds": set(["12345", "23456"])},
     ]
 
 
@@ -1148,7 +1166,7 @@ def test_it_builds_matches_grouped_by_column_composite(mock_fetch):
     assert matches == [
         {
             "Columns": ["first_name", "last_name"],
-            "MatchIds": [["john", "doe"], ["jane", "doe"]],
+            "MatchIds": set([tuple(["john", "doe"]), tuple(["jane", "doe"])]),
         },
     ]
 
@@ -1174,8 +1192,8 @@ def test_it_builds_matches_grouped_by_column_mixed(mock_fetch):
     assert matches == [
         {
             "Columns": ["first_name", "last_name"],
-            "MatchIds": [["john", "doe"], ["jane", "doe"]],
+            "MatchIds": set([tuple(["john", "doe"]), tuple(["jane", "doe"])]),
         },
-        {"Column": "first_name", "MatchIds": ["smith"]},
-        {"Column": "last_name", "MatchIds": ["smith", "parker"]},
+        {"Column": "first_name", "MatchIds": set(["smith"])},
+        {"Column": "last_name", "MatchIds": set(["smith", "parker"])},
     ]
diff --git a/tests/unit/ecs_tasks/test_parquet.py b/tests/unit/ecs_tasks/test_parquet.py
index 8c4f6834..d2f23b72 100644
--- a/tests/unit/ecs_tasks/test_parquet.py
+++ b/tests/unit/ecs_tasks/test_parquet.py
@@ -23,7 +23,7 @@ def test_it_generates_new_parquet_file_without_matches(mock_delete, mock_load_pa
     # Arrange
     column = {
         "Column": "customer_id",
-        "MatchIds": ["12345", "23456"],
+        "MatchIds": set(["12345", "23456"]),
         "Type": "Simple",
     }
     data = [{"customer_id": "12345"}, {"customer_id": "34567"}]
@@ -53,7 +53,7 @@ def test_it_handles_files_with_multiple_row_groups_and_pandas_indexes(
         {"customer_id": "12345"},
         {"customer_id": "34567"},
     ]
-    columns = [{"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}]
+    columns = [{"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"}]
     df = pd.DataFrame(data, list("ab"))
     table = pa.Table.from_pandas(df)
     buf = BytesIO()
@@ -81,7 +81,7 @@ def test_delete_correct_rows_from_table():
         {"customer_id": "34567"},
     ]
     columns = [
-        {"Column": "customer_id", "MatchIds": ["12345", "23456"], "Type": "Simple"}
+        {"Column": "customer_id", "MatchIds": set(["12345", "23456"]), "Type": "Simple"}
     ]
     df = pd.DataFrame(data)
     table = pa.Table.from_pandas(df)
@@ -97,8 +97,8 @@ def test_delete_handles_multiple_columns_with_no_rows_left():
         {"customer_id": "12345", "other_customer_id": "23456"},
     ]
     columns = [
-        {"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"},
-        {"Column": "other_customer_id", "MatchIds": ["23456"], "Type": "Simple"},
+        {"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"},
+        {"Column": "other_customer_id", "MatchIds": set(["23456"]), "Type": "Simple"},
     ]
     df = pd.DataFrame(data)
     table = pa.Table.from_pandas(df)
@@ -117,7 +117,7 @@ def test_handles_lower_cased_column_names():
     columns = [
         {
             "Column": "userdata.customerid",
-            "MatchIds": ["12345", "23456"],
+            "MatchIds": set(["12345", "23456"]),
             "Type": "Simple",
         }
     ]
@@ -137,7 +137,7 @@ def test_it_handles_data_with_pandas_indexes():
         {"customer_id": "34567"},
     ]
     columns = [
-        {"Column": "customer_id", "MatchIds": ["12345", "23456"], "Type": "Simple"}
+        {"Column": "customer_id", "MatchIds": set(["12345", "23456"]), "Type": "Simple"}
     ]
     df = pd.DataFrame(data, list("abc"))
     table = pa.Table.from_pandas(df)
@@ -160,7 +160,7 @@ def test_delete_correct_rows_from_parquet_table_with_complex_types():
     columns = [
         {
             "Column": "user_info.personal_information.name",
-            "MatchIds": ["matteo", "chris"],
+            "MatchIds": set(["matteo", "chris"]),
             "Type": "Simple",
         }
     ]
@@ -186,7 +186,13 @@ def test_delete_correct_rows_from_parquet_table_with_composite_types_tuple_col()
     columns = [
         {
             "Columns": ["first_name", "last_name"],
-            "MatchIds": [["john", "doe"], ["jane", "doe"], ["matteo", "doe"]],
+            "MatchIds": set(
+                [
+                    tuple(["john", "doe"]),
+                    tuple(["jane", "doe"]),
+                    tuple(["matteo", "doe"]),
+                ]
+            ),
             "Type": "Composite",
         }
     ]
@@ -205,7 +211,13 @@ def test_delete_correct_rows_from_parquet_table_with_composite_types_single_col(
         "first_name": ["john", "jane", "matteo"],
         "last_name": ["doe", "doe", "hey"],
     }
-    columns = [{"Columns": ["last_name"], "MatchIds": [["doe"]], "Type": "Composite"}]
+    columns = [
+        {
+            "Columns": ["last_name"],
+            "MatchIds": set([tuple(["doe"])]),
+            "Type": "Composite",
+        }
+    ]
     df = pd.DataFrame(data)
     table = pa.Table.from_pandas(df)
     table, deleted_rows = delete_from_table(table, columns)
@@ -225,7 +237,7 @@ def test_delete_correct_rows_from_parquet_table_with_composite_types_multiple_ty
     columns = [
         {
             "Columns": ["age", "last_name"],
-            "MatchIds": [[12, "doe"]],
+            "MatchIds": set([tuple([12, "doe"])]),
             "Type": "Composite",
         }
     ]
@@ -251,7 +263,13 @@ def test_delete_correct_rows_from_parquet_table_with_complex_composite_types():
     columns = [
         {
             "Columns": ["details.first_name", "details.last_name"],
-            "MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Matteo", "Doe"]],
+            "MatchIds": set(
+                [
+                    tuple(["John", "Doe"]),
+                    tuple(["Jane", "Doe"]),
+                    tuple(["Matteo", "Doe"]),
+                ]
+            ),
             "Type": "Composite",
         }
     ]
@@ -271,10 +289,10 @@ def test_delete_correct_rows_from_parquet_table_with_both_simple_and_composite_t
         "last_name": ["doe", "doe", "hey"],
     }
     columns = [
-        {"Column": "customer_id", "MatchIds": [12345], "Type": "Simple"},
+        {"Column": "customer_id", "MatchIds": set([12345]), "Type": "Simple"},
         {
             "Columns": ["first_name", "last_name"],
-            "MatchIds": [["jane", "doe"]],
+            "MatchIds": set([tuple(["jane", "doe"])]),
             "Type": "Composite",
         },
     ]
@@ -309,7 +327,7 @@ def test_delete_correct_rows_from_parquet_table_with_decimal_types():
     columns = [
         {
             "Column": "customer_id_decimal",
-            "MatchIds": ["123.450", "234.560"],
+            "MatchIds": set(["123.450", "234.560"]),
             "Type": "Simple",
         },
     ]
@@ -334,7 +352,7 @@ def test_delete_correct_rows_from_parquet_table_with_decimal_complex_types():
     columns = [
         {
             "Column": "user_info.personal_information.decimal",
-            "MatchIds": ["12.34", "34.56"],
+            "MatchIds": set(["12.34", "34.56"]),
             "Type": "Simple",
         }
     ]
@@ -366,7 +384,13 @@ def test_delete_correct_rows_from_parquet_table_with_decimal_complex_composite_t
                 "user_info.personal_information.name",
                 "user_info.personal_information.decimal",
             ],
-            "MatchIds": [["matteo", "12.34"], ["chris", "34.56"], ["nick", "11.22"]],
+            "MatchIds": set(
+                [
+                    tuple(["matteo", "12.34"]),
+                    tuple(["chris", "34.56"]),
+                    tuple(["nick", "11.22"]),
+                ]
+            ),
             "Type": "Composite",
         }
     ]
@@ -389,7 +413,7 @@ def test_it_throws_for_invalid_schema_column_not_found():
         columns = [
             {
                 "Column": "user_info.personal_information.name",
-                "MatchIds": ["matteo"],
+                "MatchIds": set(["matteo"]),
                 "Type": "Simple",
             }
         ]

From 2a4f0b7b859fd299d61342a2569693d1ecfe1582 Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Tue, 30 Jan 2024 18:38:47 +0000
Subject: [PATCH 06/10] Bump version

---
 templates/template.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/template.yaml b/templates/template.yaml
index 5e3d3f7b..553935b3 100644
--- a/templates/template.yaml
+++ b/templates/template.yaml
@@ -1,6 +1,6 @@
 AWSTemplateFormatVersion: "2010-09-09"
 Transform: AWS::Serverless-2016-10-31
-Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.66) (tag:main)
+Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.67) (tag:main)
 
 Parameters:
   AccessControlAllowOriginOverride:
@@ -206,7 +206,7 @@ Conditions:
 Mappings:
   Solution:
     Constants:
-      Version: 'v0.66'
+      Version: 'v0.67'
 
 Resources:
   TempBucket:

From a7d63cf111c9cc5e4da0802f8d985fe531f41948 Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Tue, 30 Jan 2024 18:48:13 +0000
Subject: [PATCH 07/10] Cleanup test

---
 tests/unit/ecs_tasks/test_main.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/unit/ecs_tasks/test_main.py b/tests/unit/ecs_tasks/test_main.py
index d976ec10..919eb956 100644
--- a/tests/unit/ecs_tasks/test_main.py
+++ b/tests/unit/ecs_tasks/test_main.py
@@ -265,7 +265,10 @@ def test_it_assumes_role(
 ):
     mock_fs.S3FileSystem.return_value = mock_fs
     mock_file = MagicMock()
-    mock_file.metadata.return_value = {"VersionId": b"abc123"}
+    mock_file.metadata.return_value = {
+        "VersionId": b"abc123",
+        "Content-Length": b"5558",
+    }
     mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file
     mock_get_object_info.return_value = {"Metadata": {}}, None
     mock_delete.return_value = pa.BufferOutputStream(), {"DeletedRows": 1}

From 26e0d217ac257e1f8e8b50890cdfe6e654853d7b Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Thu, 15 Feb 2024 13:05:24 +0000
Subject: [PATCH 08/10] Update
 backend/ecs_tasks/delete_files/parquet_handler.py

Co-authored-by: Chris Deigan <ctd@users.noreply.github.com>
---
 backend/ecs_tasks/delete_files/parquet_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py
index 1e36ea93..f762b9de 100644
--- a/backend/ecs_tasks/delete_files/parquet_handler.py
+++ b/backend/ecs_tasks/delete_files/parquet_handler.py
@@ -112,7 +112,7 @@ def cast_column_values(column, schema):
     """
     if column["Type"] == "Simple":
         if is_column_type_decimal(schema, column["Column"]):
-            column["MatchIds"] = set([Decimal(m) for m in column["MatchIds"]])
+            column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"])
     else:
         for i in range(0, len(column["Columns"])):
             if is_column_type_decimal(schema, column["Columns"][i]):

From ff3bd0ccdb2511e75408abd90c0cb4f81fd9652b Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Thu, 15 Feb 2024 16:35:24 +0000
Subject: [PATCH 09/10] Don't copy the columns multiple time for multiple
 Decimal identifiers

---
 .../ecs_tasks/delete_files/parquet_handler.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py
index f762b9de..873a8774 100644
--- a/backend/ecs_tasks/delete_files/parquet_handler.py
+++ b/backend/ecs_tasks/delete_files/parquet_handler.py
@@ -114,14 +114,19 @@ def cast_column_values(column, schema):
         if is_column_type_decimal(schema, column["Column"]):
             column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"])
     else:
-        for i in range(0, len(column["Columns"])):
-            if is_column_type_decimal(schema, column["Columns"][i]):
-                columns_copy = set()
-                for composite_match_tuple in column["MatchIds"]:
-                    match_array = [x for x in composite_match_tuple]
+        decimal_columns = [
+            i
+            for i in range(0, len(column["Columns"]))
+            if is_column_type_decimal(schema, column["Columns"][i])
+        ]
+        if len(decimal_columns):
+            columns_copy = set()
+            for composite_match_tuple in column["MatchIds"]:
+                match_array = [x for x in composite_match_tuple]
+                for i in decimal_columns:
                     match_array[i] = Decimal(match_array[i])
-                    columns_copy.add(tuple(match_array))
-                column["MatchIds"] = columns_copy
+                columns_copy.add(tuple(match_array))
+            column["MatchIds"] = columns_copy
     return column
 
 

From ec1e6623bba05bbd4b6c694f83597c974c95757f Mon Sep 17 00:00:00 2001
From: Matteo Figus <matteofigus@gmail.com>
Date: Fri, 16 Feb 2024 12:27:50 +0000
Subject: [PATCH 10/10] Further improvements

---
 .../ecs_tasks/delete_files/parquet_handler.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py
index 873a8774..0837a16a 100644
--- a/backend/ecs_tasks/delete_files/parquet_handler.py
+++ b/backend/ecs_tasks/delete_files/parquet_handler.py
@@ -114,19 +114,19 @@ def cast_column_values(column, schema):
         if is_column_type_decimal(schema, column["Column"]):
             column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"])
     else:
-        decimal_columns = [
+        decimal_columns = set(
             i
-            for i in range(0, len(column["Columns"]))
-            if is_column_type_decimal(schema, column["Columns"][i])
-        ]
-        if len(decimal_columns):
-            columns_copy = set()
-            for composite_match_tuple in column["MatchIds"]:
-                match_array = [x for x in composite_match_tuple]
-                for i in decimal_columns:
-                    match_array[i] = Decimal(match_array[i])
-                columns_copy.add(tuple(match_array))
-            column["MatchIds"] = columns_copy
+            for i, col in enumerate(column["Columns"])
+            if is_column_type_decimal(schema, col)
+        )
+        if decimal_columns:
+            column["MatchIds"] = set(
+                tuple(
+                    Decimal(m) if i in decimal_columns else m
+                    for i, m in enumerate(composite_match_tuple)
+                )
+                for composite_match_tuple in column["MatchIds"]
+            )
     return column