From 30657c56088e177a3b590b4f40334a81a70de48b Mon Sep 17 00:00:00 2001 From: "Colin (Wilkie) McLellan" Date: Thu, 25 Jan 2024 15:42:32 +0000 Subject: [PATCH 01/10] Change list of match ids to set for json_handler.py Increase the speed of the json_handler by migrating from a list to a set. Move from O(n) to O(1) --- backend/ecs_tasks/delete_files/json_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/ecs_tasks/delete_files/json_handler.py b/backend/ecs_tasks/delete_files/json_handler.py index 03f4236d..f84ebe32 100644 --- a/backend/ecs_tasks/delete_files/json_handler.py +++ b/backend/ecs_tasks/delete_files/json_handler.py @@ -52,7 +52,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False): for column in to_delete: if column["Type"] == "Simple": record = get_value(column["Column"], parsed) - if record and record in column["MatchIds"]: + if record and record in set(column["MatchIds"]): should_delete = True break else: From ca23bac7ebebc904b21ebdde3b09f58f9ae79875 Mon Sep 17 00:00:00 2001 From: "Colin (Wilkie) McLellan" Date: Fri, 26 Jan 2024 11:31:18 +0000 Subject: [PATCH 02/10] Update CHANGELOG.md --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6104c99a..6fc2378a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## v0.66 + +- [#395](https://github.com/awslabs/amazon-s3-find-and-forget/issues/395): + Increase the speed of the json_handler by migrating from a list to a set. + Move from O(n) to O(1) + ## v0.65 - [#393](https://github.com/awslabs/amazon-s3-find-and-forget/issues/393): Fix From 05be12930bbde60b9a366227e60240acb16eb612 Mon Sep 17 00:00:00 2001 From: "Colin (Wilkie) McLellan" Date: Fri, 26 Jan 2024 12:39:53 +0000 Subject: [PATCH 03/10] Bump Version 0.65 -> 0.66 --- templates/template.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/template.yaml b/templates/template.yaml index 94e717b3..5e3d3f7b 100644 --- a/templates/template.yaml +++ b/templates/template.yaml @@ -1,6 +1,6 @@ AWSTemplateFormatVersion: "2010-09-09" Transform: AWS::Serverless-2016-10-31 -Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.65) (tag:main) +Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.66) (tag:main) Parameters: AccessControlAllowOriginOverride: @@ -206,7 +206,7 @@ Conditions: Mappings: Solution: Constants: - Version: 'v0.65' + Version: 'v0.66' Resources: TempBucket: From c497147dfc63733c312d59934846d52597f0009d Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Fri, 26 Jan 2024 14:26:30 +0000 Subject: [PATCH 04/10] Include optimisation for composite json matches --- backend/ecs_tasks/delete_files/json_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/ecs_tasks/delete_files/json_handler.py b/backend/ecs_tasks/delete_files/json_handler.py index f84ebe32..188d8483 100644 --- a/backend/ecs_tasks/delete_files/json_handler.py +++ b/backend/ecs_tasks/delete_files/json_handler.py @@ -61,7 +61,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False): record = get_value(col, parsed) if record: matched.append(record) - if matched in column["MatchIds"]: + if tuple(matched) in set(map(tuple, column["MatchIds"])): should_delete = True break if should_delete: From f6defcbcf91b5442e787e861c6e31ff92e4f36c6 Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Tue, 30 Jan 2024 18:35:50 +0000 Subject: [PATCH 05/10] Improve JSON performance and include filesize in the logs --- CHANGELOG.md | 9 ++- .../ecs_tasks/delete_files/json_handler.py | 4 +- backend/ecs_tasks/delete_files/main.py | 18 ++++-- .../ecs_tasks/delete_files/parquet_handler.py | 16 ++--- tests/unit/ecs_tasks/test_json.py | 46 +++++++++----- tests/unit/ecs_tasks/test_main.py | 46 +++++++++----- tests/unit/ecs_tasks/test_parquet.py | 60 +++++++++++++------ 7 files changed, 134 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fc2378a..6f78653d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,15 @@ # Change Log +## v0.67 + +- [#396](https://github.com/awslabs/amazon-s3-find-and-forget/issues/396): + Performance increase for JSON processing and log object size + ## v0.66 - [#395](https://github.com/awslabs/amazon-s3-find-and-forget/issues/395): - Increase the speed of the json_handler by migrating from a list to a set. - Move from O(n) to O(1) + Increase the speed of the json_handler by migrating from a list to a set. Move + from O(n) to O(1) ## v0.65 diff --git a/backend/ecs_tasks/delete_files/json_handler.py b/backend/ecs_tasks/delete_files/json_handler.py index 188d8483..32557de0 100644 --- a/backend/ecs_tasks/delete_files/json_handler.py +++ b/backend/ecs_tasks/delete_files/json_handler.py @@ -52,7 +52,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False): for column in to_delete: if column["Type"] == "Simple": record = get_value(column["Column"], parsed) - if record and record in set(column["MatchIds"]): + if record and record in column["MatchIds"]: should_delete = True break else: @@ -61,7 +61,7 @@ def delete_matches_from_json_file(input_file, to_delete, compressed=False): record = get_value(col, parsed) if record: matched.append(record) - if tuple(matched) in set(map(tuple, column["MatchIds"])): + if tuple(matched) in column["MatchIds"]: should_delete = True break if should_delete: diff --git a/backend/ecs_tasks/delete_files/main.py b/backend/ecs_tasks/delete_files/main.py index 15b2cb28..558b8700 100644 --- a/backend/ecs_tasks/delete_files/main.py +++ b/backend/ecs_tasks/delete_files/main.py @@ -101,17 +101,17 @@ def build_matches(cols, manifest_object): Input example: [{"Column":"customer_id", "Type":"Simple"}] Output example: - [{"Column":"customer_id", "Type":"Simple", "MatchIds":[123, 234]}] + [{"Column":"customer_id", "Type":"Simple", "MatchIds": {123, 234}}] """ COMPOSITE_MATCH_TOKEN = "_S3F2COMP_" manifest = fetch_manifest(manifest_object) matches = {} for line in json_lines_iterator(manifest): if not line["QueryableColumns"] in matches: - matches[line["QueryableColumns"]] = [] + matches[line["QueryableColumns"]] = set() is_simple = len(line["Columns"]) == 1 - match = line["MatchId"][0] if is_simple else line["MatchId"] - matches[line["QueryableColumns"]].append(match) + match = line["MatchId"][0] if is_simple else tuple(line["MatchId"]) + matches[line["QueryableColumns"]].add(match) return list( map( lambda c: { @@ -158,8 +158,14 @@ def execute(queue_url, message_body, receipt_handle): "{}/{}".format(input_bucket, input_key), buffer_size=FIVE_MB, ) as f: - source_version = f.metadata()["VersionId"].decode("utf-8") - logger.info("Using object version %s as source", source_version) + source_metadata = f.metadata() + source_version = source_metadata["VersionId"].decode("utf-8") + source_size = source_metadata["Content-Length"].decode("utf-8") + logger.info( + "Download Complete. Using object version %s as source (object size: %s)", + source_version, + source_size, + ) # Write new file in-memory compressed = object_path.endswith(".gz") object_info, _ = get_object_info( diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py index 99adfd82..1e36ea93 100644 --- a/backend/ecs_tasks/delete_files/parquet_handler.py +++ b/backend/ecs_tasks/delete_files/parquet_handler.py @@ -41,7 +41,6 @@ def get_row_indexes_to_delete_for_composite(table, identifiers, to_delete): """ indexes = [] data = {} - to_delete_set = set(map(tuple, to_delete)) for identifier in identifiers: column_first_level = identifier.split(".")[0].lower() if not column_first_level in data: @@ -60,7 +59,7 @@ def get_row_indexes_to_delete_for_composite(table, identifiers, to_delete): ) current = current[next_segment] values_array.append(current) - indexes.append(tuple(values_array) in to_delete_set) + indexes.append(tuple(values_array) in to_delete) return np.array(indexes) @@ -71,7 +70,6 @@ def get_row_indexes_to_delete(table, identifier, to_delete): can be simple like "customer_id" or complex like "user.info.id" """ indexes = [] - to_delete_set = set(to_delete) segments = identifier.split(".") column_identifier = case_insensitive_getter(table.column_names, segments[0]) for obj in table.column(column_identifier).to_pylist(): @@ -79,7 +77,7 @@ def get_row_indexes_to_delete(table, identifier, to_delete): for i in range(1, len(segments)): next_segment = case_insensitive_getter(list(current.keys()), segments[i]) current = current[next_segment] - indexes.append(current in to_delete_set) + indexes.append(current in to_delete) return np.array(indexes) @@ -114,12 +112,16 @@ def cast_column_values(column, schema): """ if column["Type"] == "Simple": if is_column_type_decimal(schema, column["Column"]): - column["MatchIds"] = [Decimal(m) for m in column["MatchIds"]] + column["MatchIds"] = set([Decimal(m) for m in column["MatchIds"]]) else: for i in range(0, len(column["Columns"])): if is_column_type_decimal(schema, column["Columns"][i]): - for composite_match in column["MatchIds"]: - composite_match[i] = Decimal(composite_match[i]) + columns_copy = set() + for composite_match_tuple in column["MatchIds"]: + match_array = [x for x in composite_match_tuple] + match_array[i] = Decimal(match_array[i]) + columns_copy.add(tuple(match_array)) + column["MatchIds"] = columns_copy return column diff --git a/tests/unit/ecs_tasks/test_json.py b/tests/unit/ecs_tasks/test_json.py index d47488ac..a9b819de 100644 --- a/tests/unit/ecs_tasks/test_json.py +++ b/tests/unit/ecs_tasks/test_json.py @@ -13,7 +13,9 @@ def test_it_generates_new_json_file_without_matches(): # Arrange - to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [ + {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"} + ] data = ( '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n' '{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n' @@ -32,7 +34,9 @@ def test_it_generates_new_json_file_without_matches(): def test_it_handles_json_with_gzip_compression(): # Arrange - to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [ + {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"} + ] data = ( '{"customer_id": "12345", "x": 7, "d":"2001-01-01"}\n' '{"customer_id": "23456", "x": 8, "d":"2001-01-03"}\n' @@ -51,7 +55,9 @@ def test_it_handles_json_with_gzip_compression(): def test_delete_correct_rows_when_missing_newline_at_the_end(): # Arrange - to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [ + {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"} + ] data = ( '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n' '{"customer_id": "23456", "x": 2.3, "d":"2001-01-03"}\n' @@ -71,7 +77,9 @@ def test_delete_correct_rows_when_missing_newline_at_the_end(): def test_delete_correct_rows_containing_newlines_as_content(): # UNICODE_NEWLINE_SEP = '\u2028' # Arrange - to_delete = [{"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}] + to_delete = [ + {"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"} + ] data = ( '{"customer_id": "12345", "d": "foo"}\n' '{"customer_id": "23456", "d": "foo\u2028\\nbar"}\n' @@ -90,7 +98,7 @@ def test_delete_correct_rows_containing_newlines_as_content(): def test_delete_correct_rows_from_json_file_with_complex_types(): # Arrange - to_delete = [{"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [{"Column": "user.id", "MatchIds": set(["23456"]), "Type": "Simple"}] data = ( '{"user": {"id": "12345", "name": "John"}, "d":["2001-01-01"]}\n' '{"user": {"id": "23456", "name": "Jane"}, "d":[]}\n' @@ -112,7 +120,9 @@ def test_delete_correct_rows_from_json_file_with_composite_types_tuple_col(): to_delete = [ { "Columns": ["first_name", "last_name"], - "MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Mary", "Doe"]], + "MatchIds": set( + [tuple(["John", "Doe"]), tuple(["Jane", "Doe"]), tuple(["Mary", "Doe"])] + ), "Type": "Composite", } ] @@ -136,7 +146,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_single_col(): to_delete = [ { "Columns": ["last_name"], - "MatchIds": [["Doe"]], + "MatchIds": set([tuple(["Doe"])]), "Type": "Composite", } ] @@ -160,7 +170,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_with_nullable_o to_delete = [ { "Columns": ["user.name", "parents.mother"], - "MatchIds": [["John", "23456"]], + "MatchIds": set([tuple(["John", "23456"])]), "Type": "Composite", } ] @@ -189,7 +199,7 @@ def test_delete_correct_rows_from_json_file_with_composite_types_multiple_types( to_delete = [ { "Columns": ["age", "last_name"], - "MatchIds": [[12, "Doe"]], + "MatchIds": set([tuple([12, "Doe"])]), "Type": "Composite", } ] @@ -212,10 +222,10 @@ def test_delete_correct_rows_from_json_file_with_composite_types_multiple_types( def test_delete_correct_rows_from_json_file_with_both_simple_and_composite_types(): # Arrange to_delete = [ - {"Column": "customer_id", "MatchIds": [12345], "Type": "Simple"}, + {"Column": "customer_id", "MatchIds": set([12345]), "Type": "Simple"}, { "Columns": ["first_name", "last_name"], - "MatchIds": [["Jane", "Doe"]], + "MatchIds": set([tuple(["Jane", "Doe"])]), "Type": "Composite", }, ] @@ -236,7 +246,9 @@ def test_delete_correct_rows_from_json_file_with_both_simple_and_composite_types def test_delete_correct_rows_from_json_file_with_nullable_or_undefined_identifiers(): # Arrange - to_delete = [{"Column": "parents.mother", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [ + {"Column": "parents.mother", "MatchIds": set(["23456"]), "Type": "Simple"} + ] data = ( '{"user": {"id": "12345", "name": "John"}, "parents": {"mother": "23456"}}\n' '{"user": {"id": "23456", "name": "Jane"}, "parents": {"mother": null}}\n' @@ -259,7 +271,7 @@ def test_delete_correct_rows_from_json_file_with_nullable_or_undefined_identifie def test_delete_correct_rows_from_json_file_with_lower_cased_column_id(): # Arrange - to_delete = [{"Column": "userid", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [{"Column": "userid", "MatchIds": set(["23456"]), "Type": "Simple"}] data = ( '{"userId": "12345", "fullName": "JohnDoe"}\n' '{"userId": "23456", "fullName": "JaneDoe"}\n' @@ -279,8 +291,8 @@ def test_delete_correct_rows_from_json_file_with_lower_cased_column_id(): def test_delete_correct_rows_from_json_file_with_multiple_identifiers(): # Arrange to_delete = [ - {"Column": "user.id", "MatchIds": ["23456"], "Type": "Simple"}, - {"Column": "mother", "MatchIds": ["23456"], "Type": "Simple"}, + {"Column": "user.id", "MatchIds": set(["23456"]), "Type": "Simple"}, + {"Column": "mother", "MatchIds": set(["23456"]), "Type": "Simple"}, ] data = ( '{"user": {"id": "12345", "name": "John"}, "mother": "23456"}\n' @@ -297,7 +309,9 @@ def test_delete_correct_rows_from_json_file_with_multiple_identifiers(): def test_it_throws_meaningful_error_for_serialization_issues(): # Arrange - to_delete = [{"Column": "customer_id", "MatchIds": ["23456"], "Type": "Simple"}] + to_delete = [ + {"Column": "customer_id", "MatchIds": set(["23456"]), "Type": "Simple"} + ] data = ( '{"customer_id": "12345", "x": 1.2, "d":"2001-01-01"}\n' '{"customer_id": "23456", "x": 2.3, "d":"invalid\n' diff --git a/tests/unit/ecs_tasks/test_main.py b/tests/unit/ecs_tasks/test_main.py index 52c29f98..d976ec10 100644 --- a/tests/unit/ecs_tasks/test_main.py +++ b/tests/unit/ecs_tasks/test_main.py @@ -74,11 +74,14 @@ def test_happy_path_when_queue_not_empty( mock_verify_integrity, message_stub, ): - column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]} + column = {"Column": "customer_id", "MatchIds": set(["12345", "23456"])} mock_build_matches.return_value = [column] mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": {}}, None mock_save.return_value = "new_version123" @@ -129,11 +132,14 @@ def test_happy_path_when_queue_not_empty_for_compressed_json( mock_verify_integrity, message_stub, ): - column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]} + column = {"Column": "customer_id", "MatchIds": set(["12345", "23456"])} mock_build_matches.return_value = [column] mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": {}}, None mock_save.return_value = "new_version123" @@ -190,12 +196,15 @@ def test_cse_kms_encrypted( mock_verify_integrity, message_stub, ): - column = {"Column": "customer_id", "MatchIds": ["12345", "23456"]} + column = {"Column": "customer_id", "MatchIds": set(["12345", "23456"])} metadata = {"x-amz-wrap-alg": "kms", "x-amz-key-v2": "key123"} mock_build_matches.return_value = [column] mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": metadata}, None mock_save.return_value = "new_version123" @@ -300,7 +309,10 @@ def test_it_removes_old_versions( ): mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": {}}, None mock_save.return_value = "new_version123" @@ -348,7 +360,10 @@ def test_it_handles_old_version_delete_failures( ): mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": {}}, None mock_save.return_value = "new_version123" @@ -394,7 +409,10 @@ def test_it_handles_no_deletions( ): mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": {}}, None mock_delete.return_value = pa.BufferOutputStream(), {"DeletedRows": 0} @@ -1130,7 +1148,7 @@ def test_it_builds_matches_grouped_by_column_simple(mock_fetch): matches = build_matches(cols, "s3://path-to-manifest.json") assert matches == [ - {"Column": "customer_id", "MatchIds": ["12345", "23456"]}, + {"Column": "customer_id", "MatchIds": set(["12345", "23456"])}, ] @@ -1148,7 +1166,7 @@ def test_it_builds_matches_grouped_by_column_composite(mock_fetch): assert matches == [ { "Columns": ["first_name", "last_name"], - "MatchIds": [["john", "doe"], ["jane", "doe"]], + "MatchIds": set([tuple(["john", "doe"]), tuple(["jane", "doe"])]), }, ] @@ -1174,8 +1192,8 @@ def test_it_builds_matches_grouped_by_column_mixed(mock_fetch): assert matches == [ { "Columns": ["first_name", "last_name"], - "MatchIds": [["john", "doe"], ["jane", "doe"]], + "MatchIds": set([tuple(["john", "doe"]), tuple(["jane", "doe"])]), }, - {"Column": "first_name", "MatchIds": ["smith"]}, - {"Column": "last_name", "MatchIds": ["smith", "parker"]}, + {"Column": "first_name", "MatchIds": set(["smith"])}, + {"Column": "last_name", "MatchIds": set(["smith", "parker"])}, ] diff --git a/tests/unit/ecs_tasks/test_parquet.py b/tests/unit/ecs_tasks/test_parquet.py index 8c4f6834..d2f23b72 100644 --- a/tests/unit/ecs_tasks/test_parquet.py +++ b/tests/unit/ecs_tasks/test_parquet.py @@ -23,7 +23,7 @@ def test_it_generates_new_parquet_file_without_matches(mock_delete, mock_load_pa # Arrange column = { "Column": "customer_id", - "MatchIds": ["12345", "23456"], + "MatchIds": set(["12345", "23456"]), "Type": "Simple", } data = [{"customer_id": "12345"}, {"customer_id": "34567"}] @@ -53,7 +53,7 @@ def test_it_handles_files_with_multiple_row_groups_and_pandas_indexes( {"customer_id": "12345"}, {"customer_id": "34567"}, ] - columns = [{"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}] + columns = [{"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"}] df = pd.DataFrame(data, list("ab")) table = pa.Table.from_pandas(df) buf = BytesIO() @@ -81,7 +81,7 @@ def test_delete_correct_rows_from_table(): {"customer_id": "34567"}, ] columns = [ - {"Column": "customer_id", "MatchIds": ["12345", "23456"], "Type": "Simple"} + {"Column": "customer_id", "MatchIds": set(["12345", "23456"]), "Type": "Simple"} ] df = pd.DataFrame(data) table = pa.Table.from_pandas(df) @@ -97,8 +97,8 @@ def test_delete_handles_multiple_columns_with_no_rows_left(): {"customer_id": "12345", "other_customer_id": "23456"}, ] columns = [ - {"Column": "customer_id", "MatchIds": ["12345"], "Type": "Simple"}, - {"Column": "other_customer_id", "MatchIds": ["23456"], "Type": "Simple"}, + {"Column": "customer_id", "MatchIds": set(["12345"]), "Type": "Simple"}, + {"Column": "other_customer_id", "MatchIds": set(["23456"]), "Type": "Simple"}, ] df = pd.DataFrame(data) table = pa.Table.from_pandas(df) @@ -117,7 +117,7 @@ def test_handles_lower_cased_column_names(): columns = [ { "Column": "userdata.customerid", - "MatchIds": ["12345", "23456"], + "MatchIds": set(["12345", "23456"]), "Type": "Simple", } ] @@ -137,7 +137,7 @@ def test_it_handles_data_with_pandas_indexes(): {"customer_id": "34567"}, ] columns = [ - {"Column": "customer_id", "MatchIds": ["12345", "23456"], "Type": "Simple"} + {"Column": "customer_id", "MatchIds": set(["12345", "23456"]), "Type": "Simple"} ] df = pd.DataFrame(data, list("abc")) table = pa.Table.from_pandas(df) @@ -160,7 +160,7 @@ def test_delete_correct_rows_from_parquet_table_with_complex_types(): columns = [ { "Column": "user_info.personal_information.name", - "MatchIds": ["matteo", "chris"], + "MatchIds": set(["matteo", "chris"]), "Type": "Simple", } ] @@ -186,7 +186,13 @@ def test_delete_correct_rows_from_parquet_table_with_composite_types_tuple_col() columns = [ { "Columns": ["first_name", "last_name"], - "MatchIds": [["john", "doe"], ["jane", "doe"], ["matteo", "doe"]], + "MatchIds": set( + [ + tuple(["john", "doe"]), + tuple(["jane", "doe"]), + tuple(["matteo", "doe"]), + ] + ), "Type": "Composite", } ] @@ -205,7 +211,13 @@ def test_delete_correct_rows_from_parquet_table_with_composite_types_single_col( "first_name": ["john", "jane", "matteo"], "last_name": ["doe", "doe", "hey"], } - columns = [{"Columns": ["last_name"], "MatchIds": [["doe"]], "Type": "Composite"}] + columns = [ + { + "Columns": ["last_name"], + "MatchIds": set([tuple(["doe"])]), + "Type": "Composite", + } + ] df = pd.DataFrame(data) table = pa.Table.from_pandas(df) table, deleted_rows = delete_from_table(table, columns) @@ -225,7 +237,7 @@ def test_delete_correct_rows_from_parquet_table_with_composite_types_multiple_ty columns = [ { "Columns": ["age", "last_name"], - "MatchIds": [[12, "doe"]], + "MatchIds": set([tuple([12, "doe"])]), "Type": "Composite", } ] @@ -251,7 +263,13 @@ def test_delete_correct_rows_from_parquet_table_with_complex_composite_types(): columns = [ { "Columns": ["details.first_name", "details.last_name"], - "MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Matteo", "Doe"]], + "MatchIds": set( + [ + tuple(["John", "Doe"]), + tuple(["Jane", "Doe"]), + tuple(["Matteo", "Doe"]), + ] + ), "Type": "Composite", } ] @@ -271,10 +289,10 @@ def test_delete_correct_rows_from_parquet_table_with_both_simple_and_composite_t "last_name": ["doe", "doe", "hey"], } columns = [ - {"Column": "customer_id", "MatchIds": [12345], "Type": "Simple"}, + {"Column": "customer_id", "MatchIds": set([12345]), "Type": "Simple"}, { "Columns": ["first_name", "last_name"], - "MatchIds": [["jane", "doe"]], + "MatchIds": set([tuple(["jane", "doe"])]), "Type": "Composite", }, ] @@ -309,7 +327,7 @@ def test_delete_correct_rows_from_parquet_table_with_decimal_types(): columns = [ { "Column": "customer_id_decimal", - "MatchIds": ["123.450", "234.560"], + "MatchIds": set(["123.450", "234.560"]), "Type": "Simple", }, ] @@ -334,7 +352,7 @@ def test_delete_correct_rows_from_parquet_table_with_decimal_complex_types(): columns = [ { "Column": "user_info.personal_information.decimal", - "MatchIds": ["12.34", "34.56"], + "MatchIds": set(["12.34", "34.56"]), "Type": "Simple", } ] @@ -366,7 +384,13 @@ def test_delete_correct_rows_from_parquet_table_with_decimal_complex_composite_t "user_info.personal_information.name", "user_info.personal_information.decimal", ], - "MatchIds": [["matteo", "12.34"], ["chris", "34.56"], ["nick", "11.22"]], + "MatchIds": set( + [ + tuple(["matteo", "12.34"]), + tuple(["chris", "34.56"]), + tuple(["nick", "11.22"]), + ] + ), "Type": "Composite", } ] @@ -389,7 +413,7 @@ def test_it_throws_for_invalid_schema_column_not_found(): columns = [ { "Column": "user_info.personal_information.name", - "MatchIds": ["matteo"], + "MatchIds": set(["matteo"]), "Type": "Simple", } ] From 2a4f0b7b859fd299d61342a2569693d1ecfe1582 Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Tue, 30 Jan 2024 18:38:47 +0000 Subject: [PATCH 06/10] Bump version --- templates/template.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/template.yaml b/templates/template.yaml index 5e3d3f7b..553935b3 100644 --- a/templates/template.yaml +++ b/templates/template.yaml @@ -1,6 +1,6 @@ AWSTemplateFormatVersion: "2010-09-09" Transform: AWS::Serverless-2016-10-31 -Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.66) (tag:main) +Description: Amazon S3 Find and Forget (uksb-1q2j8beb0) (version:v0.67) (tag:main) Parameters: AccessControlAllowOriginOverride: @@ -206,7 +206,7 @@ Conditions: Mappings: Solution: Constants: - Version: 'v0.66' + Version: 'v0.67' Resources: TempBucket: From a7d63cf111c9cc5e4da0802f8d985fe531f41948 Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Tue, 30 Jan 2024 18:48:13 +0000 Subject: [PATCH 07/10] Cleanup test --- tests/unit/ecs_tasks/test_main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/ecs_tasks/test_main.py b/tests/unit/ecs_tasks/test_main.py index d976ec10..919eb956 100644 --- a/tests/unit/ecs_tasks/test_main.py +++ b/tests/unit/ecs_tasks/test_main.py @@ -265,7 +265,10 @@ def test_it_assumes_role( ): mock_fs.S3FileSystem.return_value = mock_fs mock_file = MagicMock() - mock_file.metadata.return_value = {"VersionId": b"abc123"} + mock_file.metadata.return_value = { + "VersionId": b"abc123", + "Content-Length": b"5558", + } mock_fs.open_input_stream.return_value.__enter__.return_value = mock_file mock_get_object_info.return_value = {"Metadata": {}}, None mock_delete.return_value = pa.BufferOutputStream(), {"DeletedRows": 1} From 26e0d217ac257e1f8e8b50890cdfe6e654853d7b Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Thu, 15 Feb 2024 13:05:24 +0000 Subject: [PATCH 08/10] Update backend/ecs_tasks/delete_files/parquet_handler.py Co-authored-by: Chris Deigan --- backend/ecs_tasks/delete_files/parquet_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py index 1e36ea93..f762b9de 100644 --- a/backend/ecs_tasks/delete_files/parquet_handler.py +++ b/backend/ecs_tasks/delete_files/parquet_handler.py @@ -112,7 +112,7 @@ def cast_column_values(column, schema): """ if column["Type"] == "Simple": if is_column_type_decimal(schema, column["Column"]): - column["MatchIds"] = set([Decimal(m) for m in column["MatchIds"]]) + column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"]) else: for i in range(0, len(column["Columns"])): if is_column_type_decimal(schema, column["Columns"][i]): From ff3bd0ccdb2511e75408abd90c0cb4f81fd9652b Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Thu, 15 Feb 2024 16:35:24 +0000 Subject: [PATCH 09/10] Don't copy the columns multiple time for multiple Decimal identifiers --- .../ecs_tasks/delete_files/parquet_handler.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py index f762b9de..873a8774 100644 --- a/backend/ecs_tasks/delete_files/parquet_handler.py +++ b/backend/ecs_tasks/delete_files/parquet_handler.py @@ -114,14 +114,19 @@ def cast_column_values(column, schema): if is_column_type_decimal(schema, column["Column"]): column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"]) else: - for i in range(0, len(column["Columns"])): - if is_column_type_decimal(schema, column["Columns"][i]): - columns_copy = set() - for composite_match_tuple in column["MatchIds"]: - match_array = [x for x in composite_match_tuple] + decimal_columns = [ + i + for i in range(0, len(column["Columns"])) + if is_column_type_decimal(schema, column["Columns"][i]) + ] + if len(decimal_columns): + columns_copy = set() + for composite_match_tuple in column["MatchIds"]: + match_array = [x for x in composite_match_tuple] + for i in decimal_columns: match_array[i] = Decimal(match_array[i]) - columns_copy.add(tuple(match_array)) - column["MatchIds"] = columns_copy + columns_copy.add(tuple(match_array)) + column["MatchIds"] = columns_copy return column From ec1e6623bba05bbd4b6c694f83597c974c95757f Mon Sep 17 00:00:00 2001 From: Matteo Figus Date: Fri, 16 Feb 2024 12:27:50 +0000 Subject: [PATCH 10/10] Further improvements --- .../ecs_tasks/delete_files/parquet_handler.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/backend/ecs_tasks/delete_files/parquet_handler.py b/backend/ecs_tasks/delete_files/parquet_handler.py index 873a8774..0837a16a 100644 --- a/backend/ecs_tasks/delete_files/parquet_handler.py +++ b/backend/ecs_tasks/delete_files/parquet_handler.py @@ -114,19 +114,19 @@ def cast_column_values(column, schema): if is_column_type_decimal(schema, column["Column"]): column["MatchIds"] = set(Decimal(m) for m in column["MatchIds"]) else: - decimal_columns = [ + decimal_columns = set( i - for i in range(0, len(column["Columns"])) - if is_column_type_decimal(schema, column["Columns"][i]) - ] - if len(decimal_columns): - columns_copy = set() - for composite_match_tuple in column["MatchIds"]: - match_array = [x for x in composite_match_tuple] - for i in decimal_columns: - match_array[i] = Decimal(match_array[i]) - columns_copy.add(tuple(match_array)) - column["MatchIds"] = columns_copy + for i, col in enumerate(column["Columns"]) + if is_column_type_decimal(schema, col) + ) + if decimal_columns: + column["MatchIds"] = set( + tuple( + Decimal(m) if i in decimal_columns else m + for i, m in enumerate(composite_match_tuple) + ) + for composite_match_tuple in column["MatchIds"] + ) return column