import concurrent.futures
import json
import logging
import time
from collections import deque
1213
1314logger = logging .getLogger (__name__ )
1415
# Default number of rows per record batch read from the collated dataset.
READ_BATCH_SIZE = 10_000
# NOTE(review): presumably the row cap per written diffs file; usage not visible here.
WRITE_MAX_ROWS_PER_FILE = 100_000
# Default cap on the number of diff batches in flight across worker processes.
MAX_PARALLEL_WORKERS = 6
1819
1920DIFFS_DATASET_OUTPUT_SCHEMA = pa .schema (
2021 (
@@ -52,29 +53,55 @@ def calc_ab_diffs(run_directory: str, collated_dataset_path: str) -> str:
5253 return str (diffs_dataset )
5354
5455
def process_batch(batch: pa.RecordBatch) -> pa.RecordBatch:
    """Parallel worker for calculating record diffs for a batch.

    The pyarrow RecordBatch is converted into a pandas dataframe, a diff is calculated
    via calc_record_diff() for each record pair in the batch, and the dataframe is
    converted back to a pyarrow RecordBatch for returning.

    Args:
        batch: record batch that contains "record_a" and "record_b" columns.

    Returns:
        A new record batch with the input columns plus:
        - ab_diff: first element of the calc_record_diff() result
        - modified_timdex_fields: second element as a list ([] when empty or None)
        - has_diff: third element of the calc_record_diff() result
    """
    df = batch.to_pandas()  # noqa: PD901

    # Walk the two input columns once instead of a row-wise DataFrame.apply followed
    # by three unpacking passes.  This avoids building a Series per row and also
    # works for a zero-row batch, where DataFrame.apply(axis=1) returns a DataFrame
    # rather than a Series of result tuples.
    ab_diffs = []
    modified_timdex_fields = []
    has_diffs = []
    for record_a, record_b in zip(df["record_a"], df["record_b"], strict=True):
        ab_diff, modified_fields, has_diff = calc_record_diff(record_a, record_b)
        ab_diffs.append(ab_diff)
        modified_timdex_fields.append(list(modified_fields) if modified_fields else [])
        has_diffs.append(has_diff)

    # NOTE(review): for a zero-row batch the new columns carry no values, so their
    # arrow types are left to pyarrow's inference -- confirm against the output schema.
    df["ab_diff"] = ab_diffs
    df["modified_timdex_fields"] = modified_timdex_fields
    df["has_diff"] = has_diffs
    return pa.RecordBatch.from_pandas(df)  # type: ignore[attr-defined]
73+
74+
def get_diffed_batches_iter(
    collated_dataset: ds.Dataset,
    batch_size: int = READ_BATCH_SIZE,
    max_parallel_processes: int = MAX_PARALLEL_WORKERS,
) -> Generator[pa.RecordBatch, None, None]:
    """Yield pyarrow record batches with diff calculated for each record.

    Batches read from the collated dataset are submitted to a process pool, where
    process_batch() calculates the diffs.  Results are yielded in submission order.
    The number of submitted-but-not-yet-yielded batches is capped, so only a bounded
    number of batches is held at any one time.

    Args:
        collated_dataset: dataset whose batches are diffed.
        batch_size: number of rows requested per record batch.
        max_parallel_processes: cap on batches in flight before the generator blocks
            on the oldest one.

    Yields:
        Record batches returned by process_batch(), in the order they were read.
    """
    batches_iter = collated_dataset.to_batches(batch_size=batch_size)

    # NOTE(review): the pool is sized one above the in-flight cap; presumably this
    # is deliberate headroom -- confirm.
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=max_parallel_processes + 1
    ) as executor:
        # FIFO of (batch index, future); a deque gives O(1) removal from the front.
        pending_futures: deque[tuple[int, concurrent.futures.Future]] = deque()

        def pop_oldest_result() -> pa.RecordBatch:
            """Block on the oldest pending future and return its diffed batch."""
            idx, oldest_future = pending_futures.popleft()
            result = oldest_future.result()
            logger.info("Yielding diffed batch: %s", idx)
            return result

        for batch_count, batch in enumerate(batches_iter):
            future = executor.submit(process_batch, batch)
            pending_futures.append((batch_count, future))

            # At the cap: wait for the oldest batch before reading another one.
            if len(pending_futures) >= max_parallel_processes:
                yield pop_oldest_result()

        # Input exhausted: drain whatever is still in flight, oldest first.
        while pending_futures:
            yield pop_oldest_result()
78105
79106
80107def calc_record_diff (
@@ -83,32 +110,53 @@ def calc_record_diff(
83110 * ,
84111 ignore_order : bool = True ,
85112 report_repetition : bool = True ,
86- ) -> tuple [str | None , list [str ] | None , bool ]:
113+ ) -> tuple [str , set [str ], bool ]:
87114 """Calculate diff from two JSON byte strings.
88115
89116 The DeepDiff library has the property 'affected_root_keys' on the produced diff object
90117 that is very useful for our purposes. At this time, we simply want to know if
91118 anything about a particular root level TIMDEX field (e.g. 'dates' or 'title') has
92- changed which this method provides explicitly. We also serialize the full diff to
93- JSON via the to_json() method for storage and possible further analysis.
119+ changed which this method provides explicitly. In the unlikely case that the records
120+ share ZERO keys, a special case is handled where the modified root paths are returned
121+ as only ['root'], in which case we get a combined set of keys from both records, which is
122+ effectively the modified root fields.
123+
124+ We also serialize the full diff to JSON via the to_json() method for storage and
125+ possible further analysis.
94126
95- This method returns a tuple :
127+ Returns tuple(ab_diff, modified_timdex_fields, has_diff) :
96128 - ab_diff: [str] - full diff as JSON
97129 - modified_timdex_fields: set[str] - set of modified root keys (TIMDEX fields)
98130 - has_diff: bool - True/False if any diff present
99131 """
100- if record_a is None or record_b is None :
101- return None , None , False
132+ # Replace None with empty dict
133+ record_a = record_a or {}
134+ record_b = record_b or {}
135+
136+ # Parse JSON strings or bytes into dictionaries
137+ if isinstance (record_a , (str | bytes )):
138+ record_a = json .loads (record_a )
139+ if isinstance (record_b , (str | bytes )):
140+ record_b = json .loads (record_b )
102141
103142 diff = DeepDiff (
104- json . loads ( record_a ) if isinstance ( record_a , str | bytes ) else record_a ,
105- json . loads ( record_b ) if isinstance ( record_b , str | bytes ) else record_b ,
143+ record_a ,
144+ record_b ,
106145 ignore_order = ignore_order ,
107146 report_repetition = report_repetition ,
108147 )
109148
110149 ab_diff = diff .to_json ()
111- modified_timdex_fields = diff .affected_root_keys
150+
151+ # get modified root fields, handling edge cases
152+ if diff .affected_paths != ["root" ]:
153+ modified_timdex_fields = diff .affected_root_keys
154+ else :
155+ modified_timdex_fields = set ()
156+ for record in [record_a , record_b ]:
157+ if isinstance (record , dict ):
158+ modified_timdex_fields .update (record .keys ())
159+
112160 has_diff = bool (modified_timdex_fields )
113161
114162 return ab_diff , modified_timdex_fields , has_diff
0 commit comments