13
13
# Type definitions for better readability
14
14
ManifestType = Dict [str , Any ]
15
15
DefinitionsType = Dict [str , Any ]
16
- DuplicatesType = DefaultDict [str , List [Tuple [List [str ], Dict , Dict ]]]
16
+ DuplicatesType = DefaultDict [str , List [Tuple [List [str ], Dict [ str , Any ], Dict [ str , Any ] ]]]
17
17
18
18
# Configuration constants
19
19
N_OCCURANCES = 2
27
27
"url_base" ,
28
28
]
29
29
30
- # the placeholder for collected duplicates
31
- DUPLICATES : DuplicatesType = defaultdict (list , {})
32
-
33
30
34
31
def deduplicate_definitions (resolved_manifest : ManifestType ) -> ManifestType :
35
32
"""
@@ -47,23 +44,23 @@ def deduplicate_definitions(resolved_manifest: ManifestType) -> ManifestType:
47
44
_manifest = copy .deepcopy (resolved_manifest )
48
45
definitions = _manifest .get (DEF_TAG , {})
49
46
50
- _collect_duplicates (definitions )
51
- _handle_duplicates (definitions )
47
+ duplicates = _collect_duplicates (definitions )
48
+ _handle_duplicates (definitions , duplicates )
52
49
53
50
return _manifest
54
51
except ManifestDeduplicationException :
55
52
# if any error occurs, we just return the original manifest.
56
53
return resolved_manifest
57
54
58
55
59
- def _replace_duplicates_with_refs (definitions : ManifestType ) -> None :
56
+ def _replace_duplicates_with_refs (definitions : ManifestType , duplicates : DuplicatesType ) -> None :
60
57
"""
61
58
Process duplicate objects and replace them with references.
62
59
63
60
Args:
64
61
definitions: The definitions dictionary to modify
65
62
"""
66
- for _ , occurrences in DUPLICATES .items ():
63
+ for _ , occurrences in duplicates .items ():
67
64
# Skip non-duplicates
68
65
if len (occurrences ) < N_OCCURANCES :
69
66
continue
@@ -84,20 +81,20 @@ def _replace_duplicates_with_refs(definitions: ManifestType) -> None:
84
81
parent_obj [key ] = _create_ref_object (ref_key )
85
82
86
83
87
- def _handle_duplicates (definitions : DefinitionsType ) -> None :
84
+ def _handle_duplicates (definitions : DefinitionsType , duplicates : DuplicatesType ) -> None :
88
85
"""
89
- Process the DUPLICATES and replace them with references.
86
+ Process the duplicates and replace them with references.
90
87
91
88
Args:
92
- DUPLICATES : Dictionary of duplicate objects
89
+ duplicates : Dictionary of duplicate objects
93
90
"""
94
91
# process duplicates only if there are any
95
- if len (DUPLICATES ) > 0 :
92
+ if len (duplicates ) > 0 :
96
93
if not SHARED_TAG in definitions :
97
94
definitions [SHARED_TAG ] = {}
98
95
99
96
try :
100
- _replace_duplicates_with_refs (definitions )
97
+ _replace_duplicates_with_refs (definitions , duplicates )
101
98
except Exception as e :
102
99
raise ManifestDeduplicationException (str (e ))
103
100
@@ -116,19 +113,21 @@ def _is_allowed_tag(key: str) -> bool:
116
113
117
114
118
115
def _add_duplicate (
116
+ duplicates : DuplicatesType ,
119
117
current_path : List [str ],
120
- obj : Dict ,
118
+ obj : Dict [ str , Any ] ,
121
119
value : Any ,
122
120
key : Optional [str ] = None ,
123
121
) -> None :
124
122
"""
125
123
Adds a duplicate record of an observed object by computing a unique hash for the provided value.
126
124
127
125
This function computes a hash for the given value (or a dictionary composed of the key and value if a key is provided)
128
- and appends a tuple containing the current path, the original object, and the value to the global DUPLICATES
126
+ and appends a tuple containing the current path, the original object, and the value to the duplicates
129
127
dictionary under the corresponding hash.
130
128
131
129
Parameters:
130
+ duplicates (DuplicatesType): The dictionary to store duplicate records.
132
131
current_path (List[str]): The list of keys or indices representing the current location in the object hierarchy.
133
132
obj (Dict): The original dictionary object where the duplicate is observed.
134
133
value (Any): The value to be hashed and used for identifying duplicates.
@@ -138,7 +137,7 @@ def _add_duplicate(
138
137
value_to_hash = value if key is None else {key : value }
139
138
obj_hash = _hash_object (value_to_hash )
140
139
if obj_hash :
141
- DUPLICATES [obj_hash ].append ((current_path , obj , value ))
140
+ duplicates [obj_hash ].append ((current_path , obj , value ))
142
141
143
142
144
143
def _add_to_shared_definitions (
@@ -161,49 +160,61 @@ def _add_to_shared_definitions(
161
160
return definitions
162
161
163
162
164
- def _collect_duplicates (node : ManifestType , path : Optional [ List [ str ]] = None ) -> None :
163
+ def _collect_duplicates (node : ManifestType ) -> DuplicatesType :
165
164
"""
166
165
Traverse the JSON object and collect all potential duplicate values and objects.
167
166
168
167
Args:
169
- node: The JSON object to analyze
168
+ node: The JSON object to analyze.
170
169
171
170
Returns:
172
- DUPLICATES : A dictionary of duplicate objects
171
+ duplicates : A dictionary of duplicate objects.
173
172
"""
174
173
175
- try :
176
- if not isinstance (node , dict ):
174
+ def _collect (obj : Dict [str , Any ], path : Optional [List [str ]] = None ) -> None :
175
+ """
176
+ The closure to recursively collect duplicates in the JSON object.
177
+
178
+ Args:
179
+ obj: The current object being analyzed.
180
+ path: The current path in the object hierarchy.
181
+ """
182
+ if not isinstance (obj , dict ):
177
183
return
178
184
179
185
path = [] if path is None else path
180
-
181
186
# Check if the object is empty
182
- for key , value in node .items ():
187
+ for key , value in obj .items ():
183
188
current_path = path + [key ]
184
189
185
190
if isinstance (value , dict ):
186
191
# First process nested dictionaries
187
- _collect_duplicates (value , current_path )
192
+ _collect (value , current_path )
188
193
# Process allowed-only component tags
189
194
if _is_allowed_tag (key ):
190
- _add_duplicate (current_path , node , value )
195
+ _add_duplicate (duplicates , current_path , obj , value )
191
196
192
197
# handle primitive types
193
198
elif isinstance (value , (str , int , float , bool )):
194
199
# Process allowed-only field tags
195
200
if _is_allowed_tag (key ):
196
- _add_duplicate (current_path , node , value , key )
201
+ _add_duplicate (duplicates , current_path , obj , value , key )
197
202
198
203
# handle list cases
199
204
elif isinstance (value , list ):
200
205
for i , item in enumerate (value ):
201
- _collect_duplicates (item , current_path + [str (i )])
206
+ _collect (item , current_path + [str (i )])
207
+
208
+ duplicates : DuplicatesType = defaultdict (list , {})
209
+
210
+ try :
211
+ _collect (node )
212
+ return duplicates
202
213
except Exception as e :
203
214
raise ManifestDeduplicationException (str (e ))
204
215
205
216
206
- def _hash_object (node : Dict ) -> Optional [str ]:
217
+ def _hash_object (node : Dict [ str , Any ] ) -> Optional [str ]:
207
218
"""
208
219
Create a unique hash for a dictionary object.
209
220
0 commit comments