1
+ import functools
1
2
import gzip
2
3
import json
3
4
import logging
4
5
from datetime import datetime
5
- from itertools import islice
6
6
from threading import Lock
7
7
from typing import Any
8
8
@@ -25,12 +25,14 @@ def __init__(
25
25
from_date : str ,
26
26
env_obj : Environment | None = None ,
27
27
deadline : datetime | None = None ,
28
+ mirror_writes : bool = False ,
28
29
):
29
30
self .env = env
30
31
self .settings = settings
31
32
self .from_date = from_date
32
33
self .env_obj = env_obj or get_environment (env )
33
34
self .deadline = deadline
35
+ self .mirror_writes = mirror_writes
34
36
self .client = DynamoDBClientWrapper (self .env_obj .aws_profile ).client
35
37
self ._lock = Lock ()
36
38
self ._definitions = None
@@ -208,6 +210,31 @@ def query_definitions(self) -> dict[str, Any]:
208
210
out = json .loads (item_json )
209
211
return out
210
212
213
def uris_for_item(self, item) -> list[str]:
    """Return all URIs to be written for the given item.

    In practice this yields either one or two URIs depending on the
    configured aliases and on ``self.mirror_writes``, though the caller
    should assume any number of URIs.

    Results are memoized per instance, keyed on ``item.web_uri``, so that
    repeated calls for the same URI do not recompute the alias resolution.
    A new list is returned on every call, so callers may mutate the result
    freely without affecting later lookups.
    """
    # The memo is attached to the instance rather than applied as a
    # decorator on the method: a decorator-level lru_cache lives on the
    # class, is keyed on (self, item) and therefore keeps every instance
    # and every item alive for as long as the entry stays in the cache.
    # If two threads race here, each builds a cache and the last
    # assignment wins; the only cost is a few repeated computations.
    try:
        resolve = self._uris_for_web_uri
    except AttributeError:
        resolve = functools.lru_cache(maxsize=2500)(self._resolve_uris)
        self._uris_for_web_uri = resolve
    return list(resolve(item.web_uri))


def _resolve_uris(self, web_uri: str) -> tuple[str, ...]:
    """Compute, without caching, the URIs to write for ``web_uri``."""
    # Resolve aliases. We only write to the deepest path
    # after all alias resolution, hence only using the
    # first result from uri_alias.
    uris = [uri_alias(web_uri, self.aliases_for_write)[0]]

    # We only want to mirror writes for release ver aliases. Recalculating
    # the aliases completely is a bit inefficient, but I'd rather not
    # duplicate any alias logic.
    if (
        self.mirror_writes
        and uri_alias(web_uri, self._aliases(["releasever_alias"]))[0]
        != web_uri
    ):
        uris.append(web_uri)

    # Stored as a tuple so the cached value itself can never be mutated.
    return tuple(uris)
237
+
211
238
def create_request (
212
239
self ,
213
240
items : list [models .Item ],
@@ -216,8 +243,6 @@ def create_request(
216
243
"""Create the dictionary structure expected by batch_write_item."""
217
244
table_name = self .env_obj .table
218
245
request : dict [str , list [Any ]] = {table_name : []}
219
- uri_aliases = self .aliases_for_write
220
-
221
246
for item in items :
222
247
# Items carry their own from_date. This effectively resolves
223
248
# conflicts in the case of two publishes updating the same web_uri
@@ -226,35 +251,32 @@ def create_request(
226
251
# updated timestamp.
227
252
from_date = str (item .updated )
228
253
229
- # Resolve aliases. We only write to the deepest path
230
- # after all alias resolution, hence only using the
231
- # first result from uri_alias.
232
- web_uri = uri_alias (item .web_uri , uri_aliases )[0 ]
233
-
234
- if delete :
235
- request [table_name ].append (
236
- {
237
- "DeleteRequest" : {
238
- "Key" : {
239
- "from_date" : {"S" : from_date },
240
- "web_uri" : {"S" : web_uri },
254
+ for web_uri in self .uris_for_item (item ):
255
+ if delete :
256
+ request [table_name ].append (
257
+ {
258
+ "DeleteRequest" : {
259
+ "Key" : {
260
+ "from_date" : {"S" : from_date },
261
+ "web_uri" : {"S" : web_uri },
262
+ }
241
263
}
242
264
}
243
- }
244
- )
245
- else :
246
- request [ table_name ]. append (
247
- {
248
- "PutRequest " : {
249
- "Item " : {
250
- "from_date " : {"S" : from_date },
251
- "web_uri " : {"S" : web_uri },
252
- "object_key " : {"S" : item .object_key },
253
- "content_type" : { "S" : item . content_type },
265
+ )
266
+ else :
267
+ request [ table_name ]. append (
268
+ {
269
+ "PutRequest" : {
270
+ "Item " : {
271
+ "from_date " : {"S" : from_date },
272
+ "web_uri " : {"S" : web_uri },
273
+ "object_key " : {"S" : item . object_key },
274
+ "content_type " : {"S" : item .content_type },
275
+ }
254
276
}
255
277
}
256
- }
257
- )
278
+ )
279
+
258
280
return request
259
281
260
282
def create_config_request (self , config ):
@@ -332,11 +354,30 @@ def _batch_write(req):
332
354
return _batch_write (request )
333
355
334
356
def get_batches(self, items: list[models.Item]) -> list[list[models.Item]]:
    """Divide the publish items into batches for writing.

    Each batch is sized so that the total number of write requests it
    produces does not exceed ``self.settings.write_batch_size``.

    Due to mirroring, an item might have multiple write requests, so an
    item's weight is the number of URIs reported by ``uris_for_item``
    rather than 1. The results of ``uris_for_item`` are memoized, so
    asking again later for the same item does not recalculate aliases.

    Items are never split across batches and their order is preserved.
    An item whose own weight exceeds the batch size is placed alone in a
    batch. No empty batch is ever returned; empty input yields ``[]``.
    """
    limit = self.settings.write_batch_size
    batches: list[list[models.Item]] = []
    current_batch: list[models.Item] = []
    current_batch_size = 0

    for item in items:
        item_weight = len(self.uris_for_item(item))

        # Flush only a non-empty batch. Without that guard, an item that
        # is heavier than the limit on its own would push an empty batch
        # into the result, which later becomes an empty write request.
        if current_batch and current_batch_size + item_weight > limit:
            batches.append(current_batch)
            current_batch = []
            current_batch_size = 0

        current_batch.append(item)
        current_batch_size += item_weight

    if current_batch:
        batches.append(current_batch)
    return batches
341
382
342
383
def write_batch (self , items : list [models .Item ], delete : bool = False ):
0 commit comments