27
27
from .serialize import SerializableMetaclass , ValueType , ProviderType , IdentityField , \
28
28
ListField , DictField , Int32Field , BoolField , StringField
29
29
from .tiles import NotSupportTile
30
- from .utils import AttributeDict , to_str , calc_data_size , is_eager_mode , is_object_dtype
30
+ from .utils import AttributeDict , to_str , calc_data_size , is_eager_mode , calc_object_overhead
31
31
32
32
33
33
operand_type_to_oprand_cls = {}
34
34
OP_TYPE_KEY = '_op_type_'
35
35
OP_MODULE_KEY = '_op_module_'
36
- OBJECT_FIELD_OVERHEAD = 50
37
36
T = TypeVar ('T' )
38
37
39
38
@@ -373,27 +372,26 @@ def execute(cls, ctx, op):
373
372
374
373
@classmethod
375
374
def estimate_size (cls , ctx , op ):
376
- from .dataframe .core import DATAFRAME_CHUNK_TYPE , SERIES_CHUNK_TYPE , INDEX_CHUNK_TYPE
377
-
378
375
exec_size = 0
379
376
outputs = op .outputs
380
377
if all (not c .is_sparse () and hasattr (c , 'nbytes' ) and not np .isnan (c .nbytes ) for c in outputs ):
381
378
for out in outputs :
382
379
ctx [out .key ] = (out .nbytes , out .nbytes )
383
380
381
+ all_overhead = 0
384
382
for inp in op .inputs or ():
385
383
try :
384
+ if isinstance (inp .op , FetchShuffle ):
385
+ keys_and_shapes = inp .extra_params .get ('_shapes' , dict ()).items ()
386
+ else :
387
+ keys_and_shapes = [(inp .key , getattr (inp , 'shape' , None ))]
388
+
386
389
# execution size of a specific data chunk may be
387
390
# larger than stored type due to objects
388
- obj_overhead = n_strings = 0
389
- if getattr (inp , 'shape' , None ) and not np .isnan (inp .shape [0 ]):
390
- if isinstance (inp , DATAFRAME_CHUNK_TYPE ) and inp .dtypes is not None :
391
- n_strings = len ([dt for dt in inp .dtypes if is_object_dtype (dt )])
392
- elif isinstance (inp , (INDEX_CHUNK_TYPE , SERIES_CHUNK_TYPE )) and inp .dtype is not None :
393
- n_strings = 1 if is_object_dtype (inp .dtype ) else 0
394
- obj_overhead += n_strings * inp .shape [0 ] * OBJECT_FIELD_OVERHEAD
395
-
396
- exec_size += ctx [inp .key ][0 ] + obj_overhead
391
+ for key , shape in keys_and_shapes :
392
+ overhead = calc_object_overhead (inp , shape )
393
+ all_overhead += overhead
394
+ exec_size += ctx [key ][0 ] + overhead
397
395
except KeyError :
398
396
if not op .sparse :
399
397
inp_size = calc_data_size (inp )
@@ -405,7 +403,10 @@ def estimate_size(cls, ctx, op):
405
403
chunk_sizes = dict ()
406
404
for out in outputs :
407
405
try :
408
- chunk_size = calc_data_size (out ) if not out .is_sparse () else exec_size
406
+ if not out .is_sparse ():
407
+ chunk_size = calc_data_size (out ) + all_overhead // len (outputs )
408
+ else :
409
+ chunk_size = exec_size
409
410
if np .isnan (chunk_size ):
410
411
raise TypeError
411
412
chunk_sizes [out .key ] = chunk_size
0 commit comments