-
Notifications
You must be signed in to change notification settings - Fork 33
Description
INFO:sagemaker:Creating training-job with name: tf2-object-detection-2023-04-15-10-29-55-436
2023-04-15 10:29:57 Starting - Starting the training job...
2023-04-15 10:30:13 Starting - Preparing the instances for training...
2023-04-15 10:30:55 Downloading - Downloading input data...
2023-04-15 10:31:20 Training - Downloading the training image.........
2023-04-15 10:32:51 Training - Training image download completed. Training in progress....2023-04-15 10:33:19,147 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2023-04-15 10:33:19,183 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2023-04-15 10:33:19,220 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2023-04-15 10:33:19,232 sagemaker-training-toolkit INFO Invoking user script
Training Env:
{
"additional_framework_parameters": {},
"channel_input_dirs": {
"train": "/opt/ml/input/data/train"
},
"current_host": "algo-1",
"current_instance_group": "homogeneousCluster",
"current_instance_group_hosts": [
"algo-1"
],
"current_instance_type": "ml.g4dn.2xlarge",
"distribution_hosts": [],
"distribution_instance_groups": [],
"framework_module": null,
"hosts": [
"algo-1"
],
"hyperparameters": {
"model_dir": "/opt/training",
"num_train_steps": "1000",
"pipeline_config_path": "pipeline.config",
"sample_1_of_n_eval_examples": "1"
},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {
"train": {
"TrainingInputMode": "File",
"S3DistributionType": "FullyReplicated",
"RecordWrapperType": "None"
}
},
"input_dir": "/opt/ml/input",
"instance_groups": [
"homogeneousCluster"
],
"instance_groups_dict": {
"homogeneousCluster": {
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.g4dn.2xlarge",
"hosts": [
"algo-1"
]
}
},
"is_hetero": false,
"is_master": true,
"is_modelparallel_enabled": null,
"is_smddpmprun_installed": false,
"job_name": "tf2-object-detection-2023-04-15-10-29-55-436",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz",
"module_name": "run_training.sh",
"network_interface_name": "eth0",
"num_cpus": 8,
"num_gpus": 1,
"num_neurons": 0,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_host": "algo-1",
"current_instance_type": "ml.g4dn.2xlarge",
"current_group_name": "homogeneousCluster",
"hosts": [
"algo-1"
],
"instance_groups": [
{
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.g4dn.2xlarge",
"hosts": [
"algo-1"
]
}
],
"network_interface_name": "eth0"
},
"user_entry_point": "run_training.sh"
}
Environment variables:
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS={"model_dir":"/opt/training","num_train_steps":"1000","pipeline_config_path":"pipeline.config","sample_1_of_n_eval_examples":"1"}
SM_USER_ENTRY_POINT=run_training.sh
SM_FRAMEWORK_PARAMS={}
SM_RESOURCE_CONFIG={"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.2xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}],"network_interface_name":"eth0"}
SM_INPUT_DATA_CONFIG={"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["train"]
SM_CURRENT_HOST=algo-1
SM_CURRENT_INSTANCE_TYPE=ml.g4dn.2xlarge
SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
SM_INSTANCE_GROUPS=["homogeneousCluster"]
SM_INSTANCE_GROUPS_DICT={"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}}
SM_DISTRIBUTION_INSTANCE_GROUPS=[]
SM_IS_HETERO=false
SM_MODULE_NAME=run_training.sh
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=8
SM_NUM_GPUS=1
SM_NUM_NEURONS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"train":"/opt/ml/input/data/train"},"current_host":"algo-1","current_instance_group":"homogeneousCluster","current_instance_group_hosts":["algo-1"],"current_instance_type":"ml.g4dn.2xlarge","distribution_hosts":[],"distribution_instance_groups":[],"framework_module":null,"hosts":["algo-1"],"hyperparameters":{"model_dir":"/opt/training","num_train_steps":"1000","pipeline_config_path":"pipeline.config","sample_1_of_n_eval_examples":"1"},"input_config_dir":"/opt/ml/input/config","input_data_config":{"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","instance_groups":["homogeneousCluster"],"instance_groups_dict":{"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}},"is_hetero":false,"is_master":true,"is_modelparallel_enabled":null,"is_smddpmprun_installed":false,"job_name":"tf2-object-detection-2023-04-15-10-29-55-436","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz","module_name":"run_training.sh","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"num_neurons":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.2xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}],"network_interface_name":"eth0"},"user_entry_point":"run_training.sh"}
SM_USER_ARGS=["--model_dir","/opt/training","--num_train_steps","1000","--pipeline_config_path","pipeline.config","--sample_1_of_n_eval_examples","1"]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TRAIN=/opt/ml/input/data/train
SM_HP_MODEL_DIR=/opt/training
SM_HP_NUM_TRAIN_STEPS=1000
SM_HP_PIPELINE_CONFIG_PATH=pipeline.config
SM_HP_SAMPLE_1_OF_N_EVAL_EXAMPLES=1
PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages
Invoking script with the following command:
/bin/sh -c "./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1"
2023-04-15 10:33:19,232 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker Debugger as it is not installed.
===TRAINING THE MODEL==
/usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning:
TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP).
For more information see: https://github.com/tensorflow/addons/issues/2807
warnings.warn(
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
I0415 10:33:26.837829 140166345439040 mirrored_strategy.py:374] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:Maybe overwriting train_steps: 1000
I0415 10:33:26.841578 140166345439040 config_util.py:552] Maybe overwriting train_steps: 1000
INFO:tensorflow:Maybe overwriting use_bfloat16: False
I0415 10:33:26.841733 140166345439040 config_util.py:552] Maybe overwriting use_bfloat16: False
I0415 10:33:26.853476 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1
I0415 10:33:26.853606 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88
I0415 10:33:26.853662 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4
I0415 10:33:26.857728 140166345439040 efficientnet_model.py:143] round_filter input=32 output=32
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.903624 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.907572 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.914127 140166345439040 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:33:26.914241 140166345439040 efficientnet_model.py:143] round_filter input=16 output=16
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.936754 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.939711 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.006097 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.009125 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.034329 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.037334 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.100979 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.103941 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.114144 140166345439040 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:33:27.114249 140166345439040 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:33:27.485301 140166345439040 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:33:27.485446 140166345439040 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:33:27.854993 140166345439040 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:33:27.855136 140166345439040 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:33:28.334065 140166345439040 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:33:28.334210 140166345439040 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:33:28.843916 140166345439040 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:33:28.844068 140166345439040 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:33:29.612781 140166345439040 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:33:29.612927 140166345439040 efficientnet_model.py:143] round_filter input=320 output=320
I0415 10:33:29.859471 140166345439040 efficientnet_model.py:143] round_filter input=1280 output=1280
I0415 10:33:29.914009 140166345439040 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py:563: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
W0415 10:33:29.959440 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py:563: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
INFO:tensorflow:Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
I0415 10:33:30.210091 140166345439040 dataset_builder.py:162] Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
INFO:tensorflow:Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
I0415 10:33:30.228283 140166345439040 dataset_builder.py:79] Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
INFO:tensorflow:Number of filenames to read: 1
I0415 10:33:30.228426 140166345439040 dataset_builder.py:80] Number of filenames to read: 1
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
W0415 10:33:30.228483 140166345439040 dataset_builder.py:86] num_readers has been reduced to 1 to match input file shards.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
W0415 10:33:30.238938 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
W0415 10:33:30.255320 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
W0415 10:33:36.161500 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
W0415 10:33:39.770519 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
W0415 10:33:41.886407 140166345439040 module_wrapper.py:149] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
/usr/local/lib/python3.8/dist-packages/keras/backend.py:452: UserWarning: tf.keras.backend.set_learning_phase is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the training argument of the __call__ method of your layer or model.
warnings.warn(
I0415 10:33:56.750935 140143522457344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
I0415 10:34:07.238219 140143522457344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
Traceback (most recent call last):
File "model_main_tf2.py", line 114, in <module>
tf.compat.v1.app.run()
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/platform/app.py", line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "model_main_tf2.py", line 105, in main
model_lib_v2.train_loop(
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 605, in train_loop
load_fine_tune_checkpoint(
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 401, in load_fine_tune_checkpoint
_ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors)
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 176, in _ensure_model_is_built
strategy.run(
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 1316, in run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 2895, in call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/mirrored_strategy.py", line 696, in _call_for_each_replica
return mirrored_run.call_for_each_replica(
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 84, in call_for_each_replica
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py", line 52, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.UnimplementedError: Graph execution error:
Detected at node 'EfficientDet-D1/model/stem_conv2d/Conv2D' defined at (most recent call last):
File "/usr/lib/python3.8/threading.py", line 890, in _bootstrap
self._bootstrap_inner()
File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 171, in _dummy_computation_fn
return _compute_losses_and_predictions_dicts(model, features, labels,
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 124, in _compute_losses_and_predictions_dicts
prediction_dict = model.predict(
File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 570, in predict
if self._feature_extractor.is_keras_model:
File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 571, in predict
feature_maps = self._feature_extractor(preprocessed_inputs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 558, in call
return super().call(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 252, in call
return self._extract_features(inputs)
File "/usr/local/lib/python3.8/dist-packages/object_detection/models/ssd_efficientnet_bifpn_feature_extractor.py", line 234, in _extract_features
base_feature_maps = self._efficientnet(
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 558, in call
return super().call(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 512, in call
return self._run_internal_graph(inputs, training=training, mask=mask)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 669, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 290, in call
outputs = self.convolution_op(inputs, self.kernel)
File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 262, in convolution_op
return tf.nn.convolution(
Node: 'EfficientDet-D1/model/stem_conv2d/Conv2D'
DNN library is not found.
#11 [[{{node EfficientDet-D1/model/stem_conv2d/Conv2D}}]] [Op:__inference__dummy_computation_fn_30818]
==EVALUATING THE MODEL==
/usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning:
TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP).
For more information see: tensorflow/addons#2807
warnings.warn(
WARNING:tensorflow:Forced number of epochs for all eval validations to be 1.
W0415 10:34:20.197691 139818454525760 model_lib_v2.py:1089] Forced number of epochs for all eval validations to be 1.
INFO:tensorflow:Maybe overwriting sample_1_of_n_eval_examples: None
I0415 10:34:20.197891 139818454525760 config_util.py:552] Maybe overwriting sample_1_of_n_eval_examples: None
INFO:tensorflow:Maybe overwriting use_bfloat16: False
I0415 10:34:20.197963 139818454525760 config_util.py:552] Maybe overwriting use_bfloat16: False
INFO:tensorflow:Maybe overwriting eval_num_epochs: 1
I0415 10:34:20.198034 139818454525760 config_util.py:552] Maybe overwriting eval_num_epochs: 1
WARNING:tensorflow:Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1.
W0415 10:34:20.198128 139818454525760 model_lib_v2.py:1106] Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1.
I0415 10:34:21.334374 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1
I0415 10:34:21.334518 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88
I0415 10:34:21.334567 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4
I0415 10:34:21.338575 139818454525760 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:21.377283 139818454525760 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:21.377415 139818454525760 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:21.544433 139818454525760 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:21.544581 139818454525760 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:21.815359 139818454525760 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:21.815515 139818454525760 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:22.077441 139818454525760 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:22.077582 139818454525760 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:22.423800 139818454525760 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:22.423947 139818454525760 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:22.765852 139818454525760 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:22.766001 139818454525760 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:23.356628 139818454525760 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:23.356780 139818454525760 efficientnet_model.py:143] round_filter input=320 output=320
I0415 10:34:23.542802 139818454525760 efficientnet_model.py:143] round_filter input=1280 output=1280
I0415 10:34:23.585031 139818454525760 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
INFO:tensorflow:Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
I0415 10:34:23.855814 139818454525760 dataset_builder.py:162] Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
INFO:tensorflow:Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
I0415 10:34:23.877571 139818454525760 dataset_builder.py:79] Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
INFO:tensorflow:Number of filenames to read: 1
I0415 10:34:23.877725 139818454525760 dataset_builder.py:80] Number of filenames to read: 1
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
W0415 10:34:23.877783 139818454525760 dataset_builder.py:86] num_readers has been reduced to 1 to match input file shards.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
W0415 10:34:23.885428 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
W0415 10:34:23.902438 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
W0415 10:34:27.488649 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
W0415 10:34:28.549533 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
W0415 10:34:31.064076 139818454525760 module_wrapper.py:149] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
INFO:tensorflow:Waiting for new checkpoint at /opt/training
I0415 10:34:31.064445 139818454525760 checkpoint_utils.py:168] Waiting for new checkpoint at /opt/training
INFO:tensorflow:Timed-out waiting for a checkpoint.
I0415 10:34:40.073957 139818454525760 checkpoint_utils.py:231] Timed-out waiting for a checkpoint.
==EXPORTING THE MODEL==
/usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning:
TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP).
For more information see: tensorflow/addons#2807
warnings.warn(
I0415 10:34:45.487078 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1
I0415 10:34:45.487244 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88
I0415 10:34:45.487298 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4
I0415 10:34:45.491338 140492014913344 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:45.531586 140492014913344 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:45.531744 140492014913344 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:45.684208 140492014913344 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:45.684357 140492014913344 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:45.971665 140492014913344 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:45.971818 140492014913344 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:46.260079 140492014913344 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:46.260246 140492014913344 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:46.636266 140492014913344 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:46.636410 140492014913344 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:47.010365 140492014913344 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:47.010508 140492014913344 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:47.476083 140492014913344 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:47.476248 140492014913344 efficientnet_model.py:143] round_filter input=320 output=320
I0415 10:34:47.672554 140492014913344 efficientnet_model.py:143] round_filter input=1280 output=1280
I0415 10:34:47.720135 140492014913344 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py:458: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.map_fn(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems))
W0415 10:34:48.031827 140492014913344 deprecation.py:641] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py:458: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.map_fn(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems))
I0415 10:34:52.011857 140492014913344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
I0415 10:35:01.855062 140492014913344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
Traceback (most recent call last):
File "exporter_main_v2.py", line 164, in
app.run(main)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "exporter_main_v2.py", line 157, in main
exporter_lib_v2.export_inference_graph(
File "/usr/local/lib/python3.8/dist-packages/object_detection/exporter_lib_v2.py", line 271, in export_inference_graph
status.assert_existing_objects_matched()
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/checkpoint/checkpoint.py", line 955, in assert_existing_objects_matched
raise AssertionError(
AssertionError: No checkpoint specified (save_path=None); nothing is being restored.
mv: cannot stat '/tmp/exported/saved_model': No such file or directory
2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR Reporting training FAILURE
2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/bin/sh -c ./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1"
2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR Encountered exit_code 1
2023-04-15 10:35:22 Uploading - Uploading generated training model
2023-04-15 10:35:22 Failed - Training job failed
UnexpectedStatusException Traceback (most recent call last)
/tmp/ipykernel_10180/1459479382.py in <cell line: 1>()
----> 1 estimator.fit(inputs)
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py in wrapper(*args, **kwargs)
270 return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs)
271
--> 272 return run_func(*args, **kwargs)
273
274 return wrapper
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
1161 self.jobs.append(self.latest_training_job)
1162 if wait:
-> 1163 self.latest_training_job.wait(logs=logs)
1164
1165 def _compilation_job_name(self):
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/estimator.py in wait(self, logs)
2309 # If logs are requested, call logs_for_jobs.
2310 if logs != "None":
-> 2311 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
2312 else:
2313 self.sagemaker_session.wait_for_job(self.job_name)
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type)
4174
4175 if wait:
-> 4176 self._check_job_status(job_name, description, "TrainingJobStatus")
4177 if dot:
4178 print()
~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
3705 actual_status=status,
3706 )
-> 3707 raise exceptions.UnexpectedStatusException(
3708 message=message,
3709 allowed_statuses=["Completed", "Stopped"],
UnexpectedStatusException: Error for Training job tf2-object-detection-2023-04-15-10-29-55-436: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/bin/sh -c ./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1", exit code: 1
I am not able to understand this error. Please help me figure it out.