Skip to content

Unable to train the model in a different region using ml.g4dn.2xlarge for training #35

@rhitwijweldxit

Description

@rhitwijweldxit

INFO:sagemaker:Creating training-job with name: tf2-object-detection-2023-04-15-10-29-55-436
2023-04-15 10:29:57 Starting - Starting the training job...
2023-04-15 10:30:13 Starting - Preparing the instances for training...
2023-04-15 10:30:55 Downloading - Downloading input data...
2023-04-15 10:31:20 Training - Downloading the training image.........
2023-04-15 10:32:51 Training - Training image download completed. Training in progress....
2023-04-15 10:33:19,147 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2023-04-15 10:33:19,183 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2023-04-15 10:33:19,220 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)
2023-04-15 10:33:19,232 sagemaker-training-toolkit INFO Invoking user script
Training Env:
{
"additional_framework_parameters": {},
"channel_input_dirs": {
"train": "/opt/ml/input/data/train"
},
"current_host": "algo-1",
"current_instance_group": "homogeneousCluster",
"current_instance_group_hosts": [
"algo-1"
],
"current_instance_type": "ml.g4dn.2xlarge",
"distribution_hosts": [],
"distribution_instance_groups": [],
"framework_module": null,
"hosts": [
"algo-1"
],
"hyperparameters": {
"model_dir": "/opt/training",
"num_train_steps": "1000",
"pipeline_config_path": "pipeline.config",
"sample_1_of_n_eval_examples": "1"
},
"input_config_dir": "/opt/ml/input/config",
"input_data_config": {
"train": {
"TrainingInputMode": "File",
"S3DistributionType": "FullyReplicated",
"RecordWrapperType": "None"
}
},
"input_dir": "/opt/ml/input",
"instance_groups": [
"homogeneousCluster"
],
"instance_groups_dict": {
"homogeneousCluster": {
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.g4dn.2xlarge",
"hosts": [
"algo-1"
]
}
},
"is_hetero": false,
"is_master": true,
"is_modelparallel_enabled": null,
"is_smddpmprun_installed": false,
"job_name": "tf2-object-detection-2023-04-15-10-29-55-436",
"log_level": 20,
"master_hostname": "algo-1",
"model_dir": "/opt/ml/model",
"module_dir": "s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz",
"module_name": "run_training.sh",
"network_interface_name": "eth0",
"num_cpus": 8,
"num_gpus": 1,
"num_neurons": 0,
"output_data_dir": "/opt/ml/output/data",
"output_dir": "/opt/ml/output",
"output_intermediate_dir": "/opt/ml/output/intermediate",
"resource_config": {
"current_host": "algo-1",
"current_instance_type": "ml.g4dn.2xlarge",
"current_group_name": "homogeneousCluster",
"hosts": [
"algo-1"
],
"instance_groups": [
{
"instance_group_name": "homogeneousCluster",
"instance_type": "ml.g4dn.2xlarge",
"hosts": [
"algo-1"
]
}
],
"network_interface_name": "eth0"
},
"user_entry_point": "run_training.sh"
}
Environment variables:
SM_HOSTS=["algo-1"]
SM_NETWORK_INTERFACE_NAME=eth0
SM_HPS={"model_dir":"/opt/training","num_train_steps":"1000","pipeline_config_path":"pipeline.config","sample_1_of_n_eval_examples":"1"}
SM_USER_ENTRY_POINT=run_training.sh
SM_FRAMEWORK_PARAMS={}
SM_RESOURCE_CONFIG={"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.2xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}],"network_interface_name":"eth0"}
SM_INPUT_DATA_CONFIG={"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}
SM_OUTPUT_DATA_DIR=/opt/ml/output/data
SM_CHANNELS=["train"]
SM_CURRENT_HOST=algo-1
SM_CURRENT_INSTANCE_TYPE=ml.g4dn.2xlarge
SM_CURRENT_INSTANCE_GROUP=homogeneousCluster
SM_CURRENT_INSTANCE_GROUP_HOSTS=["algo-1"]
SM_INSTANCE_GROUPS=["homogeneousCluster"]
SM_INSTANCE_GROUPS_DICT={"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}}
SM_DISTRIBUTION_INSTANCE_GROUPS=[]
SM_IS_HETERO=false
SM_MODULE_NAME=run_training.sh
SM_LOG_LEVEL=20
SM_FRAMEWORK_MODULE=
SM_INPUT_DIR=/opt/ml/input
SM_INPUT_CONFIG_DIR=/opt/ml/input/config
SM_OUTPUT_DIR=/opt/ml/output
SM_NUM_CPUS=8
SM_NUM_GPUS=1
SM_NUM_NEURONS=0
SM_MODEL_DIR=/opt/ml/model
SM_MODULE_DIR=s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz
SM_TRAINING_ENV={"additional_framework_parameters":{},"channel_input_dirs":{"train":"/opt/ml/input/data/train"},"current_host":"algo-1","current_instance_group":"homogeneousCluster","current_instance_group_hosts":["algo-1"],"current_instance_type":"ml.g4dn.2xlarge","distribution_hosts":[],"distribution_instance_groups":[],"framework_module":null,"hosts":["algo-1"],"hyperparameters":{"model_dir":"/opt/training","num_train_steps":"1000","pipeline_config_path":"pipeline.config","sample_1_of_n_eval_examples":"1"},"input_config_dir":"/opt/ml/input/config","input_data_config":{"train":{"RecordWrapperType":"None","S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}},"input_dir":"/opt/ml/input","instance_groups":["homogeneousCluster"],"instance_groups_dict":{"homogeneousCluster":{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}},"is_hetero":false,"is_master":true,"is_modelparallel_enabled":null,"is_smddpmprun_installed":false,"job_name":"tf2-object-detection-2023-04-15-10-29-55-436","log_level":20,"master_hostname":"algo-1","model_dir":"/opt/ml/model","module_dir":"s3://sagemaker-ap-south-1-657101763531/tf2-object-detection-2023-04-15-10-29-55-436/source/sourcedir.tar.gz","module_name":"run_training.sh","network_interface_name":"eth0","num_cpus":8,"num_gpus":1,"num_neurons":0,"output_data_dir":"/opt/ml/output/data","output_dir":"/opt/ml/output","output_intermediate_dir":"/opt/ml/output/intermediate","resource_config":{"current_group_name":"homogeneousCluster","current_host":"algo-1","current_instance_type":"ml.g4dn.2xlarge","hosts":["algo-1"],"instance_groups":[{"hosts":["algo-1"],"instance_group_name":"homogeneousCluster","instance_type":"ml.g4dn.2xlarge"}],"network_interface_name":"eth0"},"user_entry_point":"run_training.sh"}
SM_USER_ARGS=["--model_dir","/opt/training","--num_train_steps","1000","--pipeline_config_path","pipeline.config","--sample_1_of_n_eval_examples","1"]
SM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate
SM_CHANNEL_TRAIN=/opt/ml/input/data/train
SM_HP_MODEL_DIR=/opt/training
SM_HP_NUM_TRAIN_STEPS=1000
SM_HP_PIPELINE_CONFIG_PATH=pipeline.config
SM_HP_SAMPLE_1_OF_N_EVAL_EXAMPLES=1
PYTHONPATH=/opt/ml/code:/usr/local/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages
Invoking script with the following command:
/bin/sh -c "./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1"
2023-04-15 10:33:19,232 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker Debugger as it is not installed.
===TRAINING THE MODEL==
/usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning:
TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP).
For more information see: tensorflow/addons#2807
warnings.warn(
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
I0415 10:33:26.837829 140166345439040 mirrored_strategy.py:374] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:Maybe overwriting train_steps: 1000
I0415 10:33:26.841578 140166345439040 config_util.py:552] Maybe overwriting train_steps: 1000
INFO:tensorflow:Maybe overwriting use_bfloat16: False
I0415 10:33:26.841733 140166345439040 config_util.py:552] Maybe overwriting use_bfloat16: False
I0415 10:33:26.853476 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1
I0415 10:33:26.853606 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88
I0415 10:33:26.853662 140166345439040 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4
I0415 10:33:26.857728 140166345439040 efficientnet_model.py:143] round_filter input=32 output=32
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.903624 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.907572 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.914127 140166345439040 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:33:26.914241 140166345439040 efficientnet_model.py:143] round_filter input=16 output=16
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.936754 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:26.939711 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.006097 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.009125 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.034329 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.037334 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.100979 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.103941 140166345439040 cross_device_ops.py:616] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0415 10:33:27.114144 140166345439040 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:33:27.114249 140166345439040 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:33:27.485301 140166345439040 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:33:27.485446 140166345439040 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:33:27.854993 140166345439040 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:33:27.855136 140166345439040 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:33:28.334065 140166345439040 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:33:28.334210 140166345439040 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:33:28.843916 140166345439040 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:33:28.844068 140166345439040 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:33:29.612781 140166345439040 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:33:29.612927 140166345439040 efficientnet_model.py:143] round_filter input=320 output=320
I0415 10:33:29.859471 140166345439040 efficientnet_model.py:143] round_filter input=1280 output=1280
I0415 10:33:29.914009 140166345439040 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py:563: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
W0415 10:33:29.959440 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py:563: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
INFO:tensorflow:Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
I0415 10:33:30.210091 140166345439040 dataset_builder.py:162] Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
INFO:tensorflow:Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
I0415 10:33:30.228283 140166345439040 dataset_builder.py:79] Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/train.records']
INFO:tensorflow:Number of filenames to read: 1
I0415 10:33:30.228426 140166345439040 dataset_builder.py:80] Number of filenames to read: 1
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
W0415 10:33:30.228483 140166345439040 dataset_builder.py:86] num_readers has been reduced to 1 to match input file shards.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
W0415 10:33:30.238938 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
W0415 10:33:30.255320 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
W0415 10:33:36.161500 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
W0415 10:33:39.770519 140166345439040 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
W0415 10:33:41.886407 140166345439040 module_wrapper.py:149] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
/usr/local/lib/python3.8/dist-packages/keras/backend.py:452: UserWarning: tf.keras.backend.set_learning_phase is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the training argument of the __call__ method of your layer or model.
warnings.warn(
I0415 10:33:56.750935 140143522457344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
I0415 10:34:07.238219 140143522457344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
Traceback (most recent call last):
File "model_main_tf2.py", line 114, in <module>
tf.compat.v1.app.run()
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/platform/app.py", line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "model_main_tf2.py", line 105, in main
model_lib_v2.train_loop(
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 605, in train_loop
load_fine_tune_checkpoint(
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 401, in load_fine_tune_checkpoint
_ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors)
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 176, in _ensure_model_is_built
strategy.run(
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 1316, in run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py", line 2895, in call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/mirrored_strategy.py", line 696, in _call_for_each_replica
return mirrored_run.call_for_each_replica(
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/mirrored_run.py", line 84, in call_for_each_replica
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/execute.py", line 52, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.UnimplementedError: Graph execution error:
Detected at node 'EfficientDet-D1/model/stem_conv2d/Conv2D' defined at (most recent call last):
File "/usr/lib/python3.8/threading.py", line 890, in _bootstrap
self._bootstrap_inner()
File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 171, in _dummy_computation_fn
return _compute_losses_and_predictions_dicts(model, features, labels,
File "/usr/local/lib/python3.8/dist-packages/object_detection/model_lib_v2.py", line 124, in _compute_losses_and_predictions_dicts
prediction_dict = model.predict(
File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 570, in predict
if self._feature_extractor.is_keras_model:
File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 571, in predict
feature_maps = self._feature_extractor(preprocessed_inputs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 558, in call
return super().call(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/object_detection/meta_architectures/ssd_meta_arch.py", line 252, in call
return self._extract_features(inputs)
File "/usr/local/lib/python3.8/dist-packages/object_detection/models/ssd_efficientnet_bifpn_feature_extractor.py", line 234, in _extract_features
base_feature_maps = self._efficientnet(
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/training.py", line 558, in call
return super().call(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 512, in call
return self._run_internal_graph(inputs, training=training, mask=mask)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/functional.py", line 669, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/engine/base_layer.py", line 1145, in call
outputs = call_fn(inputs, *args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
return fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 290, in call
outputs = self.convolution_op(inputs, self.kernel)
File "/usr/local/lib/python3.8/dist-packages/keras/layers/convolutional/base_conv.py", line 262, in convolution_op
return tf.nn.convolution(
Node: 'EfficientDet-D1/model/stem_conv2d/Conv2D'
DNN library is not found.
#11 [[{{node EfficientDet-D1/model/stem_conv2d/Conv2D}}]] [Op:__inference__dummy_computation_fn_30818]
==EVALUATING THE MODEL==
/usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning:
TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP).
For more information see: tensorflow/addons#2807
warnings.warn(
WARNING:tensorflow:Forced number of epochs for all eval validations to be 1.
W0415 10:34:20.197691 139818454525760 model_lib_v2.py:1089] Forced number of epochs for all eval validations to be 1.
INFO:tensorflow:Maybe overwriting sample_1_of_n_eval_examples: None
I0415 10:34:20.197891 139818454525760 config_util.py:552] Maybe overwriting sample_1_of_n_eval_examples: None
INFO:tensorflow:Maybe overwriting use_bfloat16: False
I0415 10:34:20.197963 139818454525760 config_util.py:552] Maybe overwriting use_bfloat16: False
INFO:tensorflow:Maybe overwriting eval_num_epochs: 1
I0415 10:34:20.198034 139818454525760 config_util.py:552] Maybe overwriting eval_num_epochs: 1
WARNING:tensorflow:Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1.
W0415 10:34:20.198128 139818454525760 model_lib_v2.py:1106] Expected number of evaluation epochs is 1, but instead encountered eval_on_train_input_config.num_epochs = 0. Overwriting num_epochs to 1.
I0415 10:34:21.334374 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1
I0415 10:34:21.334518 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88
I0415 10:34:21.334567 139818454525760 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4
I0415 10:34:21.338575 139818454525760 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:21.377283 139818454525760 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:21.377415 139818454525760 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:21.544433 139818454525760 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:21.544581 139818454525760 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:21.815359 139818454525760 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:21.815515 139818454525760 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:22.077441 139818454525760 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:22.077582 139818454525760 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:22.423800 139818454525760 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:22.423947 139818454525760 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:22.765852 139818454525760 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:22.766001 139818454525760 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:23.356628 139818454525760 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:23.356780 139818454525760 efficientnet_model.py:143] round_filter input=320 output=320
I0415 10:34:23.542802 139818454525760 efficientnet_model.py:143] round_filter input=1280 output=1280
I0415 10:34:23.585031 139818454525760 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
INFO:tensorflow:Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
I0415 10:34:23.855814 139818454525760 dataset_builder.py:162] Reading unweighted datasets: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
INFO:tensorflow:Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
I0415 10:34:23.877571 139818454525760 dataset_builder.py:79] Reading record datasets for input file: ['s3://sagemaker-ap-south-1-657101763531/data_1/tfrecords/validation.records']
INFO:tensorflow:Number of filenames to read: 1
I0415 10:34:23.877725 139818454525760 dataset_builder.py:80] Number of filenames to read: 1
WARNING:tensorflow:num_readers has been reduced to 1 to match input file shards.
W0415 10:34:23.877783 139818454525760 dataset_builder.py:86] num_readers has been reduced to 1 to match input file shards.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
W0415 10:34:23.885428 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:100: parallel_interleave (from tensorflow.python.data.experimental.ops.interleave_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.AUTOTUNE) instead. If sloppy execution is desired, use tf.data.Options.deterministic.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
W0415 10:34:23.902438 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/dataset_builder.py:235: DatasetV1.map_with_legacy_function (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.data.Dataset.map()
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
W0415 10:34:27.488649 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a tf.sparse.SparseTensor and use tf.sparse.to_dense instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
W0415 10:34:28.549533 139818454525760 deprecation.py:364] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/util/dispatch.py:1176: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
W0415 10:34:31.064076 139818454525760 module_wrapper.py:149] From /usr/local/lib/python3.8/dist-packages/object_detection/builders/optimizer_builder.py:124: The name tf.keras.optimizers.SGD is deprecated. Please use tf.keras.optimizers.legacy.SGD instead.
INFO:tensorflow:Waiting for new checkpoint at /opt/training
I0415 10:34:31.064445 139818454525760 checkpoint_utils.py:168] Waiting for new checkpoint at /opt/training
INFO:tensorflow:Timed-out waiting for a checkpoint.
I0415 10:34:40.073957 139818454525760 checkpoint_utils.py:231] Timed-out waiting for a checkpoint.
==EXPORTING THE MODEL==
/usr/local/lib/python3.8/dist-packages/tensorflow_addons/utils/tfa_eol_msg.py:23: UserWarning:
TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP).
For more information see: tensorflow/addons#2807
warnings.warn(
I0415 10:34:45.487078 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:150] EfficientDet EfficientNet backbone version: efficientnet-b1
I0415 10:34:45.487244 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:152] EfficientDet BiFPN num filters: 88
I0415 10:34:45.487298 140492014913344 ssd_efficientnet_bifpn_feature_extractor.py:153] EfficientDet BiFPN num iterations: 4
I0415 10:34:45.491338 140492014913344 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:45.531586 140492014913344 efficientnet_model.py:143] round_filter input=32 output=32
I0415 10:34:45.531744 140492014913344 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:45.684208 140492014913344 efficientnet_model.py:143] round_filter input=16 output=16
I0415 10:34:45.684357 140492014913344 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:45.971665 140492014913344 efficientnet_model.py:143] round_filter input=24 output=24
I0415 10:34:45.971818 140492014913344 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:46.260079 140492014913344 efficientnet_model.py:143] round_filter input=40 output=40
I0415 10:34:46.260246 140492014913344 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:46.636266 140492014913344 efficientnet_model.py:143] round_filter input=80 output=80
I0415 10:34:46.636410 140492014913344 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:47.010365 140492014913344 efficientnet_model.py:143] round_filter input=112 output=112
I0415 10:34:47.010508 140492014913344 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:47.476083 140492014913344 efficientnet_model.py:143] round_filter input=192 output=192
I0415 10:34:47.476248 140492014913344 efficientnet_model.py:143] round_filter input=320 output=320
I0415 10:34:47.672554 140492014913344 efficientnet_model.py:143] round_filter input=1280 output=1280
I0415 10:34:47.720135 140492014913344 efficientnet_model.py:453] Building model efficientnet with params ModelConfig(width_coefficient=1.0, depth_coefficient=1.1, resolution=240, dropout_rate=0.2, blocks=(BlockConfig(input_filters=32, output_filters=16, kernel_size=3, num_repeat=1, expand_ratio=1, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=16, output_filters=24, kernel_size=3, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=24, output_filters=40, kernel_size=5, num_repeat=2, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=40, output_filters=80, kernel_size=3, num_repeat=3, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=80, output_filters=112, kernel_size=5, num_repeat=3, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=112, output_filters=192, kernel_size=5, num_repeat=4, expand_ratio=6, strides=(2, 2), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise'), BlockConfig(input_filters=192, output_filters=320, kernel_size=3, num_repeat=1, expand_ratio=6, strides=(1, 1), se_ratio=0.25, id_skip=True, fused_conv=False, conv_type='depthwise')), stem_base_filters=32, top_base_filters=1280, activation='simple_swish', batch_norm='default', bn_momentum=0.99, bn_epsilon=0.001, weight_decay=5e-06, drop_connect_rate=0.2, depth_divisor=8, min_depth=None, use_se=True, input_channels=3, num_classes=1000, model_name='efficientnet', rescale_input=False, data_format='channels_last', dtype='float32')
WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py:458: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.map_fn(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems))
W0415 10:34:48.031827 140492014913344 deprecation.py:641] From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/impl/api.py:458: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with back_prop=False is deprecated and will be removed in a future version.
Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.map_fn(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.map_fn(fn, elems))
I0415 10:34:52.011857 140492014913344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
I0415 10:35:01.855062 140492014913344 api.py:459] feature_map_spatial_dims: [(80, 80), (40, 40), (20, 20), (10, 10), (5, 5)]
Traceback (most recent call last):
File "exporter_main_v2.py", line 164, in <module>
app.run(main)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "exporter_main_v2.py", line 157, in main
exporter_lib_v2.export_inference_graph(
File "/usr/local/lib/python3.8/dist-packages/object_detection/exporter_lib_v2.py", line 271, in export_inference_graph
status.assert_existing_objects_matched()
File "/usr/local/lib/python3.8/dist-packages/tensorflow/python/checkpoint/checkpoint.py", line 955, in assert_existing_objects_matched
raise AssertionError(
AssertionError: No checkpoint specified (save_path=None); nothing is being restored.
mv: cannot stat '/tmp/exported/saved_model': No such file or directory
2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR Reporting training FAILURE
2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/bin/sh -c ./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1"
2023-04-15 10:35:05,585 sagemaker-training-toolkit ERROR Encountered exit_code 1

2023-04-15 10:35:22 Uploading - Uploading generated training model
2023-04-15 10:35:22 Failed - Training job failed

UnexpectedStatusException Traceback (most recent call last)
/tmp/ipykernel_10180/1459479382.py in <cell line: 1>()
----> 1 estimator.fit(inputs)

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/workflow/pipeline_context.py in wrapper(*args, **kwargs)
270 return _StepArguments(retrieve_caller_name(self_instance), run_func, *args, **kwargs)
271
--> 272 return run_func(*args, **kwargs)
273
274 return wrapper

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
1161 self.jobs.append(self.latest_training_job)
1162 if wait:
-> 1163 self.latest_training_job.wait(logs=logs)
1164
1165 def _compilation_job_name(self):

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/estimator.py in wait(self, logs)
2309 # If logs are requested, call logs_for_jobs.
2310 if logs != "None":
-> 2311 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
2312 else:
2313 self.sagemaker_session.wait_for_job(self.job_name)

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type)
4174
4175 if wait:
-> 4176 self._check_job_status(job_name, description, "TrainingJobStatus")
4177 if dot:
4178 print()

~/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
3705 actual_status=status,
3706 )
-> 3707 raise exceptions.UnexpectedStatusException(
3708 message=message,
3709 allowed_statuses=["Completed", "Stopped"],

UnexpectedStatusException: Error for Training job tf2-object-detection-2023-04-15-10-29-55-436: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/bin/sh -c ./run_training.sh --model_dir /opt/training --num_train_steps 1000 --pipeline_config_path pipeline.config --sample_1_of_n_eval_examples 1", exit code: 1

I am not able to understand this error. Please help me figure this out.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions