-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Closed
Description
Please fill out the form below.
System Information
- Framework :TensorFlow
- Framework Version: 1.12.0
- Python Version: 2
- CPU or GPU: CPU
- Python SDK Version: 1.25.0
- Are you using a custom image: no
Describe the problem
I am using sagemaker to train the tensorflow model. At the beginning, everything is ok, but It get the key error of feature after the first evaluation.
Here is my code:
def train_input_fn(training_dir, hyperparameters):
print("======================================================================================")
print(training_dir)
#return _input_fn(training_dir,batch_size=hyperparameters['batch_size'],shuffle_and_repeat=True)
return _input_fn("/opt/ml/input/data/train",batch_size=8,shuffle_and_repeat=True)
def eval_input_fn(training_dir, hyperparameters):
print("--------------------------------------------------------------------------------------")
print(training_dir)
#return _input_fn(training_dir,batch_size=hyperparameters['batch_size'])
return _input_fn("/opt/ml/input/data/eval",batch_size=32)
def serving_input_fn(hyperparameters):
input_content_ids = tf.placeholder(shape=[None,None],dtype=tf.int32)
input_content_len = tf.placeholder(shape=[None],dtype=tf.int32)
input_title_ids = tf.placeholder(shape=[None,None],dtype=tf.int32)
input_title_len = tf.placeholder(shape=[None],dtype=tf.int32)
feature = {
"words":input_content_ids,
"words_len":input_content_len,
"titles":input_title_ids,
"titles_len":input_title_len
}
return tf.estimator.export.build_raw_serving_input_receiver_fn(features=feature)()
#return tf.estimator.export.build_raw_serving_input_receiver_fn(features ={"inputs":(tensor_1,tensor_2)} )()
#return tf.estimator.export.ServingInputReceiver((tensor_1,tensor_2),)
def _input_fn(filename_dir,shuffle_size=15000,epochs=2,batch_size=20,skip_header_lines=1, shuffle_and_repeat=False):
def parse_file(row):
row_ = tf.parse_single_example(row,features={
"content_input_ids":tf.VarLenFeature(tf.int64),
"content_input_len":tf.FixedLenFeature([],tf.int64),
"tag_input_ids":tf.VarLenFeature(tf.int64),
"tag_input_len":tf.FixedLenFeature([],tf.int64),
"binary_output":tf.FixedLenFeature([],tf.int64),
})
content = tf.sparse_tensor_to_dense(row_["content_input_ids"])
content = tf.cast(content,tf.int32)
content_len = row_["content_input_len"]
content_len = tf.cast(content_len,tf.int32)
tag = tf.sparse_tensor_to_dense(row_["tag_input_ids"])
tag = tf.cast(tag,tf.int32)
tag_len = row_["tag_input_len"]
tag_len = tf.cast(tag_len,tf.int32)
label = row_["binary_output"]
label = tf.cast(label,tf.int32)
return {"content_input_ids":content,"content_input_len":content_len,"tag_input_ids":tag,"tag_input_len":tag_len},label
ext = "*.tfrecord*"
all_tf_files = [file for path,subdir,files in os.walk(filename_dir) for file in glob.glob(os.path.join(path,ext))]
dataset = tf.data.TFRecordDataset(all_tf_files)
dataset = dataset.map(parse_file)
dataset = dataset.shuffle(shuffle_size).repeat(epochs)
dataset = dataset.padded_batch(
batch_size=batch_size,
padded_shapes=(
{
"content_input_ids":[None],
"content_input_len":[],
"tag_input_ids":[None],
"tag_input_len":[],
},[]
),
padding_values=(
{
"content_input_ids":0,
"content_input_len":0,
"tag_input_ids":0,
"tag_input_len":0
},0
)
)
return dataset
def model_fn(features, labels, mode, hyperparameters):
SAGEMAKER_DATA_PATH = '/opt/ml/input/data'
inside_sagemaker_container = os.path.exists(SAGEMAKER_DATA_PATH)
base_dir = SAGEMAKER_DATA_PATH if inside_sagemaker_container else 'data'
input_ids_content = features["content_input_ids"]
input_ids_content_len = features["content_input_len"]
input_ids_tag = features["tag_input_ids"]
input_ids_tag_len = features["tag_input_len"]
label_binary_output = labels
dropout = 0.5
training = (mode == tf.estimator.ModeKeys.TRAIN)
#lstm_size = hyperparameters['lstm_size']
lstm_size = 300
#numtags = hyperparameters['num_tags']
embedding_dir = os.path.join(base_dir,'embedding','bpemb_model.npz')
def loadPretrainEmbeddingMatrixi(filename_dir):
embedding = np.load(filename_dir)["embeddings"]
return embedding
embedding = loadPretrainEmbeddingMatrixi(embedding_dir)
preEmbedding = tf.get_variable(initializer=embedding,dtype=tf.float32,trainable=False,name="embedding")
content_embedding = tf.nn.embedding_lookup(preEmbedding,input_ids_content)
tag_embedding = tf.nn.embedding_lookup(preEmbedding,input_ids_tag)
hidden_dim = 300
numtags = 4
content_fw_cell = tf.nn.rnn_cell.GRUCell(hidden_dim,name="content_fw_cell")
content_bw_cell = tf.nn.rnn_cell.GRUCell(hidden_dim,name="content_bw_cell")
content_outputs,content_outputs_states = tf.nn.bidirectional_dynamic_rnn(content_fw_cell,content_bw_cell,content_embedding,sequence_length=input_ids_content_len,dtype=tf.float32)
content_fw_output,content_bw_output = content_outputs
content_outputs = tf.concat([content_fw_output,content_bw_output],-1)
Minimal repro / logs
From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
2019-08-22 03:49:06,047 WARNING - tensorflow - From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
Calling model_fn.
2019-08-22 03:49:06,075 INFO - tensorflow - Calling model_fn.
Done calling model_fn.
2019-08-22 03:49:12,073 INFO - tensorflow - Done calling model_fn.
Create CheckpointSaverHook.
2019-08-22 03:49:12,074 INFO - tensorflow - Create CheckpointSaverHook.
Graph was finalized.
2019-08-22 03:49:18,854 INFO - tensorflow - Graph was finalized.
Running local_init_op.
2019-08-22 03:49:20,313 INFO - tensorflow - Running local_init_op.
Done running local_init_op.
2019-08-22 03:49:20,334 INFO - tensorflow - Done running local_init_op.
Saving checkpoints for 0 into s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt.
2019-08-22 03:50:12,586 INFO - tensorflow - Saving checkpoints for 0 into s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt.
loss = 0.6956065032933187, step = 1
2019-08-22 03:50:27,582 INFO - tensorflow - loss = 0.6956065032933187, step = 1
Saving checkpoints for 73 into s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt.
2019-08-22 03:55:23,700 INFO - tensorflow - Saving checkpoints for 73 into s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt.
--------------------------------------------------------------------------------------
None
Calling model_fn.
2019-08-22 03:55:37,164 INFO - tensorflow - Calling model_fn.
Done calling model_fn.
2019-08-22 03:55:41,287 INFO - tensorflow - Done calling model_fn.
Starting evaluation at 2019-08-22-03:55:41
2019-08-22 03:55:41,306 INFO - tensorflow - Starting evaluation at 2019-08-22-03:55:41
Graph was finalized.
2019-08-22 03:55:41,498 INFO - tensorflow - Graph was finalized.
Restoring parameters from s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt-73
2019-08-22 03:55:41,553 INFO - tensorflow - Restoring parameters from s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt-73
Running local_init_op.
2019-08-22 03:55:44,259 INFO - tensorflow - Running local_init_op.
Done running local_init_op.
2019-08-22 03:55:44,280 INFO - tensorflow - Done running local_init_op.
Evaluation [10/100]
2019-08-22 03:55:51,946 INFO - tensorflow - Evaluation [10/100]
Evaluation [20/100]
2019-08-22 03:55:59,165 INFO - tensorflow - Evaluation [20/100]
Evaluation [30/100]
2019-08-22 03:56:07,046 INFO - tensorflow - Evaluation [30/100]
Evaluation [40/100]
2019-08-22 03:56:14,048 INFO - tensorflow - Evaluation [40/100]
Evaluation [50/100]
2019-08-22 03:56:21,118 INFO - tensorflow - Evaluation [50/100]
Evaluation [60/100]
2019-08-22 03:56:29,392 INFO - tensorflow - Evaluation [60/100]
Evaluation [70/100]
2019-08-22 03:56:36,941 INFO - tensorflow - Evaluation [70/100]
Evaluation [80/100]
2019-08-22 03:56:43,949 INFO - tensorflow - Evaluation [80/100]
Evaluation [90/100]
2019-08-22 03:56:51,402 INFO - tensorflow - Evaluation [90/100]
2019-08-22 03:57:13 Uploading - Uploading generated training modelEvaluation [100/100]
2019-08-22 03:56:58,415 INFO - tensorflow - Evaluation [100/100]
Finished evaluation at 2019-08-22-03:56:58
2019-08-22 03:56:58,508 INFO - tensorflow - Finished evaluation at 2019-08-22-03:56:58
Saving dict for global step 73: accuracy = 0.4940625, f1 = 0.6608076, global_step = 73, loss = 0.69346565, precision = 0.4936265, recall = 0.98100066
2019-08-22 03:56:58,508 INFO - tensorflow - Saving dict for global step 73: accuracy = 0.4940625, f1 = 0.6608076, global_step = 73, loss = 0.69346565, precision = 0.4936265, recall = 0.98100066
Saving 'checkpoint_path' summary for global step 73: s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt-73
2019-08-22 03:57:08,532 INFO - tensorflow - Saving 'checkpoint_path' summary for global step 73: s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt-73
Calling model_fn.
2019-08-22 03:57:11,818 INFO - tensorflow - Calling model_fn.
2019-08-22 03:57:11,871 ERROR - container_support.training - uncaught exception during training: 'content_input_ids'
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
fw.train()
File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 177, in train
train_wrapper.train()
File "/usr/local/lib/python2.7/dist-packages/tf_container/trainer.py", line 73, in train
tf.estimator.train_and_evaluate(estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 471, in train_and_evaluate
return executor.run()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 637, in run
getattr(self, task_to_run)()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 674, in run_master
self._start_distributed_training(saving_listeners=saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 788, in _start_distributed_training
saving_listeners=saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 354, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1209, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1243, in _train_model_default
saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1473, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 671, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1156, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1255, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1240, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1320, in run
run_metadata=run_metadata))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 582, in after_run
if self._save(run_context.session, global_step):
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 607, in _save
if l.after_save(session, step):
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 517, in after_save
self._evaluate(global_step_value) # updates self.eval_result
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 537, in _evaluate
self._evaluator.evaluate_and_export())
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 924, in evaluate_and_export
is_the_final_export)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 957, in _export_eval_result
is_the_final_export=is_the_final_export))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/exporter.py", line 472, in export
is_the_final_export)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/exporter.py", line 126, in export
strip_default_attrs=self._strip_default_attrs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 663, in export_savedmodel
mode=model_fn_lib.ModeKeys.PREDICT)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 789, in _export_saved_model_for_mode
strip_default_attrs=strip_default_attrs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 909, in _export_all_saved_models
mode=model_fn_lib.ModeKeys.PREDICT)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 986, in _add_meta_graph_for_mode
config=self.config)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1197, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tf_container/trainer.py", line 108, in _model_fn
return self.customer_script.model_fn(features, labels, mode, params)
File "/opt/ml/code/tf-train-aws-2.py", line 299, in model_fn
input_ids_content = features["content_input_ids"]
KeyError: 'content_input_ids'
- Exact command to reproduce:
Reactions are currently unavailable