
TensorFlow raises a KeyError on the model_fn features parameter #992

@DavidJohn197749

Description


Please fill out the form below.

System Information

  • Framework: TensorFlow
  • Framework Version: 1.12.0
  • Python Version: 2
  • CPU or GPU: CPU
  • Python SDK Version: 1.25.0
  • Are you using a custom image: no

Describe the problem

I am using SageMaker to train a TensorFlow model. At the beginning everything is fine, but after the first evaluation I get a KeyError on one of the feature keys.
Here is my code:

import glob
import os

import numpy as np
import tensorflow as tf

def train_input_fn(training_dir, hyperparameters):
    print("======================================================================================")
    print(training_dir)
    #return _input_fn(training_dir,batch_size=hyperparameters['batch_size'],shuffle_and_repeat=True)
    return _input_fn("/opt/ml/input/data/train",batch_size=8,shuffle_and_repeat=True)
def eval_input_fn(training_dir, hyperparameters):
    print("--------------------------------------------------------------------------------------")
    print(training_dir)
    #return _input_fn(training_dir,batch_size=hyperparameters['batch_size'])
    return _input_fn("/opt/ml/input/data/eval",batch_size=32)

def serving_input_fn(hyperparameters):
    input_content_ids = tf.placeholder(shape=[None,None],dtype=tf.int32)
    input_content_len = tf.placeholder(shape=[None],dtype=tf.int32)

    input_title_ids = tf.placeholder(shape=[None,None],dtype=tf.int32)
    input_title_len = tf.placeholder(shape=[None],dtype=tf.int32)

    feature = {
        "words":input_content_ids,
        "words_len":input_content_len,
        "titles":input_title_ids,
        "titles_len":input_title_len
    }
    return tf.estimator.export.build_raw_serving_input_receiver_fn(features=feature)()
    #return tf.estimator.export.build_raw_serving_input_receiver_fn(features ={"inputs":(tensor_1,tensor_2)} )()
    #return tf.estimator.export.ServingInputReceiver((tensor_1,tensor_2),)
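
For reference, the features dict handed to tf.estimator.export.build_raw_serving_input_receiver_fn is exactly what model_fn receives in PREDICT mode when the estimator is exported, so its keys have to match whatever model_fn indexes. Below is a minimal sketch of a variant whose keys mirror the training-time feature names (purely illustrative; it assumes those are the names the model should also see at serving time):

def serving_input_fn_matched(hyperparameters):
    # Hypothetical variant for comparison: placeholders keyed by the same names
    # that model_fn reads ("content_input_ids", ...), so PREDICT-mode lookups
    # would not raise a KeyError.
    feature = {
        "content_input_ids": tf.placeholder(shape=[None, None], dtype=tf.int32),
        "content_input_len": tf.placeholder(shape=[None], dtype=tf.int32),
        "tag_input_ids": tf.placeholder(shape=[None, None], dtype=tf.int32),
        "tag_input_len": tf.placeholder(shape=[None], dtype=tf.int32),
    }
    return tf.estimator.export.build_raw_serving_input_receiver_fn(features=feature)()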

def _input_fn(filename_dir,shuffle_size=15000,epochs=2,batch_size=20,skip_header_lines=1, shuffle_and_repeat=False):
    def parse_file(row):
        row_ = tf.parse_single_example(row,features={
                "content_input_ids":tf.VarLenFeature(tf.int64),
                "content_input_len":tf.FixedLenFeature([],tf.int64),
                "tag_input_ids":tf.VarLenFeature(tf.int64),
                "tag_input_len":tf.FixedLenFeature([],tf.int64),
                "binary_output":tf.FixedLenFeature([],tf.int64),
        })
        content = tf.sparse_tensor_to_dense(row_["content_input_ids"])
        content = tf.cast(content,tf.int32)
        content_len = row_["content_input_len"]
        content_len = tf.cast(content_len,tf.int32)
        
        tag = tf.sparse_tensor_to_dense(row_["tag_input_ids"])
        tag = tf.cast(tag,tf.int32)
        tag_len = row_["tag_input_len"]
        tag_len = tf.cast(tag_len,tf.int32)
        
        label = row_["binary_output"]
        label = tf.cast(label,tf.int32)
        return {"content_input_ids":content,"content_input_len":content_len,"tag_input_ids":tag,"tag_input_len":tag_len},label
    ext = "*.tfrecord*"
    all_tf_files = [file for path,subdir,files in os.walk(filename_dir) for file in glob.glob(os.path.join(path,ext))]
    dataset = tf.data.TFRecordDataset(all_tf_files)
    dataset = dataset.map(parse_file)
    dataset = dataset.shuffle(shuffle_size).repeat(epochs)
    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes=(
        {
            "content_input_ids":[None],
            "content_input_len":[],
            "tag_input_ids":[None],
            "tag_input_len":[],
        },[]
        ),
        padding_values=(
        {
            "content_input_ids":0,
            "content_input_len":0,
            "tag_input_ids":0,
            "tag_input_len":0
        },0
        )
    )
    return dataset
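
To double-check what this pipeline emits, the padded batch can be inspected locally with a TF 1.x one-shot iterator. A minimal sketch, assuming a directory of *.tfrecord files is available at some local path:

def inspect_one_batch(filename_dir):
    # Diagnostic sketch: pull a single padded batch from _input_fn and print the
    # feature keys and shapes that model_fn will receive in TRAIN/EVAL mode.
    dataset = _input_fn(filename_dir, batch_size=2)
    features, label = dataset.make_one_shot_iterator().get_next()
    with tf.Session() as sess:
        feature_vals, label_vals = sess.run([features, label])
    for key, value in sorted(feature_vals.items()):
        print(key, value.shape)
    print("label", label_vals.shape)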

def model_fn(features, labels, mode, hyperparameters):
    SAGEMAKER_DATA_PATH = '/opt/ml/input/data'
    inside_sagemaker_container = os.path.exists(SAGEMAKER_DATA_PATH)
    base_dir = SAGEMAKER_DATA_PATH if inside_sagemaker_container else 'data'
    
    input_ids_content = features["content_input_ids"]
    input_ids_content_len = features["content_input_len"]
    input_ids_tag = features["tag_input_ids"]
    input_ids_tag_len = features["tag_input_len"]
    
    label_binary_output = labels
    dropout = 0.5
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    #lstm_size = hyperparameters['lstm_size']
    lstm_size = 300
    #numtags = hyperparameters['num_tags']

    embedding_dir = os.path.join(base_dir,'embedding','bpemb_model.npz')
    def loadPretrainEmbeddingMatrixi(filename_dir):
        embedding = np.load(filename_dir)["embeddings"]
        return embedding

    embedding = loadPretrainEmbeddingMatrixi(embedding_dir)

    preEmbedding = tf.get_variable(initializer=embedding,dtype=tf.float32,trainable=False,name="embedding")
    content_embedding = tf.nn.embedding_lookup(preEmbedding,input_ids_content)
    tag_embedding = tf.nn.embedding_lookup(preEmbedding,input_ids_tag)
    
    hidden_dim = 300
    numtags = 4
    
    content_fw_cell = tf.nn.rnn_cell.GRUCell(hidden_dim,name="content_fw_cell")
    content_bw_cell = tf.nn.rnn_cell.GRUCell(hidden_dim,name="content_bw_cell")
    content_outputs,content_outputs_states = tf.nn.bidirectional_dynamic_rnn(content_fw_cell,content_bw_cell,content_embedding,sequence_length=input_ids_content_len,dtype=tf.float32)
    content_fw_output,content_bw_output = content_outputs
    content_outputs = tf.concat([content_fw_output,content_bw_output],-1)
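
A small diagnostic that can be called as the first line of model_fn to make key mismatches visible in the job log is sketched below (hypothetical helper; in TRAIN/EVAL the features come from _input_fn, while in PREDICT/export they come from serving_input_fn):

def log_feature_keys(features, mode):
    # Hypothetical diagnostic helper: log the mode and the feature keys that
    # model_fn actually received.
    tf.logging.info("model_fn mode=%s feature keys=%s", mode, sorted(features.keys()))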

Minimal repro / logs

2019-08-22 03:49:06,047 WARNING - tensorflow - From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
2019-08-22 03:49:06,075 INFO - tensorflow - Calling model_fn.
2019-08-22 03:49:12,073 INFO - tensorflow - Done calling model_fn.
2019-08-22 03:49:12,074 INFO - tensorflow - Create CheckpointSaverHook.
2019-08-22 03:49:18,854 INFO - tensorflow - Graph was finalized.
2019-08-22 03:49:20,313 INFO - tensorflow - Running local_init_op.
2019-08-22 03:49:20,334 INFO - tensorflow - Done running local_init_op.
2019-08-22 03:50:12,586 INFO - tensorflow - Saving checkpoints for 0 into s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt.
2019-08-22 03:50:27,582 INFO - tensorflow - loss = 0.6956065032933187, step = 1
2019-08-22 03:55:23,700 INFO - tensorflow - Saving checkpoints for 73 into s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt.
--------------------------------------------------------------------------------------
None
2019-08-22 03:55:37,164 INFO - tensorflow - Calling model_fn.
2019-08-22 03:55:41,287 INFO - tensorflow - Done calling model_fn.
2019-08-22 03:55:41,306 INFO - tensorflow - Starting evaluation at 2019-08-22-03:55:41
2019-08-22 03:55:41,498 INFO - tensorflow - Graph was finalized.
2019-08-22 03:55:41,553 INFO - tensorflow - Restoring parameters from s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt-73
2019-08-22 03:55:44,259 INFO - tensorflow - Running local_init_op.
2019-08-22 03:55:44,280 INFO - tensorflow - Done running local_init_op.
2019-08-22 03:55:51,946 INFO - tensorflow - Evaluation [10/100]
2019-08-22 03:55:59,165 INFO - tensorflow - Evaluation [20/100]
2019-08-22 03:56:07,046 INFO - tensorflow - Evaluation [30/100]
2019-08-22 03:56:14,048 INFO - tensorflow - Evaluation [40/100]
2019-08-22 03:56:21,118 INFO - tensorflow - Evaluation [50/100]
2019-08-22 03:56:29,392 INFO - tensorflow - Evaluation [60/100]
2019-08-22 03:56:36,941 INFO - tensorflow - Evaluation [70/100]
2019-08-22 03:56:43,949 INFO - tensorflow - Evaluation [80/100]
2019-08-22 03:56:51,402 INFO - tensorflow - Evaluation [90/100]
2019-08-22 03:57:13 Uploading - Uploading generated training model
2019-08-22 03:56:58,415 INFO - tensorflow - Evaluation [100/100]
2019-08-22 03:56:58,508 INFO - tensorflow - Finished evaluation at 2019-08-22-03:56:58
2019-08-22 03:56:58,508 INFO - tensorflow - Saving dict for global step 73: accuracy = 0.4940625, f1 = 0.6608076, global_step = 73, loss = 0.69346565, precision = 0.4936265, recall = 0.98100066
2019-08-22 03:57:08,532 INFO - tensorflow - Saving 'checkpoint_path' summary for global step 73: s3://hket04.deeplearningtest01/temp/lstm_binary_data/lstm/artifacts/model/sagemaker-tensorflow-2019-08-22-03-46-38-535/checkpoints/model.ckpt-73
2019-08-22 03:57:11,818 INFO - tensorflow - Calling model_fn.
2019-08-22 03:57:11,871 ERROR - container_support.training - uncaught exception during training: 'content_input_ids'
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/container_support/training.py", line 36, in start
    fw.train()
  File "/usr/local/lib/python2.7/dist-packages/tf_container/train_entry_point.py", line 177, in train
    train_wrapper.train()
  File "/usr/local/lib/python2.7/dist-packages/tf_container/trainer.py", line 73, in train
    tf.estimator.train_and_evaluate(estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 471, in train_and_evaluate
    return executor.run()
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 637, in run
    getattr(self, task_to_run)()
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 674, in run_master
    self._start_distributed_training(saving_listeners=saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 788, in _start_distributed_training
    saving_listeners=saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 354, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1209, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1243, in _train_model_default
    saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1473, in _train_with_estimator_spec
    _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 671, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1156, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1255, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1240, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1320, in run
    run_metadata=run_metadata))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 582, in after_run
    if self._save(run_context.session, global_step):
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 607, in _save
    if l.after_save(session, step):
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 517, in after_save
    self._evaluate(global_step_value)  # updates self.eval_result
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 537, in _evaluate
    self._evaluator.evaluate_and_export())
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 924, in evaluate_and_export
    is_the_final_export)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 957, in _export_eval_result
    is_the_final_export=is_the_final_export))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/exporter.py", line 472, in export
    is_the_final_export)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/exporter.py", line 126, in export
    strip_default_attrs=self._strip_default_attrs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 663, in export_savedmodel
    mode=model_fn_lib.ModeKeys.PREDICT)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 789, in _export_saved_model_for_mode
    strip_default_attrs=strip_default_attrs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 909, in _export_all_saved_models
    mode=model_fn_lib.ModeKeys.PREDICT)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 986, in _add_meta_graph_for_mode
    config=self.config)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1197, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tf_container/trainer.py", line 108, in _model_fn
    return self.customer_script.model_fn(features, labels, mode, params)
  File "/opt/ml/code/tf-train-aws-2.py", line 299, in model_fn
    input_ids_content = features["content_input_ids"]
KeyError: 'content_input_ids'
  • Exact command to reproduce:
