Skip to content
This repository was archived by the owner on Jul 26, 2019. It is now read-only.
This repository was archived by the owner on Jul 26, 2019. It is now read-only.

loss does not decrease during training #17

@ChiuHsin

Description

@ChiuHsin

Hello, I tried to simplify your code for an NER task. I built the model as follows:

 def load_model(self):
        """Build the encoder and the two Keras models that share its layers.

        Sets ``self.encoder`` (the ``create_transformer`` result passed
        through ``load_google_bert``), ``self.train_model`` (encoder inputs
        plus target and mask inputs, output is the value computed by
        ``masked_classification_loss``) and ``self.model`` (encoder inputs
        mapped to the per-timestep logits).
        """
        # Transformer hyper-parameters; the architecture sizes and dropout
        # rates are read from self.bert_config, the rest from self.config.
        transformer_kwargs = dict(
            embedding_layer_norm=True,
            neg_inf=-10000.0,
            use_attn_mask=self.config.use_attn_mask,
            vocab_size=self.bert_config.vocab_size,
            accurate_gelu=True,
            layer_norm_epsilon=1e-12,
            max_len=self.config.max_len,
            use_one_embedding_dropout=True,
            d_hid=self.bert_config.intermediate_size,
            embedding_dim=self.bert_config.hidden_size,
            num_layers=self.bert_config.num_hidden_layers,
            num_heads=self.bert_config.num_attention_heads,
            residual_dropout=self.bert_config.hidden_dropout_prob,
            attention_dropout=self.bert_config.attention_probs_dropout_prob,
        )
        self.encoder = create_transformer(**transformer_kwargs)

        # Hand the freshly built encoder to load_google_bert and keep
        # whatever it returns as the encoder.
        self.encoder = load_google_bert(
            self.encoder,
            self.bert_config.vocab_size,
            self.config.bert_dir_path,
            self.config.max_len,
            self.config.verbose,
        )

        # Classification head: dropout on the first encoder output, then the
        # same Dense layer applied at every timestep.
        class_projection = Dense(units=self.config.num_classes)
        per_step_projection = TimeDistributed(class_projection)
        dropped_features = Dropout(self.config.dropout)(self.encoder.outputs[0])
        token_logits = per_step_projection(dropped_features)

        # Extra inputs that only the training graph consumes.
        target_input = Input(batch_shape=(None, self.config.max_len,), dtype='int32')
        mask_input = Input(batch_shape=(None, self.config.max_len), dtype='int32')
        loss_layer = Lambda(lambda t: masked_classification_loss(t[0], t[1], t[2]))
        loss_output = loss_layer([target_input, token_logits, mask_input])

        # Both models are built on the same layers, so training
        # self.train_model also updates what self.model predicts with.
        training_inputs = self.encoder.inputs + [target_input, mask_input]
        self.train_model = Model(inputs=training_inputs, outputs=loss_output)
        self.model = Model(inputs=self.encoder.inputs, outputs=token_logits)

    def compile(self, *args, **kwargs):
        """Compile ``self.train_model`` with ``pass_through_loss`` as its loss.

        Positional and keyword arguments are forwarded unchanged to the
        training model's ``compile``; its return value is returned as-is.
        """
        trainer = self.train_model
        return trainer.compile(*args, loss=pass_through_loss, **kwargs)

Then I train the model with:

# Build the model wrapper and compile its training graph.
model = XXXX(config)
# NOTE(review): the string 'adam' selects Keras' default Adam settings
# (learning rate 1e-3). BERT fine-tuning is usually run with a much smaller
# rate (around 2e-5 to 5e-5) -- confirm whether this explains the loss not
# decreasing.
model.compile(optimizer='adam')

# Stop once val_loss has not improved for 10 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)

# Write weights only, and only when val_loss improves.
checkpoint = ModelCheckpoint(
    os.path.join(config.dir_output, 'best-weights.h5'),
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

# Train the loss-producing model; model.model shares its layers.
model.train_model.fit_generator(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=dev_generator,
    validation_steps=dev_steps,
    verbose=1,
    callbacks=[earlystop, checkpoint],
    shuffle=False,
    epochs=100,
)
```.

In addition, I modified the function `load_google_bert` and commented out the line
 `weights[w_id][vocab_size + TextEncoder.EOS_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]`
because the variable `TextEncoder.BERT_SPECIAL_COUNT` is 4 instead of 5,
so the created model does not have that many weights.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions