diff --git a/setup.py b/setup.py
index 78ab795..79188b1 100644
--- a/setup.py
+++ b/setup.py
@@ -3,9 +3,15 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
+with open("src/NERDA/about.py") as f:
+    v = f.read()
+    for l in v.split("\n"):
+        if l.startswith("__version__"):
+            __version__ = l.split('"')[-2]
+
 setuptools.setup(
-    name="NERDA",
-    version="1.0.0",
+    name="NERDA",
+    version=__version__,
     author="Lars Kjeldgaard, Lukas Christian Nielsen",
     author_email="lars.kjeldgaard@eb.dk",
     description="A Framework for Finetuning Transformers for Named-Entity Recognition",
diff --git a/src/NERDA/__init__.py b/src/NERDA/__init__.py
index 2d7ded4..ef19635 100644
--- a/src/NERDA/__init__.py
+++ b/src/NERDA/__init__.py
@@ -1 +1,2 @@
-import NERDA
\ No newline at end of file
+import NERDA
+from .about import __version__, __title__
diff --git a/src/NERDA/about.py b/src/NERDA/about.py
new file mode 100644
index 0000000..be09982
--- /dev/null
+++ b/src/NERDA/about.py
@@ -0,0 +1,2 @@
+__title__ = "NERDA"
+__version__ = "1.0.1"  # the ONLY source of version I
diff --git a/src/NERDA/models.py b/src/NERDA/models.py
index 00dc3c4..ffa3c5a 100644
--- a/src/NERDA/models.py
+++ b/src/NERDA/models.py
@@ -163,6 +163,7 @@ def __init__(self,
         self.dataset_training = dataset_training
         self.dataset_validation = dataset_validation
         self.hyperparameters = hyperparameters
+        self.tokenizer_parameters = tokenizer_parameters
         self.tag_outside = tag_outside
         self.tag_scheme = tag_scheme
         tag_complete = [tag_outside] + tag_scheme
@@ -214,7 +215,7 @@ def train(self) -> str:
 
         return "Model trained successfully"
 
-    def load_network_from_file(self, model_path = "model.bin") -> str:
+    def load_network_from_file(self, model_path = "model.bin", tokenizer_path = "./tokenizer/") -> str:
         """Load Pretrained NERDA Network from file
 
         Loads weights for a pretrained NERDA Network from file.
@@ -230,10 +231,17 @@ def load_network_from_file(self, model_path = "model.bin") -> str:
         # TODO: change assert to Raise.
         assert os.path.exists(model_path), "File does not exist. You can download network with download_network()"
         self.network.load_state_dict(torch.load(model_path, map_location = torch.device(self.device)))
+
+        if(os.path.exists(tokenizer_path)):
+            self.transformer_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+        else:
+            self.transformer_tokenizer = AutoTokenizer.from_pretrained(
+                self.transformer, **self.tokenizer_parameters)
+
         self.network.device = self.device
         return f'Weights for network loaded from {model_path}'
 
-    def save_network(self, model_path:str = "model.bin") -> None:
+    def save_network(self, output_dir:str = "./output_dir") -> None:
         """Save Weights of NERDA Network
 
         Saves weights for a fine-tuned NERDA Network to file.
@@ -245,8 +253,12 @@
         Returns:
             Nothing. Saves model to file as a side-effect.
         """
-        torch.save(self.network.state_dict(), model_path)
-        print(f"Network written to file {model_path}")
+        if(not os.path.exists(output_dir)):
+            os.makedirs(os.path.join(output_dir, "tokenizer"))
+
+        torch.save(self.network.state_dict(), os.path.join(output_dir, "model.bin"))
+        self.transformer_tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))
+        print(f"Network written to file {output_dir}")
 
     def quantize(self):
         """Apply dynamic quantization to increase performance.
@@ -387,18 +399,18 @@ def evaluate_performance(self, dataset: dict,
                                  'F1-Score': [f1_micro[2]],
                                  'Precision': [np.nan],
                                  'Recall': [np.nan]})
-        df = df.append(f1_micro)
+        df = pd.concat([df, f1_micro])
 
         # compute MACRO-averaged F1-scores and add to table.
         f1_macro = compute_f1_scores(y_pred = tags_predicted,
                                      y_true = dataset.get('tags'),
                                      labels = self.tag_scheme,
                                      average = 'macro')
-        f1_macro = pd.DataFrame({'Level' : ['AVG_MICRO'],
+        f1_macro = pd.DataFrame({'Level' : ['AVG_MACRO'],
                                  'F1-Score': [f1_macro[2]],
                                  'Precision': [np.nan],
                                  'Recall': [np.nan]})
-        df = df.append(f1_macro)
+        df = pd.concat([df, f1_macro])
 
         # compute and return accuracy if desired
         if return_accuracy:
diff --git a/src/NERDA/preprocessing.py b/src/NERDA/preprocessing.py
index 4dd7dae..8081c4a 100644
--- a/src/NERDA/preprocessing.py
+++ b/src/NERDA/preprocessing.py
@@ -99,7 +99,11 @@ def __getitem__(self, item):
         # compute padding length
         if self.pad_sequences:
             padding_len = self.max_len - len(input_ids)
-            input_ids = input_ids + ([self.pad_token_id] * padding_len)
+            if self.pad_token_id == None:
+                input_ids = input_ids + ([0] * padding_len)
+            else:
+                input_ids = input_ids + ([self.pad_token_id] * padding_len)
+            #input_ids = input_ids + ([self.pad_token_id] * padding_len)
             masks = masks + ([0] * padding_len)
             offsets = offsets + ([0] * padding_len)
             token_type_ids = token_type_ids + ([0] * padding_len)
diff --git a/src/NERDA/training.py b/src/NERDA/training.py
index 7c52c37..c85615a 100644
--- a/src/NERDA/training.py
+++ b/src/NERDA/training.py
@@ -1,7 +1,6 @@
 import numpy as np
 from .preprocessing import create_dataloader
-from sklearn import preprocessing
-from transformers import AdamW, get_linear_schedule_with_warmup
+from transformers import get_linear_schedule_with_warmup
 import random
 import torch
 from tqdm import tqdm
@@ -141,7 +140,7 @@ def train_model(network,
 
     num_train_steps = int(len(dataset_training.get('sentences')) / train_batch_size * epochs)
 
-    optimizer = AdamW(optimizer_parameters, lr = learning_rate)
+    optimizer = torch.optim.AdamW(optimizer_parameters, lr=learning_rate)
     scheduler = get_linear_schedule_with_warmup(
         optimizer, num_warmup_steps = warmup_steps, num_training_steps = num_train_steps
     )
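For reviewers, a minimal usage sketch of the revised save/load API follows. The transformer name, the DaNE dataset setup, and the `./nerda_model` directory are illustrative (taken from the existing NERDA examples), not part of this diff; only `save_network(output_dir=...)` and `load_network_from_file(..., tokenizer_path=...)` reflect the changes above.

```python
from NERDA.datasets import get_dane_data, download_dane_data
from NERDA.models import NERDA

# Illustrative setup; any transformer/dataset supported by NERDA works the same way.
download_dane_data()
model = NERDA(transformer = "bert-base-multilingual-uncased",
              dataset_training = get_dane_data("train"),
              dataset_validation = get_dane_data("dev"))
model.train()

# New behaviour: save_network() takes an output directory and writes both the
# network weights ("model.bin") and the tokenizer (a "tokenizer/" subfolder).
model.save_network(output_dir = "./nerda_model")

# New behaviour: load_network_from_file() restores the tokenizer from tokenizer_path
# when it exists; otherwise it falls back to AutoTokenizer.from_pretrained() with the
# model's transformer name and tokenizer_parameters.
model.load_network_from_file(model_path = "./nerda_model/model.bin",
                             tokenizer_path = "./nerda_model/tokenizer/")
```

Networks saved with the previous `save_network(model_path=...)` signature have no adjacent `tokenizer/` folder, so loading them hits the `from_pretrained` fallback path instead.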