Skip to content
This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 65f7c5e

Browse files
authored
CU-8698mqu96 Transformers update (4.51.0) fix (#531)
* CU-8698mqu96: Update special tokens lengths attribute * CU-8698mqu96: Update MetaCAT usage of BertTokenizer.from_pretrained for type safety * CU-8698mqu96: Ignore typing where mypy is wrong + add note in code * CU-8698mqu96: Ignore typing where mypy may be wrong + add comment * CU-8698mqu96: Fix tokenizer wrapper import for rel cat * CU-8698mqu96: Rename evaluation strategy keyword argument in line with changes * CU-8698mqu96: Type-ignore method where mypy says it does not exist * CU-8698mqu96: Fix TRF-NER output dir typing issue * CU-8698mqu96: Update a doc string for darglint * CU-8698mqu96: Fix typing issue for TrfNER trainer callback
1 parent 65faa7a commit 65f7c5e

File tree

6 files changed

+32
-15
lines changed

6 files changed

+32
-15
lines changed

medcat/meta_cat.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ def get_model(self, embeddings: Optional[Tensor]) -> nn.Module:
9595
if not config.model.model_freeze_layers:
9696
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=16,
9797
target_modules=["query", "value"], lora_dropout=0.2)
98-
99-
model = get_peft_model(model, peft_config)
98+
# Not sure what changed between transformers 4.50.3 and 4.50.1 that made this
99+
# fail for mypy. But as best as I can tell, it still works just the same
100+
model = get_peft_model(model, peft_config) # type: ignore
100101
# model.print_trainable_parameters()
101102

102103
logger.info("BERT model used for classification")
@@ -412,7 +413,7 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "MetaCA
412413
tokenizer = TokenizerWrapperBPE.load(save_dir_path)
413414
elif config.general['tokenizer_name'] == 'bert-tokenizer':
414415
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT
415-
tokenizer = TokenizerWrapperBERT.load(save_dir_path, config.model['model_variant'])
416+
tokenizer = TokenizerWrapperBERT.load(save_dir_path, config.model.model_variant)
416417

417418
# Create meta_cat
418419
meta_cat = cls(tokenizer=tokenizer, embeddings=None, config=config)

medcat/ner/transformers_ner.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ def __init__(self, cdb, config: Optional[ConfigTransformersNER] = None,
7070
eval_accumulation_steps=1,
7171
gradient_accumulation_steps=4, # We want to get to bs=4
7272
do_eval=True,
73-
evaluation_strategy='epoch', # type: ignore
73+
# eval_strategy over evaluation_strategy since trf==4.46 (apparently)
74+
eval_strategy='epoch', # type: ignore
7475
logging_strategy='epoch', # type: ignore
7576
save_strategy='epoch', # type: ignore
7677
metric_for_best_model='eval_recall', # Can be changed if our preference is not recall but precision or f1
@@ -176,7 +177,7 @@ def train(self,
176177
ignore_extra_labels=False,
177178
dataset=None,
178179
meta_requirements=None,
179-
trainer_callbacks: Optional[List[TrainerCallback]]=None) -> Tuple:
180+
trainer_callbacks: Optional[List[Callable[[Trainer], TrainerCallback]]] = None) -> Tuple:
180181
"""Train or continue training a model give a json_path containing a MedCATtrainer export. It will
181182
continue training if an existing model is loaded or start new training if the model is blank/new.
182183
@@ -188,10 +189,13 @@ def train(self,
188189
labels that did not exist in the old model.
189190
dataset: Defaults to None.
190191
meta_requirements: Defaults to None
191-
trainer_callbacks (List[TrainerCallback]):
192+
trainer_callbacks (List[Callable[[Trainer], TrainerCallback]]):
192193
A list of trainer callbacks for collecting metrics during the training at the client side. The
193194
transformers Trainer object will be passed in when each callback is called.
194195
196+
Raises:
197+
ValueError: If something went wrong with model save path.
198+
195199
Returns:
196200
Tuple: The dataframe, examples, and the dataset
197201
"""
@@ -254,15 +258,21 @@ def train(self,
254258
tokenizer=None)
255259
if trainer_callbacks:
256260
for callback in trainer_callbacks:
257-
trainer.add_callback(callback(trainer))
261+
# No idea why mypy isn't picking up the method.
262+
# It most certainly does exist
263+
trainer.add_callback(callback(trainer)) # type: ignore
258264

259265
trainer.train() # type: ignore
260266

261267
# Save the training time
262268
self.config.general.last_train_on = datetime.now().timestamp() # type: ignore
263269

264270
# Save everything
265-
self.save(save_dir_path=os.path.join(self.training_arguments.output_dir, 'final_model'))
271+
output_dir = self.training_arguments.output_dir
272+
if output_dir is None:
273+
# NOTE: this shouldn't really happen, but we'll do this for type safety
274+
raise ValueError("Output path should not be None!")
275+
self.save(save_dir_path=os.path.join(output_dir, 'final_model'))
266276

267277
# Run an eval step and return metrics
268278
p = trainer.predict(encoded_dataset['test']) # type: ignore

medcat/tokenizers/meta_cat_tokenizers.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,13 @@ def load(cls, dir_path: str, model_variant: Optional[str] = '', **kwargs) -> "To
193193
try:
194194
tokenizer.hf_tokenizers = BertTokenizerFast.from_pretrained(path, **kwargs)
195195
except Exception as e:
196-
logging.warning("Could not load tokenizer from path due to error: {}. Loading from library for model variant: {}".format(e,model_variant))
197-
tokenizer.hf_tokenizers = BertTokenizerFast.from_pretrained(model_variant)
196+
# So that this is a string - it should be as it's only used in MetaCAT.load method
197+
# with `config.model.model_variant` which is a `str` rather than None
198+
# NOTE: The reason the type in method signature is Optional[str] is because supertype defines it as such
199+
variant = str(model_variant)
200+
logging.warning("Could not load tokenizer from path due to error: %s. Loading from library for model variant: %s",
201+
e, variant)
202+
tokenizer.hf_tokenizers = BertTokenizerFast.from_pretrained(variant)
198203

199204
return tokenizer
200205

medcat/utils/relation_extraction/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,8 @@ def forward(self,
202202
encoder_attention_mask = encoder_attention_mask.to(
203203
self.relcat_config.general.device)
204204

205-
self.bert_model = self.bert_model.to(self.relcat_config.general.device)
205+
# NOTE: no idea why, but mypy doesn't understand that there's an implicit `self` argument here...
206+
self.bert_model = self.bert_model.to(device=self.relcat_config.general.device) # type: ignore
206207

207208
model_output = self.bert_model(input_ids=input_ids, attention_mask=attention_mask,
208209
token_type_ids=token_type_ids,

medcat/utils/relation_extraction/tokenizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@ class TokenizerWrapperBERT(BertTokenizerFast):
1616
def __init__(self, hf_tokenizers=None, max_seq_length: Optional[int] = None, add_special_tokens: Optional[bool] = False):
1717
self.hf_tokenizers = hf_tokenizers
1818
self.max_seq_length = max_seq_length
19-
self.add_special_tokens = add_special_tokens
19+
self._add_special_tokens = add_special_tokens
2020

2121
def __call__(self, text, truncation: Optional[bool] = True):
2222
if isinstance(text, str):
2323
result = self.hf_tokenizers.encode_plus(text, return_offsets_mapping=True, return_length=True, return_token_type_ids=True, return_attention_mask=True,
24-
add_special_tokens=self.add_special_tokens, max_length=self.max_seq_length, padding="longest", truncation=truncation)
24+
add_special_tokens=self._add_special_tokens, max_length=self.max_seq_length, padding="longest", truncation=truncation)
2525

2626
return {'offset_mapping': result['offset_mapping'],
2727
'input_ids': result['input_ids'],
@@ -32,7 +32,7 @@ def __call__(self, text, truncation: Optional[bool] = True):
3232
}
3333
elif isinstance(text, list):
3434
results = self.hf_tokenizers._batch_encode_plus(text, return_offsets_mapping=True, return_length=True, return_token_type_ids=True,
35-
add_special_tokens=self.add_special_tokens, max_length=self.max_seq_length,truncation=truncation)
35+
add_special_tokens=self._add_special_tokens, max_length=self.max_seq_length,truncation=truncation)
3636
output = []
3737
for ind in range(len(results['input_ids'])):
3838
output.append({

medcat/utils/relation_extraction/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pandas.core.series import Series
1010
from medcat.config_rel_cat import ConfigRelCAT
1111

12-
from medcat.preprocessing.tokenizers import TokenizerWrapperBERT
12+
from medcat.utils.relation_extraction.tokenizer import TokenizerWrapperBERT
1313
from medcat.utils.relation_extraction.models import BertModel_RelationExtraction
1414

1515

0 commit comments

Comments
 (0)