Skip to content
This repository was archived by the owner on Jul 28, 2025. It is now read-only.

Commit 64a7a6a

Browse files
committed
Merge branch 'master' of https://github.com/CogStack/MedCAT into relation_extraction_llama
2 parents 2d6b2f3 + 65f7c5e commit 64a7a6a

File tree

5 files changed

+274
-14
lines changed

5 files changed

+274
-14
lines changed

medcat/meta_cat.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,9 @@ def get_model(self, embeddings: Optional[Tensor]) -> nn.Module:
9595
if not config.model.model_freeze_layers:
9696
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=16,
9797
target_modules=["query", "value"], lora_dropout=0.2)
98-
99-
model = get_peft_model(model, peft_config)
98+
# Not sure what changed between transformers 4.50.3 and 4.50.1 that made this
99+
# fail for mypy. But as best as I can tell, it still works just the same
100+
model = get_peft_model(model, peft_config) # type: ignore
100101
# model.print_trainable_parameters()
101102

102103
logger.info("BERT model used for classification")
@@ -412,7 +413,7 @@ def load(cls, save_dir_path: str, config_dict: Optional[Dict] = None) -> "MetaCA
412413
tokenizer = TokenizerWrapperBPE.load(save_dir_path)
413414
elif config.general['tokenizer_name'] == 'bert-tokenizer':
414415
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT
415-
tokenizer = TokenizerWrapperBERT.load(save_dir_path, config.model['model_variant'])
416+
tokenizer = TokenizerWrapperBERT.load(save_dir_path, config.model.model_variant)
416417

417418
# Create meta_cat
418419
meta_cat = cls(tokenizer=tokenizer, embeddings=None, config=config)

medcat/ner/transformers_ner.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@ def __init__(self, cdb, config: Optional[ConfigTransformersNER] = None,
7070
eval_accumulation_steps=1,
7171
gradient_accumulation_steps=4, # We want to get to bs=4
7272
do_eval=True,
73-
evaluation_strategy='epoch', # type: ignore
73+
# eval_strategy over evaluation_strategy since trf==4.46 (apparently)
74+
eval_strategy='epoch', # type: ignore
7475
logging_strategy='epoch', # type: ignore
7576
save_strategy='epoch', # type: ignore
7677
metric_for_best_model='eval_recall', # Can be changed if our preference is not recall but precision or f1
@@ -176,7 +177,7 @@ def train(self,
176177
ignore_extra_labels=False,
177178
dataset=None,
178179
meta_requirements=None,
179-
trainer_callbacks: Optional[List[TrainerCallback]]=None) -> Tuple:
180+
trainer_callbacks: Optional[List[Callable[[Trainer], TrainerCallback]]] = None) -> Tuple:
180181
"""Train or continue training a model give a json_path containing a MedCATtrainer export. It will
181182
continue training if an existing model is loaded or start new training if the model is blank/new.
182183
@@ -188,10 +189,13 @@ def train(self,
188189
labels that did not exist in the old model.
189190
dataset: Defaults to None.
190191
meta_requirements: Defaults to None
191-
trainer_callbacks (List[TrainerCallback]):
192+
trainer_callbacks (List[Callable[[Trainer], TrainerCallback]]):
192193
A list of trainer callbacks for collecting metrics during the training at the client side. The
193194
transformers Trainer object will be passed in when each callback is called.
194195
196+
Raises:
197+
ValueError: If something went wrong with model save path.
198+
195199
Returns:
196200
Tuple: The dataframe, examples, and the dataset
197201
"""
@@ -227,7 +231,9 @@ def train(self,
227231
if self.model.num_labels != len(self.tokenizer.label_map):
228232
logger.warning("The dataset contains labels we've not seen before, model is being reinitialized")
229233
logger.warning("Model: {} vs Dataset: {}".format(self.model.num_labels, len(self.tokenizer.label_map)))
230-
self.model = AutoModelForTokenClassification.from_pretrained(self.config.general['model_name'], num_labels=len(self.tokenizer.label_map))
234+
self.model = AutoModelForTokenClassification.from_pretrained(self.config.general['model_name'],
235+
num_labels=len(self.tokenizer.label_map),
236+
ignore_mismatched_sizes=True)
231237
self.tokenizer.cui2name = {k:self.cdb.get_name(k) for k in self.tokenizer.label_map.keys()}
232238

233239
self.model.config.id2label = {v:k for k,v in self.tokenizer.label_map.items()}
@@ -252,15 +258,21 @@ def train(self,
252258
tokenizer=None)
253259
if trainer_callbacks:
254260
for callback in trainer_callbacks:
255-
trainer.add_callback(callback(trainer))
261+
# No idea why mypy isn't picking up the method.
262+
# It most certainly does exist
263+
trainer.add_callback(callback(trainer)) # type: ignore
256264

257265
trainer.train() # type: ignore
258266

259267
# Save the training time
260268
self.config.general.last_train_on = datetime.now().timestamp() # type: ignore
261269

262270
# Save everything
263-
self.save(save_dir_path=os.path.join(self.training_arguments.output_dir, 'final_model'))
271+
output_dir = self.training_arguments.output_dir
272+
if output_dir is None:
273+
# NOTE: this shouldn't really happen, but we'll do this for type safety
274+
raise ValueError("Output path should not be None!")
275+
self.save(save_dir_path=os.path.join(output_dir, 'final_model'))
264276

265277
# Run an eval step and return metrics
266278
p = trainer.predict(encoded_dataset['test']) # type: ignore

medcat/tokenizers/meta_cat_tokenizers.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,13 @@ def load(cls, dir_path: str, model_variant: Optional[str] = '', **kwargs) -> "To
193193
try:
194194
tokenizer.hf_tokenizers = BertTokenizerFast.from_pretrained(path, **kwargs)
195195
except Exception as e:
196-
logging.warning("Could not load tokenizer from path due to error: {}. Loading from library for model variant: {}".format(e,model_variant))
197-
tokenizer.hf_tokenizers = BertTokenizerFast.from_pretrained(model_variant)
196+
# So that this is a string - it should be as it's only used in MetaCAT.load method
197+
# with `config.model.model_variant` which is a `str` rather than None
198+
# NOTE: The reason the type in method signature is Optional[str] is because supertype defines it as such
199+
variant = str(model_variant)
200+
logging.warning("Could not load tokenizer from path due to error: %s. Loading from library for model variant: %s",
201+
e, variant)
202+
tokenizer.hf_tokenizers = BertTokenizerFast.from_pretrained(variant)
198203

199204
return tokenizer
200205

medcat/utils/relation_extraction/tokenizer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class BaseTokenizerWrapper_RelationExtraction(PreTrainedTokenizerFast, ABC):
1616
def __init__(self, hf_tokenizers=None, max_seq_length: Optional[int] = None, add_special_tokens: Optional[bool] = False):
1717
self.hf_tokenizers = hf_tokenizers
1818
self.max_seq_length = max_seq_length
19-
self.add_special_tokens = add_special_tokens
19+
self._add_special_tokens = add_special_tokens
2020

2121
def get_size(self):
2222
return len(self.hf_tokenizers.vocab)
@@ -30,7 +30,7 @@ def get_pad_id(self):
3030
def __call__(self, text, truncation: Optional[bool] = True):
3131
if isinstance(text, str):
3232
result = self.hf_tokenizers.encode_plus(text, return_offsets_mapping=True, return_length=True, return_token_type_ids=True, return_attention_mask=True,
33-
add_special_tokens=self.add_special_tokens, max_length=self.max_seq_length, padding="longest", truncation=truncation)
33+
add_special_tokens=self._add_special_tokens, max_length=self.max_seq_length, padding="longest", truncation=truncation)
3434

3535
return {'offset_mapping': result['offset_mapping'],
3636
'input_ids': result['input_ids'],
@@ -41,7 +41,7 @@ def __call__(self, text, truncation: Optional[bool] = True):
4141
}
4242
elif isinstance(text, list):
4343
results = self.hf_tokenizers._batch_encode_plus(text, return_offsets_mapping=True, return_length=True, return_token_type_ids=True,
44-
add_special_tokens=self.add_special_tokens, max_length=self.max_seq_length,truncation=truncation)
44+
add_special_tokens=self._add_special_tokens, max_length=self.max_seq_length,truncation=truncation)
4545
output = []
4646
for ind in range(len(results['input_ids'])):
4747
output.append({

tests/test_transformers_ner.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
import unittest
2+
import tempfile
3+
import json
4+
import os
5+
import shutil
6+
from medcat.cdb import CDB
7+
from medcat.ner.transformers_ner import TransformersNER
8+
from medcat.config_transformers_ner import ConfigTransformersNER
9+
10+
class TestTransformersNER(unittest.TestCase):
11+
def setUp(self):
12+
# Create a temporary directory for the test
13+
self.tmp_dir = tempfile.TemporaryDirectory()
14+
# Create results dir for training outputs
15+
self.results_dir = './results'
16+
os.makedirs(self.results_dir, exist_ok=True)
17+
18+
# Create a minimal CDB
19+
self.cdb = CDB()
20+
21+
# Create initial training data with 2 labels and multiple examples
22+
self.initial_data = {
23+
"projects": [{
24+
"documents": [
25+
{
26+
"text": "Patient has diabetes and hypertension.",
27+
"annotations": [
28+
{
29+
"cui": "C0011849", # Diabetes
30+
"start": 14,
31+
"end": 22,
32+
"value": "diabetes"
33+
},
34+
{
35+
"cui": "C0020538", # Hypertension
36+
"start": 27,
37+
"end": 39,
38+
"value": "hypertension"
39+
}
40+
]
41+
},
42+
{
43+
"text": "History of diabetes with hypertension.",
44+
"annotations": [
45+
{
46+
"cui": "C0011849", # Diabetes
47+
"start": 12,
48+
"end": 20,
49+
"value": "diabetes"
50+
},
51+
{
52+
"cui": "C0020538", # Hypertension
53+
"start": 26,
54+
"end": 38,
55+
"value": "hypertension"
56+
}
57+
]
58+
},
59+
{
60+
"text": "Diagnosed with hypertension and diabetes.",
61+
"annotations": [
62+
{
63+
"cui": "C0020538", # Hypertension
64+
"start": 15,
65+
"end": 27,
66+
"value": "hypertension"
67+
},
68+
{
69+
"cui": "C0011849", # Diabetes
70+
"start": 32,
71+
"end": 40,
72+
"value": "diabetes"
73+
}
74+
]
75+
}
76+
]
77+
}]
78+
}
79+
80+
# Create new training data with an extra label
81+
self.new_data = {
82+
"projects": [{
83+
"documents": [
84+
{
85+
"text": "Patient has diabetes, hypertension, and asthma.",
86+
"annotations": [
87+
{
88+
"cui": "C0011849", # Diabetes
89+
"start": 14,
90+
"end": 22,
91+
"value": "diabetes"
92+
},
93+
{
94+
"cui": "C0020538", # Hypertension
95+
"start": 24,
96+
"end": 36,
97+
"value": "hypertension"
98+
},
99+
{
100+
"cui": "C0004096", # Asthma
101+
"start": 42,
102+
"end": 48,
103+
"value": "asthma"
104+
}
105+
]
106+
},
107+
{
108+
"text": "History of asthma with diabetes and hypertension.",
109+
"annotations": [
110+
{
111+
"cui": "C0004096", # Asthma
112+
"start": 12,
113+
"end": 18,
114+
"value": "asthma"
115+
},
116+
{
117+
"cui": "C0011849", # Diabetes
118+
"start": 24,
119+
"end": 32,
120+
"value": "diabetes"
121+
},
122+
{
123+
"cui": "C0020538", # Hypertension
124+
"start": 37,
125+
"end": 49,
126+
"value": "hypertension"
127+
}
128+
]
129+
},
130+
{
131+
"text": "Diagnosed with asthma, diabetes, and hypertension.",
132+
"annotations": [
133+
{
134+
"cui": "C0004096", # Asthma
135+
"start": 15,
136+
"end": 21,
137+
"value": "asthma"
138+
},
139+
{
140+
"cui": "C0011849", # Diabetes
141+
"start": 23,
142+
"end": 31,
143+
"value": "diabetes"
144+
},
145+
{
146+
"cui": "C0020538", # Hypertension
147+
"start": 37,
148+
"end": 49,
149+
"value": "hypertension"
150+
}
151+
]
152+
}
153+
]
154+
}]
155+
}
156+
157+
# Save initial training data
158+
self.initial_data_path = os.path.join(self.tmp_dir.name, 'initial_data.json')
159+
with open(self.initial_data_path, 'w') as f:
160+
json.dump(self.initial_data, f)
161+
162+
# Save new training data
163+
self.new_data_path = os.path.join(self.tmp_dir.name, 'new_data.json')
164+
with open(self.new_data_path, 'w') as f:
165+
json.dump(self.new_data, f)
166+
167+
def tearDown(self):
168+
# Clean up the temporary directory
169+
self.tmp_dir.cleanup()
170+
# Clean up results directory if it exists
171+
if os.path.exists(self.results_dir):
172+
shutil.rmtree(self.results_dir)
173+
# Clean up logs directory if it exists
174+
if os.path.exists('./logs'):
175+
shutil.rmtree('./logs')
176+
177+
def test_ignore_extra_labels(self):
178+
# Create and train initial model with tiny BERT
179+
config = ConfigTransformersNER()
180+
config.general['model_name'] = 'prajjwal1/bert-tiny'
181+
# Set to single epoch and small test size for faster testing
182+
config.general['num_train_epochs'] = 1
183+
config.general['test_size'] = 0.1
184+
185+
# Create training arguments with reduced epochs
186+
from transformers import TrainingArguments
187+
training_args = TrainingArguments(
188+
output_dir=self.results_dir, # Use the class results_dir
189+
num_train_epochs=1
190+
)
191+
192+
ner = TransformersNER(self.cdb, config=config, training_arguments=training_args)
193+
ner.train(self.initial_data_path)
194+
195+
# Save the model
196+
model_path = os.path.join(self.tmp_dir.name, 'model')
197+
ner.save(model_path)
198+
199+
# Load the saved model
200+
loaded_ner = TransformersNER.load(model_path)
201+
202+
# Get initial number of labels
203+
initial_num_labels = len(loaded_ner.tokenizer.label_map)
204+
205+
# Train with ignore_extra_labels=True
206+
loaded_ner.train(self.new_data_path, ignore_extra_labels=True)
207+
208+
# Verify number of labels hasn't changed
209+
self.assertEqual(
210+
len(loaded_ner.tokenizer.label_map),
211+
initial_num_labels,
212+
"Number of labels changed despite ignore_extra_labels=True"
213+
)
214+
215+
# Verify only original labels are present (including special tokens)
216+
expected_labels = {"C0011849", "C0020538", "O", "X"}
217+
self.assertEqual(
218+
set(loaded_ner.tokenizer.label_map.keys()),
219+
expected_labels,
220+
"Label map contains unexpected labels"
221+
)
222+
223+
# Train with ignore_extra_labels=False
224+
loaded_ner.train(self.new_data_path, ignore_extra_labels=False)
225+
226+
# Verify new label was added
227+
self.assertEqual(
228+
len(loaded_ner.tokenizer.label_map),
229+
initial_num_labels + 1,
230+
"New label was not added when ignore_extra_labels=False"
231+
)
232+
233+
# Verify all labels are present (including special tokens)
234+
expected_labels = {"C0011849", "C0020538", "C0004096", "O", "X"}
235+
self.assertEqual(
236+
set(loaded_ner.tokenizer.label_map.keys()),
237+
expected_labels,
238+
"Label map missing expected labels"
239+
)
240+
241+
if __name__ == '__main__':
242+
unittest.main()

0 commit comments

Comments
 (0)