CogStack
diff --git a/‎medcat-v1/.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎medcat-v1/.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎medcat-v1/install_requires.txt‎
Lines changed: 1 addition & 1 deletion b/‎medcat-v1/install_requires.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎medcat-v1/medcat/cat.py‎
Lines changed: 1 addition & 1 deletion b/‎medcat-v1/medcat/cat.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎medcat-v1/medcat/config_rel_cat.py‎
Lines changed: 85 additions & 13 deletions b/‎medcat-v1/medcat/config_rel_cat.py‎
Lines changed: 85 additions & 13 deletions
@@ -55,3 +55,4 @@ tests/model_creator/output/*
 docs/auto/
 docs/_build
 
+models/
@@ -3,7 +3,7 @@
 'gensim>=4.3.0,<5.0.0'  # 4.3.0 is first to support 3.11; avoid major version bump
 'spacy>=3.6.0,<4.0.0'  # avoid major bump
 'scipy>=1.9.2,<1.14.0'  # 1.9.2 is first to support 3.11; 1.14.0 does not support 3.9
-'transformers>=4.34.0,<5.0.0'  # avoid major version bump
+'transformers>=4.48.1,<5.0.0'  # avoid major version bump
 'accelerate>=0.23.0' # required by Trainer class in de-id
 'torch>=2.4.0,<3.0.0' # 2.4.0 is first to support 3.12; avoid major 3.0.0 for now
 'tqdm>=4.27'
 
@@ -143,7 +143,7 @@ def _create_pipeline(self, config: Config):
             self.pipe.add_meta_cat(meta_cat, meta_cat.config.general.category_name)
 
         for rel_cat in self._rel_cats:
-            self.pipe.add_rel_cat(rel_cat, "_".join(list(rel_cat.config.general["labels2idx"].keys())))
+            self.pipe.add_rel_cat(rel_cat, "_".join(list(rel_cat.component.relcat_config.general["labels2idx"].keys())))
 
         # Set max document length
         self.pipe.spacy_nlp.max_length = config.preprocessing.max_document_length
 
@@ -1,5 +1,6 @@
+import os
 import logging
-from typing import Dict, Any, List
+from typing import Any, Dict, List, Tuple, Union, cast
 from medcat.config import MixingConfig, BaseModel, Optional
 
 
@@ -21,10 +22,14 @@ class General(MixingConfig, BaseModel):
     window_size: int = 300
     """Max acceptable distance between entities (in characters); take care when using this, as it can produce sentences that are over 512 tokens (limit is given by tokenizer)"""
 
-    mct_export_max_non_rel_sample_size:int = 200
+    limit_samples_per_class: int = -1
+    """Number of samples per class, this limit is applied for train samples, so if train samples are 100 then test would be 20."""
+    addl_rels_max_sample_size:int = 200
     """Limit the number of 'Other' samples selected for training/test. This is applied per encountered medcat project, sample_size/num_projects. """
-    mct_export_create_addl_rels: bool = False
-    """When processing relations from a MedCAT export, relations labeled as 'Other' are created from all the annotations pairs available"""
+    create_addl_rels: bool = False
+    """When processing relations from a MedCAT export/docs, relations labeled as 'Other' are created from all the annotations pairs available"""
+    create_addl_rels_by_type: bool = False
+    """When creating the 'Other' relation class, actually split this class into subclasses based on concept types"""
 
     tokenizer_name: str = "bert"
     """The name of the tokenizer used.
@@ -46,21 +51,47 @@ class General(MixingConfig, BaseModel):
     """Tokenizer.
 
     NB! For these changes to take effect, the pipe would need to be recreated."""
-    annotation_schema_tag_ids: List = []
+    annotation_schema_tag_ids: List = [30522, 30523, 30524, 30525]
     """If a foreign non-MCAT trainer dataset is used, you can insert your own Rel entity token delimiters into the tokenizer, \
-    copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce"""
-    labels2idx: Dict = {}
-    idx2labels: Dict = {}
+    copy those token IDs here, and also resize your tokenizer embeddings and adjust the hidden_size of the model, this will depend on the number of tokens you introduce
+    for example: 30522 - [s1], 30523 - [e1], 30524 - [s2], 30525 - [e2], 30526 - [BLANK], 30527 - [ENT1], 30528 - [ENT2], 30529 - [/ENT1], 30530 - [/ENT2]
+    Please note that the tokenizer special tokens are supposed to be in pairs of two for example [s1] and [e1], [s2] and [e2], the [BLANK] is just an example placeholder token
+    If you have more than four tokens here then you need to make sure they are present in the text, 
+    otherwise the pipeline will throw an error in the get_annotation_schema_tag() function.
+    """
+
+    tokenizer_relation_annotation_special_tokens_tags: List[str] = ["[s1]", "[e1]", "[s2]", "[e2]"]
+
+    tokenizer_other_special_tokens: Dict[str, str] = {"pad_token": "[PAD]"}
+    """
+    The special tokens used by the tokenizer. The [PAD] token is for the Llama tokenizer."""
+
+    labels2idx: Dict[str, int] = {}
+    idx2labels: Dict[int, str] = {}
+
     pin_memory: bool = True
+    """If True the data loader will copy the tensors to the GPU pinned memory"""
+
     seed: int = 13
     """The seed for random number generation.
 
-    NOTE: If used along MetaCAT or additional NER, only one of the seeds will take effect
     NB! For these changes to take effect, the pipe would need to be recreated."""
     task: str = "train"
-    """The task for RelCAT.
+    """The task for RelCAT."""
 
-    NB! For these changes to take effect, the pipe would need to be recreated."""
+    language: str = "en"
+    """Used for Spacy lang setting"""
+
+    @classmethod
+    def convert_keys_to_int(cls, value):
+        if isinstance(value, dict):
+            return {int(k): v for k, v in value.items()}
+        return value
+
+    def __setattr__(self, key: str, value: Any):
+        if key == "idx2labels" and isinstance(value, dict):
+            value = self.convert_keys_to_int(value)  # Ensure conversion
+        super().__setattr__(key, value)
 
 
 class Model(MixingConfig, BaseModel):
@@ -82,12 +113,18 @@ class Model(MixingConfig, BaseModel):
     num_directions: int = 2
     """2 - bidirectional model, 1 - unidirectional"""
 
+    freeze_layers: bool = True
+    """If True the layers are frozen, i.e. their weights are not updated during training"""
+
     padding_idx: int = -1
     emb_grad: bool = True
     """If True the embeddings will also be trained"""
     ignore_cpos: bool = False
     """If set to True center positions will be ignored when calculating representation"""
 
+    llama_use_pooled_output: bool = False
+    """If set to True, used only in Llama model, it will add the extra tensor formed from selecting the max of the last hidden layer"""
+
     class Config:
         extra = 'allow'
         validate_assignment = True
@@ -98,9 +135,24 @@ class Train(MixingConfig, BaseModel):
     nclasses: int = 2
     """Number of classes that this model will output"""
     batch_size: int = 25
+    """batch size"""
     nepochs: int = 1
+    """Epochs"""
     lr: float = 1e-4
-    adam_epsilon: float = 1e-4
+    """Learning rate"""
+    stratified_batching: bool = False
+    """Train the model with stratified batching"""
+    batching_samples_per_class: list = []
+    """Number of samples per class in each batch
+    example for batch size 64: [6,6,6,8,8,8,6,8,8]"""
+    batching_minority_limit: Union[List[int], int] = 0
+    """Maximum number of samples the minority class can have.
+    Since the minority class elements need to be repeated, this is used to facilitate that
+    example: batching_samples_per_class - [6,6,6,8,8,8,6,8,8]
+             batching_minority_limit - 6"""
+    adam_betas: Tuple[float, float] = (0.9, 0.999)
+    adam_weight_decay: float = 0
+    adam_epsilon: float = 1e-8
     test_size: float = 0.2
     gradient_acc_steps: int = 1
     multistep_milestones: List[int] = [
@@ -109,7 +161,8 @@ class Train(MixingConfig, BaseModel):
     max_grad_norm: float = 1.0
     shuffle_data: bool = True
     """Used only during training, if set the dataset will be shuffled before train/test split"""
-    class_weights: Optional[Any] = None
+    class_weights: Union[List[float], None] = None
+    enable_class_weights: bool = False
     score_average: str = "weighted"
     """What to use for averaging F1/P/R across labels"""
     auto_save_model: bool = True
@@ -129,3 +182,22 @@ class ConfigRelCAT(MixingConfig, BaseModel):
     class Config:
         extra = 'allow'
         validate_assignment = True
+
+    @classmethod
+    def load(cls, load_path: str = "./") -> "ConfigRelCAT":
+        """Load the config from a file.
+
+        Args:
+            load_path (str): Path to RelCAT config. Defaults to "./".
+
+        Returns:
+            ConfigRelCAT: The loaded config.
+        """
+        config = cls()
+        if os.path.exists(load_path):
+            if "config.json" not in load_path:
+                load_path = os.path.join(load_path, "config.json")
+            config = cast(ConfigRelCAT, super().load(load_path))
+            logging.info("Loaded config.json")
+
+        return config
Original file line number	Diff line number	Diff line change
`@@ -55,3 +55,4 @@ tests/model_creator/output/*`
`55`	`55`	`docs/auto/`
`56`	`56`	`docs/_build`
`57`	`57`
	`58`	`+models/`