CogStack · adam-sutton-1992 · Apr 24, 2026 · Apr 24, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/config.py
@@ -70,13 +70,46 @@ class EmbeddingLinking(Linking):
     """Choose a device for the linking model to be stored. If None
     then an appropriate GPU device that is available will be chosen"""
     context_window_size: int = 14
-    """Choose the window size to get context vectors."""
+    """Choose the window size to get context vectors. In a trained model 
+    if you increase the context window after training then performance will
+    degrade significantly."""
     use_ner_link_candidates: bool = True
     """Link candidates are provided by some NER steps. This will flag if 
-    you want to trust them or not."""
+    you want to trust them or not. A good guideline is if you've trained 
+    on data from the same distribution then this is probably best set to True.
+    If you have no training data from the same source distribution then it MIGHT
+    be better set to false."""
+    append_to_ner_link_candidates: bool = False
+    """If `use_ner_link_candidates` is enabled, generate additional
+    candidates and append them to existing NER candidates instead of only
+    generating for entities that have none. This will often result in a slight
+    increase in recall, and precision."""
+    use_pre_inference: bool = True
+    """Whether to use the pre-inference step to filter candidates before
+    calculating similarities. This can speed up inference by only calculating
+    similarities for candidates that are likely to be correct based directly on word
+    matching."""
     learning_rate: float = 1e-4
     """Learning rate for training the embedding linker. Only used if 
     the embedding linker is trainable."""
     weight_decay: float = 0.01
     """Weight decay for training the embedding linker. Only used if
     the embedding linker is trainable."""
+    multiple_predictions_per_detected_entity: bool = False
+    """Whether to allow multiple predictions per detected entity. If False, only 
+    the highest scoring candidate will be returned for each entity. If True, all 
+    candidates that exceed the similarity thresholds will be returned. This can be 
+    useful if you want to return multiple CUIs for an entity, but can also lead to 
+    more false positives."""
+    pre_inference_top_k_sampling: int = 1
+    """When using pre-inference to filter candidates, how many names to then add
+    their related CUIs as potential candidates. Higher numbers will increase recall 
+    but also increase inference time, and reduce precision. This is influenced by 
+    `short_similarity_threshold`, i.e. pass the top k samples over the threshold 
+    for inference."""
+    inference_top_k_sampling: int = 1
+    """At the inference step, after calculating similarity scores, how many candidates 
+    to keep for each entity. Higher numbers will increase recall but also increase 
+    inference time, and often reduce precision. This is influenced by 
+    `long_similarity_threshold`, i.e. take the top k samples over the threshold. This 
+    will be ignored if `multiple_predictions_per_detected_entity` is set to False."""
diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/embedding_linker.py
diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/trainable_embedding_linker.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/trainable_embedding_linker.py
@@ -2,6 +2,7 @@
 from medcat_embedding_linker.config import EmbeddingLinking
 from torch import Tensor
 from medcat.cdb import CDB
+from medcat.components.types import TrainableComponent
 from medcat.config.config import Config, ComponentConfig
 from medcat.components.linking.vector_context_model import PerDocumentTokenCache
 from medcat.tokenizing.tokenizers import BaseTokenizer
@@ -17,7 +18,7 @@
 logger = logging.getLogger(__name__)
 
 
-class Linker(StaticEmbeddingLinker, AbstractManualSerialisable):
+class Linker(StaticEmbeddingLinker, AbstractManualSerialisable, TrainableComponent):
     """Trainable variant of the embedding linker.
     This class inherits inference and embedding behavior from Linker and provides
     method hooks for online/offline training.
@@ -28,7 +29,10 @@ class Linker(StaticEmbeddingLinker, AbstractManualSerialisable):
     _MODEL_FOLDER_NAME = "trainable_embedding_model"
     _MODEL_STATE_FILE_NAME = "model_state.pt"
 
-    def __init__(self, cdb: CDB, config: Config) -> None:
+    def __init__(self, 
+                 cdb: CDB, 
+                 config: Config,
+                 tokenizer: BaseTokenizer) -> None:
         if not isinstance(config.components.linking, EmbeddingLinking):
             raise TypeError("Linking config must be an EmbeddingLinking instance")
         self.cnf_l: EmbeddingLinking = config.components.linking
@@ -41,6 +45,7 @@ def __init__(self, cdb: CDB, config: Config) -> None:
         super().__init__(
             cdb,
             config,
+            tokenizer,
             model_init_kwargs=model_init_kwargs,
         )
         self.training_batch: list[tuple] = []
@@ -407,7 +412,7 @@ def create_new_component(
         vocab: Vocab,
         model_load_path: Optional[str],
     ) -> "Linker":
-        return cls(cdb, cdb.config)
+        return cls(cdb, cdb.config, tokenizer)
 
     def serialise_to(self, folder_path: str) -> None:
         os.makedirs(folder_path, exist_ok=True)
@@ -424,7 +429,8 @@ def deserialise_from(
         cls, folder_path: str, **init_kwargs
     ) -> "Linker":
         cdb = init_kwargs["cdb"]
-        linker = cls(cdb, cdb.config)
+        tokenizer = init_kwargs["tokenizer"]
+        linker = cls(cdb, cdb.config, tokenizer)
 
         model_state_path = os.path.join(
             folder_path, cls._MODEL_FOLDER_NAME, cls._MODEL_STATE_FILE_NAME

diff --git a/medcat-plugins/embedding-linker/src/medcat_embedding_linker/transformer_context_model.py b/medcat-plugins/embedding-linker/src/medcat_embedding_linker/transformer_context_model.py
@@ -3,6 +3,7 @@
 from medcat.storage.serialisables import AbstractSerialisable
 from torch import Tensor, nn
 from transformers import AutoModel, AutoTokenizer
+from medcat_embedding_linker.config import EmbeddingLinking as LinkingConfig
 from tqdm import tqdm
 import json
 import logging
@@ -23,14 +24,16 @@ class ModelForEmbeddingLinking(nn.Module):
     def __init__(
         self,
         embedding_model_name: str,
+        cnf_l: LinkingConfig,
         use_projection_layer: bool = False,
-        top_n_layers_to_unfreeze: int = -1,
+        top_n_layers_to_unfreeze: int = 0,
         device: Optional[Union[str, torch.device]] = None,
     ) -> None:
         super().__init__()
         self.language_model = AutoModel.from_pretrained(embedding_model_name)
         self.base_model_name = self.language_model.name_or_path
 
+        self.cnf_l = cnf_l
         self.use_projection_layer = use_projection_layer
         self.top_n_layers_to_unfreeze = top_n_layers_to_unfreeze
 
@@ -86,6 +89,10 @@ def _freeze_all_parameters(self) -> None:
                 param.requires_grad = True
 
     def unfreeze_top_n_lm_layers(self, n: int) -> None:
+        self.cnf_l.top_n_layers_to_unfreeze = n
+        self.top_n_layers_to_unfreeze = n
+        # Re-apply from a known baseline so repeated calls are deterministic.
+        self._freeze_all_parameters()
         # train all LM layers - each layer requires more data
         if n == -1:
             for param in self.language_model.parameters():
@@ -133,6 +140,7 @@ def save_pretrained(self, save_directory: Union[str, Path]) -> None:
     def from_pretrained(
         cls,
         path_or_model_name: Union[str, Path],
+        cnf_l: LinkingConfig,
         device: Optional[Union[str, torch.device]] = None,
         **kwargs,
     ) -> "ModelForEmbeddingLinking":
@@ -147,7 +155,7 @@ def from_pretrained(
                 config = json.load(f)
 
             config.update(kwargs)
-            model = cls(**config)
+            model = cls(cnf_l=cnf_l, **config)
             state_dict = torch.load(weights_path, map_location="cpu")
             model.load_state_dict(state_dict)
             model.to(target_device)
@@ -156,6 +164,7 @@ def from_pretrained(
         # Hugging Face model id/path.
         model = cls(
             embedding_model_name=str(path_or_model_name),
+            cnf_l=cnf_l,
             device=target_device,
             **kwargs,
         )
@@ -208,8 +217,19 @@ def _resolve_model_source(path_or_model_name: Union[str, Path]) -> str:
         return str(path_or_model_name)
 
     def _get_model_init_kwargs(self) -> dict[str, Any]:
-        """Build kwargs passed to ModelForEmbeddingLinking.from_pretrained."""
-        return dict(self._model_init_kwargs)
+        """Build kwargs passed to ModelForEmbeddingLinking.from_pretrained.
+
+        Keep these in sync with runtime linker config so model swaps preserve
+        trainability settings (i.e. top-n LM layers to unfreeze).
+        """
+        kwargs = dict(self._model_init_kwargs)
+        if hasattr(self.cnf_l, "use_projection_layer"):
+            kwargs["use_projection_layer"] = self.cnf_l.use_projection_layer
+        if hasattr(self.cnf_l, "top_n_layers_to_unfreeze"):
+            kwargs["top_n_layers_to_unfreeze"] = (
+                self.cnf_l.top_n_layers_to_unfreeze
+            )
+        return kwargs
 
     def load_transformers(self, embedding_model_name: Union[str, Path]) -> None:
         """Load tokenizer/model from local path or Hugging Face model id."""
@@ -224,7 +244,9 @@ def load_transformers(self, embedding_model_name: Union[str, Path]) -> None:
             self.cnf_l.embedding_model_name = str(embedding_model_name)
             self.tokenizer = AutoTokenizer.from_pretrained(model_source)
             self.model = ModelForEmbeddingLinking.from_pretrained(
-                model_source, **model_init_kwargs
+                model_source,
+                cnf_l=self.cnf_l,
+                **model_init_kwargs,
             )
             self.model.eval()
             self.device = torch.device(

diff --git a/medcat-plugins/embedding-linker/tests/test_embedding_linker.py b/medcat-plugins/embedding-linker/tests/test_embedding_linker.py
@@ -67,7 +67,8 @@ class NonTrainableEmbeddingLinkerTests(unittest.TestCase):
     cnf = Config()
     cnf.components.linking = embedding_linker.EmbeddingLinking()
     cnf.components.linking.comp_name = embedding_linker.Linker.name
-    linker = embedding_linker.Linker(FakeCDB(cnf), cnf)
+    vtokenizer = FakeTokenizer()
+    linker = embedding_linker.Linker(FakeCDB(cnf), cnf, vtokenizer)
 
     def test_linker_is_not_trainable(self):
         self.assertNotIsInstance(self.linker, TrainableComponent)
@@ -83,7 +84,8 @@ class TrainableEmbeddingLinkerTests(unittest.TestCase):
     cnf.components.linking.comp_name = (
         trainable_embedding_linker.Linker.name
     )
-    linker = trainable_embedding_linker.Linker(FakeCDB(cnf), cnf)
+    vtokenizer = FakeTokenizer()
+    linker = trainable_embedding_linker.Linker(FakeCDB(cnf), cnf, vtokenizer)
 
     def test_linker_is_trainable(self):
         self.assertIsInstance(self.linker, TrainableComponent)

diff --git a/medcat-plugins/rawstring-tokenizer/README.md b/medcat-plugins/rawstring-tokenizer/README.md
@@ -0,0 +1,80 @@
+# MedCAT Rawstring Tokenizer
+
+A MedCAT plugin that provides a Rawstring tokenizer, essentially splitting on whitespace characters (" ", "\n", "\t") only.
+
+## Overview
+
+This plugin replaces MedCAT's default tokenizing components with a rawstring tokenizer, so the pipeline is no longer limited by requiring SpaCy representations to perform linking.
+
+## Requirements
+
+- **MedCAT**: 2.0+ ([PyPI](https://pypi.org/project/medcat/) | [GitHub](https://github.com/CogStack/MedCAT))
+- Python 3.10+
+
+## Installation
+
+```bash
+pip install medcat-rawstring-tokenizer
+```
+
+## Quick Start
+
+### Replacing current tokenizer with a rawstring_tokenizer
+
+```python
+from medcat.cat import CAT
+from medcat_rawstring_tokenizer.tokenizer import RawstringTokenizer
+from medcat.tokenizing.tokenizers import register_tokenizer
+
+MODEL_PACK_PATH = ".."
+TARGET_FOLDER = ".."
+TARGET_PACK_NAME = ".."
+TOKENIZER_NAME = "rawstring_tokenizer"
+
+# The custom tokenizer must be registered before we rebuild the pipeline.
+register_tokenizer(TOKENIZER_NAME, RawstringTokenizer)
+
+cat = CAT.load_model_pack(MODEL_PACK_PATH)
+print("Tokenizer provider before:", cat.config.general.nlp.provider)
+
+# Switch tokenizer provider in config, then recreate pipeline to apply it.
+cat.config.general.nlp.provider = TOKENIZER_NAME
+
+cat.config.components.addons.clear()
+cat._recreate_pipe()
+
+print("Tokenizer provider after:", cat.config.general.nlp.provider)
+
+cat.save_model_pack(
+    target_folder=TARGET_FOLDER,
+    pack_name=TARGET_PACK_NAME,
+    add_hash_to_pack_name=False,
+    make_archive=False,
+)
+print("Saved model pack to:", f"{TARGET_FOLDER.rstrip('/')}/{TARGET_PACK_NAME}")
+```
+
+## How It Works
+
+### Component Registration
+
+Register the tokenizer by name before trying to add it to the pipeline. If you are loading a model pack that uses a rawstring tokenizer, register the tokenizer before loading.
+
+### Embedding Generation
+
+## Limitations
+
+- Can NOT be used with the default `context_based_linker`, as that linker uses spaCy tokens and spaCy embeddings for linking, which are not available with this tokenizer.
+
+## Citation
+
+If you use this plugin, please cite MedCAT:
+
+```bibtex
+@article{medcat2021,
+    title={Medical Concept Annotation Tool (MedCAT)},
+    author={Kraljevic, Zeljko and et al.},
+    journal={arXiv preprint arXiv:2010.01165},
+    year={2021}
+}
+```