CogStack · mart-r · Nov 7, 2025 · Nov 7, 2025 · Nov 7, 2025 · Nov 7, 2025
diff --git a/medcat-v2/paper/data/supervised/MDACE/raw/README.md b/medcat-v2/paper/data/supervised/MDACE/raw/README.md
@@ -0,0 +1,18 @@
+First we download the MDACE dataset and prepare it with MIMIC-IV as per instructions:
+https://github.com/3mcloud/MDACE
+
+Then, we need to convert the data to a format MedCAT can understand using:
+```bash
+python convert_to_mct_export.py  # no need for arguments if in this folder
+```
+
+However, the resulting export still only contains ICD-10 codes,
+whereas the models we are comparing against use SNOMED.
+
+So we then need to convert to SNOMED by doing:
+```bash
+python map_from_icd_to_snomed.py <model_pack_path> ../icd10_convert.json ../mct_export_with_candidates.json
+```
+
+This will create a trainer export that has multiple CUIs as options for each annotation.
+That is because an ICD-10 code can map to multiple different SNOMED concepts and there is no automated way to create a one-to-one mapping.
diff --git a/medcat-v2/paper/data/supervised/MDACE/raw/convert_to_mct_export.py b/medcat-v2/paper/data/supervised/MDACE/raw/convert_to_mct_export.py
@@ -0,0 +1,87 @@
+import json
+import os
+import sys
+from datetime import datetime
+from typing import Iterator
+
+from medcat.data.mctexport import (
+    MedCATTrainerExport, MedCATTrainerExportProject,
+    MedCATTrainerExportDocument, MedCATTrainerExportAnnotation)
+from medcat.data.mctexport import count_all_annotations, count_all_docs
+
+# Default directory that do_conversion() searches for input JSON files.
+DEFAULT_INPUT_DIR = "with_text/gold"
+# Default path that do_conversion() writes the combined export to.
+DEFAULT_OUTPUT_PATH = "../icd10_convert.json"
+
+
+def get_all_jsons(input_dir: str) -> Iterator[str]:
+    for fn in os.listdir(input_dir):
+        path = os.path.join(input_dir, fn)
+        if os.path.isdir(path):
+            yield from get_all_jsons(path)
+        elif path.endswith(".json"):
+            yield path
+
+
+def do_conversion(
+        input_dir: str = DEFAULT_INPUT_DIR,
+        output_file: str = DEFAULT_OUTPUT_PATH):
+    mod_time = datetime.now().isoformat()
+    all_out: MedCATTrainerExport = {
+        "projects": []
+    }
+
+    for path in get_all_jsons(input_dir):
+        if not path.endswith(".json"):
+            continue
+        with open(path) as f:
+            in_data = json.load(f)
+        documents: list[MedCATTrainerExportDocument] = []
+        proj_id = in_data["hadm_id"]
+        proj_name = f'MDACE_{proj_id}'
+        project: MedCATTrainerExportProject = {
+            "documents": documents,
+            "name": proj_name,
+            "id": proj_id,
+            "cuis": "",
+            "tuis": "",
+        }
+        all_out["projects"].append(project)
+
+        in_notes = in_data["notes"]  # guess name
+        for in_doc in in_notes:
+            doc_id = in_doc["note_id"]
+            doc_name = f'{in_doc["description"]}_{doc_id}'
+            anns: list[MedCATTrainerExportAnnotation] = []
+            documents.append(
+                {
+                    "name": doc_name,
+                    "id": doc_id,
+                    "last_modified": mod_time,
+                    "text": in_doc["text"],
+                    "annotations": anns,
+                }
+            )
+
+            for ann_num, ann in enumerate(in_doc["annotations"]):
+                anns.append(
+                    {
+                        "start": ann["begin"],
+                        "end": ann["end"],
+                        # NOTE: this is currently in ICD
+                        "cui": ann["code"],
+                        "value": ann["covered_text"],
+                        "id": f"{proj_name}_{doc_name}_{ann_num}",
+                        "meta_anns": [],
+                        "validated": True,
+                    }
+                )
+    print("GOT", len(all_out["projects"]), "projects",
+          "with", count_all_annotations(all_out), "annotations",
+          "across", count_all_docs(all_out), "documents")
+
+    with open(output_file, "w") as of:
+        json.dump(all_out, of, indent=2)
+
+
+if __name__ == "__main__":
+    # Command-line arguments are passed on positionally, so they replace
+    # the defaults of input_dir and output_file (in that order).
+    do_conversion(*sys.argv[1:])
diff --git a/medcat-v2/paper/data/supervised/MDACE/raw/map_from_icd_to_snomed.py b/medcat-v2/paper/data/supervised/MDACE/raw/map_from_icd_to_snomed.py
@@ -0,0 +1,104 @@
+import sys
+import json
+from collections import defaultdict
+
+from medcat.cat import CAT
+from medcat.data.mctexport import (
+    MedCATTrainerExport, MedCATTrainerExportAnnotation,
+    count_all_annotations, count_all_docs)
+
+
+def load_export(path: str) -> MedCATTrainerExport:
+    with open(path) as f:
+        return json.load(f)
+
+
+def icd2snomed(cat: CAT) -> dict[str, list[str]]:
+    code2snomed: dict[str, list[str]] = defaultdict(list)
+    cui2icd10 = cat.cdb.addl_info["cui2icd10"]
+    for cui_info in cat.cdb.cui2info.values():
+        cui = cui_info["cui"]
+        for icd10 in cui2icd10.get(cui, []):
+            code2snomed[icd10].append(cui)
+    print("GOT", len(code2snomed), "ICD codes")
+    print("Mapped to", sum(len(v) for v in code2snomed.values()),
+          "total Snomed CUIs")
+    return code2snomed
+
+
+def pick_concept(cat: CAT,
+                 mapper: dict[str, list[str]],
+                 ann: MedCATTrainerExportAnnotation) -> str | None:
+    # NOTE: I could try and select 1 - the best
+    #       but there isn't really a good way to do that.
+    #       Instead, we'll use all as candidates
+    return mapper.get(ann["cui"])
+
+
+def convert_export(
+        cat: CAT, export: MedCATTrainerExport
+        ) -> MedCATTrainerExport:
+    mapper = icd2snomed(cat)
+    return {
+        "projects": [
+            {
+                "id": proj["id"],
+                "name": proj["name"],
+                "cuis": proj["cuis"],
+                "tuis": proj["tuis"],
+                "documents": docs
+            }
+            for proj in export["projects"]
+            if (docs := [
+                {
+                    "id": doc["id"],
+                    "name": doc["name"],
+                    "last_modified": doc["last_modified"],
+                    "text": doc["text"],
+                    "annotations": anns
+                } for doc in proj["documents"]
+                if (anns := [
+                    {
+                        "id": ann["id"],
+                        "start": ann["start"],
+                        "end": ann["end"],
+                        "value": ann["value"],
+                        "cui": mapped_cui,
+                        "meta_anns": ann["meta_anns"],
+                        "validated": ann["validated"]
+                    } for ann in doc["annotations"]
+                    if (mapped_cui := pick_concept(cat, mapper, ann))
+                    ])
+            ])
+        ]
+    }
+
+
+def main(model_pack_path: str,
+         icd10_export_path: str,
+         final_export_path: str):
+    print("Loading model pack", model_pack_path)
+    cat = CAT.load_model_pack(model_pack_path)
+    print("Loading export")
+    export = load_export(icd10_export_path)
+    print("Initial import has", count_all_docs(export), "docs",
+          "and", count_all_annotations(export), "anns within",
+          len(export["projects"]), "projects")
+    print("Converting...")
+    converted = convert_export(cat, export)
+    print("CONVERTED export HAS", count_all_docs(converted), "docs",
+          "and", count_all_annotations(converted), "anns within",
+          len(converted["projects"]), "projects")
+    from medcat.data.mctexport import iter_anns
+    lens = []
+    for _, _, ann in iter_anns(converted):
+        lens.append(len(ann["cui"]) if isinstance(ann["cui"], list) else 1)
+    print("Total", len(lens), "annotations with", sum(lens) / len(lens),
+          "values on average")
+    print("Saving to", final_export_path)
+    with open(final_export_path, 'w') as f:
+        json.dump(converted, f)
+
+
+if __name__ == "__main__":
+    # Expects exactly three command-line arguments: model pack path,
+    # input export path and output export path (in that order).
+    main(*sys.argv[1:])
diff --git a/medcat-v2/paper/data/supervised/cometa/raw/README.md b/medcat-v2/paper/data/supervised/cometa/raw/README.md
@@ -0,0 +1,7 @@
+First, we need to download the dataset:
+https://metatext.io/datasets/cometa
+
+Then we need to convert to a format MedCAT understands:
+```bash
+python conversion/converter.py chv.csv ../mct_export.json
+```
diff --git a/medcat-v2/paper/data/supervised/cometa/raw/conversion/converter.py b/medcat-v2/paper/data/supervised/cometa/raw/conversion/converter.py
@@ -0,0 +1,115 @@
+from sys import argv
+import json
+import os.path
+from datetime import datetime
+
+from tqdm import tqdm
+import pandas as pd
+
+from medcat.data.mctexport import (
+    MedCATTrainerExport, MedCATTrainerExportProject,
+    MedCATTrainerExportAnnotation)
+from medcat.data.mctexport import count_all_docs, count_all_annotations
+
+
+# Presumably the column headers of the input sheet.
+# NOTE(review): COLS is not referenced in the visible part of this file.
+COLS = ['Term', 'General SNOMED Label', 'General SNOMED ID',
+        'Specific SNOMED Label', 'Specific SNOMED ID', 'Example',
+        'Example Link', 'Origin_Sheet']
+# Column holding the term that is searched for in the example text
+COL4VALUE = "Term"
+# Column holding the value stored as the annotation's (and project's) CUI
+COL4CUI = "Specific SNOMED ID"
+# Column holding the text that becomes the document text
+COL4TEXT = "Example"
+# Column holding the link that is embedded in the document name
+COL4LINK = "Example Link"
+
+# November 2020
+# Fixed timestamp used as ``last_modified`` of every exported document
+LAST_MODIFIED = datetime(year=2020, month=11, day=1).isoformat()
+
+
+def find_annotations(value: str, text: str, cui: str
+                     ) -> list[MedCATTrainerExportAnnotation]:
+    value = value.lower()
+    orig_text = text
+    text = text.lower()
+    if value not in text:
+        raise ValueError(f"{repr(value)} not in text ({repr(text)})")
+    cur_start = 0
+    anns: list[MedCATTrainerExportAnnotation] = []
+    while (cur_index := text.find(value, cur_start)) >= 0:
+        start = cur_index
+        end = cur_index + len(value)
+        anns.append(
+            {
+                "cui": str(cui),
+                "value": orig_text[start: end],
+                "start": start,
+                "end": end,
+            }
+        )
+        cur_start = end
+        if len(anns) > 100:
+            raise KeyError(
+                f"Too many annotations!, {start}, {end}, for {value}. "
+                f"cur start at {cur_start}")
+    return anns
+
+
+def do_conversion(df: pd.DataFrame, proj_base_id: str, proj_base_name: str
+                  ) -> MedCATTrainerExport:
+    projects: list[MedCATTrainerExportProject] = []
+    for line_num, (index, line) in enumerate(tqdm(df.iterrows(),
+                                                  total=len(df.index))):
+        text = line[COL4TEXT]
+        cui = line[COL4CUI]
+        try:
+            anns = find_annotations(
+                line[COL4VALUE], text, cui)
+        except ValueError as e:
+            print("LINE", line_num, "at index", index,
+                  "Failed to load(VE):", str(e))
+            continue
+        except AttributeError as e:
+            print("LINE", line_num, "at index", index,
+                  "Failed to load(AE):", str(e))
+            continue
+        proj_id = proj_base_id + str(index)
+        proj_name = proj_base_name + "@" + str(index)
+        # NOTE: each document is a project so that I can use per-project
+        #       filters and thus only focus on the CUI in question and not
+        #       the other terms in the text
+        projects.append({
+            "documents": [
+                {
+                    "text": text,
+                    "annotations": anns,
+                    "id": str(index),
+                    "name": f"LINK: {line[COL4LINK]}; ID: {index}",
+                    "last_modified": LAST_MODIFIED
+                }
+            ],
+            "id": proj_id,
+            "name": proj_name,
+            "cuis": f'{cui}',
+            "tuis": '',
+        })
+    return {"projects": projects}
+
+
+def main(file_path: str,
+         export_path: str,
+         # TODO: options
+         ):
+    df = pd.read_csv(file_path, sep='\t', index_col=0, header=0).sort_index()
+    proj_name = export_path.split(os.path.sep + "cometa" + os.path.sep, 1)[-1]
+    proj_id = ".".join(proj_name.split(os.path.sep)[-2:]).replace(".csv", "")
+    print("Giving 'project' a name of", repr(proj_name))
+    print("And setting ID to", proj_id)
+    mct_export = do_conversion(df, proj_id, proj_name)
+    print("Got", len(mct_export["projects"]), "projects with a total of",
+          count_all_docs(mct_export), "documents and a total of",
+          count_all_annotations(mct_export), "annotations")
+    print("Saving to", repr(export_path))
+    with open(export_path, 'w') as f:
+        json.dump(mct_export, f)
+
+
+if __name__ == "__main__":
+    # Expects exactly two command-line arguments: the input file path and
+    # the export path (in that order).
+    main(*argv[1:])
diff --git a/medcat-v2/paper/data/supervised/distemist/raw/README.md b/medcat-v2/paper/data/supervised/distemist/raw/README.md
@@ -0,0 +1,11 @@
+First we need to download and extract the distemist dataset:
+https://temu.bsc.es/distemist/distemist-linking/
+
+Subsequently, we convert to MedCAT supported format:
+```bash
+python convert_to_mct_export.py distemist_zenodo/multilingual_resources/training_text_files/en distemist_zenodo/multilingual_resources/en ../mct_export.json
+```
+
+NOTE:
+The underlying dataset (at least in some cases) links to multiple concepts per annotation.
+Because of that, an annotation in the output may also carry a set of concepts rather than a single one.