From fc0e9c725663fb97dd6f4f11e460af6176453005 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 11:09:57 +0000 Subject: [PATCH 001/111] Add initial script to get individiual results --- medcat-v2/paper/scripts/get_load_speed.py | 115 ++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 medcat-v2/paper/scripts/get_load_speed.py diff --git a/medcat-v2/paper/scripts/get_load_speed.py b/medcat-v2/paper/scripts/get_load_speed.py new file mode 100644 index 000000000..ccde8f2db --- /dev/null +++ b/medcat-v2/paper/scripts/get_load_speed.py @@ -0,0 +1,115 @@ + +import time +import cProfile +import pstats +import argparse +import logging +import io +OVERALL_START_TIME = time.perf_counter() +from medcat.cat import CAT # noqa + + +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser( + "get_load_speed.py" + ) + parser.add_argument("model_pack_path", + help="model_pack_path", + type=str) + parser.add_argument("--verbose", "-v", + help="Whether to run in verbose mode", + action="store_true") + parser.add_argument("--do-profiling", "-p", + help="Whether to run profiling on top of just timing", + action="store_true") + parser.add_argument("--num-in-profile", "--np", + help="The number of lines in the profile.", + type=int, default=20) + parser.add_argument("--startup", "-s", + help="Whether to use the startup as the start time. 
" + "This is useful when trying to include import times " + "as well - i.e real user experience", + action="store_true") + parser.add_argument("--warmup", "-w", + help="The number of warmup rounds", + type=int, default=1) + args = parser.parse_args() + took_time = perform_work( + args.model_pack_path, + warmup=args.warmup, + startup=args.startup, + verbose=args.verbose, + profiling=args.do_profiling, + lines_in_profile=args.num_in_profile + ) + print(took_time) + return took_time + + +def perform_work(model_pack_path: str, + warmup: int, + startup: bool, + verbose: bool, + profiling: bool, + lines_in_profile: int, + ) -> float: + sh = logging.StreamHandler() + logger.addHandler(sh) + if verbose: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.CRITICAL) + # NOTE: to make sure all the imports are done and so on + if warmup > 0 and startup: + raise ValueError("Timing warmed up from startup doesn't make sense") + logger.debug("Starting with wramp of %d repetations", warmup) + for cur_warmup in range(warmup): + logger.debug("Warmup number %d ...", cur_warmup) + load_once(model_pack_path, False, 0) + logger.info("Warmup done! 
Now loading!") + # NOTE: if doing startup, then counting from before + if startup: + logger.info("Using overall start time (before import)") + start_time = time.perf_counter() if not startup else OVERALL_START_TIME + load_once(model_pack_path, profiling, lines_in_profile) + took_time = time.perf_counter() - start_time + logger.info("Took a total of %ss", took_time) + # NOTE: print for any time output + # NOTE: no units for easy automation + return took_time + + +def _get_stats_str(profile: cProfile.Profile, lines_in_profile: int, + stat_type: str) -> str: + string_io = io.StringIO() + stats = pstats.Stats(profile, stream=string_io) + stats.sort_stats(stat_type).print_stats(lines_in_profile) + return string_io.getvalue() + + +def load_once(model_path: str, do_profiling: bool, + lines_in_profile: int): + if do_profiling: + profile = cProfile.Profile() + + profile.enable() + + CAT.load_model_pack(model_path) + + if do_profiling: + profile.disable() + + # NOTE: for logging + tot_stats = _get_stats_str(profile, lines_in_profile, "tottime") + logger.info("TOTtime for top %d", lines_in_profile) + logger.info(tot_stats) + cum_stats = _get_stats_str(profile, lines_in_profile, "cumtime") + logger.info("CUMtime for top %d", lines_in_profile) + logger.info(cum_stats) + + +if __name__ == "__main__": + took_time = main() From f138663680bdb9116765f6bef7bd530e348597cb Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 11:10:14 +0000 Subject: [PATCH 002/111] Add script to get overall results (startup, warm, cold) --- medcat-v2/paper/scripts/get_load_speed_all.py | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 medcat-v2/paper/scripts/get_load_speed_all.py diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py b/medcat-v2/paper/scripts/get_load_speed_all.py new file mode 100644 index 000000000..72dd6187e --- /dev/null +++ b/medcat-v2/paper/scripts/get_load_speed_all.py @@ -0,0 +1,97 @@ +from pydantic import BaseModel, ConfigDict +# import 
runpy +import subprocess +import os +import sys +import argparse +from pprint import pprint +from enum import Enum, auto + +import get_load_speed + + +class RunConfig(BaseModel): + repeats: int = 20 + # how many times to perform for warmup + warmup_count: int = 1 + + +class RunResults(BaseModel): + all_times: list[float] + mean: float + min: float + max: float + + model_config = ConfigDict(frozen=True) + + @classmethod + def from_times(cls, times: list[float]) -> "RunResults": + return cls( + all_times=times, + mean=sum(times) / len(times), + min=min(times), + max=max(times), + ) + + +class OverallResults(BaseModel): + startup: RunResults + cold: RunResults + warm: RunResults + + +class RunType(Enum): + STARTUP = auto() + COLD = auto() + WARM = auto() + + +def _single_experiment(model_path: str, + cnf: RunConfig, + run_type: RunType, + ) -> RunResults: + target_script = os.path.join( + os.path.dirname(__file__), get_load_speed.__name__ + ".py") + sys_argv = [sys.executable, target_script, model_path,] + if run_type is RunType.STARTUP: + sys_argv.extend(["-w", "0", "-s"]) + elif run_type is RunType.COLD: + sys_argv.extend(["-w", "0"]) + elif run_type is RunType.WARM: + sys_argv.extend(["-w", str(cnf.warmup_count)]) + all_took: list[float] = [] + for _ in range(cnf.repeats): + run_out = subprocess.run(sys_argv, capture_output=True) + all_took.append(float(run_out.stdout)) + return RunResults.from_times(all_took) + + +def do_experiment( + model_path: str, + cnf: RunConfig = RunConfig(), + ) -> OverallResults: + return OverallResults( + startup=_single_experiment( + model_path, cnf, RunType.STARTUP), + cold=_single_experiment( + model_path, cnf, RunType.COLD), + warm=_single_experiment( + model_path, cnf, RunType.WARM) + ) + + +def main(): + parser = argparse.ArgumentParser( + "get_load_speed_all" + ) + parser.add_argument("model_pack_path", + help="Model pack path", + type=str) + args = parser.parse_args() + results = do_experiment(args.model_pack_path, 
RunConfig(repeats=3)) + print("Overall:") + pprint(results.model_dump()) + + +if __name__ == "__main__": + main() From fe04f49c12142c25a6b22b2067763a37f4e73723 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 11:10:40 +0000 Subject: [PATCH 003/111] Fix default args --- medcat-v2/paper/scripts/get_load_speed_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py b/medcat-v2/paper/scripts/get_load_speed_all.py index 72dd6187e..50c822a65 100644 --- a/medcat-v2/paper/scripts/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/get_load_speed_all.py @@ -88,7 +88,7 @@ def main(): help="Model pack path", type=str) args = parser.parse_args() - results = do_experiment(args.model_pack_path, RunConfig(repeats=3)) + results = do_experiment(args.model_pack_path, RunConfig()) print("Overall:") pprint(results.model_dump()) From 7ce5e8d4c14d6e41320aad6db896282986841a08 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 13:37:30 +0000 Subject: [PATCH 004/111] Add master script for getting load speed for multiple models --- .../scripts/get_load_speed_for_multiple.sh | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 medcat-v2/paper/scripts/get_load_speed_for_multiple.sh diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh new file mode 100644 index 000000000..86db92b07 --- /dev/null +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# --- Input Validation --- +if (( $# == 0 )); then + echo "Usage: $0 ..." + exit 0 +fi + +if (( $# % 2 != 0 )); then + echo "Error: Arguments must be provided in pairs (name and path)." >&2 + exit 1 +fi + +echo "Starting pairwise argument processing..." 
+echo "-----------------------------------------" + +# The 'while' loop continues as long as there are arguments left ($# is non-zero) +while (( "$#" )); do + MODEL_NAME="$1" + MODEL_PATH="$2" + + echo "Model: '$MODEL_NAME'" + + python scripts/get_load_speed_all.py $MODEL_PATH + + echo "---" + + # Shift discards the first N arguments. + # We discard the two arguments we just processed ($1 and $2) + shift 2 +done + +echo "-----------------------------------------" +echo "Processing complete." \ No newline at end of file From 06a513f4525933e1377d5bc5902d6df46bf61ea9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 13:37:55 +0000 Subject: [PATCH 005/111] Add v1 and v2 (and my localy setup) specific scripts for getting load speeds --- .../paper/scripts/get_load_speed_for_multiple_v1.sh | 9 +++++++++ .../paper/scripts/get_load_speed_for_multiple_v2.sh | 9 +++++++++ 2 files changed, 18 insertions(+) create mode 100644 medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh create mode 100644 medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh new file mode 100644 index 000000000..38949ac2f --- /dev/null +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh @@ -0,0 +1,9 @@ +echo "Regular NER / 2023 model" +ner1="2023 NER (no MetaCAT)" +ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +ner2="2023 NER (w MetaCAT)" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_494c3717f637bb89.zip" +deid="n2c2 DeID" +deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/deid_medcat_n2c2_modelpack.zip" + +bash scripts/get_load_speed_for_multiple.sh "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" diff --git 
a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh new file mode 100644 index 000000000..7d62c8f7f --- /dev/null +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh @@ -0,0 +1,9 @@ +echo "Regular NER / 2023 model" +ner1="2023 NER (no MetaCAT)" +ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +ner2="2023 NER (w MetaCAT)" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" +deid="n2c2 DeID" +deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_deid_model_af31d2a9c5ccbe4d.zip.zip" + +bash scripts/get_load_speed_for_multiple.sh "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" From a519fcc143945141444326714338b8c8553bc94f Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 13:53:03 +0000 Subject: [PATCH 006/111] Avoid unknown run types --- medcat-v2/paper/scripts/get_load_speed_all.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py b/medcat-v2/paper/scripts/get_load_speed_all.py index 50c822a65..ffb3edbe1 100644 --- a/medcat-v2/paper/scripts/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/get_load_speed_all.py @@ -59,6 +59,8 @@ def _single_experiment(model_path: str, sys_argv.extend(["-w", "0"]) elif run_type is RunType.WARM: sys_argv.extend(["-w", str(cnf.warmup_count)]) + else: + raise ValueError("Unknown run type") all_took: list[float] = [] for _ in range(cnf.repeats): run_out = subprocess.run(sys_argv, capture_output=True) From a0df0e63787324a16b88d844a237137f7020d50b Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:22:40 +0000 Subject: [PATCH 007/111] Add option to specify number of repeats when doing all load experiments for a single model --- 
medcat-v2/paper/scripts/get_load_speed_all.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py b/medcat-v2/paper/scripts/get_load_speed_all.py index ffb3edbe1..6d85859b2 100644 --- a/medcat-v2/paper/scripts/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/get_load_speed_all.py @@ -89,8 +89,13 @@ def main(): parser.add_argument("model_pack_path", help="Model pack path", type=str) + parser.add_argument("--repeats", + help="Number of repeats to use", + type=int, default=20) args = parser.parse_args() - results = do_experiment(args.model_pack_path, RunConfig()) + results = do_experiment( + args.model_pack_path, + RunConfig(repeats=args.repeats,)) print("Overall:") pprint(results.model_dump()) From 98a49fd195a93dfbd38d2ab7d0db28323896cf4c Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:23:55 +0000 Subject: [PATCH 008/111] Move to a timeit based approach --- medcat-v2/paper/scripts/get_load_speed.py | 28 +++++++++++++++-------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/medcat-v2/paper/scripts/get_load_speed.py b/medcat-v2/paper/scripts/get_load_speed.py index ccde8f2db..0282fb1e9 100644 --- a/medcat-v2/paper/scripts/get_load_speed.py +++ b/medcat-v2/paper/scripts/get_load_speed.py @@ -5,6 +5,7 @@ import argparse import logging import io +import timeit OVERALL_START_TIME = time.perf_counter() from medcat.cat import CAT # noqa @@ -65,17 +66,24 @@ def perform_work(model_pack_path: str, # NOTE: to make sure all the imports are done and so on if warmup > 0 and startup: raise ValueError("Timing warmed up from startup doesn't make sense") - logger.debug("Starting with wramp of %d repetations", warmup) - for cur_warmup in range(warmup): - logger.debug("Warmup number %d ...", cur_warmup) - load_once(model_pack_path, False, 0) - logger.info("Warmup done! 
Now loading!") - # NOTE: if doing startup, then counting from before + start_time = time.perf_counter() + timed = timeit.repeat( + f""" +load_once("{model_pack_path}", False, 0) + """, + setup=f""" +from get_load_speed import load_once, logger +for cur_warmup in range({warmup}, {profiling}, {lines_in_profile}): + logger.debug("Warmup number %d ...", cur_warmup) + load_once("{model_pack_path}", False, 0) + """, + repeat=1, number=1 + ) + took_time = timed[0] if startup: - logger.info("Using overall start time (before import)") - start_time = time.perf_counter() if not startup else OVERALL_START_TIME - load_once(model_pack_path, profiling, lines_in_profile) - took_time = time.perf_counter() - start_time + logger.info("Adding startup time of %fs to account for imports", + start_time - OVERALL_START_TIME) + took_time += start_time - OVERALL_START_TIME logger.info("Took a total of %ss", took_time) # NOTE: print for any time output # NOTE: no units for easy automation From e294df42dce9815fd89b7c7fea7a3de3688dfeb0 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:24:45 +0000 Subject: [PATCH 009/111] Add output folder --- medcat-v2/paper/out/.keep | 0 medcat-v2/paper/out/load_speed/.keep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 medcat-v2/paper/out/.keep create mode 100644 medcat-v2/paper/out/load_speed/.keep diff --git a/medcat-v2/paper/out/.keep b/medcat-v2/paper/out/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/medcat-v2/paper/out/load_speed/.keep b/medcat-v2/paper/out/load_speed/.keep new file mode 100644 index 000000000..e69de29bb From 074e7a178bd3de4e8790c96bcb5ec3e905b26ca5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:27:29 +0000 Subject: [PATCH 010/111] Add automatic json output --- medcat-v2/paper/scripts/get_load_speed_all.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py 
b/medcat-v2/paper/scripts/get_load_speed_all.py index 6d85859b2..0f8165930 100644 --- a/medcat-v2/paper/scripts/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/get_load_speed_all.py @@ -6,6 +6,7 @@ import argparse from pprint import pprint from enum import Enum, auto +import json import get_load_speed @@ -92,12 +93,21 @@ def main(): parser.add_argument("--repeats", help="Number of repeats to use", type=int, default=20) + parser.add_argument("--save-json", "-j", + help="The json path to save the results to", + type=float, default=None) args = parser.parse_args() results = do_experiment( args.model_pack_path, RunConfig(repeats=args.repeats,)) - print("Overall:") - pprint(results.model_dump()) + dumped = results.model_dump() + if args.save_json: + print("Saving to", args.save_json) + with open(args.save_json, 'w') as f: + json.dump(dumped, f) + else: + print("Overall:") + pprint(dumped) if __name__ == "__main__": From a80b7804f66a66db1cb280e0bd47b557d334fb27 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:45:40 +0000 Subject: [PATCH 011/111] Fix type of save json argument --- medcat-v2/paper/scripts/get_load_speed_all.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py b/medcat-v2/paper/scripts/get_load_speed_all.py index 0f8165930..63947ed56 100644 --- a/medcat-v2/paper/scripts/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/get_load_speed_all.py @@ -95,7 +95,7 @@ def main(): type=int, default=20) parser.add_argument("--save-json", "-j", help="The json path to save the results to", - type=float, default=None) + type=str, default=None) args = parser.parse_args() results = do_experiment( args.model_pack_path, From 2b05b702ac680f0e4f75e00b49dc8bbe5da3f5d3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:50:37 +0000 Subject: [PATCH 012/111] Always save results to a file when doing in bulk --- medcat-v2/paper/scripts/get_load_speed_for_multiple.sh | 10 ++++++++-- 
.../paper/scripts/get_load_speed_for_multiple_v1.sh | 4 +++- .../paper/scripts/get_load_speed_for_multiple_v2.sh | 4 +++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh index 86db92b07..6dc57975f 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh @@ -1,8 +1,11 @@ #!/bin/bash +SAVE_PREFIX=$1 +shift 1 + # --- Input Validation --- if (( $# == 0 )); then - echo "Usage: $0 ..." + echo "Usage: $0 ..." exit 0 fi @@ -20,8 +23,11 @@ while (( "$#" )); do MODEL_PATH="$2" echo "Model: '$MODEL_NAME'" + + SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" + echo "Will save to" $SAVE_PATH - python scripts/get_load_speed_all.py $MODEL_PATH + python scripts/get_load_speed_all.py $MODEL_PATH --save-json $SAVE_PATH echo "---" diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh index 38949ac2f..a0199f09c 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh @@ -6,4 +6,6 @@ ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20 deid="n2c2 DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/deid_medcat_n2c2_modelpack.zip" -bash scripts/get_load_speed_for_multiple.sh "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" +out_prefix="out/load_speed/v1" + +bash scripts/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh index 7d62c8f7f..22bea4a59 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh +++ 
b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh @@ -6,4 +6,6 @@ ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/med deid="n2c2 DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_deid_model_af31d2a9c5ccbe4d.zip.zip" -bash scripts/get_load_speed_for_multiple.sh "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" +out_prefix="out/load_speed/v2" + +bash scripts/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" From 39a47b5ff5a76c51d6e7e270a255740d4e5a6dbc Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:52:12 +0000 Subject: [PATCH 013/111] Allow overwriting output prefix if/when required --- medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh | 5 +++++ medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh index a0199f09c..b9b87e8f1 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh @@ -7,5 +7,10 @@ deid="n2c2 DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/deid_medcat_n2c2_modelpack.zip" out_prefix="out/load_speed/v1" +if [ -z "$1" ] + then + out_prefix=$1 + echo "Overwriting out prefix with: "$1 +fi bash scripts/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh index 22bea4a59..fed8f4a85 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh @@ -7,5 +7,11 @@ 
deid="n2c2 DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_deid_model_af31d2a9c5ccbe4d.zip.zip" out_prefix="out/load_speed/v2" +if [ -z "$1" ] + then + out_prefix=$1 + echo "Overwriting out prefix with: "$1 +fi + bash scripts/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" From 3ac3d0d34bcdf6725dbd4c49b49f426bfd741eec Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 15:55:55 +0000 Subject: [PATCH 014/111] Separated speed scripts from (future) performance ones --- medcat-v2/paper/scripts/{ => speed}/get_load_speed.py | 0 medcat-v2/paper/scripts/{ => speed}/get_load_speed_all.py | 0 .../paper/scripts/{ => speed}/get_load_speed_for_multiple.sh | 2 +- .../paper/scripts/{ => speed}/get_load_speed_for_multiple_v1.sh | 2 +- .../paper/scripts/{ => speed}/get_load_speed_for_multiple_v2.sh | 2 +- 5 files changed, 3 insertions(+), 3 deletions(-) rename medcat-v2/paper/scripts/{ => speed}/get_load_speed.py (100%) rename medcat-v2/paper/scripts/{ => speed}/get_load_speed_all.py (100%) rename medcat-v2/paper/scripts/{ => speed}/get_load_speed_for_multiple.sh (91%) rename medcat-v2/paper/scripts/{ => speed}/get_load_speed_for_multiple_v1.sh (79%) rename medcat-v2/paper/scripts/{ => speed}/get_load_speed_for_multiple_v2.sh (80%) diff --git a/medcat-v2/paper/scripts/get_load_speed.py b/medcat-v2/paper/scripts/speed/get_load_speed.py similarity index 100% rename from medcat-v2/paper/scripts/get_load_speed.py rename to medcat-v2/paper/scripts/speed/get_load_speed.py diff --git a/medcat-v2/paper/scripts/get_load_speed_all.py b/medcat-v2/paper/scripts/speed/get_load_speed_all.py similarity index 100% rename from medcat-v2/paper/scripts/get_load_speed_all.py rename to medcat-v2/paper/scripts/speed/get_load_speed_all.py diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh 
b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh similarity index 91% rename from medcat-v2/paper/scripts/get_load_speed_for_multiple.sh rename to medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh index 6dc57975f..d0f13ee81 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh @@ -27,7 +27,7 @@ while (( "$#" )); do SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" echo "Will save to" $SAVE_PATH - python scripts/get_load_speed_all.py $MODEL_PATH --save-json $SAVE_PATH + python scripts/speed/get_load_speed_all.py $MODEL_PATH --save-json $SAVE_PATH echo "---" diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh similarity index 79% rename from medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh rename to medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh index b9b87e8f1..39d7f4df6 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh @@ -13,4 +13,4 @@ if [ -z "$1" ] echo "Overwriting out prefix with: "$1 fi -bash scripts/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" +bash scripts/speed/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" diff --git a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh similarity index 80% rename from medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh rename to medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh index fed8f4a85..90734e643 100644 --- a/medcat-v2/paper/scripts/get_load_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh @@ -14,4 +14,4 @@ if 
[ -z "$1" ] fi -bash scripts/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" +bash scripts/speed/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" "$ner2" "$ner_model_path_w_mc" "$deid" "$deid_model_path" From 2652fd57d398e34a9eb0152f292ffce630915b8f Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 16:44:51 +0000 Subject: [PATCH 015/111] Move a bunch of code to a separate module --- medcat-v2/paper/scripts/speed/common.py | 77 +++++++++++++++++ .../paper/scripts/speed/get_load_speed.py | 82 +------------------ 2 files changed, 81 insertions(+), 78 deletions(-) create mode 100644 medcat-v2/paper/scripts/speed/common.py diff --git a/medcat-v2/paper/scripts/speed/common.py b/medcat-v2/paper/scripts/speed/common.py new file mode 100644 index 000000000..bb710b436 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/common.py @@ -0,0 +1,77 @@ +import logging +import timeit +from contextlib import contextmanager +import cProfile +import pstats +import io + + +logger = logging.getLogger(__name__) + + +def _get_stats_str(profile: cProfile.Profile, lines_in_profile: int, + stat_type: str) -> str: + string_io = io.StringIO() + stats = pstats.Stats(profile, stream=string_io) + stats.sort_stats(stat_type).print_stats(lines_in_profile) + return string_io.getvalue() + + +@contextmanager +def show_profile(do_profiling: bool, lines_in_profile: int): + if do_profiling: + profile = cProfile.Profile() + + profile.enable() + + yield + + if do_profiling: + profile.disable() + + # NOTE: for logging + tot_stats = _get_stats_str(profile, lines_in_profile, "tottime") + logger.info("TOTtime for top %d", lines_in_profile) + logger.info(tot_stats) + cum_stats = _get_stats_str(profile, lines_in_profile, "cumtime") + logger.info("CUMtime for top %d", lines_in_profile) + logger.info(cum_stats) + + +def perform_work(setup: list[str], + worker: list[str], + warmup: int, + startup: 
bool, + verbose: bool, + profiling: bool, + lines_in_profile: int, + ) -> float: + sh = logging.StreamHandler() + logger.addHandler(sh) + if verbose: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.CRITICAL) + # NOTE: to make sure all the imports are done and so on + if warmup > 0 and startup: + raise ValueError("Timing warmed up from startup doesn't make sense") + # do warmup if needed + for cur_warmup in range(warmup): + logger.info("Doing warmup step %d", cur_warmup) + exec("\n".join(setup + worker)) + if startup: + logger.warning("For startup, will include warmup in timed work") + worker = setup + worker + setup = [] + with show_profile(do_profiling=profiling, + lines_in_profile=lines_in_profile): + timed = timeit.repeat( + "\n".join(worker), + setup="\n".join(setup), + repeat=1, number=1 + ) + took_time = timed[0] + logger.info("Took a total of %ss", took_time) + # NOTE: print for any time output + # NOTE: no units for easy automation + return took_time diff --git a/medcat-v2/paper/scripts/speed/get_load_speed.py b/medcat-v2/paper/scripts/speed/get_load_speed.py index 0282fb1e9..6f382d2df 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed.py +++ b/medcat-v2/paper/scripts/speed/get_load_speed.py @@ -1,13 +1,7 @@ - -import time -import cProfile -import pstats import argparse import logging -import io -import timeit -OVERALL_START_TIME = time.perf_counter() -from medcat.cat import CAT # noqa + +from common import perform_work logger = logging.getLogger(__name__) @@ -39,7 +33,8 @@ def main(): type=int, default=1) args = parser.parse_args() took_time = perform_work( - args.model_pack_path, + setup=["from medcat.cat import CAT",], + worker=[f"""CAT.load_model_pack("{args.model_pack_path}")"""], warmup=args.warmup, startup=args.startup, verbose=args.verbose, @@ -50,74 +45,5 @@ def main(): return took_time -def perform_work(model_pack_path: str, - warmup: int, - startup: bool, - verbose: bool, - profiling: bool, - lines_in_profile: int, - ) 
-> float: - sh = logging.StreamHandler() - logger.addHandler(sh) - if verbose: - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(logging.CRITICAL) - # NOTE: to make sure all the imports are done and so on - if warmup > 0 and startup: - raise ValueError("Timing warmed up from startup doesn't make sense") - start_time = time.perf_counter() - timed = timeit.repeat( - f""" -load_once("{model_pack_path}", False, 0) - """, - setup=f""" -from get_load_speed import load_once, logger -for cur_warmup in range({warmup}, {profiling}, {lines_in_profile}): - logger.debug("Warmup number %d ...", cur_warmup) - load_once("{model_pack_path}", False, 0) - """, - repeat=1, number=1 - ) - took_time = timed[0] - if startup: - logger.info("Adding startup time of %fs to account for imports", - start_time - OVERALL_START_TIME) - took_time += start_time - OVERALL_START_TIME - logger.info("Took a total of %ss", took_time) - # NOTE: print for any time output - # NOTE: no units for easy automation - return took_time - - -def _get_stats_str(profile: cProfile.Profile, lines_in_profile: int, - stat_type: str) -> str: - string_io = io.StringIO() - stats = pstats.Stats(profile, stream=string_io) - stats.sort_stats(stat_type).print_stats(lines_in_profile) - return string_io.getvalue() - - -def load_once(model_path: str, do_profiling: bool, - lines_in_profile: int): - if do_profiling: - profile = cProfile.Profile() - - profile.enable() - - CAT.load_model_pack(model_path) - - if do_profiling: - profile.disable() - - # NOTE: for logging - tot_stats = _get_stats_str(profile, lines_in_profile, "tottime") - logger.info("TOTtime for top %d", lines_in_profile) - logger.info(tot_stats) - cum_stats = _get_stats_str(profile, lines_in_profile, "cumtime") - logger.info("CUMtime for top %d", lines_in_profile) - logger.info(cum_stats) - - if __name__ == "__main__": took_time = main() From d70054d46d9e3d9edd4502f71d5622071ca5a3b5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 16:45:32 +0000 
Subject: [PATCH 016/111] Allow for a more general error handling when running subprocesses --- medcat-v2/paper/scripts/speed/get_load_speed_all.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_all.py b/medcat-v2/paper/scripts/speed/get_load_speed_all.py index 63947ed56..ec1540def 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/speed/get_load_speed_all.py @@ -65,7 +65,17 @@ def _single_experiment(model_path: str, all_took: list[float] = [] for _ in range(cnf.repeats): run_out = subprocess.run(sys_argv, capture_output=True) - all_took.append(float(run_out.stdout)) + try: + took_time = float(run_out.stdout) + except ValueError as err: + raise ValueError( + f"Unable to get run time from for {run_type}:\n" + f"'{run_out.stdout.decode()}'\n" + f"\nError output was:\n" + f"{run_out.stderr.decode()}\n" + f"\nWas running the command:\n {' '.join(sys_argv)}" + ) from err + all_took.append(took_time) return RunResults.from_times(all_took) From 7f2ff90afe79654dfecc5df3b454d2cac78008ed Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 16:54:28 +0000 Subject: [PATCH 017/111] Add a few overarching scripts to run all the speed scripts at once --- .../paper/scripts/speed/run_all_speed_scripts.sh | 15 +++++++++++++++ .../speed/run_all_speed_scripts_for_version.sh | 12 ++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/run_all_speed_scripts.sh create mode 100644 medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh diff --git a/medcat-v2/paper/scripts/speed/run_all_speed_scripts.sh b/medcat-v2/paper/scripts/speed/run_all_speed_scripts.sh new file mode 100644 index 000000000..6211360bc --- /dev/null +++ b/medcat-v2/paper/scripts/speed/run_all_speed_scripts.sh @@ -0,0 +1,15 @@ +echo "*****************" +echo "running v1 stuff" +echo "*****************" + +source .venv_v1/bin/activate + +bash 
scripts/speed/run_all_speed_scripts_for_version.sh v1 + +echo "*****************" +echo "running v2 stuff" +echo "*****************" + +source ../.venv312/bin/activate + +bash scripts/speed/run_all_speed_scripts_for_version.sh v2 diff --git a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh new file mode 100644 index 000000000..b167db2a8 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh @@ -0,0 +1,12 @@ +ver=$1 +echo "Running for version: $ver" +python --version +python -m pip show medcat | grep "Version" + + +for fn in `ls scripts/speed/*_v1.sh`; +do + echo "Running script:" + echo $fn + # bash $fn +done From 45f03a6896f0f6da691e86851d6dca182365d821 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 17:22:07 +0000 Subject: [PATCH 018/111] Centralise combining of experiments --- .../paper/scripts/speed/common4subproc.py | 85 +++++++++++++++ .../paper/scripts/speed/get_load_speed_all.py | 102 +++--------------- 2 files changed, 97 insertions(+), 90 deletions(-) create mode 100644 medcat-v2/paper/scripts/speed/common4subproc.py diff --git a/medcat-v2/paper/scripts/speed/common4subproc.py b/medcat-v2/paper/scripts/speed/common4subproc.py new file mode 100644 index 000000000..ad7062cb8 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/common4subproc.py @@ -0,0 +1,85 @@ +import sys +from enum import Enum, auto +from pydantic import BaseModel, ConfigDict +import subprocess + + +class RunConfig(BaseModel): + repeats: int = 20 + # how many times to perform for warmup + warmup_count: int = 1 + + +class RunResults(BaseModel): + all_times: list[float] + mean: float + min: float + max: float + + model_config = ConfigDict(frozen=True) + + @classmethod + def from_times(cls, times: list[float]) -> "RunResults": + return cls( + all_times=times, + mean=sum(times) / len(times), + min=min(times), + max=max(times), + ) + + +class 
OverallResults(BaseModel): + startup: RunResults + cold: RunResults + warm: RunResults + + +class RunType(Enum): + STARTUP = auto() + COLD = auto() + WARM = auto() + + +def _single_experiment(target_script: str, + target_args: list[str], + cnf: RunConfig, + run_type: RunType, + run_type_map: dict[RunType, list[str]], + ) -> RunResults: + sys_argv = [sys.executable, target_script,] + target_args + if run_type in run_type_map: + sys_argv += run_type_map[run_type] + all_took: list[float] = [] + for _ in range(cnf.repeats): + run_out = subprocess.run(sys_argv, capture_output=True) + try: + took_time = float(run_out.stdout) + except ValueError as err: + raise ValueError( + f"Unable to get run time from for {run_type}:\n" + f"'{run_out.stdout.decode()}'\n" + f"\nError output was:\n" + f"{run_out.stderr.decode()}\n" + f"\nWas running the command:\n {' '.join(sys_argv)}" + ) from err + all_took.append(took_time) + return RunResults.from_times(all_took) + + +def do_experiment( + target_script: str, + target_args: list[str], + run_type_map: dict[RunType, list[str]], + cnf: RunConfig = RunConfig(), + ) -> OverallResults: + return OverallResults( + startup=_single_experiment( + target_script, target_args, cnf, RunType.STARTUP, + run_type_map=run_type_map), + cold=_single_experiment( + target_script, target_args, cnf, RunType.COLD, + run_type_map=run_type_map), + warm=_single_experiment( + target_script, target_args, cnf, RunType.WARM, + run_type_map=run_type_map) + ) diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_all.py b/medcat-v2/paper/scripts/speed/get_load_speed_all.py index ec1540def..f7de7a4f8 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/speed/get_load_speed_all.py @@ -1,96 +1,10 @@ -from pydantic import BaseModel, ConfigDict -# import runpy -import subprocess -import os -import sys import argparse from pprint import pprint -from enum import Enum, auto import json +import os import get_load_speed - - -class 
RunConfig(BaseModel): - repeats: int = 20 - # how many times to perform for warmup - warmup_count: int = 1 - - -class RunResults(BaseModel): - all_times: list[float] - mean: float - min: float - max: float - - model_config = ConfigDict(frozen=True) - - @classmethod - def from_times(cls, times: list[float]) -> "RunResults": - return cls( - all_times=times, - mean=sum(times) / len(times), - min=min(times), - max=max(times), - ) - - -class OverallResults(BaseModel): - startup: RunResults - cold: RunResults - warm: RunResults - - -class RunType(Enum): - STARTUP = auto() - COLD = auto() - WARM = auto() - - -def _single_experiment(model_path: str, - cnf: RunConfig, - run_type: RunType, - ) -> RunResults: - target_script = os.path.join( - os.path.dirname(__file__), get_load_speed.__name__ + ".py") - sys_argv = [sys.executable, target_script, model_path,] - if run_type is RunType.STARTUP: - sys_argv.extend(["-w", "0", "-s"]) - elif run_type is RunType.COLD: - sys_argv.extend(["-w", "0"]) - elif run_type is RunType.WARM: - sys_argv.extend(["-w", str(cnf.warmup_count)]) - else: - raise ValueError("Unknown run type") - all_took: list[float] = [] - for _ in range(cnf.repeats): - run_out = subprocess.run(sys_argv, capture_output=True) - try: - took_time = float(run_out.stdout) - except ValueError as err: - raise ValueError( - f"Unable to get run time from for {run_type}:\n" - f"'{run_out.stdout.decode()}'\n" - f"\nError output was:\n" - f"{run_out.stderr.decode()}\n" - f"\nWas running the command:\n {' '.join(sys_argv)}" - ) from err - all_took.append(took_time) - return RunResults.from_times(all_took) - - -def do_experiment( - model_path: str, - cnf: RunConfig = RunConfig(), - ) -> OverallResults: - return OverallResults( - startup=_single_experiment( - model_path, cnf, RunType.STARTUP), - cold=_single_experiment( - model_path, cnf, RunType.COLD), - warm=_single_experiment( - model_path, cnf, RunType.WARM) - ) +from common4subproc import do_experiment, RunType, RunConfig def 
main(): @@ -107,9 +21,17 @@ def main(): help="The json path to save the results to", type=str, default=None) args = parser.parse_args() + target_script = os.path.join( + os.path.dirname(__file__), get_load_speed.__name__ + ".py") results = do_experiment( - args.model_pack_path, - RunConfig(repeats=args.repeats,)) + target_script, + [args.model_pack_path,], + run_type_map={ + RunType.STARTUP: ["-w", "0", "-s"], + RunType.COLD: ["-w", "0"], + RunType.WARM: [], + }, + cnf=RunConfig(repeats=args.repeats,)) dumped = results.model_dump() if args.save_json: print("Saving to", args.save_json) From 9d2ee4631e2936bb866a00f6231ce9afd410e7ed Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 17:31:42 +0000 Subject: [PATCH 019/111] Only produce results for run types that are required --- .../paper/scripts/speed/common4subproc.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/common4subproc.py b/medcat-v2/paper/scripts/speed/common4subproc.py index ad7062cb8..fe4031d99 100644 --- a/medcat-v2/paper/scripts/speed/common4subproc.py +++ b/medcat-v2/paper/scripts/speed/common4subproc.py @@ -28,12 +28,6 @@ def from_times(cls, times: list[float]) -> "RunResults": ) -class OverallResults(BaseModel): - startup: RunResults - cold: RunResults - warm: RunResults - - class RunType(Enum): STARTUP = auto() COLD = auto() @@ -71,15 +65,10 @@ def do_experiment( target_args: list[str], run_type_map: dict[RunType, list[str]], cnf: RunConfig = RunConfig(), - ) -> OverallResults: - return OverallResults( - startup=_single_experiment( - target_script, target_args, cnf, RunType.STARTUP, - run_type_map=run_type_map), - cold=_single_experiment( - target_script, target_args, cnf, RunType.COLD, - run_type_map=run_type_map), - warm=_single_experiment( - target_script, target_args, cnf, RunType.WARM, + ) -> dict[RunType, RunResults]: + return { + run_type: _single_experiment( + target_script, target_args, cnf, run_type, 
run_type_map=run_type_map) - ) + for run_type in run_type_map + } From 6c0164fa6392905f46faeb421150d4b5b487a866 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 17:32:25 +0000 Subject: [PATCH 020/111] Add uncommitted changes from last commit --- medcat-v2/paper/scripts/speed/get_load_speed_all.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_all.py b/medcat-v2/paper/scripts/speed/get_load_speed_all.py index f7de7a4f8..4abb5a17f 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/speed/get_load_speed_all.py @@ -32,7 +32,8 @@ def main(): RunType.WARM: [], }, cnf=RunConfig(repeats=args.repeats,)) - dumped = results.model_dump() + dumped = {run_type: model.model_dump() + for run_type, model in results.items()} if args.save_json: print("Saving to", args.save_json) with open(args.save_json, 'w') as f: From 2390d2bd613c7bd455b3648290a7f8b9a23257d8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 17:59:18 +0000 Subject: [PATCH 021/111] Add modules to get inference speed --- .../scripts/speed/get_inference_speed.py | 62 +++++++++++++++++++ .../scripts/speed/get_inference_speed_all.py | 49 +++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/get_inference_speed.py create mode 100644 medcat-v2/paper/scripts/speed/get_inference_speed_all.py diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed.py b/medcat-v2/paper/scripts/speed/get_inference_speed.py new file mode 100644 index 000000000..ca492b27d --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_inference_speed.py @@ -0,0 +1,62 @@ +from common import perform_work +from pydantic import BaseModel +import argparse + + +class InferenceSpeedConfig(BaseModel): + model_pack_path: str + inference_file_path: str + + +def get_speed(): + + pass + + +def main(): + parser = argparse.ArgumentParser( + "get_inference_speed.py" + ) + 
parser.add_argument("model_pack_path", + help="The path to the model pack", + type=str) + parser.add_argument("csv_path", + help="Path to the csv with (at least a) 'text' field", + type=str) + parser.add_argument("--verbose", "-v", + help="Whether to run in verbose mode", + action="store_true") + parser.add_argument("--do-profiling", "-p", + help="Whether to run profiling on top of just timing", + action="store_true") + parser.add_argument("--num-in-profile", "--np", + help="The number of lines in the profile.", + type=int, default=20) + parser.add_argument("--startup", "-s", + help="Whether to use the startup as the start time. " + "This is useful when trying to include import times " + "as well - i.e real user experience", + action="store_true") + parser.add_argument("--warmup", "-w", + help="The number of warmup rounds", + type=int, default=1) + args = parser.parse_args() + took_time = perform_work( + setup=["from medcat.cat import CAT", + "import pandas as pd", + f"cat = CAT.load_model_pack('{args.model_pack_path}')", + f"df = pd.read_csv('{args.csv_path}')"], + worker=["for text in df.text:", + " cat.get_entities(text)"], + warmup=args.warmup, + startup=args.startup, + verbose=args.verbose, + profiling=args.do_profiling, + lines_in_profile=args.num_in_profile + ) + print(took_time) + return took_time + + +if __name__ == "__main__": + main() diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_all.py b/medcat-v2/paper/scripts/speed/get_inference_speed_all.py new file mode 100644 index 000000000..25dbf0c4d --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_all.py @@ -0,0 +1,49 @@ +import argparse +from pprint import pprint +import json +import os + +import get_inference_speed +from common4subproc import do_experiment, RunType, RunConfig + + +def main(): + parser = argparse.ArgumentParser( + "get_inference_speed_all" + ) + parser.add_argument("model_pack_path", + help="Model pack path", + type=str) + parser.add_argument("csv_path", + 
help="Path to the csv with (at least a) 'text' field", + type=str) + parser.add_argument("--repeats", + help="Number of repeats to use", + type=int, default=20) + parser.add_argument("--save-json", "-j", + help="The json path to save the results to", + type=str, default=None) + args = parser.parse_args() + target_script = os.path.join( + os.path.dirname(__file__), get_inference_speed.__name__ + ".py") + results = do_experiment( + target_script, + [args.model_pack_path, args.csv_path], + run_type_map={ + RunType.COLD: ["-w", "0"], + RunType.WARM: ["-w", "1"], + }, + cnf=RunConfig(repeats=args.repeats,)) + dumped = {run_type: model.model_dump() + for run_type, model in results.items()} + if args.save_json: + print("Saving to", args.save_json) + with open(args.save_json, 'w') as f: + json.dump(dumped, f) + else: + print("Overall:") + pprint(dumped) + + +if __name__ == "__main__": + main() From fc7b0657547cedc4b1cc38089f58a049ca071ecb Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 22:50:18 +0000 Subject: [PATCH 022/111] Fix serialisation issue --- medcat-v2/paper/scripts/speed/get_inference_speed_all.py | 2 +- medcat-v2/paper/scripts/speed/get_load_speed_all.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_all.py b/medcat-v2/paper/scripts/speed/get_inference_speed_all.py index 25dbf0c4d..bc78a99de 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_all.py +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_all.py @@ -34,7 +34,7 @@ def main(): RunType.WARM: ["-w", "1"], }, cnf=RunConfig(repeats=args.repeats,)) - dumped = {run_type: model.model_dump() + dumped = {run_type.name: model.model_dump() for run_type, model in results.items()} if args.save_json: print("Saving to", args.save_json) diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_all.py b/medcat-v2/paper/scripts/speed/get_load_speed_all.py index 4abb5a17f..f32656a0f 100644 --- 
a/medcat-v2/paper/scripts/speed/get_load_speed_all.py +++ b/medcat-v2/paper/scripts/speed/get_load_speed_all.py @@ -32,7 +32,7 @@ def main(): RunType.WARM: [], }, cnf=RunConfig(repeats=args.repeats,)) - dumped = {run_type: model.model_dump() + dumped = {run_type.name: model.model_dump() for run_type, model in results.items()} if args.save_json: print("Saving to", args.save_json) From 21ecc3d6e51a939b4a17be1eb35cbd21eb9af21f Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 23:25:17 +0000 Subject: [PATCH 023/111] Add overall inference speed getter --- .../speed/get_inference_speed_for_multiple.sh | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh new file mode 100644 index 000000000..10215987f --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +SAVE_PREFIX=$1 +shift 1 + +# --- Input Validation --- +if (( $# == 0 )); then + echo "Usage: $0 ..." + exit 0 +fi + +if (( $# % 3 != 0 )); then + echo "Error: Arguments must be provided in triples (name, model path, and CSV path)." >&2 + exit 1 +fi + +echo "Starting pairwise argument processing..." +echo "-----------------------------------------" + +# The 'while' loop continues as long as there are arguments left ($# is non-zero) +while (( "$#" )); do + MODEL_NAME="$1" + MODEL_PATH="$2" + CSV_PATH="$3" + + echo "Model: '$MODEL_NAME' with CSV '$CSV_PATH'" + + SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" + echo "Will save to" $SAVE_PATH + + python scripts/speed/get_inference_speed_all.py $MODEL_PATH $CSV_PATH --save-json $SAVE_PATH + + echo "---" + + # Shift discards the first N arguments. 
+ # We discard the two arguments we just processed ($1 and $2) + shift 3 +done + +echo "-----------------------------------------" +echo "Processing complete." \ No newline at end of file From 48cf5f3d948a08500e2346f21b355c8dcaff74f3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 23:26:06 +0000 Subject: [PATCH 024/111] Add setup-specific scripts for inference speed --- .../speed/get_inference_speed_for_multiple_v1.sh | 15 +++++++++++++++ .../speed/get_inference_speed_for_multiple_v2.sh | 15 +++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh create mode 100644 medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh new file mode 100644 index 000000000..97c86fb70 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -0,0 +1,15 @@ +echo "Regular NER / 2023 model" +ner1="2023 NER (no MetaCAT)" +ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +ner2="2023 NER (w MetaCAT)" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_494c3717f637bb89.zip" +csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" + +out_prefix="out/inference_speed/v1" +if [ -z "$1" ] + then + out_prefix=$1 + echo "Overwriting out prefix with: "$1 +fi + +bash scripts/speed/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh new file mode 100644 index 000000000..73ddeadc6 --- /dev/null +++ 
b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -0,0 +1,15 @@ +echo "Regular NER / 2023 model" +ner1="2023 NER (no MetaCAT)" +ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +ner2="2023 NER (w MetaCAT)" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" +csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" + +out_prefix="out/inference_speed/v2" +if [ -z "$1" ] + then + out_prefix=$1 + echo "Overwriting out prefix with: "$1 +fi + +bash scripts/speed/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path From a9bd75f385f0249efc8b54536dc342b28e25aa22 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 7 Nov 2025 23:58:54 +0000 Subject: [PATCH 025/111] Allow scripts to actually run --- .../paper/scripts/speed/run_all_speed_scripts_for_version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh index b167db2a8..4eb6c6bde 100644 --- a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh +++ b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh @@ -8,5 +8,5 @@ for fn in `ls scripts/speed/*_v1.sh`; do echo "Running script:" echo $fn - # bash $fn + bash $fn done From b5283a2054f28b1ab9d156cb3c39cc12eec22c41 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 8 Nov 2025 00:02:51 +0000 Subject: [PATCH 026/111] Fix a small issue (running load speed instead of inference speed) --- .../paper/scripts/speed/get_inference_speed_for_multiple_v1.sh | 2 +- .../paper/scripts/speed/get_inference_speed_for_multiple_v2.sh | 2 +- medcat-v2/paper/scripts/speed/run_speed_v1.sh | 0 medcat-v2/paper/scripts/speed/run_speed_v2.sh | 0 4 files 
changed, 2 insertions(+), 2 deletions(-) create mode 100644 medcat-v2/paper/scripts/speed/run_speed_v1.sh create mode 100644 medcat-v2/paper/scripts/speed/run_speed_v2.sh diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh index 97c86fb70..f5210c8f4 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -12,4 +12,4 @@ if [ -z "$1" ] echo "Overwriting out prefix with: "$1 fi -bash scripts/speed/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path +bash scripts/speed/get_inference_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh index 73ddeadc6..806086171 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -12,4 +12,4 @@ if [ -z "$1" ] echo "Overwriting out prefix with: "$1 fi -bash scripts/speed/get_load_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path +bash scripts/speed/get_inference_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path diff --git a/medcat-v2/paper/scripts/speed/run_speed_v1.sh b/medcat-v2/paper/scripts/speed/run_speed_v1.sh new file mode 100644 index 000000000..e69de29bb diff --git a/medcat-v2/paper/scripts/speed/run_speed_v2.sh b/medcat-v2/paper/scripts/speed/run_speed_v2.sh new file mode 100644 index 000000000..e69de29bb From 340f4fe9d211979ea51ea445edc218c2967f3f00 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 8 Nov 
2025 00:04:32 +0000 Subject: [PATCH 027/111] Fix some argument issues in bash scripts --- .../paper/scripts/speed/get_inference_speed_for_multiple_v1.sh | 2 +- .../paper/scripts/speed/get_inference_speed_for_multiple_v2.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh index f5210c8f4..2a1c67f2c 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -12,4 +12,4 @@ if [ -z "$1" ] echo "Overwriting out prefix with: "$1 fi -bash scripts/speed/get_inference_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path +bash scripts/speed/get_inference_speed_for_multiple.sh "$out_prefix" "$ner1" "$ner_model_path_no_mc" "$csv_path" "$ner2" "$ner_model_path_w_mc" "$csv_path" diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh index 806086171..7869f4683 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -12,4 +12,4 @@ if [ -z "$1" ] echo "Overwriting out prefix with: "$1 fi -bash scripts/speed/get_inference_speed_for_multiple.sh $out_prefix "$ner1" "$ner_model_path_no_mc" $csv_path "$ner2" "$ner_model_path_w_mc" $csv_path +bash scripts/speed/get_inference_speed_for_multiple.sh "$out_prefix" "$ner1" "$ner_model_path_no_mc" "$csv_path" "$ner2" "$ner_model_path_w_mc" "$csv_path" From b96a8799aa03926aa1f110a677f00d5efb7bf0be Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 8 Nov 2025 00:06:24 +0000 Subject: [PATCH 028/111] Make names file-name safe --- .../scripts/speed/get_inference_speed_for_multiple_v1.sh | 4 ++-- 
.../scripts/speed/get_inference_speed_for_multiple_v2.sh | 4 ++-- .../paper/scripts/speed/get_load_speed_for_multiple_v1.sh | 6 +++--- .../paper/scripts/speed/get_load_speed_for_multiple_v2.sh | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh index 2a1c67f2c..48ad4e027 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -1,7 +1,7 @@ echo "Regular NER / 2023 model" -ner1="2023 NER (no MetaCAT)" +ner1="2023_NER_no MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" -ner2="2023 NER (w MetaCAT)" +ner2="2023_NER_w_MetaCAT" ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_494c3717f637bb89.zip" csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh index 7869f4683..ed9e06125 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -1,7 +1,7 @@ echo "Regular NER / 2023 model" -ner1="2023 NER (no MetaCAT)" +ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" -ner2="2023 NER (w MetaCAT)" +ner2="2023_NER_w_MetaCAT" ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh 
b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh index 39d7f4df6..cbfcbdfd4 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh @@ -1,9 +1,9 @@ echo "Regular NER / 2023 model" -ner1="2023 NER (no MetaCAT)" +ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" -ner2="2023 NER (w MetaCAT)" +ner2="2023_NER_w_MetaCAT" ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_494c3717f637bb89.zip" -deid="n2c2 DeID" +deid="n2c2_DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/deid_medcat_n2c2_modelpack.zip" out_prefix="out/load_speed/v1" diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh index 90734e643..8e1906b78 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh @@ -1,9 +1,9 @@ echo "Regular NER / 2023 model" -ner1="2023 NER (no MetaCAT)" +ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" -ner2="2023 NER (w MetaCAT)" +ner2="2023_NER_w_MetaCAT" ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" -deid="n2c2 DeID" +deid="n2c2_DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_deid_model_af31d2a9c5ccbe4d.zip.zip" out_prefix="out/load_speed/v2" From 546a7d8105fd353f3b657e6307be58ad1044fdff Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 8 Nov 2025 00:11:14 +0000 Subject: [PATCH 029/111] Change divider type between scripts --- 
.../paper/scripts/speed/run_all_speed_scripts_for_version.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh index 4eb6c6bde..dd23876c4 100644 --- a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh +++ b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh @@ -6,6 +6,7 @@ python -m pip show medcat | grep "Version" for fn in `ls scripts/speed/*_v1.sh`; do + echo "__________________________" echo "Running script:" echo $fn bash $fn From 25a1fbc407a92164fa85a53b0ac30d3fadb7f936 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 8 Nov 2025 00:14:30 +0000 Subject: [PATCH 030/111] Fix a bash script logic issue --- .../scripts/speed/get_inference_speed_for_multiple_v1.sh | 4 ++-- .../scripts/speed/get_inference_speed_for_multiple_v2.sh | 2 +- .../paper/scripts/speed/get_load_speed_for_multiple_v1.sh | 2 +- .../paper/scripts/speed/get_load_speed_for_multiple_v2.sh | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh index 48ad4e027..3529d5cb1 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -6,10 +6,10 @@ ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20 csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" out_prefix="out/inference_speed/v1" -if [ -z "$1" ] +if [[ ! 
-z "$1" ]] then out_prefix=$1 - echo "Overwriting out prefix with: "$1 + echo "Overwriting out prefix with: '"$1"'" fi bash scripts/speed/get_inference_speed_for_multiple.sh "$out_prefix" "$ner1" "$ner_model_path_no_mc" "$csv_path" "$ner2" "$ner_model_path_w_mc" "$csv_path" diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh index ed9e06125..14dd08386 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -6,7 +6,7 @@ ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/med csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" out_prefix="out/inference_speed/v2" -if [ -z "$1" ] +if [[ ! -z "$1" ]] then out_prefix=$1 echo "Overwriting out prefix with: "$1 diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh index cbfcbdfd4..760f06412 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh @@ -7,7 +7,7 @@ deid="n2c2_DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/deid_medcat_n2c2_modelpack.zip" out_prefix="out/load_speed/v1" -if [ -z "$1" ] +if [[ ! 
-z "$1" ]] then out_prefix=$1 echo "Overwriting out prefix with: "$1 diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh index 8e1906b78..aed2e8e8e 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh @@ -7,7 +7,7 @@ deid="n2c2_DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_deid_model_af31d2a9c5ccbe4d.zip.zip" out_prefix="out/load_speed/v2" -if [ -z "$1" ] +if [[ ! -z "$1" ]] then out_prefix=$1 echo "Overwriting out prefix with: "$1 From 9f6e087ae8d92c60bb74909c07bfe171bfa07165 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sat, 8 Nov 2025 22:22:10 +0000 Subject: [PATCH 031/111] Read output from last line --- medcat-v2/paper/scripts/speed/common4subproc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/common4subproc.py b/medcat-v2/paper/scripts/speed/common4subproc.py index fe4031d99..0edacb80e 100644 --- a/medcat-v2/paper/scripts/speed/common4subproc.py +++ b/medcat-v2/paper/scripts/speed/common4subproc.py @@ -47,7 +47,7 @@ def _single_experiment(target_script: str, for _ in range(cnf.repeats): run_out = subprocess.run(sys_argv, capture_output=True) try: - took_time = float(run_out.stdout) + took_time = float(run_out.stdout.split(b"\n")[-1]) except ValueError as err: raise ValueError( f"Unable to get run time from for {run_type}:\n" From 7cf0c40efe4bc67fd2f3dce5acfc9b16d27874b2 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sun, 9 Nov 2025 12:53:36 +0000 Subject: [PATCH 032/111] Fix issue with errorenoushly newlines --- medcat-v2/paper/scripts/speed/common4subproc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/common4subproc.py b/medcat-v2/paper/scripts/speed/common4subproc.py index 0edacb80e..5a919650a 100644 --- 
a/medcat-v2/paper/scripts/speed/common4subproc.py +++ b/medcat-v2/paper/scripts/speed/common4subproc.py @@ -47,7 +47,7 @@ def _single_experiment(target_script: str, for _ in range(cnf.repeats): run_out = subprocess.run(sys_argv, capture_output=True) try: - took_time = float(run_out.stdout.split(b"\n")[-1]) + took_time = float(run_out.stdout.strip().split(b"\n")[-1]) except ValueError as err: raise ValueError( f"Unable to get run time from for {run_type}:\n" From 3511a050788f24cedc8e6972b9e627d355531b50 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Nov 2025 09:16:11 +0000 Subject: [PATCH 033/111] Improve output for getting time from stdout --- medcat-v2/paper/scripts/speed/common4subproc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/common4subproc.py b/medcat-v2/paper/scripts/speed/common4subproc.py index 5a919650a..9621d22ca 100644 --- a/medcat-v2/paper/scripts/speed/common4subproc.py +++ b/medcat-v2/paper/scripts/speed/common4subproc.py @@ -46,12 +46,14 @@ def _single_experiment(target_script: str, all_took: list[float] = [] for _ in range(cnf.repeats): run_out = subprocess.run(sys_argv, capture_output=True) + raw_time_str = run_out.stdout.strip().split(b"\n")[-1] try: - took_time = float(run_out.stdout.strip().split(b"\n")[-1]) + took_time = float(raw_time_str) except ValueError as err: raise ValueError( - f"Unable to get run time from for {run_type}:\n" - f"'{run_out.stdout.decode()}'\n" + f"Unable to get run time for {run_type} from:\n" + f"'{raw_time_str}'\n" + f"Total output:\n{run_out.stdout.decode()}\n" f"\nError output was:\n" f"{run_out.stderr.decode()}\n" f"\nWas running the command:\n {' '.join(sys_argv)}" From ab9076973ce40918692eb9e1942798f274654baf Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Nov 2025 09:39:59 +0000 Subject: [PATCH 034/111] Add some more output when doing inference speed --- .../paper/scripts/speed/get_inference_speed_for_multiple.sh | 6 ++++-- 1 file changed, 
4 insertions(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh index 10215987f..577a5a894 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh @@ -14,7 +14,7 @@ if (( $# % 3 != 0 )); then exit 1 fi -echo "Starting pairwise argument processing..." +echo "Starting triplet argument processing..." echo "-----------------------------------------" # The 'while' loop continues as long as there are arguments left ($# is non-zero) @@ -28,7 +28,9 @@ while (( "$#" )); do SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" echo "Will save to" $SAVE_PATH - python scripts/speed/get_inference_speed_all.py $MODEL_PATH $CSV_PATH --save-json $SAVE_PATH + FULL_TARGET="scripts/speed/get_inference_speed_all.py $MODEL_PATH $CSV_PATH --save-json $SAVE_PATH" + echo "Running: python $FULL_TARGET" + python $FULL_TARGET echo "---" From 2abeec8b8a4f58eebb4e63c14b621143d29feb1f Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Nov 2025 11:00:15 +0000 Subject: [PATCH 035/111] Fix some comment --- .../paper/scripts/speed/get_inference_speed_for_multiple.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh index 577a5a894..9ccc0437a 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh @@ -35,7 +35,7 @@ while (( "$#" )); do echo "---" # Shift discards the first N arguments. 
- # We discard the two arguments we just processed ($1 and $2) + # We discard the thre arguments we just processed ($1, $2, and $3) shift 3 done From 7676ced98177c562b92e12a5d2314a980f83546c Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Nov 2025 11:29:26 +0000 Subject: [PATCH 036/111] Some whitespace changes --- .../paper/scripts/speed/get_inference_speed_for_multiple.sh | 2 +- medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh index 9ccc0437a..3c6cbaafa 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple.sh @@ -27,7 +27,7 @@ while (( "$#" )); do SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" echo "Will save to" $SAVE_PATH - + FULL_TARGET="scripts/speed/get_inference_speed_all.py $MODEL_PATH $CSV_PATH --save-json $SAVE_PATH" echo "Running: python $FULL_TARGET" python $FULL_TARGET diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh index d0f13ee81..9748e12ee 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple.sh @@ -26,7 +26,7 @@ while (( "$#" )); do SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" echo "Will save to" $SAVE_PATH - + python scripts/speed/get_load_speed_all.py $MODEL_PATH --save-json $SAVE_PATH echo "---" From 4aeb85952fa6910f40898907ddbac38c3c0b34f6 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 10 Nov 2025 16:28:27 +0000 Subject: [PATCH 037/111] Fix typo --- .../paper/scripts/speed/get_inference_speed_for_multiple_v1.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh 
b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh index 3529d5cb1..096a690f3 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -1,5 +1,5 @@ echo "Regular NER / 2023 model" -ner1="2023_NER_no MetaCAT" +ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" ner2="2023_NER_w_MetaCAT" ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_494c3717f637bb89.zip" From 636c4ccda68903b9aa7f9133377301b2cb27d134 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 11 Nov 2025 09:31:09 +0000 Subject: [PATCH 038/111] Remove unneeded output --- .../paper/scripts/speed/get_inference_speed_for_multiple_v1.sh | 1 - .../paper/scripts/speed/get_inference_speed_for_multiple_v2.sh | 1 - medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh | 1 - medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh | 1 - 4 files changed, 4 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh index 096a690f3..a2b1553b7 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v1.sh @@ -1,4 +1,3 @@ -echo "Regular NER / 2023 model" ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" ner2="2023_NER_w_MetaCAT" diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh index 14dd08386..28d80a053 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh +++ 
b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -1,4 +1,3 @@ -echo "Regular NER / 2023 model" ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" ner2="2023_NER_w_MetaCAT" diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh index 760f06412..21ca257bc 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v1.sh @@ -1,4 +1,3 @@ -echo "Regular NER / 2023 model" ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" ner2="2023_NER_w_MetaCAT" diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh index aed2e8e8e..a1dfa8c6b 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh @@ -1,4 +1,3 @@ -echo "Regular NER / 2023 model" ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" ner2="2023_NER_w_MetaCAT" From f218e7f0c28eb4618f2660d1a3c0de8aec96c0a4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 11 Nov 2025 09:33:11 +0000 Subject: [PATCH 039/111] Fix script running for specific version --- .../paper/scripts/speed/run_all_speed_scripts_for_version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh index dd23876c4..6630db8a4 100644 --- a/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh +++ 
b/medcat-v2/paper/scripts/speed/run_all_speed_scripts_for_version.sh @@ -4,7 +4,7 @@ python --version python -m pip show medcat | grep "Version" -for fn in `ls scripts/speed/*_v1.sh`; +for fn in `ls scripts/speed/*_$ver.sh`; do echo "__________________________" echo "Running script:" From 7341c420b4b7bf0a9be83f4e71ca688976a383a7 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 13:51:38 +0000 Subject: [PATCH 040/111] Remove unused empty method --- medcat-v2/paper/scripts/speed/get_inference_speed.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed.py b/medcat-v2/paper/scripts/speed/get_inference_speed.py index ca492b27d..f27fd6f11 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed.py +++ b/medcat-v2/paper/scripts/speed/get_inference_speed.py @@ -8,11 +8,6 @@ class InferenceSpeedConfig(BaseModel): inference_file_path: str -def get_speed(): - - pass - - def main(): parser = argparse.ArgumentParser( "get_inference_speed.py" From fecf5ab17747dcce8738fc3b6ae8efd389b282df Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 13:58:03 +0000 Subject: [PATCH 041/111] Add script to get unsupervised training speed as well --- .../scripts/speed/get_unsup_train_speed.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/get_unsup_train_speed.py diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py new file mode 100644 index 000000000..dd13da2a2 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py @@ -0,0 +1,64 @@ +from common import perform_work +from pydantic import BaseModel +import argparse +import importlib.metadata + + +mct_ver = importlib.metadata.distribution("medcat").version + + +class InferenceSpeedConfig(BaseModel): + model_pack_path: str + inference_file_path: str + + +def main(): + parser = argparse.ArgumentParser( + 
"get_inference_speed.py" + ) + parser.add_argument("model_pack_path", + help="The path to the model pack", + type=str) + parser.add_argument("csv_path", + help="Path to the csv with (at least a) 'text' field", + type=str) + parser.add_argument("--verbose", "-v", + help="Whether to run in verbose mode", + action="store_true") + parser.add_argument("--do-profiling", "-p", + help="Whether to run profiling on top of just timing", + action="store_true") + parser.add_argument("--num-in-profile", "--np", + help="The number of lines in the profile.", + type=int, default=20) + parser.add_argument("--startup", "-s", + help="Whether to use the startup as the start time. " + "This is useful when trying to include import times " + "as well - i.e real user experience", + action="store_true") + parser.add_argument("--warmup", "-w", + help="The number of warmup rounds", + type=int, default=1) + args = parser.parse_args() + if mct_ver.startswith("1."): + work_string = "cat.train(df.text)" + elif mct_ver.startswith("2."): + work_string = "cat.trainer.train_unsupervised(df.text)" + took_time = perform_work( + setup=["from medcat.cat import CAT", + "import pandas as pd", + f"cat = CAT.load_model_pack('{args.model_pack_path}')", + f"df = pd.read_csv('{args.csv_path}')"], + worker=[work_string], + warmup=args.warmup, + startup=args.startup, + verbose=args.verbose, + profiling=args.do_profiling, + lines_in_profile=args.num_in_profile + ) + print(took_time) + return took_time + + +if __name__ == "__main__": + main() From 42e888ab5449fe17cc95f349c8fa1e9a60a0b636 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 14:00:12 +0000 Subject: [PATCH 042/111] Add script to summarise output --- .../paper/scripts/speed/summarise_speeds.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/summarise_speeds.py diff --git a/medcat-v2/paper/scripts/speed/summarise_speeds.py b/medcat-v2/paper/scripts/speed/summarise_speeds.py new file mode 
100644 index 000000000..43c5606d8 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/summarise_speeds.py @@ -0,0 +1,52 @@ +import json +import sys +import os +import pandas as pd +import re + + +VERSION_MODEL_PATTERN = re.compile(r"(v\d)_(.*).json") +FOLDER_NAME_PATTERN = re.compile(r"(.*)_speed") + + +def extract_test_version_and_model(path: str) -> tuple[str, str, str]: + dirname = os.path.basename(os.path.dirname(path)) + fnmatch = FOLDER_NAME_PATTERN.match(dirname) + if not fnmatch: + raise ValueError(f"Folder name unrecognsied: {dirname}") + basename = os.path.basename(path) + match = VERSION_MODEL_PATTERN.match(basename) + if not match: + raise ValueError(f"Basename did not match: {basename}") + return fnmatch.group(1), match.group(1), match.group(2) + + +def gather_data(json_paths: list[str], + header=[ + "Check Type", "Version", "Model", "Warm status", + "Mean time", "# of repeats"] + ) -> pd.DataFrame: + dfs: list[pd.DataFrame] = [] + for path in json_paths: + speed_type, version, model = extract_test_version_and_model(path) + with open(path) as f: + cur_data = json.load(f) + print("KEYS", cur_data.keys()) + col1 = list(cur_data.keys()) + mean = [cur_data[cc]['mean'] for cc in col1] + experiments = [len(cur_data[cc]['all_times']) for cc in col1] + vals = [speed_type, version, model, col1, mean, experiments] + dfs.append(pd.DataFrame({col: val for col, val in zip(header, vals)})) + df = pd.concat(dfs) + df.sort_values(by=["Check Type", "Model", "Warm status"], inplace=True) + df.reset_index(inplace=True) + return df + + +def main(*file_paths: str): + df = gather_data(file_paths) + print(df.to_string()) + + +if __name__ == "__main__": + main(*sys.argv[1:]) From b5e611cb81530ee929867549626044617851ff5e Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 14:00:34 +0000 Subject: [PATCH 043/111] Add script to combine all unsuperivsed training output for a particula model --- .../speed/get_unsup_train_speed_all.py | 49 +++++++++++++++++++ 1 file changed, 
49 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/get_unsup_train_speed_all.py diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed_all.py b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_all.py new file mode 100644 index 000000000..124b22f6b --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_all.py @@ -0,0 +1,49 @@ +import argparse +from pprint import pprint +import json +import os + +import get_unsup_train_speed +from common4subproc import do_experiment, RunType, RunConfig + + +def main(): + parser = argparse.ArgumentParser( + "get_unsup_train_speed_all" + ) + parser.add_argument("model_pack_path", + help="Model pack path", + type=str) + parser.add_argument("csv_path", + help="Path to the csv with (at least a) 'text' field", + type=str) + parser.add_argument("--repeats", + help="Number of repeats to use", + type=int, default=20) + parser.add_argument("--save-json", "-j", + help="The json path to save the results to", + type=str, default=None) + args = parser.parse_args() + target_script = os.path.join( + os.path.dirname(__file__), get_unsup_train_speed.__name__ + ".py") + results = do_experiment( + target_script, + [args.model_pack_path, args.csv_path], + run_type_map={ + RunType.COLD: ["-w", "0"], + RunType.WARM: ["-w", "1"], + }, + cnf=RunConfig(repeats=args.repeats,)) + dumped = {run_type.name: model.model_dump() + for run_type, model in results.items()} + if args.save_json: + print("Saving to", args.save_json) + with open(args.save_json, 'w') as f: + json.dump(dumped, f) + else: + print("Overall:") + pprint(dumped) + + +if __name__ == "__main__": + main() From f49dab3d5de393b3795169c2db684164cccd1e3b Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 14:18:48 +0000 Subject: [PATCH 044/111] Add scripts to get unsupervised speed overall --- .../get_unsup_train_speed_for_multiple.sh | 43 +++++++++++++++++++ .../get_unsup_train_speed_for_multiple_v1.sh | 14 ++++++ 
.../get_unsup_train_speed_for_multiple_v2.sh | 14 ++++++ 3 files changed, 71 insertions(+) create mode 100644 medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple.sh create mode 100644 medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v1.sh create mode 100644 medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple.sh b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple.sh new file mode 100644 index 000000000..10f895b7e --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +SAVE_PREFIX=$1 +shift 1 + +# --- Input Validation --- +if (( $# == 0 )); then + echo "Usage: $0 ..." + exit 0 +fi + +if (( $# % 3 != 0 )); then + echo "Error: Arguments must be provided in triples (name, model path, and CSV path)." >&2 + exit 1 +fi + +echo "Starting triplet argument processing..." +echo "-----------------------------------------" + +# The 'while' loop continues as long as there are arguments left ($# is non-zero) +while (( "$#" )); do + MODEL_NAME="$1" + MODEL_PATH="$2" + CSV_PATH="$3" + + echo "Model: '$MODEL_NAME' with CSV '$CSV_PATH'" + + SAVE_PATH=$SAVE_PREFIX"_"$MODEL_NAME".json" + echo "Will save to" $SAVE_PATH + + FULL_TARGET="scripts/speed/get_unsup_train_speed_all.py $MODEL_PATH $CSV_PATH --save-json $SAVE_PATH" + echo "Running: python $FULL_TARGET" + python $FULL_TARGET + + echo "---" + + # Shift discards the first N arguments. + # We discard the thre arguments we just processed ($1, $2, and $3) + shift 3 +done + +echo "-----------------------------------------" +echo "Processing complete." 
\ No newline at end of file diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v1.sh b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v1.sh new file mode 100644 index 000000000..8dd0a480f --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v1.sh @@ -0,0 +1,14 @@ +ner1="2023_NER_no_MetaCAT" +ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +ner2="2023_NER_w_MetaCAT" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_494c3717f637bb89.zip" +csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" + +out_prefix="out/unsup_train_speed/v1" +if [[ ! -z "$1" ]] + then + out_prefix=$1 + echo "Overwriting out prefix with: '"$1"'" +fi + +bash scripts/speed/get_unsup_train_speed_for_multiple.sh "$out_prefix" "$ner1" "$ner_model_path_no_mc" "$csv_path" "$ner2" "$ner_model_path_w_mc" "$csv_path" diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh new file mode 100644 index 000000000..30db39dc2 --- /dev/null +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh @@ -0,0 +1,14 @@ +ner1="2023_NER_no_MetaCAT" +ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +ner2="2023_NER_w_MetaCAT" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" +csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" + +out_prefix="out/unsup_train_speed/v2" +if [[ ! 
-z "$1" ]] + then + out_prefix=$1 + echo "Overwriting out prefix with: '"$1"'" +fi + +bash scripts/speed/get_unsup_train_speed_for_multiple.sh "$out_prefix" "$ner1" "$ner_model_path_no_mc" "$csv_path" "$ner2" "$ner_model_path_w_mc" "$csv_path" From c3d8672ebb4c32c193564cf44c36e8edd909515c Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 14:37:47 +0000 Subject: [PATCH 045/111] Add folder for inference and unsupervised training output --- medcat-v2/paper/out/inference_speed/.keep | 0 medcat-v2/paper/out/unsup_train_speed/.keep | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 medcat-v2/paper/out/inference_speed/.keep create mode 100644 medcat-v2/paper/out/unsup_train_speed/.keep diff --git a/medcat-v2/paper/out/inference_speed/.keep b/medcat-v2/paper/out/inference_speed/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/medcat-v2/paper/out/unsup_train_speed/.keep b/medcat-v2/paper/out/unsup_train_speed/.keep new file mode 100644 index 000000000..e69de29bb From e70a9db4fdbec5197267fad257bf7d9d7f7699f1 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 12 Nov 2025 16:27:16 +0000 Subject: [PATCH 046/111] Removed empty / old files --- medcat-v2/paper/scripts/speed/run_speed_v1.sh | 0 medcat-v2/paper/scripts/speed/run_speed_v2.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 medcat-v2/paper/scripts/speed/run_speed_v1.sh delete mode 100644 medcat-v2/paper/scripts/speed/run_speed_v2.sh diff --git a/medcat-v2/paper/scripts/speed/run_speed_v1.sh b/medcat-v2/paper/scripts/speed/run_speed_v1.sh deleted file mode 100644 index e69de29bb..000000000 diff --git a/medcat-v2/paper/scripts/speed/run_speed_v2.sh b/medcat-v2/paper/scripts/speed/run_speed_v2.sh deleted file mode 100644 index e69de29bb..000000000 From de88018ff433a9dc5cf77f63d237e46108c28e2a Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 13 Nov 2025 11:06:33 +0000 Subject: [PATCH 047/111] Improve / fix profiling --- 
medcat-v2/paper/scripts/speed/common.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/common.py b/medcat-v2/paper/scripts/speed/common.py index bb710b436..4c24eeb9f 100644 --- a/medcat-v2/paper/scripts/speed/common.py +++ b/medcat-v2/paper/scripts/speed/common.py @@ -4,6 +4,7 @@ import cProfile import pstats import io +import time logger = logging.getLogger(__name__) @@ -24,7 +25,7 @@ def show_profile(do_profiling: bool, lines_in_profile: int): profile.enable() - yield + yield [] if do_profiling: profile.disable() @@ -63,14 +64,22 @@ def perform_work(setup: list[str], logger.warning("For startup, will include warmup in timed work") worker = setup + worker setup = [] - with show_profile(do_profiling=profiling, - lines_in_profile=lines_in_profile): - timed = timeit.repeat( + if profiling: + # NOTE: do it manually so I can profile only the worker part + exec("\n".join(setup)) + start_time = time.perf_counter() + with show_profile( + do_profiling=True, + lines_in_profile=lines_in_profile): + exec("\n".join(worker)) + times = [time.perf_counter() - start_time] + else: + times = timeit.repeat( "\n".join(worker), setup="\n".join(setup), repeat=1, number=1 ) - took_time = timed[0] + took_time = times[0] logger.info("Took a total of %ss", took_time) # NOTE: print for any time output # NOTE: no units for easy automation From 33705622ae9a13ee037bd18c7f2ba81d50079298 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 13 Nov 2025 11:40:30 +0000 Subject: [PATCH 048/111] Move version specified to common module --- medcat-v2/paper/scripts/speed/common.py | 4 ++++ medcat-v2/paper/scripts/speed/get_unsup_train_speed.py | 6 +----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/common.py b/medcat-v2/paper/scripts/speed/common.py index 4c24eeb9f..01eff6794 100644 --- a/medcat-v2/paper/scripts/speed/common.py +++ b/medcat-v2/paper/scripts/speed/common.py @@ -5,6 +5,10 @@ 
import pstats import io import time +import importlib.metadata + + +mct_ver = importlib.metadata.distribution("medcat").version logger = logging.getLogger(__name__) diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py index dd13da2a2..cafb131e4 100644 --- a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py @@ -1,10 +1,6 @@ -from common import perform_work +from common import perform_work, mct_ver from pydantic import BaseModel import argparse -import importlib.metadata - - -mct_ver = importlib.metadata.distribution("medcat").version class InferenceSpeedConfig(BaseModel): From 9f626c717cd6ae3e7b13d3e99ff3e15b536fc978 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 13 Nov 2025 11:41:18 +0000 Subject: [PATCH 049/111] Reset subanmes after model load if v2 --- medcat-v2/paper/scripts/speed/get_inference_speed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed.py b/medcat-v2/paper/scripts/speed/get_inference_speed.py index f27fd6f11..1a0840e6c 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed.py +++ b/medcat-v2/paper/scripts/speed/get_inference_speed.py @@ -1,4 +1,4 @@ -from common import perform_work +from common import perform_work, mct_ver from pydantic import BaseModel import argparse @@ -40,6 +40,7 @@ def main(): setup=["from medcat.cat import CAT", "import pandas as pd", f"cat = CAT.load_model_pack('{args.model_pack_path}')", + "cat.cdb.has_subname('abc')" if mct_ver.startwith("2") else "", f"df = pd.read_csv('{args.csv_path}')"], worker=["for text in df.text:", " cat.get_entities(text)"], From 58ed8933b0a1bff6c5d12ce35db7f48356200547 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 13 Nov 2025 11:41:49 +0000 Subject: [PATCH 050/111] Fix typo --- medcat-v2/paper/scripts/speed/get_inference_speed.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed.py b/medcat-v2/paper/scripts/speed/get_inference_speed.py index 1a0840e6c..e3983994d 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed.py +++ b/medcat-v2/paper/scripts/speed/get_inference_speed.py @@ -40,7 +40,7 @@ def main(): setup=["from medcat.cat import CAT", "import pandas as pd", f"cat = CAT.load_model_pack('{args.model_pack_path}')", - "cat.cdb.has_subname('abc')" if mct_ver.startwith("2") else "", + "cat.cdb.has_subname('abc')" if mct_ver.startswith("2") else "", f"df = pd.read_csv('{args.csv_path}')"], worker=["for text in df.text:", " cat.get_entities(text)"], From 17c85a80ce79776f3385d3f6f0a28a6135068c77 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 13 Nov 2025 11:44:08 +0000 Subject: [PATCH 051/111] Add subname reset when doing unsupervised training speed --- medcat-v2/paper/scripts/speed/get_unsup_train_speed.py | 1 + 1 file changed, 1 insertion(+) diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py index cafb131e4..48606856a 100644 --- a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py @@ -44,6 +44,7 @@ def main(): setup=["from medcat.cat import CAT", "import pandas as pd", f"cat = CAT.load_model_pack('{args.model_pack_path}')", + "cat.cdb.has_subname('abc')" if mct_ver.startswith("2") else "", f"df = pd.read_csv('{args.csv_path}')"], worker=[work_string], warmup=args.warmup, From 5b1c82d689382419543c7170113e8a62f6e2266c Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 13 Nov 2025 11:45:16 +0000 Subject: [PATCH 052/111] Add some minor comments --- medcat-v2/paper/scripts/speed/get_inference_speed.py | 2 ++ medcat-v2/paper/scripts/speed/get_unsup_train_speed.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed.py b/medcat-v2/paper/scripts/speed/get_inference_speed.py index 
e3983994d..f1fcf1a66 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed.py +++ b/medcat-v2/paper/scripts/speed/get_inference_speed.py @@ -40,6 +40,8 @@ def main(): setup=["from medcat.cat import CAT", "import pandas as pd", f"cat = CAT.load_model_pack('{args.model_pack_path}')", + # NOTE: this reset subnames - it is only required for models saved + # in v2 pre-beta releases "cat.cdb.has_subname('abc')" if mct_ver.startswith("2") else "", f"df = pd.read_csv('{args.csv_path}')"], worker=["for text in df.text:", diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py index 48606856a..2daa2f24e 100644 --- a/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed.py @@ -44,6 +44,8 @@ def main(): setup=["from medcat.cat import CAT", "import pandas as pd", f"cat = CAT.load_model_pack('{args.model_pack_path}')", + # NOTE: this reset subnames - it is only required for models saved + # in v2 pre-beta releases "cat.cdb.has_subname('abc')" if mct_ver.startswith("2") else "", f"df = pd.read_csv('{args.csv_path}')"], worker=[work_string], From b2eded21737e1ee5145ab73ace528fc9ec542a9f Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 18 Nov 2025 15:04:33 +0000 Subject: [PATCH 053/111] Add initial regression performance script --- .../scripts/performance/regression_perf.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 medcat-v2/paper/scripts/performance/regression_perf.py diff --git a/medcat-v2/paper/scripts/performance/regression_perf.py b/medcat-v2/paper/scripts/performance/regression_perf.py new file mode 100644 index 000000000..e246a0c5b --- /dev/null +++ b/medcat-v2/paper/scripts/performance/regression_perf.py @@ -0,0 +1,115 @@ +from sys import argv +from pathlib import Path +import os +import logging +import re +from pydantic import BaseModel + +from medcat.utils.regression.regression_checker import ( + main 
as regr_main, logger as regr_l) + + +DEFAULT_REGRESSION_SUITE = os.path.join( + *"../tests/resources/default_regression_tests.yml".split("/")) + + +CASES_PATTERN = re.compile( + r"The number of total (successful|failing) \(sub\) cases\s*: (\d+) \( ?(\d+\.\d+)%\)" +) + + +class RegressionOverallResults(BaseModel): + total_cases: int + successful_cases: int + failed_cases: int + + def is_valid(self, + success_percent: float, + fail_percent: float, + tolerance: float = 0.005) -> bool: + got_good = self.successful_cases / self.total_cases + got_bad = self.failed_cases / self.total_cases + return ( + abs(got_good - success_percent) < tolerance and + abs(got_bad - fail_percent) < tolerance) + + def final_comma_sep_out(self) -> str: + return ",".join([str(self.successful_cases/self.total_cases), + str(self.failed_cases/self.total_cases), + str(self.total_cases)]) + + @classmethod + def from_records(cls, records: list[tuple[str, int, float]] + ) -> 'RegressionOverallResults': + if len(records) != 2: + raise ValueError(f"Unbalanced records: {records}") + good, bad = records + if "successful" != good[0] and "successful" in bad[0]: + # NOTE: swapping order - shouldn't be needed though + good, bad = bad, good + good_cases, good_perc = good[1:] + bad_cases, bad_perc = bad[1:] + inst = cls(total_cases=good_cases + bad_cases, + successful_cases=good_cases, + failed_cases=bad_cases) + if not inst.is_valid(good_perc / 100, bad_perc / 100): + raise ValueError(f"Unbalanced totals:\nRecords:\n{records}" + f"\nvs\nOutcome:\n{inst}\n" + f"Expected: {good_perc}% S, {bad_perc}% F\n" + f"Got: {inst.successful_cases / inst.total_cases} S, " + f"{inst.failed_cases / inst.total_cases} F") + return inst + + +class CapturingHandler(logging.Handler): + """ + A custom logging handler that captures formatted messages + in a list instead of outputting them. 
+ """ + def __init__(self, *args, + pattern: re.Pattern = CASES_PATTERN, + **kwargs): + super().__init__(*args, **kwargs) + self.pattern = pattern + self.records: list[tuple[str, int, float]] = [] + + def emit(self, record: logging.LogRecord): + """ + Format the record and append the resulting string to the records list. + """ + # Ensure the record is formatted before storing it + msg = self.format(record) + for line in msg.split("\n"): + match = self.pattern.match(line) + if match: + self.records.append( + (match.group(1), int(match.group(2)), float(match.group(3)))) + + def get_captured_records(self) -> list[str]: + """ + Returns the list of captured formatted log messages. + """ + return self.records + + def get_results(self) -> RegressionOverallResults: + return RegressionOverallResults.from_records(self.records) + + def clear(self): + """ + Clears the list of captured records. + """ + self.records.clear() + + +def main(model_pack_path: str, + regression_suite_path: str = DEFAULT_REGRESSION_SUITE): + regr_l.setLevel(logging.INFO) + handler = CapturingHandler() + regr_l.addHandler(handler) + regr_main(Path(model_pack_path), Path(regression_suite_path)) + results = handler.get_results() + print(results.final_comma_sep_out()) + + +if __name__ == "__main__": + main(*argv[1:]) From 9ab219093dac2c3c5d5de4e5e1292bba61bce217 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 18 Nov 2025 15:05:20 +0000 Subject: [PATCH 054/111] some linting / whitespace fixes --- .../scripts/performance/regression_perf.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/regression_perf.py b/medcat-v2/paper/scripts/performance/regression_perf.py index e246a0c5b..ecc7baac3 100644 --- a/medcat-v2/paper/scripts/performance/regression_perf.py +++ b/medcat-v2/paper/scripts/performance/regression_perf.py @@ -14,7 +14,8 @@ CASES_PATTERN = re.compile( - r"The number of total (successful|failing) \(sub\) cases\s*: (\d+) \( 
?(\d+\.\d+)%\)" + r"The number of total (successful|failing) \(sub\) cases\s*: (\d+) " + r"\( ?(\d+\.\d+)%\)" ) @@ -53,11 +54,12 @@ def from_records(cls, records: list[tuple[str, int, float]] successful_cases=good_cases, failed_cases=bad_cases) if not inst.is_valid(good_perc / 100, bad_perc / 100): - raise ValueError(f"Unbalanced totals:\nRecords:\n{records}" - f"\nvs\nOutcome:\n{inst}\n" - f"Expected: {good_perc}% S, {bad_perc}% F\n" - f"Got: {inst.successful_cases / inst.total_cases} S, " - f"{inst.failed_cases / inst.total_cases} F") + raise ValueError( + f"Unbalanced totals:\nRecords:\n{records}" + f"\nvs\nOutcome:\n{inst}\n" + f"Expected: {good_perc}% S, {bad_perc}% F\n" + f"Got: {inst.successful_cases / inst.total_cases} S, " + f"{inst.failed_cases / inst.total_cases} F") return inst @@ -83,7 +85,8 @@ def emit(self, record: logging.LogRecord): match = self.pattern.match(line) if match: self.records.append( - (match.group(1), int(match.group(2)), float(match.group(3)))) + (match.group(1), int(match.group(2)), + float(match.group(3)))) def get_captured_records(self) -> list[str]: """ From f87dcc4da63768e4156d79577f6b1c797cd56aca Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 18 Nov 2025 15:05:47 +0000 Subject: [PATCH 055/111] some further linting / whitespace fixes --- medcat-v2/paper/scripts/performance/regression_perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/performance/regression_perf.py b/medcat-v2/paper/scripts/performance/regression_perf.py index ecc7baac3..d03f06849 100644 --- a/medcat-v2/paper/scripts/performance/regression_perf.py +++ b/medcat-v2/paper/scripts/performance/regression_perf.py @@ -65,7 +65,7 @@ def from_records(cls, records: list[tuple[str, int, float]] class CapturingHandler(logging.Handler): """ - A custom logging handler that captures formatted messages + A custom logging handler that captures formatted messages in a list instead of outputting them. 
""" def __init__(self, *args, From d14af99a2c07afec53a7592bd5401fef84b34099 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 18 Nov 2025 15:10:44 +0000 Subject: [PATCH 056/111] Add out/performance folder --- medcat-v2/paper/out/performance/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 medcat-v2/paper/out/performance/.keep diff --git a/medcat-v2/paper/out/performance/.keep b/medcat-v2/paper/out/performance/.keep new file mode 100644 index 000000000..e69de29bb From f2aa642e1084d11d9000ae064d57d9bfb40d5601 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 18 Nov 2025 15:19:39 +0000 Subject: [PATCH 057/111] Add script to get all of regression --- .../scripts/performance/get_regression_all.sh | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 medcat-v2/paper/scripts/performance/get_regression_all.sh diff --git a/medcat-v2/paper/scripts/performance/get_regression_all.sh b/medcat-v2/paper/scripts/performance/get_regression_all.sh new file mode 100644 index 000000000..c8dbd3ed7 --- /dev/null +++ b/medcat-v2/paper/scripts/performance/get_regression_all.sh @@ -0,0 +1,23 @@ + + +script_path="scripts/performance/regression_perf.py" +v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +v1_out_file="out/performance/v1_regression.csv" +v2_out_file="out/performance/v2_regression.csv" + +echo "*****************" +echo "running v1 stuff" +echo "*****************" + +source .venv_v1/bin/activate + +python $script_path $v1_model_pack | head -n 1 >> $v1_out_file + +echo "*****************" +echo "running v2 stuff" +echo "*****************" + +source ../.venv312/bin/activate + +python $script_path $v2_model_pack | head -n 1 >> $v2_out_file From 8905e3f429b500b1361ce7a169b8b367e4f6b270 Mon Sep 17 00:00:00 2001 From: 
mart-r Date: Wed, 19 Nov 2025 09:41:11 +0000 Subject: [PATCH 058/111] Add conversion script for MDACE --- .../MDACE/raw/convert_to_mct_export.py | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/MDACE/raw/convert_to_mct_export.py diff --git a/medcat-v2/paper/data/supervised/MDACE/raw/convert_to_mct_export.py b/medcat-v2/paper/data/supervised/MDACE/raw/convert_to_mct_export.py new file mode 100644 index 000000000..50ff8d32d --- /dev/null +++ b/medcat-v2/paper/data/supervised/MDACE/raw/convert_to_mct_export.py @@ -0,0 +1,87 @@ +import json +import os +import sys +from datetime import datetime +from typing import Iterator + +from medcat.data.mctexport import ( + MedCATTrainerExport, MedCATTrainerExportProject, + MedCATTrainerExportDocument, MedCATTrainerExportAnnotation) +from medcat.data.mctexport import count_all_annotations, count_all_docs + +DEFAULT_INPUT_DIR = "with_text/gold" +DEFAULT_OUTPUT_PATH = "../icd10_convert.json" + + +def get_all_jsons(input_dir: str) -> Iterator[str]: + for fn in os.listdir(input_dir): + path = os.path.join(input_dir, fn) + if os.path.isdir(path): + yield from get_all_jsons(path) + elif path.endswith(".json"): + yield path + + +def do_conversion( + input_dir: str = DEFAULT_INPUT_DIR, + output_file: str = DEFAULT_OUTPUT_PATH): + mod_time = datetime.now().isoformat() + all_out: MedCATTrainerExport = { + "projects": [] + } + + for path in get_all_jsons(input_dir): + if not path.endswith(".json"): + continue + with open(path) as f: + in_data = json.load(f) + documents: list[MedCATTrainerExportDocument] = [] + proj_id = in_data["hadm_id"] + proj_name = f'MDACE_{proj_id}' + project: MedCATTrainerExportProject = { + "documents": documents, + "name": proj_name, + "id": proj_id, + "cuis": "", + "tuis": "", + } + all_out["projects"].append(project) + + in_notes = in_data["notes"] # guess name + for in_doc in in_notes: + doc_id = in_doc["note_id"] + doc_name = 
f'{in_doc["description"]}_{doc_id}' + anns: list[MedCATTrainerExportAnnotation] = [] + documents.append( + { + "name": doc_name, + "id": doc_id, + "last_modified": mod_time, + "text": in_doc["text"], + "annotations": anns, + } + ) + + for ann_num, ann in enumerate(in_doc["annotations"]): + anns.append( + { + "start": ann["begin"], + "end": ann["end"], + # NOTE: this is currently in ICD + "cui": ann["code"], + "value": ann["covered_text"], + "id": f"{proj_name}_{doc_name}_{ann_num}", + "meta_anns": [], + "validated": True, + } + ) + print("GOT", len(all_out["projects"]), "projects", + "with", count_all_annotations(all_out), "annotations", + "across", count_all_docs(all_out), "documents") + + with open(output_file, "w") as of: + json.dump(all_out, of, indent=2) + + +if __name__ == "__main__": + do_conversion(*sys.argv[1:]) From c5449eca5249b1106382e34ba3d52399c09cd641 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 19 Nov 2025 11:47:20 +0000 Subject: [PATCH 059/111] Add mapping from ICD to Snomed --- .../MDACE/raw/map_from_icd_to_snomed.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/MDACE/raw/map_from_icd_to_snomed.py diff --git a/medcat-v2/paper/data/supervised/MDACE/raw/map_from_icd_to_snomed.py b/medcat-v2/paper/data/supervised/MDACE/raw/map_from_icd_to_snomed.py new file mode 100644 index 000000000..0702238ba --- /dev/null +++ b/medcat-v2/paper/data/supervised/MDACE/raw/map_from_icd_to_snomed.py @@ -0,0 +1,104 @@ +import sys +import json +from collections import defaultdict + +from medcat.cat import CAT +from medcat.data.mctexport import ( + MedCATTrainerExport, MedCATTrainerExportAnnotation, + count_all_annotations, count_all_docs) + + +def load_export(path: str) -> MedCATTrainerExport: + with open(path) as f: + return json.load(f) + + +def icd2snomed(cat: CAT) -> dict[str, list[str]]: + code2snomed: dict[str, list[str]] = defaultdict(list) + cui2icd10 = cat.cdb.addl_info["cui2icd10"] + for 
cui_info in cat.cdb.cui2info.values(): + cui = cui_info["cui"] + for icd10 in cui2icd10.get(cui, []): + code2snomed[icd10].append(cui) + print("GOT", len(code2snomed), "ICD codes") + print("Mapped to", sum(len(v) for v in code2snomed.values()), + "total Snomed CUIs") + return code2snomed + + +def pick_concept(cat: CAT, + mapper: dict[str, list[str]], + ann: MedCATTrainerExportAnnotation) -> str | None: + # NOTE: I could try and select 1 - the best + # but there isn't really a good way to do that. + # Instead, we'll use all as candidates + return mapper.get(ann["cui"]) + + +def convert_export( + cat: CAT, export: MedCATTrainerExport + ) -> MedCATTrainerExport: + mapper = icd2snomed(cat) + return { + "projects": [ + { + "id": proj["id"], + "name": proj["name"], + "cuis": proj["cuis"], + "tuis": proj["tuis"], + "documents": docs + } + for proj in export["projects"] + if (docs := [ + { + "id": doc["id"], + "name": doc["name"], + "last_modified": doc["last_modified"], + "text": doc["text"], + "annotations": anns + } for doc in proj["documents"] + if (anns := [ + { + "id": ann["id"], + "start": ann["start"], + "end": ann["end"], + "value": ann["value"], + "cui": mapped_cui, + "meta_anns": ann["meta_anns"], + "validated": ann["validated"] + } for ann in doc["annotations"] + if (mapped_cui := pick_concept(cat, mapper, ann)) + ]) + ]) + ] + } + + +def main(model_pack_path: str, + icd10_export_path: str, + final_export_path: str): + print("Loading model pack", model_pack_path) + cat = CAT.load_model_pack(model_pack_path) + print("Loading export") + export = load_export(icd10_export_path) + print("Initial import has", count_all_docs(export), "docs", + "and", count_all_annotations(export), "anns within", + len(export["projects"]), "projects") + print("Converting...") + converted = convert_export(cat, export) + print("CONVERTED export HAS", count_all_docs(converted), "docs", + "and", count_all_annotations(converted), "anns within", + len(converted["projects"]), "projects") + 
from medcat.data.mctexport import iter_anns + lens = [] + for _, _, ann in iter_anns(converted): + lens.append(len(ann["cui"]) if isinstance(ann["cui"], list) else 1) + print("Total", len(lens), "annotations with", sum(lens) / len(lens), + "values on average") + print("Saving to", final_export_path) + with open(final_export_path, 'w') as f: + json.dump(converted, f) + + +if __name__ == "__main__": + main(*sys.argv[1:]) From 32053853db3db8581e4e8555bf378ff5c3d9abdd Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 19 Nov 2025 14:59:26 +0000 Subject: [PATCH 060/111] Add conversion for distemist dataset --- .../distemist/raw/convert_to_mct_export.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/distemist/raw/convert_to_mct_export.py diff --git a/medcat-v2/paper/data/supervised/distemist/raw/convert_to_mct_export.py b/medcat-v2/paper/data/supervised/distemist/raw/convert_to_mct_export.py new file mode 100644 index 000000000..04ebfef32 --- /dev/null +++ b/medcat-v2/paper/data/supervised/distemist/raw/convert_to_mct_export.py @@ -0,0 +1,130 @@ +import sys +import os +from typing import Iterator +from datetime import datetime +from functools import lru_cache +import json + +import pandas as pd + +from medcat.data.mctexport import ( + MedCATTrainerExport, MedCATTrainerExportDocument) +from medcat.data.mctexport import count_all_annotations, count_all_docs + + +DEFAULT_TEXT_FOLDER = ( + "distemist_zenodo/multilingual_resources/training_text_files/en") +DEFAULT_ANN_FOLDER = ( + "distemist_zenodo/multilingual_resources/en") +DEFAULT_MOD_DATE = datetime.now().isoformat() +DEFAULT_DTYPE = { + "filename": str, + "mart": str, + "label": str, + "offset1": int, + "offset2": int, + "span": str, + "code": str, +} + + +def find_text_file(folder: str, base_name: str) -> str: + path = os.path.join(folder, base_name + ".txt") + if not os.path.exists(path): + raise ValueError(f"No such file/folder: {path}") + return path + + 
+def find_text(folder: str, base_name: str) -> str: + file_path = find_text_file(folder, base_name) + with open(file_path) as f: + return f.read() + + +@lru_cache +def get_doc(folder: str, base_name: str) -> MedCATTrainerExportDocument: + text = find_text(folder, base_name) + return { + "id": hash(base_name), + "name": base_name, + "last_modified": DEFAULT_MOD_DATE, + "text": text, + "annotations": [] + } + + +def get_docs( + annotation_folder: str, + text_folder: str, + ) -> Iterator[MedCATTrainerExportDocument]: + for file_name in os.listdir(annotation_folder): + print("Looking at annotation file", file_name) + if not file_name.endswith(".tsv"): + # print(" - IGNORE") + continue + file_path = os.path.join(annotation_folder, file_name) + df = pd.read_csv(file_path, sep="\t", dtype=DEFAULT_DTYPE, + na_values={"code": ""}) + print(" - Read Data", df.index.shape, '\n - And', df.columns) + for row_nr, row in df.iterrows(): + # print(" - Row nr", row_nr) + file_base_name = row.filename + # print("ROW", row) + # print("CODE", type(row.code), ":", row.code) + if row.code != row.code: + print("ROW", row) + print("CODE", type(row.code), ":", row.code) + print("Unsuitable! 
ignoring") + continue + cuis = row.code.split("+") + start, end = row.offset1, row.offset2 + value = row.span + doc = get_doc(text_folder, file_base_name) + doc["annotations"].append({ + "id": row_nr, + "cui": cuis, + "start": start, + "end": end, + "value": value, + "meta_anns": [], + "validated": True, + }) + yield doc + + +def build_export(text_folder: str, annotation_folder: str + ) -> MedCATTrainerExport: + docs: list[MedCATTrainerExportDocument] = [] + out = { + "projects": [ + { + "id": hash("distemist"), + "name": "distemist", + "cuis": "", + "tuis": "", + "documents": docs + } + ] + } + for cur_doc in get_docs(annotation_folder, text_folder): + if cur_doc not in docs: + # if multuple annotaitons in the same doc/text, + # we don't want multiple instances + docs.append(cur_doc) + return out + + +def main(text_folder: str, annotation_folder: str, + target_file: str): + export = build_export(text_folder, annotation_folder) + print("Built export w", len(export["projects"]), "projects", + count_all_docs(export), "docs and", count_all_annotations(export), + "annotations") + print("Saving to", target_file) + with open(target_file, 'w') as f: + json.dump(export, f) + print("Done!") + + +if __name__ == "__main__": + main(*sys.argv[1:]) From 2e1282e214cd3b48962f997b7bf3af45a0f22cea Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 10:06:24 +0000 Subject: [PATCH 061/111] Add a new stats methodology for multi-optioned datasets --- .../paper/scripts/performance/my_stats.py | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 medcat-v2/paper/scripts/performance/my_stats.py diff --git a/medcat-v2/paper/scripts/performance/my_stats.py b/medcat-v2/paper/scripts/performance/my_stats.py new file mode 100644 index 000000000..e281b4ca9 --- /dev/null +++ b/medcat-v2/paper/scripts/performance/my_stats.py @@ -0,0 +1,222 @@ +# from typing import Callable + +from medcat.data.mctexport import MedCATTrainerExportDocument +from 
medcat.tokenizing.tokens import MutableEntity +# from medcat.cat import CAT +from medcat.cdb.concepts import CUIInfo +from medcat.config.config import LinkingFilters + + +class StatsCalculator: + """Calculates precision/recall statistics for entity linking.""" + + def __init__(self, filters: LinkingFilters, cui2info: dict[str, CUIInfo]): + self.filters = filters + self.cui2info = cui2info + self._reset() + + def _reset(self): + self.tp = self.fp = self.fn = 0 + self.cui_tp: dict[str, int] = {} + self.cui_fp: dict[str, int] = {} + self.cui_fn: dict[str, int] = {} + self.examples: dict[str, dict[str, list]] = { + 'tp': {}, 'fp': {}, 'fn': {}} + + def process_document( + self, + doc: MedCATTrainerExportDocument, + predictions: list[MutableEntity] + ) -> None: + """ + Process a single document's annotations and predictions. + + Args: + doc: Gold-standard annotated document + predictions: Model's predicted entities + """ + gold_anns = self._extract_gold_annotations(doc) + pred_anns = self._extract_predictions(predictions) + + # Track which predictions have been matched + matched_preds: set[int] = set() + + # Phase 1: Match gold annotations to predictions (find TPs and FNs) + for gold in gold_anns: + match_idx = self._find_matching_prediction( + gold, pred_anns, matched_preds) + + if match_idx is not None: + # True Positive + matched_preds.add(match_idx) + pred = pred_anns[match_idx] + self._record_tp(gold, pred) + else: + # False Negative + self._record_fn(gold) + + # Phase 2: Remaining predictions are False Positives + for idx, pred in enumerate(pred_anns): + if idx not in matched_preds: + self._record_fp(pred) + + def _extract_gold_annotations( + self, + doc: MedCATTrainerExportDocument + ) -> list[dict]: + """Extract validated gold annotations, supporting multi-CUI options.""" + gold_anns = [] + + for ann in doc['annotations']: + if not ann.get('validated', True): + continue + if ann.get('killed', False) or ann.get('deleted', False): + continue + + # Support both 
single CUI and multiple acceptable CUIs + cuis = ann.get('acceptable_cuis', [ann['cui']]) + if not isinstance(cuis, list): + cuis = [cuis] + + # Filter to valid CUIs + valid_cuis = [ + cui for cui in cuis + if self.filters.check_filters(cui)] + + if valid_cuis: + gold_anns.append({ + 'start': ann['start'], + 'end': ann['end'], + 'cuis': valid_cuis, # List of acceptable CUIs + 'primary_cui': valid_cuis[0], # For counting + 'text': ann['value'], + 'raw': ann + }) + + return gold_anns + + def _extract_predictions( + self, + predictions: list[MutableEntity] + ) -> list[dict]: + """Extract relevant info from predicted entities.""" + return [{ + 'start': ent.base.start_char_index, + 'end': ent.base.end_char_index, + 'cui': ent.cui, + 'text': ent.base.text, + 'confidence': float(ent.context_similarity), + 'raw': ent + } for ent in predictions if self.filters.check_filters(ent.cui)] + + def _find_matching_prediction( + self, + gold: dict, + predictions: list[dict], + matched_preds: set[int] + ) -> int | None: + """ + Find a prediction that matches this gold annotation. 
+ + Matching criteria: + - Same start position (can be relaxed for fuzzy matching) + - Predicted CUI is in gold's acceptable CUIs + - Not already matched + """ + for idx, pred in enumerate(predictions): + if idx in matched_preds: + continue + + # Exact span match + if pred['start'] == gold['start']: + # Check if predicted CUI is acceptable + if pred['cui'] in gold['cuis']: + return idx + + return None + + def _record_tp(self, gold: dict, pred: dict) -> None: + """Record a true positive.""" + cui = pred['cui'] + self.tp += 1 + self.cui_tp[cui] = self.cui_tp.get(cui, 0) + 1 + + if cui not in self.examples['tp']: + self.examples['tp'][cui] = [] + self.examples['tp'][cui].append({ + 'gold_text': gold['text'], + 'pred_text': pred['text'], + 'cui': cui, + 'start': pred['start'], + 'confidence': pred['confidence'] + }) + + def _record_fn(self, gold: dict) -> None: + """Record a false negative.""" + cui = gold['primary_cui'] + self.fn += 1 + self.cui_fn[cui] = self.cui_fn.get(cui, 0) + 1 + + if cui not in self.examples['fn']: + self.examples['fn'][cui] = [] + self.examples['fn'][cui].append({ + 'text': gold['text'], + 'acceptable_cuis': gold['cuis'], + 'start': gold['start'] + }) + + def _record_fp(self, pred: dict) -> None: + """Record a false positive.""" + cui = pred['cui'] + self.fp += 1 + self.cui_fp[cui] = self.cui_fp.get(cui, 0) + 1 + + if cui not in self.examples['fp']: + self.examples['fp'][cui] = [] + self.examples['fp'][cui].append({ + 'text': pred['text'], + 'cui': cui, + 'start': pred['start'], + 'confidence': pred['confidence'] + }) + + def compute_metrics(self) -> dict: + """Compute overall and per-CUI metrics.""" + print("Overall REPORT w", self.tp, self.fp, self.fn, "\n") + metrics = { + 'overall': self._compute_prf(self.tp, self.fp, self.fn), + 'per_cui': {} + } + + all_cuis = ( + set(self.cui_tp.keys()) | set(self.cui_fp.keys()) | + set(self.cui_fn.keys())) + + for cui in all_cuis: + tp = self.cui_tp.get(cui, 0) + fp = self.cui_fp.get(cui, 0) + fn = 
self.cui_fn.get(cui, 0) + + metrics['per_cui'][cui] = { + 'name': self._get_cui_name(cui), + **self._compute_prf(tp, fp, fn), + 'tp': tp, 'fp': fp, 'fn': fn + } + + return metrics + + @staticmethod + def _compute_prf(tp: int, fp: int, fn: int) -> dict: + """Compute precision, recall, F1.""" + prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0 + rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0 + f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0 + + return {'precision': prec, 'recall': rec, 'f1': f1} + + def _get_cui_name(self, cui: str) -> str: + """Get preferred name for CUI.""" + info = self.cui2info.get(cui) + if info: + return info.get('preferred_name') or list(info['names'])[0] + return cui From 00a196796ea293471b061e82eb9ca0260a2f00ae Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 10:54:08 +0000 Subject: [PATCH 062/111] Minor updates to new stats method --- medcat-v2/paper/scripts/performance/my_stats.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/my_stats.py b/medcat-v2/paper/scripts/performance/my_stats.py index e281b4ca9..6d5b90eff 100644 --- a/medcat-v2/paper/scripts/performance/my_stats.py +++ b/medcat-v2/paper/scripts/performance/my_stats.py @@ -74,7 +74,7 @@ def _extract_gold_annotations( continue # Support both single CUI and multiple acceptable CUIs - cuis = ann.get('acceptable_cuis', [ann['cui']]) + cuis = ann.get('acceptable_cuis', ann['cui']) if not isinstance(cuis, list): cuis = [cuis] @@ -182,7 +182,6 @@ def _record_fp(self, pred: dict) -> None: def compute_metrics(self) -> dict: """Compute overall and per-CUI metrics.""" - print("Overall REPORT w", self.tp, self.fp, self.fn, "\n") metrics = { 'overall': self._compute_prf(self.tp, self.fp, self.fn), 'per_cui': {} From 7fa25a505e13fe5bb43ef9b3b1cc5f3ac02fc3be Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 11:36:17 +0000 Subject: [PATCH 063/111] Update stats to allow projct processing with project 
filters --- .../paper/scripts/performance/my_stats.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/my_stats.py b/medcat-v2/paper/scripts/performance/my_stats.py index 6d5b90eff..e72acb26d 100644 --- a/medcat-v2/paper/scripts/performance/my_stats.py +++ b/medcat-v2/paper/scripts/performance/my_stats.py @@ -1,10 +1,13 @@ -# from typing import Callable +from typing import Callable + +from tqdm import tqdm from medcat.data.mctexport import MedCATTrainerExportDocument +from medcat.data.mctexport import MedCATTrainerExportProject from medcat.tokenizing.tokens import MutableEntity -# from medcat.cat import CAT from medcat.cdb.concepts import CUIInfo from medcat.config.config import LinkingFilters +from medcat.utils.filters import project_filters class StatsCalculator: @@ -58,7 +61,22 @@ def process_document( # Phase 2: Remaining predictions are False Positives for idx, pred in enumerate(pred_anns): if idx not in matched_preds: - self._record_fp(pred) + if self.filters.check_filters(pred["cui"]): + self._record_fp(pred) + + def process_project(self, project: MedCATTrainerExportProject, + entity_getter: Callable[[str], list[MutableEntity]], + use_project_filters: bool = True, + extra_cui_filter: set[str] | None = None, + show_progress: bool = True, + ) -> None: + with project_filters(self.filters, + project, + extra_cui_filter, + use_project_filters): + for doc in tqdm(project["documents"], disable=not show_progress, + desc="Documents"): + self.process_document(doc, entity_getter(doc["text"])) def _extract_gold_annotations( self, From f56b9446a82738d79d2524e5abbd0d26ea3df3d9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 12:21:31 +0000 Subject: [PATCH 064/111] Add v1 implementation for missing stuff (hopefully) --- .../paper/scripts/performance/common_pref.py | 3 + .../paper/scripts/performance/my_stats.py | 19 ++- .../paper/scripts/performance/v1_helper.py | 114 ++++++++++++++++++ 3 
files changed, 130 insertions(+), 6 deletions(-) create mode 100644 medcat-v2/paper/scripts/performance/common_pref.py create mode 100644 medcat-v2/paper/scripts/performance/v1_helper.py diff --git a/medcat-v2/paper/scripts/performance/common_pref.py b/medcat-v2/paper/scripts/performance/common_pref.py new file mode 100644 index 000000000..8deed344d --- /dev/null +++ b/medcat-v2/paper/scripts/performance/common_pref.py @@ -0,0 +1,3 @@ +import medcat + +IS_V2 = medcat.__version__.startswith("2.") diff --git a/medcat-v2/paper/scripts/performance/my_stats.py b/medcat-v2/paper/scripts/performance/my_stats.py index e72acb26d..65b1d561a 100644 --- a/medcat-v2/paper/scripts/performance/my_stats.py +++ b/medcat-v2/paper/scripts/performance/my_stats.py @@ -2,12 +2,19 @@ from tqdm import tqdm -from medcat.data.mctexport import MedCATTrainerExportDocument -from medcat.data.mctexport import MedCATTrainerExportProject -from medcat.tokenizing.tokens import MutableEntity -from medcat.cdb.concepts import CUIInfo -from medcat.config.config import LinkingFilters -from medcat.utils.filters import project_filters +from common_pref import IS_V2 + +if IS_V2: + from medcat.data.mctexport import MedCATTrainerExportDocument + from medcat.data.mctexport import MedCATTrainerExportProject + from medcat.utils.filters import project_filters + from medcat.tokenizing.tokens import MutableEntity + from medcat.cdb.concepts import CUIInfo +else: + from medcat.statsdata.mctexport import MedCATTrainerExportDocument + from medcat.statsdata.mctexport import MedCATTrainerExportProject + from v1_helper import CUIInfo, project_filters, MutableEntity +from medcat.config import LinkingFilters class StatsCalculator: diff --git a/medcat-v2/paper/scripts/performance/v1_helper.py b/medcat-v2/paper/scripts/performance/v1_helper.py new file mode 100644 index 000000000..09422093f --- /dev/null +++ b/medcat-v2/paper/scripts/performance/v1_helper.py @@ -0,0 +1,114 @@ +from typing import TypedDict, Any +from 
contextlib import contextmanager, nullcontext + +from pydantic import BaseModel +from spacy.tokens import Span + +from medcat.cdb import CDB +from medcat.config import LinkingFilters + +from medcat.statsdata.mctexport import MedCATTrainerExportProject + + +class CUIInfo(TypedDict): + preferred_name: str | None + + +class _FakeDict: + + def __call__(self, cdb: CDB): + self.cdb = cdb + + def get(self, cui: str, def_val: Any | None = None) -> CUIInfo | None: + if cui not in self.cdb.cui2preferred_name: + return def_val + return {"preferred_name": self.cdb.cui2preferred_name[cui]} + + def __getitem__(self, cui: str) -> CUIInfo: + if cui not in self.cdb.cui2preferred_name: + raise KeyError(cui) + return {"preferred_name": self.cdb.cui2preferred_name[cui]} + + def __contains__(self, cui: str) -> bool: + return cui in self.cdb.cui2preferred_name + + +def from_cdb(cdb: CDB) -> dict[str, 'CUIInfo']: + return _FakeDict(cdb) + + +class BaseMutableEntity(BaseModel): + start_char_index: int + end_char_index: int + text: str + + +class MutableEntity(BaseModel): + base: BaseMutableEntity + cui: str + confidence: float + + @classmethod + def from_spacy(cls, span: Span) -> 'MutableEntity': + base = BaseMutableEntity(start_char_index=span.start_char, + end_char_index=span.end_char, + text=span.text) + return cls(base=base, + cui=span._.cui, + confidence=span._.context_similarity) + + @classmethod + def from_spacy_list(cls, spans: list[Span]) -> list['MutableEntity']: + return [cls.from_spacy(span) for span in spans] + + +@contextmanager +def temp_changed_config(config: BaseModel, target: str, value: Any): + """Context manager to change the config temporarily (within). + + Args: + config (BaseModel): The config in question. + target (str): The attribute name to change. + value (Any): The temporary value to use. + + Raises: + IllegalConfigPathException: If no previous value is available. 
+ """ + try: + prev_value = getattr(config, target) + except AttributeError as e: + raise IllegalConfigPathException(target) from e + setattr(config, target, value) + try: + yield + finally: + setattr(config, target, prev_value) + + +class IllegalConfigPathException(ValueError): + + def __init__(self, target_path: str): + super().__init__( + f"Config has no target path: {target_path}") + + +def project_filters(filters: LinkingFilters, + project: MedCATTrainerExportProject, + extra_cui_filter: set[str] | None, + use_project_filters: bool): + """Context manager with per project filters based on a trainer export. + + Args: + filters (LinkingFilters): The current config. + project (MedCATTrainerExportProject): The trainer export. + extra_cui_filter (Optional[set[str]]): Extra cui filters. + use_project_filters (bool): Whether to use project filters. + """ + if extra_cui_filter is not None and not use_project_filters: + return temp_changed_config(filters, 'cuis', extra_cui_filter) + if use_project_filters: + cuis = project.get('cuis', None) + if cuis is None or not cuis: + return nullcontext() + return temp_changed_config(filters, 'cuis', set(cuis.split(","))) + return temp_changed_config(filters, 'cuis', set()) From 6fbc6a6155c20cd3fb51784764028022b09ab976 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 12:28:17 +0000 Subject: [PATCH 065/111] Fix minor import path issues --- medcat-v2/paper/scripts/performance/my_stats.py | 4 ++-- medcat-v2/paper/scripts/performance/v1_helper.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/my_stats.py b/medcat-v2/paper/scripts/performance/my_stats.py index 65b1d561a..83f8694fe 100644 --- a/medcat-v2/paper/scripts/performance/my_stats.py +++ b/medcat-v2/paper/scripts/performance/my_stats.py @@ -11,8 +11,8 @@ from medcat.tokenizing.tokens import MutableEntity from medcat.cdb.concepts import CUIInfo else: - from medcat.statsdata.mctexport import 
MedCATTrainerExportDocument - from medcat.statsdata.mctexport import MedCATTrainerExportProject + from medcat.stats.mctexport import MedCATTrainerExportDocument + from medcat.stats.mctexport import MedCATTrainerExportProject from v1_helper import CUIInfo, project_filters, MutableEntity from medcat.config import LinkingFilters diff --git a/medcat-v2/paper/scripts/performance/v1_helper.py b/medcat-v2/paper/scripts/performance/v1_helper.py index 09422093f..f003a5387 100644 --- a/medcat-v2/paper/scripts/performance/v1_helper.py +++ b/medcat-v2/paper/scripts/performance/v1_helper.py @@ -7,7 +7,7 @@ from medcat.cdb import CDB from medcat.config import LinkingFilters -from medcat.statsdata.mctexport import MedCATTrainerExportProject +from medcat.stats.mctexport import MedCATTrainerExportProject class CUIInfo(TypedDict): From e19a7a7385e9df77883599d9d4b95bdcf48ff287 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 12:30:07 +0000 Subject: [PATCH 066/111] Fix problematic dunder call method --- medcat-v2/paper/scripts/performance/v1_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/performance/v1_helper.py b/medcat-v2/paper/scripts/performance/v1_helper.py index f003a5387..27b7a89fe 100644 --- a/medcat-v2/paper/scripts/performance/v1_helper.py +++ b/medcat-v2/paper/scripts/performance/v1_helper.py @@ -16,7 +16,7 @@ class CUIInfo(TypedDict): class _FakeDict: - def __call__(self, cdb: CDB): + def __init__(self, cdb: CDB): self.cdb = cdb def get(self, cui: str, def_val: Any | None = None) -> CUIInfo | None: From f22e0d395ea19a6bf1ff613bdccaf32789b5253b Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 12:32:20 +0000 Subject: [PATCH 067/111] Fix typo in name --- medcat-v2/paper/scripts/performance/v1_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/v1_helper.py b/medcat-v2/paper/scripts/performance/v1_helper.py index 27b7a89fe..f6c830f9b 
100644 --- a/medcat-v2/paper/scripts/performance/v1_helper.py +++ b/medcat-v2/paper/scripts/performance/v1_helper.py @@ -46,7 +46,7 @@ class BaseMutableEntity(BaseModel): class MutableEntity(BaseModel): base: BaseMutableEntity cui: str - confidence: float + context_similarity: float @classmethod def from_spacy(cls, span: Span) -> 'MutableEntity': @@ -55,7 +55,7 @@ def from_spacy(cls, span: Span) -> 'MutableEntity': text=span.text) return cls(base=base, cui=span._.cui, - confidence=span._.context_similarity) + context_similarity=span._.context_similarity) @classmethod def from_spacy_list(cls, spans: list[Span]) -> list['MutableEntity']: From 54e7a91ef6ef5ddb0bd6f820fae6d7c7a58d5c50 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 12:49:22 +0000 Subject: [PATCH 068/111] Add performance script for model and dataset(s) --- .../get_performance_for_model_and_ds.py | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py diff --git a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py new file mode 100644 index 000000000..607300b8f --- /dev/null +++ b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py @@ -0,0 +1,164 @@ +import json +import sys +# import io +import os +# from contextlib import contextmanager, redirect_stdout +import re +# from copy import deepcopy +# from collections import Counter + +import pandas as pd +from tqdm import tqdm + +from common_pref import IS_V2 + +from medcat.cat import CAT +# from medcat.stats.stats import StatsBuilder, get_stats +if IS_V2: + from medcat.data.mctexport import MedCATTrainerExport, iter_anns +else: + from medcat.stats.mctexport import MedCATTrainerExport, iter_anns + from v1_helper import MutableEntity, from_cdb +# from medcat.data.mctexport import count_all_docs +# from medcat.utils.filters import 
project_filters + +from my_stats import StatsCalculator + + +def get_overall_prec_rec_f1(cat: CAT, export: MedCATTrainerExport + ) -> tuple[float, float, float]: + if IS_V2: + calculator = StatsCalculator( + cat.config.components.linking.filters, + cat.cdb.cui2info) + else: + calculator = StatsCalculator( + cat.config.linking.filters, + from_cdb(cat.cdb)) + for proj in tqdm(export["projects"], desc="Projects"): + if IS_V2: + calculator.process_project( + proj, lambda text: cat(text).linked_ents, + show_progress=False) + else: + calculator.process_project( + proj, lambda text: MutableEntity.from_spacy_list( + cat(text).ents), + show_progress=False) + overall = calculator.compute_metrics()["overall"] + return overall["precision"], overall["recall"], overall["f1"] + + +PREC_REC_F1_PATTERN = re.compile( + r"Epoch: \d, Prec: (\d\.\d+), Rec: (\d\.\d+), F1: (\d\.\d+)") + + +# @contextmanager +# def capture_overall_perf(): +# out_perf = [] +# string_io = io.StringIO() +# with redirect_stdout(string_io): +# yield out_perf +# lines = [ +# line for line in +# string_io.getvalue().split("\n") +# if line.startswith("Epoch: ") +# ] +# f_report_lines = [ +# line for line in +# string_io.getvalue().split("\n") +# if line.startswith("FINALISE REPORT w") +# ] +# print("\n".join(f_report_lines)) +# if len(lines) != 1: +# raise ValueError( +# "Found too many (or too few) matching lines:" +# f"\n{'\n'.join(lines)}") +# match = PREC_REC_F1_PATTERN.match(lines[0]) +# if not match: +# raise ValueError(f"Did not match pattern:\n{lines[0]}") +# out_perf.append(( +# float(match.group(1)), float(match.group(2)), float(match.group(3)))) + + +def load_data(path: str, setup_filters: bool = True) -> MedCATTrainerExport: + with open(path) as f: + data = json.load(f) + # fix str -> int in some weird exports + for _, _, ann in iter_anns(data): + ann["start"] = int(ann["start"]) + ann["end"] = int(ann["end"]) + # # count how many extras we did created + # fixer: dict[str, int] = Counter() + # for _, 
doc in iter_docs(data): + # do_rearrange = False + # for ann in list(doc["annotations"]): + # if isinstance(ann["cui"], list): + # do_rearrange = True + # doc["annotations"].remove(ann) + # for cui in ann["cui"]: + # cp_ann = deepcopy(ann) + # cp_ann["cui"] = cui + # doc["annotations"].append(ann) + # fixer[cui] += 1 + # if do_rearrange: + # doc["annotations"].sort(key=lambda ann: ann["start"]) + for proj in data["projects"]: + all_cuis: set[str] = set() + for doc in proj["documents"]: + for ann in doc["annotations"]: + cuis = ann["cui"] + if not isinstance(cuis, list): + cuis = [cuis, ] + all_cuis.update(cuis) + prev_cuis = proj["cuis"] + if prev_cuis: + all_cuis.update(proj["cuis"].split(",")) + all_cuis_str = ",".join(all_cuis) + proj["cuis"] = all_cuis_str + return data + + +# def get_stats(cat: CAT, data: MedCATTrainerExport, +# fixer: dict[str, int]): +# builder = StatsBuilder.from_cat(cat, +# use_project_filters=True, +# use_overlaps=True) +# for pind, project in tqdm(enumerate(data['projects']), +# desc="Stats project", +# total=len(data['projects']), +# leave=False): +# with project_filters(cat.config.components.linking.filters, +# project, +# builder.extra_cui_filter, +# builder.use_project_filters): +# builder.process_project(project) +# # TODO: how do I use the fixer? 
+# # this is the part that prints out the stats +# builder.finalise_report(0, do_print=True) + + +def main(model_pack_path: str, + *export_paths: str): + cat = CAT.load_model_pack(model_pack_path) + out_data: list[tuple[str, float, float, float, float]] = [] + for export_path in export_paths: + print("Exploring", export_path) + data = load_data(export_path) + # with capture_overall_perf() as captured: + # get_stats(cat, data) + # out_data.extend([os.path.basename(export_path)] + captured) + # print("GOT", captured) + # print("NEW VERSION") + new_metrics = get_overall_prec_rec_f1(cat, data) + out_data.extend([os.path.basename(export_path)] + list(new_metrics)) + print(new_metrics) + df = pd.DataFrame( + out_data, + columns=["filename", "prec", "rec", "F1"] + ) + print(df.to_string()) + + +if __name__ == "__main__": + main(sys.argv[1], *sys.argv[2:]) From bba1171ecea8e2340f237ae6c3a999a5636333a4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 12:49:53 +0000 Subject: [PATCH 069/111] Remove commented code --- .../get_performance_for_model_and_ds.py | 74 ------------------- 1 file changed, 74 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py index 607300b8f..542e98732 100644 --- a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py +++ b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py @@ -1,11 +1,7 @@ import json import sys -# import io import os -# from contextlib import contextmanager, redirect_stdout import re -# from copy import deepcopy -# from collections import Counter import pandas as pd from tqdm import tqdm @@ -13,14 +9,11 @@ from common_pref import IS_V2 from medcat.cat import CAT -# from medcat.stats.stats import StatsBuilder, get_stats if IS_V2: from medcat.data.mctexport import MedCATTrainerExport, iter_anns else: from medcat.stats.mctexport import MedCATTrainerExport, iter_anns 
from v1_helper import MutableEntity, from_cdb -# from medcat.data.mctexport import count_all_docs -# from medcat.utils.filters import project_filters from my_stats import StatsCalculator @@ -53,34 +46,6 @@ def get_overall_prec_rec_f1(cat: CAT, export: MedCATTrainerExport r"Epoch: \d, Prec: (\d\.\d+), Rec: (\d\.\d+), F1: (\d\.\d+)") -# @contextmanager -# def capture_overall_perf(): -# out_perf = [] -# string_io = io.StringIO() -# with redirect_stdout(string_io): -# yield out_perf -# lines = [ -# line for line in -# string_io.getvalue().split("\n") -# if line.startswith("Epoch: ") -# ] -# f_report_lines = [ -# line for line in -# string_io.getvalue().split("\n") -# if line.startswith("FINALISE REPORT w") -# ] -# print("\n".join(f_report_lines)) -# if len(lines) != 1: -# raise ValueError( -# "Found too many (or too few) matching lines:" -# f"\n{'\n'.join(lines)}") -# match = PREC_REC_F1_PATTERN.match(lines[0]) -# if not match: -# raise ValueError(f"Did not match pattern:\n{lines[0]}") -# out_perf.append(( -# float(match.group(1)), float(match.group(2)), float(match.group(3)))) - - def load_data(path: str, setup_filters: bool = True) -> MedCATTrainerExport: with open(path) as f: data = json.load(f) @@ -88,21 +53,6 @@ def load_data(path: str, setup_filters: bool = True) -> MedCATTrainerExport: for _, _, ann in iter_anns(data): ann["start"] = int(ann["start"]) ann["end"] = int(ann["end"]) - # # count how many extras we did created - # fixer: dict[str, int] = Counter() - # for _, doc in iter_docs(data): - # do_rearrange = False - # for ann in list(doc["annotations"]): - # if isinstance(ann["cui"], list): - # do_rearrange = True - # doc["annotations"].remove(ann) - # for cui in ann["cui"]: - # cp_ann = deepcopy(ann) - # cp_ann["cui"] = cui - # doc["annotations"].append(ann) - # fixer[cui] += 1 - # if do_rearrange: - # doc["annotations"].sort(key=lambda ann: ann["start"]) for proj in data["projects"]: all_cuis: set[str] = set() for doc in proj["documents"]: @@ -119,25 +69,6 
@@ def load_data(path: str, setup_filters: bool = True) -> MedCATTrainerExport: return data -# def get_stats(cat: CAT, data: MedCATTrainerExport, -# fixer: dict[str, int]): -# builder = StatsBuilder.from_cat(cat, -# use_project_filters=True, -# use_overlaps=True) -# for pind, project in tqdm(enumerate(data['projects']), -# desc="Stats project", -# total=len(data['projects']), -# leave=False): -# with project_filters(cat.config.components.linking.filters, -# project, -# builder.extra_cui_filter, -# builder.use_project_filters): -# builder.process_project(project) -# # TODO: how do I use the fixer? -# # this is the part that prints out the stats -# builder.finalise_report(0, do_print=True) - - def main(model_pack_path: str, *export_paths: str): cat = CAT.load_model_pack(model_pack_path) @@ -145,11 +76,6 @@ def main(model_pack_path: str, for export_path in export_paths: print("Exploring", export_path) data = load_data(export_path) - # with capture_overall_perf() as captured: - # get_stats(cat, data) - # out_data.extend([os.path.basename(export_path)] + captured) - # print("GOT", captured) - # print("NEW VERSION") new_metrics = get_overall_prec_rec_f1(cat, data) out_data.extend([os.path.basename(export_path)] + list(new_metrics)) print(new_metrics) From 993c692fa020595b8e75a76774f4a70cfd4fe562 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 13:13:10 +0000 Subject: [PATCH 070/111] Allow filtering before disamb (optionally) --- .../performance/get_performance_for_model_and_ds.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py index 542e98732..b29972f23 100644 --- a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py +++ b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py @@ -18,16 +18,21 @@ from my_stats import StatsCalculator -def 
get_overall_prec_rec_f1(cat: CAT, export: MedCATTrainerExport +def get_overall_prec_rec_f1(cat: CAT, export: MedCATTrainerExport, + filter_before_disamb: bool = False ) -> tuple[float, float, float]: if IS_V2: calculator = StatsCalculator( cat.config.components.linking.filters, cat.cdb.cui2info) + if filter_before_disamb: + cat.config.components.linking.filter_before_disamb = True else: calculator = StatsCalculator( cat.config.linking.filters, from_cdb(cat.cdb)) + if filter_before_disamb: + cat.config.linking.filter_before_disamb = True for proj in tqdm(export["projects"], desc="Projects"): if IS_V2: calculator.process_project( From d22a85f18b9db6685f921270e4f85ddde7850ed6 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 13:18:26 +0000 Subject: [PATCH 071/111] Add README for MDACE dataset --- .../paper/data/supervised/MDACE/raw/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/MDACE/raw/README.md diff --git a/medcat-v2/paper/data/supervised/MDACE/raw/README.md b/medcat-v2/paper/data/supervised/MDACE/raw/README.md new file mode 100644 index 000000000..1753785fe --- /dev/null +++ b/medcat-v2/paper/data/supervised/MDACE/raw/README.md @@ -0,0 +1,18 @@ +First we download the MDACE dataset and prepare it with MIMIC-IV as per instructions: +https://github.com/3mcloud/MDACE + +Then, we need to convert the data to a format MedCAT can understand using: +```python +python convert_to_mct_export.py # no need for arguments if in this folder +``` + +However, that still only has ICD-10 codes. +Yet the models we're comparing to use SNOMED. + +So we then need to convert to SNOMED by doing: +```python +python map_from_icd_to_snomed.py ../icd10_convert.json ../mct_export_with_candidates.json +``` + +This will create a trainer export that has multiple CUIs as options for each annotation. 
+That is because ICD-10 codes can map to multiple different Snomed concepts and there is no automated way to create a 1 to 1 mapping. From 2553fbeea664754df8c31b8293339098361117b5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 13:22:50 +0000 Subject: [PATCH 072/111] Add README for distemist dataset --- .../paper/data/supervised/distemist/raw/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/distemist/raw/README.md diff --git a/medcat-v2/paper/data/supervised/distemist/raw/README.md b/medcat-v2/paper/data/supervised/distemist/raw/README.md new file mode 100644 index 000000000..0001b4559 --- /dev/null +++ b/medcat-v2/paper/data/supervised/distemist/raw/README.md @@ -0,0 +1,11 @@ +First we need to download and extract the distemist dataset: +https://temu.bsc.es/distemist/distemist-linking/ + +Subsequently, we convert to MedCAT supported format: +```python +python convert_to_mct_export.py distemist_zenodo/multilingual_resources/training_text_files/en distemist_zenodo/multilingual_resources/en ../mct_export.json +``` + +NOTE: +The underlying dataset (at least in some cases) links to multiple concepts per annotation. +And because of that the output also allows a subset of concepts. 
From aad0e064a60e5b05cfb4f25bbd2118c21596c43a Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 13:23:36 +0000 Subject: [PATCH 073/111] Add conversion script - from linking challenge to trainer export --- .../supervised/linking_challenge/raw/convert_to_mct_export.py | 1 + 1 file changed, 1 insertion(+) create mode 120000 medcat-v2/paper/data/supervised/linking_challenge/raw/convert_to_mct_export.py diff --git a/medcat-v2/paper/data/supervised/linking_challenge/raw/convert_to_mct_export.py b/medcat-v2/paper/data/supervised/linking_challenge/raw/convert_to_mct_export.py new file mode 120000 index 000000000..fd957f236 --- /dev/null +++ b/medcat-v2/paper/data/supervised/linking_challenge/raw/convert_to_mct_export.py @@ -0,0 +1 @@ +/Users/martratas/Documents/CogStack/.MedCAT.nosync/MedCATv2_new/.temp/validation_datasets/linking_challenge_2023/convert_to_mct_export.py \ No newline at end of file From 817feba3e30c9df21bc86850ab63062ee4168ede Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 13:34:16 +0000 Subject: [PATCH 074/111] Add README for linking challenge data prep --- .../paper/data/supervised/linking_challenge/raw/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/linking_challenge/raw/README.md diff --git a/medcat-v2/paper/data/supervised/linking_challenge/raw/README.md b/medcat-v2/paper/data/supervised/linking_challenge/raw/README.md new file mode 100644 index 000000000..cf470d6ae --- /dev/null +++ b/medcat-v2/paper/data/supervised/linking_challenge/raw/README.md @@ -0,0 +1,7 @@ +First, we need to download the 2023 Snomed linking challenge dataset: +https://www.drivendata.org/competitions/258/competition-snomed-ct/ + +Then, ocnvert to MedCAT supported format: +```python +python convert_to_mct_export.py mimic-iv_notes_training_set.csv train_annotations.csv ../mct_export.json +``` \ No newline at end of file From dad18e28398ea88c02f63edcefe1d0ef5047aa61 Mon Sep 17 00:00:00 2001 From: 
mart-r Date: Thu, 20 Nov 2025 14:40:45 +0000 Subject: [PATCH 075/111] Add README for COMETA dataset --- medcat-v2/paper/data/supervised/cometa/raw/README.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/cometa/raw/README.md diff --git a/medcat-v2/paper/data/supervised/cometa/raw/README.md b/medcat-v2/paper/data/supervised/cometa/raw/README.md new file mode 100644 index 000000000..f2b9200c6 --- /dev/null +++ b/medcat-v2/paper/data/supervised/cometa/raw/README.md @@ -0,0 +1,7 @@ +First, we need to download the dataset: +https://metatext.io/datasets/cometa + +Then we need to convert to a format MedCAT understands: +```python +python conversion/converter.py chv.csv ../mct_export.json +``` \ No newline at end of file From 56d8bbc8406c2b02b82aae7fbe9f4a7cfe04e6ad Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 14:42:19 +0000 Subject: [PATCH 076/111] Add cometa dataset conversion script --- .../cometa/raw/conversion/converter.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/cometa/raw/conversion/converter.py diff --git a/medcat-v2/paper/data/supervised/cometa/raw/conversion/converter.py b/medcat-v2/paper/data/supervised/cometa/raw/conversion/converter.py new file mode 100644 index 000000000..52f3d72a7 --- /dev/null +++ b/medcat-v2/paper/data/supervised/cometa/raw/conversion/converter.py @@ -0,0 +1,115 @@ +from sys import argv +import json +import os.path +from datetime import datetime + +from tqdm import tqdm +import pandas as pd + +from medcat.data.mctexport import ( + MedCATTrainerExport, MedCATTrainerExportProject, + MedCATTrainerExportAnnotation) +from medcat.data.mctexport import count_all_docs, count_all_annotations + + +COLS = ['Term', 'General SNOMED Label', 'General SNOMED ID', + 'Specific SNOMED Label', 'Specific SNOMED ID', 'Example', + 'Example Link', 'Origin_Sheet'] +COL4VALUE = "Term" +COL4CUI = "Specific SNOMED ID" +COL4TEXT = 
"Example" +COL4LINK = "Example Link" + +# November 2020 +LAST_MODIFIED = datetime(year=2020, month=11, day=1).isoformat() + + +def find_annotations(value: str, text: str, cui: str + ) -> list[MedCATTrainerExportAnnotation]: + value = value.lower() + orig_text = text + text = text.lower() + if value not in text: + raise ValueError(f"{repr(value)} not in text ({repr(text)})") + cur_start = 0 + anns: list[MedCATTrainerExportAnnotation] = [] + while (cur_index := text.find(value, cur_start)) >= 0: + start = cur_index + end = cur_index + len(value) + anns.append( + { + "cui": str(cui), + "value": orig_text[start: end], + "start": start, + "end": end, + } + ) + cur_start = end + if len(anns) > 100: + raise KeyError( + f"Too many annotations!, {start}, {end}, for {value}. " + f"cur start at {cur_start}") + return anns + + +def do_conversion(df: pd.DataFrame, proj_base_id: str, proj_base_name: str + ) -> MedCATTrainerExport: + projects: list[MedCATTrainerExportProject] = [] + for line_num, (index, line) in enumerate(tqdm(df.iterrows(), + total=len(df.index))): + text = line[COL4TEXT] + cui = line[COL4CUI] + try: + anns = find_annotations( + line[COL4VALUE], text, cui) + except ValueError as e: + print("LINE", line_num, "at index", index, + "Failed to load(VE):", str(e)) + continue + except AttributeError as e: + print("LINE", line_num, "at index", index, + "Failed to load(AE):", str(e)) + continue + proj_id = proj_base_id + str(index) + proj_name = proj_base_name + "@" + str(index) + # NOTE: each document is a project so that I can use per-project + # filters and thus only focus on the CUI in question and not + # the other terms in the text + projects.append({ + "documents": [ + { + "text": text, + "annotations": anns, + "id": str(index), + "name": f"LINK: {line[COL4LINK]}; ID: {index}", + "last_modified": LAST_MODIFIED + } + ], + "id": proj_id, + "name": proj_name, + "cuis": f'{cui}', + "tuis": '', + }) + return {"projects": projects} + + +def main(file_path: str, + 
export_path: str, + # TODO: options + ): + df = pd.read_csv(file_path, sep='\t', index_col=0, header=0).sort_index() + proj_name = export_path.split(os.path.sep + "cometa" + os.path.sep, 1)[-1] + proj_id = ".".join(proj_name.split(os.path.sep)[-2:]).replace(".csv", "") + print("Giving 'project' a name of", repr(proj_name)) + print("And setting ID to", proj_id) + mct_export = do_conversion(df, proj_id, proj_name) + print("Got", len(mct_export["projects"]), "projects with a total of", + count_all_docs(mct_export), "documents and a total of", + count_all_annotations(mct_export), "annotations") + print("Saving to", repr(export_path)) + with open(export_path, 'w') as f: + json.dump(mct_export, f) + + +if __name__ == "__main__": + main(*argv[1:]) From adfe353c01f08045f9602a4d686af9314bf124c0 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 20 Nov 2025 14:45:49 +0000 Subject: [PATCH 077/111] Add medmentions conversion scripts --- .../medmentions/raw/src/conversion_mapper.py | 73 +++++ .../raw/src/medmen_umls2snomed_converter.py | 258 ++++++++++++++++++ .../raw/src/medmentions_converter.py | 87 ++++++ 3 files changed, 418 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/medmentions/raw/src/conversion_mapper.py create mode 100644 medcat-v2/paper/data/supervised/medmentions/raw/src/medmen_umls2snomed_converter.py create mode 100644 medcat-v2/paper/data/supervised/medmentions/raw/src/medmentions_converter.py diff --git a/medcat-v2/paper/data/supervised/medmentions/raw/src/conversion_mapper.py b/medcat-v2/paper/data/supervised/medmentions/raw/src/conversion_mapper.py new file mode 100644 index 000000000..4ec76ed09 --- /dev/null +++ b/medcat-v2/paper/data/supervised/medmentions/raw/src/conversion_mapper.py @@ -0,0 +1,73 @@ +import os +import json +import pandas as pd +from medcat.model_creation.preprocess_umls import _DEFAULT_COLUMNS + + +def get_umls_df(umls_path: str) -> pd.DataFrame: + mrconso = os.path.join(umls_path, "MRCONSO.RRF") + df = 
def load_cuis(needed_path: str) -> list[str]:
    """Load CUIs from a text file with comma-separated CUIs on each line.

    Surrounding whitespace (including the line terminator) is stripped from
    every entry, and blank entries and blank lines are skipped, so the
    returned values can be compared directly against a CUI column.

    Args:
        needed_path: Path to the file listing the needed CUIs.

    Returns:
        The CUIs in file order (duplicates are kept).
    """
    with open(needed_path) as f:
        stripped = (part.strip() for line in f for part in line.split(","))
        # consume the generator while the file is still open
        return [cui for cui in stripped if cui]
print("SAVING to", json_path) + with open(json_path, 'w') as f: + json.dump(map_dict, f) + + +if __name__ == "__main__": + import sys + main(*sys.argv[1:]) diff --git a/medcat-v2/paper/data/supervised/medmentions/raw/src/medmen_umls2snomed_converter.py b/medcat-v2/paper/data/supervised/medmentions/raw/src/medmen_umls2snomed_converter.py new file mode 100644 index 000000000..c28c517d4 --- /dev/null +++ b/medcat-v2/paper/data/supervised/medmentions/raw/src/medmen_umls2snomed_converter.py @@ -0,0 +1,258 @@ + +import json +import os +from copy import deepcopy +from functools import lru_cache +from typing import Callable + +import pandas as pd + +from medcat.data.mctexport import MedCATTrainerExport, iter_anns, iter_docs +from medcat.data.mctexport import MedCATTrainerExportAnnotation +from medcat.model_creation.preprocess_umls import _DEFAULT_COLUMNS +from medcat.cdb import CDB + + +UMLS_TYPE_TO_SNOMED_TYPE = { + "T033": ("67667581", "finding"), + "T059": ("28321150", "procedure"), + "T060": ("28321150", "procedure"), + # "T061": ("28321150", "procedure"), + "T090": ("16939031", "occupation"), + "T091": ("16939031", "occupation"), + "T071": ("2680757", "observable entity"),# and T077/conceptual entity? + # "" : ("40357424", "foundation metadata concept"), + # "" : ("29422548", "core metadata concept"), + "T072": ("32816260", "physical object"), + # "" : ("7882689", "qualifier value"), + "T167": ("91187746", "substance"), + # "" : ("72706784", "nan"), + "T001": ("81102976", "organism"), + "T017": ("37552161", "body structure"), # I think? 
+ "T047": ("9090192", "disorder"), + # "" : ("33782986", "morphologic abnormality"), + # "" : ("66527446", "cell structure"), + "T061": ("47503797", "regime/therapy"), + # "" : ("91776366", "product"), + # "" : ("37785117", "medicinal product"), + "T025": ("99220404", "cell"), + # "" : ("31601201", "person"), + # "" : ("20410104", "ethnic group"), + # "" : ("75168589", "environment"), + "T051": ("33797723", "event"), + # "" : ("46922199", "religion/philosophy"), + "T201": ("43039974", "attribute"), + # "" : ("3061879", "situation"), + # "" : ("9593000", "medicinal product form"), + # "" : ("82417248", "navigational concept"), + # "" : ("43857361", "physical force"), + "T200": ("27603525", "clinical drug"), + # "" : ("13371933", "social concept"), + # "" : ("30703196", "tumor staging"), + # "" : ("337250", "specimen"), + # "" : ("8067332", "basic dose form"), + # "" : ("21114934", "dose form"), + # "" : ("55540447", "linkage concept"), + # "" : ("31685163", "staging scale"), + # "" : ("90170645", "record artifact"), + # "" : ("17030977", "assessment scale"), + # "" : ("25624495", "SNOMED RT+CTV3"), + # "" : ("18854038", "geographic location"), + # "" : ("78096516", "environment / location"), + # "" : ("92873870", "special concept"), + # "" : ("70426313", "namespace concept"), + # "" : ("14654508", "racial group"), + # "" : ("28695783", "link assertion"), + # "" : ("46506674", "disposition"), + # "" : ("39041339", "unit of presentation"), + # "" : ("51885115", "OWL metadata concept"), + # "" : ("49144999", "state of matter"), + # "" : ("66203715", "transformation"), + # "" : ("51120815", "intended site"), + # "" : ("64755083", "release characteristic"), + # "" : ("45958968", "administration method"), + # "" : ("87776218", "role"), + # "" : ("43744943", "supplier"), + # "" : ("95475658", "product name"), + # "" : ("40584095", "metadata"), + # "" : ("3242456", "life style"), +} + +SNOMED_TYPE_ID2NAME = { + '67667581': 'finding', '28321150': 'procedure', '16939031': 
'occupation', + '2680757': 'observable entity', '40357424': 'foundation metadata concept', + '29422548': 'core metadata concept', '32816260': 'physical object', + '7882689': 'qualifier value', '91187746': 'substance', '72706784': 'nan', + '81102976': 'organism', '37552161': 'body structure', '9090192': 'disorder', + '33782986': 'morphologic abnormality', '66527446': 'cell structure', + '47503797': 'regime/therapy', '91776366': 'product', '37785117': 'medicinal product', + '99220404': 'cell', '31601201': 'person', '20410104': 'ethnic group', + '75168589': 'environment', '33797723': 'event', '46922199': 'religion/philosophy', + '43039974': 'attribute', '3061879': 'situation', '9593000': 'medicinal product form', + '82417248': 'navigational concept', '43857361': 'physical force', + '27603525': 'clinical drug', '13371933': 'social concept', '30703196': 'tumor staging', + '337250': 'specimen', '8067332': 'basic dose form', '21114934': 'dose form', + '55540447': 'linkage concept', '31685163': 'staging scale', '90170645': 'record artifact', + '17030977': 'assessment scale', '25624495': 'SNOMED RT+CTV3', + '18854038': 'geographic location', '78096516': 'environment / location', + '92873870': 'special concept', '70426313': 'namespace concept', '14654508': 'racial group', + '28695783': 'link assertion', '46506674': 'disposition', '39041339': 'unit of presentation', + '51885115': 'OWL metadata concept', '49144999': 'state of matter', '66203715': 'transformation', + '51120815': 'intended site', '64755083': 'release characteristic', + '45958968': 'administration method', '87776218': 'role', '43744943': 'supplier', + '95475658': 'product name', '40584095': 'metadata', '3242456': 'life style' +} + + + +def load_export(path: str) -> MedCATTrainerExport: + with open(path) as f: + return json.load(f) + + +def load_umls(umls_path: str) -> pd.DataFrame: + mrconso = os.path.join(umls_path, "MRCONSO.RRF") + df = pd.read_csv(mrconso, names=_DEFAULT_COLUMNS, sep="|", index_col=False) + 
print("INIT", len(df.index)) + df = df[df["LAT"] == "ENG"] + print("After LANG", len(df.index)) + df = df[df["SAB"].str.contains("SNOMEDCT")] + print("After SNOMED", len(df.index)) + df = df[df["SCUI"].notna()] + print("After removing None-CUIs", len(df.index)) + # remove column I don't care about + df = df.drop(["LAT", # language - already selected + "LUI", # unique identifier for term + "SUI", # unique identifier for string + "AUI", # Unique identifier for atom - variable length field, 8 or 9 characters + # source stuff - will get CUI from CODE + "SAUI", # Source asserted atom identifier [optional] + "SCUI", # Source asserted concept identifier [optional] + "SDUI", # Source asserted descriptor identifier [optional] + ], axis='columns') + return df + + +class TPG: + + def __init__(self, pt2ch: dict) -> None: + self.pt2ch = pt2ch + + @lru_cache + def get_root_parent(self, cui: str) -> str | None: + for pt, children in self.pt2ch.items(): + if cui in children: + rp = self.get_root_parent(pt) + if rp is None: + return cui + # if not a child of anything, must be root + return None + + +def pick_snomed_cui(ann: MedCATTrainerExportAnnotation, + umls_df: pd.DataFrame, + get_cui_name: Callable[[str], str], + tpg: TPG) -> str | None: + umls_cui = ann['cui'] + snomed_candidates = umls_df[umls_df['CUI'] == umls_cui] + num_of_candidates = len(snomed_candidates.index) + if num_of_candidates == 0: + return None + elif num_of_candidates == 1: + return snomed_candidates['CODE'].to_list()[0] + preferred = snomed_candidates[snomed_candidates["TS"] == "P"] + num_of_candidates = len(preferred) + if num_of_candidates == 1 or len(preferred['CODE'].unique()) == 1: + return preferred['CODE'].to_list()[0] + return None + if num_of_candidates == 0: + # check all if no preferred + print("No preferred candidates...") + preferred = snomed_candidates + cuis_and_names = [(row['CODE'], get_cui_name(row['CODE'])) + for _, row in preferred.iterrows() + if get_cui_name(row['CODE']) != row['CODE'] 
is not None] + cuis_with_name = set(cui for cui, _ in cuis_and_names) + if len(cuis_with_name) == 1: + return list(cuis_with_name)[0] + # find one exact match (if present) + names_of_cuis = set((name, cui) for cui, name in cuis_and_names + if name.lower() == ann['value'].lower()) + if len(names_of_cuis) == 1: + return list(names_of_cuis)[0][1] + cuitid2 = [ + (cui, name, tid, + UMLS_TYPE_TO_SNOMED_TYPE[tid], + # SNOMED_TYPE_ID2NAME[UMLS_TYPE_TO_SNOMED_TYPE[tid]] + ) + for cui, name in cuis_and_names + for tid in ann['type_ids'].split(",") + if tid in UMLS_TYPE_TO_SNOMED_TYPE] + print("CUI typeIDs") + print(cuitid2) + num_of_candidates = len(preferred) + print("Picking from", num_of_candidates, 'for', umls_cui, 'from') + print(preferred) + print("Context:", ann) + cand_cuis = [row['CODE'] for _, row in preferred.iterrows()] + for cui in cand_cuis: + root = tpg.get_root_parent(cui) # the type id + root_name = get_cui_name(root) if root else None + print("CUI2name", cui, f"({get_cui_name(cui)})", + "->", root, "->", root_name) + # import time + # time.sleep(0.2) + + +def convert(export: MedCATTrainerExport, + umls_df: pd.DataFrame, + cui2name: Callable[[str], str], + pt2ch: dict) -> MedCATTrainerExport: + export = deepcopy(export) + total_initial_anns = len(list(iter_anns(export))) + tpg = TPG(pt2ch) + for _, doc, ann in iter_anns(export): + snomed_cui = pick_snomed_cui(ann, umls_df, cui2name, tpg) + if snomed_cui: + ann['cui'] = snomed_cui + else: + ann['cui'] = None + total_kept = 0 + total_removed = 0 + for _, doc in iter_docs(export): + to_remove = [] + for nr, ann in enumerate(doc['annotations']): + if ann['cui'] is None: + to_remove.append(nr) + total_removed += len(to_remove) + total_kept += len(doc['annotations']) - len(to_remove) + # print("Removing", to_remove, "annotations") + for nr in to_remove[::-1]: + # start from end to avoid changing order while iterating + doc["annotations"].pop(nr) + print("Total removed", total_removed) + print("Total kept", 
total_kept) + print("TOTAL TOTAL", total_removed + total_kept, 'vs', total_initial_anns) + return export + + +def main(export_path: str, + cdb_path: str, + umls_path: str, + target_path: str) -> None: + print("Loading original") + export = load_export(export_path) + print("Getting CDB") + cdb = CDB.load(cdb_path) + pt2ch = cdb.addl_info['pt2ch'] + print("Loading UMLS") + umls_df = load_umls(umls_path) + print("Converting...") + converted = convert(export, umls_df, cdb.get_name, pt2ch) + with open(target_path, 'w') as f: + json.dump(converted, f) + + +if __name__ == "__main__": + import sys + main(*sys.argv[1:]) diff --git a/medcat-v2/paper/data/supervised/medmentions/raw/src/medmentions_converter.py b/medcat-v2/paper/data/supervised/medmentions/raw/src/medmentions_converter.py new file mode 100644 index 000000000..fb2b8798f --- /dev/null +++ b/medcat-v2/paper/data/supervised/medmentions/raw/src/medmentions_converter.py @@ -0,0 +1,87 @@ +import os +import json +from typing import Iterator + +from medcat.data.mctexport import MedCATTrainerExportDocument, MedCATTrainerExport +from medcat.data.mctexport import MedCATTrainerExportAnnotation +from datetime import datetime + + +def _unwrap_ann_line(line: str) -> MedCATTrainerExportAnnotation: + _, start, end, value, type_ids, cui = line.split("\t") + return { + "cui": cui, + "start": start, + "end": end, + "value": value, + "type_ids": type_ids,# EXTRA + } + + +def unwrap_anns(ann_lines: list[str]) -> list[MedCATTrainerExportAnnotation]: + return [ + _unwrap_ann_line(line) for line in ann_lines + ] + + +def load_medmentions(file_name: str) -> Iterator[tuple[str, str, str, dict]]: + with open(file_name) as f: + all_text = f.read() + for nr, part in enumerate(all_text.split("\n\n")): + if not part: + continue + # print("PART", nr, ":", type(part), len(part)) + title_line, a_line, *ann_lines = part.split("\n") + doc_id, title = title_line.split("|t|", 1) + _doc_id, abstract = a_line.split("|a|", 1) + assert doc_id == _doc_id 
def save_export(mct_export: dict, file_name: str) -> None:
    """Save the export as JSON, refusing to overwrite anything.

    Args:
        mct_export: The export to serialise.
        file_name: The path of the file to create.

    Raises:
        ValueError: If something already exists at `file_name`.
    """
    try:
        # 'x' creates the file exclusively, so the existence check and the
        # creation happen in one step and nothing can be overwritten
        f = open(file_name, 'x')
    except FileExistsError as err:
        raise ValueError(f"File exists: {file_name}") from err
    with f:
        json.dump(mct_export, f)
dict]]: yield doc_id, title, abstract, unwrap_anns(ann_lines) -def get_export(file_name: str, snomed_pt2ch: dict) -> MedCATTrainerExport: +def get_export(file_name: str) -> MedCATTrainerExport: mct_export: MedCATTrainerExport = { "projects": [ { @@ -76,9 +76,8 @@ def load_json(fn: str) -> dict: def main(*args: str): - in_file, out_file, snomed_pt2ch_file = args - snomed_pt2ch = load_json(snomed_pt2ch_file) - mct_export = get_export(in_file, snomed_pt2ch) + in_file, out_file = args + mct_export = get_export(in_file) save_export(mct_export, out_file) From 596178646babf77a0c8dd3ddf97706c5119086f0 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 21 Nov 2025 09:26:36 +0000 Subject: [PATCH 079/111] Add MedMentions dataset README --- .../data/supervised/medmentions/raw/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 medcat-v2/paper/data/supervised/medmentions/raw/README.md diff --git a/medcat-v2/paper/data/supervised/medmentions/raw/README.md b/medcat-v2/paper/data/supervised/medmentions/raw/README.md new file mode 100644 index 000000000..8ed76468c --- /dev/null +++ b/medcat-v2/paper/data/supervised/medmentions/raw/README.md @@ -0,0 +1,15 @@ +First we need to download the MedMentions dataset: +https://github.com/chanzuckerberg/MedMentions + +Then, we need to convert it to a format MedCAT understands: +```python +python src/medmentions_converter.py corpus_pubtator.txt medmentions_umls.json +``` + +However, this still has UMLS codes instead of Snomed ones. +For that we also need UMLS (`MRCONSO.RRF`) to do the mapping. 
+ +To do the conversion into Snomed we do: +```python +python src/medmen_umls2snomed_converter.py medmentions_umls.json CDB_PATH UMLS_DIR ../medmentions_snomed_stricter.json +``` \ No newline at end of file From a878241b883efa1307a5c07d793022daf896e18c Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 21 Nov 2025 10:00:58 +0000 Subject: [PATCH 080/111] Keep unsupervised data folder --- medcat-v2/paper/data/unsupervised/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 medcat-v2/paper/data/unsupervised/.keep diff --git a/medcat-v2/paper/data/unsupervised/.keep b/medcat-v2/paper/data/unsupervised/.keep new file mode 100644 index 000000000..e69de29bb From 5f62fd8b22e151dec2a497b54c0e81ccdfa8ce26 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 21 Nov 2025 11:21:37 +0000 Subject: [PATCH 081/111] Add script to get all performance --- .../performance/get_performance_all.sh | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 medcat-v2/paper/scripts/performance/get_performance_all.sh diff --git a/medcat-v2/paper/scripts/performance/get_performance_all.sh b/medcat-v2/paper/scripts/performance/get_performance_all.sh new file mode 100644 index 000000000..df99010dc --- /dev/null +++ b/medcat-v2/paper/scripts/performance/get_performance_all.sh @@ -0,0 +1,23 @@ +a + +script_path="scripts/performance/get_performance_for_model_and_ds.py" +# v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +# v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/medcat-snomed-model-creation/.creation_cache/out_snomed_2025/final_model_Snomed2025-07-11_de7cbec4a786e418.zip" 
+v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/2025_11_19_issue228_add_meta_cat_to_other/models/v2_Snomed2025_MIMIC_IV_bbe806e192df009f.zip" + +echo "*****************" +echo "running v1 stuff" +echo "*****************" + +source .venv_v1/bin/activate + +python $script_path $v1_model_pack data/supervised/*/*.json + +echo "*****************" +echo "running v2 stuff" +echo "*****************" + +source ../.venv312/bin/activate + +python $script_path $v2_model_pack data/supervised/*/*.json From bbb062528d35bd9d554430bd5a260859096b3c28 Mon Sep 17 00:00:00 2001 From: mart-r Date: Fri, 21 Nov 2025 12:21:33 +0000 Subject: [PATCH 082/111] Fix performance script --- .../scripts/performance/get_performance_for_model_and_ds.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py index b29972f23..a83d6e0e9 100644 --- a/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py +++ b/medcat-v2/paper/scripts/performance/get_performance_for_model_and_ds.py @@ -82,7 +82,8 @@ def main(model_pack_path: str, print("Exploring", export_path) data = load_data(export_path) new_metrics = get_overall_prec_rec_f1(cat, data) - out_data.extend([os.path.basename(export_path)] + list(new_metrics)) + out_data.append([os.path.basename( + os.path.dirname(export_path))] + list(new_metrics)) print(new_metrics) df = pd.DataFrame( out_data, From 98e64e10ac8f9e0b6ed9aea6feccf4a5bd18aeb3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 25 Nov 2025 13:18:23 +0000 Subject: [PATCH 083/111] CU-869b9h7y6: Add faster linker that only links to primary names --- .../linking/only_primary_name_linker.py | 70 +++++++++++++++++++ medcat-v2/medcat/components/types.py | 4 ++ 2 files changed, 74 insertions(+) create mode 100644 medcat-v2/medcat/components/linking/only_primary_name_linker.py diff --git 
a/medcat-v2/medcat/components/linking/only_primary_name_linker.py b/medcat-v2/medcat/components/linking/only_primary_name_linker.py new file mode 100644 index 000000000..97bfb8c9b --- /dev/null +++ b/medcat-v2/medcat/components/linking/only_primary_name_linker.py @@ -0,0 +1,70 @@ +from typing import Iterator, Optional, Union +import logging + +from medcat.tokenizing.tokens import MutableDocument, MutableEntity +from medcat.components.linking.context_based_linker import Linker +from medcat.components.linking.vector_context_model import ( + PerDocumentTokenCache) +from medcat.utils.defaults import StatusTypes +from medcat.cdb import CDB +from medcat.vocab import Vocab +from medcat.config import Config + + +logger = logging.getLogger(__name__) + + +class OnlyPrimaryNamesLinker(Linker): + name = 'primary_name_only_linker' + + def __init__(self, cdb: CDB, vocab: Vocab, config: Config) -> None: + super().__init__(cdb, vocab, config) + print("==INIT== primary_name_only_linker") + # don't need / use the context model + del self.context_model + + def _process_entity_inference( + self, doc: MutableDocument, + entity: MutableEntity, + per_doc_valid_token_cache: PerDocumentTokenCache + ) -> Iterator[MutableEntity]: + cuis = entity.link_candidates + if not cuis: + return + # Check does it have a detected name + name = entity.detected_name + if name is None: + logger.info("No name detected for entity %s", entity) + return + primary_cuis = [cui for cui, status in + self.cdb.name2info[name]["per_cui_status"].items() + if status in StatusTypes.PRIMARY_STATUS] + if not primary_cuis: + logger.info("No pimary CUIs for name %s", name) + return + if len(primary_cuis) > 1: + logger.info( + "Ambiguous pimary CUIs for name %s: %s", name, primary_cuis) + return + cui = primary_cuis[0] + entity.cui = cui + entity.context_similarity = 1.0 + yield entity + + def train(self, cui: str, + entity: MutableEntity, + doc: MutableDocument, + negative: bool = False, + names: Union[list[str], dict] = 
[], + per_doc_valid_token_cache: Optional[PerDocumentTokenCache] = None + ) -> None: + raise NoTrainingException("Training is not supported for this linker") + + def _train_on_doc(self, doc: MutableDocument, + ner_ents: list[MutableEntity] + ) -> Iterator[MutableEntity]: + raise NoTrainingException("Training is not supported for this linker") + + +class NoTrainingException(ValueError): + pass diff --git a/medcat-v2/medcat/components/types.py b/medcat-v2/medcat/components/types.py index a3aa549eb..a95b82ee3 100644 --- a/medcat-v2/medcat/components/types.py +++ b/medcat-v2/medcat/components/types.py @@ -213,6 +213,10 @@ def train(self, cui: str, "medcat2_embedding_linker": ( "medcat.components.linking.embedding_linker", "Linker.create_new_component"), + # primary name only + "primary_name_only_linker": ( + "medcat.components.linking.only_primary_name_linker", + "OnlyPrimaryNamesLinker.create_new_component"), } From d72b4f940c62f889238ad02b6fcd71a2f4e0c1b3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 25 Nov 2025 14:05:47 +0000 Subject: [PATCH 084/111] CU-869b9h7y6: Remove debug output --- medcat-v2/medcat/components/linking/only_primary_name_linker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/medcat-v2/medcat/components/linking/only_primary_name_linker.py b/medcat-v2/medcat/components/linking/only_primary_name_linker.py index 97bfb8c9b..d472eabf6 100644 --- a/medcat-v2/medcat/components/linking/only_primary_name_linker.py +++ b/medcat-v2/medcat/components/linking/only_primary_name_linker.py @@ -19,7 +19,6 @@ class OnlyPrimaryNamesLinker(Linker): def __init__(self, cdb: CDB, vocab: Vocab, config: Config) -> None: super().__init__(cdb, vocab, config) - print("==INIT== primary_name_only_linker") # don't need / use the context model del self.context_model From 0839a24ee077d16cdd1de85a029bbe27e4ed8a77 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 25 Nov 2025 14:25:38 +0000 Subject: [PATCH 085/111] CU-869b9h7y6: Add proper filtering as well as usage of 
single-possible CUI options --- .../linking/only_primary_name_linker.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/medcat-v2/medcat/components/linking/only_primary_name_linker.py b/medcat-v2/medcat/components/linking/only_primary_name_linker.py index d472eabf6..c9cca440d 100644 --- a/medcat-v2/medcat/components/linking/only_primary_name_linker.py +++ b/medcat-v2/medcat/components/linking/only_primary_name_linker.py @@ -35,9 +35,28 @@ def _process_entity_inference( if name is None: logger.info("No name detected for entity %s", entity) return - primary_cuis = [cui for cui, status in - self.cdb.name2info[name]["per_cui_status"].items() - if status in StatusTypes.PRIMARY_STATUS] + cnf_l = self.config.components.linking + if cnf_l.filter_before_disamb: + cuis = [cui for cui in cuis if cnf_l.filters.check_filters(cui)] + if not cuis: + logger.debug("No CUIs that fit filter for %s", entity) + return + if len(cuis) == 1: + if cnf_l.filters.check_filters(cuis[0]): + logger.info("Choosing only possible CUI %s for %s", + cuis[0], entity) + entity.cui = cuis[0] + entity.context_similarity = 1.0 + yield entity + else: + logger.info( + "A single CUI (%s) was mapped to for %s but not in filter", + cuis[0], entity) + return + primary_cuis = [cui for cui in cuis + if (self.cdb.name2info[name]['per_cui_status'][cui] + in StatusTypes.PRIMARY_STATUS and + cnf_l.filters.check_filters(cui))] if not primary_cuis: logger.info("No pimary CUIs for name %s", name) return From 48396af99ff09f910eda82cbd7a416bc9bc24602 Mon Sep 17 00:00:00 2001 From: mart-r Date: Tue, 25 Nov 2025 16:15:29 +0000 Subject: [PATCH 086/111] CU-869b9h7y6: Add a simple test for the new linker --- .../linking/test_primary_name_only_linker.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 medcat-v2/tests/components/linking/test_primary_name_only_linker.py diff --git a/medcat-v2/tests/components/linking/test_primary_name_only_linker.py 
b/medcat-v2/tests/components/linking/test_primary_name_only_linker.py new file mode 100644 index 000000000..f2036eb7e --- /dev/null +++ b/medcat-v2/tests/components/linking/test_primary_name_only_linker.py @@ -0,0 +1,33 @@ +import os + +from medcat.cdb import CDB +from medcat.cat import CAT +from medcat.vocab import Vocab +from medcat.components.linking.only_primary_name_linker import ( + OnlyPrimaryNamesLinker) + +import unittest + +from ... import UNPACKED_EXAMPLE_MODEL_PACK_PATH + + +EXAMPLE_CDB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "cdb") +EXAMPLE_VOCAB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "vocab") + + +class PrimaryNamesLinkerTests(unittest.TestCase): + TEXT = ( + "Man was diagnosed with severe kidney failure and acute diabetes " + "and presented with a light fever") + + @classmethod + def setUpClass(cls): + vocab = Vocab.load(EXAMPLE_VOCAB_PATH) + cdb = CDB.load(EXAMPLE_CDB_PATH) + cdb.config.components.linking.comp_name = OnlyPrimaryNamesLinker.name + cls.cat = CAT(cdb, vocab) + + def test_gets_entities(self): + ents = self.cat.get_entities(self.TEXT) + self.assertTrue(ents) + self.assertTrue(len(ents["entities"])) From 6d7910b27d60434ceb1706480c6c0d6163a6d3a3 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 13:41:05 +0000 Subject: [PATCH 087/111] Add a few scripts to show possible variance in performance and throughput --- .../get_variance_with_linker_and_tokenizer.py | 94 +++++++++++++++++++ ..._variance_with_linker_and_tokenizer_all.sh | 27 ++++++ 2 files changed, 121 insertions(+) create mode 100644 medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py create mode 100644 medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py new file mode 100644 index 000000000..350407853 --- /dev/null +++ 
def setup_cui_filter(data: dict) -> None:
    """Extend each project's CUI filter with all CUIs annotated in it.

    The "cuis" value of every project is replaced (in place) by a
    comma-separated, sorted list of the CUIs it already named plus every CUI
    used by an annotation in the project's documents. A short summary of the
    per-project filter sizes is printed afterwards.

    Args:
        data: The trainer export; its "projects" are modified in place.
    """
    per_proj_cuis: list[int] = []
    for proj in data["projects"]:
        all_cuis = {
            ann["cui"]
            for doc in proj["documents"]
            for ann in doc["annotations"]
        }
        # An empty filter ("") splits into [""], so blank entries need to be
        # dropped - otherwise an empty "CUI" ends up in the filter and in
        # the counts.
        existing = (part.strip() for part in proj["cuis"].split(","))
        all_cuis.update(cui for cui in existing if cui)
        # sorted so that the output is reproducible between runs
        proj["cuis"] = ",".join(sorted(all_cuis))
        per_proj_cuis.append(len(all_cuis))
    if not per_proj_cuis:
        # min / max / mean are undefined without any projects
        print("Total projects", 0)
        return
    print("Total projects", len(per_proj_cuis),
          "\n Min CUIs", min(per_proj_cuis),
          "\n Mean CUIs", sum(per_proj_cuis) / len(per_proj_cuis),
          "\n Max CUIs", max(per_proj_cuis))
cat.config.general.nlp.provider = "regex" + cat._recreate_pipe() + else: + print("Using regular (spacy) tokenizer") + print("Loading data", data_path) + with open(data_path) as f: + data = json.load(f) + print("setting up CUI filter") + setup_cui_filter(data) + print("Running metrics...") + start = time.perf_counter() + if DO_PROFILING: + print("PROFILING") + profile = Profile() + profile.enable() + get_stats(cat, data, use_project_filters=True) + if DO_PROFILING: + profile.disable() + end = time.perf_counter() + print("Took", end - start) + if DO_PROFILING: + print("Profile stats (CUMtime)") + stats = Stats(profile) + print(stats.sort_stats("cumtime").print_stats(50)) + print("Profile stats (TOTtime)") + stats = Stats(profile) + print(stats.sort_stats("tottime").print_stats(50)) + + +if __name__ == "__main__": + from sys import argv + main(*argv[1:]) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh new file mode 100644 index 000000000..f23b112ef --- /dev/null +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -0,0 +1,27 @@ +SCRIPT="scripts/variance/get_variance_with_linker_and_tokenizer.py" +MODEL_PATH="../.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" + +echo "==COMETA==" +DATASET="data/supervised/cometa/mct_export.json" + +echo "NORMAL" +python $SCRIPT old spacy $MODEL_PATH $DATASET +echo "With faster linker" +python $SCRIPT new spacy $MODEL_PATH $DATASET +echo "With regex tokenizer" +python $SCRIPT old regex $MODEL_PATH $DATASET +echo "With regex tokenizer AND faster linker" +python $SCRIPT new regex $MODEL_PATH $DATASET + +# other dataset +echo "==Linking Challenge==" +DATASET="data/supervised/linking_challenge/mct_export.json" + +echo "NORMAL" +python $SCRIPT old spacy $MODEL_PATH $DATASET +echo "With faster linker" +python $SCRIPT new spacy $MODEL_PATH $DATASET +echo "With regex tokenizer" 
+python $SCRIPT old regex $MODEL_PATH $DATASET +echo "With regex tokenizer AND faster linker" +python $SCRIPT new regex $MODEL_PATH $DATASET From 146662e7df4cbeea93dd8ef6ff43f13e34937936 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 16:22:10 +0000 Subject: [PATCH 088/111] Update script to include embedding linker in it --- .../get_variance_with_linker_and_tokenizer.py | 66 +++++++++++++++---- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py index 350407853..d2e2a4fc2 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py @@ -1,17 +1,63 @@ import json import time +from enum import Enum from cProfile import Profile from pstats import Stats from medcat.cat import CAT +from medcat.components.types import CoreComponentType from medcat.stats import get_stats EXAMPLE_DATASET = "paper/data/supervised/cometa/mct_export.json" EXAMPLE_MODEL_PATH = ".temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" -USE_NEW_LINKER = False +USE_LINKER = "DEFAULT" USE_REGEX_TOKENIZER = True -DO_PROFILING = True +DO_PROFILING = False + + +class LinkerType(Enum): + DEFAULT = 0 + VECTOR_CONTEXT = 0 + FASTER = 1 + EMBEDDING = 2 + + @classmethod + def get_type(cls, linker: str) -> 'LinkerType': + if linker.upper() in cls: + return cls[linker.upper()] + elif linker.lower() in ("new", "faster"): + return cls.FASTER + elif "embed" in linker.lower(): + return cls.EMBEDDING + elif linker.lower() in ("normal", "def", "regular", "reg", "old"): + return cls.DEFAULT + raise ValueError(f"Unknown linker type: '{linker}'") + + def set_linker(self, cat: CAT): + cmp_cnf = cat.config.components + if self is LinkerType.DEFAULT: + # make change just in case this is a re-run / subsequent change + cmp_cnf.linking.comp_name = 
"default" + if self is LinkerType.FASTER: + cmp_cnf.linking.comp_name = "primary_name_only_linker" + elif self is LinkerType.EMBEDDING: + from medcat.config.config import EmbeddingLinking + from medcat.components.linking.embedding_linker import ( + Linker as ELinker) + cmp_cnf.linking = EmbeddingLinking() + # NOTE: should fix on the lib side + cmp_cnf.linking.comp_name = "medcat2_embedding_linker" + # need to recreate and create embeddings + cat._recreate_pipe() + linker: ELinker = cat.pipe.get_component(CoreComponentType.linking) + print("Creating embeddings...") + linker.create_embeddings() + # NOTE: returning without another pipe recreation + return + else: + raise ValueError("Not defined for linker:") + cat._recreate_pipe() def setup_cui_filter(data: dict) -> None: @@ -33,29 +79,25 @@ def setup_cui_filter(data: dict) -> None: def main( - new_linker_raw: bool | str = USE_NEW_LINKER, + linker_type_str: str = USE_LINKER, regex_tokenizer_raw: bool | str = USE_REGEX_TOKENIZER, model_path: str = EXAMPLE_MODEL_PATH, data_path: str = EXAMPLE_DATASET): - if isinstance(new_linker_raw, str): - new_linker = new_linker_raw.lower() in ("new", "yes", "true") - else: - new_linker = new_linker_raw + linker_type = LinkerType.get_type(linker_type_str) if isinstance(regex_tokenizer_raw, str): regex_tokenizer = regex_tokenizer_raw.lower() in ( "regex", "yes", "true") else: regex_tokenizer = regex_tokenizer_raw - print(f"Setup:\n Linker:{'new' if new_linker else 'old'}" + print(f"Setup:\n Linker:{linker_type.name}" f"\n Tokenizer:{'regex' if regex_tokenizer else 'spacy'}") print("Loading model", model_path, "...") cat = CAT.load_model_pack(model_path) # NOTE: prep subnames cat.cdb.has_subname("") - if new_linker: - print("USING NEW LINKER") - cat.config.components.linking.comp_name = "primary_name_only_linker" - cat._recreate_pipe() + if linker_type != LinkerType.DEFAULT: + print("USING non-default LINKER", linker_type) + linker_type.set_linker(cat) else: print("Using DEFAULT 
linker...") if regex_tokenizer: From f3a545ddc421b791e8ccff34778c70cf36ab91f4 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 16:23:49 +0000 Subject: [PATCH 089/111] Add embedding linker stuff to script --- .../get_variance_with_linker_and_tokenizer_all.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index f23b112ef..a61a8a5a4 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -13,6 +13,13 @@ python $SCRIPT old regex $MODEL_PATH $DATASET echo "With regex tokenizer AND faster linker" python $SCRIPT new regex $MODEL_PATH $DATASET +# with embedding linker + +echo "With spacy tokenizer + embed lnker" +python $SCRIPT embed spacy $MODEL_PATH $DATASET +echo "With regex tokenizer + embed linker" +python $SCRIPT embed regex $MODEL_PATH $DATASET + # other dataset echo "==Linking Challenge==" DATASET="data/supervised/linking_challenge/mct_export.json" @@ -25,3 +32,10 @@ echo "With regex tokenizer" python $SCRIPT old regex $MODEL_PATH $DATASET echo "With regex tokenizer AND faster linker" python $SCRIPT new regex $MODEL_PATH $DATASET + +# with embedding linker + +echo "With spacy tokenizer + embed lnker" +python $SCRIPT embed spacy $MODEL_PATH $DATASET +echo "With regex tokenizer + embed linker" +python $SCRIPT embed regex $MODEL_PATH $DATASET From bb5951c2a7e18cb566485d6562bfaff1a8357edd Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 16:58:56 +0000 Subject: [PATCH 090/111] Start moving towards a better format for variance getting (get 1 output in CSV) --- .../get_variance_with_linker_and_tokenizer.py | 56 ++++++++++++++++++- ..._variance_with_linker_and_tokenizer_all.sh | 26 +++++---- 2 files changed, 69 insertions(+), 13 deletions(-) diff --git 
a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py index d2e2a4fc2..58cf2a20f 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py @@ -1,6 +1,9 @@ import json import time from enum import Enum +import io +from contextlib import redirect_stdout, redirect_stderr, contextmanager +import re from cProfile import Profile from pstats import Stats @@ -78,7 +81,55 @@ def setup_cui_filter(data: dict) -> None: "\n Max CUIs", max(per_proj_cuis)) +@contextmanager +def capture_output(): + f = io.StringIO() + out_list = [] + with redirect_stdout(f): + with redirect_stderr(f): + yield out_list + lines = f.getvalue().split("\n") + linker: str | None = None + tokenizer: str | None = None + prec: str | None = None + rec: str | None = None + f1: str | None = None + for line in lines: + if m := re.match(r"\s+Linker:\s*(.*)", line): + linker = m.group(1) + elif m := re.match(r"\s+Tokenizer:\s*(.*)", line): + tokenizer = m.group(1) + elif m := re.search( + r"Epoch:\s*0,.*Prec:\s*([\d.]+),\s*" + r"Rec:\s*([\d.]+),\s*"r"F1:\s*([\d.]+)", line): + prec, rec, f1 = m.groups() + if None not in (linker, tokenizer, prec, rec, f1): + # break early if all found + break + if None in (linker, tokenizer, prec, rec, f1): + raise ValueError( + "Unable to find linker, tokenizer, precision, recall, or f1. 
Got " + f"{linker}, {tokenizer}, {prec}, {rec}, {f1}") + out_list.extend([linker, tokenizer, prec, rec, f1]) + + def main( + linker_type_str: str = USE_LINKER, + regex_tokenizer_raw: bool | str = USE_REGEX_TOKENIZER, + model_path: str = EXAMPLE_MODEL_PATH, + data_path: str = EXAMPLE_DATASET, + one_line_only: bool = False): + if one_line_only: + with capture_output() as captured: + _main(linker_type_str, regex_tokenizer_raw, + model_path, data_path) + print(",".join(captured)) + else: + _main(linker_type_str, regex_tokenizer_raw, + model_path, data_path) + + +def _main( linker_type_str: str = USE_LINKER, regex_tokenizer_raw: bool | str = USE_REGEX_TOKENIZER, model_path: str = EXAMPLE_MODEL_PATH, @@ -133,4 +184,7 @@ def main( if __name__ == "__main__": from sys import argv - main(*argv[1:]) + one_line_only = "--one-line" in argv + if one_line_only: + argv.remove("--one-line") + main(*argv[1:], one_line_only=one_line_only) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index a61a8a5a4..5d1816cbd 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -1,41 +1,43 @@ SCRIPT="scripts/variance/get_variance_with_linker_and_tokenizer.py" MODEL_PATH="../.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +MODEL_PATH="../tests/resources/mct2_model_pack.zip" echo "==COMETA==" DATASET="data/supervised/cometa/mct_export.json" +EXTRA="--one-line" echo "NORMAL" -python $SCRIPT old spacy $MODEL_PATH $DATASET +python $SCRIPT old spacy $MODEL_PATH $DATASET $EXTRA echo "With faster linker" -python $SCRIPT new spacy $MODEL_PATH $DATASET +python $SCRIPT new spacy $MODEL_PATH $DATASET $EXTRA echo "With regex tokenizer" -python $SCRIPT old regex $MODEL_PATH $DATASET +python $SCRIPT old regex $MODEL_PATH $DATASET $EXTRA echo "With regex 
tokenizer AND faster linker" -python $SCRIPT new regex $MODEL_PATH $DATASET +python $SCRIPT new regex $MODEL_PATH $DATASET $EXTRA # with embedding linker echo "With spacy tokenizer + embed lnker" -python $SCRIPT embed spacy $MODEL_PATH $DATASET +python $SCRIPT embed spacy $MODEL_PATH $DATASET $EXTRA echo "With regex tokenizer + embed linker" -python $SCRIPT embed regex $MODEL_PATH $DATASET +python $SCRIPT embed regex $MODEL_PATH $DATASET $EXTRA # other dataset echo "==Linking Challenge==" DATASET="data/supervised/linking_challenge/mct_export.json" echo "NORMAL" -python $SCRIPT old spacy $MODEL_PATH $DATASET +python $SCRIPT old spacy $MODEL_PATH $DATASET $EXTRA echo "With faster linker" -python $SCRIPT new spacy $MODEL_PATH $DATASET +python $SCRIPT new spacy $MODEL_PATH $DATASET $EXTRA echo "With regex tokenizer" -python $SCRIPT old regex $MODEL_PATH $DATASET +python $SCRIPT old regex $MODEL_PATH $DATASET $EXTRA echo "With regex tokenizer AND faster linker" -python $SCRIPT new regex $MODEL_PATH $DATASET +python $SCRIPT new regex $MODEL_PATH $DATASET $EXTRA # with embedding linker echo "With spacy tokenizer + embed lnker" -python $SCRIPT embed spacy $MODEL_PATH $DATASET +python $SCRIPT embed spacy $MODEL_PATH $DATASET $EXTRA echo "With regex tokenizer + embed linker" -python $SCRIPT embed regex $MODEL_PATH $DATASET +python $SCRIPT embed regex $MODEL_PATH $DATASET $EXTRA From 1f92f83f440b7be9b36faf163c2f3a8174ee5913 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 16:59:49 +0000 Subject: [PATCH 091/111] Remove some echo /debug output --- ..._variance_with_linker_and_tokenizer_all.sh | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index 5d1816cbd..26dc840e0 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ 
b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -2,42 +2,42 @@ SCRIPT="scripts/variance/get_variance_with_linker_and_tokenizer.py" MODEL_PATH="../.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" MODEL_PATH="../tests/resources/mct2_model_pack.zip" -echo "==COMETA==" +#"==COMETA==" DATASET="data/supervised/cometa/mct_export.json" EXTRA="--one-line" -echo "NORMAL" +#"NORMAL" python $SCRIPT old spacy $MODEL_PATH $DATASET $EXTRA -echo "With faster linker" +#"With faster linker" python $SCRIPT new spacy $MODEL_PATH $DATASET $EXTRA -echo "With regex tokenizer" +#"With regex tokenizer" python $SCRIPT old regex $MODEL_PATH $DATASET $EXTRA -echo "With regex tokenizer AND faster linker" +# "With regex tokenizer AND faster linker" python $SCRIPT new regex $MODEL_PATH $DATASET $EXTRA # with embedding linker -echo "With spacy tokenizer + embed lnker" +# "With spacy tokenizer + embed lnker" python $SCRIPT embed spacy $MODEL_PATH $DATASET $EXTRA -echo "With regex tokenizer + embed linker" +# "With regex tokenizer + embed linker" python $SCRIPT embed regex $MODEL_PATH $DATASET $EXTRA # other dataset -echo "==Linking Challenge==" +# "==Linking Challenge==" DATASET="data/supervised/linking_challenge/mct_export.json" -echo "NORMAL" +# "NORMAL" python $SCRIPT old spacy $MODEL_PATH $DATASET $EXTRA -echo "With faster linker" +# "With faster linker" python $SCRIPT new spacy $MODEL_PATH $DATASET $EXTRA -echo "With regex tokenizer" +# "With regex tokenizer" python $SCRIPT old regex $MODEL_PATH $DATASET $EXTRA -echo "With regex tokenizer AND faster linker" +# "With regex tokenizer AND faster linker" python $SCRIPT new regex $MODEL_PATH $DATASET $EXTRA # with embedding linker -echo "With spacy tokenizer + embed lnker" +# "With spacy tokenizer + embed lnker" python $SCRIPT embed spacy $MODEL_PATH $DATASET $EXTRA -echo "With regex tokenizer + embed linker" +# "With regex tokenizer + embed linker" python $SCRIPT embed regex $MODEL_PATH $DATASET $EXTRA From 
0b4ecabafe26c68eaa294e429335b7c22918a547 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 17:12:43 +0000 Subject: [PATCH 092/111] Add dataset name to output --- .../variance/get_variance_with_linker_and_tokenizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py index 58cf2a20f..25bd7077c 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py @@ -4,6 +4,7 @@ import io from contextlib import redirect_stdout, redirect_stderr, contextmanager import re +import os from cProfile import Profile from pstats import Stats @@ -123,7 +124,10 @@ def main( with capture_output() as captured: _main(linker_type_str, regex_tokenizer_raw, model_path, data_path) - print(",".join(captured)) + # start with data path + data_folder_name = os.path.basename( + os.path.dirname(data_path)) + print(",".join([data_folder_name] + captured)) else: _main(linker_type_str, regex_tokenizer_raw, model_path, data_path) From 2c8feaf7e5ef9000180c9ea6843223966d13c2d0 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 17:12:55 +0000 Subject: [PATCH 093/111] Add header to output --- .../variance/get_variance_with_linker_and_tokenizer_all.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index 26dc840e0..61f630c8c 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -1,6 +1,8 @@ SCRIPT="scripts/variance/get_variance_with_linker_and_tokenizer.py" 
MODEL_PATH="../.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" -MODEL_PATH="../tests/resources/mct2_model_pack.zip" + +# HEADER +echo "Dataset,linker,tokenizer,prec,rec,f1" #"==COMETA==" DATASET="data/supervised/cometa/mct_export.json" From 59aaeef4f71dfa4829a46ca3af079b221b8e6216 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 27 Nov 2025 17:21:26 +0000 Subject: [PATCH 094/111] Add run time to output --- .../get_variance_with_linker_and_tokenizer.py | 15 ++++++++++----- .../get_variance_with_linker_and_tokenizer_all.sh | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py index 25bd7077c..292a44147 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py @@ -95,6 +95,7 @@ def capture_output(): prec: str | None = None rec: str | None = None f1: str | None = None + time_taken: str | None = None for line in lines: if m := re.match(r"\s+Linker:\s*(.*)", line): linker = m.group(1) @@ -104,14 +105,18 @@ def capture_output(): r"Epoch:\s*0,.*Prec:\s*([\d.]+),\s*" r"Rec:\s*([\d.]+),\s*"r"F1:\s*([\d.]+)", line): prec, rec, f1 = m.groups() - if None not in (linker, tokenizer, prec, rec, f1): + elif m := re.search( + r"Took ([\d.]+)", line): + time_taken = m.group(1) + if None not in (linker, tokenizer, prec, rec, f1, time_taken): # break early if all found break - if None in (linker, tokenizer, prec, rec, f1): + if None in (linker, tokenizer, prec, rec, f1, time_taken): raise ValueError( - "Unable to find linker, tokenizer, precision, recall, or f1. Got " - f"{linker}, {tokenizer}, {prec}, {rec}, {f1}") - out_list.extend([linker, tokenizer, prec, rec, f1]) + "Unable to find linker, tokenizer, precision, recall, f1, " + "or time taken. 
Got " + f"{linker}, {tokenizer}, {prec}, {rec}, {f1} {time_taken}") + out_list.extend([linker, tokenizer, prec, rec, f1, time_taken]) def main( diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index 61f630c8c..96cdede00 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -2,7 +2,7 @@ SCRIPT="scripts/variance/get_variance_with_linker_and_tokenizer.py" MODEL_PATH="../.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" # HEADER -echo "Dataset,linker,tokenizer,prec,rec,f1" +echo "Dataset,linker,tokenizer,prec,rec,f1,runtime" #"==COMETA==" DATASET="data/supervised/cometa/mct_export.json" From b03755609097cb65216813f68ffd4475e94ad71b Mon Sep 17 00:00:00 2001 From: mart-r Date: Sun, 30 Nov 2025 09:06:52 +0000 Subject: [PATCH 095/111] Add 1 time embedding linker conversion script --- .../variance/convert_to_embed_linker.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 medcat-v2/paper/scripts/variance/convert_to_embed_linker.py diff --git a/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py b/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py new file mode 100644 index 000000000..f88873d48 --- /dev/null +++ b/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py @@ -0,0 +1,34 @@ +import os + +from medcat.cat import CAT +from medcat.components.types import CoreComponentType +from medcat.config.config import EmbeddingLinking +from medcat.components.linking.embedding_linker import ( + Linker as ELinker) + + +def convert(cat: CAT): + cmp_cnf = cat.config.components + cmp_cnf.linking = EmbeddingLinking() + # NOTE: should fix on the lib side + cmp_cnf.linking.comp_name = "medcat2_embedding_linker" + # need to recreate and create embeddings + cat._recreate_pipe() + linker: 
ELinker = cat.pipe.get_component(CoreComponentType.linking) + print("Creating embeddings...") + linker.create_embeddings() + # NOTE: returning without another pipe recreation + + +def main(model_pack_path: str, save_path: str): + print("Loading", model_pack_path) + cat = CAT.load_model_pack(model_pack_path) + convert(cat) + print("Saving to", save_path) + saved = cat.save_model_pack(os.path.dirname(save_path), pack_name=os.path.basename(save_path)) + print("Saved to", saved) + + +if __name__ == "__main__": + from sys import argv + main(*argv[1:]) From a5202a27032642a0335bf5c430962b2ae89eac70 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sun, 30 Nov 2025 09:07:03 +0000 Subject: [PATCH 096/111] Some whitespace changes --- medcat-v2/paper/scripts/variance/convert_to_embed_linker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py b/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py index f88873d48..ad8ab256d 100644 --- a/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py +++ b/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py @@ -25,7 +25,8 @@ def main(model_pack_path: str, save_path: str): cat = CAT.load_model_pack(model_pack_path) convert(cat) print("Saving to", save_path) - saved = cat.save_model_pack(os.path.dirname(save_path), pack_name=os.path.basename(save_path)) + saved = cat.save_model_pack(os.path.dirname(save_path), + pack_name=os.path.basename(save_path)) print("Saved to", saved) From ff4c0abd739be590091497937a24141d43386284 Mon Sep 17 00:00:00 2001 From: mart-r Date: Sun, 30 Nov 2025 09:07:41 +0000 Subject: [PATCH 097/111] Make last line of conversion be the model path --- medcat-v2/paper/scripts/variance/convert_to_embed_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py b/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py index ad8ab256d..b852914d7 100644 
--- a/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py +++ b/medcat-v2/paper/scripts/variance/convert_to_embed_linker.py @@ -27,7 +27,7 @@ def main(model_pack_path: str, save_path: str): print("Saving to", save_path) saved = cat.save_model_pack(os.path.dirname(save_path), pack_name=os.path.basename(save_path)) - print("Saved to", saved) + print(f"Saved to\n{saved}") if __name__ == "__main__": From 1d63dae35d3ea35359d9394a372dc063d1c8e0bf Mon Sep 17 00:00:00 2001 From: mart-r Date: Sun, 30 Nov 2025 09:09:59 +0000 Subject: [PATCH 098/111] Convert embedding model once --- .../get_variance_with_linker_and_tokenizer_all.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index 96cdede00..09d4d26dc 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -18,11 +18,13 @@ python $SCRIPT old regex $MODEL_PATH $DATASET $EXTRA python $SCRIPT new regex $MODEL_PATH $DATASET $EXTRA # with embedding linker +# convert embedding model once +EMBED_MODEL_PATH=`python scripts/variance/convert_to_embed_linker.py $MODEL_PATH data/embed_model_converted | tail -n 1` # "With spacy tokenizer + embed lnker" -python $SCRIPT embed spacy $MODEL_PATH $DATASET $EXTRA +python $SCRIPT embed spacy $EMBED_MODEL_PATH $DATASET $EXTRA # "With regex tokenizer + embed linker" -python $SCRIPT embed regex $MODEL_PATH $DATASET $EXTRA +python $SCRIPT embed regex $EMBED_MODEL_PATH $DATASET $EXTRA # other dataset # "==Linking Challenge==" @@ -40,6 +42,6 @@ python $SCRIPT new regex $MODEL_PATH $DATASET $EXTRA # with embedding linker # "With spacy tokenizer + embed lnker" -python $SCRIPT embed spacy $MODEL_PATH $DATASET $EXTRA +python $SCRIPT embed spacy $EMBED_MODEL_PATH $DATASET $EXTRA # "With 
regex tokenizer + embed linker" -python $SCRIPT embed regex $MODEL_PATH $DATASET $EXTRA +python $SCRIPT embed regex $EMBED_MODEL_PATH $DATASET $EXTRA From 040af180527538269396180893c5dc16c4d48e2c Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 18 Dec 2025 09:28:19 +0000 Subject: [PATCH 099/111] Try to redo filtering for embedding linker --- .../components/linking/embedding_linker.py | 186 ++++++++++++++---- 1 file changed, 145 insertions(+), 41 deletions(-) diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py index c13a48fb6..afd475ca6 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-v2/medcat/components/linking/embedding_linker.py @@ -281,6 +281,142 @@ def _get_context_vectors( texts.append(text) return self._embed(texts, self.device) + def _initialize_cui_name_mapping(self) -> None: + """Call this once during initialization to pre-compute CUI->name.""" + self._cui_to_name_mask = {} + + for cui, cui_idx in self._cui_to_idx.items(): + mask = torch.tensor( + [cui_idx in name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs], + dtype=torch.bool, + device=self.device + ) + self._cui_to_name_mask[cui] = mask + + # Cache _has_cuis_all as well + self._has_cuis_all_cached = torch.tensor( + [bool(self.cdb.name2info[name]["per_cui_status"]) + for name in self._name_keys], + device=self.device, + dtype=torch.bool, + ) + + def _get_include_filters_1cui( + self, cui: str, use_precomputed: bool, n: int) -> Tensor: + if use_precomputed and cui in self._cui_to_name_mask: + allowed_mask = self._cui_to_name_mask[cui].clone() + elif cui in self._cui_to_idx: + cui_idx = self._cui_to_idx[cui] + allowed_mask = torch.tensor( + [cui_idx in name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs], + dtype=torch.bool, + device=self.device + ) + else: + allowed_mask = torch.zeros( + n, dtype=torch.bool, device=self.device) + return allowed_mask + + def 
_get_include_filters_multi_cui( + self, include_set: set[str], + use_precomputed: bool, n: int) -> Tensor: + # Multiple CUIs: combine pre-computed masks with OR + if use_precomputed: + valid_cuis = [cui for cui in include_set + if cui in self._cui_to_name_mask] + if valid_cuis: + allowed_mask = self._cui_to_name_mask[ + valid_cuis[0]].clone() + for cui in valid_cuis[1:]: + allowed_mask |= self._cui_to_name_mask[cui] + else: + allowed_mask = torch.zeros( + n, dtype=torch.bool, device=self.device) + else: + include_cui_idxs = { + self._cui_to_idx[cui] for cui in include_set + if cui in self._cui_to_idx + } + if include_cui_idxs: + allowed_mask = torch.tensor( + [any(cui in include_cui_idxs + for cui in name_cui_idxs) + for name_cui_idxs in self._name_to_cui_idxs], + dtype=torch.bool, + device=self.device + ) + else: + allowed_mask = torch.zeros( + n, dtype=torch.bool, device=self.device) + return allowed_mask + + def _get_include_filters(self, include_set: set[str], + use_precomputed: bool, n: int) -> Tensor: + # Fast path for single CUI (very common case) + if len(include_set) == 1: + cui = next(iter(include_set)) + return self._get_include_filters_1cui(cui, use_precomputed, n) + else: + return self._get_include_filters_multi_cui( + include_set, use_precomputed, n) + + def _get_exclude_filters_1cui(self, allowed_mask: Tensor, cui: str, + use_precomputed: bool, n: int) -> Tensor: + if use_precomputed and cui in self._cui_to_name_mask: + allowed_mask &= ~self._cui_to_name_mask[cui] + elif cui in self._cui_to_idx: + cui_idx = self._cui_to_idx[cui] + exclude_mask = torch.tensor( + [cui_idx in name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs], + dtype=torch.bool, + device=self.device + ) + allowed_mask &= ~exclude_mask + return allowed_mask + + def _get_exclude_filters_multi_cui( + self, allowed_mask: Tensor, exclude_set: set[str], + use_precomputed: bool, n: int) -> Tensor: + # Multiple CUIs to exclude + if use_precomputed: + for cui in exclude_set: + if cui 
in self._cui_to_name_mask: + allowed_mask &= ~self._cui_to_name_mask[cui] + else: + exclude_cui_idxs = { + self._cui_to_idx[cui] for cui in exclude_set + if cui in self._cui_to_idx + } + if exclude_cui_idxs: + exclude_mask = torch.tensor( + [any(ci in exclude_cui_idxs + for ci in name_cui_idxs) + for name_cui_idxs in self._name_to_cui_idxs], + dtype=torch.bool, + device=self.device + ) + allowed_mask &= ~exclude_mask + return allowed_mask + + def _get_exclude_filters(self, exclude_set: set[str], + use_precomputed: bool, n: int) -> Tensor: + # Start with all allowed + allowed_mask = torch.ones(n, dtype=torch.bool, device=self.device) + + if not exclude_set: + return allowed_mask + # Fast path for single CUI exclusion + if len(exclude_set) == 1: + cui = next(iter(exclude_set)) + return self._get_exclude_filters_1cui( + allowed_mask, cui, use_precomputed, n) + else: + return self._get_exclude_filters_multi_cui( + allowed_mask, exclude_set, use_precomputed, n) + def _set_filters(self) -> None: include_set = self.cnf_l.filters.cuis exclude_set = self.cnf_l.filters.cuis_exclude @@ -295,54 +431,22 @@ def _set_filters(self) -> None: return n = len(self._name_keys) - allowed_mask = torch.empty(n, dtype=torch.bool, device=self.device) + + # Use pre-computed mappings if available, + # otherwise fall back to on-the-fly computation + use_precomputed = hasattr(self, '_cui_to_name_mask') if include_set: - # if in include set, ignore exclude set. 
- allowed_mask[:] = False - include_cui_idxs = { - self._cui_to_idx[cui] for cui in include_set if cui in self._cui_to_idx - } - include_idxs = [ - name_idx - for name_idx, name_cui_idxs in enumerate(self._name_to_cui_idxs) - if any(cui in include_cui_idxs for cui in name_cui_idxs) - ] - allowed_mask[ - torch.tensor(include_idxs, dtype=torch.long, device=self.device) - ] = True + allowed_mask = self._get_include_filters( + include_set, use_precomputed, n) else: - # only look at exclude if there's no include set - allowed_mask[:] = True - if exclude_set: - exclude_cui_idxs = { - self._cui_to_idx[cui] - for cui in exclude_set - if cui in self._cui_to_idx - } - exclude_idxs = [ - i - for i, name_cui_idxs in enumerate(self._name_to_cui_idxs) - if any(ci in exclude_cui_idxs for ci in name_cui_idxs) - ] - allowed_mask[ - torch.tensor(exclude_idxs, dtype=torch.long, device=self.device) - ] = False + allowed_mask = self._get_exclude_filters( + exclude_set, use_precomputed, n) - # checking if a name has at least 1 cui related to it. 
- _has_cuis_all = torch.tensor( - [ - bool(self.cdb.name2info[name]["per_cui_status"]) - for name in self._name_keys - ], - device=self.device, - dtype=torch.bool, - ) - self._valid_names = _has_cuis_all & allowed_mask + self._valid_names = self._has_cuis_all_cached & allowed_mask self._last_include_set = set(include_set) if include_set is not None else None self._last_exclude_set = set(exclude_set) if exclude_set is not None else None - def _disambiguate_by_cui( self, cui_candidates: list[str], scores: Tensor ) -> tuple[str, float]: From 540caaa3069ce0c81a618d8b74a4782ee3b02386 Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 18 Dec 2025 10:20:45 +0000 Subject: [PATCH 100/111] Try to redo filtering for embedding linker (attempt no 2) --- .../components/linking/embedding_linker.py | 187 +++++++++--------- 1 file changed, 93 insertions(+), 94 deletions(-) diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py index afd475ca6..fa5ff42df 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-v2/medcat/components/linking/embedding_linker.py @@ -85,8 +85,9 @@ def __init__(self, cdb: CDB, config: Config) -> None: ] for name in self._name_keys ] + self._initialize_filter_structures() - def create_embeddings(self, + def create_embeddings(self, embedding_model_name: Optional[str] = None, max_length: Optional[int] = None, ): @@ -302,107 +303,109 @@ def _initialize_cui_name_mapping(self) -> None: dtype=torch.bool, ) - def _get_include_filters_1cui( - self, cui: str, use_precomputed: bool, n: int) -> Tensor: - if use_precomputed and cui in self._cui_to_name_mask: - allowed_mask = self._cui_to_name_mask[cui].clone() - elif cui in self._cui_to_idx: - cui_idx = self._cui_to_idx[cui] - allowed_mask = torch.tensor( - [cui_idx in name_cui_idxs - for name_cui_idxs in self._name_to_cui_idxs], + def _initialize_filter_structures(self) -> None: + """Call once during initialization for 
efficient lookup.""" + # Convert _name_to_cui_idxs to sets for O(1) lookup instead of lists + if not hasattr(self, '_name_to_cui_idxs_sets'): + self._name_to_cui_idxs_sets = [ + set(cui_idxs) for cui_idxs in self._name_to_cui_idxs + ] + + # Cache _has_cuis_all + if not hasattr(self, '_has_cuis_all_cached'): + self._has_cuis_all_cached = torch.tensor( + [bool(self.cdb.name2info[name]["per_cui_status"]) + for name in self._name_keys], + device=self.device, dtype=torch.bool, - device=self.device ) + + def _get_include_filters_1cui( + self, cui: str, n: int) -> Tensor: + if cui not in self._cui_to_idx: + return torch.zeros(n, dtype=torch.bool, device=self.device) + + cui_idx = self._cui_to_idx[cui] + # Use sets for O(1) membership test instead of lists + if hasattr(self, '_name_to_cui_idxs_sets'): + mask_list = [cui_idx in name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs_sets] else: - allowed_mask = torch.zeros( - n, dtype=torch.bool, device=self.device) - return allowed_mask + mask_list = [cui_idx in name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs] + + return torch.tensor(mask_list, dtype=torch.bool, device=self.device) def _get_include_filters_multi_cui( - self, include_set: set[str], - use_precomputed: bool, n: int) -> Tensor: - # Multiple CUIs: combine pre-computed masks with OR - if use_precomputed: - valid_cuis = [cui for cui in include_set - if cui in self._cui_to_name_mask] - if valid_cuis: - allowed_mask = self._cui_to_name_mask[ - valid_cuis[0]].clone() - for cui in valid_cuis[1:]: - allowed_mask |= self._cui_to_name_mask[cui] - else: - allowed_mask = torch.zeros( - n, dtype=torch.bool, device=self.device) + self, include_set: set[str], n: int) -> Tensor: + include_cui_idxs = { + self._cui_to_idx[cui] for cui in include_set + if cui in self._cui_to_idx + } + if not include_cui_idxs: + return torch.zeros(n, dtype=torch.bool, device=self.device) + + # Use sets for faster membership testing + if hasattr(self, '_name_to_cui_idxs_sets'): + # 
With sets, intersection is much faster + mask_list = [bool(name_cui_idxs & include_cui_idxs) + for name_cui_idxs in self._name_to_cui_idxs_sets] else: - include_cui_idxs = { - self._cui_to_idx[cui] for cui in include_set - if cui in self._cui_to_idx - } - if include_cui_idxs: - allowed_mask = torch.tensor( - [any(cui in include_cui_idxs - for cui in name_cui_idxs) - for name_cui_idxs in self._name_to_cui_idxs], - dtype=torch.bool, - device=self.device - ) - else: - allowed_mask = torch.zeros( - n, dtype=torch.bool, device=self.device) - return allowed_mask + mask_list = [any(cui in include_cui_idxs for cui in name_cui_idxs) + for name_cui_idxs in self._name_to_cui_idxs] + + return torch.tensor(mask_list, dtype=torch.bool, device=self.device) - def _get_include_filters(self, include_set: set[str], - use_precomputed: bool, n: int) -> Tensor: + def _get_include_filters( + self, include_set: set[str], n: int) -> Tensor: # Fast path for single CUI (very common case) if len(include_set) == 1: cui = next(iter(include_set)) - return self._get_include_filters_1cui(cui, use_precomputed, n) + return self._get_include_filters_1cui(cui, n) else: return self._get_include_filters_multi_cui( - include_set, use_precomputed, n) - - def _get_exclude_filters_1cui(self, allowed_mask: Tensor, cui: str, - use_precomputed: bool, n: int) -> Tensor: - if use_precomputed and cui in self._cui_to_name_mask: - allowed_mask &= ~self._cui_to_name_mask[cui] - elif cui in self._cui_to_idx: - cui_idx = self._cui_to_idx[cui] - exclude_mask = torch.tensor( - [cui_idx in name_cui_idxs - for name_cui_idxs in self._name_to_cui_idxs], - dtype=torch.bool, - device=self.device - ) - allowed_mask &= ~exclude_mask - return allowed_mask + include_set, n) + + def _get_exclude_filters_1cui( + self, allowed_mask: Tensor, cui: str) -> Tensor: + if cui not in self._cui_to_idx: + return allowed_mask + + cui_idx = self._cui_to_idx[cui] + if hasattr(self, '_name_to_cui_idxs_sets'): + exclude_list = [cui_idx in 
name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs_sets] + else: + exclude_list = [cui_idx in name_cui_idxs + for name_cui_idxs in self._name_to_cui_idxs] + + exclude_mask = torch.tensor( + exclude_list, dtype=torch.bool, device=self.device) + return allowed_mask & ~exclude_mask def _get_exclude_filters_multi_cui( - self, allowed_mask: Tensor, exclude_set: set[str], - use_precomputed: bool, n: int) -> Tensor: - # Multiple CUIs to exclude - if use_precomputed: - for cui in exclude_set: - if cui in self._cui_to_name_mask: - allowed_mask &= ~self._cui_to_name_mask[cui] + self, allowed_mask: Tensor, exclude_set: set[str]) -> Tensor: + exclude_cui_idxs = { + self._cui_to_idx[cui] for cui in exclude_set + if cui in self._cui_to_idx + } + if not exclude_cui_idxs: + return allowed_mask + + # Use sets for faster membership testing + if hasattr(self, '_name_to_cui_idxs_sets'): + exclude_list = [bool(name_cui_idxs & exclude_cui_idxs) + for name_cui_idxs in self._name_to_cui_idxs_sets] else: - exclude_cui_idxs = { - self._cui_to_idx[cui] for cui in exclude_set - if cui in self._cui_to_idx - } - if exclude_cui_idxs: - exclude_mask = torch.tensor( - [any(ci in exclude_cui_idxs - for ci in name_cui_idxs) - for name_cui_idxs in self._name_to_cui_idxs], - dtype=torch.bool, - device=self.device - ) - allowed_mask &= ~exclude_mask - return allowed_mask + exclude_list = [any(ci in exclude_cui_idxs for ci in name_cui_idxs) + for name_cui_idxs in self._name_to_cui_idxs] + + exclude_mask = torch.tensor( + exclude_list, dtype=torch.bool, device=self.device) + return allowed_mask & ~exclude_mask - def _get_exclude_filters(self, exclude_set: set[str], - use_precomputed: bool, n: int) -> Tensor: + def _get_exclude_filters( + self, exclude_set: set[str], n: int) -> Tensor: # Start with all allowed allowed_mask = torch.ones(n, dtype=torch.bool, device=self.device) @@ -412,10 +415,10 @@ def _get_exclude_filters(self, exclude_set: set[str], if len(exclude_set) == 1: cui = 
next(iter(exclude_set)) return self._get_exclude_filters_1cui( - allowed_mask, cui, use_precomputed, n) + allowed_mask, cui) else: return self._get_exclude_filters_multi_cui( - allowed_mask, exclude_set, use_precomputed, n) + allowed_mask, exclude_set) def _set_filters(self) -> None: include_set = self.cnf_l.filters.cuis @@ -432,16 +435,12 @@ def _set_filters(self) -> None: n = len(self._name_keys) - # Use pre-computed mappings if available, - # otherwise fall back to on-the-fly computation - use_precomputed = hasattr(self, '_cui_to_name_mask') - if include_set: allowed_mask = self._get_include_filters( - include_set, use_precomputed, n) + include_set, n) else: allowed_mask = self._get_exclude_filters( - exclude_set, use_precomputed, n) + exclude_set, n) self._valid_names = self._has_cuis_all_cached & allowed_mask self._last_include_set = set(include_set) if include_set is not None else None From d336bfeda079d245caacb4d23768e4c78f88c9ce Mon Sep 17 00:00:00 2001 From: mart-r Date: Thu, 18 Dec 2025 13:10:38 +0000 Subject: [PATCH 101/111] Try to redo filtering for embedding linker (attempt no 3) --- .../components/linking/embedding_linker.py | 135 +++++++++++------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py index fa5ff42df..9ceab3953 100644 --- a/medcat-v2/medcat/components/linking/embedding_linker.py +++ b/medcat-v2/medcat/components/linking/embedding_linker.py @@ -11,6 +11,7 @@ from collections import defaultdict import logging import math +import numpy as np from medcat.utils.import_utils import ensure_optional_extras_installed import medcat @@ -304,12 +305,22 @@ def _initialize_cui_name_mapping(self) -> None: ) def _initialize_filter_structures(self) -> None: - """Call once during initialization for efficient lookup.""" - # Convert _name_to_cui_idxs to sets for O(1) lookup instead of lists - if not hasattr(self, 
'_name_to_cui_idxs_sets'): - self._name_to_cui_idxs_sets = [ - set(cui_idxs) for cui_idxs in self._name_to_cui_idxs - ] + """Call once during initialization to create efficient lookup structures.""" + # Build an inverted index: cui_idx -> list of name indices that contain it + # This is the KEY optimization - we flip the lookup direction + if not hasattr(self, '_cui_idx_to_name_idxs'): + cui2name_indices: defaultdict[ + int, list[int]] = defaultdict(list) + + for name_idx, cui_idxs in enumerate(self._name_to_cui_idxs): + for cui_idx in cui_idxs: + cui2name_indices[cui_idx].append(name_idx) + + # Convert lists to numpy arrays for faster indexing + self._cui_idx_to_name_idxs = { + cui_idx: np.array(name_idxs, dtype=np.int32) + for cui_idx, name_idxs in cui2name_indices.items() + } # Cache _has_cuis_all if not hasattr(self, '_has_cuis_all_cached'): @@ -321,44 +332,57 @@ def _initialize_filter_structures(self) -> None: ) def _get_include_filters_1cui( - self, cui: str, n: int) -> Tensor: + self, cui: str, n: int) -> torch.Tensor: + """Optimized single CUI include filter using inverted index.""" if cui not in self._cui_to_idx: return torch.zeros(n, dtype=torch.bool, device=self.device) cui_idx = self._cui_to_idx[cui] - # Use sets for O(1) membership test instead of lists - if hasattr(self, '_name_to_cui_idxs_sets'): - mask_list = [cui_idx in name_cui_idxs - for name_cui_idxs in self._name_to_cui_idxs_sets] - else: - mask_list = [cui_idx in name_cui_idxs - for name_cui_idxs in self._name_to_cui_idxs] - return torch.tensor(mask_list, dtype=torch.bool, device=self.device) + # Use inverted index: get all name indices that contain this CUI + if cui_idx in self._cui_idx_to_name_idxs: + name_indices = self._cui_idx_to_name_idxs[cui_idx] + + # Create mask by setting specific indices to True + allowed_mask = torch.zeros(n, dtype=torch.bool, device=self.device) + allowed_mask[torch.from_numpy(name_indices).to(self.device)] = True + return allowed_mask + else: + return 
torch.zeros(n, dtype=torch.bool, device=self.device) def _get_include_filters_multi_cui( - self, include_set: set[str], n: int) -> Tensor: - include_cui_idxs = { + self, include_set: Set[str], n: int) -> torch.Tensor: + """Optimized multi-CUI include filter using inverted index.""" + include_cui_idxs = [ self._cui_to_idx[cui] for cui in include_set if cui in self._cui_to_idx - } + ] + if not include_cui_idxs: return torch.zeros(n, dtype=torch.bool, device=self.device) - # Use sets for faster membership testing - if hasattr(self, '_name_to_cui_idxs_sets'): - # With sets, intersection is much faster - mask_list = [bool(name_cui_idxs & include_cui_idxs) - for name_cui_idxs in self._name_to_cui_idxs_sets] - else: - mask_list = [any(cui in include_cui_idxs for cui in name_cui_idxs) - for name_cui_idxs in self._name_to_cui_idxs] + # Collect all name indices from inverted index + all_name_indices_list: list[np.ndarray] = [] + for cui_idx in include_cui_idxs: + if cui_idx in self._cui_idx_to_name_idxs: + all_name_indices_list.append( + self._cui_idx_to_name_idxs[cui_idx]) - return torch.tensor(mask_list, dtype=torch.bool, device=self.device) + if not all_name_indices_list: + return torch.zeros(n, dtype=torch.bool, device=self.device) + + # Concatenate and get unique indices + all_name_indices = np.unique( + np.concatenate(all_name_indices_list)) + + # Create mask + allowed_mask = torch.zeros(n, dtype=torch.bool, device=self.device) + allowed_mask[torch.from_numpy(all_name_indices).to(self.device)] = True + return allowed_mask def _get_include_filters( - self, include_set: set[str], n: int) -> Tensor: - # Fast path for single CUI (very common case) + self, include_set: Set[str], n: int) -> torch.Tensor: + """Route to appropriate include filter method.""" if len(include_set) == 1: cui = next(iter(include_set)) return self._get_include_filters_1cui(cui, n) @@ -367,51 +391,54 @@ def _get_include_filters( include_set, n) def _get_exclude_filters_1cui( - self, allowed_mask: 
Tensor, cui: str) -> Tensor: + self, allowed_mask: torch.Tensor, cui: str) -> torch.Tensor: + """Optimized single CUI exclude filter using inverted index.""" if cui not in self._cui_to_idx: return allowed_mask cui_idx = self._cui_to_idx[cui] - if hasattr(self, '_name_to_cui_idxs_sets'): - exclude_list = [cui_idx in name_cui_idxs - for name_cui_idxs in self._name_to_cui_idxs_sets] - else: - exclude_list = [cui_idx in name_cui_idxs - for name_cui_idxs in self._name_to_cui_idxs] - exclude_mask = torch.tensor( - exclude_list, dtype=torch.bool, device=self.device) - return allowed_mask & ~exclude_mask + if cui_idx in self._cui_idx_to_name_idxs: + name_indices = self._cui_idx_to_name_idxs[cui_idx] + # Set specific indices to False + allowed_mask[ + torch.from_numpy(name_indices).to(self.device)] = False + + return allowed_mask def _get_exclude_filters_multi_cui( - self, allowed_mask: Tensor, exclude_set: set[str]) -> Tensor: - exclude_cui_idxs = { + self, allowed_mask: torch.Tensor, exclude_set: Set[str], + ) -> torch.Tensor: + """Optimized multi-CUI exclude filter using inverted index.""" + exclude_cui_idxs = [ self._cui_to_idx[cui] for cui in exclude_set if cui in self._cui_to_idx - } + ] + if not exclude_cui_idxs: return allowed_mask - # Use sets for faster membership testing - if hasattr(self, '_name_to_cui_idxs_sets'): - exclude_list = [bool(name_cui_idxs & exclude_cui_idxs) - for name_cui_idxs in self._name_to_cui_idxs_sets] - else: - exclude_list = [any(ci in exclude_cui_idxs for ci in name_cui_idxs) - for name_cui_idxs in self._name_to_cui_idxs] + # Collect all name indices to exclude + all_name_indices: list[np.ndarray] = [] + for cui_idx in exclude_cui_idxs: + if cui_idx in self._cui_idx_to_name_idxs: + all_name_indices.append(self._cui_idx_to_name_idxs[cui_idx]) - exclude_mask = torch.tensor( - exclude_list, dtype=torch.bool, device=self.device) - return allowed_mask & ~exclude_mask + if all_name_indices: + all_name_indices = 
np.unique(np.concatenate(all_name_indices)) + allowed_mask[torch.from_numpy(all_name_indices).to(self.device)] = False + + return allowed_mask def _get_exclude_filters( - self, exclude_set: set[str], n: int) -> Tensor: + self, exclude_set: Set[str], n: int) -> torch.Tensor: + """Route to appropriate exclude filter method.""" # Start with all allowed allowed_mask = torch.ones(n, dtype=torch.bool, device=self.device) if not exclude_set: return allowed_mask - # Fast path for single CUI exclusion + if len(exclude_set) == 1: cui = next(iter(exclude_set)) return self._get_exclude_filters_1cui( From c8d80ad9c7602dd08201a88ef8c2ee649624405e Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 11 Feb 2026 11:20:44 +0000 Subject: [PATCH 102/111] Add a throughput script --- .../performance/get_entity_throughput.py | 49 +++++++++++++++++++ .../performance/get_entity_throughput_all.sh | 22 +++++++++ 2 files changed, 71 insertions(+) create mode 100644 medcat-v2/paper/scripts/performance/get_entity_throughput.py create mode 100644 medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh diff --git a/medcat-v2/paper/scripts/performance/get_entity_throughput.py b/medcat-v2/paper/scripts/performance/get_entity_throughput.py new file mode 100644 index 000000000..34b6c1df3 --- /dev/null +++ b/medcat-v2/paper/scripts/performance/get_entity_throughput.py @@ -0,0 +1,49 @@ +from typing import Iterator +import time + +import pandas as pd + +from medcat.cat import CAT + + +def get_num_ents_and_time(cat: CAT, text: str) -> tuple[int, float]: + start_time = time.perf_counter() + ents = cat.get_entities(text)["entities"] + spent_time = time.perf_counter() - start_time + return len(ents), spent_time + + +def load_data_stream(data_path: str) -> Iterator[str]: + if data_path.endswith(".csv"): + df = pd.read_csv(data_path) + return df['text'].to_list() + else: + raise ValueError(f"Unknown data file type: {data_path}") + + +def get_average_throughput(model_path: str, data_path: str, + 
suppress_output: bool = False): + if not suppress_output: + print("Loading model", model_path) + cat = CAT.load_model_pack(model_path) + if not suppress_output: + print("Loading data") + data = load_data_stream(data_path) + if not suppress_output: + print("Running inference") + throughput = [get_num_ents_and_time(cat, text) for text in data] + total_time = sum([time for _, time in throughput]) + total_ents = sum([ents for ents, _ in throughput]) + mean_throughput = total_ents / total_time + return mean_throughput + + +def main(model_path: str, data_path: str, suppress_output: str = ''): + throughput = get_average_throughput( + model_path, data_path, bool(suppress_output)) + print(throughput) + + +if __name__ == "__main__": + import sys + main(*sys.argv[1:]) diff --git a/medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh b/medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh new file mode 100644 index 000000000..74120613c --- /dev/null +++ b/medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh @@ -0,0 +1,22 @@ +script_path="scripts/performance/get_entity_throughput.py" +data_path="/Users/martratas/Documents/CogStack/MedCAT/MedCATv2_new/.temp/2025_03_24_training_speed/unsup_data/mimic_iv_discharge_head20.csv" +v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +# v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/medcat-snomed-model-creation/.creation_cache/out_snomed_2025/final_model_Snomed2025-07-11_de7cbec4a786e418.zip" +# v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/2025_11_19_issue228_add_meta_cat_to_other/models/v2_Snomed2025_MIMIC_IV_bbe806e192df009f.zip" + +echo "*****************" +echo "running v1 stuff" +echo "*****************" + +source 
.venv_v1/bin/activate + +python $script_path $v1_model_pack $data_path suppress + +echo "*****************" +echo "running v2 stuff" +echo "*****************" + +source ../.venv312/bin/activate + +python $script_path $v2_model_pack $data_path suppress From 56c6cfb80321196cfa0b5b7b49fc843f059ec1b9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 11 Feb 2026 15:18:12 +0000 Subject: [PATCH 103/111] Add throughput to variance calculations --- .../get_variance_with_linker_and_tokenizer.py | 21 ++++++++++++------- ..._variance_with_linker_and_tokenizer_all.sh | 2 +- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py index 292a44147..131e7b57a 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer.py @@ -96,6 +96,7 @@ def capture_output(): rec: str | None = None f1: str | None = None time_taken: str | None = None + ent_throughput: str | None = None for line in lines: if m := re.match(r"\s+Linker:\s*(.*)", line): linker = m.group(1) @@ -108,15 +109,18 @@ def capture_output(): elif m := re.search( r"Took ([\d.]+)", line): time_taken = m.group(1) - if None not in (linker, tokenizer, prec, rec, f1, time_taken): + elif m:= re.search( + r"Throughput rate (\d+\.\d+)", line): + ent_throughput = m.group(1) + if None not in (linker, tokenizer, prec, rec, f1, time_taken, ent_throughput): # break early if all found break - if None in (linker, tokenizer, prec, rec, f1, time_taken): + if None in (linker, tokenizer, prec, rec, f1, time_taken, ent_throughput): raise ValueError( - "Unable to find linker, tokenizer, precision, recall, f1, " + "Unable to find linker, tokenizer, precision, recall, f1, ent_throughput" "or time taken. 
Got " - f"{linker}, {tokenizer}, {prec}, {rec}, {f1} {time_taken}") - out_list.extend([linker, tokenizer, prec, rec, f1, time_taken]) + f"{linker}, {tokenizer}, {prec}, {rec}, {f1}, {time_taken}, {ent_throughput}") + out_list.extend([linker, tokenizer, prec, rec, f1, time_taken, ent_throughput]) def main( @@ -177,11 +181,14 @@ def _main( print("PROFILING") profile = Profile() profile.enable() - get_stats(cat, data, use_project_filters=True) + fps, _, tps, *_ = get_stats(cat, data, use_project_filters=True) if DO_PROFILING: profile.disable() end = time.perf_counter() - print("Took", end - start) + time_taken = end - start + print("Took", time_taken) + ents_found = sum(fps.values()) + sum(tps.values()) + print("Throughput rate", ents_found / time_taken) if DO_PROFILING: print("Profile stats (CUMtime)") stats = Stats(profile) diff --git a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh index 09d4d26dc..d5a106bb6 100644 --- a/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh +++ b/medcat-v2/paper/scripts/variance/get_variance_with_linker_and_tokenizer_all.sh @@ -2,7 +2,7 @@ SCRIPT="scripts/variance/get_variance_with_linker_and_tokenizer.py" MODEL_PATH="../.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" # HEADER -echo "Dataset,linker,tokenizer,prec,rec,f1,runtime" +echo "Dataset,linker,tokenizer,prec,rec,f1,runtime,throughput" #"==COMETA==" DATASET="data/supervised/cometa/mct_export.json" From 93daa30d67a0793bcaf46f3cc1cbef4832843cd5 Mon Sep 17 00:00:00 2001 From: mart-r Date: Wed, 11 Feb 2026 15:18:22 +0000 Subject: [PATCH 104/111] Revert "Add a throughput script" This reverts commit c8d80ad9c7602dd08201a88ef8c2ee649624405e. 
--- .../performance/get_entity_throughput.py | 49 ------------------- .../performance/get_entity_throughput_all.sh | 22 --------- 2 files changed, 71 deletions(-) delete mode 100644 medcat-v2/paper/scripts/performance/get_entity_throughput.py delete mode 100644 medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh diff --git a/medcat-v2/paper/scripts/performance/get_entity_throughput.py b/medcat-v2/paper/scripts/performance/get_entity_throughput.py deleted file mode 100644 index 34b6c1df3..000000000 --- a/medcat-v2/paper/scripts/performance/get_entity_throughput.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Iterator -import time - -import pandas as pd - -from medcat.cat import CAT - - -def get_num_ents_and_time(cat: CAT, text: str) -> tuple[int, float]: - start_time = time.perf_counter() - ents = cat.get_entities(text)["entities"] - spent_time = time.perf_counter() - start_time - return len(ents), spent_time - - -def load_data_stream(data_path: str) -> Iterator[str]: - if data_path.endswith(".csv"): - df = pd.read_csv(data_path) - return df['text'].to_list() - else: - raise ValueError(f"Unknown data file type: {data_path}") - - -def get_average_throughput(model_path: str, data_path: str, - suppress_output: bool = False): - if not suppress_output: - print("Loading model", model_path) - cat = CAT.load_model_pack(model_path) - if not suppress_output: - print("Loading data") - data = load_data_stream(data_path) - if not suppress_output: - print("Running inference") - throughput = [get_num_ents_and_time(cat, text) for text in data] - total_time = sum([time for _, time in throughput]) - total_ents = sum([ents for ents, _ in throughput]) - mean_throughput = total_ents / total_time - return mean_throughput - - -def main(model_path: str, data_path: str, suppress_output: str = ''): - throughput = get_average_throughput( - model_path, data_path, bool(suppress_output)) - print(throughput) - - -if __name__ == "__main__": - import sys - main(*sys.argv[1:]) 
diff --git a/medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh b/medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh deleted file mode 100644 index 74120613c..000000000 --- a/medcat-v2/paper/scripts/performance/get_entity_throughput_all.sh +++ /dev/null @@ -1,22 +0,0 @@ -script_path="scripts/performance/get_entity_throughput.py" -data_path="/Users/martratas/Documents/CogStack/MedCAT/MedCATv2_new/.temp/2025_03_24_training_speed/unsup_data/mimic_iv_discharge_head20.csv" -v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" -v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" -# v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/medcat-snomed-model-creation/.creation_cache/out_snomed_2025/final_model_Snomed2025-07-11_de7cbec4a786e418.zip" -# v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/2025_11_19_issue228_add_meta_cat_to_other/models/v2_Snomed2025_MIMIC_IV_bbe806e192df009f.zip" - -echo "*****************" -echo "running v1 stuff" -echo "*****************" - -source .venv_v1/bin/activate - -python $script_path $v1_model_pack $data_path suppress - -echo "*****************" -echo "running v2 stuff" -echo "*****************" - -source ../.venv312/bin/activate - -python $script_path $v2_model_pack $data_path suppress From 7663949cc9fa33d37959469fa372c4f3537f36a0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 10 Apr 2026 10:09:09 +0100 Subject: [PATCH 105/111] Update model paths --- .../paper/scripts/speed/get_inference_speed_for_multiple_v2.sh | 2 +- medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh | 2 +- .../scripts/speed/get_unsup_train_speed_for_multiple_v2.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh 
b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh index 28d80a053..62b21bee9 100644 --- a/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_inference_speed_for_multiple_v2.sh @@ -1,7 +1,7 @@ ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" ner2="2023_NER_w_MetaCAT" -ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/20230227__kch_gstt_trained_model_f76d2121b77c3e9a.zip" csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" out_prefix="out/inference_speed/v2" diff --git a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh index a1dfa8c6b..e1e22a857 100644 --- a/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_load_speed_for_multiple_v2.sh @@ -1,7 +1,7 @@ ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" ner2="2023_NER_w_MetaCAT" -ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/20230227__kch_gstt_trained_model_f76d2121b77c3e9a.zip" deid="n2c2_DeID" deid_model_path="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_deid_model_af31d2a9c5ccbe4d.zip.zip" diff --git a/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh index 30db39dc2..0eb6e083b 100644 
--- a/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh +++ b/medcat-v2/paper/scripts/speed/get_unsup_train_speed_for_multiple_v2.sh @@ -1,7 +1,7 @@ ner1="2023_NER_no_MetaCAT" ner_model_path_no_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" ner2="2023_NER_w_MetaCAT" -ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_7ff751a4bb71630d.zip" +ner_model_path_w_mc="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/20230227__kch_gstt_trained_model_f76d2121b77c3e9a.zip" csv_path="data/unsupervised/mimic_iv_discharge_head20.csv" out_prefix="out/unsup_train_speed/v2" From fcfd7254641f78643cc348abed9255088c19b423 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 10 Apr 2026 10:09:59 +0100 Subject: [PATCH 106/111] Run performance against 2023 models again --- .../paper/scripts/performance/get_performance_all.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/medcat-v2/paper/scripts/performance/get_performance_all.sh b/medcat-v2/paper/scripts/performance/get_performance_all.sh index df99010dc..7ac6d4408 100644 --- a/medcat-v2/paper/scripts/performance/get_performance_all.sh +++ b/medcat-v2/paper/scripts/performance/get_performance_all.sh @@ -1,10 +1,10 @@ a script_path="scripts/performance/get_performance_for_model_and_ds.py" -# v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" -# v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" -v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/medcat-snomed-model-creation/.creation_cache/out_snomed_2025/final_model_Snomed2025-07-11_de7cbec4a786e418.zip" 
-v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/2025_11_19_issue228_add_meta_cat_to_other/models/v2_Snomed2025_MIMIC_IV_bbe806e192df009f.zip" +v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/MedCAT/models/20230227__kch_gstt_trained_model_no_mc_d84c313f24311484.zip" +v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/CONVERT_2023_model_no_mc_234dda1597f635e3.zip" +# v1_model_pack="/Users/martratas/Documents/CogStack/MedCAT/medcat-snomed-model-creation/.creation_cache/out_snomed_2025/final_model_Snomed2025-07-11_de7cbec4a786e418.zip" +# v2_model_pack="/Users/martratas/Documents/CogStack/MedCAT/monorepo-nlp/medcat-v2/.temp/2025_11_19_issue228_add_meta_cat_to_other/models/v2_Snomed2025_MIMIC_IV_bbe806e192df009f.zip" echo "*****************" echo "running v1 stuff" From ae74fcec8b3b4313aa356e0ca9dd7035ac710a8a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 10 Apr 2026 10:11:22 +0100 Subject: [PATCH 107/111] Add a script to run everything at once --- medcat-v2/paper/scripts/run_all_at_once.sh | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 medcat-v2/paper/scripts/run_all_at_once.sh diff --git a/medcat-v2/paper/scripts/run_all_at_once.sh b/medcat-v2/paper/scripts/run_all_at_once.sh new file mode 100644 index 000000000..63f6c3269 --- /dev/null +++ b/medcat-v2/paper/scripts/run_all_at_once.sh @@ -0,0 +1,23 @@ +echo "*****************" +echo "Performance" +echo "*****************" + +bash scripts/performance/get_performance_all.sh + +echo "*****************" +echo "Regression" +echo "*****************" + +bash scripts/performance/get_regression_all.sh + +echo "*****************" +echo "Speed" +echo "*****************" + +bash scripts/speed/run_all_speed_scripts.sh + +echo "*****************" +echo "Variance" +echo "*****************" + +bash scripts/variance/get_variance_with_linker_and_tokenizer_all.sh From 
1db6f6a5349c8d887c0b5ce21a1a48ed9cc80d1a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Apr 2026 14:57:57 +0100 Subject: [PATCH 108/111] CU-869cw9zmj: Use faster way to calculate unit vector --- medcat-v2/medcat/utils/matutils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/medcat-v2/medcat/utils/matutils.py b/medcat-v2/medcat/utils/matutils.py index 1fcce1629..ac05ee2d4 100644 --- a/medcat-v2/medcat/utils/matutils.py +++ b/medcat-v2/medcat/utils/matutils.py @@ -15,7 +15,8 @@ def unitvec(vec: np.ndarray) -> np.ndarray: Returns: np.ndarray: The new unit vector. """ - return vec / np.linalg.norm(vec) + vec /= np.sqrt(vec @ vec) + return vec @overload From 887a180ac910a508f9467aa60b2b6048ca2f22ec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Apr 2026 15:14:03 +0100 Subject: [PATCH 109/111] CU-869cw9zmj: Speed up context vector obtaining --- .../linking/vector_context_model.py | 63 +++++++++++++------ 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/medcat-v2/medcat/components/linking/vector_context_model.py b/medcat-v2/medcat/components/linking/vector_context_model.py index 9815c171e..acf9f18f3 100644 --- a/medcat-v2/medcat/components/linking/vector_context_model.py +++ b/medcat-v2/medcat/components/linking/vector_context_model.py @@ -91,9 +91,10 @@ def get_context_tokens(self, entity: MutableEntity, doc: MutableDocument, return tokens_left, tokens_center, tokens_right - def _tokens2vecs(self, tokens: Sequence[Union[MutableToken, str]] - ) -> Iterable[np.ndarray]: - for step, tkn in enumerate(tokens): + def _tokens2vecs(self, tokens: Sequence[Union[MutableToken, str]], + step_start: int = 0 + ) -> Iterable[np.ndarray]: + for step, tkn in enumerate(tokens, start=step_start): lower = tkn.lower() if isinstance(tkn, str) else tkn.base.lower if lower not in self.vocab: continue @@ -137,26 +138,52 @@ def get_context_vectors(self, entity: MutableEntity, """ vectors: dict[str, np.ndarray] = {} - 
context_vector_sizes = self.config.context_vector_sizes - for context_type, window_size in context_vector_sizes.items(): - tokens_left, tokens_center, tokens_right = self.get_context_tokens( - entity, doc, window_size, per_doc_valid_token_cache) + # Sort ascending so each iteration is a superset of the previous + sorted_contexts = sorted( + self.config.context_vector_sizes.items(), key=lambda x: x[1]) - values: list[np.ndarray] = [] - # Add left - values.extend(self._tokens2vecs(tokens_left)) + prev_left: list[MutableToken] = [] + prev_right: list[MutableToken] = [] + # Accumulated weighted vecs from previous (smaller) windows, + # excluding center (center is the same for all window sizes) + prev_left_vecs: list[np.ndarray] = [] + prev_right_vecs: list[np.ndarray] = [] + center_vecs: Optional[list[np.ndarray]] = None # same for all windows - if not self.config.context_ignore_center_tokens: - # Add center - values.extend( - self._preprocess_center_tokens(cui, tokens_center)) + for context_type, window_size in sorted_contexts: + tokens_left, tokens_center, tokens_right = self.get_context_tokens( + entity, doc, window_size, per_doc_valid_token_cache) - # Add right - values.extend(self._tokens2vecs(tokens_right)) + # New outer tokens only — the inner ones were already processed + new_left = tokens_left[:len(tokens_left) - len(prev_left)] + new_right = tokens_right[len(prev_right):] + + # step_start for new left tokens: they are further from centre + # so their step index is + # len(tokens_left) - len(new_left) ... len(tokens_left)-1 + # i.e. 
the new tokens are the outermost, highest-step ones + new_left_vecs = list(self._tokens2vecs( + new_left, step_start=len(prev_left))) + new_right_vecs = list(self._tokens2vecs( + new_right, step_start=len(prev_right))) + + prev_left_vecs = new_left_vecs + prev_left_vecs + prev_right_vecs = prev_right_vecs + new_right_vecs + prev_left = tokens_left + prev_right = tokens_right + + # Center is identical for all window sizes, only compute once + if center_vecs is None: + if not self.config.context_ignore_center_tokens: + center_vecs = list( + self._preprocess_center_tokens(cui, tokens_center)) + else: + center_vecs = [] + values = prev_left_vecs + center_vecs + prev_right_vecs if values: - value = np.average(values, axis=0) - vectors[context_type] = value + vectors[context_type] = np.average(values, axis=0) + return vectors def similarity(self, cui: str, entity: MutableEntity, doc: MutableDocument, From 797229257bc62aace76c43ea7a12221c67b66d08 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 3 Jun 2026 18:43:39 +0100 Subject: [PATCH 110/111] Add variance plotting script --- .../paper/scripts/variance/plot_variance.py | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 medcat-v2/paper/scripts/variance/plot_variance.py diff --git a/medcat-v2/paper/scripts/variance/plot_variance.py b/medcat-v2/paper/scripts/variance/plot_variance.py new file mode 100644 index 000000000..7d8e2cbc4 --- /dev/null +++ b/medcat-v2/paper/scripts/variance/plot_variance.py @@ -0,0 +1,146 @@ +import matplotlib.pyplot as plt +from matplotlib.lines import Line2D +# import numpy as np +import pandas as pd +import seaborn as sns + + +data_lc = { + 'Config': [ + 'Spacy / Vector Context Linker', 'Spacy / Faster Linker', 'Spacy / Embedding Linker', + 'Regex / Vector Context Linker', 'Regex / Faster Linker', 'Regex / Embedding Linker',], + 'Runtime': [68.16, 51.64, 321.37, 30.54, 6.21, 348.79], + 'F1': [0.6072, 0.5804, 0.5932, 0.5693, 0.5681, 0.5852] +} 
+data_cometa = { + 'Config': [ + 'Spacy / Vector Context Linker', 'Spacy / Faster Linker', 'Spacy / Embedding Linker', + 'Regex / Vector Context Linker', 'Regex / Faster Linker', 'Regex / Embedding Linker',], + 'Runtime': [75.40, 48.05, 511.19, 117.55, 82.61, 248.02], + 'F1': [0.4112, 0.3871, 0.4215, 0.3722, 0.3664, 0.3847] +} + + +def draw(is_lc: bool): +# 1. Set style for high-visibility poster presentation + sns.set_theme(style="whitegrid") + plt.rcParams.update({ + 'font.size': 14, + 'axes.labelsize': 16, + 'axes.titlesize': 18, + 'xtick.labelsize': 14, + 'ytick.labelsize': 14, + 'legend.fontsize': 10, + 'figure.titlesize': 20 + }) + if is_lc: + data = data_lc + else: + data = data_cometa + df = pd.DataFrame(data) + + # Sort by runtime to properly compute the trade-off frontier + df = df.sort_values(by='Runtime').reset_index(drop=True) + + # Identify different types + to_marker_map = { + "Spacy": "*", + "Regex": 'o', + } + df['marker'] = '' + for part, marker in to_marker_map.items(): + df.loc[df['Config'].str.contains(part), 'marker'] = marker + to_colour_map = { + "Vector Context Linker": "blue", + "Faster Linker": "red", + "Embedding Linker": "green", + } + df['colour'] = '' + for part, colour in to_colour_map.items(): + df.loc[df['Config'].str.contains(part), 'colour'] = colour + + fig, ax = plt.subplots(figsize=(7, 4)) + + for marker_type in set(to_marker_map.values()): + for cur_colour in set(to_colour_map.values()): + cur_df = df[(df['colour'] == cur_colour) & (df['marker'] == marker_type)] + ax.scatter(cur_df['Runtime'], cur_df['F1'], + marker=marker_type, + color=cur_colour, s=120, alpha=0.6, + edgecolor='k', zorder=3) + + for i, row in df.iterrows(): + # Subtle offsets to prevent text overlaying directly on top of the dots + xytext = (12, -5) + if is_lc: + if "Regex" in row['Config'] and "Faster Linker" in row['Config']: + xytext = (12, -15) + if "Embedding" in row['Config']: + xytext = (-150, -5) + if "Regex" in row['Config']: + xytext = (-140, -15) 
+ + ax.annotate( + row['Config'], + xy=(row['Runtime'], row['F1']), + xytext=xytext, + textcoords='offset points', + fontsize=12, + color='black', + ) + + # 6. Formatting Axis and Labels + ax.set_xlabel('Runtime (seconds)', labelpad=10, weight='bold') + ax.set_ylabel('$F_1$ Score', labelpad=10, weight='bold') + ax.set_title( + f'Speed vs performance for {"Linking Challenge" if is_lc else "COMETA"}', + pad=15, weight='bold') + + # Adjust limits appropriately to give padding for text labels + ax.set_ylim(df['F1'].min() - 0.005, df['F1'].max() + 0.005) + ax.set_xlim(0, df['Runtime'].max() + 50) + + # Build custom handles for the Marker legend + marker_handles = [ + Line2D([0], [0], marker=m, color='gray', linestyle='None', markersize=10, label=label) + for label, m in to_marker_map.items() + ] + + # Build custom handles for the Color legend + # (We use a generic square 's' or circle 'o' marker just to showcase the color) + color_handles = [ + Line2D([0], [0], marker='s', color='None', markerfacecolor=c, markeredgecolor=c, markersize=10, label=label) + for label, c in to_colour_map.items() + ] + + # Create and add the legends to the axis + # First legend (Markers) - placed normally + if is_lc: + leg1 = ax.legend(handles=marker_handles, title="Shape Meaning", loc='upper right', frameon=True) + else: + leg1 = ax.legend(handles=marker_handles, title="Shape Meaning", loc='center right', frameon=True) + + # Second legend (Colors) - added manually so it doesn't overwrite the first one + leg2 = ax.legend(handles=color_handles, title="Color Meaning", loc='lower right', frameon=True) + ax.add_artist(leg1) # CRITICAL: This prevents leg2 from deleting leg1 + + + plt.tight_layout() + if is_lc: + plt.savefig('tradeoff_lc.png', dpi=300, transparent=True) + else: + plt.savefig('tradeoff_cometa.png', dpi=300, transparent=True) + + +if __name__ == "__main__": + from sys import argv + if len(argv) < 2: + print("Assuming linking challenge") + is_lc = True + else: + is_lc = ( + "lc" in 
argv[1].lower() or + "linking" in argv[1].lower() or + "challenge" in argv[1].lower()) + print("Doing dataset", "Linking Challenge" if is_lc else "COMETA") + draw(is_lc) From d9a74fb1a07202959849128b82dccea721fc180a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 4 Jun 2026 10:35:41 +0100 Subject: [PATCH 111/111] Revert changes to matutils --- medcat-v2/medcat/utils/matutils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/medcat-v2/medcat/utils/matutils.py b/medcat-v2/medcat/utils/matutils.py index ac05ee2d4..1fcce1629 100644 --- a/medcat-v2/medcat/utils/matutils.py +++ b/medcat-v2/medcat/utils/matutils.py @@ -15,8 +15,7 @@ def unitvec(vec: np.ndarray) -> np.ndarray: Returns: np.ndarray: The new unit vector. """ - vec /= np.sqrt(vec @ vec) - return vec + return vec / np.linalg.norm(vec) @overload