From 73110d641e1ff2efcc5269f5289fd0b20f5fa735 Mon Sep 17 00:00:00 2001 From: bamader Date: Mon, 23 Mar 2026 16:14:13 -0400 Subject: [PATCH 01/10] Add Reranker service to lambda --- Dockerfile.ttc | 17 +++++++--- .../src/index_lambda/lambda_function.py | 2 +- .../src/text_to_code/models/__init__.py | 6 ++-- .../src/text_to_code/models/registry.py | 9 ++++-- .../src/text_to_code/services/embedder.py | 12 ++++--- .../src/text_to_code/services/reranker.py | 28 ++++++++++++++++ .../text-to-code/tests/unit/test_reranker.py | 32 +++++++++++++++++++ 7 files changed, 93 insertions(+), 13 deletions(-) create mode 100644 packages/text-to-code/src/text_to_code/services/reranker.py create mode 100644 packages/text-to-code/tests/unit/test_reranker.py diff --git a/Dockerfile.ttc b/Dockerfile.ttc index f04e8268..f23608ef 100644 --- a/Dockerfile.ttc +++ b/Dockerfile.ttc @@ -19,14 +19,23 @@ RUN pip install --no-cache-dir "${LAMBDA_TASK_ROOT}/text-to-code" COPY ./packages/text-to-code-lambda ${LAMBDA_TASK_ROOT}/text-to-code-lambda RUN pip install --no-cache-dir "${LAMBDA_TASK_ROOT}/text-to-code-lambda" -# Download model at build time (baked into image) +# Download retriever at build time (baked into image) RUN python -c "\ -from huggingface_hub import snapshot_download; \ -snapshot_download( \ + from huggingface_hub import snapshot_download; \ + snapshot_download( \ repo_id='intfloat/e5-large-v2', \ local_dir='/opt/model', \ ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \ -)" + )" + +# Download reranker at build time (baked into image) +RUN python -c "\ + from huggingface_hub import snapshot_download; \ + snapshot_download( \ + repo_id='cross-encoder/stsb-roberta-large', \ + local_dir='/opt/model', \ + ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \ + )" ENV MODEL_PATH="/opt/model" diff --git a/packages/index-lambda/src/index_lambda/lambda_function.py b/packages/index-lambda/src/index_lambda/lambda_function.py index c272620c..2adb419c 100644 --- a/packages/index-lambda/src/index_lambda/lambda_function.py +++ b/packages/index-lambda/src/index_lambda/lambda_function.py @@ -21,7 +21,7 @@ def handler(event: dict, context: dict) -> dict: "name": "hnsw", "space_type": "cosinesimil", "engine": "faiss", - "parameters": {"ef_construction": 128, "m": 16}, + "parameters": {"ef_construction": 400, "m": 64}, }, }, "loinc_type": {"type": "keyword"}, diff --git a/packages/text-to-code/src/text_to_code/models/__init__.py b/packages/text-to-code/src/text_to_code/models/__init__.py index 7f31bdd3..d510ec70 100644 --- a/packages/text-to-code/src/text_to_code/models/__init__.py +++ b/packages/text-to-code/src/text_to_code/models/__init__.py @@ -6,7 +6,8 @@ from .query import DataFieldTypeMapping from .query import VectorSearchParams from .registry import EICR_REGISTRY -from .registry import default_model +from .registry import TTC_RERANKER +from .registry import TTC_RETRIEVER from .schematron import _SCHEMATRON_ENUM_TO_FIELD from .schematron import LabTestNameOrderedSchematronErrors from .schematron import LabTestNameResultedSchematronErrors @@ -15,6 +16,8 @@ __all__ = [ "EICR_REGISTRY", + "TTC_RERANKER", + "TTC_RETRIEVER", "_SCHEMATRON_ENUM_TO_FIELD", "BaseLabField", "Candidate", @@ -27,5 +30,4 @@ "SchematronConfig", "SchematronErrors", "VectorSearchParams", - "default_model", ] diff --git a/packages/text-to-code/src/text_to_code/models/registry.py b/packages/text-to-code/src/text_to_code/models/registry.py index 180c20a7..8930bc4f 100644 --- a/packages/text-to-code/src/text_to_code/models/registry.py +++ b/packages/text-to-code/src/text_to_code/models/registry.py @@ -9,5 +9,10 @@ DataField.LAB_TEST_NAME_ORDERED: LabTestNameOrdered, } -# Default model name for SentenceTransformer, representing the model TTC used most extensively -default_model: str = "intfloat/e5-large-v2" +# Text-to-Code Retrieval model, used for searching approximate neighborhoods +# to find semantically similar candidates +TTC_RETRIEVER: str = "intfloat/e5-large-v2" + +# Text-to-Code Reranker model, used for re-scoring and re-sorting the hits +# found by the approximate neighbor search +TTC_RERANKER: str = "cross-encoder/stsb-roberta-large" diff --git a/packages/text-to-code/src/text_to_code/services/embedder.py b/packages/text-to-code/src/text_to_code/services/embedder.py index cf82636e..9bfa26e7 100644 --- a/packages/text-to-code/src/text_to_code/services/embedder.py +++ b/packages/text-to-code/src/text_to_code/services/embedder.py @@ -1,18 +1,22 @@ from sentence_transformers import SentenceTransformer from torch import Tensor -from text_to_code.models.registry import default_model +from text_to_code.models.registry import TTC_RETRIEVER -_MODEL = SentenceTransformer(default_model) +_RETRIEVER = SentenceTransformer(TTC_RETRIEVER) class Embedder: """Transforms nonstandard text.""" def embed(self, text: str) -> Tensor: - """Take a text string and embeds it as a vector using a model as defined in config.py. + """Encode a text string into a vector representation. + + The dimensionality and + values of the vector form are determined by the application's default + Retriever Model. :param text: Text string to embed. :returns: Tensor representation of input text. """ - return _MODEL.encode(text) + return _RETRIEVER.encode(text) diff --git a/packages/text-to-code/src/text_to_code/services/reranker.py b/packages/text-to-code/src/text_to_code/services/reranker.py new file mode 100644 index 00000000..73ba46a2 --- /dev/null +++ b/packages/text-to-code/src/text_to_code/services/reranker.py @@ -0,0 +1,28 @@ +from sentence_transformers import CrossEncoder + +from text_to_code.models.registry import TTC_RERANKER + +_RERANKER = CrossEncoder(TTC_RERANKER) + + +class Reranker: + """Scores and sorts OpenSearch results.""" + + def rerank(self, nonstandard_in: str, hits: list[str]) -> list[dict]: + """Do reranking. + + Given a list of text strings returned from OpenSearch, score and sort + the search hits using the Text-to-Code system's default Reranker model. + The model will generate a cross-encoding score value measuring each + search result's information similarity to the original nonstandard input. + + :param nonstandard_in: The original narrative free-text input to TTC. + :param hits: The list of OpenSearch results, in text string form. + :returns: A list of dictionaries representing the newly cross-encoder + scored search results, sorted in descending order of score. + """ + ranks = _RERANKER.rank(nonstandard_in, hits) + sorted_ranks = [{"code_string": hits[r["corpus_id"]], "score": r["score"]} for r in ranks] + # Want the scores in descending order, default `sorted` method is ascending + sorted_ranks = sorted(sorted_ranks, key=lambda x: x["score"], reverse=True) + return sorted_ranks diff --git a/packages/text-to-code/tests/unit/test_reranker.py b/packages/text-to-code/tests/unit/test_reranker.py new file mode 100644 index 00000000..9c1a7813 --- /dev/null +++ b/packages/text-to-code/tests/unit/test_reranker.py @@ -0,0 +1,32 @@ +import pytest +from text_to_code.services.reranker import Reranker + + +class TestReranker: + @pytest.fixture(scope="class") + def reranker(self) -> Reranker: + return Reranker() + + def test_reranker_empty_hits(self, reranker: Reranker) -> None: + ranks = reranker.rerank([]) + assert len(ranks) == 0 + + def test_reranker_single_search_result(self, reranker: Reranker) -> None: + ranks = reranker.rerank( + "Influenza virus A and B and SARS-CoV-2 (COVID-19)", + ["Influenza virus A and B and SARS-CoV-2 (COVID-19)"], + ) + assert ranks == [ + {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 1.0} + ] + + def test_reranker_multiple_hits(self, reranker: Reranker) -> None: + nonstandard_in = "albumin/creatinine ratio (acr)" + search_hits = [ + "Albumin/Creatinine [Ratio] in Urine", + "Albumin/Creatinine (U) [Mass ratio]", + "Albumin/Creatinine [Ratio] in 24 hour Urine", + "Albumin/Creatinine (U) [Molar ratio]", + ] + ranks = reranker.rerank(nonstandard_in, search_hits) + assert ranks == [] From 851df446949e2d94a0e968b6341f2c65d4e099d1 Mon Sep 17 00:00:00 2001 From: bamader Date: Mon, 23 Mar 2026 16:30:54 -0400 Subject: [PATCH 02/10] Add some rounding to test scores --- .../src/text_to_code/services/embedder.py | 5 ++--- .../src/text_to_code/services/reranker.py | 2 +- .../text-to-code/tests/unit/test_reranker.py | 22 ++++++++++++++++--- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/packages/text-to-code/src/text_to_code/services/embedder.py b/packages/text-to-code/src/text_to_code/services/embedder.py index 9bfa26e7..0a06420a 100644 --- a/packages/text-to-code/src/text_to_code/services/embedder.py +++ b/packages/text-to-code/src/text_to_code/services/embedder.py @@ -12,9 +12,8 @@ class Embedder: def embed(self, text: str) -> Tensor: """Encode a text string into a vector representation. - The dimensionality and - values of the vector form are determined by the application's default - Retriever Model. + The dimensionality and values of the vector form are determined + by the application's default Retriever Model. :param text: Text string to embed. :returns: Tensor representation of input text. diff --git a/packages/text-to-code/src/text_to_code/services/reranker.py b/packages/text-to-code/src/text_to_code/services/reranker.py index 73ba46a2..ec2018c3 100644 --- a/packages/text-to-code/src/text_to_code/services/reranker.py +++ b/packages/text-to-code/src/text_to_code/services/reranker.py @@ -9,7 +9,7 @@ class Reranker: """Scores and sorts OpenSearch results.""" def rerank(self, nonstandard_in: str, hits: list[str]) -> list[dict]: - """Do reranking. + """Re-sorts hits by cross-encoder score values. Given a list of text strings returned from OpenSearch, score and sort the search hits using the Text-to-Code system's default Reranker model. diff --git a/packages/text-to-code/tests/unit/test_reranker.py b/packages/text-to-code/tests/unit/test_reranker.py index 9c1a7813..124b96c5 100644 --- a/packages/text-to-code/tests/unit/test_reranker.py +++ b/packages/text-to-code/tests/unit/test_reranker.py @@ -8,7 +8,7 @@ def reranker(self) -> Reranker: return Reranker() def test_reranker_empty_hits(self, reranker: Reranker) -> None: - ranks = reranker.rerank([]) + ranks = reranker.rerank("Influenza virus A and B and SARS-CoV-2 (COVID-19)", []) assert len(ranks) == 0 def test_reranker_single_search_result(self, reranker: Reranker) -> None: @@ -16,8 +16,9 @@ def test_reranker_single_search_result(self, reranker: Reranker) -> None: "Influenza virus A and B and SARS-CoV-2 (COVID-19)", ["Influenza virus A and B and SARS-CoV-2 (COVID-19)"], ) + ranks = [{"code_string": r["code_string"], "score": round(r["score"], 3)} for r in ranks] assert ranks == [ - {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 1.0} + {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 0.97} ] def test_reranker_multiple_hits(self, reranker: Reranker) -> None: @@ -29,4 +30,19 @@ def test_reranker_multiple_hits(self, reranker: Reranker) -> None: "Albumin/Creatinine (U) [Molar ratio]", ] ranks = reranker.rerank(nonstandard_in, search_hits) - assert ranks == [] + ranks = [{"code_string": r["code_string"], "score": round(r["score"], 3)} for r in ranks] + assert ranks == [ + { + "code_string": "Albumin/Creatinine (U) [Mass ratio]", + "score": 0.75, + }, + { + "code_string": "Albumin/Creatinine (U) [Molar ratio]", + "score": 0.73, + }, + { + "code_string": "Albumin/Creatinine [Ratio] in 24 hour Urine", + "score": 0.70, + }, + {"code_string": "Albumin/Creatinine [Ratio] in Urine", "score": 0.67}, + ] From 8b6e84ae12647c8d08feef8c70d64dc8a52600df Mon Sep 17 00:00:00 2001 From: bamader Date: Mon, 23 Mar 2026 16:36:33 -0400 Subject: [PATCH 03/10] Oops rounded wrong --- .../text-to-code/tests/unit/test_reranker.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/packages/text-to-code/tests/unit/test_reranker.py b/packages/text-to-code/tests/unit/test_reranker.py index 124b96c5..d156b343 100644 --- a/packages/text-to-code/tests/unit/test_reranker.py +++ b/packages/text-to-code/tests/unit/test_reranker.py @@ -16,9 +16,11 @@ def test_reranker_single_search_result(self, reranker: Reranker) -> None: "Influenza virus A and B and SARS-CoV-2 (COVID-19)", ["Influenza virus A and B and SARS-CoV-2 (COVID-19)"], ) - ranks = [{"code_string": r["code_string"], "score": round(r["score"], 3)} for r in ranks] + ranks = [ + {"code_string": r["code_string"], "score": round(float(r["score"]), 3)} for r in ranks + ] assert ranks == [ - {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 0.97} + {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 0.973} ] def test_reranker_multiple_hits(self, reranker: Reranker) -> None: @@ -30,11 +32,13 @@ def test_reranker_multiple_hits(self, reranker: Reranker) -> None: "Albumin/Creatinine (U) [Molar ratio]", ] ranks = reranker.rerank(nonstandard_in, search_hits) - ranks = [{"code_string": r["code_string"], "score": round(r["score"], 3)} for r in ranks] + ranks = [ + {"code_string": r["code_string"], "score": round(float(r["score"]), 3)} for r in ranks + ] assert ranks == [ { "code_string": "Albumin/Creatinine (U) [Mass ratio]", - "score": 0.75, + "score": 0.755, }, { "code_string": "Albumin/Creatinine (U) [Molar ratio]", @@ -42,7 +46,7 @@ def test_reranker_multiple_hits(self, reranker: Reranker) -> None: }, { "code_string": "Albumin/Creatinine [Ratio] in 24 hour Urine", - "score": 0.70, + "score": 0.701, }, - {"code_string": "Albumin/Creatinine [Ratio] in Urine", "score": 0.67}, + {"code_string": "Albumin/Creatinine [Ratio] in Urine", "score": 0.672}, ] From aebd88de99f9f9b6ee2ff886e4a0096c0cd6d0f8 Mon Sep 17 00:00:00 2001 From: bamader Date: Tue, 24 Mar 2026 14:44:24 -0400 Subject: [PATCH 04/10] Add reranker into pipeline --- .../src/text_to_code_lambda/lambda_function.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py index 439f7117..e1d28243 100644 --- a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py +++ b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py @@ -15,6 +15,7 @@ from text_to_code.services import eicr_processor from text_to_code.services import embedder from text_to_code.services import evaluator +from text_to_code.services import reranker from text_to_code.services import schematron_processor from text_to_code.services.query import QueryBuilder @@ -264,8 +265,16 @@ def _process_schematron_errors( query=query, index=OPENSEARCH_INDEX, opensearch_client=opensearch_client ) + # The OpenSearch results object has a couple levels of nesting, + # but all we care about for reranking is extracting the actual + # text strings of the ANN LOINC codes + results_list = opensearch_retrieved_scores.hits.hits + retrieved_loinc_names = [hit.source.description for hit in results_list] + ranked_results = reranker.rerank(selected_candidate.value, retrieved_loinc_names) + metadata_error = error.model_dump() metadata_error["opensearch_retrieved_scores"] = opensearch_retrieved_scores + metadata_error["reranker_processed_results"] = ranked_results ttc_metadata_output["schematron_errors"][data_field].append(metadata_error) From fc74ab2c25b830c5e3d345e07c8206ca3f2b7914 Mon Sep 17 00:00:00 2001 From: bamader Date: Tue, 24 Mar 2026 15:49:21 -0400 Subject: [PATCH 05/10] Instantiate Reranker --- .../src/text_to_code_lambda/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py index e1d28243..9b2620a6 100644 --- a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py +++ b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py @@ -270,7 +270,7 @@ def _process_schematron_errors( # text strings of the ANN LOINC codes results_list = opensearch_retrieved_scores.hits.hits retrieved_loinc_names = [hit.source.description for hit in results_list] - ranked_results = reranker.rerank(selected_candidate.value, retrieved_loinc_names) + ranked_results = reranker.Reranker().rerank(selected_candidate.value, retrieved_loinc_names) metadata_error = error.model_dump() metadata_error["opensearch_retrieved_scores"] = opensearch_retrieved_scores From f5d5d7ee9e83cd05b2529b8f195713e870935349 Mon Sep 17 00:00:00 2001 From: bamader Date: Tue, 24 Mar 2026 16:00:15 -0400 Subject: [PATCH 06/10] Reranker prediction --- .../text-to-code-lambda/tests/test_lambda_function.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/packages/text-to-code-lambda/tests/test_lambda_function.py b/packages/text-to-code-lambda/tests/test_lambda_function.py index e000a252..4c4048e4 100644 --- a/packages/text-to-code-lambda/tests/test_lambda_function.py +++ b/packages/text-to-code-lambda/tests/test_lambda_function.py @@ -71,6 +71,15 @@ def test_handler_success(self, example_sqs_event, mock_aws_setup, mock_opensearc "opensearch_retrieved_scores" in ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0] ) + assert ( + "reranker_processed_results" + in ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0] + ) + predicted_candidate = ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0][ + "reranker_processed_results" + ][0] + assert predicted_candidate["code_string"] == "" + assert round(predicted_candidate["score"], 3) == 0.0 def test_handler_with_no_records(self, example_sqs_event, mock_opensearch): """Test handler with no records.""" From 3d1797e3daaee1b56f8df80b585a2a3563db1301 Mon Sep 17 00:00:00 2001 From: bamader Date: Tue, 24 Mar 2026 16:03:12 -0400 Subject: [PATCH 07/10] Code string update --- packages/text-to-code-lambda/tests/test_lambda_function.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/text-to-code-lambda/tests/test_lambda_function.py b/packages/text-to-code-lambda/tests/test_lambda_function.py index 4c4048e4..05a0099c 100644 --- a/packages/text-to-code-lambda/tests/test_lambda_function.py +++ b/packages/text-to-code-lambda/tests/test_lambda_function.py @@ -78,7 +78,10 @@ def test_handler_success(self, example_sqs_event, mock_aws_setup, mock_opensearc predicted_candidate = ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0][ "reranker_processed_results" ][0] - assert predicted_candidate["code_string"] == "" + assert ( + predicted_candidate["code_string"] + == "(Artemisia vulgaris+Chenopodium album+Plantago lanceolata+Solidago virgaurea+Urtica dioica) Ab.IgE:PrThr:Pt:Ser:Ord:Multidisk" + ) assert round(predicted_candidate["score"], 3) == 0.0 def test_handler_with_no_records(self, example_sqs_event, mock_opensearch): From b601deea77f763b57cd83f8cd56b28c764b2b69f Mon Sep 17 00:00:00 2001 From: bamader Date: Tue, 24 Mar 2026 16:08:35 -0400 Subject: [PATCH 08/10] Float cast --- packages/text-to-code-lambda/tests/test_lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/text-to-code-lambda/tests/test_lambda_function.py b/packages/text-to-code-lambda/tests/test_lambda_function.py index 05a0099c..34d600cc 100644 --- a/packages/text-to-code-lambda/tests/test_lambda_function.py +++ b/packages/text-to-code-lambda/tests/test_lambda_function.py @@ -82,7 +82,7 @@ def test_handler_success(self, example_sqs_event, mock_aws_setup, mock_opensearc predicted_candidate["code_string"] == "(Artemisia vulgaris+Chenopodium album+Plantago lanceolata+Solidago virgaurea+Urtica dioica) Ab.IgE:PrThr:Pt:Ser:Ord:Multidisk" ) - assert round(predicted_candidate["score"], 3) == 0.0 + assert round(float(predicted_candidate["score"]), 3) == 0.0 def test_handler_with_no_records(self, example_sqs_event, mock_opensearch): """Test handler with no records.""" From 8be3aaf2a7ab06a374e2d62c51717fc75fe84cbd Mon Sep 17 00:00:00 2001 From: bamader Date: Tue, 24 Mar 2026 16:17:39 -0400 Subject: [PATCH 09/10] Update reranker score --- packages/text-to-code-lambda/tests/test_lambda_function.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/text-to-code-lambda/tests/test_lambda_function.py b/packages/text-to-code-lambda/tests/test_lambda_function.py index 34d600cc..dfc479ee 100644 --- a/packages/text-to-code-lambda/tests/test_lambda_function.py +++ b/packages/text-to-code-lambda/tests/test_lambda_function.py @@ -5,6 +5,7 @@ EXPECTED_RESULTED_ERRORS = 2 EXPECTED_ORDERED_ERRORS = 2 +EXPECTED_RERANKER_SCORE = 0.01 class TestHandler: @@ -82,7 +83,7 @@ def test_handler_success(self, example_sqs_event, mock_aws_setup, mock_opensearc predicted_candidate["code_string"] == "(Artemisia vulgaris+Chenopodium album+Plantago lanceolata+Solidago virgaurea+Urtica dioica) Ab.IgE:PrThr:Pt:Ser:Ord:Multidisk" ) - assert round(float(predicted_candidate["score"]), 3) == 0.0 + assert round(float(predicted_candidate["score"]), 3) == EXPECTED_RERANKER_SCORE def test_handler_with_no_records(self, example_sqs_event, mock_opensearch): """Test handler with no records.""" From e9dd82e83b980158f63741a482cbf9321ffa1da2 Mon Sep 17 00:00:00 2001 From: bamader Date: Wed, 25 Mar 2026 12:01:16 -0400 Subject: [PATCH 10/10] PR Feedback --- Dockerfile.ttc | 7 ++++--- .../src/text_to_code_lambda/lambda_function.py | 9 +++++++-- .../text-to-code/src/text_to_code/services/reranker.py | 2 -- terraform/README.md | 5 ++++- terraform/main.tf | 3 ++- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/Dockerfile.ttc b/Dockerfile.ttc index f23608ef..ea177011 100644 --- a/Dockerfile.ttc +++ b/Dockerfile.ttc @@ -24,7 +24,7 @@ RUN python -c "\ from huggingface_hub import snapshot_download; \ snapshot_download( \ repo_id='intfloat/e5-large-v2', \ - local_dir='/opt/model', \ + local_dir='/opt/retriever_model', \ ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \ )" @@ -33,10 +33,11 @@ RUN python -c "\ from huggingface_hub import snapshot_download; \ snapshot_download( \ repo_id='cross-encoder/stsb-roberta-large', \ - local_dir='/opt/model', \ + local_dir='/opt/reranker_model', \ ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \ )" -ENV MODEL_PATH="/opt/model" +ENV RETRIEVER_MODEL_PATH="/opt/retriever_model" +ENV RERANKER_MODEL_PATH="/opt/reranker_model" CMD ["text_to_code_lambda.lambda_function.handler"] diff --git a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py index 9b2620a6..7f7842bd 100644 --- a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py +++ b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py @@ -33,6 +33,11 @@ OPENSEARCH_ENDPOINT_URL = os.getenv("OPENSEARCH_ENDPOINT_URL") OPENSEARCH_INDEX = os.getenv("OPENSEARCH_INDEX", "ttc-index") +# Instantiate wrapper objects for the sentence-transformers models +# to re-use across invocations +RETRIEVER = embedder.Embedder() +RERANKER = reranker.Reranker() + # Cache clients and auth to reuse across Lambda invocations _cached_auth = None _cached_opensearch_client = None @@ -250,7 +255,7 @@ def _process_schematron_errors( if selected_candidate is None: continue - vector_embedding = embedder.Embedder().embed(selected_candidate.value) + vector_embedding = RETRIEVER.embed(selected_candidate.value) vector_parameters = query_models.VectorSearchParams( vector=vector_embedding.tolist(), data_field=data_field @@ -270,7 +275,7 @@ def _process_schematron_errors( # text strings of the ANN LOINC codes results_list = opensearch_retrieved_scores.hits.hits retrieved_loinc_names = [hit.source.description for hit in results_list] - ranked_results = reranker.Reranker().rerank(selected_candidate.value, retrieved_loinc_names) + ranked_results = RERANKER.rerank(selected_candidate.value, retrieved_loinc_names) metadata_error = error.model_dump() metadata_error["opensearch_retrieved_scores"] = opensearch_retrieved_scores diff --git a/packages/text-to-code/src/text_to_code/services/reranker.py b/packages/text-to-code/src/text_to_code/services/reranker.py index ec2018c3..c95d9934 100644 --- a/packages/text-to-code/src/text_to_code/services/reranker.py +++ b/packages/text-to-code/src/text_to_code/services/reranker.py @@ -23,6 +23,4 @@ def rerank(self, nonstandard_in: str, hits: list[str]) -> list[dict]: """ ranks = _RERANKER.rank(nonstandard_in, hits) sorted_ranks = [{"code_string": hits[r["corpus_id"]], "score": r["score"]} for r in ranks] - # Want the scores in descending order, default `sorted` method is ascending - sorted_ranks = sorted(sorted_ranks, key=lambda x: x["score"], reverse=True) return sorted_ranks diff --git a/terraform/README.md b/terraform/README.md index cb42f0b4..42a5ad6c 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -65,17 +65,19 @@ The index it creates has LOINC-specific field mappings including `description_ve Deployed as a **container image** from ECR (`package_type = "Image"`). The Docker image (`Dockerfile.ttc` at repo root) installs the full `text-to-code-lambda` package along with its workspace dependencies (`shared-models`, `lambda-handler`, `text-to-code`). At runtime, the Lambda runs the real `text_to_code_lambda.lambda_function.handler`, which: + 1. Loads the SentenceTransformer model from `/opt/model` during initialization (cold start) 2. Parses eICR XML documents from S3 to extract text candidates 3. Evaluates and selects the best candidate for each data field 4. Generates embeddings and executes KNN queries against OpenSearch 5. Returns standardized code mappings (LOINC/SNOMED) -Environment variables injected at deploy time: `OPENSEARCH_ENDPOINT_URL`, `OPENSEARCH_INDEX`, `REGION`, `BUCKET_NAME`, `MODEL_PATH`, `EICR_INPUT_PREFIX`, `SCHEMATRON_ERROR_PREFIX`, `TTC_INPUT_PREFIX`, `TTC_OUTPUT_PREFIX`, `TTC_METADATA_PREFIX`. +Environment variables injected at deploy time: `OPENSEARCH_ENDPOINT_URL`, `OPENSEARCH_INDEX`, `REGION`, `BUCKET_NAME`, `RETRIEVER_MODEL_PATH`, `RERANKER_MODEL_PATH`, `EICR_INPUT_PREFIX`, `SCHEMATRON_ERROR_PREFIX`, `TTC_INPUT_PREFIX`, `TTC_OUTPUT_PREFIX`, `TTC_METADATA_PREFIX`. ### OpenSearch Ingestion Pipeline (`main.tf`) An **AWS OpenSearch Ingestion Service (OSIS)** pipeline (`aws_osis_pipeline.ttc_ingestion_pipeline`) that: + - Polls `s3://dibbs-text-to-code/ingestion/` monthly for new NDJSON files - Parses each line as a document and bulk-writes it into the `ttc-index` OpenSearch index - Runs within the VPC using the same private subnets as Lambda @@ -100,6 +102,7 @@ Terraform manages dependency ordering automatically, but conceptually the sequen ## State Backend Terraform state is stored remotely in **AWS S3** with DynamoDB locking: + - Bucket: `dibbs-ttc-terraform-state` - Key: `terraform.tfstate` - Region: `us-east-2` diff --git a/terraform/main.tf b/terraform/main.tf index 3a5a35e3..4e35b7eb 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -261,7 +261,8 @@ resource "aws_lambda_function" "lambda" { OPENSEARCH_INDEX = var.index_name REGION = var.region BUCKET_NAME = var.s3_bucket - MODEL_PATH = "/opt/model" + RETRIEVER_MODEL_PATH = "/opt/retriever_model" + RERANKER_MODEL_PATH = "/opt/reranker_model" EICR_INPUT_PREFIX = var.eicr_input_prefix SCHEMATRON_ERROR_PREFIX = var.schematron_error_prefix TTC_INPUT_PREFIX = var.ttc_input_prefix