diff --git a/Dockerfile.ttc b/Dockerfile.ttc
index f04e8268..ea177011 100644
--- a/Dockerfile.ttc
+++ b/Dockerfile.ttc
@@ -19,15 +19,25 @@ RUN pip install --no-cache-dir "${LAMBDA_TASK_ROOT}/text-to-code"
 
 COPY ./packages/text-to-code-lambda ${LAMBDA_TASK_ROOT}/text-to-code-lambda
 RUN pip install --no-cache-dir "${LAMBDA_TASK_ROOT}/text-to-code-lambda"
 
-# Download model at build time (baked into image)
+# Download retriever at build time (baked into image)
 RUN python -c "\
 from huggingface_hub import snapshot_download; \
 snapshot_download( \
 repo_id='intfloat/e5-large-v2', \
-local_dir='/opt/model', \
+local_dir='/opt/retriever_model', \
 ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \
 )"
 
-ENV MODEL_PATH="/opt/model"
+# Download reranker at build time (baked into image)
+# NOTE: continuation lines inside the -c string must stay at column 0 —
+# leading whitespace survives the shell/Dockerfile line joins and would
+# raise IndentationError when python compiles the -c program.
+RUN python -c "\
+from huggingface_hub import snapshot_download; \
+snapshot_download( \
+repo_id='cross-encoder/stsb-roberta-large', \
+local_dir='/opt/reranker_model', \
+ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \
+)"
+
+ENV RETRIEVER_MODEL_PATH="/opt/retriever_model"
+ENV RERANKER_MODEL_PATH="/opt/reranker_model"
 
 CMD ["text_to_code_lambda.lambda_function.handler"]
diff --git a/packages/index-lambda/src/index_lambda/lambda_function.py b/packages/index-lambda/src/index_lambda/lambda_function.py
index c272620c..2adb419c 100644
--- a/packages/index-lambda/src/index_lambda/lambda_function.py
+++ b/packages/index-lambda/src/index_lambda/lambda_function.py
@@ -21,7 +21,7 @@ def handler(event: dict, context: dict) -> dict:
                         "name": "hnsw",
                         "space_type": "cosinesimil",
                         "engine": "faiss",
-                        "parameters": {"ef_construction": 128, "m": 16},
+                        "parameters": {"ef_construction": 400, "m": 64},
                     },
                 },
                 "loinc_type": {"type": "keyword"},
diff --git a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py
index 439f7117..7f7842bd 100644
--- a/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py
+++ b/packages/text-to-code-lambda/src/text_to_code_lambda/lambda_function.py
@@ -15,6 +15,7 @@
 from text_to_code.services import eicr_processor
 from text_to_code.services import embedder
 from text_to_code.services import evaluator
+from text_to_code.services import reranker
 from text_to_code.services import schematron_processor
 from text_to_code.services.query import QueryBuilder
 
@@ -32,6 +33,11 @@
 OPENSEARCH_ENDPOINT_URL = os.getenv("OPENSEARCH_ENDPOINT_URL")
 OPENSEARCH_INDEX = os.getenv("OPENSEARCH_INDEX", "ttc-index")
 
+# Instantiate wrapper objects for the sentence-transformers models
+# to re-use across invocations
+RETRIEVER = embedder.Embedder()
+RERANKER = reranker.Reranker()
+
 # Cache clients and auth to reuse across Lambda invocations
 _cached_auth = None
 _cached_opensearch_client = None
@@ -249,7 +255,7 @@ def _process_schematron_errors(
         if selected_candidate is None:
             continue
 
-        vector_embedding = embedder.Embedder().embed(selected_candidate.value)
+        vector_embedding = RETRIEVER.embed(selected_candidate.value)
 
         vector_parameters = query_models.VectorSearchParams(
             vector=vector_embedding.tolist(), data_field=data_field
@@ -264,8 +270,16 @@ def _process_schematron_errors(
             query=query, index=OPENSEARCH_INDEX, opensearch_client=opensearch_client
         )
 
+        # The OpenSearch results object has a couple levels of nesting,
+        # but all we care about for reranking is extracting the actual
+        # text strings of the ANN LOINC codes
+        results_list = opensearch_retrieved_scores.hits.hits
+        retrieved_loinc_names = [hit.source.description for hit in results_list]
+        ranked_results = RERANKER.rerank(selected_candidate.value, retrieved_loinc_names)
+
         metadata_error = error.model_dump()
         metadata_error["opensearch_retrieved_scores"] = opensearch_retrieved_scores
+        metadata_error["reranker_processed_results"] = ranked_results
 
         ttc_metadata_output["schematron_errors"][data_field].append(metadata_error)
 
diff --git a/packages/text-to-code-lambda/tests/test_lambda_function.py b/packages/text-to-code-lambda/tests/test_lambda_function.py
index e000a252..dfc479ee 100644
--- a/packages/text-to-code-lambda/tests/test_lambda_function.py
+++ b/packages/text-to-code-lambda/tests/test_lambda_function.py
@@ -5,6 +5,7 @@
 
 EXPECTED_RESULTED_ERRORS = 2
 EXPECTED_ORDERED_ERRORS = 2
+EXPECTED_RERANKER_SCORE = 0.01
 
 
 class TestHandler:
@@ -71,6 +72,18 @@ def test_handler_success(self, example_sqs_event, mock_aws_setup, mock_opensearc
             "opensearch_retrieved_scores"
             in ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0]
         )
+        assert (
+            "reranker_processed_results"
+            in ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0]
+        )
+        predicted_candidate = ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0][
+            "reranker_processed_results"
+        ][0]
+        assert (
+            predicted_candidate["code_string"]
+            == "(Artemisia vulgaris+Chenopodium album+Plantago lanceolata+Solidago virgaurea+Urtica dioica) Ab.IgE:PrThr:Pt:Ser:Ord:Multidisk"
+        )
+        assert round(float(predicted_candidate["score"]), 3) == EXPECTED_RERANKER_SCORE
 
     def test_handler_with_no_records(self, example_sqs_event, mock_opensearch):
         """Test handler with no records."""
diff --git a/packages/text-to-code/src/text_to_code/models/__init__.py b/packages/text-to-code/src/text_to_code/models/__init__.py
index 7f31bdd3..d510ec70 100644
--- a/packages/text-to-code/src/text_to_code/models/__init__.py
+++ b/packages/text-to-code/src/text_to_code/models/__init__.py
@@ -6,7 +6,8 @@
 from .query import DataFieldTypeMapping
 from .query import VectorSearchParams
 from .registry import EICR_REGISTRY
-from .registry import default_model
+from .registry import TTC_RERANKER
+from .registry import TTC_RETRIEVER
 from .schematron import _SCHEMATRON_ENUM_TO_FIELD
 from .schematron import LabTestNameOrderedSchematronErrors
 from .schematron import LabTestNameResultedSchematronErrors
@@ -15,6 +16,8 @@
 
 __all__ = [
     "EICR_REGISTRY",
+    "TTC_RERANKER",
+    "TTC_RETRIEVER",
     "_SCHEMATRON_ENUM_TO_FIELD",
     "BaseLabField",
     "Candidate",
@@ -27,5 +30,4 @@
     "SchematronConfig",
     "SchematronErrors",
     "VectorSearchParams",
-    "default_model",
 ]
diff --git a/packages/text-to-code/src/text_to_code/models/registry.py b/packages/text-to-code/src/text_to_code/models/registry.py
index 180c20a7..8930bc4f 100644
--- a/packages/text-to-code/src/text_to_code/models/registry.py
+++ b/packages/text-to-code/src/text_to_code/models/registry.py
@@ -9,5 +9,10 @@
     DataField.LAB_TEST_NAME_ORDERED: LabTestNameOrdered,
 }
 
-# Default model name for SentenceTransformer, representing the model TTC used most extensively
-default_model: str = "intfloat/e5-large-v2"
+# Text-to-Code Retrieval model, used for searching approximate neighborhoods
+# to find semantically similar candidates
+TTC_RETRIEVER: str = "intfloat/e5-large-v2"
+
+# Text-to-Code Reranker model, used for re-scoring and re-sorting the hits
+# found by the approximate neighbor search
+TTC_RERANKER: str = "cross-encoder/stsb-roberta-large"
diff --git a/packages/text-to-code/src/text_to_code/services/embedder.py b/packages/text-to-code/src/text_to_code/services/embedder.py
index cf82636e..0a06420a 100644
--- a/packages/text-to-code/src/text_to_code/services/embedder.py
+++ b/packages/text-to-code/src/text_to_code/services/embedder.py
@@ -1,18 +1,21 @@
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
 
-from text_to_code.models.registry import default_model
+from text_to_code.models.registry import TTC_RETRIEVER
 
-_MODEL = SentenceTransformer(default_model)
+_RETRIEVER = SentenceTransformer(TTC_RETRIEVER)
 
 
 class Embedder:
     """Transforms nonstandard text."""
 
     def embed(self, text: str) -> Tensor:
-        """Take a text string and embeds it as a vector using a model as defined in config.py.
+        """Encode a text string into a vector representation.
+
+        The dimensionality and values of the vector form are determined
+        by the application's default Retriever Model.
 
         :param text: Text string to embed.
         :returns: Tensor representation of input text.
         """
-        return _MODEL.encode(text)
+        return _RETRIEVER.encode(text)
diff --git a/packages/text-to-code/src/text_to_code/services/reranker.py b/packages/text-to-code/src/text_to_code/services/reranker.py
new file mode 100644
index 00000000..c95d9934
--- /dev/null
+++ b/packages/text-to-code/src/text_to_code/services/reranker.py
@@ -0,0 +1,26 @@
+from sentence_transformers import CrossEncoder
+
+from text_to_code.models.registry import TTC_RERANKER
+
+_RERANKER = CrossEncoder(TTC_RERANKER)
+
+
+class Reranker:
+    """Scores and sorts OpenSearch results."""
+
+    def rerank(self, nonstandard_in: str, hits: list[str]) -> list[dict]:
+        """Re-sorts hits by cross-encoder score values.
+
+        Given a list of text strings returned from OpenSearch, score and sort
+        the search hits using the Text-to-Code system's default Reranker model.
+        The model will generate a cross-encoding score value measuring each
+        search result's information similarity to the original nonstandard input.
+
+        :param nonstandard_in: The original narrative free-text input to TTC.
+        :param hits: The list of OpenSearch results, in text string form.
+        :returns: A list of dictionaries representing the newly cross-encoder
+            scored search results, sorted in descending order of score.
+        """
+        ranks = _RERANKER.rank(nonstandard_in, hits)
+        sorted_ranks = [{"code_string": hits[r["corpus_id"]], "score": r["score"]} for r in ranks]
+        return sorted_ranks
diff --git a/packages/text-to-code/tests/unit/test_reranker.py b/packages/text-to-code/tests/unit/test_reranker.py
new file mode 100644
index 00000000..d156b343
--- /dev/null
+++ b/packages/text-to-code/tests/unit/test_reranker.py
@@ -0,0 +1,52 @@
+import pytest
+from text_to_code.services.reranker import Reranker
+
+
+class TestReranker:
+    @pytest.fixture(scope="class")
+    def reranker(self) -> Reranker:
+        return Reranker()
+
+    def test_reranker_empty_hits(self, reranker: Reranker) -> None:
+        ranks = reranker.rerank("Influenza virus A and B and SARS-CoV-2 (COVID-19)", [])
+        assert len(ranks) == 0
+
+    def test_reranker_single_search_result(self, reranker: Reranker) -> None:
+        ranks = reranker.rerank(
+            "Influenza virus A and B and SARS-CoV-2 (COVID-19)",
+            ["Influenza virus A and B and SARS-CoV-2 (COVID-19)"],
+        )
+        ranks = [
+            {"code_string": r["code_string"], "score": round(float(r["score"]), 3)} for r in ranks
+        ]
+        assert ranks == [
+            {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 0.973}
+        ]
+
+    def test_reranker_multiple_hits(self, reranker: Reranker) -> None:
+        nonstandard_in = "albumin/creatinine ratio (acr)"
+        search_hits = [
+            "Albumin/Creatinine [Ratio] in Urine",
+            "Albumin/Creatinine (U) [Mass ratio]",
+            "Albumin/Creatinine [Ratio] in 24 hour Urine",
+            "Albumin/Creatinine (U) [Molar ratio]",
+        ]
+        ranks = reranker.rerank(nonstandard_in, search_hits)
+        ranks = [
+            {"code_string": r["code_string"], "score": round(float(r["score"]), 3)} for r in ranks
+        ]
+        assert ranks == [
+            {
+                "code_string": "Albumin/Creatinine (U) [Mass ratio]",
+                "score": 0.755,
+            },
+            {
+                "code_string": "Albumin/Creatinine (U) [Molar ratio]",
+                "score": 0.73,
+            },
+            {
+                "code_string": "Albumin/Creatinine [Ratio] in 24 hour Urine",
+                "score": 0.701,
+            },
+            {"code_string": "Albumin/Creatinine [Ratio] in Urine", "score": 0.672},
+        ]
diff --git a/terraform/README.md b/terraform/README.md
index cb42f0b4..42a5ad6c 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -65,17 +65,19 @@
 Deployed as a **container image** from ECR (`package_type = "Image"`). The Docker image (`Dockerfile.ttc` at repo root) installs the full `text-to-code-lambda` package along with its workspace dependencies (`shared-models`, `lambda-handler`, `text-to-code`).
 
 At runtime, the Lambda runs the real `text_to_code_lambda.lambda_function.handler`, which:
+
-1. Loads the SentenceTransformer model from `/opt/model` during initialization (cold start)
+1. Loads the SentenceTransformer retriever and CrossEncoder reranker models from `/opt/retriever_model` and `/opt/reranker_model` during initialization (cold start)
 2. Parses eICR XML documents from S3 to extract text candidates
 3. Evaluates and selects the best candidate for each data field
 4. Generates embeddings and executes KNN queries against OpenSearch
 5. Returns standardized code mappings (LOINC/SNOMED)
 
-Environment variables injected at deploy time: `OPENSEARCH_ENDPOINT_URL`, `OPENSEARCH_INDEX`, `REGION`, `BUCKET_NAME`, `MODEL_PATH`, `EICR_INPUT_PREFIX`, `SCHEMATRON_ERROR_PREFIX`, `TTC_INPUT_PREFIX`, `TTC_OUTPUT_PREFIX`, `TTC_METADATA_PREFIX`.
+Environment variables injected at deploy time: `OPENSEARCH_ENDPOINT_URL`, `OPENSEARCH_INDEX`, `REGION`, `BUCKET_NAME`, `RETRIEVER_MODEL_PATH`, `RERANKER_MODEL_PATH`, `EICR_INPUT_PREFIX`, `SCHEMATRON_ERROR_PREFIX`, `TTC_INPUT_PREFIX`, `TTC_OUTPUT_PREFIX`, `TTC_METADATA_PREFIX`.
 
 ### OpenSearch Ingestion Pipeline (`main.tf`)
 
 An **AWS OpenSearch Ingestion Service (OSIS)** pipeline (`aws_osis_pipeline.ttc_ingestion_pipeline`) that:
+
 - Polls `s3://dibbs-text-to-code/ingestion/` monthly for new NDJSON files
 - Parses each line as a document and bulk-writes it into the `ttc-index` OpenSearch index
 - Runs within the VPC using the same private subnets as Lambda
@@ -100,6 +102,7 @@ Terraform manages dependency ordering automatically, but conceptually the sequen
 ## State Backend
 
 Terraform state is stored remotely in **AWS S3** with DynamoDB locking:
+
 - Bucket: `dibbs-ttc-terraform-state`
 - Key: `terraform.tfstate`
 - Region: `us-east-2`
diff --git a/terraform/main.tf b/terraform/main.tf
index 3a5a35e3..4e35b7eb 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -261,7 +261,8 @@ resource "aws_lambda_function" "lambda" {
       OPENSEARCH_INDEX        = var.index_name
       REGION                  = var.region
       BUCKET_NAME             = var.s3_bucket
-      MODEL_PATH              = "/opt/model"
+      RETRIEVER_MODEL_PATH    = "/opt/retriever_model"
+      RERANKER_MODEL_PATH     = "/opt/reranker_model"
      EICR_INPUT_PREFIX       = var.eicr_input_prefix
       SCHEMATRON_ERROR_PREFIX = var.schematron_error_prefix
       TTC_INPUT_PREFIX        = var.ttc_input_prefix