Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions Dockerfile.ttc
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,25 @@ RUN pip install --no-cache-dir "${LAMBDA_TASK_ROOT}/text-to-code"
COPY ./packages/text-to-code-lambda ${LAMBDA_TASK_ROOT}/text-to-code-lambda
RUN pip install --no-cache-dir "${LAMBDA_TASK_ROOT}/text-to-code-lambda"

# Download retriever at build time (baked into image)
RUN python -c "\
from huggingface_hub import snapshot_download; \
snapshot_download( \
repo_id='intfloat/e5-large-v2', \
local_dir='/opt/retriever_model', \
ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \
)"

# Download reranker at build time (baked into image)
RUN python -c "\
from huggingface_hub import snapshot_download; \
snapshot_download( \
repo_id='cross-encoder/stsb-roberta-large', \
local_dir='/opt/reranker_model', \
ignore_patterns=['*.git*', '*.md', 'onnx/*', 'openvino/*', 'pytorch_model.bin'] \
)"

# Paths consumed by the Lambda at runtime (mirrored in terraform/main.tf)
ENV RETRIEVER_MODEL_PATH="/opt/retriever_model"
ENV RERANKER_MODEL_PATH="/opt/reranker_model"

CMD ["text_to_code_lambda.lambda_function.handler"]
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def handler(event: dict, context: dict) -> dict:
"name": "hnsw",
"space_type": "cosinesimil",
"engine": "faiss",
"parameters": {"ef_construction": 128, "m": 16},
"parameters": {"ef_construction": 400, "m": 64},
},
},
"loinc_type": {"type": "keyword"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from text_to_code.services import eicr_processor
from text_to_code.services import embedder
from text_to_code.services import evaluator
from text_to_code.services import reranker
from text_to_code.services import schematron_processor
from text_to_code.services.query import QueryBuilder

Expand All @@ -32,6 +33,11 @@
OPENSEARCH_ENDPOINT_URL = os.getenv("OPENSEARCH_ENDPOINT_URL")
OPENSEARCH_INDEX = os.getenv("OPENSEARCH_INDEX", "ttc-index")

# Instantiate wrapper objects for the sentence-transformers models
# to re-use across invocations
RETRIEVER = embedder.Embedder()
RERANKER = reranker.Reranker()

# Cache clients and auth to reuse across Lambda invocations
_cached_auth = None
_cached_opensearch_client = None
Expand Down Expand Up @@ -249,7 +255,7 @@ def _process_schematron_errors(
if selected_candidate is None:
continue

vector_embedding = embedder.Embedder().embed(selected_candidate.value)
vector_embedding = RETRIEVER.embed(selected_candidate.value)

vector_parameters = query_models.VectorSearchParams(
vector=vector_embedding.tolist(), data_field=data_field
Expand All @@ -264,8 +270,16 @@ def _process_schematron_errors(
query=query, index=OPENSEARCH_INDEX, opensearch_client=opensearch_client
)

# The OpenSearch results object has a couple levels of nesting,
# but all we care about for reranking is extracting the actual
# text strings of the ANN LOINC codes
results_list = opensearch_retrieved_scores.hits.hits
retrieved_loinc_names = [hit.source.description for hit in results_list]
ranked_results = RERANKER.rerank(selected_candidate.value, retrieved_loinc_names)

metadata_error = error.model_dump()
metadata_error["opensearch_retrieved_scores"] = opensearch_retrieved_scores
metadata_error["reranker_processed_results"] = ranked_results
ttc_metadata_output["schematron_errors"][data_field].append(metadata_error)


Expand Down
13 changes: 13 additions & 0 deletions packages/text-to-code-lambda/tests/test_lambda_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

EXPECTED_RESULTED_ERRORS = 2
EXPECTED_ORDERED_ERRORS = 2
EXPECTED_RERANKER_SCORE = 0.01


class TestHandler:
Expand Down Expand Up @@ -71,6 +72,18 @@ def test_handler_success(self, example_sqs_event, mock_aws_setup, mock_opensearc
"opensearch_retrieved_scores"
in ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0]
)
assert (
"reranker_processed_results"
in ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0]
)
predicted_candidate = ttc_metadata_output["schematron_errors"]["Lab Test Name Resulted"][0][
"reranker_processed_results"
][0]
assert (
predicted_candidate["code_string"]
== "(Artemisia vulgaris+Chenopodium album+Plantago lanceolata+Solidago virgaurea+Urtica dioica) Ab.IgE:PrThr:Pt:Ser:Ord:Multidisk"
)
assert round(float(predicted_candidate["score"]), 3) == EXPECTED_RERANKER_SCORE

def test_handler_with_no_records(self, example_sqs_event, mock_opensearch):
"""Test handler with no records."""
Expand Down
6 changes: 4 additions & 2 deletions packages/text-to-code/src/text_to_code/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from .query import DataFieldTypeMapping
from .query import VectorSearchParams
from .registry import EICR_REGISTRY
from .registry import default_model
from .registry import TTC_RERANKER
from .registry import TTC_RETRIEVER
from .schematron import _SCHEMATRON_ENUM_TO_FIELD
from .schematron import LabTestNameOrderedSchematronErrors
from .schematron import LabTestNameResultedSchematronErrors
Expand All @@ -15,6 +16,8 @@

__all__ = [
"EICR_REGISTRY",
"TTC_RERANKER",
"TTC_RETRIEVER",
"_SCHEMATRON_ENUM_TO_FIELD",
"BaseLabField",
"Candidate",
Expand All @@ -27,5 +30,4 @@
"SchematronConfig",
"SchematronErrors",
"VectorSearchParams",
"default_model",
]
9 changes: 7 additions & 2 deletions packages/text-to-code/src/text_to_code/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,10 @@
DataField.LAB_TEST_NAME_ORDERED: LabTestNameOrdered,
}

# Text-to-Code Retrieval model, used for searching approximate neighborhoods
# to find semantically similar candidates
TTC_RETRIEVER: str = "intfloat/e5-large-v2"

# Text-to-Code Reranker model, used for re-scoring and re-sorting the hits
# found by the approximate neighbor search
TTC_RERANKER: str = "cross-encoder/stsb-roberta-large"
11 changes: 7 additions & 4 deletions packages/text-to-code/src/text_to_code/services/embedder.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
from sentence_transformers import SentenceTransformer
from torch import Tensor

from text_to_code.models.registry import TTC_RETRIEVER

# Loaded once at module import so the (large) model is reused across
# Lambda invocations instead of being reloaded per call.
_RETRIEVER = SentenceTransformer(TTC_RETRIEVER)


class Embedder:
    """Transforms nonstandard text."""

    def embed(self, text: str) -> Tensor:
        """Encode a text string into a vector representation.

        The dimensionality and values of the vector form are determined
        by the application's default Retriever Model.

        :param text: Text string to embed.
        :returns: Tensor representation of input text.
        """
        return _RETRIEVER.encode(text)
26 changes: 26 additions & 0 deletions packages/text-to-code/src/text_to_code/services/reranker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from sentence_transformers import CrossEncoder

from text_to_code.models.registry import TTC_RERANKER

_RERANKER = CrossEncoder(TTC_RERANKER)


class Reranker:
    """Scores and sorts OpenSearch results."""

    def rerank(self, nonstandard_in: str, hits: list[str]) -> list[dict]:
        """Re-sorts hits by cross-encoder score values.

        Given a list of text strings returned from OpenSearch, score and sort
        the search hits using the Text-to-Code system's default Reranker model.
        The model produces a cross-encoding score measuring how similar each
        search result's information is to the original nonstandard input.

        :param nonstandard_in: The original narrative free-text input to TTC.
        :param hits: The list of OpenSearch results, in text string form.
        :returns: A list of dictionaries representing the newly cross-encoder
            scored search results, sorted in descending order of score.
        """
        reranked: list[dict] = []
        # CrossEncoder.rank yields entries ordered by descending score; each
        # entry's "corpus_id" indexes back into the original hits list.
        for entry in _RERANKER.rank(nonstandard_in, hits):
            reranked.append(
                {"code_string": hits[entry["corpus_id"]], "score": entry["score"]}
            )
        return reranked
52 changes: 52 additions & 0 deletions packages/text-to-code/tests/unit/test_reranker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pytest
from text_to_code.services.reranker import Reranker


class TestReranker:
    @pytest.fixture(scope="class")
    def reranker(self) -> Reranker:
        # Shared across the class: loading the cross-encoder is expensive.
        return Reranker()

    @staticmethod
    def _rounded(ranks: list[dict]) -> list[dict]:
        # Normalize scores to 3 decimal places for stable comparisons.
        return [
            {"code_string": r["code_string"], "score": round(float(r["score"]), 3)}
            for r in ranks
        ]

    def test_reranker_empty_hits(self, reranker: Reranker) -> None:
        ranks = reranker.rerank("Influenza virus A and B and SARS-CoV-2 (COVID-19)", [])
        assert len(ranks) == 0

    def test_reranker_single_search_result(self, reranker: Reranker) -> None:
        ranks = reranker.rerank(
            "Influenza virus A and B and SARS-CoV-2 (COVID-19)",
            ["Influenza virus A and B and SARS-CoV-2 (COVID-19)"],
        )
        assert self._rounded(ranks) == [
            {"code_string": "Influenza virus A and B and SARS-CoV-2 (COVID-19)", "score": 0.973}
        ]

    def test_reranker_multiple_hits(self, reranker: Reranker) -> None:
        nonstandard_in = "albumin/creatinine ratio (acr)"
        search_hits = [
            "Albumin/Creatinine [Ratio] in Urine",
            "Albumin/Creatinine (U) [Mass ratio]",
            "Albumin/Creatinine [Ratio] in 24 hour Urine",
            "Albumin/Creatinine (U) [Molar ratio]",
        ]
        ranks = reranker.rerank(nonstandard_in, search_hits)
        assert self._rounded(ranks) == [
            {
                "code_string": "Albumin/Creatinine (U) [Mass ratio]",
                "score": 0.755,
            },
            {
                "code_string": "Albumin/Creatinine (U) [Molar ratio]",
                "score": 0.73,
            },
            {
                "code_string": "Albumin/Creatinine [Ratio] in 24 hour Urine",
                "score": 0.701,
            },
            {"code_string": "Albumin/Creatinine [Ratio] in Urine", "score": 0.672},
        ]
5 changes: 4 additions & 1 deletion terraform/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,19 @@ The index it creates has LOINC-specific field mappings including `description_ve
Deployed as a **container image** from ECR (`package_type = "Image"`). The Docker image (`Dockerfile.ttc` at repo root) installs the full `text-to-code-lambda` package along with its workspace dependencies (`shared-models`, `lambda-handler`, `text-to-code`).

At runtime, the Lambda runs the real `text_to_code_lambda.lambda_function.handler`, which:

1. Loads the SentenceTransformer retriever from `/opt/retriever_model` and the CrossEncoder reranker from `/opt/reranker_model` during initialization (cold start)
2. Parses eICR XML documents from S3 to extract text candidates
3. Evaluates and selects the best candidate for each data field
4. Generates embeddings and executes KNN queries against OpenSearch
5. Returns standardized code mappings (LOINC/SNOMED)

Environment variables injected at deploy time: `OPENSEARCH_ENDPOINT_URL`, `OPENSEARCH_INDEX`, `REGION`, `BUCKET_NAME`, `RETRIEVER_MODEL_PATH`, `RERANKER_MODEL_PATH`, `EICR_INPUT_PREFIX`, `SCHEMATRON_ERROR_PREFIX`, `TTC_INPUT_PREFIX`, `TTC_OUTPUT_PREFIX`, `TTC_METADATA_PREFIX`.

### OpenSearch Ingestion Pipeline (`main.tf`)

An **AWS OpenSearch Ingestion Service (OSIS)** pipeline (`aws_osis_pipeline.ttc_ingestion_pipeline`) that:

- Polls `s3://dibbs-text-to-code/ingestion/` monthly for new NDJSON files
- Parses each line as a document and bulk-writes it into the `ttc-index` OpenSearch index
- Runs within the VPC using the same private subnets as Lambda
Expand All @@ -100,6 +102,7 @@ Terraform manages dependency ordering automatically, but conceptually the sequen
## State Backend

Terraform state is stored remotely in **AWS S3** with DynamoDB locking:

- Bucket: `dibbs-ttc-terraform-state`
- Key: `terraform.tfstate`
- Region: `us-east-2`
Expand Down
3 changes: 2 additions & 1 deletion terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ resource "aws_lambda_function" "lambda" {
OPENSEARCH_INDEX = var.index_name
REGION = var.region
BUCKET_NAME = var.s3_bucket
MODEL_PATH = "/opt/model"
RETRIEVER_MODEL_PATH = "/opt/retriever_model"
RERANKER_MODEL_PATH = "/opt/reranker_model"
EICR_INPUT_PREFIX = var.eicr_input_prefix
SCHEMATRON_ERROR_PREFIX = var.schematron_error_prefix
TTC_INPUT_PREFIX = var.ttc_input_prefix
Expand Down
Loading