Skip to content

Commit 6df887c

Browse files
committed
Inf2 hf endpoints docker image
Signed-off-by: Raphael Glon <[email protected]>
1 parent c4ca4f5 commit 6df887c

File tree

3 files changed

+256
-1
lines changed

3 files changed

+256
-1
lines changed

dockerfiles/pytorch/Dockerfile.inf2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,5 +98,7 @@ COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starle
9898

9999
# copy entrypoint and change permissions
100100
COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
101+
COPY --chmod=0755 scripts/inf2_env.py inf2.env.py
102+
COPY --chmod=0755 scripts/inf2_entrypoint.sh inf2_entrypoint.sh
101103

102-
ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]
104+
ENTRYPOINT ["bash", "-c", "./inf2_entrypoint.sh"]

scripts/inf2_entrypoint.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
# Inferentia2 entrypoint wrapper: runs inf2_env.py to compute any missing
# HF_* env vars, sources them, then hands off to the regular entrypoint.
set -e -o pipefail -u

# Temp file where inf2_env.py writes its "export VAR=value" lines.
# mktemp already creates the file, so no extra touch is needed.
export ENV_FILEPATH=$(mktemp)

# Clean up the temp file on early exit. NOTE: exec below replaces the shell,
# so this trap does not fire on the success path — hence the explicit rm
# right before exec.
trap 'rm -f "${ENV_FILEPATH}"' EXIT

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Quote "$@" (the original unquoted $@ would word-split arguments that
# contain spaces); same for all path expansions.
"${SCRIPT_DIR}/inf2_env.py" "$@"

source "${ENV_FILEPATH}"

rm -f "${ENV_FILEPATH}"

exec "${SCRIPT_DIR}/entrypoint.sh" "$@"

scripts/inf2_env.py

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
This script is here to specify all missing environment variables that would be required to run some encoder models on
5+
inferentia2.
6+
"""
7+
8+
import argparse
9+
import logging
10+
import os
11+
import sys
12+
from typing import Any, Dict, List, Optional
13+
14+
from huggingface_hub import constants
15+
from transformers import AutoConfig
16+
17+
from optimum.neuron.utils import get_hub_cached_entries
18+
from optimum.neuron.utils.version_utils import get_neuronxcc_version
19+
20+
# Verbose logging is cheap here: this script runs once at container startup.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', force=True)
logger = logging.getLogger(__name__)

# Pairs of (environment variable, neuron config key) that must agree for a
# compiled/cached model to be usable.
env_config_peering = [
    ("HF_BATCH_SIZE", "static_batch_size"),
    ("HF_OPTIMUM_SEQUENCE_LENGTH", "static_sequence_length"),
]

# By the end of this script all env vars should be specified properly.
# (Comprehension instead of list(map(lambda ...)) — same result, idiomatic.)
env_vars = [env_var for env_var, _ in env_config_peering]

# Currently not used for encoder models
# available_cores = get_available_cores()

# Local neuronx compiler version, compared against the version recorded in
# each cached model's neuron config.
neuronxcc_version = get_neuronxcc_version()
35+
36+
37+
def parse_cmdline_and_set_env(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse TGI-style command line params and mirror them into env vars.

    All these flags are params normally passed to tgi and intercepted here.
    Unknown arguments are left untouched so the full TGI command line can be
    forwarded unchanged.

    :param argv: argument list to parse; falls back to ``sys.argv`` when
        ``None`` or empty.
    :return: the parsed (known) arguments.
    :raises Exception: when no model id is provided via ``--model-id`` or the
        ``MODEL_ID`` env var.
    """
    parser = argparse.ArgumentParser()
    if not argv:
        argv = sys.argv
    # Defaults come from the env; argparse applies type=int to string defaults,
    # so env-provided values are converted just like cmdline ones.
    parser.add_argument(
        "--batch-size",
        type=int,
        default=os.getenv("HF_BATCH_SIZE", os.getenv("BATCH_SIZE", 0)),
    )
    parser.add_argument(
        "--sequence-length",
        type=int,
        default=os.getenv("HF_OPTIMUM_SEQUENCE_LENGTH", os.getenv("SEQUENCE_LENGTH", 0)),
    )

    parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID", os.getenv("HF_MODEL_DIR")))
    parser.add_argument("--revision", type=str, default=os.getenv("REVISION"))

    args = parser.parse_known_args(argv)[0]

    if not args.model_id:
        raise Exception(
            "No model id provided ! Either specify it using --model-id cmdline or MODEL_ID env var"
        )

    # Override env with cmdline params
    os.environ["MODEL_ID"] = args.model_id

    # Set all tgi router and tgi server values to consistent values as early as possible
    # from the order of the parser defaults, the tgi router value can override the tgi server ones
    if args.batch_size > 0:
        os.environ["HF_BATCH_SIZE"] = str(args.batch_size)

    if args.sequence_length > 0:
        os.environ["HF_OPTIMUM_SEQUENCE_LENGTH"] = str(args.sequence_length)

    if args.revision:
        os.environ["REVISION"] = str(args.revision)

    return args
78+
79+
80+
def neuron_config_to_env(neuron_config):
    """Write the peered neuron config values as shell `export` lines.

    Emits one `export ENV_VAR=value` line per (env var, config key) pair to
    the file named by the ENV_FILEPATH env var, so the wrapping shell script
    can `source` it.
    """
    export_lines = [
        "export {}={}\n".format(env_var, neuron_config[config_key])
        for env_var, config_key in env_config_peering
    ]
    with open(os.environ["ENV_FILEPATH"], "w") as env_file:
        env_file.writelines(export_lines)
84+
85+
86+
def sort_neuron_configs(dictionary):
    """Sort key for neuron configs: larger static batch sizes sort first."""
    batch_size = dictionary["static_batch_size"]
    return -batch_size
88+
89+
90+
def lookup_compatible_cached_model(
    model_id: str, revision: Optional[str]
) -> Optional[Dict[str, Any]]:
    """Return the best hub-cached neuron config compatible with this setup.

    Reuses the same mechanic as the one in use to configure the tgi server
    part; the only difference here is that we stay as flexible as possible on
    the compatibility part. Returns None when no cached entry matches.
    """
    entries = get_hub_cached_entries(model_id, "inference")

    logger.debug(
        "Found %d cached entries for model %s, revision %s",
        len(entries),
        model_id,
        revision,
    )

    compatible_entries = [
        entry
        for entry in entries
        if check_env_and_neuron_config_compatibility(entry, check_compiler_version=True)
    ]

    if not compatible_entries:
        logger.debug(
            "No compatible cached entry found for model %s, env %s, neuronxcc version %s",
            model_id,
            get_env_dict(),
            neuronxcc_version,
        )
        return None

    logger.info("%d compatible neuron cached models found", len(compatible_entries))

    # min() under the same key as sorted(...)[0]: picks the entry with the
    # largest static batch size, first-wins on ties (both are stable).
    selected = min(compatible_entries, key=sort_neuron_configs)

    logger.info("Selected entry %s", selected)

    return selected
129+
130+
131+
def check_env_and_neuron_config_compatibility(
    neuron_config: Dict[str, Any], check_compiler_version: bool
) -> bool:
    """Tell whether a neuron config is usable with the local setup and env.

    Optionally checks the compiler version, then verifies every env var in
    `env_config_peering` is either unset or equal to the matching neuron
    config value.
    """
    logger.debug(
        "Checking the provided neuron config %s is compatible with the local setup and provided environment",
        neuron_config,
    )

    # Local setup compat checks
    # if neuron_config["num_cores"] > available_cores:
    #     logger.debug(
    #         "Not enough neuron cores available to run the provided neuron config"
    #     )
    #     return False

    version_mismatch = (
        check_compiler_version
        and neuron_config["compiler_version"] != neuronxcc_version
    )
    if version_mismatch:
        logger.debug(
            "Compiler version conflict, the local one (%s) differs from the one used to compile the model (%s)",
            neuronxcc_version,
            neuron_config["compiler_version"],
        )
        return False

    for env_var, config_key in env_config_peering:
        if config_key not in neuron_config:
            logger.debug("No key %s found in neuron config %s", config_key, neuron_config)
            return False
        expected_value = str(neuron_config[config_key])
        # An unset env var counts as matching (it will be filled in later).
        env_value = os.getenv(env_var, expected_value)
        if env_value != expected_value:
            logger.debug(
                "The provided env var '%s' and the neuron config '%s' param differ (%s != %s)",
                env_var,
                config_key,
                env_value,
                expected_value,
            )
            return False

    return True
175+
176+
177+
def get_env_dict() -> Dict[str, Optional[str]]:
    """Snapshot the managed env vars for logging.

    :return: mapping of env var name -> current value (None when unset; the
        original annotation Dict[str, str] was wrong since os.getenv returns
        None for missing vars).
    """
    return {env_var: os.getenv(env_var) for env_var in env_vars}
182+
183+
184+
def main():
    """
    This script determines proper default TGI env variables for the neuron precompiled models to
    work properly
    :return:
    """
    args = parse_cmdline_and_set_env()

    # for/else: the else branch runs only when no env var is missing,
    # in which case there is nothing left to compute.
    for env_var in env_vars:
        if not os.getenv(env_var):
            break
    else:
        logger.info(
            "All env vars %s already set, skipping, users know what they are doing",
            env_vars,
        )
        sys.exit(0)

    cache_dir = constants.HF_HUB_CACHE

    logger.info("Cache dir %s, model %s", cache_dir, args.model_id)

    config = AutoConfig.from_pretrained(args.model_id, revision=args.revision)
    neuron_config = getattr(config, "neuron", None)
    if neuron_config is not None:
        # The model ships its own neuron config: it must match the env,
        # otherwise we cannot run it at all.
        compatible = check_env_and_neuron_config_compatibility(
            neuron_config, check_compiler_version=False
        )
        if not compatible:
            env_dict = get_env_dict()
            msg = (
                "Invalid neuron config and env. Config {}, env {}, neuronxcc version {}"
            ).format(neuron_config, env_dict, neuronxcc_version)
            logger.error(msg)
            raise Exception(msg)
    else:
        neuron_config = lookup_compatible_cached_model(args.model_id, args.revision)

    if not neuron_config:
        neuron_config = {'static_batch_size': 1, 'static_sequence_length': 128}
        # Bug fix: the format string had two placeholders for three args, so
        # the fallback config was silently dropped from the message.
        msg = (
            "No compatible neuron config found. Provided env {}, neuronxcc version {}. Falling back to default {}"
        ).format(get_env_dict(), neuronxcc_version, neuron_config)
        logger.info(msg)

    logger.info("Final neuron config %s", neuron_config)

    neuron_config_to_env(neuron_config)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)