api inference mini fork #109


Open · wants to merge 26 commits into base: main
Commits (26) · Changes from all commits
6d08e97
customize default num inference steps
oOraph Apr 11, 2025
7e01334
default content type env var
oOraph Apr 14, 2025
2d39740
default accept env var
oOraph Apr 14, 2025
d41f536
content type case ignore
oOraph Apr 14, 2025
b0f1b2d
Diffusers, txt2img (and img2img when supported), make sure guidance s…
oOraph Apr 29, 2025
77e870a
api inference compat response
oOraph Apr 30, 2025
fc71ab9
fix: content-type and accept parsing
oOraph May 2, 2025
60745f3
Multi task support + /pipeline/<task> support for api-inference backw…
oOraph May 2, 2025
ba52d1e
substitute /pipeline/sentence-embeddings to /pipeline/feature-extract…
oOraph May 5, 2025
33d23f3
application/octet-stream support in content type deserialization
oOraph May 5, 2025
5bbf5a9
fix(api inference): compat for text-classification token-classification
oOraph May 5, 2025
422c7b2
fix: token classification api-inference-compat
oOraph May 6, 2025
ae367fd
add timm dependency (for object detection)
oOraph May 9, 2025
1565769
fix(api-inference): feature-extraction, flatten array, discard the ba…
oOraph May 9, 2025
3c75bcb
minor: make quality
oOraph May 12, 2025
f6e1f85
install hf_xet
oOraph May 12, 2025
d14b5c7
fix: avoid returning none as a serializer
oOraph May 13, 2025
603ce84
fix: de/serializer is not optional, do not support content type which…
oOraph May 13, 2025
77d9b12
feat(memory): reduce memory footprint on idle service
oOraph May 23, 2025
bb1eded
Dockerfile refacto: split requirements and source code layers
oOraph May 23, 2025
088cad0
fix: minor, idle unload distinguish sleep time and timeout
oOraph May 23, 2025
2eda42a
fix: image segmentation on hf inference
oOraph Jun 9, 2025
0bdb7c2
feat(hf-inference): disable custom handler
oOraph Jun 9, 2025
3daa1ad
minor: dockerfile
oOraph Jun 11, 2025
bb2a6c3
quality check
oOraph Jun 11, 2025
a781375
fix tests
oOraph Jun 11, 2025
8 changes: 6 additions & 2 deletions dockerfiles/pytorch/Dockerfile
@@ -32,8 +32,7 @@ RUN apt-get update && \
&& apt-get clean autoremove --yes \
&& rm -rf /var/lib/{apt,dpkg,cache,log}

# Copying only necessary files as filtered by .dockerignore
COPY . .
RUN mkdir -p /var/lib/dpkg && touch /var/lib/dpkg/status

# Set Python 3.11 as the default python version
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
@@ -47,6 +46,11 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

COPY requirements.txt .
RUN pip install -r requirements.txt && rm -rf /root/.cache

# Copying only necessary files as filtered by .dockerignore
COPY . .
# Install wheel and setuptools
RUN pip install --no-cache-dir --upgrade pip ".[torch,st,diffusers]"

22 changes: 22 additions & 0 deletions requirements.txt
@@ -0,0 +1,22 @@
kenlm@ git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7
transformers[audio,sentencepiece,sklearn,vision]==4.51.3
huggingface_hub[hf_transfer,hf_xet]==0.31.1
Pillow
librosa
pyctcdecode>=0.3.0
phonemizer
ffmpeg
starlette
uvicorn
gunicorn
pandas
orjson
einops
timm
sentence_transformers==4.0.2
diffusers==0.33.1
accelerate==1.6.0
torch==2.5.1
torchvision
torchaudio
peft==0.15.1
2 changes: 1 addition & 1 deletion scripts/entrypoint.sh
@@ -59,4 +59,4 @@ if [[ ! -z "${HF_MODEL_DIR}" ]]; then
fi

# Start the server
exec uvicorn webservice_starlette:app --host 0.0.0.0 --port ${PORT}
exec gunicorn webservice_starlette:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-1} --bind 0.0.0.0:${PORT}
49 changes: 14 additions & 35 deletions setup.py
@@ -1,7 +1,19 @@
from __future__ import absolute_import

import os
from setuptools import find_packages, setup

lib_folder = os.path.dirname(os.path.realpath(__file__))
requirements_path = f"{lib_folder}/requirements.txt"
install_requires = [] # Here we'll add: ["gunicorn", "docutils>=0.3", "lxml==0.5a7"]
if os.path.isfile(requirements_path):
    with open(requirements_path) as f:
        install_requires = f.read().splitlines()

test_requirements_path = f"{lib_folder}/test-requirements.txt"
if os.path.isfile(test_requirements_path):
    with open(test_requirements_path) as f:
        test_requirements = f.read().splitlines()

# We don't declare our dependency on transformers here because we build with
# different packages for different variants

@@ -12,47 +24,14 @@
# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
# libavcodec-extra : libavcodec-extra includes additional codecs for ffmpeg

install_requires = [
    # Due to an error affecting kenlm and cmake (see https://github.com/kpu/kenlm/pull/464)
    # Also see the transformers patch for it https://github.com/huggingface/transformers/pull/37091
    "kenlm@git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7",
    "transformers[sklearn,sentencepiece,audio,vision]==4.51.3",
    "huggingface_hub[hf_transfer]==0.30.2",
    # vision
    "Pillow",
    "librosa",
    # speech + torchaudio
    "pyctcdecode>=0.3.0",
    "phonemizer",
    "ffmpeg",
    # web api
    "starlette",
    "uvicorn",
    "pandas",
    "orjson",
    "einops",
]

extras = {}

extras["st"] = ["sentence_transformers==4.0.2"]
extras["diffusers"] = ["diffusers==0.33.1", "accelerate==1.6.0"]
# Includes `peft` as PEFT requires `torch` so having `peft` as a core dependency
# means that `torch` will be installed even if the `torch` extra is not specified.
extras["torch"] = ["torch==2.5.1", "torchvision", "torchaudio", "peft==0.15.1"]
extras["test"] = [
"pytest==7.2.1",
"pytest-xdist",
"parameterized",
"psutil",
"datasets",
"pytest-sugar",
"mock==2.0.0",
"docker",
"requests",
"tenacity",
]
extras["quality"] = ["isort", "ruff"]
extras["test"] = test_requirements
extras["inf2"] = ["optimum-neuron"]
extras["google"] = ["google-cloud-storage", "crcmod==1.7"]

11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,4 +1,5 @@
import importlib.util
import os
from typing import Union

from transformers.utils.import_utils import is_torch_bf16_gpu_available
@@ -63,6 +64,16 @@ def __call__(
kwargs.pop("num_images_per_prompt")
logger.warning("Sending num_images_per_prompt > 1 to pipeline is not supported. Using default value 1.")

if "num_inference_steps" not in kwargs:
default_num_steps = os.environ.get("DEFAULT_NUM_INFERENCE_STEPS")
if default_num_steps:
kwargs["num_inference_steps"] = int(default_num_steps)

if "guidance_scale" not in kwargs:
Review comment from the PR author (Contributor): useful for SD 3.5 Turbo -> we want guidance scale 0 by default (i.e. when not specified by the user) because the number of steps is too low, so that generated images come out fine. (A standalone sketch of this default-filling logic follows this hunk.)

    guidance_scale = os.environ.get("DEFAULT_GUIDANCE_SCALE")
    if guidance_scale is not None:
        kwargs["guidance_scale"] = float(guidance_scale)

if "target_size" in kwargs:
    kwargs["height"] = kwargs["target_size"].pop("height", None)
    kwargs["width"] = kwargs["target_size"].pop("width", None)
11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/env_utils.py
@@ -1,3 +1,6 @@
import os


def strtobool(val: str) -> bool:
    """Convert a string representation of truth to True or False booleans.
    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
@@ -20,3 +23,11 @@ def strtobool(val: str) -> bool:
    raise ValueError(
        f"Invalid truth value, it should be a string but {val} was provided instead."
    )


def api_inference_compat():
Review comment from the PR author (Contributor): with this env var we intend to handle the small response differences between the api-inference widgets on the Hub and on the Endpoints UI. TODO: we should probably unify both widgets instead. (A short usage sketch follows this file's diff.)

return strtobool(os.getenv("API_INFERENCE_COMPAT", "false"))


def ignore_custom_handler():
    return strtobool(os.getenv("IGNORE_CUSTOM_HANDLER", "false"))
78 changes: 69 additions & 9 deletions src/huggingface_inference_toolkit/handler.py
@@ -2,12 +2,10 @@
from pathlib import Path
from typing import Any, Dict, Literal, Optional, Union

from huggingface_inference_toolkit import logging
from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
from huggingface_inference_toolkit.utils import (
    check_and_register_custom_pipeline_from_directory,
    get_pipeline,
)
from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler
from huggingface_inference_toolkit.utils import check_and_register_custom_pipeline_from_directory


class HuggingFaceHandler:
@@ -19,6 +17,7 @@ class HuggingFaceHandler:
def __init__(
    self, model_dir: Union[str, Path], task: Union[str, None] = None, framework: Literal["pt"] = "pt"
) -> None:
    from huggingface_inference_toolkit.heavy_utils import get_pipeline
    self.pipeline = get_pipeline(
        model_dir=model_dir,  # type: ignore
        task=task,  # type: ignore
@@ -33,6 +32,10 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
:data: (obj): the raw request body data.
:return: prediction output
"""

# import as late as possible to reduce the footprint
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS

inputs = data.pop("inputs", data)
parameters = data.pop("parameters", {})

@@ -101,9 +104,63 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"or `candidateLabels`."
)

return (
    self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else self.pipeline(inputs, **parameters)  # type: ignore
)
if api_inference_compat():
    if self.pipeline.task == "text-classification" and isinstance(inputs, str):
        inputs = [inputs]
        parameters.setdefault("top_k", os.environ.get("DEFAULT_TOP_K", 5))
    if self.pipeline.task == "token-classification":
        parameters.setdefault("aggregation_strategy", os.environ.get("DEFAULT_AGGREGATION_STRATEGY", "simple"))

resp = self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else \
    self.pipeline(inputs, **parameters)

if api_inference_compat():
    if self.pipeline.task == "text-classification":
        # We don't want to return {} but [{}] in any case
        if isinstance(resp, list) and len(resp) > 0:
            if not isinstance(resp[0], list):
                return [resp]
        return resp
    if self.pipeline.task == "feature-extraction":
        # If the library used is Transformers then the feature-extraction is returning the headless encoder
        # outputs as embeddings. The shape is a 3D or 4D array
        # [n_inputs, batch_size = 1, n_sentence_tokens, num_hidden_dim].
        # Let's just discard the batch size dim that always seems to be 1 and return a 2D/3D array
        # https://github.com/huggingface/transformers/blob/5c47d08b0d6835b8d8fc1c06d9a1bc71f6e78ace/src/transformers/pipelines/feature_extraction.py#L27
        # for api inference (reason: mainly display)
        new_resp = []
        if isinstance(inputs, list):
            if isinstance(resp, list) and len(resp) == len(inputs):
                for it in resp:
                    # Batch size dim is the first it level, discard it
                    if isinstance(it, list) and len(it) == 1:
                        new_resp.append(it[0])
                    else:
                        logging.logger.warning("One of the output batch size differs from 1: %d", len(it))
                        return resp
                return new_resp
            else:
                logging.logger.warning("Inputs and resp len differ (or resp is not a list, type %s)",
                                       type(resp))
                return resp
        elif isinstance(inputs, str):
            if isinstance(resp, list) and len(resp) == 1:
                return resp[0]
            else:
                logging.logger.warning("The output batch size differs from 1: %d", len(resp))
                return resp
        else:
            logging.logger.warning("Output unexpected type %s", type(resp))
            return resp
    if self.pipeline.task == "image-segmentation":
        if isinstance(resp, list):
            new_resp = []
            for el in resp:
                if isinstance(el, dict) and el.get("score") is None:
                    el["score"] = 1
                new_resp.append(el)
            resp = new_resp
return resp
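
For the feature-extraction branch above, a small self-contained illustration of the reshaping with dummy numbers (not real embeddings):

# Transformers' feature-extraction pipeline returns, per input, a batch dimension of 1:
# [n_inputs][1][n_tokens][hidden_dim]. The compat path drops that singleton dimension.
resp = [
    [[[0.1, 0.2], [0.3, 0.4]]],  # input 1: batch of 1, 2 tokens, hidden_dim 2
    [[[0.5, 0.6], [0.7, 0.8]]],  # input 2
]
flattened = [it[0] for it in resp if isinstance(it, list) and len(it) == 1]
print(flattened)
# -> [[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6], [0.7, 0.8]]]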


class VertexAIHandler(HuggingFaceHandler):
@@ -149,7 +206,10 @@ def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task
Returns:
    InferenceHandler: The appropriate inference handler based on the given model directory and task.
"""
custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if ignore_custom_handler():
    custom_pipeline = None
else:
    custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if custom_pipeline is not None:
    return custom_pipeline
