api inference mini fork #109


Open · wants to merge 26 commits into base: main
Commits (26) · Changes from all commits
6d08e97
customize default num inference steps
oOraph Apr 11, 2025
7e01334
default content type env var
oOraph Apr 14, 2025
2d39740
default accept env var
oOraph Apr 14, 2025
d41f536
content type case ignore
oOraph Apr 14, 2025
b0f1b2d
Diffusers, txt2img (and img2img when supported), make sure guidance s…
oOraph Apr 29, 2025
77e870a
api inference compat response
oOraph Apr 30, 2025
fc71ab9
fix: content-type and accept parsing
oOraph May 2, 2025
60745f3
Multi task support + /pipeline/<task> support for api-inference backw…
oOraph May 2, 2025
ba52d1e
substitute /pipeline/sentence-embeddings to /pipeline/feature-extract…
oOraph May 5, 2025
33d23f3
application/octet-stream support in content type deserialization
oOraph May 5, 2025
5bbf5a9
fix(api inference): compat for text-classification token-classification
oOraph May 5, 2025
422c7b2
fix: token classification api-inference-compat
oOraph May 6, 2025
ae367fd
add timm dependency (for object detection)
oOraph May 9, 2025
1565769
fix(api-inference): feature-extraction, flatten array, discard the ba…
oOraph May 9, 2025
3c75bcb
minor: make quality
oOraph May 12, 2025
f6e1f85
install hf_xet
oOraph May 12, 2025
d14b5c7
fix: avoid returning none as a serializer
oOraph May 13, 2025
603ce84
fix: de/serializer is not optional, do not support content type which…
oOraph May 13, 2025
77d9b12
feat(memory): reduce memory footprint on idle service
oOraph May 23, 2025
bb1eded
Dockerfile refacto: split requirements and source code layers
oOraph May 23, 2025
088cad0
fix: minor, idle unload distinguish sleep time and timeout
oOraph May 23, 2025
2eda42a
fix: image segmentation on hf inference
oOraph Jun 9, 2025
0bdb7c2
feat(hf-inference): disable custom handler
oOraph Jun 9, 2025
3daa1ad
minor: dockerfile
oOraph Jun 11, 2025
bb2a6c3
quality check
oOraph Jun 11, 2025
a781375
fix tests
oOraph Jun 11, 2025
8 changes: 6 additions & 2 deletions dockerfiles/pytorch/Dockerfile
@@ -32,8 +32,7 @@ RUN apt-get update && \
&& apt-get clean autoremove --yes \
&& rm -rf /var/lib/{apt,dpkg,cache,log}

# Copying only necessary files as filtered by .dockerignore
COPY . .
RUN mkdir -p /var/lib/dpkg && touch /var/lib/dpkg/status

# Set Python 3.11 as the default python version
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
@@ -47,6 +46,11 @@ RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

COPY requirements.txt .
RUN pip install -r requirements.txt && rm -rf /root/.cache

# Copying only necessary files as filtered by .dockerignore
COPY . .
# Install wheel and setuptools
RUN pip install --no-cache-dir --upgrade pip ".[torch,st,diffusers]"

22 changes: 22 additions & 0 deletions requirements.txt
@@ -0,0 +1,22 @@
kenlm@ git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7
transformers[audio,sentencepiece,sklearn,vision]==4.51.3
huggingface_hub[hf_transfer,hf_xet]==0.31.1
Pillow
librosa
pyctcdecode>=0.3.0
phonemizer
ffmpeg
starlette
uvicorn
gunicorn
pandas
orjson
einops
timm
sentence_transformers==4.0.2
diffusers==0.33.1
accelerate==1.6.0
torch==2.5.1
torchvision
torchaudio
peft==0.15.1
2 changes: 1 addition & 1 deletion scripts/entrypoint.sh
@@ -59,4 +59,4 @@ if [[ ! -z "${HF_MODEL_DIR}" ]]; then
fi

# Start the server
exec uvicorn webservice_starlette:app --host 0.0.0.0 --port ${PORT}
exec gunicorn webservice_starlette:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-1} --bind 0.0.0.0:${PORT}
49 changes: 14 additions & 35 deletions setup.py
@@ -1,7 +1,19 @@
from __future__ import absolute_import

import os
from setuptools import find_packages, setup

lib_folder = os.path.dirname(os.path.realpath(__file__))
requirements_path = f"{lib_folder}/requirements.txt"
install_requires = [] # Here we'll add: ["gunicorn", "docutils>=0.3", "lxml==0.5a7"]
if os.path.isfile(requirements_path):
    with open(requirements_path) as f:
        install_requires = f.read().splitlines()

test_requirements_path = f"{lib_folder}/test-requirements.txt"
if os.path.isfile(test_requirements_path):
    with open(test_requirements_path) as f:
        test_requirements = f.read().splitlines()

# We don't declare our dependency on transformers here because we build with
# different packages for different variants

@@ -12,47 +24,14 @@
# ffmpeg: ffmpeg is required for audio processing. On Ubuntu it can be installed as follows: apt install ffmpeg
# libavcodec-extra : libavcodec-extra includes additional codecs for ffmpeg

install_requires = [
    # Due to an error affecting kenlm and cmake (see https://github.com/kpu/kenlm/pull/464)
    # Also see the transformers patch for it https://github.com/huggingface/transformers/pull/37091
    "kenlm@git+https://github.com/kpu/kenlm@ba83eafdce6553addd885ed3da461bb0d60f8df7",
    "transformers[sklearn,sentencepiece,audio,vision]==4.51.3",
    "huggingface_hub[hf_transfer]==0.30.2",
    # vision
    "Pillow",
    "librosa",
    # speech + torchaudio
    "pyctcdecode>=0.3.0",
    "phonemizer",
    "ffmpeg",
    # web api
    "starlette",
    "uvicorn",
    "pandas",
    "orjson",
    "einops",
]

extras = {}

extras["st"] = ["sentence_transformers==4.0.2"]
extras["diffusers"] = ["diffusers==0.33.1", "accelerate==1.6.0"]
# Includes `peft` as PEFT requires `torch` so having `peft` as a core dependency
# means that `torch` will be installed even if the `torch` extra is not specified.
extras["torch"] = ["torch==2.5.1", "torchvision", "torchaudio", "peft==0.15.1"]
extras["test"] = [
"pytest==7.2.1",
"pytest-xdist",
"parameterized",
"psutil",
"datasets",
"pytest-sugar",
"mock==2.0.0",
"docker",
"requests",
"tenacity",
]
extras["quality"] = ["isort", "ruff"]
extras["test"] = test_requirements
extras["inf2"] = ["optimum-neuron"]
extras["google"] = ["google-cloud-storage", "crcmod==1.7"]

11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/diffusers_utils.py
@@ -1,4 +1,5 @@
import importlib.util
import os
from typing import Union

from transformers.utils.import_utils import is_torch_bf16_gpu_available
@@ -63,6 +64,16 @@ def __call__(
kwargs.pop("num_images_per_prompt")
logger.warning("Sending num_images_per_prompt > 1 to pipeline is not supported. Using default value 1.")

if "num_inference_steps" not in kwargs:
default_num_steps = os.environ.get("DEFAULT_NUM_INFERENCE_STEPS")
if default_num_steps:
kwargs["num_inference_steps"] = int(default_num_steps)

if "guidance_scale" not in kwargs:
Review comment from the PR author (Contributor): useful for SD 3.5 Turbo -> we want guidance scale 0 by default (i.e. when not specified by the user) because the number of steps is too low, so that generated images come out fine. (A standalone sketch of this default-filling logic follows this hunk.)

    guidance_scale = os.environ.get("DEFAULT_GUIDANCE_SCALE")
    if guidance_scale is not None:
        kwargs["guidance_scale"] = float(guidance_scale)

if "target_size" in kwargs:
    kwargs["height"] = kwargs["target_size"].pop("height", None)
    kwargs["width"] = kwargs["target_size"].pop("width", None)
11 changes: 11 additions & 0 deletions src/huggingface_inference_toolkit/env_utils.py
@@ -1,3 +1,6 @@
import os


def strtobool(val: str) -> bool:
    """Convert a string representation of truth to True or False booleans.
    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
@@ -20,3 +23,11 @@ def strtobool(val: str) -> bool:
    raise ValueError(
        f"Invalid truth value, it should be a string but {val} was provided instead."
    )


def api_inference_compat():
Review comment from the PR author (Contributor): with this env var we intend to handle the small response differences between the api-inference widgets on the Hub and on the Endpoints UI. TODO: we should probably unify both widgets instead. (A short usage sketch follows this file's diff.)

return strtobool(os.getenv("API_INFERENCE_COMPAT", "false"))


def ignore_custom_handler():
    return strtobool(os.getenv("IGNORE_CUSTOM_HANDLER", "false"))
78 changes: 69 additions & 9 deletions src/huggingface_inference_toolkit/handler.py
@@ -2,12 +2,10 @@
from pathlib import Path
from typing import Any, Dict, Literal, Optional, Union

from huggingface_inference_toolkit import logging
from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
from huggingface_inference_toolkit.utils import (
    check_and_register_custom_pipeline_from_directory,
    get_pipeline,
)
from huggingface_inference_toolkit.env_utils import api_inference_compat, ignore_custom_handler
from huggingface_inference_toolkit.utils import check_and_register_custom_pipeline_from_directory


class HuggingFaceHandler:
@@ -19,6 +17,7 @@ class HuggingFaceHandler:
def __init__(
    self, model_dir: Union[str, Path], task: Union[str, None] = None, framework: Literal["pt"] = "pt"
) -> None:
    from huggingface_inference_toolkit.heavy_utils import get_pipeline
    self.pipeline = get_pipeline(
        model_dir=model_dir,  # type: ignore
        task=task,  # type: ignore
@@ -33,6 +32,10 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
:data: (obj): the raw request body data.
:return: prediction output
"""

# import as late as possible to reduce the footprint
from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS

inputs = data.pop("inputs", data)
parameters = data.pop("parameters", {})

@@ -101,9 +104,63 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
"or `candidateLabels`."
)

return (
    self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else self.pipeline(inputs, **parameters)  # type: ignore
)
if api_inference_compat():
    if self.pipeline.task == "text-classification" and isinstance(inputs, str):
        inputs = [inputs]
        parameters.setdefault("top_k", os.environ.get("DEFAULT_TOP_K", 5))
    if self.pipeline.task == "token-classification":
        parameters.setdefault("aggregation_strategy", os.environ.get("DEFAULT_AGGREGATION_STRATEGY", "simple"))

resp = self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else \
    self.pipeline(inputs, **parameters)

if api_inference_compat():
    if self.pipeline.task == "text-classification":
        # We don't want to return {} but [{}] in any case
        if isinstance(resp, list) and len(resp) > 0:
            if not isinstance(resp[0], list):
                return [resp]
        return resp
    if self.pipeline.task == "feature-extraction":
        # If the library used is Transformers then the feature-extraction is returning the headless encoder
        # outputs as embeddings. The shape is a 3D or 4D array
        # [n_inputs, batch_size = 1, n_sentence_tokens, num_hidden_dim].
        # Let's just discard the batch size dim that always seems to be 1 and return a 2D/3D array
        # https://github.com/huggingface/transformers/blob/5c47d08b0d6835b8d8fc1c06d9a1bc71f6e78ace/src/transformers/pipelines/feature_extraction.py#L27
        # for api inference (reason: mainly display)
        new_resp = []
        if isinstance(inputs, list):
            if isinstance(resp, list) and len(resp) == len(inputs):
                for it in resp:
                    # Batch size dim is the first it level, discard it
                    if isinstance(it, list) and len(it) == 1:
                        new_resp.append(it[0])
                    else:
                        logging.logger.warning("One of the output batch size differs from 1: %d", len(it))
                        return resp
                return new_resp
            else:
                logging.logger.warning("Inputs and resp len differ (or resp is not a list, type %s)",
                                       type(resp))
                return resp
        elif isinstance(inputs, str):
            if isinstance(resp, list) and len(resp) == 1:
                return resp[0]
            else:
                logging.logger.warning("The output batch size differs from 1: %d", len(resp))
                return resp
        else:
            logging.logger.warning("Output unexpected type %s", type(resp))
            return resp
    if self.pipeline.task == "image-segmentation":
        if isinstance(resp, list):
            new_resp = []
            for el in resp:
                if isinstance(el, dict) and el.get("score") is None:
                    el["score"] = 1
                new_resp.append(el)
            resp = new_resp
return resp
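
For the feature-extraction branch above, a small self-contained illustration of the reshaping with dummy numbers (not real embeddings):

# Transformers' feature-extraction pipeline returns, per input, a batch dimension of 1:
# [n_inputs][1][n_tokens][hidden_dim]. The compat path drops that singleton dimension.
resp = [
    [[[0.1, 0.2], [0.3, 0.4]]],  # input 1: batch of 1, 2 tokens, hidden_dim 2
    [[[0.5, 0.6], [0.7, 0.8]]],  # input 2
]
flattened = [it[0] for it in resp if isinstance(it, list) and len(it) == 1]
print(flattened)
# -> [[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6], [0.7, 0.8]]]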


class VertexAIHandler(HuggingFaceHandler):
@@ -149,7 +206,10 @@ def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task
Returns:
    InferenceHandler: The appropriate inference handler based on the given model directory and task.
"""
custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if ignore_custom_handler():
    custom_pipeline = None
else:
    custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
if custom_pipeline is not None:
    return custom_pipeline
