55import re
66import sys
77
8-
98import infinity_emb
109from infinity_emb ._optional_imports import CHECK_TYPER , CHECK_UVICORN
1110from infinity_emb .args import EngineArgs
@@ -107,40 +106,41 @@ def _construct(name: str):
107106
108107 tp = typer .Typer ()
109108
109+
110110 @tp .command ("v1" )
111111 def v1 (
112- # v1 is deprecated. Please do no longer modify it.
113- model_name_or_path : str = MANAGER .model_id [0 ],
114- served_model_name : str = MANAGER .served_model_name [0 ],
115- batch_size : int = MANAGER .batch_size [0 ],
116- revision : str = MANAGER .revision [0 ],
117- trust_remote_code : bool = MANAGER .trust_remote_code [0 ],
118- redirect_slash : str = MANAGER .redirect_slash ,
119- engine : "InferenceEngine" = MANAGER .engine [0 ], # type: ignore # noqa
120- model_warmup : bool = MANAGER .model_warmup [0 ],
121- vector_disk_cache : bool = MANAGER .vector_disk_cache [0 ],
122- device : "Device" = MANAGER .device [0 ], # type: ignore
123- lengths_via_tokenize : bool = MANAGER .lengths_via_tokenize [0 ],
124- dtype : Dtype = MANAGER .dtype [0 ], # type: ignore
125- embedding_dtype : "EmbeddingDtype" = EmbeddingDtype .default_value (), # type: ignore
126- pooling_method : "PoolingMethod" = MANAGER .pooling_method [0 ], # type: ignore
127- compile : bool = MANAGER .compile [0 ],
128- bettertransformer : bool = MANAGER .bettertransformer [0 ],
129- preload_only : bool = MANAGER .preload_only ,
130- permissive_cors : bool = MANAGER .permissive_cors ,
131- api_key : str = MANAGER .api_key ,
132- url_prefix : str = MANAGER .url_prefix ,
133- host : str = MANAGER .host ,
134- port : int = MANAGER .port ,
135- log_level : "UVICORN_LOG_LEVELS" = MANAGER .log_level , # type: ignore
112+ # v1 is deprecated. Please do no longer modify it.
113+ model_name_or_path : str = MANAGER .model_id [0 ],
114+ served_model_name : str = MANAGER .served_model_name [0 ],
115+ batch_size : int = MANAGER .batch_size [0 ],
116+ revision : str = MANAGER .revision [0 ],
117+ trust_remote_code : bool = MANAGER .trust_remote_code [0 ],
118+ redirect_slash : str = MANAGER .redirect_slash ,
119+ engine : "InferenceEngine" = MANAGER .engine [0 ], # type: ignore # noqa
120+ model_warmup : bool = MANAGER .model_warmup [0 ],
121+ vector_disk_cache : bool = MANAGER .vector_disk_cache [0 ],
122+ device : "Device" = MANAGER .device [0 ], # type: ignore
123+ lengths_via_tokenize : bool = MANAGER .lengths_via_tokenize [0 ],
124+ dtype : Dtype = MANAGER .dtype [0 ], # type: ignore
125+ embedding_dtype : "EmbeddingDtype" = EmbeddingDtype .default_value (), # type: ignore
126+ pooling_method : "PoolingMethod" = MANAGER .pooling_method [0 ], # type: ignore
127+ compile : bool = MANAGER .compile [0 ],
128+ bettertransformer : bool = MANAGER .bettertransformer [0 ],
129+ preload_only : bool = MANAGER .preload_only ,
130+ permissive_cors : bool = MANAGER .permissive_cors ,
131+ api_key : str = MANAGER .api_key ,
132+ url_prefix : str = MANAGER .url_prefix ,
133+ host : str = MANAGER .host ,
134+ port : int = MANAGER .port ,
135+ log_level : "UVICORN_LOG_LEVELS" = MANAGER .log_level , # type: ignore
136136 ):
137137 """Infinity API ♾️ cli v1 - deprecated, consider use cli v2 via `infinity_emb v2`."""
138138 if api_key :
139139 # encourage switch to v2
140140 raise ValueError ("api_key is not supported in `v1`. Please migrate to `v2`." )
141141 if not (
142- embedding_dtype == EmbeddingDtype .float32
143- or embedding_dtype == EmbeddingDtype .default_value ()
142+ embedding_dtype == EmbeddingDtype .float32
143+ or embedding_dtype == EmbeddingDtype .default_value ()
144144 ):
145145 # encourage switch to v2
146146 raise ValueError (
@@ -177,107 +177,108 @@ def v1(
177177 proxy_root_path = "" , # set as empty string
178178 )
179179
180+
180181 @tp .command ("v2" )
181182 def v2 (
182- # t
183- # arguments for engine
184- model_id : list [str ] = typer .Option (
185- ** _construct ("model_id" ),
186- help = "Huggingface model repo id. Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference&" ,
187- ),
188- served_model_name : list [str ] = typer .Option (
189- ** _construct ("served_model_name" ),
190- help = "the nickname for the API, under which the model_id can be selected" ,
191- ),
192- batch_size : list [int ] = typer .Option (
193- ** _construct ("batch_size" ), help = "maximum batch size for inference"
194- ),
195- revision : list [str ] = typer .Option (
196- ** _construct ("revision" ), help = "huggingface model repo revision."
197- ),
198- trust_remote_code : list [bool ] = typer .Option (
199- ** _construct ("trust_remote_code" ),
200- help = "if potential remote modeling code from huggingface repo is trusted." ,
201- ),
202- engine : list [InferenceEngine ] = typer .Option (
203- ** _construct ("engine" ),
204- help = "Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU." ,
205- ),
206- model_warmup : list [bool ] = typer .Option (
207- ** _construct ("model_warmup" ),
208- help = "if model should be warmed up after startup, and before ready." ,
209- ),
210- vector_disk_cache : list [bool ] = typer .Option (
211- ** _construct ("vector_disk_cache" ),
212- help = "If hash(request)/results should be cached to SQLite for latency improvement." ,
213- ),
214- device : list [Device ] = typer .Option (
215- ** _construct ("device" ),
216- help = "device to use for computing the model forward pass." ,
217- ),
218- device_id : list [str ] = typer .Option (
219- ** _construct ("device_id" ),
220- help = "device id defines the model placement. e.g. `0,1` will place the model on MPS/CUDA/GPU 0 and 1 each" ,
221- ),
222- lengths_via_tokenize : list [bool ] = typer .Option (
223- ** _construct ("lengths_via_tokenize" ),
224- help = "if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy." ,
225- ),
226- dtype : list [Dtype ] = typer .Option (
227- ** _construct ("dtype" ), help = "dtype for the model weights."
228- ),
229- embedding_dtype : list [EmbeddingDtype ] = typer .Option (
230- ** _construct ("embedding_dtype" ),
231- help = "dtype post-forward pass. If != `float32`, using Post-Forward Static quantization." ,
232- ),
233- pooling_method : list [PoolingMethod ] = typer .Option (
234- ** _construct ("pooling_method" ),
235- help = "overwrite the pooling method if inferred incorrectly." ,
236- ),
237- compile : list [bool ] = typer .Option (
238- ** _construct ("compile" ),
239- help = "Enable usage of `torch.compile(dynamic=True)` if engine relies on it." ,
240- ),
241- bettertransformer : list [bool ] = typer .Option (
242- ** _construct ("bettertransformer" ),
243- help = "Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model." ,
244- ),
245- # arguments for uvicorn / server
246- preload_only : bool = typer .Option (
247- ** _construct ("preload_only" ),
248- help = "If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile." ,
249- ),
250- host : str = typer .Option (** _construct ("host" ), help = "host for the FastAPI uvicorn server" ),
251- port : int = typer .Option (** _construct ("port" ), help = "port for the FastAPI uvicorn server" ),
252- url_prefix : str = typer .Option (
253- ** _construct ("url_prefix" ),
254- callback = validate_url ,
255- help = "prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API." ,
256- ),
257- redirect_slash : str = typer .Option (
258- ** _construct ("redirect_slash" ), help = "where to redirect `/` requests to."
259- ),
260- log_level : "UVICORN_LOG_LEVELS" = typer .Option (
261- ** _construct ("log_level" ), help = "console log level."
262- ), # type: ignore
263- permissive_cors : bool = typer .Option (
264- ** _construct ("permissive_cors" ), help = "whether to allow permissive cors."
265- ),
266- api_key : str = typer .Option (
267- ** _construct ("api_key" ), help = "api_key used for authentication headers."
268- ),
269- proxy_root_path : str = typer .Option (
270- ** _construct ("proxy_root_path" ),
271- help = "Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/" ,
272- ),
273- onnx_disable_optimize : list [bool ] = typer .Option (
274- ** _construct ("onnx_disable_optimize" ),
275- help = "Disable onnx optimization" ,
276- ),
277- onnx_do_not_prefer_quantized : list [bool ] = typer .Option (
278- ** _construct ("onnx_do_not_prefer_quantized" ),
279- help = "Do not use quantized onnx models by default if available" ,
280- ),
183+ # t
184+ # arguments for engine
185+ model_id : list [str ] = typer .Option (
186+ ** _construct ("model_id" ),
187+ help = "Huggingface model repo id. Subset of possible models: https://huggingface.co/models?other=text-embeddings-inference&" ,
188+ ),
189+ served_model_name : list [str ] = typer .Option (
190+ ** _construct ("served_model_name" ),
191+ help = "the nickname for the API, under which the model_id can be selected" ,
192+ ),
193+ batch_size : list [int ] = typer .Option (
194+ ** _construct ("batch_size" ), help = "maximum batch size for inference"
195+ ),
196+ revision : list [str ] = typer .Option (
197+ ** _construct ("revision" ), help = "huggingface model repo revision."
198+ ),
199+ trust_remote_code : list [bool ] = typer .Option (
200+ ** _construct ("trust_remote_code" ),
201+ help = "if potential remote modeling code from huggingface repo is trusted." ,
202+ ),
203+ engine : list [InferenceEngine ] = typer .Option (
204+ ** _construct ("engine" ),
205+ help = "Which backend to use. `torch` uses Pytorch GPU/CPU, optimum uses ONNX on GPU/CPU/NVIDIA-TensorRT, `CTranslate2` uses torch+ctranslate2 on CPU/GPU." ,
206+ ),
207+ model_warmup : list [bool ] = typer .Option (
208+ ** _construct ("model_warmup" ),
209+ help = "if model should be warmed up after startup, and before ready." ,
210+ ),
211+ vector_disk_cache : list [bool ] = typer .Option (
212+ ** _construct ("vector_disk_cache" ),
213+ help = "If hash(request)/results should be cached to SQLite for latency improvement." ,
214+ ),
215+ device : list [Device ] = typer .Option (
216+ ** _construct ("device" ),
217+ help = "device to use for computing the model forward pass." ,
218+ ),
219+ device_id : list [str ] = typer .Option (
220+ ** _construct ("device_id" ),
221+ help = "device id defines the model placement. e.g. `0,1` will place the model on MPS/CUDA/GPU 0 and 1 each" ,
222+ ),
223+ lengths_via_tokenize : list [bool ] = typer .Option (
224+ ** _construct ("lengths_via_tokenize" ),
225+ help = "if True, returned tokens is based on actual tokenizer count. If false, uses len(input) as proxy." ,
226+ ),
227+ dtype : list [Dtype ] = typer .Option (
228+ ** _construct ("dtype" ), help = "dtype for the model weights."
229+ ),
230+ embedding_dtype : list [EmbeddingDtype ] = typer .Option (
231+ ** _construct ("embedding_dtype" ),
232+ help = "dtype post-forward pass. If != `float32`, using Post-Forward Static quantization." ,
233+ ),
234+ pooling_method : list [PoolingMethod ] = typer .Option (
235+ ** _construct ("pooling_method" ),
236+ help = "overwrite the pooling method if inferred incorrectly." ,
237+ ),
238+ compile : list [bool ] = typer .Option (
239+ ** _construct ("compile" ),
240+ help = "Enable usage of `torch.compile(dynamic=True)` if engine relies on it." ,
241+ ),
242+ bettertransformer : list [bool ] = typer .Option (
243+ ** _construct ("bettertransformer" ),
244+ help = "Enables varlen flash-attention-2 via the `BetterTransformer` implementation. If available for this model." ,
245+ ),
246+ # arguments for uvicorn / server
247+ preload_only : bool = typer .Option (
248+ ** _construct ("preload_only" ),
249+ help = "If true, only downloads models and verifies setup, then exit. Recommended for pre-caching the download in a Dockerfile." ,
250+ ),
251+ host : str = typer .Option (** _construct ("host" ), help = "host for the FastAPI uvicorn server" ),
252+ port : int = typer .Option (** _construct ("port" ), help = "port for the FastAPI uvicorn server" ),
253+ url_prefix : str = typer .Option (
254+ ** _construct ("url_prefix" ),
255+ callback = validate_url ,
256+ help = "prefix for all routes of the FastAPI uvicorn server. Useful if you run behind a proxy / cascaded API." ,
257+ ),
258+ redirect_slash : str = typer .Option (
259+ ** _construct ("redirect_slash" ), help = "where to redirect `/` requests to."
260+ ),
261+ log_level : "UVICORN_LOG_LEVELS" = typer .Option (
262+ ** _construct ("log_level" ), help = "console log level."
263+ ), # type: ignore
264+ permissive_cors : bool = typer .Option (
265+ ** _construct ("permissive_cors" ), help = "whether to allow permissive cors."
266+ ),
267+ api_key : str = typer .Option (
268+ ** _construct ("api_key" ), help = "api_key used for authentication headers."
269+ ),
270+ proxy_root_path : str = typer .Option (
271+ ** _construct ("proxy_root_path" ),
272+ help = "Proxy prefix for the application. See: https://fastapi.tiangolo.com/advanced/behind-a-proxy/" ,
273+ ),
274+ onnx_disable_optimize : list [bool ] = typer .Option (
275+ ** _construct ("onnx_disable_optimize" ),
276+ help = "Disable onnx optimization" ,
277+ ),
278+ onnx_do_not_prefer_quantized : list [bool ] = typer .Option (
279+ ** _construct ("onnx_do_not_prefer_quantized" ),
280+ help = "Do not use quantized onnx models by default if available" ,
281+ ),
281282 ):
282283 """Infinity API ♾️ cli v2. MIT License. Copyright (c) 2023-now Michael Feil \n
283284 \n
@@ -380,6 +381,8 @@ def v2(
380381 api_key = api_key ,
381382 proxy_root_path = proxy_root_path ,
382383 )
384+ # Update logging configs
385+ set_uvicorn_logging_configs ()
383386
384387 uvicorn .run (
385388 app ,
@@ -391,6 +394,47 @@ def v2(
391394 )
392395
393396
def set_uvicorn_logging_configs():
    """Apply Infinity's log formats to uvicorn's default logging config.

    The default, access, and date format strings can each be overridden via
    an environment variable; the *names* of those environment variables are
    taken from ``MANAGER.uvicorn_default_format``,
    ``MANAGER.uvicorn_access_format`` and ``MANAGER.uvicorn_date_format``.

    Mutates ``uvicorn.config.LOGGING_CONFIG`` in place, so it must be called
    before ``uvicorn.run(...)`` for the formats to take effect.
    """
    import os

    # Local import: uvicorn is an optional dependency of this module
    # (see CHECK_UVICORN), so it is only pulled in when actually needed.
    from uvicorn.config import LOGGING_CONFIG

    # NOTE: the MANAGER attributes hold environment-variable *names*
    # (they are passed to os.getenv as the key), not format values.
    default_fmt = os.getenv(
        MANAGER.uvicorn_default_format,
        "%(asctime)s %(levelprefix)s %(message)s",
    )
    access_fmt = os.getenv(
        MANAGER.uvicorn_access_format,
        '%(asctime)s %(levelprefix)s %(client_addr)s - "%(request_line)s" %(status_code)s',
    )
    # One date format shared by both formatters so timestamps line up.
    date_fmt = os.getenv(
        MANAGER.uvicorn_date_format,
        "%Y-%m-%d %H:%M:%S",
    )

    for formatter_name, fmt in (("default", default_fmt), ("access", access_fmt)):
        LOGGING_CONFIG["formatters"][formatter_name]["fmt"] = fmt
        LOGGING_CONFIG["formatters"][formatter_name]["datefmt"] = date_fmt
436+
437+
394438def cli ():
395439 CHECK_TYPER .mark_required ()
396440 if len (sys .argv ) == 1 or sys .argv [1 ] not in [
0 commit comments