Fix typos #3210

Open · wants to merge 1 commit into base: main

@@ -36,7 +36,7 @@ class Exl2WeightsLoader(WeightsLoader):

def get_weights(self, weights: "Weights", prefix: str):
"""
Get weights at the given prefix and apply without tensor paralllism.
Get weights at the given prefix and apply without tensor parallelism.
"""
try:
q_weight = weights.get_tensor(f"{prefix}.q_weight")
@@ -598,7 +598,7 @@ def get_loaders(

def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
# Skip last lm_head linear
# Need isintance Falcon is inheriting Linear.
# Need isinstance Falcon is inheriting Linear.
if isinstance(module, layers) and "lm_head" not in name:
return {name: module}
res = {}
@@ -221,7 +221,7 @@ def __init__(

log_once(
logger.info,
"Using MoE layer wih fused gemm",
"Using MoE layer with fused gemm",
)

self.moe = cls(
@@ -282,7 +282,7 @@ def __call__(

"""

# if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
# if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
if add_end_of_utterance_token is None:
add_end_of_utterance_token = (
self.tokenizer_was_trained_with_end_of_utterance_token
@@ -1518,7 +1518,7 @@ def warmup(
)
self.bucketing_ctx.num_hpu_blocks = num_blocks
if os.getenv("VLLM_SKIP_WARMUP", "false").lower() == "true":
logger.info("skip warmup hpu graph, not recommmended")
logger.info("skip warmup hpu graph, not recommended")
del _batch, batch
return int(num_blocks * BLOCK_SIZE), max_input_tokens, max_total_tokens

4 changes: 2 additions & 2 deletions backends/gaudi/server/text_generation_server/utils/weights.py
@@ -22,7 +22,7 @@ class WeightsLoader(ABC):
@abstractmethod
def get_weights(self, weights: "Weights", prefix: str):
"""
Get weights at the given prefix and apply without tensor paralllism.
Get weights at the given prefix and apply without tensor parallelism.
"""
...

@@ -50,7 +50,7 @@ def get_weights_col_packed(
def get_weights_col(self, weights: "Weights", prefix: str):
"""
Get weights at the given prefix and apply column-splitting for tensor
paralllism.
parallelism.
"""
return weights.get_multi_weights_col([prefix], 0)
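
For context on the interface these docstrings describe: a `WeightsLoader` provides an unsharded `get_weights` plus column- and row-split variants for tensor parallelism. A minimal sketch of a concrete loader, assuming a plain checkpoint that stores tensors under `{prefix}.weight` (the class name and storage layout below are hypothetical, and the real ABC declares further abstract methods such as `get_weights_col_packed`, so this is an illustration of the shape of the interface rather than working TGI code):

```python
from text_generation_server.utils.weights import Weights, WeightsLoader


class PlainWeightsLoader(WeightsLoader):
    """Hypothetical loader for unquantized checkpoints (illustrative sketch only)."""

    def get_weights(self, weights: "Weights", prefix: str):
        # No tensor parallelism: return the tensor exactly as stored.
        return weights.get_tensor(f"{prefix}.weight")

    def get_weights_col(self, weights: "Weights", prefix: str):
        # Column split for tensor parallelism, mirroring the default shown above.
        return weights.get_multi_weights_col([prefix], 0)
```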

2 changes: 1 addition & 1 deletion backends/neuron/tests/fixtures/model.py
@@ -118,7 +118,7 @@ def neuron_model_config(request):

For each exposed model, the local directory is maintained for the duration of the
test session and cleaned up afterwards.
The hub model artifacts are never cleaned up and persist accross sessions.
The hub model artifacts are never cleaned up and persist across sessions.
They must be cleaned up manually when the optimum-neuron version changes.

"""
4 changes: 2 additions & 2 deletions docs/source/backends/gaudi.mdx
@@ -93,7 +93,7 @@ To run FP8 Inference:
1. Measure statistics using [Optimum Habana measurement script](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation#running-with-fp8)
2. Run the model in TGI with QUANT_CONFIG setting - e.g. `-e QUANT_CONFIG=./quantization_config/maxabs_quant.json`.

The following commmand example for FP8 inference is based on the assumption that measurement is done via the first step above.
The following command example for FP8 inference is based on the assumption that measurement is done via the first step above.

Example for Llama3.1-70B on 8 cards with FP8 precision:

@@ -155,7 +155,7 @@ curl -N 127.0.0.1:8080/generate \
-H 'Content-Type: application/json'
```

> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input length from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, the value of `max-batch-prefill-tokens` is 16384, which is calcualted as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`.
> Note: In Llava-v1.6-Mistral-7B, an image usually accounts for 2000 input tokens. For example, an image of size 512x512 is represented by 2800 tokens. Thus, `max-input-tokens` must be larger than the number of tokens associated with the image. Otherwise the image may be truncated. We set `BASE_IMAGE_TOKENS=2048` as the default image token value. This is the minimum value of `max-input-tokens`. You can override the environment variable `BASE_IMAGE_TOKENS` to change this value. The warmup will generate graphs with input length from `BASE_IMAGE_TOKENS` to `max-input-tokens`. For Llava-v1.6-Mistral-7B, the value of `max-batch-prefill-tokens` is 16384, which is calculated as follows: `prefill_batch_size` = `max-batch-prefill-tokens` / `max-input-tokens`.
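
The note above relies on a small piece of arithmetic; here is a quick worked check (the `max_input_tokens` value of 4096 below is an assumed example, not a value taken from the docs):

```python
# Worked check of the formula quoted in the note above.
max_batch_prefill_tokens = 16384   # value given for Llava-v1.6-Mistral-7B
base_image_tokens = 2048           # default image token budget (BASE_IMAGE_TOKENS)
max_input_tokens = 4096            # assumed example; must exceed base_image_tokens

assert max_input_tokens > base_image_tokens, "the image would be truncated otherwise"
prefill_batch_size = max_batch_prefill_tokens // max_input_tokens
print(prefill_batch_size)  # 4 prefill sequences per warmup batch under these assumptions
```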

### How to Benchmark Performance

8 changes: 4 additions & 4 deletions docs/source/conceptual/chunking.md
@@ -5,13 +5,13 @@
Performance leap: TGI processes 3x more tokens, 13x faster than vLLM on long prompts. Zero config !

### 3x more tokens.
By reducing our memory footprint, we’re able to ingest many more tokens and more dynamically than before. A single L4 (24GB) can handle 30k tokens on llama 3.1-8B, while vLLM gets barely 10k. A lot of work went into reducing the footprint of the runtime and its effect are best seen on smaller constrained environments.
By reducing our memory footprint, we’re able to ingest many more tokens and more dynamically than before. A single L4 (24GB) can handle 30k tokens on llama 3.1-8B, while vLLM gets barely 10k. A lot of work went into reducing the footprint of the runtime and its effects are best seen on smaller constrained environments.

### 13x faster
On long prompts (200k+ tokens) conversation replies take 27.5s in vLLM, while it takes only 2s in TGI. How so ? We keep the initial conversation around, so when a new reply comes in, we can answer almost instantly. The overhead of the lookup is ~5us. Thanks @Daniël de Kok for the beast data structure.

### Zero config
That’s it. Remove all the flags your are using and you’re likely to get the best performance. By evaluating the hardware and model, TGI carefully selects automatic values to give best performance. In production, we don’t have any flags anymore in our deployments. We kept all existing flags around, they may come in handy in niche scenarios.
That’s it. Remove all the flags you are using and you’re likely to get the best performance. By evaluating the hardware and model, TGI carefully selects automatic values to give best performance. In production, we don’t have any flags anymore in our deployments. We kept all existing flags around, they may come in handy in niche scenarios.



@@ -37,7 +37,7 @@ For more details on benchmarking in general we recommend the documentation of k6
We selected a handful of scenarios to simplify the picture, they seem to accurately reflect a larger trend.

1. **Small scenario**: This scenario consists of the first 200 requests from the orca datasets being prompted to the model. The 200 requests total 8k tokens together and are representative of conversation starters. Prefix caching has very limited impact in that scenario and we feel it's a relatively balanced benchmark for simple use cases.
2. **Long scenario**: This scenario consists of 20 requests totalling 200k prompt tokens which are essentially asking for summaries of large chunks for text. In practical scenarios this is really useful when you are feeding large chunks of code, large chunks of business data or documents repeatedly and ask simple questions about them (summarization, classification, or where to find some data). This scenario is the one closest to what a lot of professional use cases seem to be doing by including a lot of information in the prompt itself. Those very long conversations are the ones that benefit the most for our recent changes since we are enable ever larger prompts and ever faster caching.
2. **Long scenario**: This scenario consists of 20 requests totalling 200k prompt tokens which are essentially asking for summaries of large chunks for text. In practical scenarios this is really useful when you are feeding large chunks of code, large chunks of business data or documents repeatedly and ask simple questions about them (summarization, classification, or where to find some data). This scenario is the one closest to what a lot of professional use cases seem to be doing by including a lot of information in the prompt itself. Those very long conversations are the ones that benefit the most for our recent changes since we are enabling even larger prompts and even faster caching.

### Hardware

@@ -119,7 +119,7 @@ Our performance gains can be attributed to several key factors:
While we've made significant progress, there are still opportunities for improvement:

1. **Special models**: All LLMs come with the aforementioned improvements. Some specific set of features might not (some quantizations, speculation or VLMs for instance are harder to optimize for with the same level of detail).
2. **KV-Cache Long-Term Retention**: Addressing KV-cache long-term retention is a challenge. There are several solutions envisionned like shared KV-cache (like redis or memcached) solutions or innovative storage approaches. It is an area of ongoing research of ours.
2. **KV-Cache Long-Term Retention**: Addressing KV-cache long-term retention is a challenge. There are several solutions envisioned like shared KV-cache (like redis or memcached) solutions or innovative storage approaches. It is an area of ongoing research of ours.
3. **Multimodal models**: We are also investigating quite a lot other kind of models, like audio-to-audio, image/video generation, and other hybrids, where we see a lot of potential of applying the same principles we've applied in TGI to maximize performance.

By sharing our benchmarking methodology, results, and technical insights, we aim to contribute to the ongoing development of more efficient and effective LLMs.
6 changes: 3 additions & 3 deletions docs/source/conceptual/lora.md
@@ -2,7 +2,7 @@

## What is LoRA?

LoRA is a technique that allows for efficent fine-tuning a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.
LoRA is a technique that allows for efficient fine-tuning a model while only updating a small portion of the model's weights. This is useful when you have a large model that has been pre-trained on a large dataset, but you want to fine-tune it on a smaller dataset or for a specific task.

LoRA works by adding a small number of additional weights to the model, which are used to adapt the model to the new dataset or task. These additional weights are learned during the fine-tuning process, while the rest of the model's weights are kept fixed.

@@ -18,13 +18,13 @@ Technically, LoRA can be used to fine-tune a large language model on a small dat

## Optimizing Inference with LoRA

LoRA's can be used during inference by mutliplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels/and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with mulitple LoRA models.
LoRA's can be used during inference by multiplying the adapter weights with the model weights at each specified layer. This process can be computationally expensive, but due to awesome work by [punica-ai](https://github.com/punica-ai/punica) and the [lorax](https://github.com/predibase/lorax) team, optimized kernels/and frameworks have been developed to make this process more efficient. TGI leverages these optimizations in order to provide fast and efficient inference with multiple LoRA models.
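
To make "multiplying the adapter weights with the model weights" concrete, here is a minimal sketch of LoRA applied at inference time to a single linear layer. This is plain PyTorch, not TGI's or punica's optimized kernels; the shapes and the `alpha / r` scaling follow the usual LoRA convention:

```python
import torch


def lora_linear(x, W, A, B, alpha=16, r=8):
    """y = x @ W.T + (alpha / r) * (x @ A.T) @ B.T, with the base weight W kept frozen."""
    base = x @ W.T                 # frozen pretrained projection
    delta = (x @ A.T) @ B.T        # low-rank update, rank r << min(d_in, d_out)
    return base + (alpha / r) * delta


d_in, d_out, r = 4096, 4096, 8
x = torch.randn(2, d_in)
W = torch.randn(d_out, d_in)       # pretrained weight
A = torch.randn(r, d_in) * 0.01    # adapter "down" projection
B = torch.zeros(d_out, r)          # adapter "up" projection (zero-initialized in LoRA)
print(lora_linear(x, W, A, B).shape)  # torch.Size([2, 4096])
```

Serving many adapters efficiently amounts to batching this extra low-rank matmul across requests, which is roughly what the punica/lorax kernels referenced above optimize.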

## Serving multiple LoRA adapters with TGI

Once a LoRA model has been trained, it can be used to generate text or perform other tasks just like a regular language model. However, because the model has been fine-tuned on a specific dataset, it may perform better on that dataset than a model that has not been fine-tuned.

In practice its often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.
In practice it's often useful to have multiple LoRA models, each fine-tuned on a different dataset or for a different task. This allows you to use the model that is best suited for a particular task or dataset.

Text Generation Inference (TGI) now supports loading multiple LoRA models at startup that can be used in generation requests. This feature is available starting from version `~2.0.6` and is compatible with LoRA models trained using the `peft` library.

2 changes: 1 addition & 1 deletion docs/source/reference/launcher.md
@@ -138,7 +138,7 @@ Options:
## MAX_TOP_N_TOKENS
```shell
--max-top-n-tokens <MAX_TOP_N_TOKENS>
This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking
This is the maximum allowed value for clients to set `top_n_tokens`. `top_n_tokens` is used to return information about the `n` most likely tokens at each generation step, instead of just the sampled token. This information can be used for downstream tasks like for classification or ranking

[env: MAX_TOP_N_TOKENS=]
[default: 5]
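
Since `--max-top-n-tokens` only caps a per-request parameter, a client still has to ask for the extra information explicitly. A hedged sketch of such a request against a locally running server (the endpoint shape and the `top_n_tokens` / `top_tokens` field names are assumed to match the standard `/generate` API and may differ between versions):

```python
import requests

# Request the 3 most likely tokens at each generation step; this must not
# exceed the server's --max-top-n-tokens (default 5).
resp = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 4, "details": True, "top_n_tokens": 3},
    },
    timeout=60,
)
resp.raise_for_status()
details = resp.json().get("details", {})
print(details.get("top_tokens"))  # one list of candidate tokens per generated step
```
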
2 changes: 1 addition & 1 deletion integration-tests/fixtures/neuron/export_models.py
@@ -238,7 +238,7 @@ def neuron_model_config(request):

For each exposed model, the local directory is maintained for the duration of the
test session and cleaned up afterwards.
The hub model artifacts are never cleaned up and persist accross sessions.
The hub model artifacts are never cleaned up and persist across sessions.
They must be cleaned up manually when the optimum-neuron version changes.

"""
4 changes: 2 additions & 2 deletions integration-tests/models/test_flash_llama_prefix.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion launcher/src/main.rs
@@ -663,7 +663,7 @@ struct Args {
max_stop_sequences: usize,

/// This is the maximum allowed value for clients to set `top_n_tokens`.
/// `top_n_tokens` is used to return information about the the `n` most likely
/// `top_n_tokens` is used to return information about the `n` most likely
/// tokens at each generation step, instead of just the sampled token. This
/// information can be used for downstream tasks like for classification or
/// ranking.
4 changes: 2 additions & 2 deletions server/custom_kernels/custom_kernels/fused_attention_cuda.cu
@@ -152,7 +152,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa
auto query_scaled = query_view * inv_norm_factor;
auto attention_scores = at::bmm(query_scaled, key_view);

// Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_intial_dtype`
// Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_initial_dtype`
at::Tensor attention_probs;
if (true) {
// TODO @thomasw21: it's easier to think of attention_scores as 2D tensors
@@ -182,7 +182,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa
*/

/*
* We should split [batch_size_times_num_heads_block, q_length] in seperate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
* We should split [batch_size_times_num_heads_block, q_length] in separate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
* with multiple threads as we need to `sync_threads` to run exponential sum.
* We maximise the usage of threads within a single block
*/
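
The kernel comment corrected above names a fused sequence; in eager PyTorch the same math looks roughly like the sketch below (the mask semantics and fill value follow the usual attention-masking convention and are an assumption here, not a transcription of the kernel):

```python
import torch


def masked_softmax(attention_scores: torch.Tensor, padding_mask: torch.Tensor) -> torch.Tensor:
    """optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_initial_dtype."""
    input_dtype = attention_scores.dtype
    scores = attention_scores.float() if input_dtype == torch.float16 else attention_scores
    # Masked positions get a very large negative score so they end up with ~zero probability.
    scores = scores.masked_fill(padding_mask, torch.finfo(scores.dtype).min)
    return torch.softmax(scores, dim=-1).to(input_dtype)


scores = torch.randn(2, 4, 4, dtype=torch.float16)   # [batch * num_heads, q_length, kv_length]
mask = torch.zeros(2, 4, 4, dtype=torch.bool)
mask[..., -1] = True                                  # pretend the last kv position is padding
print(masked_softmax(scores, mask).sum(dim=-1))       # each row sums to ~1
```
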
@@ -150,7 +150,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa

auto attention_scores = alibi.baddbmm(query_layer, key_layer, beta, inv_norm_factor);

// Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_intial_dtype`
// Computing `optionally_cast_fp16_to_fp32 + masked_fill + softmax + cast_to_initial_dtype`
at::Tensor attention_probs;
if (true) {
const auto kv_length = key_layer.size(2);
@@ -182,7 +182,7 @@ std::tuple<at::Tensor, std::optional<std::vector<at::Tensor>>, at::Tensor> forwa
*/

/*
* We should split [batch_size_times_num_heads_block, q_length] in seperate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
* We should split [batch_size_times_num_heads_block, q_length] in separate blocks and [batch_size_times_num_heads_block_size, kv_length] a single block
* with multiple threads as we need to `sync_threads` to run exponential sum.
* We maximise the usage of threads within a single block
*/
@@ -34,7 +34,7 @@ def __str__(self) -> str:

def get_weights(self, weights: Weights, prefix: str):
"""
Get weights at the given prefix and apply without tensor paralllism.
Get weights at the given prefix and apply without tensor parallelism.
"""
weight_packed = weights.get_tensor(f"{prefix}.weight_packed")
meta = weights.get_tensor(f"{prefix}.meta")
2 changes: 1 addition & 1 deletion server/text_generation_server/layers/exl2.py
@@ -36,7 +36,7 @@ class Exl2WeightsLoader(WeightsLoader):

def get_weights(self, weights: "Weights", prefix: str):
"""
Get weights at the given prefix and apply without tensor paralllism.
Get weights at the given prefix and apply without tensor parallelism.
"""
try:
q_weight = weights.get_tensor(f"{prefix}.q_weight")
2 changes: 1 addition & 1 deletion server/text_generation_server/layers/gptq/quantize.py
@@ -598,7 +598,7 @@ def get_loaders(

def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
# Skip last lm_head linear
# Need isintance Falcon is inheriting Linear.
# Need isinstance Falcon is inheriting Linear.
if isinstance(module, layers) and "lm_head" not in name:
return {name: module}
res = {}
2 changes: 1 addition & 1 deletion server/text_generation_server/layers/marlin/marlin.py
@@ -26,7 +26,7 @@ def __init__(self, *, bits: int, is_marlin_24: bool):

def get_weights(self, weights: "Weights", prefix: str):
"""
Get weights at the given prefix and apply without tensor paralllism.
Get weights at the given prefix and apply without tensor parallelism.
"""
is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24"
if is_marlin_24:
2 changes: 1 addition & 1 deletion server/text_generation_server/layers/moe/__init__.py
@@ -244,7 +244,7 @@ def __init__(

log_once(
logger.info,
"Using MoE layer wih fused gemm",
"Using MoE layer with fused gemm",
)

self.moe = cls(
@@ -295,7 +295,7 @@ def __init__(
else:
vision_config = SiglipVisionConfig()
logger.info(
"vision_config is None or incompatible with Gemma3VisionConfig intialization. Gemma3 will be limited "
"vision_config is None or incompatible with Gemma3VisionConfig initialization. Gemma3 will be limited "
"to text tasks."
)

@@ -282,7 +282,7 @@ def __call__(

"""

# if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
# if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
if add_end_of_utterance_token is None:
add_end_of_utterance_token = (
self.tokenizer_was_trained_with_end_of_utterance_token