
Commit 645639b

oelayan7, jomayeri, lekurile, loadams, and tjruwase authored
Rearrange inference OPS and stop using builder.load (#5490)
This PR handles all places where InferenceBuilder was used to access an op or a specific implementation of an op. Instead, each op is now defined once and its proper implementation is picked internally, so the choice is transparent to the user. What was done in the PR:

1) Added missing ops (a Python file per op with a fallback mechanism).
2) Added missing fallback implementations for existing ops.
3) Removed all usages of builder.load and replaced them with ops.
4) Added a workspace op and InferenceContext, which contains all workspace-related functions; this InferenceContext is the Python fallback of InferenceContext in CUDA.
5) Made a small change to the softmax_context signature to fit the fallback signature.

---------

Co-authored-by: Joe Mayer <[email protected]>
Co-authored-by: Lev Kurilenko <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
Co-authored-by: Olatunji Ruwase <[email protected]>
1 parent ca8b1fe commit 645639b
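For readers skimming the diff, the pattern the commit converges on looks roughly like the sketch below: an op object tries to bind the compiled inference kernel and otherwise falls back to a plain PyTorch implementation, so callers simply invoke the op. This is a minimal illustrative sketch, not the actual DeepSpeed op_binding code; LayerNormOpSketch and _ref_layer_norm are made-up names, and the builder import is only assumed to mirror the InferenceBuilder usage removed elsewhere in this commit.

# Minimal sketch of the "op with Python fallback" pattern; illustrative names only.
import torch
import torch.nn.functional as F


class LayerNormOpSketch:

    def __init__(self):
        try:
            # Bind the compiled kernel if the inference extension builds and loads.
            from deepspeed.ops.op_builder import InferenceBuilder
            self._impl = InferenceBuilder().load().layer_norm
        except Exception:
            # Otherwise fall back to a reference PyTorch implementation.
            self._impl = self._ref_layer_norm

    @staticmethod
    def _ref_layer_norm(x, gamma, beta, eps):
        return F.layer_norm(x, (x.shape[-1], ), gamma, beta, eps)

    def __call__(self, x, gamma, beta, eps):
        return self._impl(x, gamma, beta, eps)


# Callers no longer touch builder.load(); they just call the op, e.g.
#   layer_norm = LayerNormOpSketch()
#   out = layer_norm(hidden_states, gamma, beta, 1e-6)
# (with the compiled kernel loaded, the tensors would live on the accelerator device).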

44 files changed: +1064 −363 lines

csrc/transformer/inference/csrc/pt_binding.cpp

+6-3
@@ -452,14 +452,17 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
                                            unsigned layer_id,
                                            unsigned num_layers,
                                            at::Tensor& alibi,
-                                           float rope_theta)
+                                           float rope_theta,
+                                           bool is_prompt,
+                                           std::optional<at::Tensor> token_idx,
+                                           std::optional<at::Tensor> position_ids)
 {
     unsigned bsz = query_key_value.size(0);
     unsigned seq_len = query_key_value.size(1);
     int k = query_key_value.size(2) / (heads + 2 * (num_kv > 0 ? num_kv : heads));
     unsigned hidden_dim = heads * k;
 
-    bool is_prompt = (seq_len > 1);
+    is_prompt = (seq_len > 1);
 
     if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len);
     unsigned soft_len = InferenceContext::Instance().current_tokens();
@@ -2031,7 +2034,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
         "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)"); \
     m.def("dequantize_" #_name,                                                  \
           &ds_dequantize<_dtype>,                                                \
-          "DeepSpeed dequantize with " #_name " (CUDA)")
+          "DeepSpeed dequantize with " #_name " (CUDA)");
 
     DEF_OPS(fp32, float);
     DEF_OPS(fp16, __half);

deepspeed/inference/engine.py

+4-13
@@ -53,12 +53,7 @@ def __init__(self, model, config):
         DS_INFERENCE_ENABLED = True
 
         super().__init__()
-
-        # Have to import here because inference_module is a global, but python
-        # globals only work at the module level and will not be updated unless
-        # we import it each time we init a new inference engine.
-        from ..model_implementations.transformers.ds_transformer import inference_module
-        if inference_module is not None:
+        if DeepSpeedTransformerInference.workspace is not None:
             self.destroy()
 
         self.module = model
@@ -191,15 +186,11 @@ def __init__(self, model, config):
         self._is_compiled = False
 
     def destroy(self):
-        # Have to import here because inference_module is a global, but python
-        # globals only work at the module level and will not be updated unless
-        # we import it each time we init a new inference engine.
-        from ..model_implementations.transformers.ds_transformer import inference_module
         DeepSpeedTransformerInference.layer_id = 0
         DeepSpeedSelfAttention.num_layers = 0
-        if inference_module is not None:
-            inference_module.release_workspace()
-            inference_module = None
+        if DeepSpeedTransformerInference.workspace.is_allocated():
+            DeepSpeedTransformerInference.workspace.release_workspace()
+        DeepSpeedTransformerInference.workspace = None
 
     def profile_model_time(self, use_cuda_events=True):
         if not self.model_profile_enabled and not self._config.enable_cuda_graph:

deepspeed/model_implementations/transformers/ds_llama2.py

+2-13
@@ -4,11 +4,8 @@
 # DeepSpeed Team
 
 import torch
-from deepspeed import comm as dist
 from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference
 
-inference_module = None
-
 
 class DeepSpeedLlama2Inference(DeepSpeedTransformerInference):
     """Initialize the DeepSpeed OPT Transformer Layer.
@@ -27,18 +24,10 @@ def forward(self, *args, **kwargs):
 
         input = args[0]
         input_mask = None
-        # Allocate memory only on first layer forward
-        if self.config.layer_id == 0 and self._alloc_workspace:
-            self.allocate_workspace(self.config.hidden_size, self.config.heads,
-                                    input.size()[1],
-                                    input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size,
-                                    self.config.bigscience_bloom,
-                                    dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens,
-                                    self.config.min_out_tokens)
-            self._alloc_workspace = False
-
         get_present = True
 
+        self.allocate_workspace(input.size())
+
         # We set the prev key/value to None when there is a prompt
         if input.shape[1] > 1:
             self.layer_past = None

deepspeed/model_implementations/transformers/ds_transformer.py

+23-31
@@ -6,19 +6,18 @@
 import torch
 import torch.nn as nn
 from deepspeed import comm as dist
+from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp
 from deepspeed.utils.logging import log_dist
 
 from deepspeed.ops.transformer.inference.ds_mlp import DeepSpeedMLP
 from deepspeed.ops.transformer.inference.ds_attention import DeepSpeedSelfAttention, BloomSelfAttention
+from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import InferenceBuilder
 import deepspeed
 if deepspeed.HAS_TRITON:
     from deepspeed.ops.transformer.inference.triton.mlp import TritonMLP
     from deepspeed.ops.transformer.inference.triton.attention import TritonSelfAttention
 
-inference_module = None
-
 
 class DeepSpeedTransformerInference(nn.Module):
     """Initialize the DeepSpeed Transformer Layer.
@@ -37,6 +36,7 @@ class DeepSpeedTransformerInference(nn.Module):
         for specific downstream tasks.
     """
     layer_id = 0
+    workspace = None
 
     def __init__(self,
                  config,
@@ -52,10 +52,6 @@ def __init__(self,
         DeepSpeedTransformerInference.layer_id += 1
 
         data_type = torch.half if self.config.dtype == torch.int8 else self.config.dtype
-        global inference_module
-        if inference_module is None:
-            builder = InferenceBuilder()
-            inference_module = builder.load()
 
         if DeepSpeedTransformerInference.layer_id == 1:
             log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0])
@@ -88,22 +84,25 @@ def __init__(self,
         self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device),
                                    requires_grad=False)
         self.layer_past = None
-        try:
-            if config.dtype == torch.float32:
-                self.allocate_workspace = inference_module.allocate_workspace_fp32
-            elif config.dtype == torch.bfloat16:
-                self.allocate_workspace = inference_module.allocate_workspace_bf16
-            else:
-                self.allocate_workspace = inference_module.allocate_workspace_fp32
-            self._alloc_workspace = True
-        except AttributeError:
-            self.allocate_workspace = None
-            self._alloc_workspace = False
+        self.layer_norm = LayerNormOp()
+        if DeepSpeedTransformerInference.workspace is None:
+            DeepSpeedTransformerInference.workspace = WorkspaceOp(self.config)
+        self._should_allocate_workspace = True
+
+    def allocate_workspace(self, size):
+        # Allocate memory only on first layer forward
+        if self.config.layer_id == 0 and self._should_allocate_workspace:
+            DeepSpeedTransformerInference.workspace.allocate_workspace(
+                self.config.hidden_size, self.config.heads, size[1], size[0], DeepSpeedTransformerInference.layer_id,
+                self.config.mp_size, self.config.bigscience_bloom,
+                dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens,
+                self.config.min_out_tokens)
+            self._should_allocate_workspace = False
 
     @classmethod
     def reset_cache(cls):
-        if inference_module is not None:
-            inference_module.reset_cache()
+        if cls.workspace is not None:
+            cls.workspace.reset_cache()
 
     def forward(
         self,
@@ -136,15 +135,7 @@ def forward(
 
         input_mask = (input_mask if attn_mask is None else attn_mask) if attention_mask is None else attention_mask
 
-        # Allocate memory only on first layer forward
-        if self.config.layer_id == 0 and self._alloc_workspace:
-            self.allocate_workspace(self.config.hidden_size, self.config.heads,
-                                    input.size()[1],
-                                    input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size,
-                                    self.config.bigscience_bloom,
-                                    dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens,
-                                    self.config.min_out_tokens)
-            self._alloc_workspace = False
+        self.allocate_workspace(input.size())
 
         get_present = (get_present or get_key_value or use_cache)
         input_mask = input_mask if attention_mask is None else attention_mask
@@ -178,14 +169,15 @@ def forward(
                                output_attentions,
                                self.norm_w,
                                self.norm_b,
-                               alibi)
+                               alibi,
+                               **kwargs)
 
             presents = (key, value)
             self.layer_past = presents if layer_past is None else None
            output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob)
 
            if not self.config.pre_layer_norm:
-                output = inference_module.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon)
+                output = self.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon)
 
            output = output.to(input_type)
            if get_present:
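The net effect of the ds_transformer.py change is that the workspace becomes a lazily allocated, class-level resource shared by every layer, which is what allows engine.py's destroy() to release it in one place. Below is a self-contained sketch of that pattern under stated assumptions: SharedWorkspace and Layer are illustrative stand-ins for WorkspaceOp and DeepSpeedTransformerInference, not the real classes.

# Illustrative stand-ins only; the real classes are WorkspaceOp and
# DeepSpeedTransformerInference shown in the diff above.
class SharedWorkspace:

    def __init__(self):
        self._buf = None

    def allocate_workspace(self, batch, seq_len, hidden):
        self._buf = bytearray(batch * seq_len * hidden)  # placeholder for the device buffer

    def is_allocated(self):
        return self._buf is not None

    def release_workspace(self):
        self._buf = None


class Layer:
    workspace = None   # class attribute: one workspace shared by every layer
    layer_count = 0

    def __init__(self, hidden=16):
        self.layer_id = Layer.layer_count
        Layer.layer_count += 1
        self.hidden = hidden
        if Layer.workspace is None:
            Layer.workspace = SharedWorkspace()
        self._should_allocate_workspace = True

    def forward(self, batch, seq_len):
        # Only the first layer allocates, and only on its first forward call.
        if self.layer_id == 0 and self._should_allocate_workspace:
            Layer.workspace.allocate_workspace(batch, seq_len, self.hidden)
            self._should_allocate_workspace = False


layers = [Layer() for _ in range(3)]
layers[0].forward(batch=1, seq_len=8)   # first forward triggers the allocation
assert Layer.workspace.is_allocated()
Layer.workspace.release_workspace()     # mirrors what InferenceEngine.destroy() now does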

deepspeed/ops/transformer/inference/config.py

-1
@@ -103,7 +103,6 @@ def __init__(self,
         self.return_tuple = return_tuple
         self.mlp_after_attn = mlp_after_attn
         self.mlp_act_func_type = mlp_act_func_type
-        self.specialized_mode = False
         self.training_mp_size = training_mp_size
         self.bigscience_bloom = bigscience_bloom
         self.max_out_tokens = max_out_tokens

deepspeed/ops/transformer/inference/diffusers_attention.py

+24-28
@@ -10,10 +10,11 @@
 from packaging import version as pkg_version
 from deepspeed.utils.logging import log_dist
 from deepspeed.accelerator import get_accelerator
-from deepspeed.ops.op_builder import InferenceBuilder
+from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp
+from deepspeed.ops.transformer.inference.op_binding.softmax_context import SoftmaxContextOp
+from deepspeed.ops.transformer.inference.op_binding import LinearOp
+from deepspeed.ops.transformer.inference.op_binding.pad_transform import PadTransformOp
 
-# Cuda modules will be imported if needed
-inference_module = None
 minus_inf = -10000.0
 triton_flash_attn = None
 
@@ -36,7 +37,8 @@ class DeepSpeedDiffusersAttentionFunction(Function):
     @staticmethod
     def forward(ctx, input, context, input_mask, config, attn_qkvw, attn_qw, attn_kw, attn_vw, attn_qkvb,
                 num_attention_heads_per_partition, norm_factor, hidden_size_per_partition, attn_ow, attn_ob,
-                do_out_bias, score_context_func, linear_func, triton_flash_attn_kernel, rope_theta):
+                do_out_bias, score_context_func, linear_func, pad_transform_func, triton_flash_attn_kernel,
+                rope_theta):
 
         def _transpose_for_context(x):
             x = x.permute(0, 2, 1, 3)
@@ -77,7 +79,7 @@ def selfAttention_fp(input, context, input_mask):
                 query = query.contiguous()
                 key = key.contiguous()
                 value = value.contiguous()
-                query, key, value = inference_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn)
+                query, key, value = pad_transform_func(query, key, value, config.heads, do_flash_attn)
                 attention_scores = (torch.matmul(query, key.transpose(-1, -2)) * scale).softmax(dim=-1)
                 context_layer = _transpose_for_context(torch.matmul(attention_scores, value))
 
@@ -117,10 +119,6 @@ def __init__(
 
         data_type = self.config.dtype
         data_type_fp = torch.half if self.config.dtype == torch.int8 else self.config.dtype
-        global inference_module
-        if inference_module is None:
-            builder = InferenceBuilder()
-            inference_module = builder.load()
 
         if DeepSpeedDiffusersAttention.layer_id == 1:
             log_dist(f"DeepSpeed-Attention config: {self.config.__dict__}", [0])
@@ -171,26 +169,24 @@ def __init__(
         self.norm_factor *= math.sqrt(self.config.layer_id + 1)
         # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191
 
-        if self.config.dtype in [torch.float16, torch.int8]:
-            self.score_context_func = inference_module.softmax_context_fp16
-            self.linear_func = inference_module.linear_layer_fp16
-            self.allocate_workspace = inference_module.allocate_workspace_fp16
-        else:
-            self.score_context_func = inference_module.softmax_context_fp32
-            self.linear_func = inference_module.linear_layer_fp32
-            self.allocate_workspace = inference_module.allocate_workspace_fp32
+        self.workspace = WorkspaceOp(self.config)
+        self.score_context_func = SoftmaxContextOp(self.config)
+        self.linear_func = LinearOp(self.config)
+        self.pad_transform_func = PadTransformOp(self.config)
 
-    def forward(self, input, context=None, input_mask=None):
+    def allocate_workspace(self, size):
+        # Allocate memory only on first layer forward
         if self.config.layer_id == 0:
-            self.allocate_workspace(self.config.hidden_size, self.config.heads,
-                                    input.size()[1],
-                                    input.size()[0], DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False,
-                                    0, self.config.max_out_tokens, self.config.min_out_tokens)
-        output = DeepSpeedDiffusersAttentionFunction.apply(input, context, input_mask, self.config, self.attn_qkvw,
-                                                           self.attn_qw, self.attn_kw, self.attn_vw, self.attn_qkvb,
-                                                           self.num_attention_heads_per_partition, self.norm_factor,
-                                                           self.hidden_size_per_partition, self.attn_ow, self.attn_ob,
-                                                           self.do_out_bias, self.score_context_func, self.linear_func,
-                                                           self.triton_flash_attn_kernel, self.config.rope_theta)
+            self.workspace.allocate_workspace(self.config.hidden_size, self.config.heads, size[1], size[0],
+                                              DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, 0,
+                                              self.config.max_out_tokens, self.config.min_out_tokens)
+
+    def forward(self, input, context=None, input_mask=None):
+        self.allocate_workspace(input.size())
+        output = DeepSpeedDiffusersAttentionFunction.apply(
+            input, context, input_mask, self.config, self.attn_qkvw, self.attn_qw, self.attn_kw, self.attn_vw,
+            self.attn_qkvb, self.num_attention_heads_per_partition, self.norm_factor, self.hidden_size_per_partition,
+            self.attn_ow, self.attn_ob, self.do_out_bias, self.score_context_func, self.linear_func,
+            self.pad_transform_func, self.triton_flash_attn_kernel, self.config.rope_theta)
 
         return output

deepspeed/ops/transformer/inference/diffusers_transformer_block.py

+8-25
@@ -10,26 +10,9 @@
 from .diffusers_attention import DeepSpeedDiffusersAttention
 from .bias_add import nhwc_bias_add
 from .diffusers_2d_transformer import Diffusers2DTransformerConfig
-from deepspeed.ops.op_builder import InferenceBuilder, SpatialInferenceBuilder
 from deepspeed.utils.types import ActivationFuncType
-
-# Ops will be loaded on demand
-transformer_cuda_module = None
-spatial_cuda_module = None
-
-
-def load_transformer_module():
-    global transformer_cuda_module
-    if transformer_cuda_module is None:
-        transformer_cuda_module = InferenceBuilder().load()
-    return transformer_cuda_module
-
-
-def load_spatial_module():
-    global spatial_cuda_module
-    if spatial_cuda_module is None:
-        spatial_cuda_module = SpatialInferenceBuilder().load()
-    return spatial_cuda_module
+from .op_binding.gated_activation import GatedActivationOp
+from .op_binding.layer_norm import LayerNormOp
 
 
 class DeepSpeedDiffusersTransformerBlock(nn.Module):
@@ -76,8 +59,8 @@ def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerConfig):
         else:
             self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), requires_grad=False)
 
-        self.transformer_cuda_module = load_transformer_module()
-        load_spatial_module()
+        self.gated_activation = GatedActivationOp()
+        self.layer_norm = LayerNormOp()
 
     def forward(self, hidden_states, context=None, timestep=None, **kwargs):
         # In v0.12.0 of diffuser, several new kwargs were added. Capturing
@@ -88,17 +71,17 @@ def forward(self, hidden_states, context=None, timestep=None, **kwargs):
         if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] is not None:
             context = kwargs["encoder_hidden_states"]
 
-        out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps)
+        out_norm_1 = self.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps)
         out_attn_1 = self.attn_1(out_norm_1)
 
-        out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(
+        out_norm_2, out_attn_1 = self.layer_norm.layer_norm_residual_store_pre_ln_res(
             out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps)
         out_attn_2 = self.attn_2(out_norm_2, context=context)
-        out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(
+        out_norm_3, out_attn_2 = self.layer_norm.layer_norm_residual_store_pre_ln_res(
             out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps)
 
         out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w)
-        out_geglu = self.transformer_cuda_module.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU)
+        out_geglu = self.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU)
 
         out_ff2 = nn.functional.linear(out_geglu, self.ff2_w)
         return nhwc_bias_add(out_ff2, self.ff2_b, other=out_attn_2)
