@@ -72,18 +72,26 @@ class HFOPTLayerPolicy(TransformerPolicy):
     _orig_layer_class = None
 
     def __init__(self, client_module, inference=True, use_load_prefix=True):
-        super().__init__(inference,
-                         linear_layer=True,
-                         mlp_act_func_type=ActivationFuncType.ReLU,
-                         pre_attn_norm=True,
-                         use_load_prefix=use_load_prefix)
+        super().__init__(inference, linear_layer=True, pre_attn_norm=True, use_load_prefix=use_load_prefix)
         self.client_module = client_module
         try:
             import transformers
             HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer
         except:
             HFOPTLayerPolicy._orig_layer_class = None
 
+        if hasattr(TransformerPolicy, "hf_model_config") and hasattr(TransformerPolicy.hf_model_config,
+                                                                     "activation_function"):
+            if TransformerPolicy.hf_model_config.activation_function == "relu":
+                self.mlp_act_func_type = ActivationFuncType.ReLU
+            elif TransformerPolicy.hf_model_config.activation_function in ["gelu", "gelu_new"]:
+                self.mlp_act_func_type = ActivationFuncType.GELU
+            else:
+                raise ValueError("Unsupported activation function: {}".format(
+                    TransformerPolicy.hf_model_config.activation_function))
+        else:
+            self.mlp_act_func_type = ActivationFuncType.ReLU  # default
+
     def get_hidden_heads(self):
         return self.client_module.self_attn.embed_dim, \
                self.client_module.self_attn.num_heads, \
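
For reference, the standalone sketch below (not part of the diff) mirrors the activation-selection branch added in __init__. ActivationFuncType is stubbed here with a local Enum so the snippet runs without DeepSpeed installed, and select_mlp_act is a hypothetical helper name used only for illustration; the real policy reads TransformerPolicy.hf_model_config and assigns self.mlp_act_func_type directly.

# Standalone sketch of the activation selection added above.
from enum import Enum


class ActivationFuncType(Enum):  # local stand-in, not the DeepSpeed enum
    ReLU = 1
    GELU = 2


def select_mlp_act(activation_function: str) -> ActivationFuncType:
    # Same mapping as the new branch in HFOPTLayerPolicy.__init__:
    # "relu" -> ReLU, "gelu"/"gelu_new" -> GELU, anything else is rejected.
    if activation_function == "relu":
        return ActivationFuncType.ReLU
    if activation_function in ["gelu", "gelu_new"]:
        return ActivationFuncType.GELU
    raise ValueError("Unsupported activation function: {}".format(activation_function))


assert select_mlp_act("relu") is ActivationFuncType.ReLU
assert select_mlp_act("gelu_new") is ActivationFuncType.GELU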