From 08f728d30ffdefe65bcd79be5622dc66dcf92040 Mon Sep 17 00:00:00 2001
From: gyou2021
Date: Wed, 18 Sep 2024 10:18:01 +0000
Subject: [PATCH 1/2] Enabled Qwen2-MoE Tensor Parallelism (TP) inference

---
 deepspeed/module_inject/auto_tp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
index e6eea2183de5..511a8f2f0c7d 100644
--- a/deepspeed/module_inject/auto_tp.py
+++ b/deepspeed/module_inject/auto_tp.py
@@ -473,7 +473,10 @@ def _replace_module(self, r_module, prev_name='', prev_class_name=''):
             if len(child._buffers) != 0 and self.state_dict is not None:
                 Loading.load_buffer(child, self.state_dict, checking_key)
             if child.__class__ in self.linear_policies:
-                setattr(r_module, name, self.linear_policies[child.__class__](child, prev_name + '.' + name,
+                if ('shared_expert_gate' not in checking_key and '.gate.' not in checking_key
+                        and 'qwen2_moe' in str(type(r_module))) or 'qwen2_moe' not in str(type(r_module)):
+                    setattr(
+                        r_module, name, self.linear_policies[child.__class__](child, prev_name + '.' + name,
                                                                               self.conv_linear_layer))
             elif any(isinstance(child, lp) for lp in self.linear_policies):
                 # Added for falcon model support

From f6e8637339587312a3496ffce52e1680f34899c8 Mon Sep 17 00:00:00 2001
From: gyou2021
Date: Wed, 18 Sep 2024 11:04:40 +0000
Subject: [PATCH 2/2] Enabled configurable auto Tensor Parallelism (TP) for
 inference of diverse models

---
 deepspeed/module_inject/auto_tp.py | 42 ++++++++++++------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
index 511a8f2f0c7d..c2a5b2063767 100644
--- a/deepspeed/module_inject/auto_tp.py
+++ b/deepspeed/module_inject/auto_tp.py
@@ -15,6 +15,8 @@
 from deepspeed.accelerator import get_accelerator
 from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw, shard_value_with_share_qk, shard_chunk_mlp
 from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list
+import os
+import ast
 
 
 def move(tensor, device):
@@ -270,6 +272,7 @@ def kernel_supported(module_list):
                 return True
         return False
 
+    ## tp parser based on autoTP config in environment
     def tp_parser(model):
         policy_list = []
         module_list = []
@@ -279,40 +282,27 @@ def tp_parser(model):
         module_list = AutoTP.get_module_list(model)
         assert AutoTP.supported(model), "AutoTP not supported for model. Please use kernel injection since container policy for model exists." \
         if AutoTP.kernel_supported(module_list) else "AutoTP not supported for model. Please provide policy."
-        norm_layer_name_list = ['LayerNorm', 'layer_norm', 'ln_1', 'ln_2']
-        #ln_1 , ln_2 for Qwen
+
+        allReduceLinearItems = os.environ['allReduceLinearItems']
+        allReduceLinearItems = ast.literal_eval(allReduceLinearItems)
+
         for module in module_list:
             for key, submodule in module._modules.items():
                 if isinstance(submodule, nn.Linear):
                     layer_list = layer_list + ["." + key]
-                elif isinstance(submodule, nn.LayerNorm) or key in norm_layer_name_list:
+                elif isinstance(submodule, nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm':
                     layer_list = layer_list + ["ln"]
                 else:
                     layer_list = layer_list + AutoTP.get_layers(key, submodule)
+
             for i, layer in enumerate(layer_list):
                 if layer == 'ln':
                     if layer_list[i - 1] != 'ln':
                         gem_list = gem_list + [layer_list[i - 1]]
-                elif 'out_proj' in layer:
-                    gem_list = gem_list + [layer]
-                elif 'o_proj' in layer:
-                    gem_list = gem_list + [layer]
-                elif 'down_proj' in layer:
-                    gem_list = gem_list + [layer]
-                elif 'attention.dense' in layer and 'GPTNeoX' in str(model):
-                    gem_list = gem_list + [layer]
-                elif 'self_attention.dense' in layer and 'falcon' in str(
-                        type(module)):  # this is a hack to get the right linear layer for this model!
-                    gem_list = gem_list + [layer]
-                # Mixtral-7x8b used w2*act(w1*w3) linear. need to replace w2 to linearallreduce.
-                elif 'w2' in layer and 'Mixtral' in str(type(module)):
-                    gem_list = gem_list + [layer]
-                elif 'self_attn.dense' in layer and 'Phi' in str(type(module)):
-                    gem_list = gem_list + [layer]
-                elif 'self_attention.dense' in layer and 'ChatGLM' in str(model):
-                    gem_list = gem_list + [layer]
-                elif 'dense_4h_to_h' in layer and 'ChatGLM' in str(model):
-                    gem_list = gem_list + [layer]
+                    continue
+                for item in allReduceLinearItems:
+                    if item in layer:
+                        gem_list = gem_list + [layer]
 
             layer_list = []
             if gem_list != []:
@@ -473,8 +463,10 @@ def _replace_module(self, r_module, prev_name='', prev_class_name=''):
             if len(child._buffers) != 0 and self.state_dict is not None:
                 Loading.load_buffer(child, self.state_dict, checking_key)
             if child.__class__ in self.linear_policies:
-                if ('shared_expert_gate' not in checking_key and '.gate.' not in checking_key
-                        and 'qwen2_moe' in str(type(r_module))) or 'qwen2_moe' not in str(type(r_module)):
+                keepLinearItems = os.environ['keepLinearItems']
+                keepLinearItems = ast.literal_eval(keepLinearItems)
+
+                if all(item not in checking_key for item in keepLinearItems):
                     setattr(
                         r_module, name, self.linear_policies[child.__class__](child, prev_name + '.' + name,
                                                                               self.conv_linear_layer))
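
Usage note (illustrative, not part of the patches): both environment variables are parsed with ast.literal_eval, so each must hold a Python-list literal, and since auto_tp.py reads them unconditionally they both have to be set (at least to "[]") whenever AutoTP runs with these patches applied. The sketch below shows one hypothetical way to drive AutoTP inference with them; the model name, tp_size and the layer-name substrings are assumptions chosen for a Qwen2-MoE style checkpoint, not values defined by the patches.

import os

# Values must be Python-list literals because auto_tp.py parses them with
# ast.literal_eval. The substrings below are assumptions for a Qwen2-MoE
# style model, not defaults shipped with the patches.

# Linear layers whose names contain one of these substrings are turned into
# row-parallel LinearAllreduce layers by tp_parser.
os.environ['allReduceLinearItems'] = "['o_proj', 'down_proj']"

# Linear layers whose names contain one of these substrings are kept as plain
# nn.Linear by _replace_module (e.g. the small MoE routing gates from patch 1/2).
os.environ['keepLinearItems'] = "['shared_expert_gate', '.gate.']"

import torch
import deepspeed
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen1.5-MoE-A2.7B', torch_dtype=torch.float16)
# AutoTP is the path taken when kernel injection is disabled.
# Launch with the DeepSpeed launcher, e.g.: deepspeed --num_gpus 2 run_inference.py
model = deepspeed.init_inference(model,
                                 tensor_parallel={'tp_size': 2},
                                 dtype=torch.float16,
                                 replace_with_kernel_inject=False)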
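
For other architectures, the model-specific behaviour of the elif chain removed from tp_parser can be reproduced through allReduceLinearItems alone. The mapping below is a hedged reconstruction from the deleted lines ('out_proj', 'o_proj' and 'down_proj' were matched for every model, the remaining substrings were model-specific); the dictionary and its name are illustrative, not something the patches define.

import os

# Reconstructed from the removed hard-coded checks in tp_parser; these are
# assumptions for illustration, not shipped defaults.
all_reduce_items_per_model = {
    'GPTNeoX': "['out_proj', 'o_proj', 'down_proj', 'attention.dense']",
    'falcon': "['out_proj', 'o_proj', 'down_proj', 'self_attention.dense']",
    'Mixtral': "['out_proj', 'o_proj', 'down_proj', 'w2']",  # w2 of w2*act(w1*w3)
    'Phi': "['out_proj', 'o_proj', 'down_proj', 'self_attn.dense']",
    'ChatGLM': "['out_proj', 'o_proj', 'down_proj', 'self_attention.dense', 'dense_4h_to_h']",
}

# Pick the entry that matches the model being served, e.g. for Mixtral:
os.environ['allReduceLinearItems'] = all_reduce_items_per_model['Mixtral']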