How the function is imported:
from paddleslim.nas.ofa.utils import nlp_utils
The source of the function (paddleslim/nas/ofa/utils/nlp_utils.py):
import numpy as np   # used as np in the function body
import paddle

def compute_neuron_head_importance(task_name,
                                   model,
                                   data_loader,
                                   num_layers,
                                   num_heads,
                                   loss_fct=paddle.nn.loss.CrossEntropyLoss(),
                                   intermediate_name='linear1',
                                   output_name='linear2'):
    """
    Compute the importance of multi-head attention and feed-forward neurons in each transformer layer.
    Args:
        task_name(str): task name.
        model(paddle.nn.Layer): the instance of the transformer model.
        data_loader(DataLoader): an iterable data loader used for evaluation. An instance of `paddle.io.DataLoader`.
        num_layers(int): number of transformer layers.
        num_heads(int): number of heads in each multi-head attention.
        loss_fct(Loss|optional): loss function; can be a `paddle.nn.Layer` instance. Default: `nn.loss.CrossEntropyLoss()`.
        intermediate_name(str|optional): name of the intermediate `Linear` layer in the feed-forward block. Default: `linear1`.
        output_name(str|optional): name of the output `Linear` layer in the feed-forward block. Default: `linear2`.
    """
    # head_mask takes part in the forward pass, so its gradient measures head importance.
    head_importance = paddle.zeros(
        shape=[num_layers, num_heads], dtype='float32')
    head_mask = paddle.ones(shape=[num_layers, num_heads], dtype='float32')
    head_mask.stop_gradient = False

    # Collect the feed-forward Linear weights/biases by parameter name.
    intermediate_weight = []
    intermediate_bias = []
    output_weight = []
    for name, w in model.named_parameters():
        if intermediate_name in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)
        if output_name in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    if task_name.lower() != 'mnli':
        data_loader = (data_loader, )
    for data in data_loader:
        for batch in data:
            if isinstance(batch, dict):
                input_ids, segment_ids, labels = batch['input_ids'], batch[
                    'token_type_ids'], batch['labels']
            else:
                input_ids, segment_ids, labels = batch
            logits = model(
                input_ids, segment_ids, attention_mask=[None, head_mask])
            loss = loss_fct(logits, labels)
            loss.backward()
            # Accumulate |grad| of the head mask as head importance.
            head_importance += paddle.abs(
                paddle.to_tensor(head_mask.gradient()))
            # Accumulate first-order (weight * gradient) scores as neuron importance.
            for w1, b1, w2, current_importance in zip(
                    intermediate_weight, intermediate_bias, output_weight,
                    neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) + b1.numpy() *
                     b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
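As an aside, the per-neuron accumulation in the loop above boils down to a first-order (weight times gradient) Taylor score over the feed-forward weights. The snippet below is only a minimal NumPy sketch of that arithmetic, with made-up toy shapes and random stand-ins for the gradients; it is not paddleslim code.

    import numpy as np

    rng = np.random.default_rng(0)
    hidden, ffn = 8, 32                        # toy sizes, chosen arbitrarily

    w1, dw1 = rng.normal(size=(hidden, ffn)), rng.normal(size=(hidden, ffn))  # linear1 weight and its gradient
    b1, db1 = rng.normal(size=ffn), rng.normal(size=ffn)                      # linear1 bias and its gradient
    w2, dw2 = rng.normal(size=(ffn, hidden)), rng.normal(size=(ffn, hidden))  # linear2 weight and its gradient

    neuron_importance = np.zeros(ffn, dtype='float32')
    neuron_importance += np.abs(np.sum(w1 * dw1, axis=0) + b1 * db1)  # intermediate Linear contribution
    neuron_importance += np.abs(np.sum(w2 * dw2, axis=1))             # output Linear contribution
    print(neuron_importance.shape)             # (32,): one score per feed-forward neuron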
When using this function, I ran into the following error:
AttributeError Traceback (most recent call last)
Cell In[46], line 180
172 dev_batch_sampler = paddle.io.BatchSampler(
173 dev_ds, batch_size=4, shuffle=False)
174 dev_data_loader = DataLoader(
175 dataset=dev_ds,
176 #batch_sampler=dev_batch_sampler,
177 #collate_fn=batchify_fn
178 )
--> 180 head_importance, neuron_importance = nlp_utils.compute_neuron_head_importance(
181 task_name='cluewsc2020',
182 model=ofa_model.model,
183 data_loader=dev_ds,
184 loss_fct=paddle.nn.loss.CrossEntropyLoss(
185 ) if [True,False] else paddle.nn.loss.MSELoss(),
186 num_layers=model.ppminilm.config['num_hidden_layers'],
187 num_heads=model.ppminilm.config['num_attention_heads'])
189 # 重新组合参数的顺序
190 reorder_neuron_head(ofa_model.model, head_importance, neuron_importance)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddleslim/nas/ofa/utils/nlp_utils.py:76, in compute_neuron_head_importance(task_name, model, data_loader, num_layers, num_heads, loss_fct, intermediate_name, output_name)
74 else:
75 input_ids, segment_ids, labels = batch
---> 76 logits = model(
77 input_ids, segment_ids, attention_mask=[None, head_mask])
78 loss = loss_fct(logits, labels)
79 loss.backward()
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1426, in Layer.__call__(self, *inputs, **kwargs)
1417 if (
1418 (not in_to_static_mode())
1419 and (not self._forward_pre_hooks)
(...)
1423 and (not in_profiler_mode())
1424 ):
1425 self._build_once(*inputs, **kwargs)
-> 1426 return self.forward(*inputs, **kwargs)
1427 else:
1428 return self._dygraph_call_func(*inputs, **kwargs)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddlenlp/transformers/ppminilm/modeling.py:300, in PPMiniLMForSequenceClassification.forward(self, input_ids, token_type_ids, position_ids, attention_mask)
270 def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
271 r"""
272 Args:
273 input_ids (Tensor):
(...)
298
299 """
--> 300 _, pooled_output = self.ppminilm(
301 input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask
302 )
304 pooled_output = self.dropout(pooled_output)
305 logits = self.classifier(pooled_output)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddle/nn/layer/layers.py:1426, in Layer.__call__(self, *inputs, **kwargs)
1417 if (
1418 (not in_to_static_mode())
1419 and (not self._forward_pre_hooks)
(...)
1423 and (not in_profiler_mode())
1424 ):
1425 self._build_once(*inputs, **kwargs)
-> 1426 return self.forward(*inputs, **kwargs)
1427 else:
1428 return self._dygraph_call_func(*inputs, **kwargs)
File /opt/conda/envs/python35-paddle120-env/lib/python3.10/site-packages/paddlenlp/transformers/ppminilm/modeling.py:230, in PPMiniLMModel.forward(self, input_ids, token_type_ids, position_ids, attention_mask)
226 attention_mask = paddle.unsqueeze(
227 (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2]
228 )
229 else:
--> 230 if attention_mask.ndim == 2:
231 # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length]
232 attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype())
233 attention_mask = (1.0 - attention_mask) * -1e4
AttributeError: 'list' object has no attribute 'ndim'
After digging into this, I believe the problem is in how the function builds attention_mask:

    logits = model(
        input_ids, segment_ids, attention_mask=[None, head_mask])

Here attention_mask is passed as the Python list [None, head_mask], and that is exactly what triggers the error: PPMiniLMModel.forward (modeling.py:230 in the traceback) treats attention_mask as a tensor and immediately reads attention_mask.ndim, which a plain list does not have.
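For completeness, the type mismatch can be reproduced without the model at all. The snippet below is only an illustration of the failure mode, using arbitrary toy sizes for the head mask; it is not a fix.

    import paddle

    num_layers, num_heads = 6, 12              # toy sizes, for illustration only
    head_mask = paddle.ones([num_layers, num_heads], dtype='float32')
    head_mask.stop_gradient = False

    attention_mask = [None, head_mask]         # what compute_neuron_head_importance passes
    try:
        attention_mask.ndim                    # first thing PPMiniLMModel.forward reads
    except AttributeError as e:
        print(e)                               # 'list' object has no attribute 'ndim'

So the utility appears to assume a model whose forward can accept attention_mask as an [attention_mask, head_mask] pair, which PPMiniLM's forward does not seem to handle.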