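# Recommended value for scale: 0.0001
# Adds a sparsity penalty term to the gradient used in the descent step x_{k+1} = x_{k} - \alpha_{k} * g^{k}
# The smaller a weight in m.weight.data, the smaller its effect on the gradient, which indicates the corresponding channel is less important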
def updateBN(scale, model):
    """Add an L1 sparsity subgradient to the gradients of BatchNorm2d weights.

    For every ``nn.BatchNorm2d`` module in ``model`` the term
    ``scale * sign(weight)`` is added in place to ``weight.grad``, which is
    the subgradient of ``scale * |weight|``.

    Layers without a learnable weight (``affine=False``) and layers whose
    gradient has not been populated are skipped instead of raising
    ``AttributeError``.

    Args:
        scale: Coefficient of the L1 penalty.
        model: Module whose ``modules()`` are scanned for BatchNorm2d layers.

    Returns:
        None. Gradients are modified in place.

    NOTE(review): presumably meant to be called after ``backward()`` and
    before the optimizer step -- confirm against the caller.
    """
    for m in model.modules():
        if not isinstance(m, nn.BatchNorm2d):
            continue
        weight = m.weight
        # affine=False layers have no weight; frozen or unused layers have
        # no gradient yet. Neither can take the penalty.
        if weight is None or weight.grad is None:
            continue
        # no_grad replaces the legacy ``.data`` access: the in-place update
        # must not be recorded by autograd.
        with torch.no_grad():
            weight.grad.add_(scale * torch.sign(weight))  # L1 regularization