CodeCollection2018
1/19/2019 - 3:41 PM

填充不同长度的seq并构建掩码矩阵

def prepare_data(seqs, labels):
	"""
	Pad variable-length sequences into one matrix and build the matching mask.

	Every sequence is right-padded with zeros up to the length of the longest
	sequence in ``seqs``. The result is time-major: each sequence occupies one
	column, so both returned arrays have shape ``(maxlen, n_samples)``.

	Args:
		seqs: list of sequences of integer token ids. Id 0 is treated as the
			padding id, so a 0 that appears inside a sequence is masked out
			exactly like padding.
		labels: labels belonging to ``seqs``; passed through unchanged.

	Returns:
		A tuple ``(x, x_mask, labels)``. ``x`` is the int64 padded matrix and
		``x_mask`` is a ``theano.config.floatX`` matrix holding 1 wherever
		``x`` is non-zero and 0 elsewhere. For an empty ``seqs`` both arrays
		have shape ``(0, 0)``.
	"""
	lengths = [len(s) for s in seqs]
	n_samples = len(seqs)
	# numpy.max raises on an empty list, so guard the no-sequence case.
	maxlen = max(lengths) if lengths else 0

	x = numpy.zeros((maxlen, n_samples), dtype='int64')
	for idx, (s, length) in enumerate(zip(seqs, lengths)):
		# Column idx holds sequence idx; rows past `length` stay zero (padding).
		x[:length, idx] = s

	# Mask trick: padding is the only place x is expected to be 0, so the
	# non-zero pattern of x is the mask. The config attribute is `floatX`.
	x_mask = (x != 0).astype(theano.config.floatX)

	return x, x_mask, labels