分箱.py

7/1/2017 - 2:21 PM

# Add a rank number based on the ordering of a single column (largest value ranks first).
# method='dense': ties share a rank and the next rank follows without a gap
#   (1, 1, 2, 2, 3, ...).
# method='min': ties share the lowest rank of their group and later ranks skip ahead
#   (three tied for first, then two tied -> 1, 1, 1, 4, 4, ...).
# method='first': ties are ranked in order of appearance; the first occurrence gets
#   the lowest rank and each later equal value is one higher.
data['sort_num'] = data['comment_num'].rank(method='dense', ascending=False)

# Add a rank number based on the ordering inside each group (groups come from 'cate').
data['group_sort'] = data['comment_num'].groupby(by=data['cate']).rank(method='dense', ascending=False)

分类变量编码.py

#有时，我们会面对要改动分类变量的情况。原因可能是：
#有些算法（如逻辑回归 Logistic Regression）要求所有输入项目是数字形式。所以分类变量常被编码为 0, 1, …, (n-1)
#有时同一个分类变量可能会有两种表现方式。如，温度可能被标记为“High”， “Medium”， “Low”，“H”， “low”。这里 “High” 和 “H”都代表同一类别。同理， “Low” 和“low”也是同一类别。但Python会把它们当作不同的类别。
#一些类别的频数非常低，把它们归为一类是个好主意。
#这里我们定义了一个函数，以字典的方式输入数值，用‘replace’函数进行编码。

#使用Pandas replace函数定义新函数：
def coding(col, codeDict):
    """Return a recoded copy of ``col`` with values replaced per ``codeDict``.

    Args:
        col: Values to recode; anything ``pd.Series`` accepts. It is copied
            first, so the caller's object is not modified.
        codeDict: Mapping of ``old_value -> new_value``. Pairs are applied one
            at a time in the mapping's iteration order, so a value produced by
            an earlier pair can be replaced again by a later pair.

    Returns:
        A new ``pd.Series`` with every pair of ``codeDict`` applied. With an
        empty mapping this is simply a copy of ``col``.
    """
    colCoded = pd.Series(col, copy=True)
    for key, value in codeDict.items():
        colCoded = colCoded.replace(key, value)
    # Return only after the loop: returning inside it would stop after the
    # first pair and yield None for an empty mapping.
    return colCoded
# Encode the loan status column Loan_Status as Y=1, N=0:
data["Loan_Status_Coded"] = coding(data["Loan_Status"], {'N':0,'Y':1})
# print(...) with a single argument parses under both Python 2 and Python 3;
# Series.value_counts() replaces the deprecated module-level pd.value_counts.
print(data["Loan_Status_Coded"].value_counts())

分箱.py

#有时把数值聚集在一起更有意义。例如，如果我们要为交通状况（路上的汽车数量）根据时间（分钟数据）建模。
#具体的分钟可能不重要，而时段如“上午”“下午”“傍晚”“夜间”“深夜”更有利于预测。
#如此建模更直观，也能避免过度拟合。

def binning(col, cut_points, labels=None):
    """Bin the values of ``col`` at ``cut_points`` using ``pd.cut``.

    The bin edges are ``col.min()``, then every cut point, then ``col.max()``,
    so ``len(cut_points) + 1`` bins are produced. Bins are closed on the right
    and the first bin also includes the minimum (``include_lowest=True``).

    Args:
        col: Object exposing ``min()`` / ``max()`` that ``pd.cut`` accepts
            (the usage in this file passes a DataFrame column).
        cut_points: Interior bin edges as any iterable (list, tuple, array).
            ``pd.cut`` requires the combined edges to be increasing, so the
            cut points must lie strictly between the column's min and max.
        labels: One label per bin. When omitted (``None``, ``False`` or an
            empty sequence) the bins are labelled ``0 .. n-1``.

    Returns:
        The result of ``pd.cut``: the bin label for each value of ``col``.
    """
    minval = col.min()
    maxval = col.max()
    # Build the list of bin edges from the min, the cut points and the max.
    # list() lets cut_points be a tuple or array, not only a list.
    break_points = [minval] + list(cut_points) + [maxval]
    # Without labels, fall back to the default labels 0 ... (n-1).
    # An explicit check (not truthiness) keeps array-like labels usable.
    if labels is None or labels is False or len(labels) == 0:
        labels = list(range(len(break_points) - 1))
    # Bin with pandas' cut.
    colBin = pd.cut(col, bins=break_points, labels=labels, include_lowest=True)
    return colBin
# Bin the order hour of day into four named periods:
cut_points = [6,12,18]
labels = ["night","morning","afernoon","evening"]
orders["hour_Bin"] = binning(orders["order_hour_of_day"], cut_points, labels)
# print(...) with a single argument parses under both Python 2 and Python 3;
# Series.value_counts() replaces the deprecated module-level pd.value_counts.
# sort=False leaves the counts unsorted by frequency.
print(orders["hour_Bin"].value_counts(sort=False))

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

分箱.py