jmquintana79
6/1/2017 - 1:41 AM

Check percent of missing values and the trend of holes in df column and if they are consecutive or not.

Check percent of missing values and the trend of holes in df column and if they are consecutive or not.

## check percent of nan values in pandas df
def check_nans(DATA,col_total):
    print('Shape:', DATA[col_total].shape)
    nas = [x for x in DATA[col_total].columns.values if DATA[x].isnull().sum() > 0]
    print('Cols with NAs:', len(nas))
    if len(nas)>0: 
        for x in nas: 
            print(x, ':{:.2f}% of NAs'.format(DATA[x].isnull().sum()/float(len(DATA))*100))
    return None
    

## check trend of holes (consecutive or not)
def check_trend_holes(DF,svar,sdatetime):
    import matplotlib.pyplot as plt
    import copy
    
    # copy 
    AUX = copy.deepcopy(DF[[svar]])

    # reset index
    AUX.reset_index(drop=False, inplace=True)
    AUX.reset_index(drop=False, inplace=True)

    # get 1st and last dates
    dt1,dt2 = AUX[sdatetime].tolist()[0],AUX[sdatetime].tolist()[-1]
    sdt1, sdt2 = dt1.strftime("%Y-%m-%d %H:%M"),dt2.strftime("%Y-%m-%d %H:%M")
    
    # filtering nan values
    lindex = list(AUX[np.isnan(AUX[svar])].index)
    AUX = AUX[AUX.index.isin(lindex)]

    # check consecutive holes
    AUX.loc[(AUX['index'].shift(-1) - AUX['index'] == 1) | (AUX['index'].shift(1) - AUX['index'] == -1), 'isconsecutive'] = True

    # display results
    stitle = 'Period: %s - %s (Total: %s) \nHOLES: Total holes = %s / Consecutive holes = %s / Non-Consecutive holes = %s'%(sdt1,sdt2,len(DF)
                                                                                    len(AUX),
                                                                                    len(AUX[AUX.isconsecutive==True]),
                                                                                    len(AUX[AUX.isconsecutive!=True]))
    
    # plot                                                                     
    pd.isnull(DF[svar]).plot(figsize=(20,3))
    plt.title(stitle,fontsize=22)
    plt.xticks(rotation='horizontal', fontsize=16)
    plt.show()
    
    # clean
    del(AUX)
                                                                                       
    # return
    return None