jmquintana79
6/1/2017 - 1:41 AM

## Check percent of missing values and the trend of holes in df column and if they are consecutive or not.

Check percent of missing values and the trend of holes in df column and if they are consecutive or not.

``````## check percent of nan values in pandas df
def check_nans(DATA,col_total):
print('Shape:', DATA[col_total].shape)
nas = [x for x in DATA[col_total].columns.values if DATA[x].isnull().sum() > 0]
print('Cols with NAs:', len(nas))
if len(nas)>0:
for x in nas:
print(x, ':{:.2f}% of NAs'.format(DATA[x].isnull().sum()/float(len(DATA))*100))
return None

## check trend of holes (consecutive or not)
def check_trend_holes(DF,svar,sdatetime):
import matplotlib.pyplot as plt
import copy

# copy
AUX = copy.deepcopy(DF[[svar]])

# reset index
AUX.reset_index(drop=False, inplace=True)
AUX.reset_index(drop=False, inplace=True)

# get 1st and last dates
dt1,dt2 = AUX[sdatetime].tolist()[0],AUX[sdatetime].tolist()[-1]
sdt1, sdt2 = dt1.strftime("%Y-%m-%d %H:%M"),dt2.strftime("%Y-%m-%d %H:%M")

# filtering nan values
lindex = list(AUX[np.isnan(AUX[svar])].index)
AUX = AUX[AUX.index.isin(lindex)]

# check consecutive holes
AUX.loc[(AUX['index'].shift(-1) - AUX['index'] == 1) | (AUX['index'].shift(1) - AUX['index'] == -1), 'isconsecutive'] = True

# display results
stitle = 'Period: %s - %s (Total: %s) \nHOLES: Total holes = %s / Consecutive holes = %s / Non-Consecutive holes = %s'%(sdt1,sdt2,len(DF)
len(AUX),
len(AUX[AUX.isconsecutive==True]),
len(AUX[AUX.isconsecutive!=True]))

# plot
pd.isnull(DF[svar]).plot(figsize=(20,3))
plt.title(stitle,fontsize=22)
plt.xticks(rotation='horizontal', fontsize=16)
plt.show()

# clean
del(AUX)

# return
return None``````