jmquintana79
5/19/2016 - 3:21 PM

filtering pandas

Filter data stored into pandas dataframe

""" Filter by condition """
# conditional operators for pandas:
# EQUAL: ==
# DIFFERENT TO: !=
# BIGGER THAN: >
# LOWER THAN: <
# BIGGER OR EQUAL TO: >=
# LOWER OR EQUEAL TO: <=
# AND: &
# OR: |

# condition is set through single values
DF_filtered = DF[DF.column == value] # or other condition
DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition



""" Filtering by "isin()" method """
# condition is set through multiple values

#When values is a list:
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df.isin([1, 3, 12, 'a'])
"""
       A      B
0   True   True
1  False  False
2   True  False
"""

# When values is a dict:
df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
df.isin({'A': [1, 3], 'B': [4, 7, 12]})
"""
       A      B
0   True  False  # Note that B didn't match the 1 here.
1  False   True
2   True   True
"""

# When values is a Series or DataFrame:
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
df.isin(other)
"""
       A      B
0   True  False
1  False  False  # Column A in `other` has a 3, but not at index 1.
2   True   True
"""

# filter dataframe through a condition with values of list
df_filtered = df[df.column.isin(list_values_to_filter)]

# filter dataframe through a condition with values out of list
df_filtered = df[~df.column.isin(list_values_to_filter)]

# filter according to a column values if appear in a list
df[df.columna.isin(list_targets)]

# filter according to a string column value contain a substring
df[df.columna.map(lambda x: 'substr' in x)]
df[df.COLUMN.str.contains('substring')]

## filtering nan values
lindex = list(df[np.isnan(df["column"])].index) # only possible with float values
lindex = np.where(pd.isnull(df["column"])) # only possible with float values
lindex = df[pd.isnull(df["column"])].index.tolist() # always possible
df_filtered = df[~df.index.isin(lindex)]

## other way
df_filtered = df.dropna(inplace=False)
# dropping rows with NaN values in only a list of column
df.dropna(subset=['column_name'])


""" duplicated """
# return True / False column
DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’
# filter duplicated values
DF = DF[~DF['duplicated']]

# DROP DUPLICATES
df.drop_duplicates(['col1','col2'],keep='first',inplace=True)