# Author: vgrabovets
# Date: 4/28/2017 - 3:21 PM
#
# DataFrame multiprocessing: apply a function to chunks of a pandas
# DataFrame in parallel worker processes.

from multiprocessing import Pool

import numpy as np
import pandas as pd

def parallelize_dataframe(df, func, num_partitions=128, num_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in a pool of worker processes.

    ``df`` is cut into contiguous row chunks, each chunk is handed to ``func``
    in a worker process, and the returned pieces are concatenated and sorted
    by index.

    Args:
        df: pandas object to process; must support ``len()`` and ``.iloc``
            row slicing.
        func: Callable taking one chunk and returning a pandas object that
            ``pd.concat`` accepts. It is sent to the worker processes, so it
            must be picklable (a module-level function, not a lambda).
        num_partitions: Maximum number of chunks. Capped at the number of
            rows so ``func`` never receives an empty chunk unless ``df``
            itself is empty.
        num_cores: Number of worker processes, passed to ``Pool``.

    Returns:
        The concatenation of every ``func(chunk)`` result, sorted by index.

    Raises:
        ValueError: If ``num_partitions`` is less than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    n_rows = len(df)
    # Never create more chunks than rows: an empty chunk can make ``func``
    # return a differently-shaped object (e.g. a row-wise ``apply`` yields a
    # DataFrame instead of a Series), which corrupts the concatenated result.
    n_chunks = max(1, min(num_partitions, n_rows))

    # Same sizing rule as ``np.array_split``: the first ``extra`` chunks get
    # one additional row. Slicing with ``.iloc`` avoids routing the DataFrame
    # through ``np.array_split``, which relies on the deprecated
    # ``DataFrame.swapaxes``.
    base, extra = divmod(n_rows, n_chunks)
    sizes = [base + 1] * extra + [base] * (n_chunks - extra)
    bounds = np.cumsum([0] + sizes).tolist()
    chunks = [df.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

    # The context manager tears the pool down even if ``func`` raises, so no
    # worker processes are left behind.
    with Pool(num_cores) as pool:
        results = pool.map(func, chunks)

    return pd.concat(results).sort_index()

def parallelize_preprocess_name(df):
    """Run ``preprocess_name_test`` on every row of ``df``.

    Each row's ``'NOMEN_LONG'`` and ``'LIBNJ'`` values are passed, in that
    order, to ``preprocess_name_test``; the row-wise ``apply`` result is
    returned unchanged.
    """
    def _process_row(row):
        # One call per row, fed from the two name columns.
        return preprocess_name_test(row['NOMEN_LONG'], row['LIBNJ'])

    return df.apply(_process_row, axis=1)
    
# Example usage. The guard keeps worker processes from re-running this line
# when they import the main module (needed with the "spawn" start method).
if __name__ == "__main__":
    df_result = parallelize_dataframe(df[['NOMEN_LONG', 'LIBNJ']], parallelize_preprocess_name)