# Parallel processing of a pandas DataFrame with multiprocessing
from multiprocessing import Pool

import numpy as np
import pandas as pd
def parallelize_dataframe(df, func, num_partitions=128, num_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel worker processes.

    ``df`` is cut into contiguous row chunks, every chunk is passed to
    ``func`` through a ``multiprocessing.Pool``, and the per-chunk results
    are concatenated and sorted by index.

    Args:
        df: pandas DataFrame (or Series) to process.
        func: Callable taking one chunk and returning a pandas object that
            ``pd.concat`` accepts. It is sent to the worker processes, so
            it must be picklable (a module-level function, not a lambda).
        num_partitions: Upper bound on the number of chunks. Never more
            chunks than rows are created, so ``func`` is only handed an
            empty chunk when ``df`` itself is empty.
        num_cores: Number of worker processes in the pool.

    Returns:
        The concatenated results of ``func``, sorted by index.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # Cap at the row count: surplus partitions would only produce empty
    # chunks, which cost one task each and may make ``func`` fail or return
    # a result shaped differently from the non-empty chunks.
    chunk_count = max(1, min(num_partitions, len(df)))

    # Split the row positions rather than the frame itself and slice with
    # ``iloc``; the chunk sizes are the same as np.array_split would give.
    row_positions = np.array_split(np.arange(len(df)), chunk_count)
    chunks = [df.iloc[positions] for positions in row_positions]

    # The context manager shuts the pool down even when ``func`` raises.
    with Pool(num_cores) as pool:
        results = pool.map(func, chunks)

    return pd.concat(results).sort_index()
def parallelize_preprocess_name(df):
    """Run ``preprocess_name_test`` on every row of ``df``.

    Each row's ``'NOMEN_LONG'`` and ``'LIBNJ'`` values are passed, in that
    order, to ``preprocess_name_test``; the row-wise ``DataFrame.apply``
    result is returned unchanged.
    """
    def _process_row(row):
        # One call per row, fed with the two name columns.
        return preprocess_name_test(row['NOMEN_LONG'], row['LIBNJ'])

    return df.apply(_process_row, axis=1)
# Start the worker pool only when this file is executed directly: under the
# "spawn"/"forkserver" start methods each worker re-imports the main module,
# and an unguarded call here would try to start new pools from the workers.
# NOTE(review): `df` is expected to be defined before this point - confirm.
if __name__ == "__main__":
    # Only the two columns read by parallelize_preprocess_name are sent over.
    df_result = parallelize_dataframe(
        df[['NOMEN_LONG', 'LIBNJ']], parallelize_preprocess_name
    )