LeonardAukea
11/15/2017 - 12:37 AM

PySpark: dual explode generator

PySpark: dual explode generator (explodes two list columns in lockstep, one output row per element pair)

def dualExplode(row):
    """Turn one row holding two parallel lists into one row per element pair.

    The lists are read from the 'x' and 'y' fields of the input row. Each
    emitted row keeps every other field of the input unchanged and carries
    one element of 'x' under 'category_ids' and the matching element of 'y'
    under 'weights'. The 'x' and 'y' fields themselves are not carried over.

    Pairing is done with zip(), so output stops at the end of the shorter
    list; surplus elements of the longer list are dropped silently.

    Args:
        row: object exposing asDict() that returns a dict containing the
            keys 'x' and 'y' (presumably a pyspark.sql.Row -- confirm
            against the caller).
    Yield:
        Row built from the remaining fields plus 'category_ids' and
        'weights'.
    Raises:
        KeyError: on first iteration, if 'x' or 'y' is missing.
    """
    shared_fields = row.asDict()
    # Remove the list columns so only the pass-through fields remain.
    category_id_list = shared_fields.pop('x')
    weight_list = shared_fields.pop('y')
    for category_id, weight in zip(category_id_list, weight_list):
        # dict(mapping, **extra) makes a fresh copy per output row, so the
        # shared fields are never mutated between iterations.
        yield Row(**dict(shared_fields, category_ids=category_id, weights=weight))
 
# Example usage: run dualExplode over every row of df through its RDD
# (flatMap flattens the rows each call yields into a single RDD), then
# turn the result back into a DataFrame.
exploded_df = sqlContext.createDataFrame(
    df.rdd.flatMap(dualExplode)
)