alathrop
8/13/2019 - 7:18 PM

spark snippets

# ---
# Databricks training
# ---

%fs ls /mnt/training/dataframes/people-10m.parquet

peopleDF = spark.read.parquet("/mnt/training/dataframes/people-10m.parquet")

peopleDF.printSchema()

from pyspark.sql.functions import year
# Name columns and birth date of everyone with gender 'F' born after 1990.
display(
    peopleDF
    .select("firstName", "middleName", "lastName", "birthDate", "gender")
    .filter("gender = 'F'")
    # year() yields an integer, so compare against an integer literal rather
    # than relying on Spark to coerce the string "1990" implicitly.
    .filter(year("birthDate") > 1990)
)

# Name, birth year and salary of everyone with gender 'F' born after 1990.
# The filters run before select() because they reference birthDate and gender,
# neither of which is part of the final projection; filtering after the
# projection would depend on Spark resolving columns that were already dropped.
display(
    peopleDF
    .filter("gender = 'F'")
    # year() yields an integer, so compare against an integer literal.
    .filter(year("birthDate") > 1990)
    .select(
        "firstName",
        "middleName",
        "lastName",
        year("birthDate").alias("birthYear"),
        "salary",
    )
)

# PySpark equivalent of the pandas `shape` attribute: DataFrame dimensions as
# (row count, column count). Note that count() triggers a Spark job.
# Uses peopleDF because no DataFrame named `df` is defined in this file;
# substitute any other DataFrame as needed.
print((peopleDF.count(), len(peopleDF.columns)))