# ---
# Databricks training
# ---
# Databricks `%fs` cell magic: list the mount to confirm the parquet dataset exists.
# NOTE(review): `%fs` only works inside a Databricks notebook cell, not plain Python.
%fs ls /mnt/training/dataframes/people-10m.parquet
# Load the 10M-row people dataset into a Spark DataFrame.
peopleDF = spark.read.parquet("/mnt/training/dataframes/people-10m.parquet")
# Inspect the inferred column names and types.
peopleDF.printSchema()
from pyspark.sql.functions import year
# Show women born after 1990: name columns plus birth date and gender.
females_after_1990 = (
    peopleDF
    .select("firstName", "middleName", "lastName", "birthDate", "gender")
    .filter(year("birthDate") > "1990")
    .filter("gender = 'F'")
)
display(females_after_1990)
# Same cohort as above, but project the birth year (aliased) and salary
# instead of the raw birth date.
with_birth_year = peopleDF.select(
    "firstName",
    "middleName",
    "lastName",
    year("birthDate").alias('birthYear'),
    "salary",
)
display(
    with_birth_year
    .filter(year("birthDate") > "1990")
    .filter("gender = 'F' ")
)
# PySpark equivalent of pandas' `.shape`: print (row count, column count).
# Fix: the original referenced an undefined name `df`; the DataFrame in this
# file is `peopleDF`, so the line raised NameError.
print((peopleDF.count(), len(peopleDF.columns)))