# Introduction to bash and data science, episodes 3 and 4
# Create a working directory for the sort demos and move into it.
# -p keeps mkdir quiet when the directory already exists (safe to re-run);
# && only changes directory when mkdir succeeded.
mkdir -p sortdemo && cd sortdemo
# Download the two demo data sets into the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate verification;
# drop the flag once the server certificate validates on your machine.
wget --no-check-certificate https://www.data36.com/demo1.csv # random numbers generated
wget --no-check-certificate https://www.data36.com/demo2.csv # random names generated
# Sort in bash
# Options are written before the file name so the commands also work with
# sort implementations that stop option parsing at the first operand.
sort demo2.csv        # default (lexical) order
sort -n demo1.csv     # sort in numerical order
sort -r demo2.csv     # sort in reverse (descending) order
sort -r -n demo1.csv  # reverse works with numeric sort too
sort -n -u demo1.csv  # sort in numerical order but remove duplicates
# Sort a comma-separated file numerically, keyed on the second field.
# The delimiter must be wrapped in plain ASCII quotes: typographic quotes are
# not shell quoting characters and would be passed to sort as part of the
# separator. filename.csv is a placeholder for the file to sort.
sort -t',' -k2 -n filename.csv
# Calculate MAX, MIN and MEDIAN of field 15 of flightdelays.csv
# 'NA' cells and the header cell ('ArrDelay') are filtered out before sorting:
# sort -n compares non-numeric lines as 0, so they could otherwise be
# reported as the maximum or minimum.
# MAX: last line of the ascending numeric sort
cut -d',' -f15 flightdelays.csv | grep -v 'NA' | grep -v 'ArrDelay' | sort -n | tail -n 1
# MIN: first line of the ascending numeric sort
cut -d',' -f15 flightdelays.csv | grep -v 'NA' | grep -v 'ArrDelay' | sort -n | head -n 1
# MEDIAN:
# 1. Filter data (remove NA and the first line (header))
# 2. sort (the sorted values are kept in a temporary file so they can be
#    counted instead of hard-coding the number of rows)
# 3. get rid of the second half of the data set
# 4. get median: the single middle value when the count is odd, the two
#    middle values when the count is even
sorted_delays=$(mktemp)
cut -d',' -f15 flightdelays.csv | grep -v 'NA' | grep -v 'ArrDelay' | sort -n > "$sorted_delays"
delay_count=$(wc -l < "$sorted_delays")
# head keeps everything up to the upper middle row; tail then keeps
# 1 row (odd count) or 2 rows (even count).
head -n "$(( delay_count / 2 + 1 ))" "$sorted_delays" | tail -n "$(( 2 - delay_count % 2 ))"
rm -f -- "$sorted_delays"
# Unique lines -> first create a demo3.csv with one random number per line
# On its own, uniq only collapses duplicate lines that are adjacent.
uniq demo3.csv
# Sorting first brings equal lines together; -c prefixes each distinct line
# with the number of times it occurred.
sort demo3.csv \
  | uniq -c
# How many different airports do we have in our flightdelays.csv file?
# 1. get all destination airports (field 18)
# 2. drop the first line (the header), which is not an airport
# 3. sort in alphabetical order
# 4. remove duplicated lines
# 5. count the number of lines
cut -d',' -f18 flightdelays.csv | tail -n +2 | sort | uniq | wc -l
# How many different airports do we have in our flightdelays.csv file?
# tail -n +2 skips the header line so it is not counted as an airport;
# sort -u sorts and removes duplicated lines in one step.
cut -d',' -f18 flightdelays.csv | tail -n +2 | sort -u | wc -l
# List the top 3 destination airports (by number of arriving planes)!
# Count the occurrences of each value in field 18, order the counts from
# highest to lowest and keep the first three lines.
cut -d',' -f18 flightdelays.csv \
  | sort \
  | uniq -c \
  | sort -rn \
  | head -n 3
# BASH SCRIPTS
# demoscript.sh can also be typed into an editor such as mcedit (which lets
# you use the mouse :D). The quoted here-document below writes the same file
# without an interactive step; quoting 'EOF' keeps the body literal.
cat > demoscript.sh <<'EOF'
#!/usr/bin/env bash
echo "The top 3 airports:"
cut -d',' -f18 flightdelays.csv | sort | uniq -c | sort -n | tail -n 3
echo "The number of unique airports:"
# tail -n +2 skips the header line so it is not counted as an airport
cut -d',' -f18 flightdelays.csv | tail -n +2 | sort | uniq | wc -l
EOF
chmod 700 demoscript.sh # read/write/execute for the owner only, so it can be run
./demoscript.sh
# Clean CSV files
# Install csvkit, then use its csvlook and csvstat tools below.
# NOTE(review): sudo pip changes the system-wide Python packages; a per-user
# or virtualenv install may be preferable - confirm for your environment.
sudo pip install csvkit
# csvlook: helps to see csv files in processable-by-humans format.
# It is fed through a pipe here; run without any input it would wait on stdin.
head 2007.csv | cut -d',' -f12,13,14,15 | csvlook
# csvstat: gives statistics for the dataset -> DO NOT USE ON LARGE FILES
csvstat < demo1.csv