# Introduction to bash and data science 10 ep1,2
# Download an open data set and unpack it.
# Note: the archive is bzip2-compressed (.bz2), not a zip file.
wget http://stat-computing.org/dataexpo/2009/2007.csv.bz2
# dtrx is the extraction tool used here; install it first.
sudo apt-get install dtrx
dtrx 2007.csv.bz2 # leaves 2007.csv in the current directory
# Keep the exercise files in their own directory.
# -p: do not fail if the directory already exists, so these steps can be re-run.
mkdir -p practice
mv 2007.csv practice/flightdelays.csv
cd practice
# Peek at the data and measure its size.
head flightdelays.csv # first 10 lines
tail flightdelays.csv # last 10 lines
wc flightdelays.csv # line, word and byte counts
wc -c flightdelays.csv # only the number of bytes (use -m to count characters)
wc -w flightdelays.csv # only the number of words
wc -l flightdelays.csv # only the number of lines
wc -L flightdelays.csv # length of the longest line (GNU extension)
head -n 100 flightdelays.csv > first_hundred_lines.csv # copy the first 100 lines into another file
# Pipes: `|` feeds the output of one command into the input of the next.
# To print exactly line 1824 of the file:
#   a. head keeps only the first 1824 lines;
#   b. tail then keeps only the last of those, which is line 1824.
head -n 1824 flightdelays.csv | tail -n 1
# cut: extract selected columns (fields) from delimited text.
#   -d,  use the comma as the field delimiter
#   -f   list the fields to keep: 1 through 3, plus 15 and 16
# head then trims the result to its first 10 lines.
cut -d, -f1-3,15,16 flightdelays.csv | head -n 10
# grep: print the lines of a file that match a pattern.
# Lines that contain N749SW:
grep 'N749SW' flightdelays.csv
# Lines that do NOT contain N749SW (short form: -v):
grep --invert-match 'N749SW' flightdelays.csv
# Same as the first search, but ignoring case (short form: -i):
grep --ignore-case 'n749sw' flightdelays.csv
# Same as the first search, each line prefixed with its line number (short form: -n):
grep --line-number 'N749SW' flightdelays.csv