# luisfmelo
# 5/19/2017 - 3:28 PM
#
# Introduction to bash and data science 10 ep1,2

# Download an open data set (a bzip2-compressed CSV) and unpack it.
wget http://stat-computing.org/dataexpo/2009/2007.csv.bz2
sudo apt-get install dtrx    # dtrx is the extraction tool used on the next line
dtrx 2007.csv.bz2
# Move the extracted CSV into a working directory under a friendlier name.
#   -p : do not fail when practice/ already exists (safe to re-run)
#   && : stop at the first failing step, so we never cd into a directory
#        that does not contain the data file
mkdir -p practice &&
  mv 2007.csv practice/flightdelays.csv &&
  cd practice
# Quick look at the data file.
head flightdelays.csv    # first 10 lines
tail flightdelays.csv    # last 10 lines
wc flightdelays.csv      # line, word and byte counts
wc -c flightdelays.csv   # only the number of bytes (use -m for characters)
wc -w flightdelays.csv   # only the number of words
wc -l flightdelays.csv   # only the number of lines
wc -L flightdelays.csv   # length of the longest line (GNU wc extension)

# Save the first 100 lines to a separate file; '>' redirects stdout to that file.
head -n 100 flightdelays.csv > first_hundred_lines.csv

# Pipes: '|' feeds the output of one command into the input of the next.
#   - head -n 1824 emits lines 1 through 1824 of the file
#   - tail -n 1 keeps only the final line of that stream
#   => together they print line 1824 (provided the file has that many lines)
head -n 1824 flightdelays.csv | tail -n 1



# cut: extract selected columns from delimited text.
#   -d,          use the comma as the field delimiter
#   -f1-3,15,16  keep fields 1 through 3 plus fields 15 and 16
# head (no count given) then shows only the first 10 rows of the result.
cut -d, -f1-3,15,16 flightdelays.csv | head


# grep: print the lines of a file that match a pattern.
# GNU long options are used below; the equivalent short flag is shown in parentheses.
grep 'N749SW' flightdelays.csv                  # every line containing N749SW
grep --invert-match 'N749SW' flightdelays.csv   # (-v) every line that does NOT contain N749SW
grep --ignore-case 'n749sw' flightdelays.csv    # (-i) same as the first command, ignoring letter case
grep --line-number 'N749SW' flightdelays.csv    # (-n) same as the first command, each match prefixed with its line number