elowy01
10/3/2018 - 2:51 PM

AWK cheat sheet

AWK cheat sheet

awk '/gold/' coins.txt #looks for all the records containing the word gold and
prints those rows
//
awk '{if ($3 < 1980) print $3, "    ",$5,$6,$7,$8}' coins.txt #$3 is a variable
that stores the 3rd word (field) of each row. "    " inserts a string of 4 spaces
in the printed output
//
awk '{if ($3 >= 0) print $3}' filename #same idea as the previous one, but using the >= (greater than or equal) comparison
//
NR gives you the number of records read so far, i.e. the current line number.
In the following awk NR example, NR variable has line number, in the END section awk NR tells you the total number of records in a file.

$ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks
Processing Record -  1
Processing Record -  2
Processing Record -  3
Processing Record -  4
Processing Record -  5
5 Students Records are processed
//
awk 'END { print NR }' data #Count the lines in a file
//
NF # Number of fields (columns) in a record

For example, if we have a file like the following:

cat student-marks
Jones 2143 78 84 77
Gondrol 2321 56 58 45
RinRao 2122 38 37
Edwin 2537 78 67 45
Dayan 2415 30 47

The following awk will generate:

$ awk '{print NR,"->",NF}' student-marks
1 -> 5
2 -> 5
3 -> 4
4 -> 5
5 -> 4

//
awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk 
can be told to execute the commands
//
awk 'program' input-file1 input-file2.... #If the program is short, we can run the
program from the command-line
//
example++ #increments the variable example by one (awk variables take no $ prefix; $example++ would instead increment field number "example")
//
Example: 
-rw-r--r--  1 arnold   user   1933 Nov  7 13:05 Makefile
-rw-r--r--  1 arnold   user  10809 Nov  7 13:03 awk.h
-rw-r--r--  1 arnold   user    983 Apr 13 12:14 awk.tab.h
-rw-r--r--  1 arnold   user  31869 Jun 15 12:20 awk.y
-rw-r--r--  1 arnold   user  22414 Nov  7 13:03 awk1.c
-rw-r--r--  1 arnold   user  37455 Nov  7 13:03 awk2.c
-rw-r--r--  1 arnold   user  27511 Dec  9 13:07 awk3.c
-rw-r--r--  1 arnold   user   7989 Nov  7 13:03 awk4.c

ls -l | awk '$6 == "Nov" { sum += $5 }
             END { print sum }'
#when the 6th field (column) is equal to Nov, awk executes the action. In this case
it adds the value of the 5th field to the sum variable. At the end we print the value of sum.
//
#another arithmetic operation
awk '{sum+=$3-$2} END {print sum}' test.txt
//
/12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of 
them on a line. This is accomplished by separating the statements 
with a semicolon (;). 	     
//
awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first
line of rawdatafile starts with #, so the !/^#/ pattern tells awk to skip that
line. Besides, piping into wc -l counts the number of lines that the awk
command returns
//
#Some characters cannot be included literally in string constants ("foo") or regexp 
constants (/foo/). Instead, they should be represented with escape sequences, which 
are character sequences beginning with a backslash (\).
//
^@chapter #matches @chapter at the beginning of a string
//
[^awk] #matches any character that is not an a, w, or k.
//
awk '{print $1}' prueba #prints the first column
//
awk '{if ($2>90) print}' prueba #prints the whole record, but only when the value in column 2 is greater than 90
// 
awk '/ENSP00000339623/ {print}' datafile1008 #searches for the regex and prints the
matching records
//
awk '$1 !~/7/ {print}' prueba #prints all the records whose first field does not contain a 7
//
awk '{print $1 "\t\t" $2}' filename #prints columns $1 and $2 separated by two
tabs
//
awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt
//
awk -F : #sets the field separator
awk -F"\t" '{print $2}' minus_ko_125_FDR.bed
//	
awk '{s += $1} END {print s}' prueba.txt #to sum column $1
//
#calculating number of columns in a tab-separated file
awk -F'\t' '{print NF; exit}' filename
//
#skipping first line of a file
awk 'NR!=1{print}' filename 
//
awk 'NR==10' file.txt #print only line 10 of file.txt
//
#equal to string or character
awk '{if ($5=="U") print}' filename
//
#replace each run of whitespace between fields with a single tab
awk -v OFS="\t" '$1=$1' file1
//
#regex in AWK
/
# selects, all input records with the uppercase letter ‘J’ somewhere in the first field:
awk '$1 ~ /J/' inventory-shipped
or
awk '{ if ($1 ~ /J/) print }' inventory-shipped
/
#negating the REGEX now:
awk '$1 !~ /J/' inventory-shipped
//
#Tab field separator
awk 'BEGIN { FS = "\t" } ; { print $2 }'
#Using REGEX in AWK
 awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp
//
#regex substitution within a field
echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1'

02/08/2011 7.33 Shopping

//
#Print all records from some pattern:
awk '/pattern/{f=1}f' file #prints from the first line matching pattern through the end of the file
//
#doing arithmetic operations within AWK
awk '{sum=$1+$2; print sum}' filename.txt
//
#piping in AWK
cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}'
//
#string concatenation in awk
awk -F'\t' '{print "string_to_concat" $1}'
//
#printing all columns except the first one:
awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1'  filename
//
#concatenating a string to each line in a file
awk '{print "prefix" $0}' file
//
#modifying a certain column in a file and printing the new columns separated by ;
awk -F'\t'  '{ OFS=";"; $44=$44"something"; print}' file.txt
//
#getting sequence lengths in a FASTQ file:
cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
//
#add single quotes to a comma separated list of words:
awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file
//
#getting the max among a set of numbers:

Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3.

> awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat
//
#getting columns names and their position in file
awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file
//
#split in awk:
 awk '{split($0, a, ":")}'
 #           ^^  ^  ^^^
 #            |  |   |
 #       string  |   delimiter
 #               |
 #               array to store the pieces

For example:

echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}'
//
# replace newlines (line breaks) with spaces:
awk '{printf "%s ",$0} END {print ""}' yourfile.txt
//
# change chromosome notations
(read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html)
 1. Remove 'chr' from the chromosome notation:
awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf
 2. Add chr before chromosome id
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf