AWK cheat sheet
awk '/gold/' coins.txt #finds every record that contains the word "gold" and prints
those rows
//
awk '{if ($3 < 1980) print $3, " ",$5,$6,$7,$8}' coins.txt #$3 is a variable
that holds the 3rd field of each record. The literal string " " adds extra spaces
between $3 and $5, on top of the separator that each comma inserts
//
awk '{if ($3 >= 0) print $3}' filename #same idea as the previous one, but the comparison uses >= (greater than or equal)
//
NR holds the number of records read so far, i.e. the current record (line) number.
In the following example, NR is the current line number inside the main block; in the END section NR gives the total number of records in the file.
$ awk '{print "Processing Record - ",NR;}END {print NR, "Students Records are processed";}' student-marks
Processing Record - 1
Processing Record - 2
Processing Record - 3
Processing Record - 4
Processing Record - 5
5 Students Records are processed
//
awk 'END { print NR }' data #Count the lines in a file
//
NF # Number of fields (columns) in a record
For example, if we have a file like the following:
cat student-marks
Jones 2143 78 84 77
Gondrol 2321 56 58 45
RinRao 2122 38 37
Edwin 2537 78 67 45
Dayan 2415 30 47
The following awk will generate:
$ awk '{print NR,"->",NF}' student-marks
1 -> 5
2 -> 5
3 -> 4
4 -> 5
5 -> 4
//
awk -f <awk program file name> input-file1 #The commands can be written into a file, and then Awk
can be told to execute the commands
//
awk 'program' input-file1 input-file2.... #If the program is short, we can run the
program from the command-line
//
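example++ #increments the variable example by one (awk variables take no $ prefix; $example++ would instead increment the field whose number is stored in example)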
//
Example:
-rw-r--r-- 1 arnold user 1933 Nov 7 13:05 Makefile
-rw-r--r-- 1 arnold user 10809 Nov 7 13:03 awk.h
-rw-r--r-- 1 arnold user 983 Apr 13 12:14 awk.tab.h
-rw-r--r-- 1 arnold user 31869 Jun 15 12:20 awk.y
-rw-r--r-- 1 arnold user 22414 Nov 7 13:03 awk1.c
-rw-r--r-- 1 arnold user 37455 Nov 7 13:03 awk2.c
-rw-r--r-- 1 arnold user 27511 Dec 9 13:07 awk3.c
-rw-r--r-- 1 arnold user 7989 Nov 7 13:03 awk4.c
ls -l | awk '$6 == "Nov" { sum += $5 }
END { print sum }'
#when the 6th field is equal to Nov, the action is executed. In this case it adds the 5th
field's value to the variable sum. At the end we print the value of sum.
//
#another arithmetic operation
awk '{sum+=$3-$2} END {print sum}' test.txt
//
/12/ { print $0 } ; /21/ { print $0 } #you might want to put more than one of
them on a line. This is accomplished by separating the statements
with a semicolon (;).
//
awk '!/^#/ && $2==1 && $7==1 && $8==1' rawdatafile | wc -l #in this case the first
line of rawdatafile starts with #, so with this regex we tell awk to skip any line
that starts with #. Besides, with | wc -l we count the number of lines that
the awk command returns
//
#Some characters cannot be included literally in string constants ("foo") or regexp
constants (/foo/). Instead, they should be represented with escape sequences, which
are character sequences beginning with a backslash (\).
//
^@chapter #matches @chapter at the beginning of a string
//
[^awk] #matches any character that is not an a, w, or k.
//
awk '{print $1}' prueba #print number 1 column
//
awk '{if ($2>90) print}' prueba #prints the whole record, but only when column 2 is greater than 90
//
awk '/ENSP00000339623/ {print}' datafile1008 #searches for the regex and prints the
matching records
//
awk '$1 !~/7/ {print}' prueba #prints all the records whose first field does not contain a 7
//
awk '{print $1 "\t\t" $2}' filename #prints columns $1 and $2 with two tabs in
the middle
//
awk '$3~/PATTERN/ {print}' filename.txt #search for a pattern in column 3 inside filename.txt
//
awk -F : #sets the field separator
awk -F"\t" '{print $2}' minus_ko_125_FDR.bed
//
awk '{s += $1} END {print s}' prueba.txt #to sum column $1
//
#calculating number of columns in a tab-separated file
awk -F'\t' '{print NF; exit}' filename
//
#skipping first line of a file
awk 'NR!=1{print}' filename
//
awk 'NR==10' file.txt #print only line 10 of file.txt
//
#equal to string or character
awk '{if ($5=="U") print}' filename
//
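#replace every run of whitespace with a single tab (the $1=$1 assignment forces awk to rebuild the record using OFS; note that lines whose first field is empty or 0 are not printed)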
awk -v OFS="\t" '$1=$1' file1
//
#regex in AWK
/
# selects, all input records with the uppercase letter ‘J’ somewhere in the first field:
awk '$1 ~ /J/' inventory-shipped
or
awk '{ if ($1 ~ /J/) print }' inventory-shipped
/
#negating the REGEX now:
awk '$1 !~ /J/' inventory-shipped
//
#Tab field separator
awk 'BEGIN { FS = "\t" } ; { print $2 }'
#Using REGEX in AWK
awk 'BEGIN { FS = "[\t]" } {print $3}' results/linc_up.tfbs.sorted.tsv.tmp
//
#regex substitution within a field
echo '02/08/2011 7,33 Shopping' | awk '{sub(/,/,".",$2)} 1'
02/08/2011 7.33 Shopping
//
#Print all records from the first one that matches a pattern onward:
awk '/pattern/{f=1}f' file
//
#doing arithmetic operations within AWK (note: a bare print outputs the unchanged record; use print sum to output the computed value)
awk '{sum=$1+$2; print}' filename.txt
//
#piping in AWK
cut -f1 test_path | awk 'BEGIN{OFS="\t"}{print "pg-trace-001:/nfs/1000g-work/ihec/drop/bp-raw-data/blueprint/data/"$1,"/ebi/ftp/pub/databases/blueprint/next_data/"$1}'
//
#string concatenation in awk
awk -F'\t' '{print "string_to_concat" $1}'
//
#printing all columns except the first one:
awk 'BEGIN{FS=OFS="\t"}{$1="";sub("\t","")}1' filename
//
#concatenating a string to each line in a file
awk '{print "prefix" $0}' file
//
#modifying a certain column in a file and printing the new columns separated by ;
awk -F'\t' '{ OFS=";"; $44=$44"something"; print}' file.txt
//
#getting sequence lengths in a FASTQ file:
cat file.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
//
#add single quotes to a comma separated list of words:
awk -F"," -v quote="'" -v OFS="','" '$1=$1 {print quote $0 quote}' file
//
#getting the max among a set of numbers:
Suppose I have a file data.dat with three columns of numbers in plain text. I want to get the maximum value in column 3.
> awk 'BEGIN {max = 0} {if ($3>max) max=$3} END {print max}' data.dat
//
#getting columns names and their position in file
awk -F'\t' ' { for (i = 1; i <= NF; ++i) print i, $i; exit } ' file
//
#split in awk:
awk '{split($0, a, ":")}'
# ^^ ^ ^^^
# | | |
# string | delimiter
# |
# array to store the pieces
For example:
echo "12|23|11" | awk '{split($0,a,"|"); print a[3],a[2],a[1]}'
//
# replace newlines (line breaks) with spaces, joining the file into a single line:
awk '{printf "%s ",$0} END {print ""}' yourfile.txt
//
# change chromosome notations
(read at http://webappl.blogspot.com/2014/06/convert-vcf-chromosome-notation.html)
1. Remove 'chr' from the chromosome notation:
awk '{gsub(/^chr/,""); print}' with_chr.vcf > no_chr.vcf
2. Add chr before chromosome id
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' no_chr.vcf > with_chr.vcf