Prints the lengths of the quality and the sequence lines of a fastq file, and prints a boolean indicating whether the quality line and the sequence line have the same length.
#!/usr/bin/env python
import sys
import os
import argparse
q_ascii = "!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
q_ascii += '"'
def id_error(line_no, name, line):
"""
Prints error message.
"""
print "Error in line %s of file, because no name (%s):\n'%s'"%(line_no, name, line)
def check_lengths(fh):
"""
Goes through each line and checks length
"""
lineno = 0
for i, line in enumerate(fh.readlines()):
line = line.strip()
#print i, lineno, line
if lineno == 0:
if not line.startswith('@'):
print "Error 01 in line %s. Invalid first line:"%i
print " %s"%line
print " Line needs to start with '@'!"
name = line
lineno += 1
elif lineno == 1:
line = line.upper()
if not set(line).issubset(set('ACTGN')):
print "Error 02 in line %s. Invalid nucleotide sequence:"%i
print " %s"%line
print " Contains these characters: %s"%set(line)
print " Invalid character/s: %s"%(set(line) - set('ACTGN'))
print " Allowed characters are: %s"%set('ACTGN')
seq_len = len(line)
lineno += 1
elif lineno == 2:
if not line.startswith('+'):
print "Error 02 in line %s. Invalid third line:"%i
print " %s"%line
print " Line needs to start with '+'!"
lineno += 1
elif lineno == 3:
if not set(line).issubset(set(q_ascii)):
print "Error 04 in line %s. Invalid quality line:"%i
print " %s"%line
print " Contains these characters: %s"%set(line)
print " Invalid character/s: %s"%(set(line) - set(q_ascii))
print " Allowed characters are: %s"%set(q_ascii)
qual_len = len(line)
lineno = 0
print name, seq_len, qual_len, '\t', seq_len == qual_len
# MAIN
if __name__ == "__main__":
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("Fastq_file", nargs='?', type=argparse.FileType('r'), help="file with Fastq sequences", \
default=sys.stdin)
args = parser.parse_args()
seq_file = args.Fastq_file
check_lengths(seq_file)