Tabea-K
8/26/2015 - 6:57 AM

Prints the lengths of the quality and the sequence lines of a fastq file, and prints a boolean indicating whether the quality line and the sequence line have the same length.

Prints the lengths of the quality and the sequence lines of a fastq file, and prints a boolean indicating whether the quality line and the sequence line have the same length.

#!/usr/bin/env python

import sys
import os
import argparse

# Every printable, non-space ASCII character ('!' to '~', code points 33-126).
# check_lengths() accepts exactly these characters in a quality line.
# The backslash is written as "\\" because a bare "\]" is an invalid escape
# sequence that newer Python 3 releases warn about; the string value is unchanged.
q_ascii = "!#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
# The double quote is appended separately so the literal above can stay double-quoted.
q_ascii += '"'

def id_error(line_no, name, line):
	"""
	Prints the "no name" error message for one line of the file.

	line_no -- line number to report
	name    -- name value to show in parentheses
	line    -- content of the offending line, shown in single quotes

	Returns None; the message is written to standard output.
	"""
	# Formatting first and printing one parenthesised string gives the same
	# output under the Python 2 print statement and the Python 3 print() function.
	message = "Error in line %s of file, because no name (%s):\n'%s'" % (line_no, name, line)
	print(message)


def check_lengths(fh):
	"""
	Checks a FASTQ stream record by record and reports line lengths.

	The input is read as consecutive groups of four lines: name line,
	sequence line, separator line and quality line. Every line is stripped
	of surrounding whitespace before it is examined. Problems are printed
	to standard output but never stop the scan:

	  Error 01 -- name line does not start with '@'
	  Error 02 -- sequence line (upper-cased) has characters outside ACTGN
	  Error 03 -- separator line does not start with '+'
	  Error 04 -- quality line has characters outside q_ascii

	Line numbers in the messages are 1-based. After each quality line one
	summary line is printed: the record name, the sequence length, the
	quality length, and whether the two lengths are equal.

	fh -- iterable of text lines, e.g. an open file or sys.stdin

	Returns None. A trailing group of fewer than four lines produces no
	summary line.
	"""
	# Build the allowed-character sets once instead of once per record.
	valid_bases = set('ACTGN')
	valid_quals = set(q_ascii)

	# Position inside the current four-line record (0 = name ... 3 = quality).
	field = 0
	# Iterate the stream lazily rather than loading every line into memory,
	# and count from 1 so reported numbers match what an editor shows.
	for line_no, line in enumerate(fh, 1):
		line = line.strip()
		if field == 0:
			if not line.startswith('@'):
				print("Error 01 in line %s. Invalid first line:" % line_no)
				print("          %s" % line)
				print("          Line needs to start with '@'!")
			name = line
			field = 1
		elif field == 1:
			# Bases are compared case-insensitively.
			line = line.upper()
			found = set(line)
			if not found.issubset(valid_bases):
				print("Error 02 in line %s. Invalid nucleotide sequence:" % line_no)
				print("          %s" % line)
				print("          Contains these characters: %s" % (found,))
				print("          Invalid character/s: %s" % (found - valid_bases,))
				print("          Allowed characters are: %s" % (valid_bases,))
			seq_len = len(line)
			field = 2
		elif field == 2:
			if not line.startswith('+'):
				# Reported as Error 03 so it no longer shares a code with
				# the nucleotide-sequence error above.
				print("Error 03 in line %s. Invalid third line:" % line_no)
				print("          %s" % line)
				print("          Line needs to start with '+'!")
			field = 3
		else:
			found = set(line)
			if not found.issubset(valid_quals):
				print("Error 04 in line %s. Invalid quality line:" % line_no)
				print("          %s" % line)
				print("          Contains these characters: %s" % (found,))
				print("          Invalid character/s: %s" % (found - valid_quals,))
				print("          Allowed characters are: %s" % (valid_quals,))
			qual_len = len(line)
			# Record complete: the next line starts a new record.
			field = 0
			# Same text the Python 2 statement
			# "print name, seq_len, qual_len, '\t', seq_len == qual_len"
			# produced: items separated by single spaces, but no space
			# after the tab.
			print("%s %s %s \t%s" % (name, seq_len, qual_len, seq_len == qual_len))
			



# MAIN
if __name__ == "__main__":

	# Command-line interface: one optional positional argument naming the
	# Fastq file; standard input is used when it is left out.
	cli = argparse.ArgumentParser()
	cli.add_argument(
		"Fastq_file",
		nargs='?',
		type=argparse.FileType('r'),
		default=sys.stdin,
		help="file with Fastq sequences"
	)

	# Pass the opened input stream straight to the checker.
	check_lengths(cli.parse_args().Fastq_file)