lh3
11/24/2014 - 10:06 PM

On FASTQ parsing performance

On FASTQ parsing performance

#include <iostream>
#include <seqan/seq_io.h>
#include <seqan/sequence.h>

int main(int argc, char const ** argv)
{
    if (argc != 2) return 1;
    seqan::SequenceStream seqStream(argv[1]);
    seqan::CharString id, seq, qual;
    long len = 0;
    while (!atEnd(seqStream)) {
        if (readRecord(id, seq, qual, seqStream) != 0)
            return 1;
        len += length(seq);
    }
    std::cout << len << std::endl;
    return 0;
}
#include <fstream>
#include <iostream>

#include <seqan/seq_io.h>
#include <seqan/sequence.h>

int main(int argc, char const ** argv)
{
    if (argc != 2) return 1;

    std::fstream in(argv[1], std::ios::binary | std::ios::in);
    seqan::RecordReader<std::fstream, seqan::SinglePass<> > reader(in);

    seqan::CharString id;
    seqan::CharString seq;
    seqan::CharString qual;
    long len = 0;
    while (!atEnd(reader)) {
        if (readRecord(id, seq, qual, reader, seqan::Fastq()) != 0)
            return 1;
        len += length(seq);
    }
    std::cout << len << std::endl;
    
    return 0;
}
#include <zlib.h>  
#include <stdio.h>  
#include "kseq.h"  
KSEQ_INIT(gzFile, gzread)  
  
int main(int argc, char *argv[])  
{  
    gzFile fp;  
    kseq_t *seq;  
    long len = 0;
    if (argc == 1) {  
        fprintf(stderr, "Usage: %s <in.seq>\n", argv[0]);  
        return 1;  
    }  
    fp = gzopen(argv[1], "r");
    seq = kseq_init(fp);
    while (kseq_read(seq) >= 0) len += seq->seq.l;
    printf("%ld\n", len);  
    kseq_destroy(seq);
    gzclose(fp);
    return 0;  
}  

As we were talking about FASTQ parsing on Twitter, I dig out an old experiement and redid it with the latest kseq.h and SeqAn-1.4.2. The task is simple: to count the number of bases in a FASTQ files containing 2 million 100bp short reads. The file is put on the RAM disk. The following gives timing for a few programs/settings:

User time (s)Sys time (s)Command line
0.980.16kseq-len /dev/shm/tmp.fq
4.240.11kseq-len /dev/shm/tmp.fq.gz # gzip'd
5.530.13seqtk fqchk /dev/shm/tmp.fq.gz # more than counting
12.030.12seqan2-len /dev/shm/tmp.fq
14.110.28seqan-len /dev/shm/tmp.fq