lh3
8/21/2014 - 8:39 PM

kfreq.c

#include <zlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include "kseq.h"
KSEQ_INIT(gzFile, gzread)

unsigned char seq_nt6_table[256] = {
    0, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 1, 5, 2,  5, 5, 5, 3,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  4, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 1, 5, 2,  5, 5, 5, 3,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  4, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,

    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,
    5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5,  5, 5, 5, 5
};

int main(int argc, char *argv[])
{
	gzFile fp;
	kseq_t *ks;
	int kmer, i, l, mask;
	char *nei;

	if (argc < 2) {
		fprintf(stderr, "Usage: kfreq <kmer> <in.fa>\n");
		return 1;
	}

	// get the k-mer
	l = strlen(argv[1]);
	for (i = kmer = 0; i < l; ++i) {
		int c = seq_nt6_table[(int)argv[1][i]];
		assert(c >= 1 && c <= 4);
		kmer = kmer << 2 | (c - 1);
	}
	mask = (1<<2*l) - 1;

	// get the neighbors
	nei = calloc(1, 1<<2*l);
	for (i = 0; i < l; ++i) {
		int j, x, c;
		c = kmer >> 2*i & 3;
		x = kmer & ~(3 << 2*i);
		for (j = 0; j < 4; ++j) 
			if (j != c) nei[x|j<<2*i] = 1;
	}

	fp = argc == 2 || strcmp(argv[2], "-") == 0? gzdopen(fileno(stdin), "r") : gzopen(argv[2], "r");
	ks = kseq_init(fp);
	while (kseq_read(ks) >= 0) {
		int k, x[2], cnt, cnt_nei;
		x[0] = x[1] = k = cnt = cnt_nei = 0;
		for (i = 0; i < ks->seq.l; ++i) {
			int c = seq_nt6_table[(int)ks->seq.s[i]];
			if (c >= 1 && c <= 4) {
				x[0] = (x[0] << 2 | (c - 1)) & mask;
				x[1] = (x[1] >> 2 | (4 - c) << 2*(l-1));
				if (k < l) ++k;
				if (k == l) {
					if (x[0] == kmer || x[1] == kmer) ++cnt;
					else if (nei[x[0]] || nei[x[1]]) ++cnt_nei;
				}
			} else k = 0;
		}
		printf("%s\t%ld\t%d\t%d\n", ks->name.s, ks->seq.l, cnt, cnt_nei);
	}
	kseq_destroy(ks);
	gzclose(fp);
	return 0;
}