iyak
6/7/2017 - 7:06 AM

モチーフの2次構造と配列のセットからantaRNAによって配列セットを作る。

モチーフの2次構造と配列のセットからantaRNAによって配列セットを作る。

# usage: qsub -t 1-`wc -l<5-10-embed-seq.li` gen_seq_set.py <outdir>
# assume: Rss contains 0 or 1 gap

#$ -cwd
#$ -V
#$ -e /home/iyak/.ugeerr
#$ -o /home/iyak/.ugeout
#$ -i 5-10-embed-seq.li
#$ -S /home/iyak/.pyenv/shims/python3
##$ -m es -M h.m@tuta.io
#$ -l s_vmem=1G,mem_req=1G
#$ -r no

from mymo import util
import sys, random, numpy as np

try: tid = util.parse_uge_ev().tid
except KeyError: tid = 1 # for test running without qsub

ofname = util.fullpath(sys.argv[1]) + '/%d-seq'%tid
Lseq = 150
flanking = 50
Nseq = 200
antaRNA = "/usr/local/package/python2.7/2.7.2/bin/python2.7 "\
        "$HOME/local/bin/antaRNA118.py"

def gen_seq_set(rss, seq, L, N, flanking):
    o = []
    for k in range(N):
        gap_len = random.choice(range(10))
        gap_rss = rss.split('*')

        gap_seq = []
        seq0 = seq[:]
        for gr in gap_rss:
            l = len(gr)
            gap_seq += [seq0[:l]]
            seq0 = seq0[l:]

        bind_len = L - flanking*2
        bind_site = random.choice(range(bind_len))
        anta_seq = 'N'*(flanking+bind_site)+('N'*gap_len).join(gap_seq)
        anta_seq += 'N'*(L-len(anta_seq))
        anta_rss = 'A'*(flanking+bind_site)+('A'*gap_len).join(gap_rss)
        anta_rss += 'A'*(L-len(anta_rss))
        ok,e,_ = util.po(
                antaRNA,
                arg=[
                    "-Cstr", "'" + anta_rss + "'",
                    "-Cseq", "'" + anta_seq + "'",
                    "--tGC", 0.5,
                    "--seed", k,
                    "--noOfColonies", 1,
                    "-noGU", # -noGU option is reversed due to a bug in antaRNA.
                    ],
                )
        sys.stderr.write(e)

        ok = ok.strip().split('\n')
        ok[0] = ">antaRNA:%d"%k
        if 1==len(gap_seq): ok[0] += ";motif-site:%d-%d"%(
                flanking+bind_site, flanking+bind_site+len(gap_seq[0]))
        elif 2==len(gap_seq): ok[0] += ";motif-site:%d-%d,%d-%d"%(
                flanking+bind_site, flanking+bind_site+len(gap_seq[0]),
                flanking+bind_site+len(gap_seq[0])+gap_len,
                flanking+bind_site+len(gap_seq[0])+gap_len+len(gap_seq[1]))
        else: raise RuntimeError
        ok[0] += ";motif-seq:%s"%(''.join(gap_seq));
        ok[0] += ";motif-rss:%s"%('*'.join(gap_rss));
        o += ok
    return o

def main():
    for i,l in enumerate(sys.stdin, start=1):
        if tid==i:
            with open(ofname, "w") as fo:
                ll = l.strip().split('\t')
                Rss = ll[0]
                for seq in ll[1:]:
                    fa = gen_seq_set(Rss, seq, Lseq, Nseq, flanking)
                    fo.write('\t'.join(fa)+'\n')

main()