#!/usr/bin/perl
use strict;
use warnings;
# the specification is https://dl.dropbox.com/u/40857893/cv/spTag_database_creation.docx
# the input file is https://dl.dropbox.com/u/40857893/cv/Input.fasta
#use Bio::SeqIO; fail to install the bioperl, so analysis the seq manually
my %mass_table=(
A => 71.03711,
C => 156.10111,
D => 114.04293,
E => 115.02694,
F => 103.00919,
G => 129.04259,
H => 128.05858,
I => 57.02146,
K => 137.05891,
L => 113.08406,
M => 113.08406,
N => 128.09496,
P => 131.04049,
Q => 147.06841,
R => 97.05276,
S => 87.03203,
T => 101.04768,
V => 186.07931,
W => 163.06333,
Y => 99.06841,
);
#digest seq->peps
sub digest {
my $seq = shift;
$seq =~ s/R/R;/g;
$seq =~ s/K/K;/g;
split(/;/,$seq);
}
#mass pep->mass
sub mass {
my $pep = shift;
my $mass_pep = 0;
foreach my $amino (split //, $pep) {
$mass_pep = $mass_pep + $mass_table{$amino};
}
return $mass_pep;
}
##################################################
# seqs process begins here.
# "input.fasta" should be the file to be processed
##################################################
open INPUT, "<", "input.fasta" or die $!;
open OUT1, ">", "outfile1.txt" or die $!;
my @mass_freq = ();
local $/ = '>';
while( <INPUT> ) {
chomp;
my ($desc,$seq) = split("\n",$_,2);
if ($seq){
$seq =~ s[\n][]g;
my @peps = digest($seq);
foreach my $pep (sort {length($a) <=> length($b)} @peps)
{
my $mass_temp = mass($pep);
if ($mass_temp >= 400 && $mass_temp <= 6000){
push(@mass_freq, $mass_temp);
printf OUT1 ">Mass= %.2f | %s\n\n %s \n\n", $mass_temp,$desc,$pep;
}
}
}
}
close INPUT;
close OUT1;
open OUT2, ">", "outfile2.txt" or die $!;
# print freq table;
my %freq;
foreach my $mas (@mass_freq) {
$freq{$mas}++;
}
print OUT2 "Mass Frequency\n";
foreach my $mas (sort {$a <=> $b} keys %freq) {
printf OUT2 "%.2f %d\n",$mas,$freq{$mas};
}
close OUT2;