-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorthomclReduceFasta
52 lines (41 loc) · 1.19 KB
/
orthomclReduceFasta
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/perl
use strict;
&usage() unless scalar(@ARGV) == 2;
my $fastafile = $ARGV[0];
my $taxa_str = $ARGV[1];
if ($fastafile =~ /\.gz$/) {
open(IN, "zcat $fastafile|") || die "Can't open fasta file '$fastafile'\n";
} else {
open(IN, $fastafile) || die "Can't open fasta file '$fastafile'\n";
}
my @t = split(/,/, $taxa_str);
scalar(@t) > 1 || die "Invalid taxa specification '$taxa_str'\n";
my $taxa;
map {$taxa->{$_} = 1} @t;
my $currentSeq;
my $currentTaxon;
# process lines of one file
while (<IN>) {
chomp;
# handle prev seq
if (/\>([^\|]+)/) {
print $currentSeq if ($currentSeq && $taxa->{$currentTaxon});
$currentTaxon = $1;
$currentSeq = "";
}
$currentSeq .= "$_\n" if $taxa->{$currentTaxon};
}
print $currentSeq if $taxa->{$currentTaxon};
sub usage {
print STDERR "
Reduce a fasta file by taxon. Input is a fasta file and a set of taxa.
Output is a fasta file that contains only those taxa
Usage:
orthomclReduceFasta fasta_file taxa
where:
fasta_file: a standard orthomcl compatible fasta file. (.gz file is ok)
taxa: a comma delimited list of taxon abbreviations
EXAMPLE: orthomclSoftware/bin/orthomclReduceFasta proteins.fasta hsa,pfa,txo
";
exit(1);
}