-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorthomclAdjustFasta
66 lines (50 loc) · 1.98 KB
/
orthomclAdjustFasta
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/perl
use strict;
use warnings;

# Main program: rewrite the definition lines of a FASTA file so that each one
# has the form ">taxon_code|id", and write the result to "<taxon_code>.fasta"
# in the current directory. Sequence lines are copied through unchanged.
#
# Command-line arguments (exactly three; otherwise usage() is shown):
#   taxon_code  prefix for every output definition line and output file name
#   fasta_file  input FASTA file ('-' reads standard input)
#   id_field    1-based index of the definition-line field holding the id;
#               fields are separated by whitespace or '|'
#
# Dies when id_field is not a positive integer, when a file cannot be opened
# or closed, when a definition line has no value in the requested field, or
# when the same id occurs twice.
usage() unless scalar(@ARGV) == 3;
my ($taxoncode, $inputfile, $idField) = @ARGV;

# A zero, negative or non-numeric field would silently index from the end of
# the field list (or always pick the last field), so reject it up front.
die "id_field must be a positive integer (got '$idField')\n"
    unless $idField =~ /\A[1-9][0-9]*\z/;

my $outputfile = "$taxoncode.fasta";

# Three-argument open keeps characters such as '|', '<' or '>' in a file name
# from being interpreted as an open mode. The two-argument form treated '-'
# as standard input, so that case is kept working explicitly.
my $in;
if ($inputfile eq '-') {
    $in = \*STDIN;
}
else {
    open($in, '<', $inputfile)
        or die "Can't open input file '$inputfile': $!\n";
}
open(my $out, '>', $outputfile)
    or die "Can't open output file '$outputfile': $!\n";

my %ids;    # ids already written, used to detect duplicates
while (my $line = <$in>) {
    # Only a '>' in the first column starts a definition line; a '>' that
    # appears elsewhere in a line must not be mistaken for one.
    if ($line =~ /^>/) {
        my $defline = $line;
        $defline =~ s/^>\s*//;         # drop '>' and any spaces right after it
        $defline =~ s/\s+/ /g;         # collapse whitespace runs (incl. newline)
        $defline =~ s/\s*\|\s*/|/g;    # remove spaces around '|' separators

        my @fields = split(/[\s|]/, $defline);
        my $id = $fields[$idField - 1];

        # Without this check a short definition line would produce the
        # malformed output line ">taxon_code|".
        die "Fasta file '$inputfile' line $.: no id found in field $idField\n"
            unless defined $id && length $id;

        die "Fasta file '$inputfile' contains a duplicate id: $id\n"
            if $ids{$id}++;

        print {$out} ">$taxoncode|$id\n";
    }
    else {
        print {$out} $line;
    }
}

close($in);
# Buffered write errors (e.g. disk full) only surface when the handle closes.
close($out) or die "Can't close output file '$outputfile': $!\n";
# Print the command-line help text to STDERR and terminate the program with
# exit status 1. Takes no arguments and never returns to the caller.
sub usage {
    # The explicit "\n" supplies the blank line ahead of the help text; the
    # heredoc is non-interpolating, so the text is emitted exactly as written.
    my $help = "\n" . <<'END_USAGE';
Create an OrthoMCL compliant .fasta file, by adjusting definition lines.
Usage:
orthomclAdjustFasta taxon_code fasta_file id_field
where:
taxon_code: a three or four letter unique abbreviation for the taxon
fasta_file: the input fasta file
id_field: a number indicating what field in the definition line contains
the protein ID. Fields are separated by either ' ' or '|'. Any
spaces immediately following the '>' are ignored. The first
field is 1. For example, in the following definition line, the
ID (AP_000668.1) is in field 4: >gi|89106888|ref|AP_000668.1|
Input file requirements:
(1) .fasta format
(2) a unique id is provided for each sequence, and is in the field specified
by id_field
Output file format:
(1) .fasta format
(2) definition line is of the form:
>taxoncode|unique_protein_id
The output file is named taxoncode.fasta
Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of them to create the required input files to this program, or the required output files. This program is provided as a convenience, but OrthoMCL users are expected to have the scripting skills to provide OrthoMCL compliant .fasta files.
EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1
END_USAGE
    print STDERR $help;
    exit(1);
}