diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 5ca700e..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/README.md b/README.md
index 4be97fb..bb6d55d 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 Broccoli, a user-friendly pipeline designed to infer with high precision orthologous groups and pairs of proteins using a phylogeny-based approach. Briefly, Broccoli performs ultra-fast phylogenetic analyses on most proteins and builds a network of orthologous relationships. Orthologous groups are then identified from the network using a parameter-free machine learning algorithm (label propagation). Broccoli is also able to detect chimeric proteins resulting from gene-fusion events and to assign these proteins to the corresponding orthologous groups.

-__Reference:__ insert reference
+__Reference:__ Broccoli: combining phylogenetic and network analyses for orthology assignment
diff --git a/broccoli.py b/broccoli.py
index 5143ded..ba28e6a 100644
--- a/broccoli.py
+++ b/broccoli.py
@@ -33,7 +33,7 @@
def parse_args():
# define and parse command-line arguments
- parser = argparse.ArgumentParser(description=' Broccoli v1.01', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n')
+ parser = argparse.ArgumentParser(description=' Broccoli v1.1', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n')
common = parser.add_argument_group(' general options')
common.add_argument('-steps', help='steps to be performed, comma separated (default = \'1,2,3,4\')', metavar='', type=str, default='1,2,3,4')
diff --git a/Manual_broccoli_v1.0.pdf b/manual_Broccoli_v1.1.pdf
similarity index 72%
rename from Manual_broccoli_v1.0.pdf
rename to manual_Broccoli_v1.1.pdf
index 2d9905d..9ea39c7 100644
Binary files a/Manual_broccoli_v1.0.pdf and b/manual_Broccoli_v1.1.pdf differ
diff --git a/scripts/broccoli_step2.py b/scripts/broccoli_step2.py
index 18e71d3..48607a3 100644
--- a/scripts/broccoli_step2.py
+++ b/scripts/broccoli_step2.py
@@ -24,6 +24,7 @@
import subprocess
import gzip
import math
+import re
from multiprocessing import Pool as ThreadPool
from scripts import utils
try:
@@ -166,14 +167,27 @@ def analyse_species(dict_sp):
return present, dupli
-def correct_HSP(qu_seq, ta_seq):
- # remove insertions from target HSP
- for i, k in enumerate(qu_seq):
- if k == '-':
- #ta_seq[i] = 'z'
- ta_seq = ta_seq[:i] + 'z' + ta_seq[(i+1):]
- no_insertion = ta_seq.replace('z','')
- return no_insertion
+''' new function that creates aligned HSP sequences from a CIGAR string '''
+def extract_HSP(full_seq, start, cig):
+ # split cigar
+ matches = re.findall(r'(\d+)([A-Z]{1})', cig)
+ l_tup = [(m[1], int(m[0])) for m in matches]
+
+ # reconstruct HSP
+ hsp = ''
+ position = start
+ for t in l_tup:
+ # case of aa matches
+ if t[0] == 'M':
+ hsp += full_seq[position:(position + t[1])]
+ position += t[1]
+ # case of deletion
+ elif t[0] == 'D':
+ position += t[1]
+ # case of insertion
+ elif t[0] == 'I':
+ hsp += '-' * t[1]
+ return hsp
def process_location(qu_start, qu_end, min_start, max_end):
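Reviewer note: the hunk above replaces the old gapped-sequence correction with a CIGAR walk. The standalone sketch below (not part of the patch; the sequence and CIGAR string are made up) shows how extract_HSP rebuilds the aligned target HSP from the full target sequence, a 0-based start offset and the CIGAR string reported by DIAMOND:

import re

def extract_HSP(full_seq, start, cig):
    # split the CIGAR into (operation, length) tuples, e.g. '3M2D2M' -> [('M', 3), ('D', 2), ('M', 2)]
    l_tup = [(m[1], int(m[0])) for m in re.findall(r'(\d+)([A-Z])', cig)]
    hsp = ''
    position = start
    for op, length in l_tup:
        if op == 'M':                      # aligned residues: copy them from the target sequence
            hsp += full_seq[position:position + length]
            position += length
        elif op == 'D':                    # target residues with no query counterpart: skip them
            position += length
        elif op == 'I':                    # query positions with no target residue: pad with gaps
            hsp += '-' * length
    return hsp

# toy example: hit starts at index 2 of a hypothetical 10-residue protein
print(extract_HSP('MKTAYIAKQR', 2, '3M2D2M1I1M'))   # -> 'TAYKQ-R'

The returned string always has the same length as the query span covered by the hit, which is what allows it to be pasted directly into query coordinates in process_file below.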
@@ -214,7 +228,7 @@ def process_file(file):
for file_db in list_files:
search_output = index + '_' + file_db.replace('.fas','.gz')
subprocess.check_output(path_diamond + ' blastp --quiet --threads 1 --db ./dir_step2/' + file_db.replace('.fas','.db') + ' --max-target-seqs ' + str(max_per_species) + ' --query ./dir_step1/' + file + ' \
- --compress 1 --more-sensitive -e ' + str(evalue) + ' -o ./dir_step2/' + index + '/' + search_output + ' --outfmt 6 qseqid sseqid qstart qend qseq_gapped sseq_gapped 2>&1', shell=True)
+ --compress 1 --more-sensitive -e ' + str(evalue) + ' -o ./dir_step2/' + index + '/' + search_output + ' --outfmt 6 qseqid sseqid qstart qend sstart cigar 2>&1', shell=True)
## get all hits in a dict of list
all_output = collections.defaultdict(list)
@@ -247,13 +261,13 @@ def process_file(file):
species = name_2_sp_phylip_seq[target][0]
qu_start = int(ll[1]) -1
qu_end = int(ll[2]) -1
- qu_seq = ll[3]
- ta_seq = ll[4]
+ ta_start = int(ll[3]) -1
+ cigar = ll[4]
if target in all_hits:
# extract HSP and add to target seq
- corrected_HSP = correct_HSP(qu_seq, ta_seq)
- all_hits[target] = all_hits[target][:qu_start] + corrected_HSP + all_hits[target][qu_end + 1:]
+ HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
+ all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)
else:
@@ -261,8 +275,8 @@ def process_file(file):
# create target seq
all_hits[target] = '-' * len(name_2_sp_phylip_seq[prot][2])
# extract HSP
- corrected_HSP = correct_HSP(qu_seq, ta_seq)
- all_hits[target] = all_hits[target][:qu_start] + corrected_HSP + all_hits[target][qu_end + 1:]
+ HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
+ all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)
# reduce output for pickle (convert all element to integers)
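A brief note on how these HSPs are consumed (illustrative sketch, not part of the patch; paste_hsp and the values below are hypothetical): all_hits[target] is initialised as a run of gap characters the length of the query protein, and each HSP is spliced in at the query coordinates reported by DIAMOND, so every per-target string stays aligned to the query:

def paste_hsp(scaffold, hsp, qu_start, qu_end):
    # replace the query-coordinate slice [qu_start, qu_end] of the gap-padded scaffold
    # with the aligned HSP; len(hsp) must equal qu_end - qu_start + 1
    return scaffold[:qu_start] + hsp + scaffold[qu_end + 1:]

scaffold = '-' * 12                            # hypothetical query of length 12
print(paste_hsp(scaffold, 'TAYKQ-R', 3, 9))    # -> '---TAYKQ-R--'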