diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 5ca700e..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/README.md b/README.md index 4be97fb..bb6d55d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Broccoli, a user-friendly pipeline designed to infer with high precision orthologous groups and pairs of proteins using a phylogeny-based approach. Briefly, Broccoli performs ultra-fast phylogenetic analyses on most proteins and builds a network of orthologous relationships. Orthologous groups are then identified from the network using a parameter-free machine learning algorithm (label propagation). Broccoli is also able to detect chimeric proteins resulting from gene-fusion events and to assign these proteins to the corresponding orthologous groups. -__Reference:__ insert reference +__Reference:__ Broccoli: combining phylogenetic and network analyses for orthology assignment

diff --git a/broccoli.py b/broccoli.py index 5143ded..ba28e6a 100644 --- a/broccoli.py +++ b/broccoli.py @@ -33,7 +33,7 @@ def parse_args(): # define and parse command-line arguments - parser = argparse.ArgumentParser(description=' Broccoli v1.01', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n') + parser = argparse.ArgumentParser(description=' Broccoli v1.1', add_help=False, formatter_class=argparse.RawTextHelpFormatter, epilog=' \n') common = parser.add_argument_group(' general options') common.add_argument('-steps', help='steps to be performed, comma separated (default = \'1,2,3,4\')', metavar='', type=str, default='1,2,3,4') diff --git a/Manual_broccoli_v1.0.pdf b/manual_Broccoli_v1.1.pdf similarity index 72% rename from Manual_broccoli_v1.0.pdf rename to manual_Broccoli_v1.1.pdf index 2d9905d..9ea39c7 100644 Binary files a/Manual_broccoli_v1.0.pdf and b/manual_Broccoli_v1.1.pdf differ diff --git a/scripts/broccoli_step2.py b/scripts/broccoli_step2.py index 18e71d3..48607a3 100644 --- a/scripts/broccoli_step2.py +++ b/scripts/broccoli_step2.py @@ -24,6 +24,7 @@ import subprocess import gzip import math +import re from multiprocessing import Pool as ThreadPool from scripts import utils try: @@ -166,14 +167,27 @@ def analyse_species(dict_sp): return present, dupli -def correct_HSP(qu_seq, ta_seq): - # remove insertions from target HSP - for i, k in enumerate(qu_seq): - if k == '-': - #ta_seq[i] = 'z' - ta_seq = ta_seq[:i] + 'z' + ta_seq[(i+1):] - no_insertion = ta_seq.replace('z','') - return no_insertion +''' new function that create aligned HSP sequences from cigar string ''' +def extract_HSP(full_seq, start, cig): + # split cigar + matches = re.findall(r'(\d+)([A-Z]{1})', cig) + l_tup = [(m[1], int(m[0])) for m in matches] + + # reconstruct HSP + hsp = '' + position = start + for t in l_tup: + # case of aa matches + if t[0] == 'M': + hsp += full_seq[position:(position + t[1])] + position += t[1] + # case of deletion + elif t[0] == 'D': + position += t[1] + # case of insertion + elif t[0] == 'I': + hsp += '-' * t[1] + return hsp def process_location(qu_start, qu_end, min_start, max_end): @@ -214,7 +228,7 @@ def process_file(file): for file_db in list_files: search_output = index + '_' + file_db.replace('.fas','.gz') subprocess.check_output(path_diamond + ' blastp --quiet --threads 1 --db ./dir_step2/' + file_db.replace('.fas','.db') + ' --max-target-seqs ' + str(max_per_species) + ' --query ./dir_step1/' + file + ' \ - --compress 1 --more-sensitive -e ' + str(evalue) + ' -o ./dir_step2/' + index + '/' + search_output + ' --outfmt 6 qseqid sseqid qstart qend qseq_gapped sseq_gapped 2>&1', shell=True) + --compress 1 --more-sensitive -e ' + str(evalue) + ' -o ./dir_step2/' + index + '/' + search_output + ' --outfmt 6 qseqid sseqid qstart qend sstart cigar 2>&1', shell=True) ## get all hits in a dict of list all_output = collections.defaultdict(list) @@ -247,13 +261,13 @@ def process_file(file): species = name_2_sp_phylip_seq[target][0] qu_start = int(ll[1]) -1 qu_end = int(ll[2]) -1 - qu_seq = ll[3] - ta_seq = ll[4] + ta_start = int(ll[3]) -1 + cigar = ll[4] if target in all_hits: # extract HSP and add to target seq - corrected_HSP = correct_HSP(qu_seq, ta_seq) - all_hits[target] = all_hits[target][:qu_start] + corrected_HSP + all_hits[target][qu_end + 1:] + HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar) + all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:] min_start, max_end = process_location(qu_start, qu_end, min_start, max_end) else: @@ -261,8 +275,8 @@ def process_file(file): # create target seq all_hits[target] = '-' * len(name_2_sp_phylip_seq[prot][2]) # extract HSP - corrected_HSP = correct_HSP(qu_seq, ta_seq) - all_hits[target] = all_hits[target][:qu_start] + corrected_HSP + all_hits[target][qu_end + 1:] + HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar) + all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:] min_start, max_end = process_location(qu_start, qu_end, min_start, max_end) # reduce output for pickle (convert all element to integers)