-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
186 lines (138 loc) · 8.42 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# -*- coding:utf-8 -*-
import argparse
from classes.Sequence import Sequence
from src.SarsCov2Epitopes import *
import src.SarsCov2Variants as scv
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
'''
Sample Run in Command Prompt/Terminal:
python3 main.py -i "references/Sequences/Philippines"
'''
parser = argparse.ArgumentParser(description="parses SNPs from alignments")
parser.add_argument('-i', '--input_alignments_fasta', dest='input', help='directory that includes the input fasta files', default='references/Sequences/Philippines')
parser.add_argument('-aln', '--needs_alignment', dest='needs_alignment', help='(Type: Bool) True if input still needs to be aligned,'
'False if input is already aligned, e.g. -aln True, default value is True',
default=True, type=lambda x: (str(x).lower() in ['true', '1', 'yes']))
parser.add_argument('-m', '--metadata', dest='metadata', help='Patient Status Metadata ().tsv) file', default='references/Sequences/Philippines')
parser.add_argument('-loc', '--gene_loc', dest='gene_loc', help='input file with the location of genes', default='input/reference_genes_locations.txt' )
parser.add_argument('-o', '--output', dest='out', help='base name for output files', default='output/04_mutations.fasta')
parser.add_argument('-ref', '--reference_name', dest='ref_name',
help='name of reference sequence in alignment to compare with other sequences; otherwise uses first sequence in alignment as reference',
default='NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome')
parser.add_argument('-g', '--genome', dest='ref_genome', help='fasta file with only the reference genome',
default='references/Sequences/References\ Sequences/NCBI\ Reference\ Sequence_NC_045512.2.fasta')
parser.add_argument('-eps', '--epitope_regions', dest='eps', help='text file that lists epitopes to check', default='input/epitopes.txt')
args = parser.parse_args()
#######################################################################
if __name__ == "__main__":
tic = time.perf_counter()
# Welcome Message
width = os.get_terminal_size().columns
os.system('clear')
print('\n\n\n')
print('================================================================\n'.center(width))
print('SARS-COV-2 EPITOPE SURVEILLANCE'.center(width))
print('A Bioinformatics Project of Center for Informatics'.center(width))
print('University of San Agustin, Iloilo City, Philippines'.center(width))
print('Copyright \xa9 2021\n'.center(width))
print('Authors: Rico JA, Zamora PRF, Bolinas DK, Aguila RN,'.center(width))
print('Isip I, Dumancas G, Fenger D, de Castro R\n'.center(width))
print('================================================================\n\n'.center(width))
# Search and combine all metadata.tsv
meta_df = scv.combine_metadata(f'{args.metadata}')
meta_df = scv.lineage_to_variant(meta_df)
#meta_df = scv.aggregate_divisions(meta_df)
meta_df.to_csv('output/13_filtered_seq_epitsurve.csv', index=False)
# Search and combine all fasta files
combine_fasta_files(f'{args.input}')
input_fasta = f'{args.input}/combined.fasta'
# Align sequences using MUSCLE/MAFFT if the input is not yet aligned
if args.needs_alignment:
output_alignment = 'output/01_aligned.fasta'
align_seq(input_fasta, args.ref_genome, output_alignment)
args.input = output_alignment
# get reference genome name
ref_genome_name = args.ref_name
# Empty the output files
open('output/03_protein.fasta', 'w').close()
open(args.out, 'w').close()
# Create a list of epitope locations
epitopes = parse_input_txt(args.eps)
# Create a list of gene names
genes = parse_input_txt(args.gene_loc)
for i in range(0,len(genes)):
gene_name = genes[i][0]
gene_start = int(genes[i][1]) - 1
gene_stop = int(genes[i][2])
# read and process file
# create sequence dictionary
seq_dict, ref_genome_name = parse_alignment(args.input, ref_genome_name, gene_name)
# remove gaps in the reference genome and the corresponding column
#seq_dict = remove_gaps(seq_dict, gene_name)
# search for the locations of the ref genome sequence start and stop codon
# since the genome is not yet trimmed, gene_name still contains the entire genome
ref_genome = seq_dict[gene_name]
(start_loc, stop_loc) = search_start_stop(ref_genome, gene_start, gene_stop)
print("{} | start codon: {} | stop codon: {}".format(gene_name, start_loc+1, stop_loc))
# Create a string of values for each epitopes
bcell_epitope = '_' * int((gene_stop - gene_start + 1)/3 - 1)
tcell_epitope_c1 = '_' * int((gene_stop - gene_start + 1)/3 - 1)
tcell_epitope_c2 = '_' * int((gene_stop - gene_start + 1)/3 - 1)
for epitope in epitopes:
if gene_name == epitope[1]:
epi_start = int(epitope[2]) - 1
epi_stop = int(epitope[3])
if epitope[0] == 'B cell':
bcell_epitope = bcell_epitope[0:epi_start] + "W" * (epi_stop - epi_start) + bcell_epitope[epi_stop:]
elif epitope[0] == 'T cell (class I)':
tcell_epitope_c1 = tcell_epitope_c1[0:epi_start] + "W" * (epi_stop - epi_start) + tcell_epitope_c1[epi_stop:]
elif epitope[0] == 'T cell (class II)':
tcell_epitope_c2 = tcell_epitope_c2[0:epi_start] + "W" * (epi_stop - epi_start) + tcell_epitope_c2[epi_stop:]
# Include the reference genome at the top of the Trimmed output file for checking purposes
if i==0:
# Initialize output files
open('output/02_trimmed.fasta', 'w').write('>' + str(ref_genome_name) + '\n' + str(ref_genome) + '\n')
df = pd.DataFrame(seq_dict.keys(), columns=['ID'])
df['ID'] = df['ID'].replace([gene_name],ref_genome_name)
df = df.merge(meta_df, how='left', left_on='ID', right_on='strain')
open('output/03_protein.fasta', 'a').write('>B-Cell\n' + str(bcell_epitope) + '\n')
open('output/03_protein.fasta', 'a').write('>T-Cell (class I)\n' + str(tcell_epitope_c1) + '\n')
open('output/03_protein.fasta', 'a').write('>T-Cell (class II)\n' + str(tcell_epitope_c2) + '\n')
open(args.out, 'a').write('>B-Cell\n' + str(bcell_epitope) + '\n')
open(args.out, 'a').write('>T-Cell (class I)\n' + str(tcell_epitope_c1) + '\n')
open(args.out, 'a').write('>T-Cell (class II)\n' + str(tcell_epitope_c2) + '\n')
# initialize sequences
# seq_dict = { "name1": "SEQUENCE", "name2": "SEQUENCE" }
create_sequences(seq_dict, start_loc, stop_loc)
# seq_dict = { "name1": Sequence(), "name2": Sequence() }
# calculate protein diffs for each test sequence against the reference protein
ref_gene = seq_dict[gene_name].protein
df = calculate_protein_diffs(seq_dict, gene_name, ref_gene, start_loc, df)
# parse epitopes (Not necessary)
# calculate epitope protein diffs for each sequence against reference protein
#if args.eps is not None:
# parse_epitopes(args.eps, seq_dict, ref_protein)
# Append results in a single output file
with open(args.out, 'a') as out:
for seq in seq_dict.values():
# Write the reference sequence and test sequences to the output file
if seq.name == gene_name or seq.name not in genes:
out.write('>' + str(seq.name) + '\n')
out.write(str(''.join(seq.mutations)) + '\n')
# Create a dataframe with mutation and respective instances
new_df = create_df(df, genes, epitopes)
# Save the dataframes into separate files
df.to_csv('output/05_aminoacid_replacements.csv')
new_df.to_csv('output/06_unique_mutations.csv')
for n in range(len(genes)):
heatmap = fasta_to_heatmap('output/04_mutations.fasta', gene=n, meta_df=meta_df)
heatmap.to_csv(f'output/07_heatmap_{genes[n][0]}.csv', index=False)
# Plot the mutations
#plot_mutations(fasta_to_df('output/04_mutations.fasta'), meta_df, epitopes, genes)
# Plot mutations geographically
#scv.main()
toc = time.perf_counter()
print('\nOverall Runtime: {:.4f} seconds\n'.format(toc-tic))