Skip to content

Commit 1ef50b5

Browse files
Merge pull request #216 from X-lab-3D/cleanup_database_names
Rename mhcseqs files and add blast database folder.
2 parents 0d41d50 + 41240da commit 1ef50b5

File tree

5 files changed

+31
-29
lines changed

5 files changed

+31
-29
lines changed

PANDORA/Database/Database.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def construct_both_blast_db(self, data_dir=PANDORA.PANDORA_data):
222222

223223
#Define db name and path
224224
db_name = 'templates_blast_db'
225-
outpath = data_dir + '/' + db_name
225+
outpath = data_dir + '/BLAST_databases/' + db_name
226226
out_fasta = outpath + '/'+ db_name +'.fasta'
227227

228228
#Create db directory
@@ -239,15 +239,15 @@ def construct_both_blast_db(self, data_dir=PANDORA.PANDORA_data):
239239

240240
#Define db name and path
241241
db_name = 'refseq_blast_db'
242-
outpath = data_dir + '/' + db_name
242+
outpath = data_dir + '/BLAST_databases/' + db_name
243243
out_fasta = outpath + '/' + db_name + '.fasta'
244244

245245
#Create db directory
246246
if not os.path.isdir(outpath):
247247
subprocess.check_call('mkdir %s' %outpath, shell=True)
248248

249249
#Create .fasta for the db
250-
command='cat %s/mhcseqs/Human_MHC_data.fasta %s/mhcseqs/NonHuman_MHC_data.fasta > %s' %(data_dir,
250+
command='cat %s/mhcseqs/HLA_cleaned.fasta %s/mhcseqs/MHC_cleaned.fasta > %s' %(data_dir,
251251
data_dir,
252252
out_fasta)
253253
subprocess.check_call(command, shell=True)

PANDORA/Database/Database_functions.py

+17-17
Original file line numberDiff line numberDiff line change
@@ -1631,15 +1631,15 @@ def get_sequence_for_fasta(template, MHC_class, chain):
16311631

16321632

16331633
def generate_mhcseq_database(data_dir = PANDORA.PANDORA_data + '/mhcseqs',
1634-
HLA_out = 'Human_MHC_data.fasta',
1635-
nonHLA_out = 'NonHuman_MHC_data.fasta'):
1636-
"""generate_mhcseq_database(data_dir=PANDORA.PANDORA_data, HLA_out='Human_MHC_data.fasta', nonHLA_out='NonHuman_MHC_data.fasta')
1634+
HLA_out = 'HLA_cleaned.fasta',
1635+
nonHLA_out = 'MHC_cleaned.fasta'):
1636+
"""generate_mhcseq_database(data_dir=PANDORA.PANDORA_data, HLA_out='HLA_cleaned.fasta', nonHLA_out='MHC_cleaned.fasta')
16371637
Downloads and parses HLA and other MHC sequences to compile reference fastas
16381638
16391639
Args:
16401640
data_dir (str, optional): Data directory. Defaults to PANDORA.PANDORA_data + '/mhcseqs'.
1641-
HLA_out (str, optional): Output file for HLA sequences. Defaults to 'Human_MHC_data.fasta'.
1642-
nonHLA_out (str, optional): Output file for non human MHCs. Defaults to 'NonHuman_MHC_data.fasta'.
1641+
HLA_out (str, optional): Output file for HLA sequences. Defaults to 'HLA_cleaned.fasta'.
1642+
nonHLA_out (str, optional): Output file for non human MHCs. Defaults to 'MHC_cleaned.fasta'.
16431643
16441644
Returns:
16451645
None.
@@ -1665,12 +1665,12 @@ def generate_mhcseq_database(data_dir = PANDORA.PANDORA_data + '/mhcseqs',
16651665
return ref_MHCI_sequences
16661666

16671667

1668-
def generate_hla_database(data_dir, HLA_out = 'Human_MHC_data.fasta'):
1668+
def generate_hla_database(data_dir, HLA_out = 'HLA_cleaned.fasta'):
16691669
"""
16701670
Downloads and parses HLA sequences
16711671
16721672
Args:
1673-
HLA_out (str, optional): Output file for HLA sequences. Defaults to 'Human_MHC_data.fasta'.
1673+
HLA_out (str, optional): Output file for HLA sequences. Defaults to 'HLA_cleaned.fasta'.
16741674
16751675
Returns:
16761676
dict: the `to_write` dictionary built by this function.
@@ -1681,21 +1681,21 @@ def generate_hla_database(data_dir, HLA_out = 'Human_MHC_data.fasta'):
16811681
###
16821682
# Rename pre-existing raw file
16831683
try:
1684-
os.system('mv %s/hla_prot.fasta %s/OLD_hla_prot.fasta' %(data_dir, data_dir))
1684+
os.system('mv %s/HLA_raw.fasta %s/OLD_HLA_raw.fasta' %(data_dir, data_dir))
16851685
except:
16861686
pass
16871687

16881688
# Download Human data
16891689
url = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/hla_prot.fasta'
1690-
command = (' ').join(['wget', url, '-P', data_dir])
1690+
command = (' ').join(['wget', url, '-O', f'{data_dir}/HLA_raw.fasta'])
16911691
proc = subprocess.Popen(command, executable='/bin/bash',
16921692
shell=True, stdout=subprocess.PIPE)
16931693
print(proc.stdout.read())
16941694

16951695
HLAs = {}
16961696
to_write = {}
16971697
#Parse the fasta files
1698-
for seq_record in SeqIO.parse(data_dir + '/hla_prot.fasta', "fasta"):
1698+
for seq_record in SeqIO.parse(f'{data_dir}/HLA_raw.fasta', "fasta"):
16991699
allele_fullname = seq_record.description.split(' ')[1]
17001700
#allele_significant = allele_fullname[:8]
17011701
#Take only up to the allele identifier, ignore the silent mutations
@@ -1743,18 +1743,18 @@ def generate_hla_database(data_dir, HLA_out = 'Human_MHC_data.fasta'):
17431743

17441744
# Remove pre-existing raw file
17451745
try:
1746-
os.system('rm %s/OLD_hla_prot.fasta' %data_dir)
1746+
os.system(f'rm {data_dir}/OLD_HLA_raw.fasta')
17471747
except:
17481748
pass
17491749

17501750
return to_write
17511751

1752-
def generate_nonhla_database(data_dir, nonHLA_out = 'NonHuman_MHC_data.fasta'):
1752+
def generate_nonhla_database(data_dir, nonHLA_out = 'MHC_cleaned.fasta'):
17531753
"""
17541754
Downloads and parses non human MHC sequences
17551755
17561756
Args:
1757-
nonHLA_out (str, optional): Output file for non human MHCs. Defaults to 'NonHuman_MHC_data.fasta'.
1757+
nonHLA_out (str, optional): Output file for non human MHCs. Defaults to 'MHC_cleaned.fasta'.
17581758
17591759
Returns:
17601760
None.
@@ -1765,22 +1765,22 @@ def generate_nonhla_database(data_dir, nonHLA_out = 'NonHuman_MHC_data.fasta'):
17651765
###
17661766
# Rename pre-existing raw file
17671767
try:
1768-
os.system('mv %s/MHC_prot.fasta %s/OLD_MHC_prot.fasta' %(data_dir, data_dir))
1768+
os.system(f'mv {data_dir}/MHC_raw.fasta {data_dir}/OLD_MHC_raw.fasta')
17691769
except:
17701770
pass
17711771

17721772
# Download other animals data
17731773
#os.system('wget https://raw.githubusercontent.com/ANHIG/IPDMHC/Latest/MHC_prot.fasta')
17741774
url = 'https://raw.githubusercontent.com/ANHIG/IPDMHC/Latest/MHC_prot.fasta'
1775-
command = (' ').join(['wget', url, '-P', data_dir])
1775+
command = (' ').join(['wget', url, '-O', f'{data_dir}/MHC_raw.fasta'])
17761776
proc = subprocess.Popen(command, executable='/bin/bash',
17771777
shell=True, stdout=subprocess.PIPE)
17781778
print(proc.stdout.read())
17791779

17801780
MHCs = {}
17811781
to_write = {}
17821782
#Parse the fasta file
1783-
fasta = f'{data_dir}/MHC_prot.fasta'
1783+
fasta = f'{data_dir}/MHC_raw.fasta'
17841784
for seq_record in SeqIO.parse(fasta, "fasta"):
17851785
allele_fullname = seq_record.description.split(' ')[1]
17861786
#allele_significant = allele_fullname[:8]
@@ -1827,7 +1827,7 @@ def generate_nonhla_database(data_dir, nonHLA_out = 'NonHuman_MHC_data.fasta'):
18271827

18281828
# Remove pre-existing raw file
18291829
try:
1830-
os.system('rm %s/OLD_MHC_prot.fasta' %data_dir)
1830+
os.system(f'rm {data_dir}/OLD_MHC_raw.fasta')
18311831
except:
18321832
pass
18331833

PANDORA/PMHC/PMHC.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -408,9 +408,9 @@ def retrieve_MHC_refseq(self, input_file = None, chain='M', permissive=False):
408408
# Define correct fasta file
409409
if input_file == None:
410410
if self.allele_type[0].startswith('HLA'):
411-
input_file = PANDORA.PANDORA_data+ '/mhcseqs/Human_MHC_data.fasta'
411+
input_file = PANDORA.PANDORA_data+ '/mhcseqs/HLA_cleaned.fasta'
412412
else:
413-
input_file = PANDORA.PANDORA_data+ '/mhcseqs/NonHuman_MHC_data.fasta'
413+
input_file = PANDORA.PANDORA_data+ '/mhcseqs/MHC_cleaned.fasta'
414414

415415
# Parse Fasta file
416416
fasta_sequences = SeqIO.parse(input_file,'fasta')
@@ -535,7 +535,7 @@ def fill_allele_seq_info(self, use_templ_seq=False):
535535
try:
536536
blast_results = Modelling_functions.blast_mhc_seq(self.M_chain_seq,
537537
chain='M',
538-
blastdb=PANDORA.PANDORA_data + '/refseq_blast_db/refseq_blast_db')
538+
blastdb=PANDORA.PANDORA_data + '/BLAST_databases/refseq_blast_db/refseq_blast_db')
539539
#Take only the allele names with the highest id score
540540
top_id = blast_results[0][1]
541541
self.allele_type.extend([x[0] for x in blast_results if x[1] == top_id])
@@ -574,7 +574,7 @@ def fill_allele_seq_info(self, use_templ_seq=False):
574574
try:
575575
blast_results = Modelling_functions.blast_mhc_seq(self.N_chain_seq,
576576
chain='N',
577-
blastdb=PANDORA.PANDORA_data + '/refseq_blast_db/refseq_blast_db')
577+
blastdb=PANDORA.PANDORA_data + '/BLAST_databases/refseq_blast_db/refseq_blast_db')
578578
#Take only the allele names with the highest id score
579579
top_id = blast_results[0][1]
580580
self.allele_type.extend([x[0] for x in blast_results if x[1] == top_id])

PANDORA/Pandora/Modelling_functions.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,7 @@ def score_peptide_alignment(target, template, substitution_matrix='PAM30'):
456456

457457

458458
def find_template(target, database, best_n_templates = 1, benchmark=False,
459-
blastdb=PANDORA.PANDORA_data + '/templates_blast_db/templates_blast_db'):
459+
blastdb=PANDORA.PANDORA_data + '/BLAST_databases/templates_blast_db/templates_blast_db'):
460460
''' Selects the template structure that is best suited as template for homology modelling of the target
461461
462462
Args:
@@ -878,7 +878,7 @@ def run_modeller(output_dir, target, python_script = 'cmd_modeller.py', benchmar
878878

879879
return results
880880

881-
def blast_mhc_seq(seq, chain='M', blastdb=PANDORA.PANDORA_data + '/refseq_blast_db/refseq_blast_db'):
881+
def blast_mhc_seq(seq, chain='M', blastdb=PANDORA.PANDORA_data + '/BLAST_databases/refseq_blast_db/refseq_blast_db'):
882882
try:
883883
command = (' ').join(['blastp','-db',blastdb,
884884
'-query',

setup.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
f'{user_folder_path}/Databases',
7070
f'{user_folder_path}/Databases/{data_folder}',
7171
f'{user_folder_path}/Databases/{data_folder}/mhcseqs',
72+
f'{user_folder_path}/Databases/{data_folder}/BLAST_databases',
7273
f'{user_folder_path}/Databases/{data_folder}/PDBs',
7374
f'{user_folder_path}/Databases/{data_folder}/PDBs/pMHCI',
7475
f'{user_folder_path}/Databases/{data_folder}/PDBs/pMHCII',
@@ -77,7 +78,8 @@
7778
f'{user_folder_path}/Databases/{data_folder}/PDBs/Bad/pMHCII',
7879
f'{user_folder_path}/Databases/{data_folder}/PDBs/IMGT_retrieved',
7980
f'{user_folder_path}/Databases/{data_folder}/outputs',
80-
f'{user_folder_path}/test/test_data'
81+
f'{user_folder_path}/test/',
82+
f'{user_folder_path}/test/test_data',
8183
f'{user_folder_path}/test/test_data/PDBs/Bad',
8284
f'{user_folder_path}/test/test_data/PDBs/Bad/pMHCI',
8385
f'{user_folder_path}/test/test_data/PDBs/Bad/pMHCII',
@@ -86,5 +88,5 @@
8688
for D in dirs:
8789
try:
8890
os.mkdir(D)
89-
except OSError:
90-
print('Could not make directory: ' + D)
91+
except OSError as e:
92+
print(f'Could not make directory: {D} \n Reason: {e}')

0 commit comments

Comments
 (0)