-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathstream.py
105 lines (81 loc) · 2.75 KB
/
stream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
import pandas as pd
import torch
from torch.utils import data
import json
from sklearn.preprocessing import OneHotEncoder
from subword_nmt.apply_bpe import BPE
import codecs
# --- Protein sub-word vocabulary --------------------------------------------
# BPE() consumes the whole codes file inside its constructor, so the handle is
# only needed for the duration of the call and is closed right afterwards
# (the original left it open for the lifetime of the process).
vocab_path = './ESPF/protein_codes_uniprot.txt'
with codecs.open(vocab_path) as bpe_codes_protein:
    pbpe = BPE(bpe_codes_protein, merges=-1, separator='')

# The 'index' column lists the sub-word units; a unit's row position in the
# CSV is used as its integer id.
sub_csv = pd.read_csv('./ESPF/subword_units_map_uniprot.csv')
idx2word_p = sub_csv['index'].values
words2idx_p = dict(zip(idx2word_p, range(0, len(idx2word_p))))

# --- Drug sub-word vocabulary -----------------------------------------------
# Same layout as the protein vocabulary above; `vocab_path` and `sub_csv` are
# deliberately rebound, exactly as in the original script.
vocab_path = './ESPF/drug_codes_chembl.txt'
with codecs.open(vocab_path) as bpe_codes_drug:
    dbpe = BPE(bpe_codes_drug, merges=-1, separator='')

sub_csv = pd.read_csv('./ESPF/subword_units_map_chembl.csv')
idx2word_d = sub_csv['index'].values
words2idx_d = dict(zip(idx2word_d, range(0, len(idx2word_d))))

# Module-level sequence-length constants.
# NOTE(review): drug2emb_encoder uses its own max_d of 50, not the 205 set
# here, so this value does not control the drug padding length -- confirm
# which of the two is intended.
max_d = 205
max_p = 545
def protein2emb_encoder(x, max_p=545):
    """Encode a protein sequence as fixed-length sub-word ids plus a mask.

    The text is segmented with the module-level BPE model ``pbpe``, every
    resulting sub-word is mapped to its integer id through ``words2idx_p``,
    and the id sequence is right-padded with zeros or truncated so that it
    has exactly ``max_p`` entries.

    Args:
        x: Sequence text handed to ``pbpe.process_line``.
        max_p: Length of the returned arrays. Defaults to 545, the value the
            function previously hard-coded.

    Returns:
        A tuple ``(ids, mask)`` of 1-D NumPy arrays, each of length
        ``max_p``. ``mask`` holds 1 where ``ids`` carries an encoded entry
        and 0 where it carries padding.
    """
    tokens = pbpe.process_line(x).split()
    try:
        # dtype=int keeps the array integer-typed even when `tokens` is
        # empty; NumPy would otherwise fall back to float64 for [].
        ids = np.asarray([words2idx_p[tok] for tok in tokens], dtype=int)
    except KeyError:
        # Best-effort behaviour kept from the original: one unknown sub-word
        # collapses the whole sequence to the single id 0. Only the
        # dictionary miss is caught now, so unrelated errors (and Ctrl-C)
        # are no longer silently swallowed.
        ids = np.array([0])

    n_tokens = len(ids)
    if n_tokens < max_p:
        pad = max_p - n_tokens
        ids_fixed = np.pad(ids, (0, pad), 'constant', constant_values=0)
        input_mask = [1] * n_tokens + [0] * pad
    else:
        ids_fixed = ids[:max_p]
        input_mask = [1] * max_p
    return ids_fixed, np.asarray(input_mask)
def drug2emb_encoder(x, max_d=50):
    """Encode a drug string as fixed-length sub-word ids plus a mask.

    The text is segmented with the module-level BPE model ``dbpe``, every
    resulting sub-word is mapped to its integer id through ``words2idx_d``,
    and the id sequence is right-padded with zeros or truncated so that it
    has exactly ``max_d`` entries.

    Args:
        x: Drug text handed to ``dbpe.process_line`` (the dataset class in
            this file passes the 'SMILES' column).
        max_d: Length of the returned arrays. Defaults to 50, the value the
            function previously hard-coded.

    Returns:
        A tuple ``(ids, mask)`` of 1-D NumPy arrays, each of length
        ``max_d``. ``mask`` holds 1 where ``ids`` carries an encoded entry
        and 0 where it carries padding.
    """
    tokens = dbpe.process_line(x).split()
    try:
        # dtype=int keeps the array integer-typed even when `tokens` is
        # empty; NumPy would otherwise fall back to float64 for [].
        ids = np.asarray([words2idx_d[tok] for tok in tokens], dtype=int)
    except KeyError:
        # Best-effort behaviour kept from the original: one unknown sub-word
        # collapses the whole sequence to the single id 0. Only the
        # dictionary miss is caught now, so unrelated errors (and Ctrl-C)
        # are no longer silently swallowed.
        ids = np.array([0])

    n_tokens = len(ids)
    if n_tokens < max_d:
        pad = max_d - n_tokens
        ids_fixed = np.pad(ids, (0, pad), 'constant', constant_values=0)
        input_mask = [1] * n_tokens + [0] * pad
    else:
        ids_fixed = ids[:max_d]
        input_mask = [1] * max_d
    return ids_fixed, np.asarray(input_mask)
class BIN_Data_Encoder(data.Dataset):
    """Map-style dataset that yields encoded drug/protein pairs with a label.

    Each item is built on demand: the requested position is translated to a
    row id via ``list_IDs``, the row's 'SMILES' and 'Target Sequence' fields
    are encoded with ``drug2emb_encoder`` / ``protein2emb_encoder``, and the
    matching entry of ``labels`` is returned alongside them.
    """

    def __init__(self, list_IDs, labels, df_dti):
        """Keep references to the id list, the labels and the data frame.

        Args:
            list_IDs: Sequence of row ids; its length defines the dataset
                size.
            labels: Container indexed with the ids from ``list_IDs``.
            df_dti: Data frame read positionally (``iloc``) with those same
                ids; must provide 'SMILES' and 'Target Sequence' columns.
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.df = df_dti

    def __len__(self):
        """Return the number of samples, i.e. ``len(list_IDs)``."""
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Build the sample at position ``index``.

        Returns:
            ``(d_v, p_v, input_mask_d, input_mask_p, y)``: drug ids, protein
            ids, their two masks, and the label.
        """
        # NOTE(review): the frame is addressed positionally (iloc) while
        # `labels` is addressed with plain [] -- if `labels` is a pandas
        # Series with a non-default index the two could disagree; confirm
        # against the caller.
        sample_id = self.list_IDs[index]
        record = self.df.iloc[sample_id]

        d_v, input_mask_d = drug2emb_encoder(record['SMILES'])
        p_v, input_mask_p = protein2emb_encoder(record['Target Sequence'])

        return d_v, p_v, input_mask_d, input_mask_p, self.labels[sample_id]