Skip to content

Commit 71422b9

Browse files
committed
Add scripts for training and evaluation
1 parent 32b62f5 commit 71422b9

20 files changed

+1124
-10
lines changed

.gitignore

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
data/*
2+
results/*
3+
4+
.ipynb_checkpoints
5+
scripts_paper/.ipynb_checkpoints
6+
molbit/.ipynb_checkpoints
7+
molbit/__pycache__

0_data_preprocess.ipynb

+304
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 13,
6+
"id": "8b1fd78c-52d9-46f6-af48-feb15b2abbe4",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import os\n",
11+
"import pandas as pd\n",
12+
"import tqdm\n",
13+
"from multiprocessing import Pool\n",
14+
"from rdkit.Chem import MolFromSmiles, MolToSmiles"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 2,
20+
"id": "e979bf54-33eb-4e52-bf10-9a8f3a4339b8",
21+
"metadata": {},
22+
"outputs": [],
23+
"source": [
24+
"filepath = os.path.join('data', 'gdb13.1M.freq.ll.smi')"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 8,
30+
"id": "3fb8c505-1217-49da-910c-6949cf65864d",
31+
"metadata": {},
32+
"outputs": [
33+
{
34+
"name": "stdout",
35+
"output_type": "stream",
36+
"text": [
37+
"(1000000, 3)\n"
38+
]
39+
},
40+
{
41+
"data": {
42+
"text/html": [
43+
"<div>\n",
44+
"<style scoped>\n",
45+
" .dataframe tbody tr th:only-of-type {\n",
46+
" vertical-align: middle;\n",
47+
" }\n",
48+
"\n",
49+
" .dataframe tbody tr th {\n",
50+
" vertical-align: top;\n",
51+
" }\n",
52+
"\n",
53+
" .dataframe thead th {\n",
54+
" text-align: right;\n",
55+
" }\n",
56+
"</style>\n",
57+
"<table border=\"1\" class=\"dataframe\">\n",
58+
" <thead>\n",
59+
" <tr style=\"text-align: right;\">\n",
60+
" <th></th>\n",
61+
" <th>0</th>\n",
62+
" <th>1</th>\n",
63+
" <th>2</th>\n",
64+
" </tr>\n",
65+
" </thead>\n",
66+
" <tbody>\n",
67+
" <tr>\n",
68+
" <th>0</th>\n",
69+
" <td>C1=Cc2cc1nnc1snc(o2)-o-1</td>\n",
70+
" <td>0</td>\n",
71+
" <td>68.182535</td>\n",
72+
" </tr>\n",
73+
" <tr>\n",
74+
" <th>1</th>\n",
75+
" <td>N1C2C3C4C5NC6C7C6C5(C13)C2N47</td>\n",
76+
" <td>0</td>\n",
77+
" <td>67.352869</td>\n",
78+
" </tr>\n",
79+
" <tr>\n",
80+
" <th>2</th>\n",
81+
" <td>c1c2c[nH]c(nn3cnc(c#1)c3)-s-2</td>\n",
82+
" <td>0</td>\n",
83+
" <td>65.054106</td>\n",
84+
" </tr>\n",
85+
" <tr>\n",
86+
" <th>3</th>\n",
87+
" <td>N=c1-c2cnn-1cnccc(=O)c2</td>\n",
88+
" <td>0</td>\n",
89+
" <td>62.522982</td>\n",
90+
" </tr>\n",
91+
" <tr>\n",
92+
" <th>4</th>\n",
93+
" <td>C=Nn1-c2cccconc-1[nH]c2</td>\n",
94+
" <td>0</td>\n",
95+
" <td>59.586299</td>\n",
96+
" </tr>\n",
97+
" </tbody>\n",
98+
"</table>\n",
99+
"</div>"
100+
],
101+
"text/plain": [
102+
" 0 1 2\n",
103+
"0 C1=Cc2cc1nnc1snc(o2)-o-1 0 68.182535\n",
104+
"1 N1C2C3C4C5NC6C7C6C5(C13)C2N47 0 67.352869\n",
105+
"2 c1c2c[nH]c(nn3cnc(c#1)c3)-s-2 0 65.054106\n",
106+
"3 N=c1-c2cnn-1cnccc(=O)c2 0 62.522982\n",
107+
"4 C=Nn1-c2cccconc-1[nH]c2 0 59.586299"
108+
]
109+
},
110+
"execution_count": 8,
111+
"metadata": {},
112+
"output_type": "execute_result"
113+
}
114+
],
115+
"source": [
116+
"df_raw = pd.read_csv(filepath, header=None, sep='\\t')\n",
117+
"\n",
118+
"print(df_raw.shape)\n",
119+
"df_raw.head()"
120+
]
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 14,
125+
"id": "dcc2cb0f-92b8-45a0-8fe7-05b0679638a4",
126+
"metadata": {},
127+
"outputs": [],
128+
"source": [
129+
"def normalize(smi):\n",
130+
" can = MolToSmiles(MolFromSmiles(smi), kekuleSmiles=True)\n",
131+
" lgt = len(can)\n",
132+
" return can, lgt"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": 15,
138+
"id": "29e9c18f-ab7b-4867-a2cb-73fcea1f3f60",
139+
"metadata": {},
140+
"outputs": [],
141+
"source": [
142+
"def loader(df):\n",
143+
" for i in tqdm.trange(len(df)):\n",
144+
" yield df_raw.iloc[i,0]"
145+
]
146+
},
147+
{
148+
"cell_type": "code",
149+
"execution_count": 17,
150+
"id": "eb61ddd8-4315-463c-809b-b4a63fcf26ac",
151+
"metadata": {},
152+
"outputs": [
153+
{
154+
"name": "stderr",
155+
"output_type": "stream",
156+
"text": [
157+
"100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000000/1000000 [00:05<00:00, 179849.38it/s]\n"
158+
]
159+
}
160+
],
161+
"source": [
162+
"with Pool(30) as p:\n",
163+
" records = p.map(normalize, loader(df_raw))"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": 19,
169+
"id": "373575cf-1200-4aac-89b7-ffba0aaa89f6",
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"df = pd.DataFrame.from_records(records)\n",
174+
"df = df.rename(columns={0:'smiles', 1:'length'})"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": 20,
180+
"id": "360481e2-559f-420a-a376-d818661153fb",
181+
"metadata": {},
182+
"outputs": [
183+
{
184+
"name": "stdout",
185+
"output_type": "stream",
186+
"text": [
187+
"(1000000, 2)\n"
188+
]
189+
},
190+
{
191+
"data": {
192+
"text/html": [
193+
"<div>\n",
194+
"<style scoped>\n",
195+
" .dataframe tbody tr th:only-of-type {\n",
196+
" vertical-align: middle;\n",
197+
" }\n",
198+
"\n",
199+
" .dataframe tbody tr th {\n",
200+
" vertical-align: top;\n",
201+
" }\n",
202+
"\n",
203+
" .dataframe thead th {\n",
204+
" text-align: right;\n",
205+
" }\n",
206+
"</style>\n",
207+
"<table border=\"1\" class=\"dataframe\">\n",
208+
" <thead>\n",
209+
" <tr style=\"text-align: right;\">\n",
210+
" <th></th>\n",
211+
" <th>smiles</th>\n",
212+
" <th>length</th>\n",
213+
" </tr>\n",
214+
" </thead>\n",
215+
" <tbody>\n",
216+
" <tr>\n",
217+
" <th>0</th>\n",
218+
" <td>C1=CC2=NN=C3OC(=NS3)OC1=C2</td>\n",
219+
" <td>26</td>\n",
220+
" </tr>\n",
221+
" <tr>\n",
222+
" <th>1</th>\n",
223+
" <td>N1C2C3C4C5NC6C7C6C5(C13)C2N47</td>\n",
224+
" <td>29</td>\n",
225+
" </tr>\n",
226+
" <tr>\n",
227+
" <th>2</th>\n",
228+
" <td>C1#CC2=CN(C=N2)N=C2NC=C1S2</td>\n",
229+
" <td>26</td>\n",
230+
" </tr>\n",
231+
" <tr>\n",
232+
" <th>3</th>\n",
233+
" <td>N=C1C2=CC(=O)C=CN=CN1N=C2</td>\n",
234+
" <td>25</td>\n",
235+
" </tr>\n",
236+
" <tr>\n",
237+
" <th>4</th>\n",
238+
" <td>C=NN1C2=CNC1=NOC=CC=C2</td>\n",
239+
" <td>22</td>\n",
240+
" </tr>\n",
241+
" </tbody>\n",
242+
"</table>\n",
243+
"</div>"
244+
],
245+
"text/plain": [
246+
" smiles length\n",
247+
"0 C1=CC2=NN=C3OC(=NS3)OC1=C2 26\n",
248+
"1 N1C2C3C4C5NC6C7C6C5(C13)C2N47 29\n",
249+
"2 C1#CC2=CN(C=N2)N=C2NC=C1S2 26\n",
250+
"3 N=C1C2=CC(=O)C=CN=CN1N=C2 25\n",
251+
"4 C=NN1C2=CNC1=NOC=CC=C2 22"
252+
]
253+
},
254+
"execution_count": 20,
255+
"metadata": {},
256+
"output_type": "execute_result"
257+
}
258+
],
259+
"source": [
260+
"print(df.shape)\n",
261+
"df.head()"
262+
]
263+
},
264+
{
265+
"cell_type": "code",
266+
"execution_count": 21,
267+
"id": "0df4b197-2122-4306-b407-c2a2f9862931",
268+
"metadata": {},
269+
"outputs": [],
270+
"source": [
271+
"df.to_csv(os.path.join('data', 'gdb13.csv'), index=False)"
272+
]
273+
},
274+
{
275+
"cell_type": "code",
276+
"execution_count": null,
277+
"id": "4af66c4c-afe2-4258-bcf4-e8927802b735",
278+
"metadata": {},
279+
"outputs": [],
280+
"source": []
281+
}
282+
],
283+
"metadata": {
284+
"kernelspec": {
285+
"display_name": "Python 3 (ipykernel)",
286+
"language": "python",
287+
"name": "python3"
288+
},
289+
"language_info": {
290+
"codemirror_mode": {
291+
"name": "ipython",
292+
"version": 3
293+
},
294+
"file_extension": ".py",
295+
"mimetype": "text/x-python",
296+
"name": "python",
297+
"nbconvert_exporter": "python",
298+
"pygments_lexer": "ipython3",
299+
"version": "3.9.21"
300+
}
301+
},
302+
"nbformat": 4,
303+
"nbformat_minor": 5
304+
}

0 commit comments

Comments
 (0)