-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathsdf_check.py
executable file
·343 lines (277 loc) · 10.8 KB
/
sdf_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 22 13:19:51 2020
@author: Warren
Script to check sdf file format for Fragalysis upload
"""
import logging
import validators
from rdkit import Chem
from viewer.models import SiteObservation
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Set .sdf format version here.
# NOTE(review): presumably this is the value callers hand to check_ver_name()
# as the expected version string - confirm against the call site.
version = 'ver_1.2'
def check_compound_set(description_mol, validate_dict, update=None):
    """
    Checks the 'generation_date' property of the compound-set description
    molecule.

    The property must exist and must split on '-' into exactly three parts,
    i.e. look like "<Y>-<M>-<D>". The individual parts are not inspected.
    :description_mol: object offering HasProp()/GetProp(), e.g. the rdkit
        mol that describes the compound set
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :update: unused; it is accepted and discarded immediately
    :return: Updates validate dictionary with pass/fail message
    """
    del update

    # Must have a 'generation_date'
    if not description_mol.HasProp('generation_date'):
        return add_warning(
            molecule_name='File error',
            field='compound set',
            warning_string="Molecule has no generation_date",
            validate_dict=validate_dict,
        )

    # That's of the form "<Y>-<M>-<D>"...
    g_date = description_mol.GetProp('generation_date')
    if len(g_date.split('-')) != 3:
        # Report the offending value itself so the submitter can see what
        # was rejected.
        return add_warning(
            molecule_name='File error',
            field='compound set',
            warning_string=f"Molecule generation_date is not Y-M-D ({g_date})",
            validate_dict=validate_dict,
        )

    return validate_dict
def add_warning(molecule_name, field, warning_string, validate_dict):
    """
    Records a single warning in the validate dictionary.

    :molecule_name: name of the molecule the warning is about
    :field: name of the field the warning is about
    :warning_string: text describing the problem
    :validate_dict: dictionary holding a list under each of the keys
        'molecule_name', 'field' and 'warning_string'; modified in place
    :return: the same validate dictionary, one entry longer in each list
    """
    new_entry = (
        ('molecule_name', molecule_name),
        ('field', field),
        ('warning_string', warning_string),
    )
    for column, value in new_entry:
        validate_dict[column].append(value)
    return validate_dict
def check_sdf(sdf_file, validate_dict):
    """
    Checks that the .sdf file name follows the naming format:
    'compound-set_<name>.sdf' with <name> replaced with
    the name you wish to give it. e.g. compound-set_fragmenstein.sdf

    Only the last '/'-separated component of sdf_file is examined, so the
    file may be given either as a bare name or with leading directories.
    The file itself is not opened here.
    :sdf_file: name (or path) of the sdf in the specified format
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    # Check filename
    file_name = sdf_file.split('/')[-1]
    # Both conditions must hold. The earlier form
    # `startswith(...) and endswith(...) is False` bound `is False` to the
    # suffix test only, so names without the prefix were never reported.
    name_is_legal = file_name.startswith("compound-set_") and file_name.endswith(".sdf")
    if not name_is_legal:
        validate_dict = add_warning(
            molecule_name='File error',
            field='_File_name',
            warning_string=f"Illegal filename: {str(sdf_file)} found",
            validate_dict=validate_dict,
        )
    return validate_dict
def check_refmol(mol, validate_dict, target=None):
    """
    Checks that each entry of the comma separated 'ref_mols' property has a
    SiteObservation whose code equals it, for the given target title.

    Nothing is checked when no target is given.
    :mol: rdkit mol read from SD file
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :target: target title used to restrict the SiteObservation lookup
    :return: Updates validate dictionary with pass/fail message
    """
    if not target:
        return validate_dict

    try:
        raw_ref_mols = mol.GetProp('ref_mols')
    except KeyError:
        return add_warning(
            molecule_name=mol.GetProp('_Name'),
            field='ref_mols',
            warning_string="Molecule has no 'ref_mols' property",
            validate_dict=validate_dict,
        )

    for entry in raw_ref_mols.split(','):
        ref_code = entry.strip()
        matches = SiteObservation.objects.filter(
            code=ref_code,
            experiment__experiment_upload__target__title=target,
        )
        if len(matches) == 0:
            # The same text goes into the validate dictionary and the log
            msg = f"No SiteObservation code contains '{ref_code}'"
            validate_dict = add_warning(
                molecule_name=mol.GetProp('_Name'),
                field='ref_mol',
                warning_string=msg,
                validate_dict=validate_dict,
            )
            logger.warning(msg)

    return validate_dict
def check_pdb(mol, validate_dict, target=None, zfile=None):
    """
    Checks the 'ref_pdb' property of a molecule.

    A value ending in '.pdb' is treated as a custom PDB file and needs an
    uploaded zip file; any other value (e.g. x1408) is looked up among the
    SiteObservation codes when a target is given.
    :mol: rdkit mol read from SD file
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :target: target name, used as the prefix of the code that is looked up
    :zfile: uploaded zip file contents; only its truthiness is used here
    :return: Updates validate dictionary with pass/fail message
    """
    # Keep only the last path component of the property value
    pdb_fn = mol.GetProp('ref_pdb').split('/')[-1]
    is_custom_pdb = pdb_fn.endswith(".pdb")

    # No support for PDB atm: the check that the pdb code (file name without
    # '.pdb') is present in the uploaded zip file is currently switched off.

    if is_custom_pdb:
        # Custom pdb added but no zfile
        if not zfile:
            validate_dict = add_warning(
                molecule_name=mol.GetProp('_Name'),
                field='ref_pdb',
                warning_string="Custom PDB '"
                + str(pdb_fn)
                + "' used with no zip PDB file uploaded. Please upload zip PDB file.",
                validate_dict=validate_dict,
            )
    elif target:
        # If anything else given example x1408: drop whatever follows the
        # first ':' and then the first '_' before searching
        code_stem = pdb_fn.split(':')[0].split('_')[0]
        hits = SiteObservation.objects.filter(
            code__contains=f'{target}-' + code_stem
        )
        if len(hits) == 0:
            validate_dict = add_warning(
                molecule_name=mol.GetProp('_Name'),
                field='ref_pdb',
                warning_string=f"PDB for {str(pdb_fn)} does not exist",
                validate_dict=validate_dict,
            )

    return validate_dict
def check_SMILES(mol, validate_dict):
    """
    Checks that the 'original SMILES' property exists and can be parsed
    by rdkit
    :mol: rdkit mol read from SD file
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    try:
        smiles = mol.GetProp('original SMILES')
    except KeyError:
        return add_warning(
            molecule_name=mol.GetProp('_Name'),
            field='original SMILES',
            warning_string="Molecule has no 'original SMILES' property",
            validate_dict=validate_dict,
        )

    # Parsed with sanitize=False; a None result is what gets reported
    parsed = Chem.MolFromSmiles(smiles, sanitize=False)
    if parsed is not None:
        return validate_dict

    return add_warning(
        molecule_name=mol.GetProp('_Name'),
        field='original SMILES',
        warning_string=f"Invalid SMILES {smiles}",
        validate_dict=validate_dict,
    )
def check_ver_name(blank_mol, check_version, validate_dict):
    """
    Checks the name of the blank mol:
    The name (title line) of this molecule should be the
    file format specification version e.g. ver_1.0
    :blank_mol: rdkit mol of blank mol from an SD file
    :check_version: version string the name has to equal
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    found_version = blank_mol.GetProp('_Name')
    if found_version == check_version:
        return validate_dict

    return add_warning(
        molecule_name=found_version,
        field='_Name',
        warning_string=f'Illegal version: {found_version} found. Should be {check_version}',
        validate_dict=validate_dict,
    )
def check_blank_mol_props(mol, validate_dict):
    """
    Checks that the blank mol carries every compulsory field; each field
    is passed to missing_field_check() in turn.
    :mol: rdkit mol of blank mol from an SD file
    :validate_dict: dictionary of warning lists
    :return: Updates validate dictionary with pass/fail message
    """
    compulsory = (
        'ref_url',
        'submitter_name',
        'submitter_email',
        'submitter_institution',
        'generation_date',
        'method',
    )
    for required_field in compulsory:
        validate_dict = missing_field_check(mol, required_field, validate_dict)
    return validate_dict
def check_blank_prop(blank_mol, validate_dict):
    """
    Checks that the blank mol properties have a description and that
    'ref_url', when present, passes check_url()
    :blank_mol: rdkit mol of blank mol from an SD file
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    # These properties may be left empty
    may_be_empty = ['ref_mols', 'ref_pdb']

    for prop_name, prop_value in blank_mol.GetPropsAsDict().items():
        if prop_value == '' and prop_name not in may_be_empty:
            validate_dict = add_warning(
                molecule_name=blank_mol.GetProp('_Name'),
                field=prop_name,
                warning_string=f'Description for {prop_name} missing',
                validate_dict=validate_dict,
            )
        # Independent of the check above: an empty ref_url gets both warnings
        if prop_name == 'ref_url' and check_url(prop_value) is False:
            validate_dict = add_warning(
                molecule_name=blank_mol.GetProp('_Name'),
                field=prop_name,
                warning_string=f'Illegal URL {prop_value} provided',
                validate_dict=validate_dict,
            )

    return validate_dict
def check_field_populated(mol, validate_dict):
    """
    Checks that the compulsory fields, where present, are not empty:
    1. ref_mols - a comma separated list of the fragments
    2. ref_pdb - either (a) a filepath (relative to the sdf file)
        to an uploaded pdb file
    3. original SMILES - the original smiles of the compound
        before any computation was carried out
    A compulsory field that is absent altogether is not reported here.
    :mol: rdkit mol other than blank_mol
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    # Compulsory fields
    must_be_filled = ['ref_pdb', 'ref_mols', 'original SMILES']

    empty_fields = [
        prop_name
        for prop_name, prop_value in mol.GetPropsAsDict().items()
        if prop_value == '' and prop_name in must_be_filled
    ]
    for prop_name in empty_fields:
        validate_dict = add_warning(
            molecule_name=mol.GetProp('_Name'),
            field=prop_name,
            warning_string=f'Value for {prop_name} missing',
            validate_dict=validate_dict,
        )
    return validate_dict
def check_url(value):
    """
    Checks if the url provided is well formed. No internet connection
    required: the check is delegated to the Validators package.

    Always returns a real bool. Previously the valid case fell off the end
    of the function and returned None, which a plain truthiness test would
    read as "invalid".
    :value: value associated with 'ref_url' key
    :return: True if validators.url() accepts the value, otherwise False
    """
    # validators.url() yields True on success; anything else counts as a
    # failure.
    return validators.url(value) is True
def check_name_characters(name, validate_dict):
    """
    Checks that a name consists only of alphanumeric characters and
    '-', '_' or '.'; one warning is added per offending character.
    :name: the name to check
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    permitted_symbols = ('-', '_', '.')
    for symbol in name:
        if symbol.isalnum() or symbol in permitted_symbols:
            continue
        validate_dict = add_warning(
            molecule_name=name,
            field='_Name',
            warning_string=f'Illegal character {symbol} found',
            validate_dict=validate_dict,
        )
    return validate_dict
def missing_field_check(mol, field, validate_dict):
    """
    Checks that a field is among the properties of a molecule
    :mol: rdkit mol read from SD file
    :field: name of the property that has to be present
    :validate_dict: dictionary of warning lists, updated via add_warning()
    :return: Updates validate dictionary with pass/fail message
    """
    if field in mol.GetPropsAsDict():
        return validate_dict

    return add_warning(
        molecule_name=mol.GetProp('_Name'),
        field=field,
        warning_string=f'Field {field} not found!',
        validate_dict=validate_dict,
    )
def check_mol_props(mol, validate_dict):
    """
    Checks that a molecule carries its mandatory properties: 'ref_mols',
    'original SMILES' and at least one of 'ref_pdb' / 'lhs_pdb'
    :mol: rdkit mol read from SD file
    :validate_dict: dictionary of warning lists
    :return: Updates validate dictionary with pass/fail message
    """
    # Fields that are mandatory on their own
    for required_field in ('ref_mols', 'original SMILES'):
        validate_dict = missing_field_check(mol, required_field, validate_dict)

    # One of ref_pdb and lhs_pdb must be set
    if not mol.HasProp('ref_pdb') and not mol.HasProp('lhs_pdb'):
        validate_dict = add_warning(
            molecule_name=mol.GetProp('_Name'),
            field='ref_pdb/lhs_pdb',
            warning_string="Molecule has neither 'ref_pdb' nor 'lhs_pdb' property",
            validate_dict=validate_dict,
        )
    return validate_dict