import contextlib
import functools
import hashlib
import logging
import math
import os
import shutil
import tarfile
from collections.abc import Callable
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Dict, Iterable, List, Optional, Tuple, TypeVar
import yaml
from celery import Task
from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.postgres.aggregates import ArrayAgg
from django.core.exceptions import MultipleObjectsReturned
from django.db import IntegrityError, transaction
from django.db.models import Count, Model
from django.db.models.base import ModelBase
from django.utils import timezone
from api.utils import deployment_mode_is_production
from fragalysis.settings import TARGET_LOADER_MEDIA_DIRECTORY
from scoring.models import SiteObservationGroup
from viewer.models import (
CanonSite,
CanonSiteConf,
Compound,
Experiment,
ExperimentUpload,
Pose,
Project,
QuatAssembly,
SiteObservation,
SiteObservationTag,
TagCategory,
Target,
Xtalform,
XtalformQuatAssembly,
XtalformSite,
)
from viewer.utils import alphanumerator, sanitize_directory_name
logger = logging.getLogger(__name__)
# data that goes to tables is in the following files
# assemblies and xtalforms
XTALFORMS_FILE = "assemblies.yaml"
# target name, nothing else
CONFIG_FILE = "config*.yaml"
# everything else
METADATA_FILE = "meta_aligner.yaml"
# transformation matrices
TRANS_NEIGHBOURHOOD = "neighbourhood_transforms.yaml"
TRANS_CONF_SITE = "conformer_site_transforms.yaml"
TRANS_REF_STRUCT = "reference_structure_transforms.yaml"
class UploadState(str, Enum):
"""Target loader progress state.
PROCESSING - all good, upload in progress
REPORTING - upload failed, loader in reporting mode for diagnostics
SUCCESS - processing complete, all good
FAILED - processing complete, failed
CANCELED - processing was canceled
"""
PROCESSING = "PROCESSING"
REPORTING = "REPORTING"
SUCCESS = "SUCCESS"
FAILED = "FAILED"
CANCELED = "CANCELED"
@dataclass
class MetadataObject:
"""Data structure to store freshly created model instances.
data blocks from meta_aligner.yaml are processed into
dictionaries: { some_id: MetadataObject, ...}
Reason being, quite often I need to refer to these by some
alternative ID. With the dataclass, I'm able to create temporary
dicts with whatever keys are needed.
"""
instance: Model
key: str
versioned_key: str
index_data: dict = field(default_factory=dict)
new: bool = False
# type hint for wrapped yaml block processors
MetDict = TypeVar("MetDict", bound=dict[int | str, MetadataObject])
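# Example (hypothetical keys, a sketch rather than code used by this module):
# a processor returns {versioned_key: MetadataObject, ...}; when a block needs to
# be looked up by an alternative id, a temporary dict can be built from index_data:
#   by_xtalform = {m.index_data["xtalform"]: m for m in experiments.values()}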
@dataclass
class ProcessedObject:
"""Data structure for creating model instances.
Returned from methods that process yaml blocks into dictionaries
that can be passed to a Django model's get_or_create method.
"""
model_class: ModelBase
fields: dict
key: str
defaults: dict = field(default_factory=dict)
index_data: dict = field(default_factory=dict)
versioned_key: Optional[str] = ""
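# Roughly speaking, ProcessedObject.fields are the lookup keys and .defaults the
# remaining values, i.e. the pair feeds something equivalent to
#   model_class.objects.get_or_create(**fields, defaults=defaults)
# (see the create_objects wrapper below for the actual save logic).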
@dataclass
class UploadReportEntry:
message: str
level: int | None = None
def __str__(self):
if self.level is None:
return self.message
return f"{logging.getLevelName(self.level)}: {self.message}"
@dataclass
class UploadReport:
task: Task | None
proposal_ref: str
stack: list[UploadReportEntry] = field(default_factory=list)
upload_state: UploadState = UploadState.PROCESSING
failed: bool = False
def __post_init__(self) -> None:
self.task_id = f"task {self.task.request.id}: " if self.task else ""
def log(self, level: int, message: str) -> None:
msg = f"{self.task_id}{message}"
if level == logging.ERROR:
self.failed = True
self.upload_state = UploadState.REPORTING
logger.log(level, msg)
self.stack.append(UploadReportEntry(level=level, message=message))
self._update_task(self.json())
def final(self, message, success=True):
self.upload_state = UploadState.SUCCESS if success else UploadState.FAILED
# This is (expected to be) the last message for the upload.
# Add the user-supplied message and then add a string indicating success or failure.
self.stack.append(UploadReportEntry(message=message))
status_line = 'SUCCESS' if success else 'FAILED'
self.stack.append(UploadReportEntry(message=status_line))
self._update_task(self.json())
def json(self):
return [str(k) for k in self.stack]
def _update_task(self, message: str | list) -> None:
if not self.task:
return
with contextlib.suppress(AttributeError):
self.task.update_state(
state=self.upload_state,
meta={
"proposal_ref": self.proposal_ref,
"description": message,
},
)
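# Minimal usage sketch (hypothetical messages, no Celery task attached):
#   report = UploadReport(task=None, proposal_ref="lb00000-1")
#   report.log(logging.INFO, "Extracting bundle")
#   report.log(logging.ERROR, "meta_aligner.yaml missing")  # sets failed/REPORTING
#   report.final("Upload processed", success=not report.failed)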
def _validate_bundle_against_mode(config_yaml: Dict[str, Any]) -> Optional[str]:
"""Inspects the meta to ensure it is supported by the MODE this stack is in.
Mode is (typically) one of DEVELOPER or PRODUCTION.
"""
assert config_yaml
if not deployment_mode_is_production():
# We're not in production mode - no bundle checks
return None
# PRODUCTION mode (strict)
# Initial concern - the loader's git information.
# It must not be 'dirty' and must have a valid 'tag'.
xca_git_info_key = "xca_git_info"
base_error_msg = "Stack is in PRODUCTION mode - and"
try:
xca_git_info = config_yaml[xca_git_info_key]
except KeyError:
return f"{base_error_msg} '{xca_git_info_key}' is a required configuration property"
logger.info("%s: %s", xca_git_info_key, xca_git_info)
if "dirty" not in xca_git_info:
return f"{base_error_msg} '{xca_git_info_key}' has no 'dirty' property"
if xca_git_info["dirty"]:
return f"{base_error_msg} '{xca_git_info_key}->dirty' must be False"
if "tag" not in xca_git_info:
return f"{base_error_msg} '{xca_git_info_key}' has no 'tag' property"
xca_version_tag: str = str(xca_git_info["tag"])
tag_parts: List[str] = xca_version_tag.split(".")
tag_valid: bool = True
if len(tag_parts) in {2, 3}:
for tag_part in tag_parts:
if not tag_part.isdigit():
tag_valid = False
break
else:
tag_valid = False
if not tag_valid:
return f"{base_error_msg} '{xca_git_info_key}->tag' must be 'N.N[.N]'. Got '{xca_version_tag}'"
# OK if we get here
return None
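# Example (hypothetical values) of a config_yaml that passes the PRODUCTION checks:
#   config_yaml = {"xca_git_info": {"dirty": False, "tag": "1.2.3"}}
#   _validate_bundle_against_mode(config_yaml)  # -> None (no complaint)
# A dirty checkout, or a tag such as "1.2rc1", would instead return an error string.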
def _flatten_dict_gen(d: dict, parent_key: tuple | str | int, depth: int):
for k, v in d.items():
if parent_key:
if isinstance(parent_key, tuple):
new_key = (*parent_key, k)
else:
new_key = (parent_key, k)
else:
new_key = k
try:
deep_enough = any(isinstance(x, dict) for x in v.values())
except AttributeError:
continue
if deep_enough and depth > 1:
yield from flatten_dict(v, new_key, depth - 1)
else:
if isinstance(new_key, str):
yield new_key, v
else:
yield *new_key, v
def flatten_dict(d: dict, parent_key: tuple | int | str = "", depth: int = 1):
"""Flatten nested dict to specified depth."""
return _flatten_dict_gen(d, parent_key, depth)
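# Example: with depth=1 only the top level is flattened, deeper levels stay intact;
# depth=2 descends one level further (non-dict leaves are skipped by design):
#   data = {"expA": {"files": {"pdb": "a.pdb"}}, "expB": {"files": {"pdb": "b.pdb"}}}
#   list(flatten_dict(data, depth=1))
#   # -> [("expA", {"files": {"pdb": "a.pdb"}}), ("expB", {"files": {"pdb": "b.pdb"}})]
#   list(flatten_dict(data, depth=2))
#   # -> [("expA", "files", {"pdb": "a.pdb"}), ("expB", "files", {"pdb": "b.pdb"})]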
def set_directory_permissions(path, permissions) -> None:
for root, dirs, files in os.walk(path):
# Set permissions for directories
for directory in dirs:
dir_path = os.path.join(root, directory)
os.chmod(dir_path, permissions)
# Set permissions for files
for file in files:
file_path = os.path.join(root, file)
os.chmod(file_path, permissions)
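# Typical call (illustrative path and mode bits):
#   set_directory_permissions("/some/extracted/dir", 0o755)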
# borrowed from SO
def calculate_sha256(filepath) -> str:
sha256_hash = hashlib.sha256()
with open(filepath, "rb") as f:
# Read the file in chunks of 4096 bytes
for chunk in iter(lambda: f.read(4096), b""):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
def strip_version(s: str, separator: str = "/") -> Tuple[str, int]:
# format something like XX01ZVNS2B-x0673/B/501/1
# remove trailing '<separator>1'
return s[0 : s.rfind(separator)], int(s[s.rfind(separator) + 1 :])
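# Example: strip_version("XX01ZVNS2B-x0673/B/501/1") -> ("XX01ZVNS2B-x0673/B/501", 1)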
def create_objects(func=None, *, depth=math.inf):
"""Wrapper function for saving database objects.
Handles common part of saving model instances, actual saving,
logging, reporting and error handling.
Inner functions are yaml data processing functions that return
the model class and the data to pass to the model's get_or_create
function.
"""
if func is None:
return functools.partial(create_objects, depth=depth)
@functools.wraps(func)
def wrapper_create_objects(
self, *args, yaml_data: dict, **kwargs
) -> dict[int | str, MetadataObject]:
# logger.debug("+ wrapper_service_query")
# logger.debug("args passed: %s", args)
# logger.debug("kwargs passed: %s", kwargs)
flattened_data = flatten_dict(yaml_data, depth=depth)
result = {}
created, existing, failed, updated = 0, 0, 0, 0
for item in flattened_data:
logger.debug("flattened data item: %s", item)
instance_data = func(
self, *args, item_data=item, validate_files=False, **kwargs
)
logger.debug("Instance data returned: %s", instance_data)
obj = None
new = False
if not instance_data:
continue
try:
if instance_data.fields:
try:
obj = instance_data.model_class.filter_manager.by_target(
self.target
).get(**instance_data.fields)
logger.debug("Object exists: %s", instance_data.fields)
new = False
except instance_data.model_class.DoesNotExist:
# revalidate files
logger.debug("Object doesn't exist: %s", instance_data)
instance_data = func(self, *args, item_data=item, **kwargs)
obj = instance_data.model_class(
**instance_data.fields,
**instance_data.defaults,
)
obj.save()
new = True
# obj, new = instance_data.model_class.filter_manager.by_target(
# self.target
# ).get_or_create(
# **instance_data.fields,
# defaults=instance_data.defaults,
# )
else:
# no unique field requirements, just create new object
obj = instance_data.model_class(
**instance_data.defaults,
)
obj.save()
new = True
logger.debug(
"%s object %s created",
instance_data.model_class._meta.object_name, # pylint: disable=protected-access
obj,
)
except MultipleObjectsReturned:
msg = "{}.get_or_create in {} returned multiple objects for {}".format(
instance_data.model_class._meta.object_name, # pylint: disable=protected-access
instance_data.key,
instance_data.fields,
)
self.report.log(logging.ERROR, msg)
failed = failed + 1
except IntegrityError:
msg = "{} object {} failed to save".format(
instance_data.model_class._meta.object_name, # pylint: disable=protected-access
instance_data.key,
)
self.report.log(logging.ERROR, msg)
failed = failed + 1
if obj:
# update any additional fields
instance_qs = instance_data.model_class.objects.filter(pk=obj.pk)
instance_qs.update(**instance_data.defaults)
obj.refresh_from_db()
else:
# create fake object so I can just push the upload
# through and compile report for user feedback
obj = instance_data.model_class(
**instance_data.fields | instance_data.defaults
)
logger.warning(
"Fake %s object created: %s",
instance_data.model_class._meta.object_name, # pylint: disable=protected-access
obj,
)
if new:
created = created + 1
# check if old versions exist and mark them as superseded
if "version" in instance_data.fields.keys():
del instance_data.fields["version"]
superseded = instance_data.model_class.objects.filter(
**instance_data.fields,
).exclude(
pk=obj.pk,
)
updated += superseded.update(superseded=True)
else:
existing = existing + 1
m = MetadataObject(
instance=obj,
key=instance_data.key,
versioned_key=instance_data.versioned_key,
index_data=instance_data.index_data,
new=new,
)
# index data here probs
result[instance_data.versioned_key] = m
msg = "{} {} objects processed, {} created, {} fetched from database".format(
created + existing + failed,
next( # pylint: disable=protected-access
iter(result.values())
).instance._meta.model._meta.object_name, # pylint: disable=protected-access
created,
existing,
) # pylint: disable=protected-access
self.report.log(logging.INFO, msg)
# refresh all objects to make sure they're up to date.
# this is specifically because of the superseded flag above -
# I'm setting this in a separate queryset, the db rows are
# updated, but the changes are not being propagated to the
# objects in the result dict. Well aware that this isn't efficient
# but I don't have access to the parent's versioned key here (and
# even if I did, there's no guarantee that they would have
# already been processed), so that's why I'm updating every single
# object.
if updated > 0:
for k in result.values():
k.instance.refresh_from_db()
return result
return wrapper_create_objects
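# Usage sketch (names are illustrative, not a real processor in this module): a
# decorated method is handed one flattened yaml item per call and returns a
# ProcessedObject (or None to skip); the wrapper persists it and hands back
# {versioned_key: MetadataObject, ...}:
#   @create_objects(depth=1)
#   def process_example(self, item_data=None, validate_files=True, **kwargs):
#       name, data = item_data
#       return ProcessedObject(
#           model_class=Xtalform, fields={"name": name}, key=name, versioned_key=name
#       )
#   objects = self.process_example(yaml_data=meta["xtalforms"])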
class TargetLoader:
def __init__(
self,
data_bundle: str,
proposal_ref: str,
tempdir: str,
user_id=None,
task: Task | None = None,
):
self.data_bundle = Path(data_bundle).name
self.bundle_name = Path(data_bundle).stem
self.bundle_path = data_bundle
self.proposal_ref = proposal_ref
self.tempdir = tempdir
self.raw_data = Path(self.tempdir).joinpath(self.bundle_name)
self.task = task
self.version_number = 1
self.version_dir = None
self.previous_version_dirs = None
self.user_id = user_id
self.report = UploadReport(task=task, proposal_ref=self.proposal_ref)
self.raw_data.mkdir()
# create exp upload object
# NB! this is not saved here in case upload fails
self.experiment_upload = ExperimentUpload(
commit_datetime=timezone.now(),
file=self.data_bundle,
)
# work out where the data finally lands
path = Path(TARGET_LOADER_MEDIA_DIRECTORY)
# give each upload a unique directory
# update: resolving issue 1311 introduced a bug, where
# subsequent uploads overwrote file paths and files appeared
# to be missing. changed the directory structure so this
# wouldn't be an issue; the new structure is
# target_loader_data/target_title/upload_(n)/...
if task:
self.experiment_upload.task_id = task.request.id
# figure out absolute and relative paths to final
# location. the relative path is added to the db field and will be
# used in url requests to retrieve the file. the absolute path is
# for moving the file to the final location
self._final_path = path
self._abs_final_path = Path(settings.MEDIA_ROOT).joinpath(path)
# but don't create now, this comes later
# to be used in logging messages, if no task, means invoked
# directly, likely from management command
# self.task_id = f"task {task.request.id}: " if task else ""
# these will be filled later
self.target_name = None
self._target_root = None
self.target = None
self.project = None
# Initial (reassuring message)
bundle_filename = os.path.basename(self.bundle_path)
self.report.log(
logging.INFO,
f"Created TargetLoader for '{bundle_filename}' proposal_ref='{proposal_ref}'",
)
@property
def final_path(self) -> Path:
return self._final_path
@property
def abs_final_path(self) -> Path:
return self._abs_final_path
def validate_map_files(
self,
key: str,
obj_identifier: str,
file_struct: list,
validate_files: bool = True,
) -> list[str]:
"""Validate list of panddas event files.
Special case of file validation, too complex to squeeze into
the main validation method (mainly because of typing).
"""
def logfunc(_, message):
self.report.log(logging.WARNING, message)
result = []
for item in file_struct:
fname, file_hash = self._check_file(item, obj_identifier, key, logfunc)
if not fname:
continue
if validate_files:
self._check_file_hash(obj_identifier, key, fname, file_hash, logfunc)
result.append(fname)
return result
def validate_files(
self,
obj_identifier: str,
file_struct: dict,
required: Iterable[str] = (),
recommended: Iterable[str] = (),
validate_files: bool = True,
) -> list[str | None]:
"""Check if file exists and if sha256 hash matches (if given).
file struct can come in 2 configurations:
{file_key: {file: <file_path>, sha256: <hash> [smiles: <smiles>]}, ...}
or simply
{file_key: <file path>}
Detect which one and take appropriate action.
Once the filename is extracted, check if it exists and, if a
sha256 hash is given, calculate the hash and compare it to the
one in the file.
params:
- file_struct: dictionary read from yaml file
- required: mandatory filename keys
- recommended: optional filename keys
- obj_identifier: experiment identifier (used for logging)
return:
- list of all file paths required
Checks for 4 possible errors:
- file is expected by the db schema but not referenced in METADATA_FILE
- file is referenced in METADATA_FILE but not present in uploaded archive
- calculated hash doesn't match the one in METADATA_FILE
- dictionary in unexpected format, unable to extract filename
"""
def logfunc(key, message):
if key in required:
self.report.log(logging.ERROR, message)
else:
self.report.log(logging.WARNING, message)
result = {}
for key, value in file_struct.items():
if key not in required and key not in recommended:
# schema isn't looking for this file, ignore
continue
filename, file_hash = None, None
# sort out the filename
if isinstance(value, dict):
filename, file_hash = self._check_file(
value, obj_identifier, key, logfunc
)
if not filename:
continue
if validate_files:
self._check_file_hash(
obj_identifier, key, filename, file_hash, logfunc
)
elif isinstance(value, str):
filename = value
if validate_files:
self._check_file_hash(
obj_identifier, key, filename, file_hash, logfunc
)
else:
# probably panddas files here
continue
# file key should go to result dict no matter what
result[key] = filename
logger.debug("Adding key %s: %s", key, filename)
files = []
for f in list(required) + list(recommended):
try:
files.append(result[f])
except KeyError:
logfunc(
f,
"{}: file {} expected but not found in {} file".format(
obj_identifier,
f,
METADATA_FILE,
),
)
files.append(None) # type: ignore [arg-type]
logger.debug("Returning files: %s", files)
# memo to self: added type ignore directives to return line
# below and append line above because after small refactoring,
# mypy all of a sudden started throwing errors on both of
# these. the core of its grievance is that it expects the
# return type to be list[str]. no idea why, the function signature
# clearly defines it as list[str | None]
return files # type: ignore [return-value]
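# Example (hypothetical paths/hashes) of the two accepted layouts and the result:
#   file_struct = {
#       "xtal_pdb": {"file": "upload_1/xtal/x0001.pdb", "sha256": "ab12..."},
#       "xtal_mtz": "upload_1/xtal/x0001.mtz",
#   }
#   self.validate_files("Mpro-x0001", file_struct,
#                       recommended=("xtal_pdb", "xtal_mtz", "ligand_cif"))
#   # -> ["upload_1/xtal/x0001.pdb", "upload_1/xtal/x0001.mtz", None]
#   # (a warning is logged for the missing ligand_cif because it is only 'recommended')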
def _check_file(
self,
value: dict,
obj_identifier: str,
key: str,
logfunc: Callable,
) -> Tuple[str | None, str | None]:
file_hash = value.get("sha256")
try:
filename = value["file"]
except KeyError:
# this is rather unexpected, haven't seen it yet
filename = None
logfunc(key, f"{obj_identifier}: malformed dict, key 'file' missing")
return filename, file_hash
def _check_file_hash(
self,
obj_identifier: str,
key: str,
filename: str,
file_hash: str | None,
logfunc: Callable,
) -> None:
file_path = self.raw_data.joinpath(filename)
if file_path.is_file():
if file_hash and file_hash != calculate_sha256(file_path):
logfunc(key, f"Invalid hash for file {filename}")
else:
logfunc(
key,
f"{key} referenced in {METADATA_FILE}: {obj_identifier} but not found in archive",
)
def _enumerate_objects(self, objects: dict, attr: str) -> None:
# don't overwrite values already in database, get the current
# max value and continue from there
max_existing = 0
for val in objects.values(): # pylint: disable=no-member
value = getattr(val.instance, attr, 0)
if value:
max_existing = max(value, max_existing)
if not max_existing:
max_existing = 0
for val in objects.values(): # pylint: disable=no-member
value = getattr(val.instance, attr)
if not value:
max_existing = max_existing + 1
setattr(val.instance, attr, max_existing)
val.instance.save()
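# Sketch (hypothetical attribute values): numbering continues from the existing
# database maximum rather than restarting at 1, e.g.
#   before: {"a": <attr=None>, "b": <attr=2>, "c": <attr=None>}
#   after _enumerate_objects(objects, "attr"): "a" -> 3, "c" -> 4, "b" unchanged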
@create_objects(depth=1)
def process_experiment(
self,
item_data: tuple[str, dict] | None = None,
prefix_tooltips: dict[str, str] | None = None,
validate_files: bool = True,
**kwargs,
) -> ProcessedObject | None:
"""Extract data from yaml block for creating Experiment instance.
Incoming data format (relevant bits):
(
protein_name: <str>,
{
'type': 'manual',
'crystallographic_files': {
'xtal_pdb': {
'file': 'upload_1/crystallographic_files/5rgs/5rgs.pdb',
'sha256': sha <str>,
},
'xtal_mtz': {
'file': 'upload_1/crystallographic_files/5rgs/5rgs.mtz',
'sha256': sha <str>,
},
'panddas_event_files': {
'file': <path>.ccp4,
'sha256': sha <str>,
'model': '1', chain: B, res: 203, index: 1, bdc: 0.23
},
'status': 'new',
},
}
)
This is enough to save full instance
"""
del kwargs
assert item_data
logger.debug("incoming data: %s", item_data)
experiment_name, data = item_data
extract = functools.partial(
self._extract,
data=data,
section_name="crystals",
item_name=experiment_name,
)
( # pylint: disable=unbalanced-tuple-unpacking
pdb_info,
mtz_info,
cif_info,
) = self.validate_files(
obj_identifier=experiment_name,
file_struct=data["crystallographic_files"],
recommended=(
"xtal_pdb",
"xtal_mtz",
"ligand_cif",
),
validate_files=validate_files,
)
try:
event_files = data["crystallographic_files"]["ligand_binding_events"]
except KeyError:
event_files = []
map_info_files = self.validate_map_files(
key="ligand_binding_events",
obj_identifier=experiment_name,
file_struct=event_files,
validate_files=validate_files,
)
dtype = extract(key="type")
if dtype == "manual":
exp_type = 1
elif dtype == "model_building":
exp_type = 0
else:
exp_type = -1
self.report.log(
logging.ERROR,
f"Unexpected 'type' '{dtype}' value for {experiment_name}",
)
dstatus = extract(key="status")
status_codes = {
"new": 0,
"deprecated": 1,
"superseded": 2,
"unchanged": 3,
}
try:
status = status_codes[dstatus]
except KeyError:
status = -1
self.report.log(
logging.ERROR, f"Unexpected status '{dstatus}' for {experiment_name}"
)
try:
smiles = data["crystallographic_files"]["ligand_cif"]["smiles"]
except KeyError:
smiles = ""
# if empty or key missing entirely, ensure code_prefix returns empty
code_prefix = extract(key="code_prefix", level=logging.INFO)
# ignoring type because the tooltip dict can legitimately be empty
# and in that case the assert statement fails. need to remove it
# and use the ignore
prefix_tooltip = prefix_tooltips.get(code_prefix, "") # type: ignore[union-attr]
fields = {
"code": experiment_name,
}
map_info_paths = []
if map_info_files:
map_info_paths = [str(self._get_final_path(k)) for k in map_info_files]
defaults = {
"experiment_upload": self.experiment_upload,
"status": status,
"type": exp_type,
"pdb_info": str(self._get_final_path(pdb_info)),
"mtz_info": str(self._get_final_path(mtz_info)),
"cif_info": str(self._get_final_path(cif_info)),
"map_info": map_info_paths,
"prefix_tooltip": prefix_tooltip,
# this doesn't seem to be present
# pdb_sha256:
}
assigned_xtalform = extract(key="assigned_xtalform")
index_fields = {
"xtalform": assigned_xtalform,
"smiles": smiles,
"code_prefix": code_prefix,
}
return ProcessedObject(
model_class=Experiment,
fields=fields,
key=experiment_name,
versioned_key=experiment_name,
defaults=defaults,
index_data=index_fields,
)
@create_objects(depth=1)
def process_compound(
self,
experiments: dict[int | str, MetadataObject],
item_data: tuple[str, dict] | None = None,
**kwargs,
) -> ProcessedObject | None:
"""Extract data from yaml block for creating Compound instance.
Incoming data format:
xtal_pdb: {file: <file path>, sha256: <hash>}
xtal_mtz: {file: <file path>, sha256: <hash>}
ligand_cif: {file: <file path>, sha256: <hash>, smiles: <smiles>}
panddas_event_files:
- {file: <file path>, sha256: <hash>,
model: <int>, chain: <char[1]>, res: <int>, index: <int>, bdc: <float>}
- {file: <file path>, sha256: <hash>,
model: <int>, chain: <char[1]>, res: <int>, index: <int>, bdc: <float>}
NB! After creation, the many-to-many link with Project needs to be populated
"""
del kwargs
assert item_data
logger.debug("incoming data: %s", item_data)
protein_name, data = item_data
if (
"aligned_files" not in data.keys()
or not experiments[protein_name].new # remove already saved objects
or "crystallographic_files" not in data.keys()
):
return None
try:
smiles = data["crystallographic_files"]["ligand_cif"]["smiles"]
except KeyError as exc:
# just setting the var to something
smiles = (
"crystallographic_files"
if exc.args[0] == "ligand_cif"
else "ligand_cif"
)
self.report.log(
logging.WARNING,
f"{exc} missing from {smiles} in '{protein_name}' experiment section",
)
return None
defaults = {
"smiles": smiles,
"compound_code": data.get("compound_code", None),
}
return ProcessedObject(
model_class=Compound,
fields={},
defaults=defaults,
key=protein_name,
versioned_key=protein_name,
)
@create_objects(depth=1)
def process_xtalform(
self,
item_data: tuple[str, dict] | None = None,
**kwargs,
) -> ProcessedObject | None:
"""Create Xtalform model instance from data.
Incoming data format (from meta_aligner.yaml):
<name>:
xtalform_ref: <ref>
xtalform_space_group: <space group>
xtalform_cell: <cell info>
and (from xtalforms.yaml):
<name>:
reference: <ref>
assemblies:
<idx>:
assembly: <assembly_id>
chains: <chains>
Saves all references to other tables (QuatAssembly and Experiment).
"""
del kwargs
assert item_data
# weirdly, none of the fields is mandatory in Xtalform
xtalform_name, data = item_data
extract = functools.partial(
self._extract,
data=data,
section_name="xtalforms",
item_name=xtalform_name,
)
fields = {
"name": xtalform_name,
}
space_group = extract(key="xtalform_space_group")
unit_cell_info = extract(key="xtalform_cell")
defaults = {
"space_group": space_group,
"unit_cell_info": unit_cell_info,
}
return ProcessedObject(
model_class=Xtalform,
fields=fields,
key=xtalform_name,
versioned_key=xtalform_name,
defaults=defaults,
)
@create_objects(depth=1)
def process_quat_assembly(
self,
item_data: tuple[str, dict] | None = None,
**kwargs,
) -> ProcessedObject | None:
"""Create QuatAssemblylform model instance from data.
Incoming data format:
<idx>:
reference: <name>
biomol: <biomol: str>
chains: <chain info: str>
No references to other models.
"""
del kwargs
assert item_data
assembly_name, data = item_data
extract = functools.partial(
self._extract,