Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Incermental rhs upload (issue 1394) #610

Merged
merged 4 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
267 changes: 165 additions & 102 deletions viewer/cset_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from openpyxl.utils import get_column_letter

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "fragalysis.settings")
import django

django.setup()
from django.conf import settings

logger = logging.getLogger(__name__)

from django.conf import settings
from django.core.exceptions import MultipleObjectsReturned
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db.models import F, TextField, Value
from django.db.models.expressions import Func
from rdkit import Chem
from rdkit.Chem import Crippen, Descriptors

from viewer.models import (
Compound,
Expand All @@ -34,7 +35,13 @@
TextScoreValues,
User,
)
from viewer.utils import add_props_to_sdf_molecule, is_url, word_count
from viewer.utils import add_props_to_sdf_molecule, alphanumerator, is_url, word_count

logger = logging.getLogger(__name__)


# maximum distance between corresponding atoms in poses
_DIST_LIMIT = 0.5


def dataType(a_str: str) -> str:
Expand Down Expand Up @@ -132,6 +139,7 @@ def __init__(
version,
zfile,
zfile_hashvals,
computed_set_name,
):
self.user_id = user_id
self.sdf_filename = sdf_filename
Expand All @@ -141,6 +149,7 @@ def __init__(
self.version = version
self.zfile = zfile
self.zfile_hashvals = zfile_hashvals
self.computed_set_name = computed_set_name

def process_pdb(self, pdb_code, zfile, zfile_hashvals) -> str | None:
for key in zfile_hashvals.keys():
Expand Down Expand Up @@ -254,41 +263,49 @@ def get_site_observation(

return site_obvs

def create_mol(self, inchi, long_inchi=None, name=None) -> Compound:
def create_mol(self, inchi, target, name=None) -> Compound:
# check for an existing compound, returning a Compound
if long_inchi:
cpd = Compound.objects.filter(long_inchi=long_inchi)
sanitized_mol = Chem.MolFromInchi(long_inchi, sanitize=True)
else:
cpd = Compound.objects.filter(inchi=inchi)
sanitized_mol = Chem.MolFromInchi(inchi, sanitize=True)

new_mol = cpd[0] if len(cpd) != 0 else Compound()
new_mol.smiles = Chem.MolToSmiles(sanitized_mol)
new_mol.inchi = inchi
if long_inchi:
new_mol.long_inchi = long_inchi
new_mol.identifier = name

# descriptors
new_mol.mol_log_p = Crippen.MolLogP(sanitized_mol)
new_mol.mol_wt = float(Chem.rdMolDescriptors.CalcExactMolWt(sanitized_mol))
new_mol.heavy_atom_count = Chem.Lipinski.HeavyAtomCount(sanitized_mol)
new_mol.heavy_atom_mol_wt = float(Descriptors.HeavyAtomMolWt(sanitized_mol))
new_mol.nhoh_count = Chem.Lipinski.NHOHCount(sanitized_mol)
new_mol.no_count = Chem.Lipinski.NOCount(sanitized_mol)
new_mol.num_h_acceptors = Chem.Lipinski.NumHAcceptors(sanitized_mol)
new_mol.num_h_donors = Chem.Lipinski.NumHDonors(sanitized_mol)
new_mol.num_het_atoms = Chem.Lipinski.NumHeteroatoms(sanitized_mol)
new_mol.num_rot_bonds = Chem.Lipinski.NumRotatableBonds(sanitized_mol)
new_mol.num_val_electrons = Descriptors.NumValenceElectrons(sanitized_mol)
new_mol.ring_count = Chem.Lipinski.RingCount(sanitized_mol)
new_mol.tpsa = Chem.rdMolDescriptors.CalcTPSA(sanitized_mol)

# make sure there is an id so inspirations can be added
new_mol.save()

return new_mol

sanitized_mol = Chem.MolFromInchi(inchi, sanitize=True)
Chem.RemoveStereochemistry(sanitized_mol)
inchi = Chem.inchi.MolToInchi(sanitized_mol)
inchi_key = Chem.InchiToInchiKey(inchi)

try:
# NB! Max said there could be thousands of compounds per
# target so this distinct() here may become a problem

# fmt: off
cpd = Compound.objects.filter(
computedmolecule__computed_set__target=target,
).distinct().get(
inchi_key=inchi_key,
)
# fmt: on
except Compound.DoesNotExist:
cpd = Compound(
smiles=Chem.MolToSmiles(sanitized_mol),
inchi=inchi,
inchi_key=inchi_key,
current_identifier=name,
)
cpd.save()
except MultipleObjectsReturned as exc:
# NB! when processing new uploads, Compound is always
# fetched by inchi_key, so this shouldn't ever create
# duplicates. Ands LHS uploads do not create inchi_keys,
# so under normal operations duplicates should never
# occur. However there's nothing in the db to prevent
# this, so adding a catch clause and writing a meaningful
# message
logger.error(
'Duplicate compounds for target %s with inchi key %s.',
target.title,
inchi_key,
)
raise MultipleObjectsReturned from exc

return cpd

def set_props(self, cpd, props, compound_set) -> List[ScoreDescription]:
if 'ref_mols' and 'ref_pdb' not in list(props.keys()):
Expand Down Expand Up @@ -322,13 +339,10 @@ def set_mol(
smiles = Chem.MolToSmiles(mol)
inchi = Chem.inchi.MolToInchi(mol)
molecule_name = mol.GetProp('_Name')
long_inchi = None
if len(inchi) > 255:
long_inchi = inchi
inchi = inchi[:254]
version = mol.GetProp('version')

compound: Compound = self.create_mol(
inchi, name=molecule_name, long_inchi=long_inchi
inchi, compound_set.target, name=molecule_name
)

insp = mol.GetProp('ref_mols')
Expand All @@ -353,12 +367,7 @@ def set_mol(
'No matching molecules found for inspiration frag ' + i
)

if qs.count() > 1:
ids = [m.cmpd.id for m in qs]
ind = ids.index(max(ids))
ref = qs[ind]
elif qs.count() == 1:
ref = qs[0]
ref = qs.order_by('-cmpd_id').first()

insp_frags.append(ref)

Expand All @@ -385,11 +394,60 @@ def set_mol(

# Need a ComputedMolecule before saving.
# Check if anything exists already...
existing_computed_molecules = ComputedMolecule.objects.filter(
molecule_name=molecule_name, smiles=smiles, computed_set=compound_set

# I think, realistically, I only need to check compound
# fmt: off
qs = ComputedMolecule.objects.filter(
compound=compound,
).annotate(
# names come in format:
# target_name-sequential number-sequential letter,
# e.g. A71EV2A-1-a, hence grabbing the 3rd column
suffix=Func(
F('name'),
Value('-'),
Value(3),
function='split_part',
output_field=TextField(),
),
)

computed_molecule: Optional[ComputedMolecule] = None
if qs.exists():
suffix = next(
alphanumerator(start_from=qs.order_by('-suffix').first().suffix)
)
else:
suffix = 'a'

# distinct is ran on indexed field, so shouldn't be a problem
number = ComputedMolecule.objects.filter(
computed_set__target=compound_set.target,
).values('id').distinct().count() + 1
# fmt: on

name = f'v{number}{suffix}'

existing_computed_molecules = []
for k in qs:
kmol = Chem.MolFromMolBlock(k.sdf_info)
if kmol:
# find distances between corresponding atoms of the
# two conformers. if any one exceeds the _DIST_LIMIT,
# consider it to be a new ComputedMolecule
_, _, atom_map = Chem.rdMolAlign.GetBestAlignmentTransform(mol, kmol)
molconf = mol.GetConformer()
kmolconf = kmol.GetConformer()
small_enough = True
for mol_atom, kmol_atom in atom_map:
molpos = np.array(molconf.GetAtomPosition(mol_atom))
kmolpos = np.array(kmolconf.GetAtomPosition(kmol_atom))
distance = np.linalg.norm(molpos - kmolpos)
if distance >= _DIST_LIMIT:
small_enough = False
break
if small_enough:
existing_computed_molecules.append(k)

if len(existing_computed_molecules) == 1:
logger.info(
'Using existing ComputedMolecule %s', existing_computed_molecules[0]
Expand All @@ -400,10 +458,10 @@ def set_mol(
for exist in existing_computed_molecules:
logger.info('Deleting ComputedMolecule %s', exist)
exist.delete()
computed_molecule = ComputedMolecule()
if not computed_molecule:
computed_molecule = ComputedMolecule(name=name)
else:
logger.info('Creating new ComputedMolecule')
computed_molecule = ComputedMolecule()
computed_molecule = ComputedMolecule(name=name)

if isinstance(ref_so, SiteObservation):
code = ref_so.code
Expand All @@ -414,18 +472,20 @@ def set_mol(
pdb_info = ref_so
lhs_so = None

assert computed_molecule
# I don't quite understand why the overwrite of existing
# compmol.. but this is how it was, not touching it now
# update: I think it's about updating metadata. moving
# name attribute out so it won't get overwritten
computed_molecule.compound = compound
computed_molecule.computed_set = compound_set
computed_molecule.sdf_info = Chem.MolToMolBlock(mol)
computed_molecule.site_observation_code = code
computed_molecule.reference_code = code
computed_molecule.molecule_name = molecule_name
computed_molecule.name = f"{target}-{computed_molecule.identifier}"
computed_molecule.smiles = smiles
computed_molecule.pdb = lhs_so
# TODO: this is wrong
computed_molecule.pdb_info = pdb_info
computed_molecule.version = version
# Extract possible reference URL and Rationale
# URLs have to be valid URLs and rationals must contain more than one word
ref_url: Optional[str] = (
Expand All @@ -447,6 +507,8 @@ def set_mol(
# Done
computed_molecule.save()

compound_set.computed_molecules.add(computed_molecule)

# No update the molecule in the original file...
add_props_to_sdf_molecule(
sdf_file=filename,
Expand Down Expand Up @@ -530,50 +592,51 @@ def task(self) -> ComputedSet:
)

# Do we have any existing ComputedSets?
# Ones with the same method and upload date?
today: datetime.date = datetime.date.today()
existing_sets: List[ComputedSet] = ComputedSet.objects.filter(
method=truncated_submitter_method, upload_date=today
).all()
# If so, find the one with the highest ordinal.
latest_ordinal: int = 0
for exiting_set in existing_sets:
assert exiting_set.md_ordinal > 0
if exiting_set.md_ordinal > latest_ordinal:
latest_ordinal = exiting_set.md_ordinal
if latest_ordinal:
logger.info(
'Found existing ComputedSets for method "%s" on %s (%d) with ordinal=%d',
truncated_submitter_method,
str(today),
len(existing_sets),
latest_ordinal,
try:
computed_set = ComputedSet.objects.get(name=self.computed_set_name)
# refresh some attributes
computed_set.md_ordinal = F('md_ordinal') + 1
computed_set.upload_date = datetime.date.today()
computed_set.save()
except ComputedSet.DoesNotExist:
# no, create new

today: datetime.date = datetime.date.today()
new_ordinal: int = 1
try:
target = Target.objects.get(title=self.target)
except Target.DoesNotExist as exc:
# probably wrong target name supplied
logger.error('Target %s does not exist', self.target)
raise Target.DoesNotExist from exc

cs_name: str = (
f'{truncated_submitter_method}-{str(today)}-'
+ f'{get_column_letter(new_ordinal)}'
)
# ordinals are 1-based
new_ordinal: int = latest_ordinal + 1

# The computed set "name" consists of the "method",
# today's date and a 2-digit ordinal. The ordinal
# is used to distinguish between computed sets uploaded
# with the same method on the same day.
assert new_ordinal > 0
cs_name: str = f"{truncated_submitter_method}-{str(today)}-{get_column_letter(new_ordinal)}"
logger.info('Creating new ComputedSet "%s"', cs_name)

computed_set: ComputedSet = ComputedSet()
computed_set.name = cs_name
computed_set.md_ordinal = new_ordinal
computed_set.upload_date = today
computed_set.method = self.submitter_method[: ComputedSet.LENGTH_METHOD]
computed_set.target = Target.objects.get(title=self.target)
computed_set.spec_version = float(self.version.strip('ver_'))
if self.user_id:
computed_set.owner_user = User.objects.get(id=self.user_id)
else:
# The User ID may only be None if AUTHENTICATE_UPLOAD is False.
# Here the ComputedSet owner will take on a default (anonymous) value.
assert settings.AUTHENTICATE_UPLOAD is False
computed_set.save()
logger.info('Creating new ComputedSet "%s"', cs_name)

computed_set = ComputedSet(
name=cs_name,
md_ordinal=new_ordinal,
upload_date=today,
method=self.submitter_method[: ComputedSet.LENGTH_METHOD],
target=target,
spec_version=float(self.version.strip('ver_')),
)
if self.user_id:
try:
computed_set.owner_user = User.objects.get(id=self.user_id)
except User.DoesNotExist as exc:
logger.error('User %s does not exist', self.user_id)
raise User.DoesNotExist from exc

else:
# The User ID may only be None if AUTHENTICATE_UPLOAD is False.
# Here the ComputedSet owner will take on a default (anonymous) value.
assert settings.AUTHENTICATE_UPLOAD is False

computed_set.save()

# check compound set folder exists.
cmp_set_folder = os.path.join(
Expand Down
18 changes: 18 additions & 0 deletions viewer/migrations/0056_compound_inchi_key.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.25 on 2024-06-11 13:10

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('viewer', '0055_merge_20240516_1003'),
]

operations = [
migrations.AddField(
model_name='compound',
name='inchi_key',
field=models.CharField(blank=True, db_index=True, max_length=80),
),
]
Loading