From 8437f5da7c40c7a63478c3a1b2ad3fd99b9177bc Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Fri, 19 Jan 2024 13:56:03 +0100 Subject: [PATCH 1/3] fix: removing duplicates in conditions JSONL --- scripts/genes-integrate-diseases.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/scripts/genes-integrate-diseases.py b/scripts/genes-integrate-diseases.py index 7e975ef..91facf3 100644 --- a/scripts/genes-integrate-diseases.py +++ b/scripts/genes-integrate-diseases.py @@ -929,15 +929,19 @@ def run(self, pickle_path: Optional[str] = None): "panelapp_associations": tuple(sorted(assocs, key=lambda a: a.confidence_level)) } ) + seen = set() for key, assoc in self.disease_assocs.items(): disease_assoc = conditions_by_hgnc[key.hgnc_id].disease_associations - conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy( - update={ - "disease_associations": tuple( - sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence) - ) - } - ) + marker = (assoc.hgnc_id, assoc.labeled_disorders) + if marker not in seen: + seen.add(marker) + conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy( + update={ + "disease_associations": tuple( + sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence) + ) + } + ) result = ResultContainer(results=tuple(conditions_by_hgnc.values())) for assoc in result.results: json.dump(obj=assoc.model_dump(mode="json"), fp=sys.stdout) From 8d2e228e225cf90131baa3aa0c266ff8a99c477d Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Fri, 19 Jan 2024 14:44:56 +0100 Subject: [PATCH 2/3] wip --- rules/output/annonars/genes.smk | 2 +- scripts/genes-integrate-diseases.py | 57 +++++++++-------------------- 2 files changed, 19 insertions(+), 40 deletions(-) diff --git a/rules/output/annonars/genes.smk b/rules/output/annonars/genes.smk index 4471bc5..7295623 100644 --- a/rules/output/annonars/genes.smk +++ b/rules/output/annonars/genes.smk @@ -71,6 +71,6 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file --value date={wildcards.date} \ \ --value v_annonars={wildcards.v_annonars} \ - --value v_downloader={PV.downloader} + --value v_downloader={PV.downloader} \ > {output.spec_yaml} """ diff --git a/scripts/genes-integrate-diseases.py b/scripts/genes-integrate-diseases.py index 91facf3..5ed7b89 100644 --- a/scripts/genes-integrate-diseases.py +++ b/scripts/genes-integrate-diseases.py @@ -831,17 +831,6 @@ def __lt__(self, other: "ResultContainer") -> bool: ) -class GeneDiseaseKey(BaseModel): - """Key for a gene-disease association.""" - - model_config = ConfigDict(frozen=True) - - #: Gene HGNC ID. - hgnc_id: str - #: Disease database ID. - disease_id: str - - class Integrator: """Implementation of the integration algorithm.""" @@ -880,29 +869,26 @@ class Integrator: def __init__(self): """Initialise the integrator.""" #: Mapping from `(gene_hgnc_id, disease_id)` to `GeneDiseaseAssociationEntry`. - self.disease_assocs: Dict[GeneDiseaseKey, GeneDiseaseAssociation] = {} + self.disease_assocs: Dict[str, List[GeneDiseaseAssociation]] = {} #: Mapping from `hgnc_id` to list of `PanelappAssociation`s. self.panelapp_assocs: Dict[str, List[PanelappAssociation]] = {} def register_disease_assoc(self, assoc: GeneDiseaseAssociation): """Register a gene-disease association.""" - found_list = set() - for disease_id in assoc.disease_ids: - key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id) - if key in self.disease_assocs: - found_list.add(self.disease_assocs[key]) + found_list = self.disease_assocs.get(assoc.hgnc_id, []) if not found_list: - for disease_id in assoc.disease_ids: - key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id) - self.disease_assocs[key] = assoc + self.disease_assocs[assoc.hgnc_id] = [assoc] else: - if len(found_list) != 1: - logger.warning(f"Found multiple associations for {assoc.hgnc_id}") + merged_any = False + new_list = [] for found in found_list: - found = found.merge(assoc) - for disease_id in assoc.disease_ids: - key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id) - self.disease_assocs[key] = found + if set(assoc.disease_ids) & set(found.disease_ids): + merged_any = True + found = found.merge(assoc) + new_list.append(found) + if not merged_any: + new_list.append(assoc) + self.disease_assocs[assoc.hgnc_id] = new_list def run(self, pickle_path: Optional[str] = None): logger.info("Building gene-disease map...") @@ -918,7 +904,8 @@ def run(self, pickle_path: Optional[str] = None): for hgnc_id in sorted( set( chain( - (k.hgnc_id for k in self.disease_assocs.keys()), self.panelapp_assocs.keys() + (hgnc_id for hgnc_id in self.disease_assocs.keys()), + self.panelapp_assocs.keys(), ) ) ) @@ -930,18 +917,10 @@ def run(self, pickle_path: Optional[str] = None): } ) seen = set() - for key, assoc in self.disease_assocs.items(): - disease_assoc = conditions_by_hgnc[key.hgnc_id].disease_associations - marker = (assoc.hgnc_id, assoc.labeled_disorders) - if marker not in seen: - seen.add(marker) - conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy( - update={ - "disease_associations": tuple( - sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence) - ) - } - ) + for hgnc_id, assocs in self.disease_assocs.items(): + conditions_by_hgnc[hgnc_id] = conditions_by_hgnc[hgnc_id].model_copy( + update={"disease_associations": tuple(sorted(assocs, key=lambda a: a.confidence))} + ) result = ResultContainer(results=tuple(conditions_by_hgnc.values())) for assoc in result.results: json.dump(obj=assoc.model_dump(mode="json"), fp=sys.stdout) From d0a0b30527edad136d9acf17a5a65f94a3095c41 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Fri, 19 Jan 2024 14:55:06 +0100 Subject: [PATCH 3/3] wip --- scripts/genes-integrate-diseases.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/genes-integrate-diseases.py b/scripts/genes-integrate-diseases.py index 5ed7b89..f8cd0ce 100644 --- a/scripts/genes-integrate-diseases.py +++ b/scripts/genes-integrate-diseases.py @@ -916,7 +916,6 @@ def run(self, pickle_path: Optional[str] = None): "panelapp_associations": tuple(sorted(assocs, key=lambda a: a.confidence_level)) } ) - seen = set() for hgnc_id, assocs in self.disease_assocs.items(): conditions_by_hgnc[hgnc_id] = conditions_by_hgnc[hgnc_id].model_copy( update={"disease_associations": tuple(sorted(assocs, key=lambda a: a.confidence))}