Skip to content

Commit f5377d3

Browse files
committed
switch to newer observedVarKeys format
1 parent 5151021 commit f5377d3

File tree

5 files changed

+376
-316
lines changed

5 files changed

+376
-316
lines changed

pori_python/ipr/ipr.py

+88-115
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from itertools import product
88
from copy import copy
99
from typing import Dict, Iterable, List, Sequence, Set, Tuple, cast
10-
10+
import uuid
1111
from pori_python.graphkb import GraphKBConnection
1212
from pori_python.graphkb import statement as gkb_statement
1313
from pori_python.graphkb import vocab as gkb_vocab
@@ -49,9 +49,13 @@ def filter_structural_variants(
4949
Filter structural variants to remove non-high quality events unless they are matched/annotated or
5050
they involve a gene that is a known fusion partner
5151
"""
52-
matched_svs = {match["variant"] for match in kb_matches if match["variantType"] == "sv"}
52+
matched_svs = {
53+
match["variant"] for match in kb_matches if match["variantType"] == "sv"
54+
}
5355
fusion_genes = {
54-
gene["name"] for gene in gene_annotations if gene.get("knownFusionPartner", False)
56+
gene["name"]
57+
for gene in gene_annotations
58+
if gene.get("knownFusionPartner", False)
5559
}
5660

5761
result = []
@@ -89,7 +93,9 @@ def get_evidencelevel_mapping(graphkb_conn: GraphKBConnection) -> Dict[str, str]
8993

9094
# Filter IPR EvidenceLevel and map each outgoing CrossReferenceOf to displayName
9195
ipr_source_rid = graphkb_conn.get_source("ipr")["@rid"]
92-
ipr_evidence_levels = filter(lambda d: d.get("source") == ipr_source_rid, evidence_levels)
96+
ipr_evidence_levels = filter(
97+
lambda d: d.get("source") == ipr_source_rid, evidence_levels
98+
)
9399
cross_references_mapping: Dict[str, str] = dict()
94100
ipr_rids_to_displayname: Dict[str, str] = dict()
95101
for level in ipr_evidence_levels:
@@ -138,7 +144,9 @@ def convert_statements_to_alterations(
138144
"""
139145
disease_matches = {
140146
r["@rid"]
141-
for r in gkb_vocab.get_term_tree(graphkb_conn, disease_name, ontology_class="Disease")
147+
for r in gkb_vocab.get_term_tree(
148+
graphkb_conn, disease_name, ontology_class="Disease"
149+
)
142150
}
143151

144152
if not disease_matches:
@@ -151,7 +159,9 @@ def convert_statements_to_alterations(
151159

152160
# get the recruitment status for any trial associated with a statement
153161
clinical_trials = [
154-
s["subject"]["@rid"] for s in statements if s["subject"]["@class"] == "ClinicalTrial"
162+
s["subject"]["@rid"]
163+
for s in statements
164+
if s["subject"]["@class"] == "ClinicalTrial"
155165
]
156166
recruitment_statuses = {}
157167
if clinical_trials:
@@ -168,7 +178,9 @@ def convert_statements_to_alterations(
168178

169179
for statement in statements:
170180
variants = [
171-
cast(Variant, c) for c in statement["conditions"] if c["@class"] in VARIANT_CLASSES
181+
cast(Variant, c)
182+
for c in statement["conditions"]
183+
if c["@class"] in VARIANT_CLASSES
172184
]
173185
diseases = [c for c in statement["conditions"] if c["@class"] == "Disease"]
174186
disease_match = len(diseases) == 1 and diseases[0]["@rid"] in disease_matches
@@ -189,8 +201,12 @@ def convert_statements_to_alterations(
189201

190202
evidence_level_str = display_evidence_levels(statement)
191203
evidence_levels = statement.get("evidenceLevel") or []
192-
ipr_evidence_levels = [ev_map[el.get("@rid", "")] for el in evidence_levels if el]
193-
ipr_evidence_levels_str = ";".join(sorted(set([el for el in ipr_evidence_levels])))
204+
ipr_evidence_levels = [
205+
ev_map[el.get("@rid", "")] for el in evidence_levels if el
206+
]
207+
ipr_evidence_levels_str = ";".join(
208+
sorted(set([el for el in ipr_evidence_levels]))
209+
)
194210

195211
for variant in variants:
196212
if variant["@rid"] not in variant_matches:
@@ -200,10 +216,16 @@ def convert_statements_to_alterations(
200216
"approvedTherapy": approved_therapy or False,
201217
"category": ipr_section or "unknown",
202218
"context": (
203-
statement["subject"]["displayName"] if statement["subject"] else ""
219+
statement["subject"]["displayName"]
220+
if statement["subject"]
221+
else ""
222+
),
223+
"kbContextId": (
224+
statement["subject"]["@rid"] if statement["subject"] else ""
225+
),
226+
"disease": ";".join(
227+
sorted(d.get("displayName", "") for d in diseases)
204228
),
205-
"kbContextId": (statement["subject"]["@rid"] if statement["subject"] else ""),
206-
"disease": ";".join(sorted(d.get("displayName", "") for d in diseases)),
207229
"evidenceLevel": evidence_level_str or "",
208230
"iprEvidenceLevel": ipr_evidence_levels_str or "",
209231
"kbStatementId": statement["@rid"],
@@ -234,67 +256,6 @@ def convert_statements_to_alterations(
234256
return rows
235257

236258

237-
"""
238-
"kbMatchedStatements": [
239-
{
240-
"approvedTherapy": false,
241-
"category": "therapeutic",
242-
"context": "test multivariant statement",
243-
"disease": "colorectal cancer [DOID:9256]",
244-
"kbStatementId": "#999:999",
245-
"matchedCancer": true,
246-
"reference": "pmid:TEST1",
247-
"relevance": "resistance",
248-
"iprEvidenceLevel": "IPR-D",
249-
"externalSource": "IPRKB",
250-
"reviewStatus": "pending",
251-
"kbData": {},
252-
"requiredKbMatches": [
253-
"#158:1343",
254-
"#999:9999"
255-
]
256-
}
257-
],
258-
"kbStatementMatchedConditions": [
259-
{
260-
"kbStatementId": "#999:999",
261-
"observedVariantKeys": [
262-
"TEST3",
263-
"TEST1"
264-
]
265-
},
266-
{
267-
"kbStatementId": "#999:999",
268-
"observedVariantKeys": [
269-
"TEST2",
270-
"TEST1"
271-
]
272-
}
273-
],
274-
"kbVariants": [
275-
{
276-
"kbVariant": "APC mutation",
277-
"variantKey": "TEST3",
278-
"variantType": "mut",
279-
"kbVariantId": "#158:1343"
280-
},
281-
{
282-
"kbVariant": "APC mutation",
283-
"variantKey": "TEST2",
284-
"variantType": "mut",
285-
"kbVariantId": "#158:1343"
286-
},
287-
{
288-
"kbVariant": "ZFP36L2:p.Q401del",
289-
"variant": "TEST1",
290-
"variantType": "mut",
291-
"kbVariantId": "#999:9999"
292-
}
293-
],
294-
295-
"""
296-
297-
298259
def select_expression_plots(
299260
kb_matches: List[KbMatch] | List[Hashabledict], all_variants: Sequence[IprVariant]
300261
) -> List[ImageDefinition]:
@@ -327,7 +288,9 @@ def select_expression_plots(
327288
gene = str(variant.get("gene", ""))
328289
hist = str(variant.get("histogramImage", ""))
329290
if hist:
330-
images_by_gene[gene] = ImageDefinition({"key": f"expDensity.{gene}", "path": hist})
291+
images_by_gene[gene] = ImageDefinition(
292+
{"key": f"expDensity.{gene}", "path": hist}
293+
)
331294
return [images_by_gene[gene] for gene in selected_genes if gene in images_by_gene]
332295

333296

@@ -370,7 +333,9 @@ def create_key_alterations(
370333
counts[type_mapping[variant_type]].add(variant_key)
371334

372335
if variant_type == "exp":
373-
alterations.append(f'{variant.get("gene","")} ({variant.get("expressionState")})')
336+
alterations.append(
337+
f'{variant.get("gene","")} ({variant.get("expressionState")})'
338+
)
374339
elif variant_type == "cnv":
375340
alterations.append(f'{variant.get("gene","")} ({variant.get("cnvState")})')
376341
# only show germline if relevant
@@ -447,15 +412,19 @@ def germline_kb_matches(
447412
# Remove any matches to germline events
448413
for alt in somatic_alts:
449414
var_list = [v for v in all_variants if v["key"] == alt["variant"]]
450-
somatic_var_list = [v for v in var_list if not v.get("germline", not assume_somatic)]
415+
somatic_var_list = [
416+
v for v in var_list if not v.get("germline", not assume_somatic)
417+
]
451418
if var_list and not somatic_var_list:
452419
logger.debug(
453420
f"Dropping germline match to somatic statement kbStatementId:{alt['kbStatementId']}: {alt['kbVariant']} {alt['category']}"
454421
)
455422
elif somatic_var_list:
456423
ret_list.append(alt) # match to somatic variant
457424
else:
458-
ret_list.append(alt) # alteration not in any specific keys matches to check.
425+
ret_list.append(
426+
alt
427+
) # alteration not in any specific keys matches to check.
459428

460429
return ret_list
461430

@@ -530,7 +499,9 @@ def multi_variant_filtering(
530499

531500
# Filter out incomplete matches from gkb_matches
532501
return [
533-
match for match in gkb_matches if match["kbStatementId"] in complete_matching_statements
502+
match
503+
for match in gkb_matches
504+
if match["kbStatementId"] in complete_matching_statements
534505
]
535506

536507

@@ -576,7 +547,9 @@ def get_kb_matched_statements(
576547
for item in gkb_matches:
577548
stmt = copy(item)
578549
stmt["requiredKbMatches"].sort()
579-
kbs = KbMatchedStatement({key: val for (key, val) in stmt.items() if key in kbs_keys})
550+
kbs = KbMatchedStatement(
551+
{key: val for (key, val) in stmt.items() if key in kbs_keys}
552+
)
580553
dict_key = str(kbs)
581554
kbMatchedStatements[dict_key] = kbs
582555
return [*kbMatchedStatements.values()]
@@ -585,23 +558,22 @@ def get_kb_matched_statements(
585558
def get_kb_statement_matched_conditions(
586559
gkb_matches: List[KbMatch] | List[Hashabledict],
587560
allow_partial_matches: bool = False,
588-
infer_possible_matches: bool = False,
589561
) -> List[KbMatchedStatementConditionSet]:
590562
"""
591563
Prepares the kbMatchedStatementConditions section, with expected format
592-
kbStatementId: str
593-
observedVariantKeys: [str]
564+
kbStatementId: #999:999
565+
observedVariantKeys: [{'observedVariantKey': 'test1', 'kbVariantId': '#111:111'}]
594566
595567
where the kbStatementId is a gkb statement rid
596-
and each of the observed variant keys is a 'variantKey' field from one
597-
of the records in kbVariants.
568+
and each of the observed variant keys is a reference to
569+
a kbMatch (i.e., an observed variant/kb variant pair).
598570
599571
Each record in the output from this function should represent
600572
one set of observed variants that satisfies the gkb variants in the
601573
conditions of the statement.
602574
603575
If more than one set of observed variants satisfies the gkb variant conditions of the
604-
statement, the output should include one record for each possible set.
576+
statement, the output from this function should include one record for each possible set.
605577
606578
Eg if the stmt requires gkb variants A and B, and the observed variants include
607579
X which matches A, and Y and Z which both match B,
@@ -611,73 +583,74 @@ def get_kb_statement_matched_conditions(
611583
Params:
612584
gkb_matches: KbMatch statements to be processed
613585
allow_partial_matches: include statements where not all requirements are satisfied
614-
infer_possible_matches: allow variants to be used to support statements even when
615-
the connection is not made explicit in the input data
616586
Returns:
617587
list of KbStatementMatchedConditionSet records
618588
"""
619589

620-
kbVariants = get_kb_variants(gkb_matches)
621590
kbMatchedStatements = get_kb_matched_statements(gkb_matches)
622591
kbMatchedStatementConditions = {}
623592

624593
for kbStmt in kbMatchedStatements:
625-
stmts = [item for item in gkb_matches if item["kbStatementId"] == kbStmt["kbStatementId"]]
594+
stmts = [
595+
item
596+
for item in gkb_matches
597+
if item["kbStatementId"] == kbStmt["kbStatementId"]
598+
]
599+
626600
requirements = {}
627601
for requirement in stmts[0]["requiredKbMatches"]:
628602
if not requirements.get(requirement, False):
629-
if infer_possible_matches:
630-
# if true, use all possible links between kbVariantId and kbStatement
631-
reqlist = [
632-
item["variantKey"]
633-
for item in kbVariants
634-
if item["kbVariantId"] == requirement
635-
]
636-
else:
637-
# only use explicit variant/statement links
638-
reqlist = [
639-
item["variant"]
640-
for item in gkb_matches
641-
if (
642-
item["kbVariantId"] == requirement
643-
and item["kbStatementId"] == kbStmt["kbStatementId"]
644-
)
645-
]
603+
# only use explicit variant/statement links
604+
reqlist = [
605+
{
606+
"kbVariantId": requirement,
607+
"observedVariantKey": item["variant"],
608+
}
609+
for item in gkb_matches
610+
if (
611+
item["kbVariantId"] == requirement
612+
and item["kbStatementId"] == kbStmt["kbStatementId"]
613+
)
614+
]
646615
requirements[requirement] = reqlist
647616

648-
# remove empty sets for requirements if allowing partial matches
617+
# remove empty sets from requirements if allowing partial matches
649618
if allow_partial_matches:
650-
requirements = {key: val for (key, val) in requirements.items() if val}
619+
requirements = {
620+
key: val for (key, val) in requirements.items() if len(val) > 0
621+
}
651622

652623
variantConditionSets = list(product(*requirements.values()))
653624
conditionSets = [
654625
{"kbStatementId": kbStmt["kbStatementId"], "observedVariantKeys": item}
655626
for item in variantConditionSets
656627
]
657628
for conditionSet in conditionSets:
658-
# remove Nones
659-
observedVariantKeys = [item for item in conditionSet["observedVariantKeys"] if item]
660-
observedVariantKeys.sort()
629+
observedVariantKeys = sorted(
630+
conditionSet["observedVariantKeys"],
631+
key=lambda x: (x["kbVariantId"], x["observedVariantKey"]),
632+
)
661633
kbmc = KbMatchedStatementConditionSet(
662634
{
663635
"kbStatementId": conditionSet["kbStatementId"],
664636
"observedVariantKeys": observedVariantKeys,
665637
}
666638
)
667-
kbMatchedStatementConditions[str(kbmc)] = kbmc
668-
639+
key = str(
640+
uuid.uuid5(uuid.NAMESPACE_DNS, str(kbmc))
641+
) # to make it more readable when debugging
642+
kbMatchedStatementConditions[key] = kbmc
669643
return [*kbMatchedStatementConditions.values()]
670644

671645

672646
def get_kb_matches_sections(
673647
gkb_matches: List[KbMatch] | List[Hashabledict],
674648
allow_partial_matches=False,
675-
infer_possible_matches=False,
676649
) -> KbMatchSections:
677650
kb_variants = get_kb_variants(gkb_matches)
678651
kb_matched_statements = get_kb_matched_statements(gkb_matches)
679652
kb_statement_matched_conditions = get_kb_statement_matched_conditions(
680-
gkb_matches, allow_partial_matches, infer_possible_matches
653+
gkb_matches, allow_partial_matches
681654
)
682655
return {
683656
"kbMatchedVariants": kb_variants,

0 commit comments

Comments
 (0)