7
7
from itertools import product
8
8
from copy import copy
9
9
from typing import Dict , Iterable , List , Sequence , Set , Tuple , cast
10
-
10
+ import uuid
11
11
from pori_python .graphkb import GraphKBConnection
12
12
from pori_python .graphkb import statement as gkb_statement
13
13
from pori_python .graphkb import vocab as gkb_vocab
@@ -49,9 +49,13 @@ def filter_structural_variants(
49
49
Filter structural variants to remove non-high quality events unless they are matched/annotated or
50
50
they involve a gene that is a known fusion partner
51
51
"""
52
- matched_svs = {match ["variant" ] for match in kb_matches if match ["variantType" ] == "sv" }
52
+ matched_svs = {
53
+ match ["variant" ] for match in kb_matches if match ["variantType" ] == "sv"
54
+ }
53
55
fusion_genes = {
54
- gene ["name" ] for gene in gene_annotations if gene .get ("knownFusionPartner" , False )
56
+ gene ["name" ]
57
+ for gene in gene_annotations
58
+ if gene .get ("knownFusionPartner" , False )
55
59
}
56
60
57
61
result = []
@@ -89,7 +93,9 @@ def get_evidencelevel_mapping(graphkb_conn: GraphKBConnection) -> Dict[str, str]
89
93
90
94
# Filter IPR EvidenceLevel and map each outgoing CrossReferenceOf to displayName
91
95
ipr_source_rid = graphkb_conn .get_source ("ipr" )["@rid" ]
92
- ipr_evidence_levels = filter (lambda d : d .get ("source" ) == ipr_source_rid , evidence_levels )
96
+ ipr_evidence_levels = filter (
97
+ lambda d : d .get ("source" ) == ipr_source_rid , evidence_levels
98
+ )
93
99
cross_references_mapping : Dict [str , str ] = dict ()
94
100
ipr_rids_to_displayname : Dict [str , str ] = dict ()
95
101
for level in ipr_evidence_levels :
@@ -138,7 +144,9 @@ def convert_statements_to_alterations(
138
144
"""
139
145
disease_matches = {
140
146
r ["@rid" ]
141
- for r in gkb_vocab .get_term_tree (graphkb_conn , disease_name , ontology_class = "Disease" )
147
+ for r in gkb_vocab .get_term_tree (
148
+ graphkb_conn , disease_name , ontology_class = "Disease"
149
+ )
142
150
}
143
151
144
152
if not disease_matches :
@@ -151,7 +159,9 @@ def convert_statements_to_alterations(
151
159
152
160
# get the recruitment status for any trial associated with a statement
153
161
clinical_trials = [
154
- s ["subject" ]["@rid" ] for s in statements if s ["subject" ]["@class" ] == "ClinicalTrial"
162
+ s ["subject" ]["@rid" ]
163
+ for s in statements
164
+ if s ["subject" ]["@class" ] == "ClinicalTrial"
155
165
]
156
166
recruitment_statuses = {}
157
167
if clinical_trials :
@@ -168,7 +178,9 @@ def convert_statements_to_alterations(
168
178
169
179
for statement in statements :
170
180
variants = [
171
- cast (Variant , c ) for c in statement ["conditions" ] if c ["@class" ] in VARIANT_CLASSES
181
+ cast (Variant , c )
182
+ for c in statement ["conditions" ]
183
+ if c ["@class" ] in VARIANT_CLASSES
172
184
]
173
185
diseases = [c for c in statement ["conditions" ] if c ["@class" ] == "Disease" ]
174
186
disease_match = len (diseases ) == 1 and diseases [0 ]["@rid" ] in disease_matches
@@ -189,8 +201,12 @@ def convert_statements_to_alterations(
189
201
190
202
evidence_level_str = display_evidence_levels (statement )
191
203
evidence_levels = statement .get ("evidenceLevel" ) or []
192
- ipr_evidence_levels = [ev_map [el .get ("@rid" , "" )] for el in evidence_levels if el ]
193
- ipr_evidence_levels_str = ";" .join (sorted (set ([el for el in ipr_evidence_levels ])))
204
+ ipr_evidence_levels = [
205
+ ev_map [el .get ("@rid" , "" )] for el in evidence_levels if el
206
+ ]
207
+ ipr_evidence_levels_str = ";" .join (
208
+ sorted (set ([el for el in ipr_evidence_levels ]))
209
+ )
194
210
195
211
for variant in variants :
196
212
if variant ["@rid" ] not in variant_matches :
@@ -200,10 +216,16 @@ def convert_statements_to_alterations(
200
216
"approvedTherapy" : approved_therapy or False ,
201
217
"category" : ipr_section or "unknown" ,
202
218
"context" : (
203
- statement ["subject" ]["displayName" ] if statement ["subject" ] else ""
219
+ statement ["subject" ]["displayName" ]
220
+ if statement ["subject" ]
221
+ else ""
222
+ ),
223
+ "kbContextId" : (
224
+ statement ["subject" ]["@rid" ] if statement ["subject" ] else ""
225
+ ),
226
+ "disease" : ";" .join (
227
+ sorted (d .get ("displayName" , "" ) for d in diseases )
204
228
),
205
- "kbContextId" : (statement ["subject" ]["@rid" ] if statement ["subject" ] else "" ),
206
- "disease" : ";" .join (sorted (d .get ("displayName" , "" ) for d in diseases )),
207
229
"evidenceLevel" : evidence_level_str or "" ,
208
230
"iprEvidenceLevel" : ipr_evidence_levels_str or "" ,
209
231
"kbStatementId" : statement ["@rid" ],
@@ -234,67 +256,6 @@ def convert_statements_to_alterations(
234
256
return rows
235
257
236
258
237
- """
238
- "kbMatchedStatements": [
239
- {
240
- "approvedTherapy": false,
241
- "category": "therapeutic",
242
- "context": "test multivariant statement",
243
- "disease": "colorectal cancer [DOID:9256]",
244
- "kbStatementId": "#999:999",
245
- "matchedCancer": true,
246
- "reference": "pmid:TEST1",
247
- "relevance": "resistance",
248
- "iprEvidenceLevel": "IPR-D",
249
- "externalSource": "IPRKB",
250
- "reviewStatus": "pending",
251
- "kbData": {},
252
- "requiredKbMatches": [
253
- "#158:1343",
254
- "#999:9999"
255
- ]
256
- }
257
- ],
258
- "kbStatementMatchedConditions": [
259
- {
260
- "kbStatementId": "#999:999",
261
- "observedVariantKeys": [
262
- "TEST3",
263
- "TEST1"
264
- ]
265
- },
266
- {
267
- "kbStatementId": "#999:999",
268
- "observedVariantKeys": [
269
- "TEST2",
270
- "TEST1"
271
- ]
272
- }
273
- ],
274
- "kbVariants": [
275
- {
276
- "kbVariant": "APC mutation",
277
- "variantKey": "TEST3",
278
- "variantType": "mut",
279
- "kbVariantId": "#158:1343"
280
- },
281
- {
282
- "kbVariant": "APC mutation",
283
- "variantKey": "TEST2",
284
- "variantType": "mut",
285
- "kbVariantId": "#158:1343"
286
- },
287
- {
288
- "kbVariant": "ZFP36L2:p.Q401del",
289
- "variant": "TEST1",
290
- "variantType": "mut",
291
- "kbVariantId": "#999:9999"
292
- }
293
- ],
294
-
295
- """
296
-
297
-
298
259
def select_expression_plots (
299
260
kb_matches : List [KbMatch ] | List [Hashabledict ], all_variants : Sequence [IprVariant ]
300
261
) -> List [ImageDefinition ]:
@@ -327,7 +288,9 @@ def select_expression_plots(
327
288
gene = str (variant .get ("gene" , "" ))
328
289
hist = str (variant .get ("histogramImage" , "" ))
329
290
if hist :
330
- images_by_gene [gene ] = ImageDefinition ({"key" : f"expDensity.{ gene } " , "path" : hist })
291
+ images_by_gene [gene ] = ImageDefinition (
292
+ {"key" : f"expDensity.{ gene } " , "path" : hist }
293
+ )
331
294
return [images_by_gene [gene ] for gene in selected_genes if gene in images_by_gene ]
332
295
333
296
@@ -370,7 +333,9 @@ def create_key_alterations(
370
333
counts [type_mapping [variant_type ]].add (variant_key )
371
334
372
335
if variant_type == "exp" :
373
- alterations .append (f'{ variant .get ("gene" ,"" )} ({ variant .get ("expressionState" )} )' )
336
+ alterations .append (
337
+ f'{ variant .get ("gene" ,"" )} ({ variant .get ("expressionState" )} )'
338
+ )
374
339
elif variant_type == "cnv" :
375
340
alterations .append (f'{ variant .get ("gene" ,"" )} ({ variant .get ("cnvState" )} )' )
376
341
# only show germline if relevant
@@ -447,15 +412,19 @@ def germline_kb_matches(
447
412
# Remove any matches to germline events
448
413
for alt in somatic_alts :
449
414
var_list = [v for v in all_variants if v ["key" ] == alt ["variant" ]]
450
- somatic_var_list = [v for v in var_list if not v .get ("germline" , not assume_somatic )]
415
+ somatic_var_list = [
416
+ v for v in var_list if not v .get ("germline" , not assume_somatic )
417
+ ]
451
418
if var_list and not somatic_var_list :
452
419
logger .debug (
453
420
f"Dropping germline match to somatic statement kbStatementId:{ alt ['kbStatementId' ]} : { alt ['kbVariant' ]} { alt ['category' ]} "
454
421
)
455
422
elif somatic_var_list :
456
423
ret_list .append (alt ) # match to somatic variant
457
424
else :
458
- ret_list .append (alt ) # alteration not in any specific keys matches to check.
425
+ ret_list .append (
426
+ alt
427
+ ) # alteration not in any specific keys matches to check.
459
428
460
429
return ret_list
461
430
@@ -530,7 +499,9 @@ def multi_variant_filtering(
530
499
531
500
# Filter out incomplete matches from gkb_matches
532
501
return [
533
- match for match in gkb_matches if match ["kbStatementId" ] in complete_matching_statements
502
+ match
503
+ for match in gkb_matches
504
+ if match ["kbStatementId" ] in complete_matching_statements
534
505
]
535
506
536
507
@@ -576,7 +547,9 @@ def get_kb_matched_statements(
576
547
for item in gkb_matches :
577
548
stmt = copy (item )
578
549
stmt ["requiredKbMatches" ].sort ()
579
- kbs = KbMatchedStatement ({key : val for (key , val ) in stmt .items () if key in kbs_keys })
550
+ kbs = KbMatchedStatement (
551
+ {key : val for (key , val ) in stmt .items () if key in kbs_keys }
552
+ )
580
553
dict_key = str (kbs )
581
554
kbMatchedStatements [dict_key ] = kbs
582
555
return [* kbMatchedStatements .values ()]
@@ -585,23 +558,22 @@ def get_kb_matched_statements(
585
558
def get_kb_statement_matched_conditions (
586
559
gkb_matches : List [KbMatch ] | List [Hashabledict ],
587
560
allow_partial_matches : bool = False ,
588
- infer_possible_matches : bool = False ,
589
561
) -> List [KbMatchedStatementConditionSet ]:
590
562
"""
591
563
Prepares the kbMatchedStatementConditions section, with expected format
592
- kbStatementId: str
593
- observedVariantKeys: [str ]
564
+ kbStatementId: #999:999
565
+ observedVariantKeys: [{'observedVariantKey': 'test1', 'kbVariantId': '#111:111'} ]
594
566
595
567
where the kbStatementId is a gkb statement rid
596
- and each of the observed variant keys is a 'variantKey' field from one
597
- of the records in kbVariants .
568
+ and each of the observed variant keys is a reference to
569
+ a kbMatch (i.e., an observed variant/kb variant pair).
598
570
599
571
Each record in the output from this function should represent
600
572
one set of observed variants that satisfies the gkb variants in the
601
573
conditions of the statement.
602
574
603
575
If more than one set of observed variants satisfies the gkb variant conditions of the
604
- statement, the output should include one record for each possible set.
576
+ statement, the output from this function should include one record for each possible set.
605
577
606
578
Eg if the stmt requires gkb variants A and B, and the observed variants include
607
579
X which matches A, and Y and Z which both match B,
@@ -611,73 +583,74 @@ def get_kb_statement_matched_conditions(
611
583
Params:
612
584
gkb_matches: KbMatch statements to be processed
613
585
allow_partial_matches: include statements where not all requirements are satisfied
614
- infer_possible_matches: allow variants to be used to support statements even when
615
- the connection is not made explicit in the input data
616
586
Returns:
617
587
list of KbStatementMatchedConditionSet records
618
588
"""
619
589
620
- kbVariants = get_kb_variants (gkb_matches )
621
590
kbMatchedStatements = get_kb_matched_statements (gkb_matches )
622
591
kbMatchedStatementConditions = {}
623
592
624
593
for kbStmt in kbMatchedStatements :
625
- stmts = [item for item in gkb_matches if item ["kbStatementId" ] == kbStmt ["kbStatementId" ]]
594
+ stmts = [
595
+ item
596
+ for item in gkb_matches
597
+ if item ["kbStatementId" ] == kbStmt ["kbStatementId" ]
598
+ ]
599
+
626
600
requirements = {}
627
601
for requirement in stmts [0 ]["requiredKbMatches" ]:
628
602
if not requirements .get (requirement , False ):
629
- if infer_possible_matches :
630
- # if true, use all possible links between kbVariantId and kbStatement
631
- reqlist = [
632
- item ["variantKey" ]
633
- for item in kbVariants
634
- if item ["kbVariantId" ] == requirement
635
- ]
636
- else :
637
- # only use explicit variant/statement links
638
- reqlist = [
639
- item ["variant" ]
640
- for item in gkb_matches
641
- if (
642
- item ["kbVariantId" ] == requirement
643
- and item ["kbStatementId" ] == kbStmt ["kbStatementId" ]
644
- )
645
- ]
603
+ # only use explicit variant/statement links
604
+ reqlist = [
605
+ {
606
+ "kbVariantId" : requirement ,
607
+ "observedVariantKey" : item ["variant" ],
608
+ }
609
+ for item in gkb_matches
610
+ if (
611
+ item ["kbVariantId" ] == requirement
612
+ and item ["kbStatementId" ] == kbStmt ["kbStatementId" ]
613
+ )
614
+ ]
646
615
requirements [requirement ] = reqlist
647
616
648
- # remove empty sets for requirements if allowing partial matches
617
+ # remove empty sets from requirements if allowing partial matches
649
618
if allow_partial_matches :
650
- requirements = {key : val for (key , val ) in requirements .items () if val }
619
+ requirements = {
620
+ key : val for (key , val ) in requirements .items () if len (val ) > 0
621
+ }
651
622
652
623
variantConditionSets = list (product (* requirements .values ()))
653
624
conditionSets = [
654
625
{"kbStatementId" : kbStmt ["kbStatementId" ], "observedVariantKeys" : item }
655
626
for item in variantConditionSets
656
627
]
657
628
for conditionSet in conditionSets :
658
- # remove Nones
659
- observedVariantKeys = [item for item in conditionSet ["observedVariantKeys" ] if item ]
660
- observedVariantKeys .sort ()
629
+ observedVariantKeys = sorted (
630
+ conditionSet ["observedVariantKeys" ],
631
+ key = lambda x : (x ["kbVariantId" ], x ["observedVariantKey" ]),
632
+ )
661
633
kbmc = KbMatchedStatementConditionSet (
662
634
{
663
635
"kbStatementId" : conditionSet ["kbStatementId" ],
664
636
"observedVariantKeys" : observedVariantKeys ,
665
637
}
666
638
)
667
- kbMatchedStatementConditions [str (kbmc )] = kbmc
668
-
639
+ key = str (
640
+ uuid .uuid5 (uuid .NAMESPACE_DNS , str (kbmc ))
641
+ ) # to make it more readable when debugging
642
+ kbMatchedStatementConditions [key ] = kbmc
669
643
return [* kbMatchedStatementConditions .values ()]
670
644
671
645
672
646
def get_kb_matches_sections (
673
647
gkb_matches : List [KbMatch ] | List [Hashabledict ],
674
648
allow_partial_matches = False ,
675
- infer_possible_matches = False ,
676
649
) -> KbMatchSections :
677
650
kb_variants = get_kb_variants (gkb_matches )
678
651
kb_matched_statements = get_kb_matched_statements (gkb_matches )
679
652
kb_statement_matched_conditions = get_kb_statement_matched_conditions (
680
- gkb_matches , allow_partial_matches , infer_possible_matches
653
+ gkb_matches , allow_partial_matches
681
654
)
682
655
return {
683
656
"kbMatchedVariants" : kb_variants ,
0 commit comments