Skip to content

Commit c297090

Browse files
Fix redundant disease matching
1 parent e6e48c5 commit c297090

File tree

4 files changed

+89
-54
lines changed

4 files changed

+89
-54
lines changed

pori_python/ipr/annotate.py

+40-29
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def get_second_pass_variants(
5757

5858

5959
def get_ipr_statements_from_variants(
60-
graphkb_conn: GraphKBConnection, matches: List[Variant], disease_name: str
60+
graphkb_conn: GraphKBConnection, matches: List[Variant], disease_matches: List[str]
6161
) -> List[KbMatch]:
6262
"""IPR upload formatted GraphKB statements from the list of variants.
6363
@@ -72,7 +72,7 @@ def get_ipr_statements_from_variants(
7272
existing_statements = {s["@rid"] for s in statements}
7373

7474
for ipr_row in convert_statements_to_alterations(
75-
graphkb_conn, statements, disease_name, convert_to_rid_set(matches)
75+
graphkb_conn, statements, disease_matches, convert_to_rid_set(matches)
7676
):
7777
rows.append(ipr_row)
7878

@@ -88,7 +88,7 @@ def get_ipr_statements_from_variants(
8888
for ipr_row in convert_statements_to_alterations(
8989
graphkb_conn,
9090
inferred_statements,
91-
disease_name,
91+
disease_matches,
9292
convert_to_rid_set(inferred_matches),
9393
):
9494
ipr_row["kbData"]["inferred"] = True
@@ -99,15 +99,17 @@ def get_ipr_statements_from_variants(
9999

100100
def annotate_expression_variants(
101101
graphkb_conn: GraphKBConnection,
102+
disease_matches: List[str],
102103
variants: List[IprExprVariant],
103-
disease_name: str,
104104
show_progress: bool = False,
105105
) -> List[KbMatch]:
106106
"""Annotate expression variants with GraphKB in the IPR alterations format.
107107
108108
Args:
109-
graphkb_conn: the graphkb api connection object
110-
variants: list of variants
109+
graphkb_conn (GraphKBConnection): the graphkb api connection object
110+
disease_matches (list.str): GraphKB disease RIDs
111+
variants (list.IprExprVariant): list of variants.
112+
show_progress (bool): Progressbar displayed for long runs.
111113
112114
Returns:
113115
list of kbMatches records for IPR
@@ -127,7 +129,7 @@ def annotate_expression_variants(
127129
continue
128130
try:
129131
matches = gkb_match.match_expression_variant(graphkb_conn, gene, variant)
130-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name):
132+
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches):
131133
ipr_row["variant"] = row["key"]
132134
ipr_row["variantType"] = row.get("variantType", "exp")
133135
alterations.append(ipr_row)
@@ -150,15 +152,17 @@ def annotate_expression_variants(
150152

151153
def annotate_copy_variants(
152154
graphkb_conn: GraphKBConnection,
155+
disease_matches: List[str],
153156
variants: List[IprCopyVariant],
154-
disease_name: str,
155157
show_progress: bool = False,
156158
) -> List[KbMatch]:
157159
"""Annotate allowed copy variants with GraphKB in the IPR alterations format.
158160
159161
Args:
160-
graphkb_conn: the graphkb api connection object
161-
variants: list of variants
162+
graphkb_conn (GraphKBConnection): the graphkb api connection object
163+
disease_matches (list.str): GraphKB disease RIDs
164+
variants (list.IprCopyVariant): list of variants.
165+
show_progress (bool): Progressbar displayed for long runs.
162166
163167
Returns:
164168
list of kbMatches records for IPR
@@ -180,7 +184,7 @@ def annotate_copy_variants(
180184
continue
181185
try:
182186
matches = gkb_match.match_copy_variant(graphkb_conn, gene, variant)
183-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name):
187+
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches):
184188
ipr_row["variant"] = row["key"]
185189
ipr_row["variantType"] = row.get("variantType", "cnv")
186190
alterations.append(ipr_row)
@@ -205,17 +209,17 @@ def annotate_copy_variants(
205209

206210
def annotate_positional_variants(
207211
graphkb_conn: GraphKBConnection,
212+
disease_matches: List[str],
208213
variants: Sequence[IprStructuralVariant] | Sequence[Hashabledict],
209-
disease_name: str,
210214
show_progress: bool = False,
211215
) -> List[Hashabledict]:
212216
"""Annotate SNP, INDEL or fusion variant calls with GraphKB and return in IPR match format.
213217
214218
Hashable type is required to turn lists into sets.
215219
Args:
216220
graphkb_conn (GraphKBConnection): the graphkb api connection object
217-
variants (list.<dict>): list of variants. Defaults to [].
218-
disease_name (str): GraphKB disease name for statement matching. 'cancer' is most general
221+
disease_matches (list.str): GraphKB disease RIDs
222+
variants (Sequence): list of variants.
219223
show_progress (bool): Progressbar displayed for long runs.
220224
221225
Returns:
@@ -259,7 +263,9 @@ def annotate_positional_variants(
259263
raise parse_err
260264

261265
for ipr_row in get_ipr_statements_from_variants(
262-
graphkb_conn, matches, disease_name
266+
graphkb_conn,
267+
matches,
268+
disease_matches,
263269
):
264270
ipr_row["variant"] = row["key"]
265271
ipr_row["variantType"] = row.get(
@@ -304,15 +310,16 @@ def annotate_positional_variants(
304310

305311
def annotate_msi(
306312
graphkb_conn: GraphKBConnection,
307-
disease_name: str = "cancer",
313+
disease_matches: List[str],
308314
msi_category: str = "microsatellite instability",
309315
) -> List[KbMatch]:
310316
"""Annotate microsatellite instability from GraphKB in the IPR alterations format.
311317
312318
Match to GraphKb Category variants with similar names
313319
Args:
314-
graphkb_conn: the graphkb api connection object
315-
msi_category: such as 'microsatellite instability'
320+
graphkb_conn (GraphKBConnection): the graphkb api connection object
321+
disease_matches (list.str): GraphKB disease RIDs
322+
msi_category (str): such as 'microsatellite instability'
316323
317324
Returns:
318325
list of kbMatches records for IPR
@@ -335,7 +342,9 @@ def annotate_msi(
335342
)
336343
if msi_categories:
337344
msi_variants = [cast(Variant, var) for var in msi_categories]
338-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, msi_variants, disease_name):
345+
for ipr_row in get_ipr_statements_from_variants(
346+
graphkb_conn, msi_variants, disease_matches
347+
):
339348
ipr_row["variant"] = msi_category
340349
ipr_row["variantType"] = "msi"
341350
gkb_matches.append(ipr_row)
@@ -344,15 +353,15 @@ def annotate_msi(
344353

345354
def annotate_tmb(
346355
graphkb_conn: GraphKBConnection,
347-
disease_name: str = "cancer",
356+
disease_matches: List[str],
348357
category: str = TMB_HIGH_CATEGORY,
349358
) -> List[KbMatch]:
350359
"""Annotate Tumour Mutation Burden (tmb) categories from GraphKB in the IPR alterations format.
351360
352361
Match to GraphKb Category variants with similar names
353362
Args:
354-
graphkb_conn: the graphkb api connection object
355-
disease_name: oncotree disease name for graphkb matching.
363+
graphkb_conn (GraphKBConnection): the graphkb api connection object
364+
disease_matches (list.str): GraphKB disease RIDs
356365
category: such as 'high mutation burden'
357366
358367
Returns:
@@ -376,7 +385,9 @@ def annotate_tmb(
376385
)
377386
if categories:
378387
cat_variants = [cast(Variant, var) for var in categories]
379-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, cat_variants, disease_name):
388+
for ipr_row in get_ipr_statements_from_variants(
389+
graphkb_conn, cat_variants, disease_matches
390+
):
380391
ipr_row["variant"] = category
381392
ipr_row["variantType"] = "tmb"
382393
gkb_matches.append(ipr_row)
@@ -385,19 +396,19 @@ def annotate_tmb(
385396

386397
def annotate_signature_variants(
387398
graphkb_conn: GraphKBConnection,
399+
disease_matches: List[str],
388400
variants: List[IprSignatureVariant] = [],
389-
disease_name: str = "cancer",
390401
show_progress: bool = False,
391402
) -> List[KbMatch]:
392403
"""Annotate Signature variants with GraphKB in the IPR alterations format.
393404
394405
Match to corresponding GraphKB Variants, then to linked GraphKB Statements
395406
396407
Args:
397-
graphkb_conn: the graphkb api connection object
398-
variants: list of signature variants
399-
disease_name: oncotree disease name for graphkb matching
400-
show_progress: progressbar displayed for long runs; default to False
408+
graphkb_conn (GraphKBConnection): the graphkb api connection object
409+
disease_matches (list.str): GraphKB disease RIDs
410+
variants (list.IprSignatureVariant): list of signature variants
411+
show_progress (bool): progressbar displayed for long runs; default to False
401412
402413
Returns:
403414
list of kbMatches records for IPR
@@ -416,7 +427,7 @@ def annotate_signature_variants(
416427
)
417428
# Matching GKB Variants to GKB Statements
418429
for ipr_row in get_ipr_statements_from_variants(
419-
graphkb_conn, matched_variants, disease_name
430+
graphkb_conn, matched_variants, disease_matches
420431
):
421432
ipr_row["variant"] = variant["key"]
422433
ipr_row["variantType"] = "sigv"

pori_python/ipr/ipr.py

+26-9
Original file line numberDiff line numberDiff line change
@@ -116,14 +116,15 @@ def link_refs(refs) -> Tuple[str, str]:
116116
def convert_statements_to_alterations(
117117
graphkb_conn: GraphKBConnection,
118118
statements: List[Statement],
119-
disease_name: str,
119+
disease_matches: List[str],
120120
variant_matches: Iterable[str],
121121
) -> List[KbMatch]:
122122
"""Convert statements matched from graphkb into IPR equivalent representations.
123123
124124
Args:
125125
graphkb_conn: the graphkb connection object
126126
statements: list of statement records from graphkb
127+
disease_matches: GraphKB disease RIDs
127128
disease_name: name of the cancer type for the patient being reported on
128129
variant_matches: the list of RIDs the variant matched for these statements
129130
@@ -136,14 +137,6 @@ def convert_statements_to_alterations(
136137
Notes:
137138
- only report disease matched prognostic markers https://www.bcgsc.ca/jira/browse/GERO-72 and GERO-196
138139
"""
139-
disease_matches = {
140-
r["@rid"]
141-
for r in gkb_vocab.get_term_tree(graphkb_conn, disease_name, ontology_class="Disease")
142-
}
143-
144-
if not disease_matches:
145-
raise ValueError(f"failed to match disease ({disease_name}) to graphkb")
146-
147140
rows = []
148141
ev_map = get_evidencelevel_mapping(graphkb_conn)
149142
# GERO-318 - add all IPR-A evidence equivalents to the approvedTherapy flag
@@ -632,3 +625,27 @@ def get_kb_matches_sections(
632625
"kbMatchedStatements": kb_matched_statements,
633626
"kbStatementMatchedConditions": kb_statement_matched_conditions,
634627
}
628+
629+
630+
def get_kb_disease_matches(
631+
graphkb_conn: GraphKBConnection, kb_disease_match: str = 'cancer', verbose=True
632+
) -> list[str]:
633+
if verbose:
634+
logger.info(f"Matching disease ({kb_disease_match}) to graphkb")
635+
636+
disease_matches = {
637+
r["@rid"]
638+
for r in gkb_vocab.get_term_tree(
639+
graphkb_conn,
640+
kb_disease_match,
641+
ontology_class="Disease",
642+
)
643+
}
644+
645+
if not disease_matches:
646+
msg = f"failed to match disease ({kb_disease_match}) to graphkb"
647+
if verbose:
648+
logger.error(msg)
649+
raise ValueError(msg)
650+
651+
return disease_matches

pori_python/ipr/main.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
create_key_alterations,
4949
filter_structural_variants,
5050
germline_kb_matches,
51+
get_kb_disease_matches,
5152
get_kb_matches_sections,
5253
select_expression_plots,
5354
)
@@ -338,8 +339,6 @@ def ipr_report(
338339
logger.error("Failed schema check - report variants may be corrupted or unmatched.")
339340
logger.error(f"Failed schema check: {err}")
340341

341-
kb_disease_match = content["kbDiseaseMatch"]
342-
343342
# validate the input variants
344343
signatureVariants: List[IprSignatureVariant] = preprocess_signature_variants(
345344
[
@@ -380,6 +379,19 @@ def ipr_report(
380379

381380
graphkb_conn.login(gkb_user, gkb_pass)
382381

382+
# DISEASE
383+
# Disease term from bioapps; expected OncoTree term
384+
kb_disease_match: str = content["kbDiseaseMatch"]
385+
386+
# Matching disease RIDs from GraphKB using term tree
387+
disease_matches: list[str] = []
388+
try:
389+
disease_matches = get_kb_disease_matches(graphkb_conn, kb_disease_match)
390+
except ValueError as err:
391+
# 2nd try using default disease term.
392+
# Will raise an uncaught error if no match
393+
disease_matches = get_kb_disease_matches(graphkb_conn)
394+
383395
# GKB MATCHING
384396
gkb_matches: List[Hashabledict] = []
385397

@@ -405,7 +417,7 @@ def ipr_report(
405417
tmb["kbCategory"] = TMB_HIGH_CATEGORY
406418

407419
# GERO-296 - try matching to graphkb
408-
tmb_matches = annotate_tmb(graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY)
420+
tmb_matches = annotate_tmb(graphkb_conn, disease_matches, TMB_HIGH_CATEGORY)
409421
if tmb_matches:
410422
tmb_variant["kbCategory"] = TMB_HIGH_CATEGORY # type: ignore
411423
tmb_variant["variant"] = TMB_HIGH_CATEGORY
@@ -431,7 +443,7 @@ def ipr_report(
431443
msi_cat = msi.get("kbCategory")
432444
msi_variant = msi.copy()
433445
logger.info(f"Matching GKB msi {msi_cat}")
434-
msi_matches = annotate_msi(graphkb_conn, kb_disease_match, msi_cat)
446+
msi_matches = annotate_msi(graphkb_conn, disease_matches, msi_cat)
435447
if msi_matches:
436448
msi_variant["kbCategory"] = msi_cat # type: ignore
437449
msi_variant["variant"] = msi_cat
@@ -445,7 +457,7 @@ def ipr_report(
445457
logger.info(f"annotating {len(signatureVariants)} signatures")
446458
gkb_matches.extend(
447459
annotate_signature_variants(
448-
graphkb_conn, signatureVariants, kb_disease_match, show_progress=interactive
460+
graphkb_conn, disease_matches, signatureVariants, show_progress=interactive
449461
)
450462
)
451463
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
@@ -454,7 +466,7 @@ def ipr_report(
454466
logger.info(f"annotating {len(small_mutations)} small mutations")
455467
gkb_matches.extend(
456468
annotate_positional_variants(
457-
graphkb_conn, small_mutations, kb_disease_match, show_progress=interactive
469+
graphkb_conn, disease_matches, small_mutations, show_progress=interactive
458470
)
459471
)
460472
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
@@ -464,8 +476,8 @@ def ipr_report(
464476
gkb_matches.extend(
465477
annotate_positional_variants(
466478
graphkb_conn,
479+
disease_matches,
467480
structural_variants,
468-
kb_disease_match,
469481
show_progress=interactive,
470482
)
471483
)
@@ -477,7 +489,7 @@ def ipr_report(
477489
[
478490
Hashabledict(copy_var)
479491
for copy_var in annotate_copy_variants(
480-
graphkb_conn, copy_variants, kb_disease_match, show_progress=interactive
492+
graphkb_conn, disease_matches, copy_variants, show_progress=interactive
481493
)
482494
]
483495
)
@@ -490,8 +502,8 @@ def ipr_report(
490502
Hashabledict(exp_var)
491503
for exp_var in annotate_expression_variants(
492504
graphkb_conn,
505+
disease_matches,
493506
expression_variants,
494-
kb_disease_match,
495507
show_progress=interactive,
496508
)
497509
]
@@ -550,7 +562,7 @@ def ipr_report(
550562
graphkb_comments = auto_analyst_comments(
551563
graphkb_conn,
552564
gkb_matches,
553-
disease_name=kb_disease_match,
565+
disease_matches=set(disease_matches),
554566
variants=all_variants,
555567
)
556568
comments_list.append(graphkb_comments)

0 commit comments

Comments
 (0)