Skip to content

Commit 106f4c3

Browse files
Merge pull request #47 from bcgsc/bugfix/KBDEV-1266-slow-matching
Fix redundant disease matching
2 parents e6e48c5 + c96d7e1 commit 106f4c3

File tree

6 files changed

+108
-70
lines changed

6 files changed

+108
-70
lines changed

pori_python/ipr/annotate.py

+40-29
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def get_second_pass_variants(
5757

5858

5959
def get_ipr_statements_from_variants(
60-
graphkb_conn: GraphKBConnection, matches: List[Variant], disease_name: str
60+
graphkb_conn: GraphKBConnection, matches: List[Variant], disease_matches: List[str]
6161
) -> List[KbMatch]:
6262
"""IPR upload formatted GraphKB statements from the list of variants.
6363
@@ -72,7 +72,7 @@ def get_ipr_statements_from_variants(
7272
existing_statements = {s["@rid"] for s in statements}
7373

7474
for ipr_row in convert_statements_to_alterations(
75-
graphkb_conn, statements, disease_name, convert_to_rid_set(matches)
75+
graphkb_conn, statements, disease_matches, convert_to_rid_set(matches)
7676
):
7777
rows.append(ipr_row)
7878

@@ -88,7 +88,7 @@ def get_ipr_statements_from_variants(
8888
for ipr_row in convert_statements_to_alterations(
8989
graphkb_conn,
9090
inferred_statements,
91-
disease_name,
91+
disease_matches,
9292
convert_to_rid_set(inferred_matches),
9393
):
9494
ipr_row["kbData"]["inferred"] = True
@@ -99,15 +99,17 @@ def get_ipr_statements_from_variants(
9999

100100
def annotate_expression_variants(
101101
graphkb_conn: GraphKBConnection,
102+
disease_matches: List[str],
102103
variants: List[IprExprVariant],
103-
disease_name: str,
104104
show_progress: bool = False,
105105
) -> List[KbMatch]:
106106
"""Annotate expression variants with GraphKB in the IPR alterations format.
107107
108108
Args:
109-
graphkb_conn: the graphkb api connection object
110-
variants: list of variants
109+
graphkb_conn (GraphKBConnection): the graphkb api connection object
110+
disease_matches (list.str): GraphKB disease RIDs
111+
variants (list.IprExprVariant): list of variants.
112+
show_progress (bool): Progressbar displayed for long runs.
111113
112114
Returns:
113115
list of kbMatches records for IPR
@@ -127,7 +129,7 @@ def annotate_expression_variants(
127129
continue
128130
try:
129131
matches = gkb_match.match_expression_variant(graphkb_conn, gene, variant)
130-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name):
132+
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches):
131133
ipr_row["variant"] = row["key"]
132134
ipr_row["variantType"] = row.get("variantType", "exp")
133135
alterations.append(ipr_row)
@@ -150,15 +152,17 @@ def annotate_expression_variants(
150152

151153
def annotate_copy_variants(
152154
graphkb_conn: GraphKBConnection,
155+
disease_matches: List[str],
153156
variants: List[IprCopyVariant],
154-
disease_name: str,
155157
show_progress: bool = False,
156158
) -> List[KbMatch]:
157159
"""Annotate allowed copy variants with GraphKB in the IPR alterations format.
158160
159161
Args:
160-
graphkb_conn: the graphkb api connection object
161-
variants: list of variants
162+
graphkb_conn (GraphKBConnection): the graphkb api connection object
163+
disease_matches (list.str): GraphKB disease RIDs
164+
variants (list.IprCopyVariant): list of variants.
165+
show_progress (bool): Progressbar displayed for long runs.
162166
163167
Returns:
164168
list of kbMatches records for IPR
@@ -180,7 +184,7 @@ def annotate_copy_variants(
180184
continue
181185
try:
182186
matches = gkb_match.match_copy_variant(graphkb_conn, gene, variant)
183-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_name):
187+
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, matches, disease_matches):
184188
ipr_row["variant"] = row["key"]
185189
ipr_row["variantType"] = row.get("variantType", "cnv")
186190
alterations.append(ipr_row)
@@ -206,16 +210,16 @@ def annotate_copy_variants(
206210
def annotate_positional_variants(
207211
graphkb_conn: GraphKBConnection,
208212
variants: Sequence[IprStructuralVariant] | Sequence[Hashabledict],
209-
disease_name: str,
213+
disease_matches: List[str],
210214
show_progress: bool = False,
211215
) -> List[Hashabledict]:
212216
"""Annotate SNP, INDEL or fusion variant calls with GraphKB and return in IPR match format.
213217
214218
Hashable type is required to turn lists into sets.
215219
Args:
216220
graphkb_conn (GraphKBConnection): the graphkb api connection object
217-
variants (list.<dict>): list of variants. Defaults to [].
218-
disease_name (str): GraphKB disease name for statement matching. 'cancer' is most general
221+
variants (Sequence): list of variants.
222+
disease_matches (list.str): GraphKB disease RIDs
219223
show_progress (bool): Progressbar displayed for long runs.
220224
221225
Returns:
@@ -259,7 +263,9 @@ def annotate_positional_variants(
259263
raise parse_err
260264

261265
for ipr_row in get_ipr_statements_from_variants(
262-
graphkb_conn, matches, disease_name
266+
graphkb_conn,
267+
matches,
268+
disease_matches,
263269
):
264270
ipr_row["variant"] = row["key"]
265271
ipr_row["variantType"] = row.get(
@@ -304,15 +310,16 @@ def annotate_positional_variants(
304310

305311
def annotate_msi(
306312
graphkb_conn: GraphKBConnection,
307-
disease_name: str = "cancer",
313+
disease_matches: List[str],
308314
msi_category: str = "microsatellite instability",
309315
) -> List[KbMatch]:
310316
"""Annotate microsatellite instability from GraphKB in the IPR alterations format.
311317
312318
Match to GraphKb Category variants with similar names
313319
Args:
314-
graphkb_conn: the graphkb api connection object
315-
msi_category: such as 'microsatellite instability'
320+
graphkb_conn (GraphKBConnection): the graphkb api connection object
321+
disease_matches (list.str): GraphKB disease RIDs
322+
msi_category (str): such as 'microsatellite instability'
316323
317324
Returns:
318325
list of kbMatches records for IPR
@@ -335,7 +342,9 @@ def annotate_msi(
335342
)
336343
if msi_categories:
337344
msi_variants = [cast(Variant, var) for var in msi_categories]
338-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, msi_variants, disease_name):
345+
for ipr_row in get_ipr_statements_from_variants(
346+
graphkb_conn, msi_variants, disease_matches
347+
):
339348
ipr_row["variant"] = msi_category
340349
ipr_row["variantType"] = "msi"
341350
gkb_matches.append(ipr_row)
@@ -344,15 +353,15 @@ def annotate_msi(
344353

345354
def annotate_tmb(
346355
graphkb_conn: GraphKBConnection,
347-
disease_name: str = "cancer",
356+
disease_matches: List[str],
348357
category: str = TMB_HIGH_CATEGORY,
349358
) -> List[KbMatch]:
350359
"""Annotate Tumour Mutation Burden (tmb) categories from GraphKB in the IPR alterations format.
351360
352361
Match to GraphKb Category variants with similar names
353362
Args:
354-
graphkb_conn: the graphkb api connection object
355-
disease_name: oncotree disease name for graphkb matching.
363+
graphkb_conn (GraphKBConnection): the graphkb api connection object
364+
disease_matches (list.str): GraphKB disease RIDs
356365
category: such as 'high mutation burden'
357366
358367
Returns:
@@ -376,7 +385,9 @@ def annotate_tmb(
376385
)
377386
if categories:
378387
cat_variants = [cast(Variant, var) for var in categories]
379-
for ipr_row in get_ipr_statements_from_variants(graphkb_conn, cat_variants, disease_name):
388+
for ipr_row in get_ipr_statements_from_variants(
389+
graphkb_conn, cat_variants, disease_matches
390+
):
380391
ipr_row["variant"] = category
381392
ipr_row["variantType"] = "tmb"
382393
gkb_matches.append(ipr_row)
@@ -385,19 +396,19 @@ def annotate_tmb(
385396

386397
def annotate_signature_variants(
387398
graphkb_conn: GraphKBConnection,
399+
disease_matches: List[str],
388400
variants: List[IprSignatureVariant] = [],
389-
disease_name: str = "cancer",
390401
show_progress: bool = False,
391402
) -> List[KbMatch]:
392403
"""Annotate Signature variants with GraphKB in the IPR alterations format.
393404
394405
Match to corresponding GraphKB Variants, then to linked GraphKB Statements
395406
396407
Args:
397-
graphkb_conn: the graphkb api connection object
398-
variants: list of signature variants
399-
disease_name: oncotree disease name for graphkb matching
400-
show_progress: progressbar displayed for long runs; default to False
408+
graphkb_conn (GraphKBConnection): the graphkb api connection object
409+
disease_matches (list.str): GraphKB disease RIDs
410+
variants (list.IprSignatureVariant): list of signature variants
411+
show_progress (bool): progressbar displayed for long runs; default to False
401412
402413
Returns:
403414
list of kbMatches records for IPR
@@ -416,7 +427,7 @@ def annotate_signature_variants(
416427
)
417428
# Matching GKB Variants to GKB Statements
418429
for ipr_row in get_ipr_statements_from_variants(
419-
graphkb_conn, matched_variants, disease_name
430+
graphkb_conn, matched_variants, disease_matches
420431
):
421432
ipr_row["variant"] = variant["key"]
422433
ipr_row["variantType"] = "sigv"

pori_python/ipr/ipr.py

+32-10
Original file line numberDiff line numberDiff line change
@@ -116,15 +116,15 @@ def link_refs(refs) -> Tuple[str, str]:
116116
def convert_statements_to_alterations(
117117
graphkb_conn: GraphKBConnection,
118118
statements: List[Statement],
119-
disease_name: str,
119+
disease_matches: List[str],
120120
variant_matches: Iterable[str],
121121
) -> List[KbMatch]:
122122
"""Convert statements matched from graphkb into IPR equivalent representations.
123123
124124
Args:
125125
graphkb_conn: the graphkb connection object
126126
statements: list of statement records from graphkb
127-
disease_name: name of the cancer type for the patient being reported on
127+
disease_matches: GraphKB disease RIDs
128128
variant_matches: the list of RIDs the variant matched for these statements
129129
130130
Raises:
@@ -136,14 +136,6 @@ def convert_statements_to_alterations(
136136
Notes:
137137
- only report disease matched prognostic markers https://www.bcgsc.ca/jira/browse/GERO-72 and GERO-196
138138
"""
139-
disease_matches = {
140-
r["@rid"]
141-
for r in gkb_vocab.get_term_tree(graphkb_conn, disease_name, ontology_class="Disease")
142-
}
143-
144-
if not disease_matches:
145-
raise ValueError(f"failed to match disease ({disease_name}) to graphkb")
146-
147139
rows = []
148140
ev_map = get_evidencelevel_mapping(graphkb_conn)
149141
# GERO-318 - add all IPR-A evidence equivalents to the approvedTherapy flag
@@ -632,3 +624,33 @@ def get_kb_matches_sections(
632624
"kbMatchedStatements": kb_matched_statements,
633625
"kbStatementMatchedConditions": kb_statement_matched_conditions,
634626
}
627+
628+
629+
def get_kb_disease_matches(
    graphkb_conn: GraphKBConnection, kb_disease_match: str = None, verbose: bool = True
) -> set[str]:
    """Resolve a disease name to the RIDs of its GraphKB Disease term tree.

    The lookup is done once here so that callers can pass the resulting RIDs
    to every annotation step instead of re-querying GraphKB for each variant.

    Args:
        graphkb_conn: the graphkb connection object
        kb_disease_match: disease name to look up; when empty or None the
            term 'cancer' is used instead
        verbose: when True, log the fallback, the lookup and any failure

    Raises:
        ValueError: no GraphKB Disease term matched the name

    Returns:
        set of the '@rid' values of the records returned by the term tree query
    """
    if not kb_disease_match:
        kb_disease_match = 'cancer'
        if verbose:
            logger.warning("No disease provided; will use '%s'", kb_disease_match)

    if verbose:
        logger.info("Matching disease (%s) to graphkb", kb_disease_match)

    term_tree = gkb_vocab.get_term_tree(
        graphkb_conn,
        kb_disease_match,
        ontology_class="Disease",
    )
    # A set (not a list): RIDs are de-duplicated and membership tests stay O(1).
    disease_matches = {record["@rid"] for record in term_tree}

    if not disease_matches:
        msg = f"failed to match disease ({kb_disease_match}) to graphkb"
        if verbose:
            logger.error(msg)
        raise ValueError(msg)

    return disease_matches

pori_python/ipr/main.py

+17-10
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
create_key_alterations,
4949
filter_structural_variants,
5050
germline_kb_matches,
51+
get_kb_disease_matches,
5152
get_kb_matches_sections,
5253
select_expression_plots,
5354
)
@@ -338,8 +339,6 @@ def ipr_report(
338339
logger.error("Failed schema check - report variants may be corrupted or unmatched.")
339340
logger.error(f"Failed schema check: {err}")
340341

341-
kb_disease_match = content["kbDiseaseMatch"]
342-
343342
# validate the input variants
344343
signatureVariants: List[IprSignatureVariant] = preprocess_signature_variants(
345344
[
@@ -380,6 +379,14 @@ def ipr_report(
380379

381380
graphkb_conn.login(gkb_user, gkb_pass)
382381

382+
# DISEASE
383+
# Disease term from bioapps; expected OncoTree term
384+
kb_disease_match: str = content["kbDiseaseMatch"]
385+
386+
# Matching disease RIDs from GraphKB using term tree
387+
# (Will raise uncaught error if no match)
388+
disease_matches: list[str] = get_kb_disease_matches(graphkb_conn, kb_disease_match)
389+
383390
# GKB MATCHING
384391
gkb_matches: List[Hashabledict] = []
385392

@@ -405,7 +412,7 @@ def ipr_report(
405412
tmb["kbCategory"] = TMB_HIGH_CATEGORY
406413

407414
# GERO-296 - try matching to graphkb
408-
tmb_matches = annotate_tmb(graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY)
415+
tmb_matches = annotate_tmb(graphkb_conn, disease_matches, TMB_HIGH_CATEGORY)
409416
if tmb_matches:
410417
tmb_variant["kbCategory"] = TMB_HIGH_CATEGORY # type: ignore
411418
tmb_variant["variant"] = TMB_HIGH_CATEGORY
@@ -431,7 +438,7 @@ def ipr_report(
431438
msi_cat = msi.get("kbCategory")
432439
msi_variant = msi.copy()
433440
logger.info(f"Matching GKB msi {msi_cat}")
434-
msi_matches = annotate_msi(graphkb_conn, kb_disease_match, msi_cat)
441+
msi_matches = annotate_msi(graphkb_conn, disease_matches, msi_cat)
435442
if msi_matches:
436443
msi_variant["kbCategory"] = msi_cat # type: ignore
437444
msi_variant["variant"] = msi_cat
@@ -445,7 +452,7 @@ def ipr_report(
445452
logger.info(f"annotating {len(signatureVariants)} signatures")
446453
gkb_matches.extend(
447454
annotate_signature_variants(
448-
graphkb_conn, signatureVariants, kb_disease_match, show_progress=interactive
455+
graphkb_conn, disease_matches, signatureVariants, show_progress=interactive
449456
)
450457
)
451458
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
@@ -454,7 +461,7 @@ def ipr_report(
454461
logger.info(f"annotating {len(small_mutations)} small mutations")
455462
gkb_matches.extend(
456463
annotate_positional_variants(
457-
graphkb_conn, small_mutations, kb_disease_match, show_progress=interactive
464+
graphkb_conn, small_mutations, disease_matches, show_progress=interactive
458465
)
459466
)
460467
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
@@ -465,7 +472,7 @@ def ipr_report(
465472
annotate_positional_variants(
466473
graphkb_conn,
467474
structural_variants,
468-
kb_disease_match,
475+
disease_matches,
469476
show_progress=interactive,
470477
)
471478
)
@@ -477,7 +484,7 @@ def ipr_report(
477484
[
478485
Hashabledict(copy_var)
479486
for copy_var in annotate_copy_variants(
480-
graphkb_conn, copy_variants, kb_disease_match, show_progress=interactive
487+
graphkb_conn, disease_matches, copy_variants, show_progress=interactive
481488
)
482489
]
483490
)
@@ -490,8 +497,8 @@ def ipr_report(
490497
Hashabledict(exp_var)
491498
for exp_var in annotate_expression_variants(
492499
graphkb_conn,
500+
disease_matches,
493501
expression_variants,
494-
kb_disease_match,
495502
show_progress=interactive,
496503
)
497504
]
@@ -550,7 +557,7 @@ def ipr_report(
550557
graphkb_comments = auto_analyst_comments(
551558
graphkb_conn,
552559
gkb_matches,
553-
disease_name=kb_disease_match,
560+
disease_matches=set(disease_matches),
554561
variants=all_variants,
555562
)
556563
comments_list.append(graphkb_comments)

0 commit comments

Comments
 (0)