Skip to content

Commit 0efbfaa

Browse files
authored
Merge pull request #37 from bcgsc/bugfix/DEVSU-2477-filter-gkb_matches
Bugfix/devsu 2477 filter gkb matches
2 parents e4689df + 2832eb5 commit 0efbfaa

File tree

3 files changed

+149
-4
lines changed

3 files changed

+149
-4
lines changed

pori_python/ipr/ipr.py

+74
Original file line numberDiff line numberDiff line change
@@ -388,3 +388,77 @@ def germline_kb_matches(
388388
ret_list.append(alt) # alteration not in any specific keys matches to check.
389389

390390
return ret_list
391+
392+
393+
def multi_variant_filtering(
    graphkb_conn: GraphKBConnection,
    gkb_matches: List[KbMatch],
    excludedTypes: List[str] = ['wildtype'],
) -> List[KbMatch]:
    """Filter out GraphKB matches that do not satisfy every required variant of a multi-variant statement.

    DEVSU-2477
    GKB statements can be conditional on more than one variant, with an implicit 'AND' operator.
    Since variants are matched only one at a time, a multi-variant statement gets matched as soon
    as one of its conditional variants matches an observed one, making a de facto 'OR' operator
    between conditions. This function filters out these incomplete matches.

    Note: Wildtype variants are not taken into account at the moment (see excludedTypes default).

    Params:
        graphkb_conn: the graphkb connection object
        gkb_matches: KbMatch statements to be filtered
        excludedTypes: List of variant type terms whose conditions are ignored when checking that
            a statement is completely matched. Defaults to wildtype. Blank entries (and None) are
            ignored. The list is only read, never mutated.
    Returns:
        filtered list of KbMatch statements (a new list, input order preserved). Matches whose
        statement is not returned by the GraphKB query are dropped as well.
    """
    # Nothing to filter: do not query GraphKB with an empty '@rid IN []' filter.
    if not gkb_matches:
        return []

    # All matching statements & variants (GKB RIDs)
    matching_statement_rids = {match['kbStatementId'] for match in gkb_matches}
    matching_variant_rids = {match['kbVariantId'] for match in gkb_matches}

    # Get conditions detail on all matching statements
    res = graphkb_conn.post(
        uri="query",
        data={
            "target": "Statement",
            "filters": {
                "@rid": list(matching_statement_rids),
                "operator": 'IN',
            },
            "history": True,
            "returnProperties": [
                "@rid",
                "conditions.@rid",
                "conditions.@class",
                "conditions.type",
            ],
        },
    )
    statements = res['result']

    # Get set of excluded Vocabulary RIDs for variant types.
    # Blank terms (e.g. [''] coming from an empty option) mean "no exclusion" and are skipped.
    excluded = set()
    excluded_terms = [term for term in (excludedTypes or []) if term]
    if excluded_terms:
        excluded = gkb_vocab.get_terms_set(graphkb_conn, excluded_terms)

    # Mapping statements to their conditional variants
    # (discarding non-variant conditions & variant conditions from excluded types)
    statement_to_variants = {}
    for statement in statements:
        statement_to_variants[statement['@rid']] = {
            el['@rid']
            for el in statement['conditions']
            if (el['@class'] in VARIANT_CLASSES and el.get('type', '') not in excluded)
        }

    # Set of statements with complete matching: every remaining conditional variant was matched
    complete_matching_statements = {
        statementRid
        for statementRid, variantRids in statement_to_variants.items()
        if variantRids.issubset(matching_variant_rids)
    }

    # Filtering out incomplete matches of gkb_matches
    return [
        match for match in gkb_matches if match['kbStatementId'] in complete_matching_statements
    ]

pori_python/ipr/main.py

+30-2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
create_key_alterations,
3636
filter_structural_variants,
3737
germline_kb_matches,
38+
multi_variant_filtering,
3839
select_expression_plots,
3940
)
4041
from .summary import auto_analyst_comments
@@ -246,6 +247,7 @@ def ipr_report(
246247
custom_kb_match_filter=None,
247248
async_upload: bool = False,
248249
mins_to_wait: int = 5,
250+
multi_variant_filter: bool = True,
249251
) -> Dict:
250252
"""Run the matching and create the report JSON for upload to IPR.
251253
@@ -269,6 +271,7 @@ def ipr_report(
269271
custom_kb_match_filter: function(List[kbMatch]) -> List[kbMatch]
270272
async_upload: use report_async endpoint to upload reports
271273
mins_to_wait: if using report_async, number of minutes to wait for success before exception raised
274+
multi_variant_filter: filters out matches that don't match all required variants on multi-variant statements
272275
273276
Returns:
274277
ipr_conn.upload_report return dictionary
@@ -300,10 +303,11 @@ def ipr_report(
300303
small_mutations, expression_variants, copy_variants, structural_variants
301304
)
302305

303-
# Setup connections
306+
# IPR CONNECTION
304307
ipr_conn = IprConnection(username, password, ipr_url)
305308
ipr_spec = ipr_conn.get_spec()
306309

310+
# GKB CONNECTION
307311
if graphkb_url:
308312
logger.info(f"connecting to graphkb: {graphkb_url}")
309313
graphkb_conn = GraphKBConnection(graphkb_url)
@@ -315,9 +319,10 @@ def ipr_report(
315319

316320
graphkb_conn.login(gkb_user, gkb_pass)
317321

322+
# GKB MATCHING
318323
gkb_matches: List[Hashabledict] = []
319324

320-
# Signature category variants
325+
# MATCHING TMB
321326
tmb_variant: IprVariant = {} # type: ignore
322327
tmb_matches = []
323328
if "tmburMutationBurden" in content.keys():
@@ -351,6 +356,7 @@ def ipr_report(
351356
gkb_matches.extend([Hashabledict(tmb_statement) for tmb_statement in tmb_matches])
352357
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
353358

359+
# MATCHING MSI
354360
msi = content.get("msi", [])
355361
msi_matches = []
356362
msi_variant: IprVariant = {} # type: ignore
@@ -374,6 +380,7 @@ def ipr_report(
374380
gkb_matches.extend([Hashabledict(msi) for msi in msi_matches])
375381
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
376382

383+
# MATCHING SMALL MUTATIONS
377384
logger.info(f"annotating {len(small_mutations)} small mutations")
378385
gkb_matches.extend(
379386
annotate_positional_variants(
@@ -382,6 +389,7 @@ def ipr_report(
382389
)
383390
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
384391

392+
# MATCHING STRUCTURAL VARIANTS
385393
logger.info(f"annotating {len(structural_variants)} structural variants")
386394
gkb_matches.extend(
387395
annotate_positional_variants(
@@ -390,6 +398,7 @@ def ipr_report(
390398
)
391399
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
392400

401+
# MATCHING COPY VARIANTS
393402
logger.info(f"annotating {len(copy_variants)} copy variants")
394403
gkb_matches.extend(
395404
[
@@ -401,6 +410,7 @@ def ipr_report(
401410
)
402411
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
403412

413+
# MATCHING EXPRESSION VARIANTS
404414
logger.info(f"annotating {len(expression_variants)} expression variants")
405415
gkb_matches.extend(
406416
[
@@ -412,13 +422,15 @@ def ipr_report(
412422
)
413423
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
414424

425+
# ALL VARIANTS
415426
all_variants: Sequence[IprVariant]
416427
all_variants = expression_variants + copy_variants + structural_variants + small_mutations # type: ignore
417428
if msi_matches:
418429
all_variants.append(msi_variant) # type: ignore
419430
if tmb_matches:
420431
all_variants.append(tmb_variant) # type: ignore
421432

433+
# GKB_MATCHES FILTERING
422434
if match_germline:
423435
# verify germline kb statements matched germline observed variants, not somatic variants
424436
org_len = len(gkb_matches)
@@ -434,17 +446,28 @@ def ipr_report(
434446
gkb_matches = [Hashabledict(match) for match in custom_kb_match_filter(gkb_matches)]
435447
logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants")
436448

449+
if multi_variant_filter:
450+
logger.info(
451+
f"Filtering out incomplete matches on multi-variant statements for {len(gkb_matches)} matches"
452+
)
453+
gkb_matches = multi_variant_filtering(graphkb_conn, gkb_matches)
454+
logger.info(f"multi_variant_filtering left {len(gkb_matches)} matches")
455+
456+
# KEY ALTERATIONS
437457
key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants)
438458

459+
# GENE INFORMATION
439460
logger.info("fetching gene annotations")
440461
gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants))
441462

463+
# THERAPEUTIC OPTIONS
442464
if generate_therapeutics:
443465
logger.info("generating therapeutic options")
444466
targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants)
445467
else:
446468
targets = []
447469

470+
# ANALYST COMMENTS
448471
logger.info("generating analyst comments")
449472
if generate_comments:
450473
comments = {
@@ -455,6 +478,7 @@ def ipr_report(
455478
else:
456479
comments = {"comments": ""}
457480

481+
# OUTPUT CONTENT
458482
# thread safe deep-copy the original content
459483
output = json.loads(json.dumps(content))
460484
output.update(
@@ -491,6 +515,7 @@ def ipr_report(
491515
ipr_result = None
492516
upload_error = None
493517

518+
# UPLOAD TO IPR
494519
if ipr_upload:
495520
try:
496521
logger.info(f"Uploading to IPR {ipr_conn.url}")
@@ -500,11 +525,14 @@ def ipr_report(
500525
except Exception as err:
501526
upload_error = err
502527
logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True)
528+
529+
# SAVE TO JSON FILE
503530
if output_json_path:
504531
if always_write_output_json or not ipr_result:
505532
logger.info(f"Writing IPR upload json to: {output_json_path}")
506533
with open(output_json_path, "w") as fh:
507534
fh.write(json.dumps(output))
535+
508536
logger.info(f"made {graphkb_conn.request_count} requests to graphkb")
509537
logger.info(f"average load {int(graphkb_conn.load or 0)} req/s")
510538
if upload_error:

tests/test_ipr/test_ipr.py

+45-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33

44
from pori_python.graphkb import statement as gkb_statement
55
from pori_python.graphkb import vocab as gkb_vocab
6-
from pori_python.ipr.ipr import convert_statements_to_alterations, germline_kb_matches
6+
from pori_python.ipr.ipr import (
7+
convert_statements_to_alterations,
8+
germline_kb_matches,
9+
multi_variant_filtering,
10+
)
711
from pori_python.types import Statement
812

913
DISEASE_RIDS = ["#138:12", "#138:13"]
@@ -142,6 +146,24 @@
142146
},
143147
]
144148

149+
# Canned GraphKB Statement records, returned as the 'result' of the mocked
# connection's post() to exercise multi_variant_filtering().
KB_MATCHES_STATEMENTS = [
    {
        # Statement of the first somatic match: its two variant conditions are the
        # variants of the first and second somatic matches; Disease is a non-variant condition.
        "@rid": SOMATIC_KB_MATCHES[0]["kbStatementId"],
        "conditions": [
            {"@class": "PositionalVariant", "@rid": SOMATIC_KB_MATCHES[0]["kbVariantId"]},
            {"@class": "CategoryVariant", "@rid": SOMATIC_KB_MATCHES[1]["kbVariantId"]},
            {"@class": "Disease", "@rid": ""},
        ],
    },
    {
        # Statement of the second somatic match: besides its own variant it requires
        # variant '157:0' (presumably not among the matched variants - verify against
        # SOMATIC_KB_MATCHES), whose type '#999:99' is the RID the mocked
        # get_terms_set() reports as excluded.
        "@rid": SOMATIC_KB_MATCHES[1]["kbStatementId"],
        "conditions": [
            {"@class": "CategoryVariant", "@rid": SOMATIC_KB_MATCHES[1]["kbVariantId"]},
            {"@class": "PositionalVariant", "@rid": "157:0", "type": "#999:99"},
        ],
    },
]
166+
145167

146168
@pytest.fixture
147169
def graphkb_conn():
@@ -157,10 +179,15 @@ def __call__(self, *args, **kwargs):
157179
ret_val = self.return_values[self.index] if self.index < len(self.return_values) else []
158180
return ret_val
159181

182+
class PostMock:
    """Callable stand-in for the connection's post(); arguments are ignored."""

    def __call__(self, *_args, **_kwargs):
        # Fixed payload tailored for multi_variant_filtering() testing
        response = {'result': KB_MATCHES_STATEMENTS}
        return response
186+
160187
def mock_get_source(source):
161188
return {"@rid": 0}
162189

163-
conn = Mock(query=QueryMock(), cache={}, get_source=mock_get_source)
190+
conn = Mock(query=QueryMock(), cache={}, get_source=mock_get_source, post=PostMock())
164191

165192
return conn
166193

@@ -203,6 +230,14 @@ def mock_func(*pos, **kwargs):
203230
monkeypatch.setattr(gkb_vocab, "get_term_tree", mock_func)
204231

205232

233+
@pytest.fixture(autouse=True)
def get_terms_set(monkeypatch):
    """Patch gkb_vocab.get_terms_set so it always returns the single RID '#999:99'."""
    # A fresh set is built on every call, whatever the arguments are.
    monkeypatch.setattr(gkb_vocab, "get_terms_set", lambda *_pos, **_kwargs: {'#999:99'})
239+
240+
206241
@pytest.fixture(autouse=True)
207242
def mock_categorize_relevance(monkeypatch):
208243
def mock_func(_, relevance_id):
@@ -336,3 +371,11 @@ def test_germline_kb_matches(self):
336371
assert not germline_kb_matches(
337372
SOMATIC_KB_MATCHES, GERMLINE_VARIANTS
338373
), "Germline variant matched to KB somatic statement."
374+
375+
def test_multi_variant_filtering(self, graphkb_conn):
    """Check how many somatic matches survive, without and with the default excluded types."""
    # Explicitly empty exclusion list: every variant condition is required
    kept_without_exclusion = multi_variant_filtering(graphkb_conn, SOMATIC_KB_MATCHES, [])
    assert (
        len(kept_without_exclusion) == 1
    ), 'Incomplete matches filtered, without excluded types'

    # Default exclusion list: conditions of an excluded variant type are ignored
    kept_with_default = multi_variant_filtering(graphkb_conn, SOMATIC_KB_MATCHES)
    assert (
        len(kept_with_default) == 2
    ), 'Incomplete matches filtered, with default excluded types'

0 commit comments

Comments
 (0)