Skip to content

Commit 175a9c3

Browse files
committed
resolve merge conflict
2 parents 4076ffc + ad1dbfc commit 175a9c3

File tree

4 files changed

+151
-7
lines changed

4 files changed

+151
-7
lines changed

pori_python/ipr/content.spec.json

+2-3
Original file line numberDiff line numberDiff line change
@@ -99,12 +99,11 @@
9999
"items": {
100100
"properties": {
101101
"chromosomeBand": {
102-
"example": "X:p12.2",
103-
"pattern": "^(\\S+:\\S+?)?$",
102+
"example": "Xp12.2",
104103
"type": "string"
105104
},
106105
"cna": {
107-
"description": "The copy number alteration (CNA) ratio",
106+
"description": "Copy Number, Absolute (cna)",
108107
"example": 1.22,
109108
"type": [
110109
"number",

pori_python/ipr/ipr.py

+74
Original file line numberDiff line numberDiff line change
@@ -388,3 +388,77 @@ def germline_kb_matches(
388388
ret_list.append(alt) # alteration not in any specific keys matches to check.
389389

390390
return ret_list
391+
392+
393+
def multi_variant_filtering(
    graphkb_conn: GraphKBConnection,
    gkb_matches: List[KbMatch],
    excludedTypes: List[str] = ['wildtype'],
) -> List[KbMatch]:
    """Filter out GraphKB matches that don't satisfy every variant of a multi-variant statement.

    DEVSU-2477
    GKB Statements can be conditional on more than one variant, with an implicit 'AND' operator.
    Since variants are matched only one at a time, a multi-variant statement gets matched as soon
    as one of its conditional variants matches an observed one, making de facto an 'OR' operator
    between conditions. This function filters out these incomplete matches.

    Note: Wildtype variants are not taken into account at the moment (they are excluded by
    default through ``excludedTypes``).

    Params:
        graphkb_conn: the graphkb connection object
        gkb_matches: KbMatch statements to be filtered
        excludedTypes: variant type terms whose variant conditions are ignored when checking
            completeness. Defaults to wildtype. Empty strings are ignored; an empty list
            (or None) disables the exclusion. The default list is never mutated here.
    Returns:
        the matches of gkb_matches (original order preserved) whose statement has all of its
        non-excluded variant conditions among the matched variants. Matches whose statement
        is not returned by the GraphKB query are dropped as well.
    """
    # Nothing to filter; this also avoids sending an empty 'IN' filter to GraphKB.
    if not gkb_matches:
        return []

    # All matching statements & variants (GKB RIDs)
    matching_statement_rids = {match['kbStatementId'] for match in gkb_matches}
    matching_variant_rids = {match['kbVariantId'] for match in gkb_matches}

    # Get conditions detail on all matching statements
    res = graphkb_conn.post(
        uri="query",
        data={
            "target": "Statement",
            "filters": {
                "@rid": list(matching_statement_rids),
                "operator": 'IN',
            },
            "history": True,
            "returnProperties": [
                "@rid",
                "conditions.@rid",
                "conditions.@class",
                "conditions.type",
            ],
        },
    )
    statements = res['result']

    # Get set of excluded Vocabulary RIDs for variant types.
    # Blank terms are dropped wherever they appear so that they are never sent to the
    # vocabulary lookup, and a list holding only blanks behaves like an empty list.
    excluded_terms = [term for term in (excludedTypes or []) if term]
    excluded = gkb_vocab.get_terms_set(graphkb_conn, excluded_terms) if excluded_terms else set()

    # Mapping statements to their conditional variants
    # (discarding non-variant conditions & variant conditions from excluded types)
    statement_to_variants = {
        statement['@rid']: {
            condition['@rid']
            for condition in statement['conditions']
            if condition['@class'] in VARIANT_CLASSES
            and condition.get('type', '') not in excluded
        }
        for statement in statements
    }

    # Set of statements with complete matching: every required variant has been matched
    complete_matching_statements = {
        statement_rid
        for statement_rid, variant_rids in statement_to_variants.items()
        if variant_rids.issubset(matching_variant_rids)
    }

    # Filtering out incomplete matches of gkb_matches
    return [
        match for match in gkb_matches if match['kbStatementId'] in complete_matching_statements
    ]

pori_python/ipr/main.py

+30-2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
create_key_alterations,
3636
filter_structural_variants,
3737
germline_kb_matches,
38+
multi_variant_filtering,
3839
select_expression_plots,
3940
)
4041
from .summary import auto_analyst_comments, get_ipr_analyst_comments
@@ -251,6 +252,7 @@ def ipr_report(
251252
include_nonspecific_disease: bool = False,
252253
include_nonspecific_project: bool = False,
253254
include_nonspecific_template: bool = False,
255+
multi_variant_filter: bool = True,
254256
) -> Dict:
255257
"""Run the matching and create the report JSON for upload to IPR.
256258
@@ -278,6 +280,7 @@ def ipr_report(
278280
include_nonspecific_disease: if include_ipr_variant_text is True, if no disease match is found use disease-nonspecific variant comment
279281
include_nonspecific_project: if include_ipr_variant_text is True, if no project match is found use project-nonspecific variant comment
280282
include_nonspecific_template: if include_ipr_variant_text is True, if no template match is found use template-nonspecific variant comment
283+
multi_variant_filter: filters out matches that don't match all required variants on multi-variant statements
281284
282285
Returns:
283286
ipr_conn.upload_report return dictionary
@@ -309,10 +312,11 @@ def ipr_report(
309312
small_mutations, expression_variants, copy_variants, structural_variants
310313
)
311314

312-
# Setup connections
315+
# IPR CONNECTION
313316
ipr_conn = IprConnection(username, password, ipr_url)
314317
ipr_spec = ipr_conn.get_spec()
315318

319+
# GKB CONNECTION
316320
if graphkb_url:
317321
logger.info(f"connecting to graphkb: {graphkb_url}")
318322
graphkb_conn = GraphKBConnection(graphkb_url)
@@ -324,9 +328,10 @@ def ipr_report(
324328

325329
graphkb_conn.login(gkb_user, gkb_pass)
326330

331+
# GKB MATCHING
327332
gkb_matches: List[Hashabledict] = []
328333

329-
# Signature category variants
334+
# MATCHING TMB
330335
tmb_variant: IprVariant = {} # type: ignore
331336
tmb_matches = []
332337
if "tmburMutationBurden" in content.keys():
@@ -360,6 +365,7 @@ def ipr_report(
360365
gkb_matches.extend([Hashabledict(tmb_statement) for tmb_statement in tmb_matches])
361366
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
362367

368+
# MATCHING MSI
363369
msi = content.get("msi", [])
364370
msi_matches = []
365371
msi_variant: IprVariant = {} # type: ignore
@@ -383,6 +389,7 @@ def ipr_report(
383389
gkb_matches.extend([Hashabledict(msi) for msi in msi_matches])
384390
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
385391

392+
# MATCHING SMALL MUTATIONS
386393
logger.info(f"annotating {len(small_mutations)} small mutations")
387394
gkb_matches.extend(
388395
annotate_positional_variants(
@@ -391,6 +398,7 @@ def ipr_report(
391398
)
392399
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
393400

401+
# MATCHING STRUCTURAL VARIANTS
394402
logger.info(f"annotating {len(structural_variants)} structural variants")
395403
gkb_matches.extend(
396404
annotate_positional_variants(
@@ -399,6 +407,7 @@ def ipr_report(
399407
)
400408
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
401409

410+
# MATCHING COPY VARIANTS
402411
logger.info(f"annotating {len(copy_variants)} copy variants")
403412
gkb_matches.extend(
404413
[
@@ -410,6 +419,7 @@ def ipr_report(
410419
)
411420
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
412421

422+
# MATCHING EXPRESSION VARIANTS
413423
logger.info(f"annotating {len(expression_variants)} expression variants")
414424
gkb_matches.extend(
415425
[
@@ -421,13 +431,15 @@ def ipr_report(
421431
)
422432
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")
423433

434+
# ALL VARIANTS
424435
all_variants: Sequence[IprVariant]
425436
all_variants = expression_variants + copy_variants + structural_variants + small_mutations # type: ignore
426437
if msi_matches:
427438
all_variants.append(msi_variant) # type: ignore
428439
if tmb_matches:
429440
all_variants.append(tmb_variant) # type: ignore
430441

442+
# GKB_MATCHES FILTERING
431443
if match_germline:
432444
# verify germline kb statements matched germline observed variants, not somatic variants
433445
org_len = len(gkb_matches)
@@ -443,17 +455,28 @@ def ipr_report(
443455
gkb_matches = [Hashabledict(match) for match in custom_kb_match_filter(gkb_matches)]
444456
logger.info(f"\t custom_kb_match_filter left {len(gkb_matches)} variants")
445457

458+
if multi_variant_filter:
459+
logger.info(
460+
f"Filtering out incomplete matches on multi-variant statements for {len(gkb_matches)} matches"
461+
)
462+
gkb_matches = multi_variant_filtering(graphkb_conn, gkb_matches)
463+
logger.info(f"multi_variant_filtering left {len(gkb_matches)} matches")
464+
465+
# KEY ALTERATIONS
446466
key_alterations, variant_counts = create_key_alterations(gkb_matches, all_variants)
447467

468+
# GENE INFORMATION
448469
logger.info("fetching gene annotations")
449470
gene_information = get_gene_information(graphkb_conn, sorted(genes_with_variants))
450471

472+
# THERAPEUTIC OPTIONS
451473
if generate_therapeutics:
452474
logger.info("generating therapeutic options")
453475
targets = create_therapeutic_options(graphkb_conn, gkb_matches, all_variants)
454476
else:
455477
targets = []
456478

479+
# ANALYST COMMENTS
457480
logger.info("generating analyst comments")
458481

459482
comments_list = []
@@ -477,6 +500,7 @@ def ipr_report(
477500
comments_list.append(ipr_comments)
478501
comments = "\n".join(comments_list)
479502

503+
# OUTPUT CONTENT
480504
# thread safe deep-copy the original content
481505
output = json.loads(json.dumps(content))
482506
output.update(
@@ -513,6 +537,7 @@ def ipr_report(
513537
ipr_result = None
514538
upload_error = None
515539

540+
# UPLOAD TO IPR
516541
if ipr_upload:
517542
try:
518543
logger.info(f"Uploading to IPR {ipr_conn.url}")
@@ -522,11 +547,14 @@ def ipr_report(
522547
except Exception as err:
523548
upload_error = err
524549
logger.error(f"ipr_conn.upload_report failed: {err}", exc_info=True)
550+
551+
# SAVE TO JSON FILE
525552
if output_json_path:
526553
if always_write_output_json or not ipr_result:
527554
logger.info(f"Writing IPR upload json to: {output_json_path}")
528555
with open(output_json_path, "w") as fh:
529556
fh.write(json.dumps(output))
557+
530558
logger.info(f"made {graphkb_conn.request_count} requests to graphkb")
531559
logger.info(f"average load {int(graphkb_conn.load or 0)} req/s")
532560
if upload_error:

tests/test_ipr/test_ipr.py

+45-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33

44
from pori_python.graphkb import statement as gkb_statement
55
from pori_python.graphkb import vocab as gkb_vocab
6-
from pori_python.ipr.ipr import convert_statements_to_alterations, germline_kb_matches
6+
from pori_python.ipr.ipr import (
7+
convert_statements_to_alterations,
8+
germline_kb_matches,
9+
multi_variant_filtering,
10+
)
711
from pori_python.types import Statement
812

913
DISEASE_RIDS = ["#138:12", "#138:13"]
@@ -142,6 +146,24 @@
142146
},
143147
]
144148

149+
# Statement records returned by the mocked GraphKB ``post`` call; used as the query result
# when testing multi_variant_filtering().
KB_MATCHES_STATEMENTS = [
    {
        # Statement of the first somatic match: both variant conditions are variants that
        # appear in SOMATIC_KB_MATCHES; the Disease condition is not a variant condition.
        '@rid': SOMATIC_KB_MATCHES[0]['kbStatementId'],
        'conditions': [
            {'@class': 'PositionalVariant', '@rid': SOMATIC_KB_MATCHES[0]['kbVariantId']},
            {'@class': 'CategoryVariant', '@rid': SOMATIC_KB_MATCHES[1]['kbVariantId']},
            {'@class': 'Disease', '@rid': ''},
        ],
    },
    {
        # Statement of the second somatic match: its second variant condition ('157:0') is
        # typed '#999:99', the RID returned by the mocked gkb_vocab.get_terms_set.
        # NOTE(review): presumably '157:0' is not among the matched variant RIDs - confirm
        # against SOMATIC_KB_MATCHES.
        '@rid': SOMATIC_KB_MATCHES[1]['kbStatementId'],
        'conditions': [
            {'@class': 'CategoryVariant', '@rid': SOMATIC_KB_MATCHES[1]['kbVariantId']},
            {'@class': 'PositionalVariant', '@rid': '157:0', 'type': '#999:99'},
        ],
    },
]
166+
145167

146168
@pytest.fixture
147169
def graphkb_conn():
@@ -157,10 +179,15 @@ def __call__(self, *args, **kwargs):
157179
ret_val = self.return_values[self.index] if self.index < len(self.return_values) else []
158180
return ret_val
159181

182+
class PostMock:
183+
def __call__(self, *args, **kwargs):
184+
# custom return tailored for multi_variant_filtering() testing
185+
return {'result': KB_MATCHES_STATEMENTS}
186+
160187
def mock_get_source(source):
161188
return {"@rid": 0}
162189

163-
conn = Mock(query=QueryMock(), cache={}, get_source=mock_get_source)
190+
conn = Mock(query=QueryMock(), cache={}, get_source=mock_get_source, post=PostMock())
164191

165192
return conn
166193

@@ -203,6 +230,14 @@ def mock_func(*pos, **kwargs):
203230
monkeypatch.setattr(gkb_vocab, "get_term_tree", mock_func)
204231

205232

233+
@pytest.fixture(autouse=True)
def get_terms_set(monkeypatch):
    """Patch ``gkb_vocab.get_terms_set`` to return ``{'#999:99'}`` whatever the arguments."""
    monkeypatch.setattr(gkb_vocab, "get_terms_set", lambda *pos, **kwargs: {'#999:99'})
239+
240+
206241
@pytest.fixture(autouse=True)
207242
def mock_categorize_relevance(monkeypatch):
208243
def mock_func(_, relevance_id):
@@ -336,3 +371,11 @@ def test_germline_kb_matches(self):
336371
assert not germline_kb_matches(
337372
SOMATIC_KB_MATCHES, GERMLINE_VARIANTS
338373
), "Germline variant matched to KB somatic statement."
374+
375+
def test_multi_variant_filtering(self, graphkb_conn):
    """Check how many matches survive with and without excluded variant types."""
    # Explicitly empty exclusion list: every variant condition is required
    kept_without_exclusions = multi_variant_filtering(graphkb_conn, SOMATIC_KB_MATCHES, [])
    assert (
        len(kept_without_exclusions) == 1
    ), 'Incomplete matches filtered, without excluded types'

    # Default exclusion list: conditions typed with an excluded term are not required
    kept_with_default_exclusions = multi_variant_filtering(graphkb_conn, SOMATIC_KB_MATCHES)
    assert (
        len(kept_with_default_exclusions) == 2
    ), 'Incomplete matches filtered, with default excluded types'

0 commit comments

Comments
 (0)