Skip to content

Commit e6e48c5

Browse files
Merge pull request #42 from bcgsc/feature/KBDEV-1227-signature-support
Feature/kbdev 1227 signature support
2 parents 0035155 + 210347b commit e6e48c5

11 files changed

+328
-32
lines changed

pori_python/graphkb/match.py

+31-19
Original file line numberDiff line numberDiff line change
@@ -129,50 +129,62 @@ def cache_missing_features(conn: GraphKBConnection) -> None:
129129

130130
def match_category_variant(
131131
conn: GraphKBConnection,
132-
gene_name: str,
132+
reference_name: str,
133133
category: str,
134134
root_exclude_term: str = "",
135135
gene_source: str = "",
136136
gene_is_source_id: bool = False,
137137
ignore_cache: bool = False,
138+
reference_class: str = 'Feature',
138139
) -> List[Variant]:
139140
"""
140141
Returns a list of variants matching the input variant
141142
142143
Args:
143144
conn (GraphKBConnection): the graphkb connection object
144-
gene_name (str): the name of the gene the variant is in reference to
145+
reference_name (str): the name of the Feature(gene)/Signature the variant is in reference to
145146
category (str): the variant category (ex. copy loss)
146147
gene_source: The source database the gene is defined by (ex. ensembl)
147148
gene_is_source_id: Indicates the gene name(s) input should be treated as sourceIds not names
149+
reference_class (str): Class name of the variant reference. Default to 'Feature'
148150
Raises:
149151
FeatureNotFoundError: The gene could not be found in GraphKB
150152
151153
Returns:
152154
Array.<dict>: List of variant records from GraphKB which match the input
153155
"""
154-
# disambiguate the gene to find all equivalent representations
155-
features = convert_to_rid_list(
156-
get_equivalent_features(
157-
conn,
158-
gene_name,
159-
source=gene_source,
160-
is_source_id=gene_is_source_id,
161-
ignore_cache=ignore_cache,
156+
# disambiguate the reference to find all equivalent representations
157+
references: List[str] = []
158+
if reference_class == 'Feature':
159+
references = convert_to_rid_list(
160+
get_equivalent_features(
161+
conn,
162+
reference_name,
163+
source=gene_source,
164+
is_source_id=gene_is_source_id,
165+
ignore_cache=ignore_cache,
166+
)
162167
)
163-
)
164-
165-
if not features:
166-
raise FeatureNotFoundError(
167-
f"unable to find the gene ({gene_name}) or any equivalent representations"
168+
if not references:
169+
raise FeatureNotFoundError(
170+
f"unable to find the gene ({reference_name}) or any equivalent representations"
171+
)
172+
if reference_class == 'Signature':
173+
references = convert_to_rid_list(
174+
get_equivalent_terms(
175+
conn,
176+
reference_name.lower(),
177+
ontology_class='Signature',
178+
ignore_cache=ignore_cache,
179+
)
168180
)
169181

170182
# get the list of terms that we should match
171-
terms = convert_to_rid_list(
183+
types = convert_to_rid_list(
172184
get_term_tree(conn, category, root_exclude_term, ignore_cache=ignore_cache)
173185
)
174186

175-
if not terms:
187+
if not types:
176188
raise ValueError(f"unable to find the term/category ({category}) or any equivalent")
177189

178190
# find the variant list
@@ -183,8 +195,8 @@ def match_category_variant(
183195
"target": {
184196
"target": "CategoryVariant",
185197
"filters": [
186-
{"reference1": features, "operator": "IN"},
187-
{"type": terms, "operator": "IN"},
198+
{"reference1": references, "operator": "IN"},
199+
{"type": types, "operator": "IN"},
188200
],
189201
},
190202
"queryType": "similarTo",

pori_python/ipr/annotate.py

+53
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Hashabledict,
2020
IprCopyVariant,
2121
IprExprVariant,
22+
IprSignatureVariant,
2223
IprStructuralVariant,
2324
KbMatch,
2425
Statement,
@@ -380,3 +381,55 @@ def annotate_tmb(
380381
ipr_row["variantType"] = "tmb"
381382
gkb_matches.append(ipr_row)
382383
return gkb_matches
384+
385+
386+
def annotate_signature_variants(
    graphkb_conn: GraphKBConnection,
    variants: List[IprSignatureVariant] = [],
    disease_name: str = "cancer",
    show_progress: bool = False,
) -> List[KbMatch]:
    """Annotate Signature variants with GraphKB in the IPR alterations format.

    Match to corresponding GraphKB Variants, then to linked GraphKB Statements

    Args:
        graphkb_conn: the graphkb api connection object
        variants: list of signature variants; each is read for its
            "signatureName", "variantTypeName" and "key" entries
        disease_name: oncotree disease name for graphkb matching
        show_progress: progressbar displayed for long runs; default to False

    Returns:
        list of kbMatches records for IPR, de-duplicated, in first-matched order
    """
    # NOTE: the shared default list for `variants` is only read, never mutated here.
    alterations: List[Hashabledict] = []

    iterfunc = tqdm if show_progress else iter
    for variant in iterfunc(variants):
        try:
            # Matching signature variant to GKB Variants
            matched_variants: List[Variant] = gkb_match.match_category_variant(
                graphkb_conn,
                variant["signatureName"],
                variant["variantTypeName"],
                reference_class="Signature",
            )
            # Matching GKB Variants to GKB Statements
            for ipr_row in get_ipr_statements_from_variants(
                graphkb_conn, matched_variants, disease_name
            ):
                # link the statement row back to the input variant it came from
                ipr_row["variant"] = variant["key"]
                ipr_row["variantType"] = "sigv"
                alterations.append(Hashabledict(ipr_row))

        except ValueError as err:
            # best effort: a variant that cannot be matched is logged and skipped
            logger.error(f"failed to match signature category variant '{variant}': {err}")

    # drop duplicates; dict keys keep first-seen order so the output is
    # reproducible between runs (set iteration order is arbitrary)
    alterations = list(dict.fromkeys(alterations))

    logger.info(
        f"matched {len(variants)} signature category variants to {len(alterations)} graphkb annotations"
    )

    return alterations

pori_python/ipr/constants.py

+3
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,8 @@
55
# all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial']
FAILED_REVIEW_STATUS = "failed"

# Signatures: category variant type names used for COSMIC and HLA signature variants
COSMIC_SIGNATURE_VARIANT_TYPE = "high signature"
HLA_SIGNATURE_VARIANT_TYPE = "signature present"

# Mutation burden threshold and its category name
TMB_HIGH = 10.0  # genomic mutations per mb - https://www.bcgsc.ca/jira/browse/GERO-296
TMB_HIGH_CATEGORY = "high mutation burden"

pori_python/ipr/content.spec.json

+10
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,16 @@
212212
},
213213
"type": "array"
214214
},
215+
"cosmicSignatures": {
216+
"default": [],
217+
"description": "List of observed (above threshold) COSMIC signatures (DBS, SBS and ID) & dMMR",
218+
"items": {
219+
"description": "Signature name",
220+
"example": "DBS1",
221+
"type": "string"
222+
},
223+
"type": "array"
224+
},
215225
"expressionVariants": {
216226
"default": [],
217227
"items": {

pori_python/ipr/inputs.py

+71-1
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@
1616
IprCopyVariant,
1717
IprExprVariant,
1818
IprFusionVariant,
19+
IprSignatureVariant,
1920
IprSmallMutationVariant,
2021
IprVariant,
2122
)
2223

23-
from .constants import DEFAULT_URL
24+
from .constants import COSMIC_SIGNATURE_VARIANT_TYPE, DEFAULT_URL, HLA_SIGNATURE_VARIANT_TYPE
2425
from .util import hash_key, logger, pandas_falsy
2526

2627
protein_letters_3to1.setdefault("Ter", "*")
@@ -152,6 +153,12 @@
152153
"mavis_product_id",
153154
]
154155

156+
# Signature variant ("sigv") input columns
SIGV_REQ = ["signatureName", "variantTypeName"]  # required on every signature row
SIGV_COSMIC = ["signature"]  # 1st element used as signatureName key
SIGV_HLA = ["a1", "a2", "b1", "b2", "c1", "c2"]  # row keys read as HLA allele values
SIGV_OPTIONAL = ["displayName"]
SIGV_KEY = SIGV_REQ[:]  # independent copy: columns that make up the row key
161+
155162

156163
def validate_variant_rows(
157164
rows: Iterable[Dict], required: List[str], optional: List[str], row_to_key: Callable
@@ -388,6 +395,69 @@ def row_key(row: Dict) -> Tuple[str, ...]:
388395
return result
389396

390397

398+
def preprocess_signature_variants(rows: Iterable[Dict]) -> List[IprSignatureVariant]:
    """
    Validate the input rows contain the minimum required fields and
    generate any default values where possible

    Args:
        rows: signature variant records, validated against SIGV_REQ (required)
            and SIGV_OPTIONAL (optional) columns

    Returns:
        the validated rows, each with "variant" set to its "displayName"
        and "variantType" set to "sigv"
    """

    def row_key(row: Dict) -> Tuple[str, ...]:
        # key = "sigv" followed by the row's values for the SIGV_KEY columns
        return tuple(["sigv"] + [row[key] for key in SIGV_KEY])

    variants = validate_variant_rows(rows, SIGV_REQ, SIGV_OPTIONAL, row_key)
    result = [cast(IprSignatureVariant, var) for var in variants]

    # Adding additional required properties
    for row in result:
        row["variant"] = row["displayName"]
        row["variantType"] = "sigv"

    return result
416+
417+
418+
def preprocess_cosmic(rows: Iterable[Dict]) -> Iterable[Dict]:
    """
    Process cosmic inputs into preformatted signature inputs
    Note: Cosmic and dMMR already evaluated against thresholds in gsc_report

    Args:
        rows: input records; a record missing any SIGV_COSMIC column is ignored

    Returns:
        one signature input record per distinct signature name, in the order
        the names were first seen
    """
    required = set(SIGV_COSMIC)  # loop-invariant, built once
    name_key = SIGV_COSMIC[0]

    # dict keys de-duplicate while keeping first-seen order, so the result is
    # reproducible between runs (set iteration order is arbitrary)
    cosmic: Dict = {}
    for row in rows:
        if not required.issubset(row.keys()):
            continue
        cosmic[row[name_key]] = None

    return [
        {
            "displayName": f"{signature} {COSMIC_SIGNATURE_VARIANT_TYPE}",
            "signatureName": signature,
            "variantTypeName": COSMIC_SIGNATURE_VARIANT_TYPE,
        }
        for signature in cosmic
    ]
437+
438+
439+
def preprocess_hla(rows: Iterable[Dict]) -> Iterable[Dict]:
    """
    Process hla inputs into preformatted signature inputs

    Args:
        rows: input records; only the keys listed in SIGV_HLA are read

    Returns:
        one signature input record per distinct HLA name (both the full value
        and the part before the first ':'), in the order first seen
    """
    # dict keys de-duplicate while keeping first-seen order, so the result is
    # reproducible between runs (set iteration order is arbitrary)
    hla: Dict[str, None] = {}
    for row in rows:  # 1 row per sample; should be 3
        for k, v in row.items():
            if k not in SIGV_HLA:
                continue
            # a missing/blank allele call cannot be split and would otherwise
            # yield a meaningless bare "HLA-" signature, so skip it
            if not isinstance(v, str) or not v.strip():
                continue
            hla[f"HLA-{v}"] = None  # 2nd level, e.g. 'HLA-A*02:01'
            hla[f"HLA-{v.split(':')[0]}"] = None  # 1st level, e.g. 'HLA-A*02'

    return [
        {
            "displayName": f"{signature} {HLA_SIGNATURE_VARIANT_TYPE}",
            "signatureName": signature,
            "variantTypeName": HLA_SIGNATURE_VARIANT_TYPE,
        }
        for signature in hla
    ]
459+
460+
391461
def check_variant_links(
392462
small_mutations: List[IprSmallMutationVariant],
393463
expression_variants: List[IprExprVariant],

0 commit comments

Comments
 (0)