-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy path ebrains.py
145 lines (120 loc) · 5.68 KB
/
ebrains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Copyright 2018-2021
# Institute of Neuroscience and Medicine (INM-1), Forschungszentrum Jülich GmbH
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Query data features published as Ebrains datasets with AtlasConcepts"""
from ..features.dataset import ebrains as _ebrains
from . import query
from ..commons import logger, siibra_tqdm
from ..features import anchor as _anchor
from ..retrieval import requests, datasets, cache
from ..core import parcellation, region
from collections import defaultdict
import re
from packaging.version import Version
from tempfile import NamedTemporaryFile
class EbrainsFeatureQuery(query.LiveQuery, args=[], FeatureType=_ebrains.EbrainsDataFeature):
    """Live query yielding EBRAINS datasets anchored to atlas regions.

    Loads a precomputed region-to-dataset index through a Gitlab proxy and
    yields :class:`EbrainsDataFeature` objects whose anatomical anchor
    matches the queried region. When ``COMPACT_FEATURE_LIST`` is set,
    versioned duplicates of a dataset are collapsed so that only the latest
    version is yielded, with older versions chained as its history.
    """

    # in EBRAINS knowledge graph prior to v3, versions were modelled
    # in dataset names. Typically found formats are (v1.0) and [rat, v2.1]
    VERSION_PATTERN = re.compile(r"^(.*?) *[\[\(][^v]*?(v[0-9].*?)[\]\)]")

    # If True, skip parcellation-mirror datasets and collapse versioned
    # duplicates to their most recent version.
    COMPACT_FEATURE_LIST = True

    # datasets whose name contains any of these strings will be ignored
    _BLACKLIST = {
        "Whole-brain parcellation of the Julich-Brain Cytoarchitectonic Atlas",
        "whole-brain collections of cytoarchitectonic probabilistic maps",
        "DiFuMo atlas",
        "Automated Anatomical Labeling (AAL1) atlas",
    }

    loader = requests.MultiSourcedRequest(
        requests=[
            requests.GitlabProxy(
                flavour=requests.GitlabProxyEnum.PARCELLATIONREGION_V1,
            ),
        ]
    )

    # Lazily populated, shared across instances: ids of dataset versions
    # that are already registered with parcellations (used for filtering).
    parcellation_ids = None

    def __init__(self, **kwargs):
        query.LiveQuery.__init__(self, **kwargs)
        # Build the parcellation dataset-id list once per process; it is
        # stored on the class so subsequent instances reuse it.
        if self.__class__.parcellation_ids is None:
            self.__class__.parcellation_ids = [
                dset.id
                for parc in parcellation.Parcellation.registry()
                for dset in parc.datasets
                if isinstance(dset, datasets.EbrainsV3DatasetVersion)
            ]

    def query(self, region: region.Region):
        """Yield EBRAINS dataset features matching the given region.

        Parameters
        ----------
        region : region.Region
            The region against which dataset anchors are matched.

        Yields
        ------
        _ebrains.EbrainsDataFeature
        """
        versioned_datasets = defaultdict(dict)
        invalid_species_datasets = {}
        results = self.loader.data.get("results", [])
        for r in siibra_tqdm(results, total=len(results)):
            regionname = r.get("name")
            alias = r.get("alias")
            for ds_spec in r.get("datasets", []):
                # guard against entries without a name, so blacklist and
                # version-pattern checks below never see None
                ds_name = ds_spec.get("name") or ""
                ds_id = ds_spec.get("@id")
                # skip entries lacking an id or not referring to a dataset
                if not ds_id or "dataset" not in ds_id:
                    continue
                try:
                    ds_species = _anchor.Species.decode(ds_spec)
                except ValueError:
                    logger.debug(f"Cannot decode {ds_spec}")
                    # remember for a single summarizing warning after the loop
                    invalid_species_datasets[ds_id] = ds_name
                    continue
                if self.COMPACT_FEATURE_LIST:
                    # skip datasets that merely mirror whole parcellations
                    if any(ds_id.endswith(i) for i in self.parcellation_ids):
                        continue
                    if any(e.lower() in ds_name.lower() for e in self._BLACKLIST):
                        continue
                dset = _ebrains.EbrainsDataFeature(
                    dataset_version_id=ds_id,
                    anchor=_anchor.AnatomicalAnchor(
                        region=alias or regionname,
                        species=ds_species,
                    ),
                )
                if not dset.matches(region):
                    continue
                version_match = self.VERSION_PATTERN.search(ds_name)
                if version_match is None or not self.COMPACT_FEATURE_LIST:
                    yield dset
                else:  # store version, add only the latest version after the loop
                    name, version = version_match.groups()
                    versioned_datasets[name][version] = dset

        if len(invalid_species_datasets) > 0:
            # write the ignored dataset ids to a persistent temp file so the
            # user can inspect them; delete=False keeps it after close
            with NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8") as f:
                for dsid, dsname in invalid_species_datasets.items():
                    f.write(f"{dsid} {dsname}\n")
                logger.warning(
                    f"{len(invalid_species_datasets)} datasets have been ignored, "
                    "because siibra could not decode their species. "
                    f"See {f.name}"
                )

        # if versioned datasets have been recorded, register only
        # the newest one with older ones linked as a version history.
        for name, dsets in versioned_datasets.items():
            try:  # if possible, sort by PEP 440 version tag
                sorted_versions = sorted(dsets.keys(), key=Version)
            except (TypeError, ValueError):  # else sort lexicographically
                # packaging raises InvalidVersion (a ValueError subclass)
                # for tags that are not PEP 440 compliant
                sorted_versions = sorted(dsets.keys())
            # chain the dataset versions oldest -> newest
            prev = None
            for version in sorted_versions:
                curr = dsets[version]
                curr.version = version
                if prev is not None:
                    curr._prev = prev
                    prev._next = curr
                prev = curr
            logger.debug(
                f"Registered only version {version} of {', '.join(sorted_versions)} for {name}. "
                f"Its version history is: {curr.version_history}"
            )
            # yield only the newest version; older ones reachable via _prev
            yield curr
# Pre-fetch the region->dataset index when the data-level cache warmup runs,
# so the first live query does not pay the network cost.
cache.Warmup.register_warmup_fn(cache.WarmupLevel.DATA)(lambda: EbrainsFeatureQuery.loader.data)