Skip to content

Commit 3fb345c

Browse files
committed
Filter region in rfmix read
1 parent cb2c0bd commit 3fb345c

File tree

1 file changed

+69
-2
lines changed

1 file changed

+69
-2
lines changed

src/hamsta/io.py

+69-2
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def read_rfmixfb(
124124
Args:
125125
fname: Path to RFMIX output
126126
ancestry: The ancestry to be extracted
127+
exclude: str = None,
127128
128129
Returns:
129130
a local ancestry matrix (marker, sample) and list of sample
@@ -168,12 +169,24 @@ def read_rfmixfb(
168169
# data lines
169170
# Reshape to (marker, sample, ploidy, ancestry) array
170171
# choose ancestry and sum over ploidy
172+
if exclude is not None:
173+
exclude_region = read_bed(exclude)
174+
171175
LA_matrix = [] # read into (marker by (sample x ploidy x ancestry))
172176
for i, line in enumerate(f_handle):
173177
if i % 100 == 0:
174178
logging.debug(f"processing {i}-th marker")
175179

176180
line_split = line.strip().split("\t")
181+
if exclude is not None:
182+
isskip = np.any(
183+
[
184+
start <= int(line_split[1]) and int(line_split[1]) < end
185+
for _, start, end in exclude_region
186+
]
187+
)
188+
if isskip:
189+
continue
177190
LA_matrix.append(line_split[4:])
178191

179192
LA_matrix = jnp.float_(LA_matrix).reshape(-1, N, 2, n_pops)
@@ -207,7 +220,7 @@ def read_zarr(
207220
locanc (marker, sample, ploidy, ancestry) float32 1.0 0.0 1.0 ... 0.0 1.0
208221
209222
Args:
210-
fname: Path to RFMIX output
223+
fname: Path to zarr file storing local ancestry in xarray
211224
ancestry: The ancestry to be extracted
212225
213226
Returns:
@@ -235,7 +248,19 @@ def read_zarr(
235248
"""
236249
ds = xr.open_zarr(fname)
237250

238-
LA_matrix = jnp.array(ds.locanc.sel(ancestry=ancestry).sum(dim="ploidy"))
251+
# LA_matrix = jnp.array(ds.locanc.sel(ancestry=ancestry).sum(dim="ploidy"))
252+
ds_LA = ds.locanc.sel(ancestry=ancestry).sum(dim="ploidy")
253+
if exclude is not None:
254+
exclude_region = read_bed(exclude)
255+
extract = np.logical_and.reduce(
256+
[
257+
~np.logical_and(start <= ds_LA["marker"], ds_LA["marker"] < end)
258+
for _, start, end in exclude_region
259+
]
260+
)
261+
ds_LA = ds_LA[extract, :]
262+
263+
LA_matrix = jnp.array(ds_LA)
239264

240265
sample_df = ds.sample.to_dataframe().reset_index(drop=True)
241266

@@ -247,6 +272,48 @@ def read_nc(
247272
ancestry: str,
248273
exclude: str = None,
249274
) -> Tuple[jnp.ndarray, pd.DataFrame]:
275+
"""Reader for xarray stored in netcdf
276+
277+
Read a :class:`xarray.Dataset` with data ``locanc`` in
278+
(marker, sample, ploidy, ancestry), example::
279+
280+
<xarray.Dataset>
281+
Dimensions: (marker: 8, sample: 39, ploidy: 2, ancestry: 2)
282+
Coordinates:
283+
* ancestry (ancestry) <U3 'HCB' 'JPT'
284+
* marker (marker) uint32 1 6 12 20 25 31 36 43
285+
* ploidy (ploidy) int8 0 1
286+
* sample (sample) <U6 'HCB182' 'HCB190' 'HCB191' ... 'JPT266' 'JPT267'
287+
Data variables:
288+
locanc (marker, sample, ploidy, ancestry) float32 1.0 0.0 1.0 ... 0.0 1.0
289+
290+
Args:
291+
fname: Path to netcdf file storing local ancestry in xarray
292+
ancestry: The ancestry to be extracted
293+
294+
Returns:
295+
a local ancestry matrix (marker, sample) and list of sample
296+
297+
Example
298+
-------
299+
>>> from hamsta import io
300+
>>> A, A_sample = io.read_zarr("tests/testdata/example.zarr", "HCB")
301+
>>> A[:5, :5]
302+
DeviceArray([[2. , 2. , 2. , 1.969, 1. ],
303+
[2. , 2. , 2. , 1.969, 1. ],
304+
[2. , 2. , 2. , 1.969, 1. ],
305+
[2. , 2. , 2. , 1.969, 1. ],
306+
[2. , 2. , 2. , 1.969, 1. ]], dtype=float32)
307+
>>> A_sample.head(5)
308+
sample
309+
0 HCB182
310+
1 HCB190
311+
2 HCB191
312+
3 HCB193
313+
4 HCB194
314+
315+
316+
"""
250317

251318
ds = xr.open_dataset(fname).load()
252319

0 commit comments

Comments
 (0)