@@ -124,6 +124,7 @@ def read_rfmixfb(
124
124
Args:
125
125
fname: Path to RFMIX output
126
126
ancestry: The ancestry to be extracted
127
+ exclude: str = None,
127
128
128
129
Returns:
129
130
a local ancestry matrix (marker, sample) and list of sample
@@ -168,12 +169,24 @@ def read_rfmixfb(
168
169
# data lines
169
170
# Reshape to (marker, sample, ploidy, ancestry) array
170
171
# choose ancestry and sum over ploidy
172
+ if exclude is not None :
173
+ exclude_region = read_bed (exclude )
174
+
171
175
LA_matrix = [] # read into (marker by (sample x ploidy x ancestry))
172
176
for i , line in enumerate (f_handle ):
173
177
if i % 100 == 0 :
174
178
logging .debug (f"processing { i } -th marker" )
175
179
176
180
line_split = line .strip ().split ("\t " )
181
+ if exclude is not None :
182
+ isskip = np .any (
183
+ [
184
+ start <= int (line_split [1 ]) and int (line_split [1 ]) < end
185
+ for _ , start , end in exclude_region
186
+ ]
187
+ )
188
+ if isskip :
189
+ continue
177
190
LA_matrix .append (line_split [4 :])
178
191
179
192
LA_matrix = jnp .float_ (LA_matrix ).reshape (- 1 , N , 2 , n_pops )
@@ -207,7 +220,7 @@ def read_zarr(
207
220
locanc (marker, sample, ploidy, ancestry) float32 1.0 0.0 1.0 ... 0.0 1.0
208
221
209
222
Args:
210
- fname: Path to RFMIX output
223
+ fname: Path to zarr file storing local ancestry in xarray
211
224
ancestry: The ancestry to be extracted
212
225
213
226
Returns:
@@ -235,7 +248,19 @@ def read_zarr(
235
248
"""
236
249
ds = xr .open_zarr (fname )
237
250
238
- LA_matrix = jnp .array (ds .locanc .sel (ancestry = ancestry ).sum (dim = "ploidy" ))
251
+ # LA_matrix = jnp.array(ds.locanc.sel(ancestry=ancestry).sum(dim="ploidy"))
252
+ ds_LA = ds .locanc .sel (ancestry = ancestry ).sum (dim = "ploidy" )
253
+ if exclude is not None :
254
+ exclude_region = read_bed (exclude )
255
+ extract = np .logical_and .reduce (
256
+ [
257
+ ~ np .logical_and (start <= ds_LA ["marker" ], ds_LA ["marker" ] < end )
258
+ for _ , start , end in exclude_region
259
+ ]
260
+ )
261
+ ds_LA = ds_LA [extract , :]
262
+
263
+ LA_matrix = jnp .array (ds_LA )
239
264
240
265
sample_df = ds .sample .to_dataframe ().reset_index (drop = True )
241
266
@@ -247,6 +272,48 @@ def read_nc(
247
272
ancestry : str ,
248
273
exclude : str = None ,
249
274
) -> Tuple [jnp .ndarray , pd .DataFrame ]:
275
+ """Reader for xarray stored in netcdf
276
+
277
+ Read a :class:`xarray.Dataset` with data ``locanc`` in
278
+ (marker, sample, ploidy, ancestry), example::
279
+
280
+ <xarray.Dataset>
281
+ Dimensions: (marker: 8, sample: 39, ploidy: 2, ancestry: 2)
282
+ Coordinates:
283
+ * ancestry (ancestry) <U3 'HCB' 'JPT'
284
+ * marker (marker) uint32 1 6 12 20 25 31 36 43
285
+ * ploidy (ploidy) int8 0 1
286
+ * sample (sample) <U6 'HCB182' 'HCB190' 'HCB191' ... 'JPT266' 'JPT267'
287
+ Data variables:
288
+ locanc (marker, sample, ploidy, ancestry) float32 1.0 0.0 1.0 ... 0.0 1.0
289
+
290
+ Args:
291
+ fname: Path to netcdf file storing local ancestry in xarray
292
+ ancestry: The ancestry to be extracted
293
+
294
+ Returns:
295
+ a local ancestry matrix (marker, sample) and list of sample
296
+
297
+ Example
298
+ -------
299
+ >>> from hamsta import io
300
+ >>> A, A_sample = io.read_zarr("tests/testdata/example.zarr", "HCB")
301
+ >>> A[:5, :5]
302
+ DeviceArray([[2. , 2. , 2. , 1.969, 1. ],
303
+ [2. , 2. , 2. , 1.969, 1. ],
304
+ [2. , 2. , 2. , 1.969, 1. ],
305
+ [2. , 2. , 2. , 1.969, 1. ],
306
+ [2. , 2. , 2. , 1.969, 1. ]], dtype=float32)
307
+ >>> A_sample.head(5)
308
+ sample
309
+ 0 HCB182
310
+ 1 HCB190
311
+ 2 HCB191
312
+ 3 HCB193
313
+ 4 HCB194
314
+
315
+
316
+ """
250
317
251
318
ds = xr .open_dataset (fname ).load ()
252
319
0 commit comments