Skip to content

Commit

Permalink
✨ Introducing RioXarrayReaderIterDataPipe for reading GeoTIFFs (#6)
Browse files Browse the repository at this point in the history
An iterable-style DataPipe for GeoTIFF data! Composition over inheritance~ I/O handled using rioxarray. IterDataPipe code based on https://github.com/pytorch/data/blob/v0.3.0/torchdata/datapipes/iter/load/online.py#L29-L59.

* ➕ Add torchdata

A library of common modular data loading primitives for easily constructing flexible and performant data pipelines!

* ➕ Add rioxarray

Geospatial xarray extension powered by rasterio!

Pinning minimum version to 0.10.0 which had dropped Python 3.7 support as per NEP29.

* ✨ Introducing RioXarrayReaderIterDataPipe for reading GeoTIFFs

An iterable-style DataPipe for GeoTIFF data! Composition over inheritance. Uses rioxarray for the I/O and the IterDataPipe code is based on https://github.com/pytorch/data/blob/v0.3.0/torchdata/datapipes/iter/load/online.py#L29-L59. Have added a unit test and doctest for good measure.
  • Loading branch information
weiji14 authored Jun 6, 2022
1 parent ffae80b commit 9ee7363
Show file tree
Hide file tree
Showing 8 changed files with 439 additions and 5 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ jobs:
poetry plugin add poetry-dynamic-versioning-plugin
poetry show
# Run the regular tests
# Run the unit tests and doctests
- name: Test with pytest
run: poetry run --verbose pytest
run: poetry run --verbose pytest --doctest-modules
2 changes: 2 additions & 0 deletions docs/_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ sphinx:
html_show_copyright: false
extra_extensions:
- 'sphinx.ext.autodoc'
- 'sphinx.ext.napoleon'
- 'sphinx.ext.viewcode'
4 changes: 3 additions & 1 deletion docs/api.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# API Reference

## DataPipes

```{eval-rst}
.. automodule:: zen3geo
.. automodule:: zen3geo.datapipes
:members:
```
319 changes: 318 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ classifiers = [
"Programming Language :: Python :: 3.10",
]


[tool.poetry.dependencies]
python = "^3.8"
rioxarray = ">=0.10.0"
torchdata = ">=0.3.0"

[tool.poetry.group.dev.dependencies]
black = "*"
Expand Down
2 changes: 2 additions & 0 deletions zen3geo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@

from importlib.metadata import version

from zen3geo.datapipes import RioXarrayReaderIterDataPipe as RioXarrayReader

__version__ = version("zen3geo") # e.g. 0.1.2.dev3+g0ab3cd78
83 changes: 83 additions & 0 deletions zen3geo/datapipes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Iterable-style DataPipes for geospatial raster and vector data.
Based on
https://github.com/pytorch/data/blob/v0.3.0/torchdata/datapipes/iter/load/online.py#L29-L59
"""
from typing import Any, Dict, Iterator, Optional, Tuple

import rioxarray
from torchdata.datapipes import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe
from torchdata.datapipes.utils import StreamWrapper


@functional_datapipe("read_from_rioxarray")
class RioXarrayReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
    """
    Takes raster files (e.g. GeoTIFFs) from local disk or URLs
    (as long as they can be read by rioxarray and/or rasterio)
    and yields tuples of filename and xarray.DataArray objects
    (functional name: ``read_from_rioxarray``).

    Parameters
    ----------
    source_datapipe : IterDataPipe[str]
        A DataPipe that contains filepaths or URL links to raster files such as
        GeoTIFFs.

    kwargs : Optional
        Extra keyword arguments to pass to ``rioxarray.open_rasterio`` and/or
        ``rasterio.open``. See
        https://corteva.github.io/rioxarray/stable/rioxarray.html#rioxarray-open-rasterio
        and https://rasterio.readthedocs.io/en/stable/api/rasterio.html#rasterio.open

    Yields
    ------
    stream_obj : Tuple[str, StreamWrapper]
        A tuple consisting of the filename that was passed in, and the
        ``xarray.DataArray`` object containing the raster data, wrapped in a
        ``torchdata.datapipes.utils.StreamWrapper``.

    Example
    -------
    >>> from torchdata.datapipes.iter import IterableWrapper
    >>> from zen3geo import RioXarrayReader
    ...
    >>> # Read in GeoTIFF data using DataPipe
    >>> file_url: str = "https://github.com/GenericMappingTools/gmtserver-admin/raw/master/cache/earth_day_HD.tif"
    >>> dp = IterableWrapper(iterable=[file_url])
    >>> dp_rioxarray = dp.read_from_rioxarray()
    ...
    >>> # Loop or iterate over the DataPipe stream
    >>> it = iter(dp_rioxarray)
    >>> filename, dataarray = next(it)
    >>> filename
    'https://github.com/GenericMappingTools/gmtserver-admin/raw/master/cache/earth_day_HD.tif'
    >>> dataarray
    StreamWrapper<<xarray.DataArray (band: 1, y: 960, x: 1920)>
    [1843200 values with dtype=uint8]
    Coordinates:
    * band (band) int64 1
    * x (x) float64 -179.9 -179.7 -179.5 -179.3 ... 179.5 179.7 179.9
    * y (y) float64 89.91 89.72 89.53 89.34 ... -89.53 -89.72 -89.91
    spatial_ref int64 0
    ...
    """

    # NOTE: the annotation on ``**kwargs`` describes each individual keyword
    # value (PEP 484), so it is ``Any`` rather than a dict type; the collected
    # mapping stored on the instance is the ``Dict[str, Any]``.
    def __init__(self, source_datapipe: IterDataPipe[str], **kwargs: Any) -> None:
        self.source_datapipe: IterDataPipe[str] = source_datapipe
        self.kwargs: Dict[str, Any] = kwargs

    def __iter__(self) -> Iterator[Tuple[str, StreamWrapper]]:
        """
        Open each filename from the source DataPipe with
        ``rioxarray.open_rasterio`` (forwarding the stored keyword arguments)
        and yield it as a ``(filename, StreamWrapper)`` pair.
        """
        for filename in self.source_datapipe:
            yield (
                filename,
                StreamWrapper(
                    rioxarray.open_rasterio(filename=filename, **self.kwargs)
                ),
            )

    def __len__(self) -> int:
        """
        Return the length of the source DataPipe, since exactly one item is
        yielded per input filename.
        """
        return len(self.source_datapipe)
27 changes: 27 additions & 0 deletions zen3geo/tests/test_datapipes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
Tests for datapipes.
"""
from torchdata.datapipes.iter import IterableWrapper

from zen3geo import RioXarrayReader


# %%
def test_rioxarray_reader():
    """
    Ensure that RioXarrayReader works to read in a GeoTIFF file, both when
    built via the class constructor and via the functional form.
    """
    file_url: str = "https://github.com/GenericMappingTools/gmtserver-admin/raw/master/cache/earth_day_HD.tif"

    # Using class constructors
    dp_from_class = RioXarrayReader(
        source_datapipe=IterableWrapper(iterable=[file_url])
    )
    # Using functional form (recommended)
    dp_from_functional = IterableWrapper(iterable=[file_url]).read_from_rioxarray()

    # Check both pipelines; previously the constructor-built pipeline was
    # overwritten before anything was read from it, so it was never verified.
    # Each pipeline gets its own source DataPipe so the two iterations are
    # fully independent of one another.
    for dp_rioxarray in (dp_from_class, dp_from_functional):
        it = iter(dp_rioxarray)
        filename, dataarray = next(it)

        assert isinstance(filename, str)
        assert dataarray.shape == (1, 960, 1920)
        assert dataarray.dims == ("band", "y", "x")

0 comments on commit 9ee7363

Please sign in to comment.