dmey · dmey · Nov 10, 2020 · Nov 9, 2020 · Nov 9, 2020 · Nov 9, 2020
diff --git a/DEVELOP.md b/DEVELOP.md
@@ -55,4 +55,4 @@ Create and upload a new release with the following commands
 python setup.py bdist_wheel
 pip install --upgrade twine
 python -m twine upload dist/*
-```
+```
diff --git a/docs/installation.md b/docs/installation.md
@@ -2,7 +2,7 @@
 
 ## Required dependencies
 
-- Python (3.7 or later)
+- Python (3.8 or later)
 - [numpy](http://www.numpy.org/)
 - [scipy](https://www.scipy.org/scipylib/index.html)
 - [xarray](http://xarray.pydata.org/)

diff --git a/environment-test.yml b/environment-test.yml
@@ -2,5 +2,5 @@ name: synthia-test
 channels:
   - defaults
 dependencies:
-  - python=3.7 # when changing this, also change setup.py
+  - python=3.8 # when changing this, also change setup.py
   - pytest
diff --git a/setup.py b/setup.py
@@ -21,7 +21,7 @@
         "Programming Language :: Python :: 3",
     ],
     packages=find_packages(),
-    python_requires=">=3.7", # when changing this, also change environment-test.yml
+    python_requires=">=3.8", # when changing this, also change environment-test.yml
     install_requires=[
         "numpy",
         "scipy",

diff --git a/synthia/copulas/vine.py b/synthia/copulas/vine.py
@@ -1,7 +1,7 @@
 # Synthia (https://github.com/dmey/synthia).
 # Copyright (c) 2020 D. Meyer and T. Nagler. Licensed under the MIT License.
 
-from typing import Optional
+from typing import Optional, List
 from multiprocessing import cpu_count
 import tempfile
 import os
@@ -32,25 +32,40 @@ def __init__(self, controls: Optional['pv.FitControlsVinecop']=None) -> None:
         self.controls = controls
 
     def fit(self, rank_standardized: np.ndarray) -> None:
-        """Fit a Vine copula to data.
+        """Fit a Vine copula to continuous data.
 
         Args:
-            rank_standardized (ndarray): 2D array of shape (feature, feature)
+            rank_standardized (ndarray): 2D array of shape (sample, feature)
                 with values in range [-1,1]
 
         Returns:
             None
         """
         self.model = pv.Vinecop(rank_standardized, controls=self.controls)
 
+    def fit_with_discrete(self, rank_standardized: np.ndarray, is_discrete: List[bool]) -> None:
+        """Fit a Vine copula to mixed continuous/discrete data.
+
+        Args:
+            rank_standardized (ndarray): 2D rank-standardized array, samples along the
+                first axis (presumably values in [0, 1) -- TODO confirm against caller)
+
+            is_discrete (List[bool]): One flag per feature; True maps to var type "d"
+                (discrete), False to "c" (continuous)
+        Returns:
+            None
+        """
+        var_types = ["d" if d else "c" for d in is_discrete]
+        self.model = pv.Vinecop(rank_standardized, controls=self.controls, var_types=var_types)
+
     def generate(self, n_samples: int, qrng=False, num_threads=1) -> np.ndarray:
         """Generate n_samples Vine copula entries.
 
         Args:
             n_samples (int): Number of samples to generate.
 
         Returns:
-            2D array of shape (n_samples, feature) with Vine copula entries.
+            2D array of shape (sample, feature) with Vine copula entries.
         """
         u_sim = self.model.simulate(n_samples, qrng=qrng, num_threads=num_threads)
         return u_sim

diff --git a/synthia/generators/copula.py b/synthia/generators/copula.py
@@ -5,8 +5,11 @@
 import numpy as np
 import xarray as xr
 
+from scipy.stats import rankdata
+
 from ..parameterizers.parameterizer import Parameterizer
 from ..copulas.copula import Copula
+from ..copulas.vine import VineCopula
 from ..util import to_feature_array, from_feature_array, per_feature
 
 class CopulaDataGenerator:
@@ -80,6 +83,7 @@ def _log(self, msg: str) -> None:
 
     def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
             copula: Copula,
+            is_discrete: Optional[Union[bool, Dict[int, bool], Dict[str, bool]]]=None,
             parameterize_by: Optional[Union[Parameterizer, Dict[int, Parameterizer], Dict[str, Parameterizer]]]=None):
         """Fit the marginal distributions and copula model for all features.
 
@@ -90,6 +94,8 @@ def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
 
             copula: The underlying copula to use, for example a GaussianCopula object.
 
+            is_discrete (bool or mapping, optional): Indicates whether features are discrete (True) or continuous (False).
+
             parameterize_by (Parameterizer or mapping, optional): The
                 following forms are valid:
 
@@ -106,11 +112,18 @@ def fit(self, data: Union[np.ndarray, xr.DataArray, xr.Dataset],
         self.dtype = data.dtype
         self.n_features = data.shape[1]
 
+        self.is_discrete = per_feature(is_discrete, self.data_info)
+        if any(self.is_discrete) and not isinstance(copula, VineCopula):
+            raise TypeError('Discrete samples can only be modelled in vine copulas')
+
         self._log('computing rank data')
-        rank_standardized = compute_rank_standardized(data)
+        rank_standardized = compute_rank_standardized(data, self.is_discrete)
 
         self._log('fitting copula')
-        copula.fit(rank_standardized)
+        if any(self.is_discrete):
+            copula.fit_with_discrete(rank_standardized, self.is_discrete)
+        else:
+            copula.fit(rank_standardized)
 
         self._log('parameterizing data')
         self.parameterizers = per_feature(parameterize_by, self.data_info)
@@ -173,7 +186,12 @@ def generate(self, n_samples: int,
             else:
                 feature_samples = self.parameterizers[i].generate(n_samples)
 
-            samples[:,i] = np.quantile(feature_samples, q=u[:, i], interpolation='linear')
+            if self.is_discrete[i]:
+                interp = 'nearest'
+            else:
+                interp = 'linear'
+
+            samples[:,i] = np.quantile(feature_samples, q=u[:, i], interpolation=interp)
 
             if unif_ratio_per_feature[i] != 0:
                 feature_min = self.feature_min[i].compute().item()
@@ -189,22 +207,42 @@ def generate(self, n_samples: int,
 
         return samples
 
-def compute_rank_standardized(data: xr.DataArray) -> xr.DataArray:
+def compute_rank_standardized(data: xr.DataArray, is_discrete: List[bool]) -> np.ndarray:
     """Compute per-feature percentage ranks of the data. Data is a 
     2D xarray of shape (sample, feature).
 
     Example:
        >>> # 3 samples, 2 features
        >>> data = xr.DataArray([(10,0.3), (5,0.2), (1500,0.1)])
-       >>> compute_rank_standardized(data).values
+       >>> compute_rank_standardized(data, is_discrete=[False, False])
        array([[0.5 , 0.75],
               [0.25, 0.5 ],
               [0.75, 0.25]])
     """
 
     assert data.ndim == 2, f'Input array must be 2D, given: {data.ndim}'
-    # rank() requires all data in-memory, hence compute()
-    rank = data.compute().rank(data.dims[0])
-    rank_standardized = rank / (rank.max(data.dims[0]) + 1)
-    assert rank_standardized.shape == data.shape
+    data = data.compute()
+    if not any(is_discrete):
+        rank = data.rank(data.dims[0])
+        rank = rank.values
+    else:
+        # use scipy for discrete as xarray only supports 'average' rank
+        assert len(is_discrete) == data.shape[1], f"is_discrete must have length {data.shape[1]} but is {len(is_discrete)}"
+
+        ranks = []
+        for i in range(data.shape[1]):
+            feature = data[:,i]
+            if is_discrete[i]:
+                feature_rank_max = rankdata(feature, method='max')
+                ranks.append(feature_rank_max.reshape(-1, 1))
+            else:
+                feature_rank = feature.rank(feature.dims[0]).values
+                ranks.append(feature_rank.reshape(-1, 1))
+        # axis=0: rank each discrete column separately (default flattens to 1D, breaking concatenate)
+        feature_rank_min = rankdata(data[:, is_discrete], method='min', axis=0) - 1
+        ranks.append(feature_rank_min)
+
+        rank = np.concatenate(ranks, axis=1)
+
+    rank_standardized = rank / (rank.max(axis=0) + 1)
     return rank_standardized
diff --git a/synthia/util.py b/synthia/util.py
@@ -47,6 +47,7 @@ class StackInfoVar(NamedTuple):
     name: str
     dims: Tuple[str, ...]
     shape: Tuple[int, ...]
+    dtype: np.dtype
 
 StackInfo = List[StackInfoVar]
 
@@ -68,7 +69,7 @@ def to_stacked_array(ds: xr.Dataset, var_names=None, new_dim='stacked', name=Non
             stacked = stacked.drop(list(stacked.coords.keys()))
         else:
             stacked = v.expand_dims(new_dim, axis=-1)
-        stack_info.append(StackInfoVar(var_name, v.dims, v.shape[1:]))
+        stack_info.append(StackInfoVar(var_name, v.dims, v.shape[1:], v.dtype))
         var_stacked.append(stacked)
     arr = xr.concat(var_stacked, new_dim)
     if name:
@@ -93,6 +94,7 @@ def to_unstacked_dataset(arr: np.ndarray, stack_info: StackInfo) -> xr.Dataset:
             unstacked_shape.append(dim_len)
         var_slice = arr[:, curr_i:curr_i+feature_len]
         var_unstacked = var_slice.reshape(unstacked_shape)
+        var_unstacked = var_unstacked.astype(var.dtype, copy=False)
         unstacked[var.name] = xr.DataArray(var_unstacked, dims=var.dims)
         curr_i += feature_len
     ds = xr.Dataset(unstacked)
@@ -162,7 +164,7 @@ def load_dataset(name='SAF-Synthetic') -> xr.Dataset:
     if name != 'SAF-Synthetic':
         raise RuntimeError('Only SAF-Synthetic is currerlty supported')
 
-    url = 'https://raw.githubusercontent.com/dmey/pyvinecopulib/tmp/assets/data/generator_saf_temperature_fpca_6.pkl'
+    url = 'https://raw.githubusercontent.com/dmey/synthia/data/generator_saf_temperature_fpca_6.pkl'
     generator = pickle.load(urlopen(url))
     N_SAMPLES = 25000
     ds = generator.generate(N_SAMPLES)