Expose pd.DataFrame.to_csv and json.dump keyword arguments (#421)

andersy005 · web-flow · commit 18d9d57f2368 · 2021-12-17T13:14:43.000-07:00
diff --git a/intake_esm/cat.py b/intake_esm/cat.py
@@ -115,7 +115,15 @@ def from_dict(cls, data: typing.Dict) -> 'ESMCatalogModel':
         cat._df = df
         return cat
 
-    def save(self, name: str, *, directory: str = None, catalog_type: str = 'dict') -> None:
+    def save(
+        self,
+        name: str,
+        *,
+        directory: str = None,
+        catalog_type: str = 'dict',
+        to_csv_kwargs: dict = None,
+        json_dump_kwargs: dict = None,
+    ) -> None:
         """
         Save the catalog to a file.
 
@@ -128,6 +136,10 @@ def save(self, name: str, *, directory: str = None, catalog_type: str = 'dict')
         catalog_type: str
             The type of catalog to save. Whether to save the catalog table as a dictionary
             in the JSON file or as a separate CSV file. Valid options are 'dict' and 'file'.
+        to_csv_kwargs : dict, optional
+            Additional keyword arguments passed through to the :py:meth:`~pandas.DataFrame.to_csv` method.
+        json_dump_kwargs : dict, optional
+            Additional keyword arguments passed through to the :py:func:`~json.dump` function.
 
         Notes
         -----
@@ -140,7 +152,7 @@ def save(self, name: str, *, directory: str = None, catalog_type: str = 'dict')
             raise ValueError(
                 f'catalog_type must be either "dict" or "file". Received catalog_type={catalog_type}'
             )
-        csv_file_name = pathlib.Path(f'{name}.csv.gz')
+        csv_file_name = pathlib.Path(f'{name}.csv')
         json_file_name = pathlib.Path(f'{name}.json')
         if directory:
             directory = pathlib.Path(directory)
@@ -154,13 +166,20 @@ def save(self, name: str, *, directory: str = None, catalog_type: str = 'dict')
         data['id'] = name
 
         if catalog_type == 'file':
+            csv_kwargs = {'index': False}
+            csv_kwargs.update(to_csv_kwargs or {})
+            compression = csv_kwargs.get('compression')
+            extensions = {'gzip': '.gz', 'bz2': '.bz2', 'zip': '.zip', 'xz': '.xz', None: ''}
+            csv_file_name = f'{csv_file_name}{extensions[compression]}'
             data['catalog_file'] = str(csv_file_name)
-            self.df.to_csv(csv_file_name, compression='gzip', index=False)
+            self.df.to_csv(csv_file_name, **csv_kwargs)
         else:
             data['catalog_dict'] = self.df.to_dict(orient='records')
 
         with open(json_file_name, 'w') as outfile:
-            json.dump(data, outfile, indent=2)
+            json_kwargs = {'indent': 2}
+            json_kwargs.update(json_dump_kwargs or {})
+            json.dump(data, outfile, **json_kwargs)
 
         print(f'Successfully wrote ESM collection json file to: {json_file_name}')
 
diff --git a/intake_esm/core.py b/intake_esm/core.py
@@ -365,6 +365,8 @@ def serialize(
         name: pydantic.StrictStr,
         directory: typing.Union[pydantic.DirectoryPath, pydantic.StrictStr] = None,
         catalog_type: str = 'dict',
+        to_csv_kwargs: typing.Dict[typing.Any, typing.Any] = None,
+        json_dump_kwargs: typing.Dict[typing.Any, typing.Any] = None,
     ) -> None:
         """Serialize collection/catalog to corresponding json and csv files.
 
@@ -376,6 +378,10 @@ def serialize(
             The path to the local directory. If None, use the current directory
         catalog_type: str, default 'dict'
             Whether to save the catalog table as a dictionary in the JSON file or as a separate CSV file.
+        to_csv_kwargs : dict, optional
+            Additional keyword arguments passed through to the :py:meth:`~pandas.DataFrame.to_csv` method.
+        json_dump_kwargs : dict, optional
+            Additional keyword arguments passed through to the :py:func:`~json.dump` function.
 
         Notes
         -----
@@ -395,7 +401,13 @@ def serialize(
         >>> col_subset.serialize(name="cmip6_bcc_esm1", catalog_type="file")
         """
 
-        self.esmcat.save(name, directory=directory, catalog_type=catalog_type)
+        self.esmcat.save(
+            name,
+            directory=directory,
+            catalog_type=catalog_type,
+            to_csv_kwargs=to_csv_kwargs,
+            json_dump_kwargs=json_dump_kwargs,
+        )
 
     def nunique(self) -> pd.Series:
         """Count distinct observations across dataframe columns
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -147,15 +147,24 @@ def test_catalog_getitem_error():
         cat['foo']
 
 
-@pytest.mark.parametrize('catalog_type', ['file', 'dict'])
-def test_catalog_serialize(tmp_path, catalog_type):
+@pytest.mark.parametrize(
+    'catalog_type, to_csv_kwargs, json_dump_kwargs',
+    [('file', {'compression': 'bz2'}, {}), ('file', {'compression': 'gzip'}, {}), ('dict', {}, {})],
+)
+def test_catalog_serialize(tmp_path, catalog_type, to_csv_kwargs, json_dump_kwargs):
     cat = intake.open_esm_datastore(cdf_col_sample_cmip6)
     local_store = tmp_path
     cat_subset = cat.search(
         source_id='MRI-ESM2-0',
     )
     name = 'CMIP6-MRI-ESM2-0'
-    cat_subset.serialize(name=name, directory=local_store, catalog_type=catalog_type)
+    cat_subset.serialize(
+        name=name,
+        directory=local_store,
+        catalog_type=catalog_type,
+        to_csv_kwargs=to_csv_kwargs,
+        json_dump_kwargs=json_dump_kwargs,
+    )
     cat = intake.open_esm_datastore(f'{local_store}/{name}.json')
     pd.testing.assert_frame_equal(
         cat_subset.df.reset_index(drop=True), cat.df.reset_index(drop=True)