Skip to content
This repository was archived by the owner on Aug 25, 2024. It is now read-only.

Commit c854e40

Browse files
sudharsana-kj and pdxjohnny
authored and
John Andersen
committed
source: csv: Add label
Co-authored-by: John Andersen <johnandersenpdx@gmail.com> Signed-off-by: John Andersen <johnandersenpdx@gmail.com>
1 parent 7afd85a commit c854e40

File tree

5 files changed

+297
-62
lines changed

5 files changed

+297
-62
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
5858
- A temporary directory is used to replicate `mktemp -u` functionality so as to
5959
provide tests using a FileSource with a valid tempfile name.
6060
- Labels for JSON sources
61+
- Labels for CSV sources
6162
- util.cli CMD's correctly set the description of subparsers instead of their
6263
help, they also accept the `CLI_FORMATTER_CLASS` property.
6364
- CSV source now has `entry_point` decoration

dffml/source/csv.py

+158-48
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55
"""
66
import csv
77
import ast
8-
from typing import NamedTuple, Dict
8+
import asyncio
9+
from typing import NamedTuple, Dict, List
10+
from dataclasses import dataclass
11+
from contextlib import asynccontextmanager
912

1013
from ..repo import Repo
1114
from .memory import MemorySource
@@ -16,13 +19,38 @@
1619
csv.register_dialect("strip", skipinitialspace=True)
1720

1821

22+
@dataclass
23+
class OpenCSVFile:
24+
write_out: Dict
25+
active: int
26+
lock: asyncio.Lock
27+
write_back_key: bool = True
28+
write_back_label: bool = False
29+
30+
async def inc(self):
31+
async with self.lock:
32+
self.active += 1
33+
34+
async def dec(self):
35+
async with self.lock:
36+
self.active -= 1
37+
return bool(self.active < 1)
38+
39+
40+
CSV_SOURCE_CONFIG_DEFAULT_KEY = "src_url"
41+
CSV_SOURCE_CONFIG_DEFAULT_LABEL = "unlabeled"
42+
CSV_SOURCE_CONFIG_DEFAULT_LABEL_COLUMN = "label"
43+
44+
1945
class CSVSourceConfig(FileSourceConfig, NamedTuple):
2046
filename: str
21-
label: str = "unlabeled"
2247
readonly: bool = False
23-
key: str = None
48+
key: str = CSV_SOURCE_CONFIG_DEFAULT_KEY
49+
label: str = CSV_SOURCE_CONFIG_DEFAULT_LABEL
50+
label_column: str = CSV_SOURCE_CONFIG_DEFAULT_LABEL_COLUMN
2451

2552

53+
# CSVSource is a bit of a mess
2654
@entry_point("csv")
2755
class CSVSource(FileSource, MemorySource):
2856
"""
@@ -32,6 +60,29 @@ class CSVSource(FileSource, MemorySource):
3260
# Headers we've added to track data other than feature data for a repo
3361
CSV_HEADERS = ["prediction", "confidence"]
3462

63+
OPEN_CSV_FILES: Dict[str, OpenCSVFile] = {}
64+
OPEN_CSV_FILES_LOCK: asyncio.Lock = asyncio.Lock()
65+
66+
@asynccontextmanager
67+
async def _open_csv(self, fd=None):
68+
async with self.OPEN_CSV_FILES_LOCK:
69+
if self.config.filename not in self.OPEN_CSV_FILES:
70+
self.logger.debug(f"{self.config.filename} first open")
71+
open_file = OpenCSVFile(
72+
active=1, lock=asyncio.Lock(), write_out={}
73+
)
74+
self.OPEN_CSV_FILES[self.config.filename] = open_file
75+
if fd is not None:
76+
await self.read_csv(fd, open_file)
77+
else:
78+
self.logger.debug(f"{self.config.filename} already open")
79+
await self.OPEN_CSV_FILES[self.config.filename].inc()
80+
yield self.OPEN_CSV_FILES[self.config.filename]
81+
82+
async def _empty_file_init(self):
83+
async with self._open_csv():
84+
return {}
85+
3586
@classmethod
3687
def args(cls, args, *above) -> Dict[str, Arg]:
3788
cls.config_set(args, above, "filename", Arg())
@@ -42,9 +93,18 @@ def args(cls, args, *above) -> Dict[str, Arg]:
4293
Arg(type=bool, action="store_true", default=False),
4394
)
4495
cls.config_set(
45-
args, above, "label", Arg(type=str, default="unlabeled")
96+
args,
97+
above,
98+
"label",
99+
Arg(type=str, default=CSV_SOURCE_CONFIG_DEFAULT_LABEL),
46100
)
47-
cls.config_set(args, above, "key", Arg(type=str, default=None))
101+
cls.config_set(
102+
args,
103+
above,
104+
"labelcol",
105+
Arg(type=str, default=CSV_SOURCE_CONFIG_DEFAULT_LABEL_COLUMN),
106+
)
107+
cls.config_set(args, above, "key", Arg(type=str, default="src_url"))
48108
return args
49109

50110
@classmethod
@@ -54,38 +114,53 @@ def config(cls, config, *above):
54114
readonly=cls.config_get(config, above, "readonly"),
55115
label=cls.config_get(config, above, "label"),
56116
key=cls.config_get(config, above, "key"),
117+
label_column=cls.config_get(config, above, "labelcol"),
57118
)
58119

59-
async def load_fd(self, fd):
60-
"""
61-
Parses a CSV stream into Repo instances
62-
"""
63-
i = 0
64-
self.mem = {}
65-
for data in csv.DictReader(fd, dialect="strip"):
120+
async def read_csv(self, fd, open_file):
121+
dict_reader = csv.DictReader(fd, dialect="strip")
122+
# Record what headers are present when the file was opened
123+
if not self.config.key in dict_reader.fieldnames:
124+
open_file.write_back_key = False
125+
if self.config.label_column in dict_reader.fieldnames:
126+
open_file.write_back_label = True
127+
# Store all the repos by their label in write_out
128+
open_file.write_out = {}
129+
# If there is no key track row index to be used as src_url by label
130+
index = {}
131+
for row in dict_reader:
132+
# Grab label from row
133+
label = row.get(self.config.label_column, self.config.label)
134+
if self.config.label_column in row:
135+
del row[self.config.label_column]
136+
index.setdefault(label, 0)
137+
# Grab src_url from row
138+
src_url = row.get(self.config.key, index[label])
139+
if self.config.key in row:
140+
del row[self.config.key]
141+
else:
142+
index[label] += 1
66143
# Repo data we are going to parse from this row (must include
67144
# features).
68-
repo_data = {"features": {}}
145+
repo_data = {}
69146
# Parse headers we as the CSV source added
70147
csv_meta = {}
71148
for header in self.CSV_HEADERS:
72-
if not data.get(header) is None and data[header] != "":
73-
csv_meta[header] = data[header]
149+
value = row.get(header, None)
150+
if value is not None and value != "":
151+
csv_meta[header] = row[header]
74152
# Remove from feature data
75-
del data[header]
76-
# Parse feature data
77-
for key, value in data.items():
153+
del row[header]
154+
# Set the features
155+
features = {}
156+
for key, value in row.items():
78157
if value != "":
79158
try:
80-
repo_data["features"][key] = ast.literal_eval(value)
159+
features[key] = ast.literal_eval(value)
81160
except (SyntaxError, ValueError):
82-
repo_data["features"][key] = value
83-
if self.config.key is not None and self.config.key == key:
84-
src_url = value
85-
if self.config.key is None:
86-
src_url = str(i)
87-
i += 1
88-
# Correct types and structure of repo data from csv_meta
161+
features[key] = value
162+
if features:
163+
repo_data["features"] = features
89164
if "prediction" in csv_meta and "confidence" in csv_meta:
90165
repo_data.update(
91166
{
@@ -95,32 +170,67 @@ async def load_fd(self, fd):
95170
}
96171
}
97172
)
98-
repo = Repo(src_url, data=repo_data)
99-
self.mem[repo.src_url] = repo
173+
# If there was no data in the row, skip it
174+
if not repo_data and src_url == str(index[label] - 1):
175+
continue
176+
# Add the repo to our internal memory representation
177+
open_file.write_out.setdefault(label, {})
178+
open_file.write_out[label][src_url] = Repo(src_url, data=repo_data)
179+
180+
async def load_fd(self, fd):
181+
"""
182+
Parses a CSV stream into Repo instances
183+
"""
184+
async with self._open_csv(fd) as open_file:
185+
self.mem = open_file.write_out.get(self.config.label, {})
100186
self.logger.debug("%r loaded %d records", self, len(self.mem))
101187

102188
async def dump_fd(self, fd):
103189
"""
104190
Dumps data into a CSV stream
105191
"""
106-
# Sample some headers without iterating all the way through
107-
fieldnames = []
108-
for repo in self.mem.values():
109-
fieldnames = list(repo.data.features.keys())
110-
break
111-
# Add our headers
112-
fieldnames += self.CSV_HEADERS
113-
# Write out the file
114-
writer = csv.DictWriter(fd, fieldnames=fieldnames)
115-
writer.writeheader()
116-
# Write out rows in order
117-
for repo in self.mem.values():
118-
repo_data = repo.dict()
119-
row = {}
120-
for key, value in repo_data["features"].items():
121-
row[key] = value
122-
if "prediction" in repo_data:
123-
row["prediction"] = repo_data["prediction"]["value"]
124-
row["confidence"] = repo_data["prediction"]["confidence"]
125-
writer.writerow(row)
192+
async with self.OPEN_CSV_FILES_LOCK:
193+
open_file = self.OPEN_CSV_FILES[self.config.filename]
194+
open_file.write_out.setdefault(self.config.label, {})
195+
open_file.write_out[self.config.label].update(self.mem)
196+
# Bail if not last open source for this file
197+
if not (await open_file.dec()):
198+
return
199+
# Add our headers
200+
fieldnames = (
201+
[] if not open_file.write_back_key else [self.config.key]
202+
)
203+
fieldnames.append(self.config.label_column)
204+
# Get all the feature names
205+
feature_fieldnames = set()
206+
for label, repos in open_file.write_out.items():
207+
for repo in repos.values():
208+
feature_fieldnames |= set(repo.data.features.keys())
209+
fieldnames += list(feature_fieldnames)
210+
fieldnames += self.CSV_HEADERS
211+
self.logger.debug(f"fieldnames: {fieldnames}")
212+
# Write out the file
213+
writer = csv.DictWriter(fd, fieldnames=fieldnames)
214+
writer.writeheader()
215+
for label, repos in open_file.write_out.items():
216+
for repo in repos.values():
217+
repo_data = repo.dict()
218+
row = {name: "" for name in fieldnames}
219+
# Always write the label
220+
row[self.config.label_column] = label
221+
# Write the key if it existed
222+
if open_file.write_back_key:
223+
row[self.config.key] = repo.src_url
224+
# Write the features
225+
for key, value in repo_data.get("features", {}).items():
226+
row[key] = value
227+
# Write the prediction
228+
if "prediction" in repo_data:
229+
row["prediction"] = repo_data["prediction"]["value"]
230+
row["confidence"] = repo_data["prediction"][
231+
"confidence"
232+
]
233+
writer.writerow(row)
234+
del self.OPEN_CSV_FILES[self.config.filename]
235+
self.logger.debug(f"{self.config.filename} written")
126236
self.logger.debug("%r saved %d records", self, len(self.mem))

tests/service/test_dev.py

+2
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ async def generic_test(self, name, package_specific_files):
8787
package_name,
8888
package_specific_files,
8989
)
90+
else: # pragma: no cov
91+
pass
9092

9193
async def test_model(self):
9294
await self.generic_test(

tests/source/test_csv.py

+45-10
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22
# Copyright (c) 2019 Intel Corporation
33
import unittest
44
import tempfile
5+
import os
6+
import csv
7+
import random
8+
import pathlib
59

6-
from dffml.source.file import FileSourceConfig
710
from dffml.source.csv import CSVSource, CSVSourceConfig
811
from dffml.util.testing.source import FileSourceTest
912
from dffml.util.asynctestcase import AsyncTestCase
@@ -15,35 +18,67 @@ class TestCSVSource(FileSourceTest, AsyncTestCase):
1518
async def setUpSource(self):
1619
return CSVSource(CSVSourceConfig(filename=self.testfile))
1720

18-
@unittest.skip("Labels not implemented yet for CSV files")
1921
async def test_label(self):
20-
"""
21-
Labels not implemented yet for CSV files
22-
"""
22+
with tempfile.TemporaryDirectory() as testdir:
23+
self.testfile = os.path.join(testdir, str(random.random()))
24+
unlabeled = await self.setUpSource()
25+
labeled = await self.setUpSource()
26+
labeled.config = labeled.config._replace(label="somelabel")
27+
async with unlabeled, labeled:
28+
async with unlabeled() as uctx, labeled() as lctx:
29+
await uctx.update(
30+
Repo("0", data={"features": {"feed": 1}})
31+
)
32+
await lctx.update(
33+
Repo("0", data={"features": {"face": 2}})
34+
)
35+
# async with unlabeled, labeled:
36+
async with unlabeled() as uctx, labeled() as lctx:
37+
repo = await uctx.repo("0")
38+
self.assertIn("feed", repo.features())
39+
repo = await lctx.repo("0")
40+
self.assertIn("face", repo.features())
41+
with open(self.testfile, "r") as fd:
42+
dict_reader = csv.DictReader(fd, dialect="strip")
43+
rows = {
44+
row["label"]: {row["src_url"]: row} for row in dict_reader
45+
}
46+
self.assertIn("unlabeled", rows)
47+
self.assertIn("somelabel", rows)
48+
self.assertIn("0", rows["unlabeled"])
49+
self.assertIn("0", rows["somelabel"])
50+
self.assertIn("feed", rows["unlabeled"]["0"])
51+
self.assertIn("face", rows["somelabel"]["0"])
52+
self.assertEqual("1", rows["unlabeled"]["0"]["feed"])
53+
self.assertEqual("2", rows["somelabel"]["0"]["face"])
2354

24-
def test_config_readonly_default(self):
55+
def test_config_default(self):
2556
config = CSVSource.config(
2657
parse_unknown("--source-csv-filename", "feedface")
2758
)
2859
self.assertEqual(config.filename, "feedface")
2960
self.assertEqual(config.label, "unlabeled")
30-
self.assertEqual(config.key, None)
61+
self.assertEqual(config.label_column, "label")
62+
self.assertEqual(config.key, "src_url")
3163
self.assertFalse(config.readonly)
3264

33-
def test_config_readonly_set(self):
65+
def test_config_set(self):
3466
config = CSVSource.config(
3567
parse_unknown(
3668
"--source-csv-filename",
3769
"feedface",
3870
"--source-csv-label",
3971
"default-label",
72+
"--source-csv-labelcol",
73+
"dffml_label",
4074
"--source-csv-key",
4175
"SourceURLColumn",
4276
"--source-csv-readonly",
4377
)
4478
)
4579
self.assertEqual(config.filename, "feedface")
4680
self.assertEqual(config.label, "default-label")
81+
self.assertEqual(config.label_column, "dffml_label")
4782
self.assertEqual(config.key, "SourceURLColumn")
4883
self.assertTrue(config.readonly)
4984

@@ -59,5 +94,5 @@ async def test_key(self):
5994
async with source() as sctx:
6095
repo_a = await sctx.repo("a")
6196
repo_b = await sctx.repo("b")
62-
self.assertEqual(repo_a.data.features["ValueColumn"], 42)
63-
self.assertEqual(repo_b.data.features["ValueColumn"], 420)
97+
self.assertEqual(repo_a.feature("ValueColumn"), 42)
98+
self.assertEqual(repo_b.feature("ValueColumn"), 420)

0 commit comments

Comments
 (0)