model: scratch: Added simple linear regression

yashlamba · John Andersen · commit 4cb581b92cf4 · 2019-07-23T22:39:47.000-07:00
Signed-off-by: John Andersen <john.s.andersen@intel.com>
diff --git a/.travis.yml b/.travis.yml
@@ -15,6 +15,7 @@ env:
   matrix:
   - PLUGIN=.
   - PLUGIN=model/tensorflow
+  - PLUGIN=model/scratch
   - PLUGIN=feature/git
   - PLUGIN=feature/auth
   - CHANGELOG=1
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,7 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - util.cli CMD classes have their argparse description set to their docstring.
 - util.cli CMD classes can specify the formatter class used in
   `argparse.ArgumentParser` via the `CLI_FORMATTER_CLASS` property.
-- skeleton for service creation was added
+- Skeleton for service creation was added
+- Simple Linear Regression model from scratch
 ### Changed
 - feature/codesec became its own branch, binsec
 - BaseOrchestratorContext `run_operations` strict is default to true. With
diff --git a/model/scratch/.coveragerc b/model/scratch/.coveragerc
@@ -0,0 +1,13 @@
+[run]
+source =
+    dffml_model_scratch
+    tests
+branch = True
+
+[report]
+exclude_lines =
+    no cov
+    no qa
+    noqa
+    pragma: no cover
+    if __name__ == .__main__.:
diff --git a/model/scratch/.gitattributes b/model/scratch/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/model/scratch/.gitignore b/model/scratch/.gitignore
@@ -0,0 +1,20 @@
+*.log
+*.pyc
+.cache/
+.coverage
+.idea/
+.vscode/
+*.egg-info/
+build/
+dist/
+docs/build/
+venv/
+wheelhouse/
+*.egss
+.mypy_cache/
+*.swp
+.venv/
+.eggs/
+*.modeldir
+*.db
+htmlcov/
diff --git a/model/scratch/LICENSE b/model/scratch/LICENSE
@@ -0,0 +1,21 @@
+Copyright (c) 2019 Intel
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/model/scratch/MANIFEST.in b/model/scratch/MANIFEST.in
@@ -0,0 +1,2 @@
+include README.md
+include LICENSE
diff --git a/model/scratch/README.md b/model/scratch/README.md
@@ -0,0 +1,55 @@
+# DFFML Scratch Models
+
+## About
+
+Models created without a machine learning framework.
+
+## Install
+
+```console
+python3.7 -m pip install --user dffml-model-scratch
+```
+
+## Usage
+
+If we have a dataset of years of experience in a job and the Salary (in
+thousands) at that job we can use the Simple Linear Regression model to predict
+a salary given the years of experience (or the other way around).
+
+First we create the file containing the dataset. Then we train the model and
+get its accuracy. Finally, using `echo`, we pipe a new csv file of data to
+predict into the model, and it gives us its prediction of the Salary.
+
+```console
+$ cat > dataset.csv << EOF
+Years,Salary
+1,40
+2,50
+3,60
+4,70
+5,80
+EOF
+$ dffml train -model scratchslr -features def:Years:int:1 -model-predict Salary -sources f=csv -source-filename dataset.csv -source-readonly -log debug
+$ dffml accuracy -model scratchslr -features def:Years:int:1 -model-predict Salary -sources f=csv -source-filename dataset.csv -source-readonly -log debug
+1.0
+$ echo -e 'Years,Salary\n6,0\n' | dffml predict all -model scratchslr -features def:Years:int:1 -model-predict Salary -sources f=csv -source-filename /dev/stdin -source-readonly -log debug
+[
+    {
+        "extra": {},
+        "features": {
+            "Salary": 0,
+            "Years": 6
+        },
+        "last_updated": "2019-07-19T09:46:45Z",
+        "prediction": {
+            "confidence": 1.0,
+            "value": 90.0
+        },
+        "src_url": "0"
+    }
+]
+```
+
+## License
+
+Scratch Models are distributed under the terms of the [MIT License](LICENSE).
diff --git a/model/scratch/dffml_model_scratch/__init__.py b/model/scratch/dffml_model_scratch/__init__.py
diff --git a/model/scratch/dffml_model_scratch/slr.py b/model/scratch/dffml_model_scratch/slr.py
@@ -0,0 +1,184 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2019 Intel Corporation
+"""
+Simple linear regression model for a single feature, implemented from scratch
+using NumPy.
+"""
+import os
+import abc
+import json
+import hashlib
+from typing import AsyncIterator, Tuple, Any, List, Optional, NamedTuple, Dict
+
+import numpy as np
+
+from dffml.repo import Repo
+from dffml.source.source import Sources
+from dffml.feature import Features
+from dffml.accuracy import Accuracy
+from dffml.model.model import ModelConfig, ModelContext, Model
+from dffml.util.entrypoint import entry_point
+from dffml.util.cli.arg import Arg
+
+
+class SLRConfig(ModelConfig, NamedTuple):
+    """
+    Configuration for the simple linear regression model.
+    """
+
+    # Label, i.e. name of the value to be predicted
+    predict: str
+    # Directory where the model state is saved
+    directory: str
+
+
+class SLRContext(ModelContext):
+    def __init__(self, parent, features):
+        super().__init__(parent, features)
+        self.xData = np.array([])
+        self.yData = np.array([])
+        self.features = self.applicable_features(features)
+        self._features_hash_ = hashlib.sha384(
+            ("".join(sorted(self.features))).encode()
+        ).hexdigest()
+
+    @property
+    def regression_line(self):
+        return self.parent.saved.get(self._features_hash_, None)
+
+    @regression_line.setter
+    def regression_line(self, rline):
+        self.parent.saved[self._features_hash_] = rline
+
+    def applicable_features(self, features):
+        usable = []
+        if len(features) != 1:
+            raise ValueError(
+                "Simple Linear Regression doesn't support features other than 1"
+            )
+        for feature in features:
+            if feature.dtype() != int and feature.dtype() != float:
+                raise ValueError(
+                    "Simple Linear Regression only supports int or float feature"
+                )
+            if feature.length() != 1:
+                raise ValueError(
+                    "Simple LR only supports single values (non-matrix / array)"
+                )
+            usable.append(feature.NAME)
+        return sorted(usable)
+
+    async def predict_input(self, x):
+        prediction = self.regression_line[0] * x + self.regression_line[1]
+        self.logger.debug(
+            "Predicted Value of {} {}:".format(
+                self.parent.config.predict, prediction
+            )
+        )
+        return prediction
+
+    async def squared_error(self, ys, yline):
+        return sum((ys - yline) ** 2)
+
+    async def coeff_of_deter(self, ys, regression_line):
+        y_mean_line = [np.mean(ys) for y in ys]
+        squared_error_mean = await self.squared_error(ys, y_mean_line)
+        squared_error_regression = await self.squared_error(
+            ys, regression_line
+        )
+        return 1 - (squared_error_regression / squared_error_mean)
+
+    async def best_fit_line(self):
+        self.logger.debug("Number of input repos: {}".format(len(self.xData)))
+        x = self.xData
+        y = self.yData
+        mean_x = np.mean(self.xData)
+        mean_y = np.mean(self.yData)
+        m = (mean_x * mean_y - np.mean(x * y)) / (
+            (mean_x ** 2) - np.mean(x * x)
+        )
+        b = mean_y - (m * mean_x)
+        regression_line = [m * x + b for x in x]
+        accuracy = await self.coeff_of_deter(y, regression_line)
+        return (m, b, accuracy)
+
+    async def train(self, sources: Sources):
+        async for repo in sources.with_features(
+            self.features + [self.parent.config.predict]
+        ):
+            feature_data = repo.features(
+                self.features + [self.parent.config.predict]
+            )
+            self.xData = np.append(self.xData, feature_data[self.features[0]])
+            self.yData = np.append(
+                self.yData, feature_data[self.parent.config.predict]
+            )
+        self.regression_line = await self.best_fit_line()
+
+    async def accuracy(self, sources: Sources) -> Accuracy:
+        if self.regression_line is None:
+            raise ValueError("Model Not Trained")
+        accuracy_value = self.regression_line[2]
+        return Accuracy(accuracy_value)
+
+    async def predict(
+        self, repos: AsyncIterator[Repo]
+    ) -> AsyncIterator[Tuple[Repo, Any, float]]:
+        async for repo in repos:
+            feature_data = repo.features(self.features)
+            yield repo, await self.predict_input(
+                feature_data[self.features[0]]
+            ), self.regression_line[2]
+
+
+@entry_point("slr")
+class SLR(Model):
+    """
+    Simple Linear Regression Model for 2 variables implemented from scratch. Models are saved under the
+    ``directory`` in subdirectories named after the hash of their feature names.
+    """
+
+    CONTEXT = SLRContext
+
+    def __init__(self, config: SLRConfig) -> None:
+        super().__init__(config)
+        self.saved = {}
+
+    def _filename(self):
+        return os.path.join(
+            self.config.directory,
+            hashlib.sha384(self.config.predict.encode()).hexdigest() + ".json",
+        )
+
+    async def __aenter__(self) -> SLRContext:
+        filename = self._filename()
+        if os.path.isfile(filename):
+            with open(filename, "r") as read:
+                self.saved = json.load(read)
+        return self
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        filename = self._filename()
+        with open(filename, "w") as write:
+            json.dump(self.saved, write)
+
+    @classmethod
+    def args(cls, args, *above) -> Dict[str, Arg]:
+        cls.config_set(
+            args,
+            above,
+            "directory",
+            Arg(
+                default=os.path.join(
+                    os.path.expanduser("~"), ".cache", "dffml", "scratch"
+                ),
+                help="Directory where state should be saved",
+            ),
+        )
+        cls.config_set(
+            args,
+            above,
+            "predict",
+            Arg(type=str, help="Label or the value to be predicted"),
+        )
+        return args
+
+    @classmethod
+    def config(cls, config, *above) -> "SLRConfig":
+        return SLRConfig(
+            directory=cls.config_get(config, above, "directory"),
+            predict=cls.config_get(config, above, "predict"),
+        )
diff --git a/model/scratch/dffml_model_scratch/version.py b/model/scratch/dffml_model_scratch/version.py
@@ -0,0 +1 @@
+VERSION = "0.0.1"
diff --git a/model/scratch/setup.py b/model/scratch/setup.py
@@ -0,0 +1,69 @@
+import os
+import sys
+import ast
+from io import open
+
+from setuptools import find_packages, setup
+
+self_path = os.path.dirname(os.path.realpath(__file__))
+
+with open(
+    os.path.join(self_path, "dffml_model_scratch", "version.py"), "r"
+) as f:
+    for line in f:
+        if line.startswith("VERSION"):
+            version = ast.literal_eval(line.strip().split("=")[-1].strip())
+            break
+
+with open(os.path.join(self_path, "README.md"), "r", encoding="utf-8") as f:
+    readme = f.read()
+
+INSTALL_REQUIRES = (
+    ["numpy>=1.16.4"] + ["dffml>=REPLACE_DFFML_VERSION"]
+    if not any(
+        list(
+            map(
+                os.path.isfile,
+                list(
+                    map(
+                        lambda syspath: os.path.join(
+                            syspath, "dffml.egg-link"
+                        ),
+                        sys.path,
+                    )
+                ),
+            )
+        )
+    )
+    else []
+)
+
+setup(
+    name="dffml-model-scratch",
+    version=version,
+    description="",
+    long_description=readme,
+    long_description_content_type="text/markdown",
+    author="Yash Lamba",
+    author_email="yashlamba2000@gmail.com",
+    maintainer="John Andersen",
+    maintainer_email="john.s.andersen@intel.com",
+    url="https://github.com/intel/dffml/blob/master/model/scratch/README.md",
+    license="MIT",
+    keywords=["dffml"],
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: MIT License",
+        "License :: OSI Approved :: Apache Software License",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3 :: Only",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: Implementation :: CPython",
+        "Programming Language :: Python :: Implementation :: PyPy",
+    ],
+    install_requires=INSTALL_REQUIRES,
+    packages=find_packages(),
+    entry_points={"dffml.model": ["scratchslr = dffml_model_scratch.slr:SLR"]},
+)
diff --git a/model/scratch/tests/__init__.py b/model/scratch/tests/__init__.py
diff --git a/model/scratch/tests/test_slr.py b/model/scratch/tests/test_slr.py
diff --git a/scripts/docs/care b/scripts/docs/care

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Auto detect text files and perform LF normalization`
	`2`	`+* text=auto`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+include README.md`
	`2`	`+include LICENSE`