Skip to content
This repository was archived by the owner on Aug 25, 2024. It is now read-only.

Commit 4cb581b

Browse files
yashlambaJohn Andersen
authored and
John Andersen
committed
model: scratch: Added simple linear regression
Signed-off-by: John Andersen <john.s.andersen@intel.com>
1 parent 05c1f30 commit 4cb581b

File tree

15 files changed

+469
-2
lines changed

15 files changed

+469
-2
lines changed

.travis.yml

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ env:
1515
matrix:
1616
- PLUGIN=.
1717
- PLUGIN=model/tensorflow
18+
- PLUGIN=model/scratch
1819
- PLUGIN=feature/git
1920
- PLUGIN=feature/auth
2021
- CHANGELOG=1

CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1919
- util.cli CMD classes have their argparse description set to their docstring.
2020
- util.cli CMD classes can specify the formatter class used in
2121
`argparse.ArgumentParser` via the `CLI_FORMATTER_CLASS` property.
22-
- skeleton for service creation was added
22+
- Skeleton for service creation was added
23+
- Simple Linear Regression model from scratch
2324
### Changed
2425
- feature/codesec became its own branch, binsec
2526
- BaseOrchestratorContext `run_operations` strict is default to true. With

model/scratch/.coveragerc

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[run]
2+
source =
3+
dffml_model_scratch
4+
tests
5+
branch = True
6+
7+
[report]
8+
exclude_lines =
9+
no cov
10+
no qa
11+
noqa
12+
pragma: no cover
13+
if __name__ == .__main__.:

model/scratch/.gitattributes

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Auto detect text files and perform LF normalization
2+
* text=auto

model/scratch/.gitignore

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
*.log
2+
*.pyc
3+
.cache/
4+
.coverage
5+
.idea/
6+
.vscode/
7+
*.egg-info/
8+
build/
9+
dist/
10+
docs/build/
11+
venv/
12+
wheelhouse/
13+
*.egss
14+
.mypy_cache/
15+
*.swp
16+
.venv/
17+
.eggs/
18+
*.modeldir
19+
*.db
20+
htmlcov/

model/scratch/LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
Copyright (c) 2019 Intel
2+
3+
MIT License
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

model/scratch/MANIFEST.in

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
include README.md
2+
include LICENSE

model/scratch/README.md

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# DFFML Scratch Models
2+
3+
## About
4+
5+
Models created without a machine learning framework.
6+
7+
## Install
8+
9+
```console
10+
python3.7 -m pip install --user dffml-model-scratch
11+
```
12+
13+
## Usage
14+
15+
If we have a dataset of years of experience in a job and the Salary (in
16+
thousands) at that job we can use the Simple Linear Regression model to predict
17+
a salary given the years of experience (or the other way around).
18+
19+
First we create the file containing the dataset. Then we train the model, get
20+
its accuracy. Finally, using `echo`, we pipe a new csv file of data to predict into the
21+
model, and it will give us its prediction of the Salary.
22+
23+
```console
24+
$ cat > dataset.csv << EOF
25+
Years,Salary
26+
1,40
27+
2,50
28+
3,60
29+
4,70
30+
5,80
31+
EOF
32+
$ dffml train -model scratchslr -features def:Years:int:1 -model-predict Salary -sources f=csv -source-filename dataset.csv -source-readonly -log debug
33+
$ dffml accuracy -model scratchslr -features def:Years:int:1 -model-predict Salary -sources f=csv -source-filename dataset.csv -source-readonly -log debug
34+
1.0
35+
$ echo -e 'Years,Salary\n6,0\n' | dffml predict all -model scratchslr -features def:Years:int:1 -model-predict Salary -sources f=csv -source-filename /dev/stdin -source-readonly -log debug
36+
[
37+
{
38+
"extra": {},
39+
"features": {
40+
"Salary": 0,
41+
"Years": 6
42+
},
43+
"last_updated": "2019-07-19T09:46:45Z",
44+
"prediction": {
45+
"confidence": 1.0,
46+
"value": 90.0
47+
},
48+
"src_url": "0"
49+
}
50+
]
51+
```
52+
53+
## License
54+
55+
Scratch Models are distributed under the terms of the [MIT License](LICENSE).

model/scratch/dffml_model_scratch/__init__.py

Whitespace-only changes.
+184
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2019 Intel Corporation
3+
"""
4+
Simple Linear Regression model (one feature, one predicted value) implemented
from scratch with NumPy.
5+
"""
6+
import os
7+
import abc
8+
import json
9+
import hashlib
10+
from typing import AsyncIterator, Tuple, Any, List, Optional, NamedTuple, Dict
11+
12+
import numpy as np
13+
14+
from dffml.repo import Repo
15+
from dffml.source.source import Sources
16+
from dffml.feature import Features
17+
from dffml.accuracy import Accuracy
18+
from dffml.model.model import ModelConfig, ModelContext, Model
19+
from dffml.util.entrypoint import entry_point
20+
from dffml.util.cli.arg import Arg
21+
22+
23+
class SLRConfig(ModelConfig, NamedTuple):
    """
    Configuration values consumed by the SLR model.
    """

    # Label or the value to be predicted
    predict: str
    # Directory where state should be saved
    directory: str
26+
27+
28+
class SLRContext(ModelContext):
    """
    Training, accuracy and prediction logic for simple linear regression.

    Fits ``y = m * x + b`` to a single numeric feature (x) and the configured
    ``predict`` label (y). The fitted ``(m, b, accuracy)`` triple is stored in
    ``parent.saved`` under a hash of the feature names.
    """

    def __init__(self, parent, features):
        super().__init__(parent, features)
        # Training samples accumulate here across calls to train()
        self.xData = np.array([])
        self.yData = np.array([])
        self.features = self.applicable_features(features)
        # Key under which this context's fitted line lives in parent.saved
        self._features_hash_ = hashlib.sha384(
            "".join(sorted(self.features)).encode()
        ).hexdigest()

    @property
    def regression_line(self):
        """
        The saved ``(m, b, accuracy)`` entry for these features, or ``None``
        when nothing has been saved for them yet.
        """
        return self.parent.saved.get(self._features_hash_, None)

    @regression_line.setter
    def regression_line(self, rline):
        self.parent.saved[self._features_hash_] = rline

    def applicable_features(self, features):
        """
        Validate ``features`` and return their names, sorted.

        Raises ``ValueError`` unless there is exactly one feature, its dtype
        is ``int`` or ``float``, and its length is 1.
        """
        if len(features) != 1:
            raise ValueError(
                "Simple Linear Regression doesn't support features other than 1"
            )
        usable = []
        for feature in features:
            if feature.dtype() not in (int, float):
                raise ValueError(
                    "Simple Linear Regression only supports int or float feature"
                )
            if feature.length() != 1:
                raise ValueError(
                    "Simple LR only supports single values (non-matrix / array)"
                )
            usable.append(feature.NAME)
        return sorted(usable)

    async def predict_input(self, x):
        """
        Evaluate the saved line at ``x`` and return ``m * x + b``.
        """
        m, b = self.regression_line[0], self.regression_line[1]
        prediction = m * x + b
        self.logger.debug(
            "Predicted Value of {}: {}".format(
                self.parent.config.predict, prediction
            )
        )
        return prediction

    async def squared_error(self, ys, yline):
        """
        Return the sum of squared differences between ``ys`` and ``yline``.
        """
        residuals = np.asarray(ys) - np.asarray(yline)
        return np.sum(residuals ** 2)

    async def coeff_of_deter(self, ys, regression_line):
        """
        Return the coefficient of determination (R squared) of
        ``regression_line`` (the fitted y values) against the observed ``ys``.
        """
        ys = np.asarray(ys)
        # The mean is computed once rather than once per element
        y_mean_line = np.full(ys.shape, np.mean(ys))
        squared_error_mean = await self.squared_error(ys, y_mean_line)
        squared_error_regression = await self.squared_error(
            ys, regression_line
        )
        if squared_error_mean == 0:
            # Every observed y is identical, so the usual ratio would be a
            # division by zero. Report a perfect score only when the fitted
            # line reproduces the data exactly, and 0.0 otherwise.
            return 1.0 if squared_error_regression == 0 else 0.0
        return 1 - (squared_error_regression / squared_error_mean)

    async def best_fit_line(self):
        """
        Least-squares fit of ``yData`` against ``xData``.

        Returns ``(m, b, accuracy)`` where ``accuracy`` is the coefficient of
        determination of the fitted line on the same data.

        Raises ``ValueError`` when there is no data, or when every x value is
        identical (the slope is undefined in that case).
        """
        self.logger.debug("Number of input repos: {}".format(len(self.xData)))
        x = np.asarray(self.xData, dtype=float)
        y = np.asarray(self.yData, dtype=float)
        if x.size == 0:
            raise ValueError(
                "Cannot fit a regression line without training data"
            )
        # Comparing min and max is an exact test for zero variance; the
        # variance itself may be a tiny non-zero value due to rounding
        if x.max() == x.min():
            raise ValueError(
                "Cannot fit a regression line when all feature values are "
                "identical"
            )
        mean_x = np.mean(x)
        mean_y = np.mean(y)
        # Centered form of the slope. It is algebraically equal to
        # (mean_x * mean_y - mean(x * y)) / (mean_x ** 2 - mean(x * x))
        # but avoids subtracting two large, nearly equal numbers.
        x_offsets = x - mean_x
        m = np.sum(x_offsets * (y - mean_y)) / np.sum(x_offsets * x_offsets)
        b = mean_y - (m * mean_x)
        accuracy = await self.coeff_of_deter(y, m * x + b)
        # Plain floats, matching what is read back from the parent's JSON file
        return (float(m), float(b), float(accuracy))

    async def train(self, sources: Sources):
        """
        Add every repo in ``sources`` that has the needed features to the
        training data, then fit and save the regression line.
        """
        wanted = self.features + [self.parent.config.predict]
        new_x = []
        new_y = []
        async for repo in sources.with_features(wanted):
            feature_data = repo.features(wanted)
            new_x.append(feature_data[self.features[0]])
            new_y.append(feature_data[self.parent.config.predict])
        # Append once; calling np.append per record copies the whole array
        # every time
        self.xData = np.append(self.xData, new_x)
        self.yData = np.append(self.yData, new_y)
        self.regression_line = await self.best_fit_line()

    async def accuracy(self, sources: Sources) -> Accuracy:
        """
        Return the accuracy saved with the regression line.

        NOTE: ``sources`` is not read here; the value returned is the one
        computed on the training data by ``best_fit_line``.

        Raises ``ValueError`` when no regression line has been saved.
        """
        if self.regression_line is None:
            raise ValueError("Model Not Trained")
        return Accuracy(self.regression_line[2])

    async def predict(
        self, repos: AsyncIterator[Repo]
    ) -> AsyncIterator[Tuple[Repo, Any, float]]:
        """
        Yield ``(repo, predicted value, confidence)`` for each repo, where
        confidence is the accuracy saved with the regression line.

        Raises ``ValueError`` when no regression line has been saved.
        """
        if self.regression_line is None:
            raise ValueError("Model Not Trained")
        async for repo in repos:
            feature_data = repo.features(self.features)
            prediction = await self.predict_input(
                feature_data[self.features[0]]
            )
            yield repo, prediction, self.regression_line[2]
125+
126+
127+
@entry_point("slr")
class SLR(Model):
    """
    Simple Linear Regression Model for 2 variables implemented from scratch.

    Fitted lines are persisted as JSON under ``directory``, in a file named
    after the SHA-384 hash of the ``predict`` label. ``saved`` is the
    in-memory copy of that file's contents.
    """

    CONTEXT = SLRContext

    def __init__(self, config: SLRConfig) -> None:
        super().__init__(config)
        # Loaded from disk in __aenter__, written back in __aexit__
        self.saved = {}

    def _filename(self):
        """
        Path of the JSON state file for the configured ``predict`` label.
        """
        return os.path.join(
            self.config.directory,
            hashlib.sha384(self.config.predict.encode()).hexdigest() + ".json",
        )

    async def __aenter__(self) -> "SLR":
        """
        Load previously saved state, if the state file exists.
        """
        filename = self._filename()
        if os.path.isfile(filename):
            with open(filename, "r") as read:
                self.saved = json.load(read)
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        """
        Write the current state to the state file.
        """
        filename = self._filename()
        # The directory may not exist yet (e.g. first run with the default
        # location); without this open() raises FileNotFoundError
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, "w") as write:
            json.dump(self.saved, write)

    @classmethod
    def args(cls, args, *above) -> Dict[str, Arg]:
        """
        Register the ``directory`` and ``predict`` arguments.
        """
        default_directory = os.path.join(
            os.path.expanduser("~"), ".cache", "dffml", "scratch"
        )
        cls.config_set(
            args,
            above,
            "directory",
            Arg(
                default=default_directory,
                help="Directory where state should be saved",
            ),
        )
        cls.config_set(
            args,
            above,
            "predict",
            Arg(type=str, help="Label or the value to be predicted"),
        )
        return args

    @classmethod
    def config(cls, config, *above) -> "SLRConfig":
        """
        Build an ``SLRConfig`` from the ``directory`` and ``predict`` values.
        """
        return SLRConfig(
            directory=cls.config_get(config, above, "directory"),
            predict=cls.config_get(config, above, "predict"),
        )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Package version string. setup.py scans version.py for the line starting with
# "VERSION" and parses its value, so keep this a plain literal assignment.
VERSION = "0.0.1"

model/scratch/setup.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import os
2+
import sys
3+
import ast
4+
from io import open
5+
6+
from setuptools import find_packages, setup
7+
8+
self_path = os.path.dirname(os.path.realpath(__file__))

# Read the version from version.py without importing the package
with open(
    os.path.join(self_path, "dffml_model_scratch", "version.py"), "r"
) as f:
    for line in f:
        if line.startswith("VERSION"):
            version = ast.literal_eval(line.strip().split("=")[-1].strip())
            break
    else:
        raise RuntimeError("VERSION not found in version.py")

with open(os.path.join(self_path, "README.md"), "r", encoding="utf-8") as f:
    readme = f.read()

# True when a dffml.egg-link file is found in any sys.path entry (presumably
# left there by a development install of dffml -- TODO confirm)
_dffml_egg_link_found = any(
    os.path.isfile(os.path.join(syspath, "dffml.egg-link"))
    for syspath in sys.path
)

# numpy is always required. Only the dffml requirement is skipped when an
# egg-link is found; the parentheses matter, since without them the
# conditional applies to the whole list and numpy is dropped too.
INSTALL_REQUIRES = ["numpy>=1.16.4"] + (
    [] if _dffml_egg_link_found else ["dffml>=REPLACE_DFFML_VERSION"]
)

setup(
    name="dffml-model-scratch",
    version=version,
    description="",
    long_description=readme,
    long_description_content_type="text/markdown",
    author="Yash Lamba",
    author_email="yashlamba2000@gmail.com",
    maintainer="John Andersen",
    maintainer_email="john.s.andersen@intel.com",
    url="https://github.com/intel/dffml/blob/master/model/scratch/README.md",
    license="MIT",
    keywords=["dffml"],
    # Only the MIT classifier is listed, matching license="MIT" above
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
    install_requires=INSTALL_REQUIRES,
    packages=find_packages(),
    entry_points={"dffml.model": ["scratchslr = dffml_model_scratch.slr:SLR"]},
)

model/scratch/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)