
Commit 64435bc

asd ml files
1 parent 2009db2 commit 64435bc


4 files changed: 378 additions, 0 deletions


supervised/knn_scratch.py

+54
@@ -0,0 +1,54 @@
import operator

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

"""
For each X_test point, find the k nearest neighbours in X_train,
then predict the most voted class among them.
"""

def euclidean_distance(x, y):
    """
    x = vector 1
    y = vector 2
    """
    return np.sqrt(np.sum((x - y) ** 2))

def get_neighbours(X_train, X_test_instance, k):
    """Return the indices of the k training samples closest to X_test_instance."""
    distances = []
    neighbours = []
    for i in range(X_train.shape[0]):
        distance = euclidean_distance(X_train[i], X_test_instance)
        distances.append((i, distance))
    distances.sort(key=operator.itemgetter(1))
    for d in range(k):
        neighbours.append(distances[d][0])
    return neighbours

def voting_fn(neighbours, y_train):
    """Return the majority class among the given neighbour indices."""
    class_votes = {}
    for idx in neighbours:
        label = y_train[idx]
        class_votes[label] = class_votes.get(label, 0) + 1
    sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_votes[0][0]

def predict(X_train, X_test, y_train, k):
    """Predict a class for every row of X_test."""
    output_class = []
    for i in range(X_test.shape[0]):
        neighbours = get_neighbours(X_train, X_test[i], k)
        output_class.append(voting_fn(neighbours, y_train))
    return output_class


if __name__ == "__main__":
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    y_predict = predict(X_train, X_test, y_train, 3)
    print(confusion_matrix(y_test, y_predict))
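
As a sanity check, the same split can be run through scikit-learn's built-in KNeighborsClassifier, which uses the Euclidean metric by default. A minimal sketch; tie-breaking between equally voted classes may differ from voting_fn, so the two confusion matrices should match closely but not necessarily exactly:

# Sketch: cross-checking the scratch KNN against scikit-learn (k = 3)
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.33, random_state=42)
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(confusion_matrix(y_test, clf.predict(X_test)))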

supervised/linearReg_scratch.py

+104
@@ -0,0 +1,104 @@
import matplotlib.pyplot as plt
import numpy as np
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this script assumes an older scikit-learn version.
from sklearn.datasets import load_boston

class LinearReg():

    '''
    *******************************************************
    *Remember to normalize features before fit and predict*
    *******************************************************
    '''

    def __init__(self, fit_intercept=True):
        self._coef = None
        self._intercept = None
        self._fit_intercept = fit_intercept

    def predict(self, X):
        """
        Output: model prediction f(x) = Xw
        (the intercept is carried by the column of ones from add_bias)

        Arguments:
        X: 2D numpy array
        """
        return np.dot(X, self._coef)

    def cost_function(self, predictions, y):
        """
        Half mean squared error: J = (1 / 2m) * sum((f(x) - y)^2)

        Arguments:
        predictions: 1D numpy array of model outputs
        y: 1D numpy array of targets
        """
        sq_error = (predictions - y) ** 2
        return (1.0 / (2 * y.shape[0])) * sq_error.sum()

    def normalize(self, X):
        """Scale each feature to zero mean and unit range, in place."""
        for col in X.T:
            fmean = np.mean(col)
            frange = np.amax(col) - np.amin(col)
            # Centre the column ...
            col -= fmean
            # ... then scale it by its own range
            col /= frange
        return X

    def add_bias(self, X):
        """Prepend a column of ones so the intercept is learned as a weight."""
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X, y, learning_rate=0.01, iters=50000, log=True):
        """
        Fit model coefficients by batch gradient descent.

        Arguments:
        X: 2D numpy array
        y: 1D numpy array
        """
        # Initialize the weights
        self._coef = np.zeros(X.shape[1])

        for i in range(iters):
            # 1. Get predictions
            predictions = self.predict(X)
            # 2. Calculate cost for auditing purposes
            cost = self.cost_function(predictions, y)
            # 3. Calculate error/loss
            error = predictions - y
            # 4. Gradient descent step on the coefficients
            self._coef = self._coef - (learning_rate / X.shape[0]) * np.dot(X.T, error)

            if log and i % 10000 == 0:
                print("iter: " + str(i) + " cost: " + str(cost))


if __name__ == "__main__":
    boston = load_boston()
    X = boston.data
    y = boston.target
    model = LinearReg()
    features = model.add_bias(model.normalize(X))
    model.fit(features, y)
    predict_result = model.predict(features)
    plt.scatter(y, predict_result)
    plt.xlabel(r"Prices: $Y_i$")
    plt.ylabel(r"Predicted prices: $\hat{Y}_i$")
    plt.title(r"Prices vs Predicted prices: $Y_i$ vs $\hat{Y}_i$")
    plt.show()
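
Because this is ordinary least squares, the gradient-descent coefficients can be cross-checked against the closed-form solution. A minimal self-contained sketch, assuming the same pre-1.2 scikit-learn Boston data and the same normalize-then-add-bias preprocessing as the script:

# Sketch: closed-form least squares as a reference point. After enough
# gradient descent iterations, model._coef should approach theta.
import numpy as np
from sklearn.datasets import load_boston  # requires scikit-learn < 1.2

data = load_boston()
# Same preprocessing as the script: per-column mean-centre and range-scale,
# then prepend a bias column of ones.
Z = (data.data - data.data.mean(axis=0)) / np.ptp(data.data, axis=0)
A = np.c_[np.ones(Z.shape[0]), Z]
theta, *_ = np.linalg.lstsq(A, data.target, rcond=None)
print(theta)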

supervised/logistic_scratch.py

+110
@@ -0,0 +1,110 @@
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix

class Logistic:

    def __init__(self, fit_intercept=True):
        self._coef = None
        self._intercept = None
        self._fit_intercept = fit_intercept

    @staticmethod
    def _sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def predict_proba(self, X):
        """
        Output: model probability f(x) = sigmoid(Xw)

        Arguments:
        X: 2D numpy array
        """
        z = np.dot(X, self._coef)
        return self._sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Predict class 1 where the probability reaches the threshold."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def cost_function(self, predictions, y):
        """Binary cross-entropy (log loss), averaged over the samples."""
        return (-1 / y.shape[0]) * np.sum(
            y * np.log(predictions) + (1 - y) * np.log(1 - predictions))

    def normalize(self, X):
        """Scale each feature to zero mean and unit range, in place."""
        for col in X.T:
            fmean = np.mean(col)
            frange = np.amax(col) - np.amin(col)
            # Centre the column, then scale it by its own range
            col -= fmean
            col /= frange
        return X

    def add_bias(self, X):
        """Prepend a column of ones so the intercept is learned as a weight."""
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X, y, learning_rate=0.01, iters=100000, log=True):
        """
        Fit model coefficients by batch gradient descent.

        Arguments:
        X: 2D numpy array
        y: 1D numpy array
        """
        self._coef = np.zeros(X.shape[1])
        for i in range(iters):
            # 1. Get predicted probabilities
            predictions = self.predict_proba(X)
            # 2. Calculate cost for auditing purposes
            cost = self.cost_function(predictions, y)
            # 3. Gradient of the log loss
            gradient = (1 / X.shape[0]) * np.dot(X.T, predictions - y)
            # 4. Step the coefficients by the learning rate
            self._coef = self._coef - learning_rate * gradient

            if log and i % 10000 == 0:
                print("iter: " + str(i) + " cost: " + str(cost))

    def plot_boundary(self, X, y):
        plt.figure(figsize=(10, 6))
        plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color='b', label='0')
        plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color='r', label='1')
        plt.legend()
        x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
        x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
        xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
        grid = np.c_[xx1.ravel(), xx2.ravel()]
        # Probability over the grid; the 0.5 contour is the decision boundary
        probs = self.predict_proba(grid).reshape(xx1.shape)
        plt.contour(xx1, xx2, probs, [0.5], linewidths=1, colors='green')


if __name__ == "__main__":
    iris = load_iris()
    # Two features, binary target: setosa (0) vs the rest (1)
    X = iris.data[:, :2]
    y = (iris.target != 0) * 1
    model = Logistic()
    model.fit(X, y)
    y_predict = model.predict(X)
    print(confusion_matrix(y, y_predict))
    model.plot_boundary(X, y)
    plt.show()
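
One caveat on the sigmoid: 1 / (1 + np.exp(-z)) overflows inside np.exp for large negative z and triggers a NumPy warning. A common numerically stable variant only ever exponentiates non-positive values; stable_sigmoid below is a hypothetical drop-in sketch, not part of the class above:

import numpy as np

def stable_sigmoid(z):
    """Sigmoid without overflow: exp is only called on non-positive values."""
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z)
    pos = z >= 0
    # For z >= 0, exp(-z) <= 1, so the usual form is safe
    out[pos] = 1 / (1 + np.exp(-z[pos]))
    # For z < 0, rewrite as exp(z) / (1 + exp(z)), where exp(z) < 1
    expz = np.exp(z[~pos])
    out[~pos] = expz / (1 + expz)
    return out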

unsupervised/kmeans_scratch.py

+110
@@ -0,0 +1,110 @@
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs

class Kmeans():
    """
    Parameters:
    k: int
        The number of clusters the algorithm will form.
    max_iterations: int
        The maximum number of iterations the algorithm will run for
        if it does not converge before then.
    """

    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    def _init_random_centroids(self, X):
        """Initialize the centroids as k random samples of X."""
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((self.k, n_features))
        # Sample without replacement so no two centroids start identical
        for i, idx in enumerate(np.random.choice(n_samples, self.k, replace=False)):
            centroids[i] = X[idx]
        return centroids

    def _euclidean_distance(self, x, y):
        """
        x = vector 1
        y = vector 2
        """
        return np.sqrt(np.sum((x - y) ** 2))

    def _closest_centroid(self, sample, centroids):
        """Return the index of the closest centroid to the sample."""
        closest_i = 0
        closest_dist = float("inf")
        for i, centroid in enumerate(centroids):
            distance = self._euclidean_distance(sample, centroid)
            if distance < closest_dist:
                closest_i = i
                closest_dist = distance
        return closest_i

    def _create_clusters(self, centroids, X):
        """Assign the samples to the closest centroids to create clusters;
        return clusters as lists of sample indices.
        """
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    def _calculate_centroids(self, clusters, X):
        """Calculate new centroids as the means of the samples in each cluster."""
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, cluster in enumerate(clusters):
            centroids[i] = np.mean(X[cluster], axis=0)
        return centroids

    def _get_cluster_labels(self, clusters, X):
        """Label each sample with the index of its cluster."""
        # One prediction for each sample
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster in enumerate(clusters):
            for sample_i in cluster:
                y_pred[sample_i] = cluster_i
        return y_pred

    def predict(self, X):
        """Do K-Means clustering and return cluster indices."""
        # Initialize centroids as k random samples from X
        centroids = self._init_random_centroids(X)

        # Iterate until convergence or for max_iterations
        for _ in range(self.max_iterations):
            # Assign samples to closest centroids (create clusters)
            clusters = self._create_clusters(centroids, X)
            # Save current centroids for convergence check
            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self._calculate_centroids(clusters, X)
            # If no centroid has moved => convergence
            diff = centroids - prev_centroids
            if not diff.any():
                break

        return self._get_cluster_labels(clusters, X)


if __name__ == "__main__":
    X, y = make_blobs(n_samples=800, centers=4, cluster_std=0.7)
    model = Kmeans(k=4)
    y_pred = model.predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap="viridis")
    plt.show()
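
For comparison, scikit-learn's KMeans solves the same problem with k-means++ initialization and several restarts. A minimal sketch on the same kind of blobs; cluster label numbering is arbitrary, so compare the groupings rather than the raw label values:

# Sketch: clustering the same blobs with scikit-learn's KMeans
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=800, centers=4, cluster_std=0.7)
labels = KMeans(n_clusters=4, n_init=10).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap="viridis")
plt.show()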
