
Commit 64435bc

asd ml files
1 parent 2009db2 commit 64435bc


4 files changed: 378 additions, 0 deletions


supervised/knn_scratch.py

+54
@@ -0,0 +1,54 @@
import operator

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

"""
For each X_test point, find the k nearest neighbours in X_train,
then predict the most voted class among them.
"""

def euclidean_distance(x, y):
    """
    x = vector 1
    y = vector 2
    """
    return np.sqrt(np.sum((x - y) ** 2))

def get_neighbours(X_train, X_test_instance, k):
    """Return the indices of the k training samples closest to X_test_instance."""
    distances = []
    neighbours = []
    for i in range(X_train.shape[0]):
        distance = euclidean_distance(X_train[i], X_test_instance)
        distances.append((i, distance))
    distances.sort(key=operator.itemgetter(1))
    for d in range(k):
        neighbours.append(distances[d][0])
    return neighbours

def voting_fn(neighbours, y_train):
    """Return the majority class among the given neighbour indices."""
    class_votes = {}
    for idx in neighbours:
        label = y_train[idx]
        class_votes[label] = class_votes.get(label, 0) + 1
    sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_votes[0][0]

def predict(X_train, X_test, y_train, k):
    """Predict a class for every row of X_test."""
    output_class = []
    for i in range(X_test.shape[0]):
        neighbours = get_neighbours(X_train, X_test[i], k)
        output_class.append(voting_fn(neighbours, y_train))
    return output_class


if __name__ == "__main__":
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    y_predict = predict(X_train, X_test, y_train, 3)
    print(confusion_matrix(y_test, y_predict))
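
As a sanity check, the same split can be run through scikit-learn's built-in KNeighborsClassifier, which uses the Euclidean metric by default. A minimal sketch; tie-breaking between equally voted classes may differ from voting_fn, so the two confusion matrices should match closely but not necessarily exactly:

# Sketch: cross-checking the scratch KNN against scikit-learn (k = 3)
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.33, random_state=42)
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(confusion_matrix(y_test, clf.predict(X_test)))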

supervised/linearReg_scratch.py

+104
@@ -0,0 +1,104 @@
import matplotlib.pyplot as plt
import numpy as np
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this script assumes an older scikit-learn version.
from sklearn.datasets import load_boston

class LinearReg():

    '''
    *******************************************************
    *Remember to normalize features before fit and predict*
    *******************************************************
    '''

    def __init__(self, fit_intercept=True):
        self._coef = None
        self._intercept = None
        self._fit_intercept = fit_intercept

    def predict(self, X):
        """
        Output: model prediction f(x) = Xw
        (the intercept is carried by the column of ones from add_bias)

        Arguments:
        X: 2D numpy array
        """
        return np.dot(X, self._coef)

    def cost_function(self, predictions, y):
        """
        Half mean squared error: J = (1 / 2m) * sum((f(x) - y)^2)

        Arguments:
        predictions: 1D numpy array of model outputs
        y: 1D numpy array of targets
        """
        sq_error = (predictions - y) ** 2
        return (1.0 / (2 * y.shape[0])) * sq_error.sum()

    def normalize(self, X):
        """Scale each feature to zero mean and unit range, in place."""
        for col in X.T:
            fmean = np.mean(col)
            frange = np.amax(col) - np.amin(col)
            # Centre the column ...
            col -= fmean
            # ... then scale it by its own range
            col /= frange
        return X

    def add_bias(self, X):
        """Prepend a column of ones so the intercept is learned as a weight."""
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X, y, learning_rate=0.01, iters=50000, log=True):
        """
        Fit model coefficients by batch gradient descent.

        Arguments:
        X: 2D numpy array
        y: 1D numpy array
        """
        # Initialize the weights
        self._coef = np.zeros(X.shape[1])

        for i in range(iters):
            # 1. Get predictions
            predictions = self.predict(X)
            # 2. Calculate cost for auditing purposes
            cost = self.cost_function(predictions, y)
            # 3. Calculate error/loss
            error = predictions - y
            # 4. Gradient descent step on the coefficients
            self._coef = self._coef - (learning_rate / X.shape[0]) * np.dot(X.T, error)

            if log and i % 10000 == 0:
                print("iter: " + str(i) + " cost: " + str(cost))


if __name__ == "__main__":
    boston = load_boston()
    X = boston.data
    y = boston.target
    model = LinearReg()
    features = model.add_bias(model.normalize(X))
    model.fit(features, y)
    predict_result = model.predict(features)
    plt.scatter(y, predict_result)
    plt.xlabel(r"Prices: $Y_i$")
    plt.ylabel(r"Predicted prices: $\hat{Y}_i$")
    plt.title(r"Prices vs Predicted prices: $Y_i$ vs $\hat{Y}_i$")
    plt.show()
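
Because this is ordinary least squares, the gradient-descent coefficients can be cross-checked against the closed-form solution. A minimal self-contained sketch, assuming the same pre-1.2 scikit-learn Boston data and the same normalize-then-add-bias preprocessing as the script:

# Sketch: closed-form least squares as a reference point. After enough
# gradient descent iterations, model._coef should approach theta.
import numpy as np
from sklearn.datasets import load_boston  # requires scikit-learn < 1.2

data = load_boston()
# Same preprocessing as the script: per-column mean-centre and range-scale,
# then prepend a bias column of ones.
Z = (data.data - data.data.mean(axis=0)) / np.ptp(data.data, axis=0)
A = np.c_[np.ones(Z.shape[0]), Z]
theta, *_ = np.linalg.lstsq(A, data.target, rcond=None)
print(theta)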

supervised/logistic_scratch.py

+110
@@ -0,0 +1,110 @@
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix

class Logistic:

    def __init__(self, fit_intercept=True):
        self._coef = None
        self._intercept = None
        self._fit_intercept = fit_intercept

    @staticmethod
    def _sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def predict_proba(self, X):
        """
        Output: model probability f(x) = sigmoid(Xw)

        Arguments:
        X: 2D numpy array
        """
        z = np.dot(X, self._coef)
        return self._sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Predict class 1 where the probability reaches the threshold."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def cost_function(self, predictions, y):
        """Binary cross-entropy (log loss), averaged over the samples."""
        return (-1 / y.shape[0]) * np.sum(
            y * np.log(predictions) + (1 - y) * np.log(1 - predictions))

    def normalize(self, X):
        """Scale each feature to zero mean and unit range, in place."""
        for col in X.T:
            fmean = np.mean(col)
            frange = np.amax(col) - np.amin(col)
            # Centre the column, then scale it by its own range
            col -= fmean
            col /= frange
        return X

    def add_bias(self, X):
        """Prepend a column of ones so the intercept is learned as a weight."""
        return np.c_[np.ones(X.shape[0]), X]

    def fit(self, X, y, learning_rate=0.01, iters=100000, log=True):
        """
        Fit model coefficients by batch gradient descent.

        Arguments:
        X: 2D numpy array
        y: 1D numpy array
        """
        self._coef = np.zeros(X.shape[1])
        for i in range(iters):
            # 1. Get predicted probabilities
            predictions = self.predict_proba(X)
            # 2. Calculate cost for auditing purposes
            cost = self.cost_function(predictions, y)
            # 3. Gradient of the log loss
            gradient = (1 / X.shape[0]) * np.dot(X.T, predictions - y)
            # 4. Step the coefficients by the learning rate
            self._coef = self._coef - learning_rate * gradient

            if log and i % 10000 == 0:
                print("iter: " + str(i) + " cost: " + str(cost))

    def plot_boundary(self, X, y):
        plt.figure(figsize=(10, 6))
        plt.scatter(X[y == 0][:, 0], X[y == 0][:, 1], color='b', label='0')
        plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1], color='r', label='1')
        plt.legend()
        x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
        x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
        xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
        grid = np.c_[xx1.ravel(), xx2.ravel()]
        # Probability over the grid; the 0.5 contour is the decision boundary
        probs = self.predict_proba(grid).reshape(xx1.shape)
        plt.contour(xx1, xx2, probs, [0.5], linewidths=1, colors='green')


if __name__ == "__main__":
    iris = load_iris()
    # Two features, binary target: setosa (0) vs the rest (1)
    X = iris.data[:, :2]
    y = (iris.target != 0) * 1
    model = Logistic()
    model.fit(X, y)
    y_predict = model.predict(X)
    print(confusion_matrix(y, y_predict))
    model.plot_boundary(X, y)
    plt.show()
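
One caveat on the sigmoid: 1 / (1 + np.exp(-z)) overflows inside np.exp for large negative z and triggers a NumPy warning. A common numerically stable variant only ever exponentiates non-positive values; stable_sigmoid below is a hypothetical drop-in sketch, not part of the class above:

import numpy as np

def stable_sigmoid(z):
    """Sigmoid without overflow: exp is only called on non-positive values."""
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z)
    pos = z >= 0
    # For z >= 0, exp(-z) <= 1, so the usual form is safe
    out[pos] = 1 / (1 + np.exp(-z[pos]))
    # For z < 0, rewrite as exp(z) / (1 + exp(z)), where exp(z) < 1
    expz = np.exp(z[~pos])
    out[~pos] = expz / (1 + expz)
    return out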

unsupervised/kmeans_scratch.py

+110
@@ -0,0 +1,110 @@
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs

class Kmeans():
    """
    Parameters:
    k: int
        The number of clusters the algorithm will form.
    max_iterations: int
        The maximum number of iterations the algorithm will run for
        if it does not converge before then.
    """

    def __init__(self, k=2, max_iterations=500):
        self.k = k
        self.max_iterations = max_iterations

    def _init_random_centroids(self, X):
        """Initialize the centroids as k random samples of X."""
        n_samples, n_features = np.shape(X)
        centroids = np.zeros((self.k, n_features))
        # Sample without replacement so no two centroids start identical
        for i, idx in enumerate(np.random.choice(n_samples, self.k, replace=False)):
            centroids[i] = X[idx]
        return centroids

    def _euclidean_distance(self, x, y):
        """
        x = vector 1
        y = vector 2
        """
        return np.sqrt(np.sum((x - y) ** 2))

    def _closest_centroid(self, sample, centroids):
        """Return the index of the closest centroid to the sample."""
        closest_i = 0
        closest_dist = float("inf")
        for i, centroid in enumerate(centroids):
            distance = self._euclidean_distance(sample, centroid)
            if distance < closest_dist:
                closest_i = i
                closest_dist = distance
        return closest_i

    def _create_clusters(self, centroids, X):
        """Assign the samples to the closest centroids to create clusters;
        return clusters as lists of sample indices.
        """
        clusters = [[] for _ in range(self.k)]
        for sample_i, sample in enumerate(X):
            centroid_i = self._closest_centroid(sample, centroids)
            clusters[centroid_i].append(sample_i)
        return clusters

    def _calculate_centroids(self, clusters, X):
        """Calculate new centroids as the means of the samples in each cluster."""
        n_features = np.shape(X)[1]
        centroids = np.zeros((self.k, n_features))
        for i, cluster in enumerate(clusters):
            centroids[i] = np.mean(X[cluster], axis=0)
        return centroids

    def _get_cluster_labels(self, clusters, X):
        """Label each sample with the index of its cluster."""
        # One prediction for each sample
        y_pred = np.zeros(np.shape(X)[0])
        for cluster_i, cluster in enumerate(clusters):
            for sample_i in cluster:
                y_pred[sample_i] = cluster_i
        return y_pred

    def predict(self, X):
        """Do K-Means clustering and return cluster indices."""
        # Initialize centroids as k random samples from X
        centroids = self._init_random_centroids(X)

        # Iterate until convergence or for max_iterations
        for _ in range(self.max_iterations):
            # Assign samples to closest centroids (create clusters)
            clusters = self._create_clusters(centroids, X)
            # Save current centroids for convergence check
            prev_centroids = centroids
            # Calculate new centroids from the clusters
            centroids = self._calculate_centroids(clusters, X)
            # If no centroid has moved => convergence
            diff = centroids - prev_centroids
            if not diff.any():
                break

        return self._get_cluster_labels(clusters, X)


if __name__ == "__main__":
    X, y = make_blobs(n_samples=800, centers=4, cluster_std=0.7)
    model = Kmeans(k=4)
    y_pred = model.predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap="viridis")
    plt.show()
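
For comparison, scikit-learn's KMeans solves the same problem with k-means++ initialization and several restarts. A minimal sketch on the same kind of blobs; cluster label numbering is arbitrary, so compare the groupings rather than the raw label values:

# Sketch: clustering the same blobs with scikit-learn's KMeans
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=800, centers=4, cluster_std=0.7)
labels = KMeans(n_clusters=4, n_init=10).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap="viridis")
plt.show()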
