-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSimpleLinearRegression.py
More file actions
101 lines (76 loc) · 2.74 KB
/
SimpleLinearRegression.py
File metadata and controls
101 lines (76 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pickle
class SimpleLinearRegression:
    """Univariate linear regression y = theta_0 + theta_1 * x fitted by gradient descent.

    Training min-max normalizes both x and y to [0, 1] for numeric stability,
    then denormalizes the learned parameters so ``predict`` works in the
    original units of the data.
    """

    def __init__(self):
        # theta_0: bias, theta_1: slope, r2_score: R^2 of the last fit.
        self.theta_0 = 0.0
        self.theta_1 = 0.0
        self.r2_score = 0.0

    def fit(self, x: np.ndarray, y: np.ndarray, learning_rate=0.1, iterations=10000, precision=1e-6, visualizer=False):
        """Fit theta_0 and theta_1 to (x, y) with batch gradient descent.

        Args:
            x: 1-D array of input values.
            y: 1-D array of target values, same length as ``x``.
            learning_rate: descent step size (applied on normalized data).
            iterations: maximum number of descent steps.
            precision: stop early once both parameter updates are below this.
            visualizer: if True, live-plot the fitted line with matplotlib.

        Raises:
            ValueError: if the lengths differ, or either array has no variance
                (min == max would make the normalization divide by zero).
        """
        if len(x) != len(y):
            raise ValueError("The dataset is invalid. Their length differ")
        norm_theta_0 = 0.0
        norm_theta_1 = 0.0
        # Min-max normalize so the fixed learning rate behaves consistently
        # regardless of the data's original scale.
        max_x = max(x)
        min_x = min(x)
        if max_x == min_x:
            raise ValueError("The dataset is invalid. The input data has no variance.")
        x = (x - min_x) / (max_x - min_x)
        max_y = max(y)
        min_y = min(y)
        if max_y == min_y:
            raise ValueError("The dataset is invalid. The output data has no variance.")
        y = (y - min_y) / (max_y - min_y)
        m = float(len(x))
        if visualizer:
            # One-time plot setup: data scatter plus the current fit line.
            x_line = np.array([min(x), max(x)])
            y_line = x_line * norm_theta_1 + norm_theta_0
            plt.ion()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(x, y, 'bx')
            line, = ax.plot(x_line, y_line, 'r')
        for _ in range(iterations):
            # Skip redrawing once the user closes the figure window.
            if visualizer and plt.fignum_exists(fig.number):
                y_line = x_line * norm_theta_1 + norm_theta_0
                line.set_ydata(y_line)
                fig.canvas.draw()
                fig.canvas.flush_events()
            y_pred = norm_theta_0 + (norm_theta_1 * x)
            # MSE gradient; both parameters are updated simultaneously from
            # the same prediction, as gradient descent requires.
            tmp_theta_0 = learning_rate / m * np.sum(y_pred - y)
            tmp_theta_1 = learning_rate / m * np.sum((y_pred - y) * x)
            norm_theta_0 -= tmp_theta_0
            norm_theta_1 -= tmp_theta_1
            if abs(tmp_theta_0) < precision and abs(tmp_theta_1) < precision:
                break  # converged: both updates are negligible
        # R^2 on the normalized data; R^2 is invariant under the linear
        # rescaling used above, so this equals the score in original units.
        y_mean = y.mean()
        y_pred = norm_theta_0 + (norm_theta_1 * x)
        self.r2_score = 1 - (np.sum((y - y_pred) ** 2) / np.sum((y - y_mean) ** 2))
        # Map the parameters learned on normalized data back to original units.
        self.theta_1 = norm_theta_1 * (max_y - min_y) / (max_x - min_x)
        # Fixed: original had a duplicated "self.theta_0 = self.theta_0 =" assignment.
        self.theta_0 = norm_theta_0 * (max_y - min_y) + min_y - self.theta_1 * min_x

    def predict(self, x: float) -> float:
        """Return the predicted y for ``x`` in original (denormalized) units."""
        return self.theta_0 + (self.theta_1 * x)

    def import_coef(self, file_name: str = '.coefs.pkl'):
        """Load theta_0, theta_1 and r2_score from a pickle file (in that order).

        SECURITY NOTE: pickle can execute arbitrary code during load — only
        use this on coefficient files you created yourself.
        """
        try:
            with open(file_name, 'rb') as file:
                self.theta_0 = pickle.load(file)
                self.theta_1 = pickle.load(file)
                self.r2_score = pickle.load(file)
        # Narrowed from a bare except: only I/O and deserialization failures
        # are expected here; anything else (e.g. KeyboardInterrupt) propagates.
        except (OSError, pickle.UnpicklingError, EOFError):
            print("No coefficients found, please train the model first or provide a valid file.")

    def export_coef(self, file_name: str = '.coefs.pkl'):
        """Write theta_0, theta_1 and r2_score to a pickle file (in that order)."""
        try:
            with open(file_name, 'wb') as file:
                pickle.dump(self.theta_0, file)
                pickle.dump(self.theta_1, file)
                pickle.dump(self.r2_score, file)
        # Narrowed from a bare except: only file/serialization errors expected.
        except (OSError, pickle.PicklingError):
            print("Error: Could not export to file.")