2017-07-02 77 views
0

从头开始在python中实现多重线性回归。每个时代之后的成本增长非常迅速,最终溢出。发生了什么问题?有没有逻辑错误或方法错误? 下面是代码:成本函数在梯度下降实现中不减少

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

class LinearRegression: 

    def __init__(self): 
     #initialize parameters with none values 
     self.cost_history = None 
     self.std_dev = None 
     self.mean = None 
     self.weights = None 

    def set_mean_and_std_dev(self, X_data, axis=0): 
     #mean and std_deviation of training_data 
     self.std_dev = np.array(np.std(X_data, axis = axis)) 
     self.mean = np.array(np.mean(X_data, axis = axis)) 
     print("Mean : ", self.mean) 
     print("Standard deviation : ", self.std_dev) 
     return 

    def normalize(self, X_data): 
     #normalizing the data 
     X_data = (X_data - self.mean)/(self.std_dev+0.00001) 
     return 

    def cost(self, X, y): 
     #squared error based cost 
     return np.sum((np.dot(X, self.weights.T)-y)**2) 

    def get_gradient(self, X, y, pred_y, learning_rate): 
     grad = np.zeros(shape=(X.shape[1],), dtype='float64') 
     #print("X.shape : ", X.shape, " grad.shape : ", grad.shape, "pred_y.shape", pred_y.shape, "y.shape : ", y.shape) 

     for ix in range(X.shape[0]): 
      #for each example in X 
      x_example = X[ix, :] 
      error = (pred_y[ix] - y[ix]) 

      for jx in range(grad.shape[0]): 
       #for each feature of X 
       grad[jx] = grad[jx] + x_example[jx]*error 
     for ix in range(grad.shape[0]): 
      #divide by the number of examples 
      grad[jx] = grad[jx]*learning_rate*(1.0/X.shape[0]) 
     return grad 


    def gradient_descent(self, X, y, learning_rate=0.001, num_iterations=100): 

     self.weights = np.zeros(shape=(X.shape[1],), dtype='float64') 
     for ix in range(X.shape[1]): 
      #initialize weights with random values 
      self.weights[ix] = np.random.rand()*100 

     #initialize cost history 
     self.cost_history = [] 

     for ix in range(num_iterations): 
      pred_y = np.dot(X, self.weights.T) 
      gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate) 
      self.weights = self.weights - gradient 
      cost = self.cost(X, y) 
      self.cost_history.append(cost) 

      if ix%10 == 0: 
       print("(learning_rate/(X.shape[0]))*gradient) : ", gradient)    
       print("Cost at ", ix, " epoch is : ", cost) 

    def predict(self, X): 
     return np.dot(X, self.weights.T) 

    def get_weights(self): 
     return self.weights 

#create a linear regression object 
lr = LinearRegression() 


# # Some random function generator 
# y = 2.5 + 2*X1 - X2 - 3*X3 + 1.23*X4 
def generate_data(X): 
    y = np.zeros(shape=(X.shape[0],), dtype='float64') 
    for ix in range(X.shape[0]): 
     y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3] 
    return y 


X = np.zeros(shape=(300, 5), dtype='float64') 
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)] 
X[:, 1:] = np.array(data_gen, dtype='float64') 

y = generate_data(X[:, 1:]) 

lr.set_mean_and_std_dev(X) 
lr.normalize(X) 

X[:, 0] = 1 
print(X.shape, y.shape, X.dtype, y.dtype) 
print(X[0], y[0]) 



X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:] 
print(y_test) 


lr.gradient_descent(X_train, y_train, 0.01, 500) 
pred_y = lr.predict(X_test) 
+0

你试过一个较小的学习率是多少? –

+0

是的,我也尝试过0.00001和0.0000001,但它一直在拍摄。 –

+0

我认为最好的方法是拍摄二维数据集(300 x 2),并跟踪梯度值,权重和成本。它应该帮助你挑出错误。它也将有助于与你可以在纸上解决的例子。 –

回答

1

有一个在功能get_gradient一个错误。 指数jx代码块for ix用于:

for ix in range(grad.shape[0]): 
    #divide by the number of examples 
    grad[jx] = grad[jx]*learning_rate*(1.0/X.shape[0]) 

而且你写太多for循环。 相反,你可以试试这个为get_gradient功能:

def get_gradient(self, X, y, pred_y, learning_rate): 
    errors = (pred_y - y) 
    grad = X.T.dot(errors) 
    #divide by the number of examples 
    grad = grad*learning_rate/X.shape[0] 
    return grad 

其他问题:

  1. 权重的初始值过大:

    self.weights = np.zeros(shape=(X.shape[1],), dtype='float64') 
    for ix in range(X.shape[1]): 
        #initialize weights with random values 
        self.weights[ix] = np.random.rand()*100 
    

    你可以用较小的初始值并只使用一条线:self.weights = np.random.rand(X.shape[1])

  2. 学习速率过大,请尝试:lr.gradient_descent(X_train, y_train, 0.00001, 3000)

  3. 好像你想要做的事与这两个功能:set_mean_and_std_devnormalize。 但是他们在这段代码中什么都没做。

最终代码:

import numpy as np 

class LinearRegression: 

    def __init__(self): 
     #initialize parameters with none values 
     self.cost_history = None 
     self.std_dev = None 
     self.mean = None 
     self.weights = None 

    def cost(self, X, y): 
     #squared error based cost 
     return np.sum((self.predict(X)-y)**2) 

    def get_gradient(self, X, y, pred_y, learning_rate): 
     errors = (pred_y - y) 
     grad = X.T.dot(errors) 
     #divide by the number of examples 
     grad = grad*learning_rate/X.shape[0] 
     return grad 


    def gradient_descent(self, X, y, 
         learning_rate=0.001, 
         num_iterations=100): 
     #initialize weights with random values 
     self.weights = np.random.rand(X.shape[1]) 
     #initialize cost history 
     self.cost_history = [] 

     for ix in range(num_iterations): 
      pred_y = self.predict(X) 
      gradient = self.get_gradient(X=X, y=y, pred_y=pred_y, learning_rate=learning_rate) 
      self.weights = self.weights - gradient 
      cost = self.cost(X, y) 
      self.cost_history.append(cost) 

      if ix%10 == 0: 
       print("(learning_rate/(X.shape[0]))*gradient) : ", gradient) 
       print("Cost at ", ix, " epoch is : ", cost) 

    def predict(self, X): 
     return np.dot(X, self.weights) 


#create a linear regression object 
lr = LinearRegression() 

# # Some random function generator 
# y = 2.5 + 2*X1 - X2 - 3*X3 + 1.23*X4 
def generate_data(X): 
    y = np.zeros(shape=(X.shape[0],), dtype='float64') 
    for ix in range(X.shape[0]): 
     y[ix] = 1.23 + 2.5*X[ix, 0] + 2*X[ix, 1] - X[ix, 2] - 3*X[ix, 3] 
    return y 


X = np.zeros(shape=(300, 5), dtype='float64') 
data_gen = [[np.random.rand()*107 for jx in range(4)] for ix in range(300)] 
X[:, 1:] = np.array(data_gen, dtype='float64') 

y = generate_data(X[:, 1:]) 

X[:, 0] = 1 
print(X.shape, y.shape, X.dtype, y.dtype) 
print(X[0], y[0]) 

X_train, X_test, y_train, y_test = X[:200], X[200:], y[:200], y[200:] 
print(y_test) 

lr.gradient_descent(X_train, y_train, 0.00001, 3000) 
pred_y = lr.predict(X_test)