I want to use lasso optimization on data with 950 samples and about 5000 features. The lasso objective is $\frac{1}{2 \, n_{\text{samples}}} \|y - Xw\|_2^2 + \alpha \|w\|_1$. When I run the minimization with an initial point, I get a completely different w, which is strange: lasso is convex, so the initialization should not affect the result. Below are the results of lasso with and without initialization. tol is the tolerance; convergence is declared when the change in w falls below it. Why do different initial points lead to different results for lasso optimization, which is a convex problem?
tol=0.00000001
##### lasso model errors #####
gene: 5478 matrix error: 0.069611732213
with initialization: alpha: 1e-20 promotion: -3.58847815733e-13
coef: [-0.00214732 -0.00509795 0.00272167 -0.00651548 -0.00164646 -0.00115342
0.00553346 0.01047653 0.00139832]
without initialization: alpha: 1e-20 promotion: -19.0735249749
coef: [-0.03650629 0.08992003 -0.01287155 0.03203973 0.1567577 -0.03708655
-0.13710957 -0.01252736 -0.21710334]
with initialization: alpha: 1e-15 promotion: 1.06179081478e-10
coef: [-0.00214732 -0.00509795 0.00272167 -0.00651548 -0.00164646 -0.00115342
0.00553346 0.01047653 0.00139832]
without initialization: alpha: 1e-15 promotion: -19.0735249463
coef: [-0.03650629 0.08992003 -0.01287155 0.03203973 0.1567577 -0.03708655
-0.13710957 -0.01252736 -0.21710334]
Warning (from warnings module):
File "/usr/local/lib/python2.7/site-packages/sklearn/linear_model/coordinate_descent.py", line 491
ConvergenceWarning)
ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
with initialization: alpha: 1e-10 promotion: 0.775144987537
coef: [-0.00185139 -0.0048819 0.00218349 -0.00622618 -0.00145647 -0.00115857
0.0055919 0.01072924 0.00043773]
without initialization: alpha: 1e-10 promotion: -17.8649603301
coef: [-0.03581581 0.0892119 -0..03151441 0.15606195 -0.03734093
-0.13604286 -0.01247732 -0.21233529]
with initialization: alpha: 1e-08 promotion: -5.87121366314
coef: [-0. 0. -0. -0.01064477 0. -0.00116167
-0. 0.01114746 0. ]
without initialization: alpha: 1e-08 promotion: 4.05593555389
coef: [ 0. 0.04505117 0.00668611 0. 0.07731668 -0.03537848
-0.03151995 0. -0.00310122]
max promote:
4.05593555389
For the implementation I used the Lasso class from the Python package sklearn.linear_model. I also swapped in different data, but the results on the new data change with the initialization as well. I find this strange, but I cannot analyze it and find an explanation.
Below is the part of my code that relates to lasso. My data is gene expression; I tested the code on both normalized and unnormalized data, and on both of them the results differ with the initial point.
alpha_lasso = [1e-20,1e-15, 1e-10, 1e-8, 1e-7,1e-6,1e-5,1e-4, 1e-3,1e-2, 1, 5 ,20]
lassoreg = Lasso(alpha=alpha_lasso[i],warm_start=True,tol=0.00000001,max_iter=100000)
lassoreg.coef_ = mybeta[:,j-c]
lassoreg.fit(train[:,predictors],train[:,y])
y_train_pred = lassoreg.predict(A)#train[:,predictors])
y_test_pred = lassoreg.predict(C)#test[:,predictors])
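For reference, here is a minimal, self-contained sketch of the comparison above on synthetic random data (the shapes and parameters mirror the question; the random starting point is an arbitrary assumption, not the actual mybeta):
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X = rng.randn(950, 5000)   # n_samples < n_features, as in the question
y = rng.randn(950)
alpha = 1e-10              # tiny alpha, as in the logs above

# with initialization: warm_start=True makes fit() start from self.coef_
init = Lasso(alpha=alpha, warm_start=True, tol=1e-8, max_iter=100000)
init.coef_ = 0.01 * rng.randn(5000)   # hypothetical starting point
init.fit(X, y)

# without initialization: coordinate descent starts from w = 0
plain = Lasso(alpha=alpha, tol=1e-8, max_iter=100000)
plain.fit(X, y)

print(np.abs(init.coef_ - plain.coef_).max())
With warm_start=True, fit() continues from whatever is currently in coef_, while a fresh estimator starts coordinate descent from zeros, so the two runs really do solve the same problem from different starting points.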
Here is my full code as well:
import pandas as pd
import random
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os
from GEOparse.GEOTypes import (GSE, GSM, GPL, GDS,
                               GDSSubset, GEODatabase,
                               DataIncompatibilityException,
                               NoMetadataException,
                               )
import GEOparse as GEO
import copy
import sys
import math
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoLars
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import coordinate_descent
from sklearn.linear_model import lasso_path, enet_path
from sklearn.base import BaseEstimator, RegressorMixin
from copy import deepcopy
miss_percent = 0.1
alpha_lasso = [1e-20,1e-15, 1e-10, 1e-8, 1e-7,1e-6,1e-5,1e-4, 1e-3,1e-2, 1, 5 ,20]
mins=[]
maxs=[]
mean_err=[]
alphas=[]
mins1=[]
maxs1=[]
mean_err1=[]
alphas1=[]
def getdata(percent):
    gsd = GEO.get_GEO(geo="GDS4971")
    ngsd = gsd.table.replace('null', np.NaN)
    ngsd = ngsd.dropna(axis=0, how='any')
    ngsd = ngsd.transpose()
    dataarray = ngsd.values
    data = np.delete(dataarray, [0, 1], 0)
    x = data.astype(np.float)
    r_df = x.shape[0]
    c_df = x.shape[1]
    r = int(r_df - math.sqrt((1 - percent) * r_df))
    c = int(c_df - math.sqrt((1 - percent) * c_df))
    train = x[0:r, :]
    test = x[r:r_df, :]
    return x, train, test, r_df, c_df, r, c
genedata,train,test,r_df,c_df,r,c = getdata(miss_percent)
predictors = range(0,c)
promotion =[[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
promotion = np.asmatrix(promotion)
#error of ax-b
error_aw_b = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_aw_b = np.asmatrix(error_aw_b)
#error of cw-x
error_cw_x = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_cw_x = np.asmatrix(error_cw_x)
#error of lasso function
error_lasso = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_lasso = np.asmatrix(error_lasso)
promotion1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
promotion1 = np.asmatrix(promotion1)
#error of ax-b
error_aw_b1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_aw_b1 = np.asmatrix(error_aw_b1)
#error of cw-x
error_cw_x1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_cw_x1 = np.asmatrix(error_cw_x1)
#error of lasso function
error_lasso1 = [[0.001 for x in range(len(alpha_lasso))] for y in range(c_df-c)]
error_lasso1 = np.asmatrix(error_lasso1)
mybeta = np.zeros((c, c_df - c))  # placeholder for "any initialization" of the warm-start coefficients
###################### LASSO #####################
print("##### lasso model errors #####")
for j in range(c, c+1):
    mean_err = []
    print("\n")
    y = j
    # errorC: residual matrix computed earlier in the full script (not shown here)
    eachMeanError = math.sqrt((np.power(errorC[:, j-c], 2)).sum() / (r_df - r))
    print("gene: " + str(j) + " matrix error: " + str(eachMeanError))
    for i in range(0, 4):  # len(alpha_lasso)
        lassoreg = Lasso(alpha=alpha_lasso[i], warm_start=True, tol=0.00000001, max_iter=100000)
        lassoreg.coef_ = mybeta[:, j-c]
        lassoreg.fit(train[:, predictors], train[:, y])
        y_train_pred = lassoreg.predict(A)  # train[:,predictors]
        y_test_pred = lassoreg.predict(C)  # test[:,predictors]
        # lasso objective: (1/(2n)) * ||y - Xw||_2^2 + alpha * ||w||_1
        # (1.0 forces float division under Python 2)
        y_lasso_func = (1.0/(2*r))*sum((y_train_pred-train[:, y])**2) + alpha_lasso[i]*sum(abs(lassoreg.coef_))
        ################## RMS ##################
        error_aw_b[j-c, i] = math.sqrt(sum((y_train_pred-train[:, y])**2)/r)
        error_lasso[j-c, i] = y_lasso_func
        error_cw_x[j-c, i] = math.sqrt(sum((y_test_pred-test[:, y])**2)/(r_df-r))
        mins.extend([error_cw_x.min()])
        maxs.extend([error_cw_x.max()])
        promotion[j-c, i] = ((eachMeanError-error_cw_x[j-c, i])/eachMeanError)*100
        print("alpha: "+str(alpha_lasso[i])+" error_aw_b: "+str(error_aw_b[j-c, i])+" error_cw_x: "+str(error_cw_x[j-c, i])+" error_lasso: "+str(error_lasso[j-c, i])+" promotion: "+str(promotion[j-c, i]))
        print("coef: " + str(lassoreg.coef_[1:10]))
        lassoreg1 = Lasso(alpha=alpha_lasso[i], tol=0.00000001, max_iter=100000)
        lassoreg1.fit(train[:, predictors], train[:, y])
        y_train_pred1 = lassoreg1.predict(A)  # train[:,predictors]
        y_test_pred1 = lassoreg1.predict(C)  # test[:,predictors]
        y_lasso_func1 = (1.0/(2*r))*sum((y_train_pred1-train[:, y])**2) + alpha_lasso[i]*sum(abs(lassoreg1.coef_))
        ################## RMS ##################
        error_aw_b1[j-c, i] = math.sqrt(sum((y_train_pred1-train[:, y])**2)/r)
        error_lasso1[j-c, i] = y_lasso_func1
        error_cw_x1[j-c, i] = math.sqrt(sum((y_test_pred1-test[:, y])**2)/(r_df-r))
        mins1.extend([error_cw_x1.min()])
        maxs1.extend([error_cw_x1.max()])
        promotion1[j-c, i] = ((eachMeanError-error_cw_x1[j-c, i])/eachMeanError)*100
        print("alpha: "+str(alpha_lasso[i])+" error_aw_b: "+str(error_aw_b1[j-c, i])+" error_cw_x: "+str(error_cw_x1[j-c, i])+" error_lasso: "+str(error_lasso1[j-c, i])+" promotion: "+str(promotion1[j-c, i]))
        print("coef: " + str(lassoreg1.coef_[1:10]))
        print("\n")
    print("max promote:")
    print(promotion[j-c, :].max())
    f = open('analyse_col', 'wb')
    np.save(f, [promotion, alphas, error_cw_x, mins, maxs])
    f.close()
    plt.plot(promotion[:, j-c])
    plt.ylabel('promotion (%)')
    plt.xlabel('each gene')
    plt.show()
Thank you very much; your answer and the paper were very useful. I know that in some cases it does not converge, but in some cases it does. The number of iterations is 100000, and convergence means the change in the coefficients is less than 0.00000001. In the cases where it does converge, I think it should be close to the optimal point and we should not see a big difference. I think the problem is the conditioning you mentioned in the first part of your answer. – shadi
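Since the converged and non-converged runs behave differently here, a small helper like the one below can separate the two cases per alpha. This is a sketch, not code from the post; it assumes a scikit-learn version that exposes sklearn.exceptions.ConvergenceWarning and the fitted n_iter_ attribute:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso

def fit_and_report(X, y, alpha):
    # Fit with the same tol/max_iter as the question and record whether
    # coordinate descent raised a ConvergenceWarning.
    model = Lasso(alpha=alpha, tol=1e-8, max_iter=100000)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", ConvergenceWarning)
        model.fit(X, y)
    converged = not any(issubclass(w.category, ConvergenceWarning) for w in caught)
    print("alpha=%g converged=%s n_iter=%d" % (alpha, converged, model.n_iter_))
    return model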
The problem is probably not the dimensionality (p > n) and not the algorithm, but bad scaling. You probably did not preprocess your problem, or you are using some very strange parameters. A simple norm(new_x - old_x) – sascha
I actually tested lasso on both normalized and unnormalized data; my data is gene expression. In the unnormalized form the range is between 6 and 14; in the normalized form it is between 0 and 1. I have updated the post with the code. What do you mean by preprocessing? Do you mean normalization? – shadi
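Note that mapping values into [0, 1] (min-max scaling) is not the same as standardizing each feature to zero mean and unit variance, which is what "preprocessing" usually refers to in this context. A sketch of the latter (X_train, X_test, y_train are hypothetical names, not variables from the post):
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)  # learn mean/std on the training data only
X_test_s = scaler.transform(X_test)        # reuse the training statistics on the test data

model = Lasso(alpha=1e-4, tol=1e-8, max_iter=100000)
model.fit(X_train_s, y_train)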