2017-03-06

I am new to xgboost and am trying to do the following: use XGBoost in Python to predict an output variable based on multiple input variables. Specifically:

  1. Predict the output variable using the input variables.
  2. Find out which input variables have the strongest correlation (the best relationship) with the output variable.

I am not able to get proper results for either 1 or 2. I am a beginner with xgboost, so please help me out here. Thanks in advance.

References: (Jason Brownlee's blog, Kaggle)

CODE:

import pandas as pd 
from sklearn import preprocessing 
import numpy as np 
import xgboost as xgb 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import GridSearchCV, train_test_split 

def main(): 
    #Removing the blank fields and filling with mean values 
    def xls_to_csv(): 
        df = pd.read_excel(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_S2-2017-02-14-103304-836.xlsx") 
        df.drop(['aggregation','lot','____________________wafer','wafer','lot wafer'], axis=1, inplace=True) 
        df_1 = df.apply(lambda x: x.fillna(x.mean()), axis=0) 
        df_1.to_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute.csv", index=False) 

    #xls_to_csv() 
    #Applying normalization 
    df1 = pd.read_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute.csv") 
    for feature in df1.columns:  # Loop through all columns in the dataframe 
        if df1[feature].dtype == 'object':  # Only apply to columns with categorical strings 
            df1[feature] = pd.Categorical(df1[feature]).codes 
    df2 = (df1 - df1.mean())/df1.std() 
    df2 = df2.dropna(axis=1, how='all') 

    df2.to_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute_after_nml.csv", index=False) 

    def get_data(): 
        train = pd.read_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_after_impute_after_nml.csv") 
        y_train = train.pop('7;IDDQ_IPD;tested_pct;sbin')  # target column 
        features = train.columns 
        x_train = train[features] 
        return features, x_train, y_train 

    features, x_train, y_train = get_data() 

    # Note: x and y are split in two separate calls here, so each call shuffles independently 
    final_train, final_test = train_test_split(x_train, test_size=0.2) 
    final_y_train, final_y_test = train_test_split(y_train, test_size=0.2) 

    #XGboost modelling starts here 
    xgdmat = xgb.DMatrix(final_train, final_y_train) # Create our DMatrix to make XGBoost more efficient 
    our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,'objective': 'reg:linear', 'max_depth':3, 'min_child_weight':1} # Grid Search CV optimized settings 

    final_gb = xgb.train(our_params, xgdmat, num_boost_round= 1000) 
    importances = final_gb.get_fscore() 
    importance_frame = pd.DataFrame({'Importance': list(importances.values()), 'Feature': list(importances.keys())}) 
    importance_frame.sort_values('Importance', inplace=True) 
    importance_frame.to_csv(r"C:\ML material - docs\L90\Abhijit's task\extraction-M1947-B3,B6andB5\S2\ML_9820G_PCMvsS2_PCM_FE_BE_ML_scores.csv", index=False) 

    # Analysing the test results 
    testdmat = xgb.DMatrix(final_test) 
    y_pred = final_gb.predict(testdmat) 
    print(y_pred, "\n", final_y_test) 

if __name__ == '__main__': 
    main() 

Answer


It's difficult to evaluate what you might be doing wrong without any error message or your input data/results to look at.

But without details of your specific case, my suggestion for you as an xgboost beginner is to try replicating a few demos/examples to better understand how it works.
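For example, a minimal end-to-end demo on synthetic data (a sketch only: the random data, parameter values, and number of rounds are placeholders, not recommendations for your dataset) could look like this:

import numpy as np 
import xgboost as xgb 
from sklearn.model_selection import train_test_split 

# Synthetic regression data: 500 rows, 10 input features 
rng = np.random.RandomState(0) 
X = rng.rand(500, 10) 
y = 3.0 * X[:, 0] - 2.0 * X[:, 1] + rng.normal(scale=0.1, size=500) 

# One joint call keeps the rows of X and y aligned 
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0) 

dtrain = xgb.DMatrix(X_tr, label=y_tr) 
dtest = xgb.DMatrix(X_te) 
params = {'eta': 0.1, 'max_depth': 3, 'objective': 'reg:linear'} 
model = xgb.train(params, dtrain, num_boost_round=100) 
print(model.predict(dtest)[:5])  # first few predictions 

Once a demo like this runs end to end, swapping in your own CSV makes it much easier to see which step breaks.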

A great blog/site for beginners is Jason Brownlee's. (Edit: just realized you mentioned him.)

An example case worked through by him can be found here: http://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

He also has some e-books on ML and xgboost, and they are very good material (better than most video courses).

My personal suggestion would be for you to build a model first and execute your code line by line, instead of wrapping everything in an application/function as you did. That makes it much easier to tell when and where a mistake happens.
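Stepping through your script that way would, for instance, surface one likely culprit: train_test_split is called twice, once on x_train and once on y_train, so the two calls shuffle independently and the feature rows no longer line up with the target rows. A sketch of the aligned version, reusing your variable names:

final_train, final_test, final_y_train, final_y_test = train_test_split( 
    x_train, y_train, test_size=0.2, random_state=0)  # one joint split keeps X and y aligned 

With mismatched rows the model is effectively trained on scrambled labels, which would explain poor results for both of your goals.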

Edit 2: Another good piece of advice is to use the python wrapper for xgboost (xgb.XGBClassifier), which is easier to understand/use in the first instance.
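Since your target column looks continuous, the regression counterpart of that wrapper would apply here; a minimal sketch, reusing the variable names from your script and with illustrative parameter values only:

import xgboost as xgb 

model = xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100)  # sklearn-style fit/predict API 
model.fit(final_train, final_y_train) 
preds = model.predict(final_test) 

After fitting, the wrapper also exposes feature_importances_, which speaks directly to your second goal of ranking the input variables.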

Hope that helps,