
I want to try out Spark ML in Eclipse, and first I have to do some data manipulation; the code below shows that part. When I run it from the Eclipse/Spark IDE I get a "bad symbolic reference" error.

My code:

package org.test.spark 


import org.apache.spark.SparkConf 
import org.apache.spark.SparkContext 

import org.apache.spark.ml.classification.RandomForestClassifier 
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 
import org.apache.spark.ml.feature.StringIndexer 
import org.apache.spark.ml.feature.VectorAssembler 

import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 
import org.apache.spark.ml.{ Pipeline, PipelineStage } 


import org.apache.spark.rdd.RDD 

import org.apache.spark.sql._ 

object DataTest{ 

    import scala.reflect.runtime.universe.TypeTag 

    case class Credit(
      creditability: Double,
      balance: Double, duration: Double, history: Double, purpose: Double, amount: Double,
      savings: Double, employment: Double, instPercent: Double, sexMarried: Double, guarantors: Double,
      residenceDuration: Double, assets: Double, age: Double, concCredit: Double, apartment: Double,
      credits: Double, occupation: Double, dependents: Double, hasPhone: Double, foreign: Double
    )

    def main(args: Array[String]) = { 

    //Start the Spark context 
    val conf = new SparkConf() 
     .setAppName("DataTest") 
     .setMaster("local") 
    val sc = new SparkContext(conf) 

    val sqlContext= new org.apache.spark.sql.SQLContext(sc) 

    import sqlContext.implicits._ 


    // function to create a Credit class from an Array of Double 
    def parseCredit(line: Array[Double]): Credit = {
      Credit(
        line(0),
        line(1) - 1, line(2), line(3), line(4), line(5),
        line(6) - 1, line(7) - 1, line(8), line(9) - 1, line(10) - 1,
        line(11) - 1, line(12) - 1, line(13), line(14) - 1, line(15) - 1,
        line(16) - 1, line(17) - 1, line(18) - 1, line(19) - 1, line(20) - 1
      )
    }

    // function to transform an RDD of Strings into an RDD of Array[Double]
    def parseRDD(rdd: RDD[String]): RDD[Array[Double]] = {
      rdd.map(_.split(",")).map(_.map(_.toDouble))
    }

    val creditDF= parseRDD(sc.textFile("germancredit.csv")).map(parseCredit).toDF().cache() 

    creditDF.registerTempTable("credit") 

    creditDF.printSchema 

    creditDF.show 

    creditDF.groupBy("creditability").avg("balance").show 

    sqlContext.sql("SELECT creditability, avg(balance) as avgbalance, avg(amount) as avgamt, avg(duration) as avgdur FROM credit GROUP BY creditability ").show 

    //define the feature columns to put in the feature vector 
    val featureCols = Array("balance", "duration", "history", "purpose", "amount", 
    "savings", "employment", "instPercent", "sexMarried", "guarantors", 
    "residenceDuration", "assets", "age", "concCredit", "apartment", 
    "credits", "occupation", "dependents", "hasPhone", "foreign") 

    // set the input and output column names
    val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")

    // return a dataframe with all of the feature columns in a vector column
    val df2 = assembler.transform(creditDF)

    // the transform method produced a new column: features
    df2.show

    sc.stop 



    } 

} 

When I run mvn clean install I get the following:

error: bad symbolic reference. A signature in PipelineStage.class refers to term internal in package org.apache.spark which is not available. It may be completely missing from the current classpath, or the version on the classpath might be incompatible with the version used when compiling PipelineStage.class. val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")

It seems the problem comes from the call val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features").

When I run mvn clean package I get:

Failed to execute goal org.scala-tools:maven-scala-plugin:2.15.2:compile (default) on project spark: wrap: org.apache.commons.exec.ExecuteException: Process exited with an error: 1(Exit value: 1) -> [Help 1]

My pom.xml file:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 
    <modelVersion>4.0.0</modelVersion> 
    <groupId>org.test</groupId> 
    <artifactId>spark</artifactId> 
    <version>0.0.1-SNAPSHOT</version> 

    <pluginRepositories> 
     <pluginRepository> 
      <id>scala-tools.org</id> 
      <name>Scala-tools Maven2 Repository</name> 
      <url>http://scala-tools.org/repo-releases</url> 
     </pluginRepository> 
    </pluginRepositories> 


    <dependencies> 
     <dependency> 
      <groupId>org.apache.spark</groupId> 
      <artifactId>spark-core_2.10</artifactId> 
      <version>1.6.0</version> 
     </dependency> 

     <dependency> 
      <groupId>org.apache.spark</groupId> 
      <artifactId>spark-sql_2.10</artifactId> 
      <version>1.6.0</version> 
     </dependency> 


     <dependency> 
      <groupId>org.apache.spark</groupId> 
      <artifactId>spark-mllib_2.10</artifactId> 
      <version>2.0.1</version> 
     </dependency> 

     <dependency> 
      <groupId>org.apache.spark</groupId> 
      <artifactId>spark-mllib-local_2.10</artifactId> 
      <version>2.0.1</version> 
     </dependency> 

     <dependency> 
      <groupId>com.databricks</groupId> 
      <artifactId>spark-csv_2.10</artifactId> 
      <version>1.2.0</version> 
     </dependency> 

    </dependencies> 

    <build> 
     <plugins> 
      <!-- mixed scala/java compile --> 
      <plugin> 
       <groupId>org.scala-tools</groupId> 
       <artifactId>maven-scala-plugin</artifactId> 
        <version>2.15.2</version> 
       <executions> 

        <execution> 
         <id>compile</id> 
         <goals> 
          <goal>compile</goal> 
         </goals> 
         <phase>compile</phase> 
        </execution> 

        <execution> 
         <id>test-compile</id> 
         <goals> 
          <goal>testCompile</goal> 
         </goals> 
         <phase>test-compile</phase> 
        </execution> 


        <execution> 
         <phase>process-resources</phase> 
         <goals> 
          <goal>compile</goal> 
         </goals> 
        </execution> 

       </executions> 
      </plugin> 


      <plugin> 
       <artifactId>maven-compiler-plugin</artifactId> 
        <version>3.6.1</version> 
       <configuration> 
        <source>1.7</source> 
        <target>1.7</target> 
       </configuration> 
      </plugin> 
      <!-- for fatjar --> 
      <plugin> 
       <groupId>org.apache.maven.plugins</groupId> 
       <artifactId>maven-assembly-plugin</artifactId> 
       <version>2.4</version> 
       <configuration> 
        <descriptorRefs> 
         <descriptorRef>jar-with-dependencies</descriptorRef> 
        </descriptorRefs> 
       </configuration> 
       <executions> 
        <execution> 
         <id>assemble-all</id> 
         <phase>package</phase> 
         <goals> 
          <goal>single</goal> 
         </goals> 
        </execution> 
       </executions> 
      </plugin> 
      <plugin> 
       <groupId>org.apache.maven.plugins</groupId> 
       <artifactId>maven-jar-plugin</artifactId> 
       <version>3.0.2</version> 
       <configuration> 
        <archive> 
         <manifest> 
          <addClasspath>true</addClasspath> 
          <mainClass>fully.qualified.MainClass</mainClass> 
         </manifest> 
        </archive> 
       </configuration> 
      </plugin> 
     </plugins> 
     <pluginManagement> 
      <plugins> 
       <!--This plugin's configuration is used to store Eclipse m2e settings 
        only. It has no influence on the Maven build itself. --> 
       <plugin> 
        <groupId>org.eclipse.m2e</groupId> 
        <artifactId>lifecycle-mapping</artifactId> 
        <version>1.0.0</version> 
        <configuration> 
         <lifecycleMappingMetadata> 
          <pluginExecutions> 
           <pluginExecution> 
            <pluginExecutionFilter> 
             <groupId>org.scala-tools</groupId> 
             <artifactId> 
              maven-scala-plugin 
             </artifactId> 
             <versionRange> 
              [2.15.2,) 
             </versionRange> 
             <goals> 
              <goal>compile</goal> 
              <goal>testCompile</goal> 
             </goals> 
            </pluginExecutionFilter> 
            <action> 
             <execute></execute> 
            </action> 
           </pluginExecution> 
          </pluginExecutions> 
         </lifecycleMappingMetadata> 
        </configuration> 
       </plugin> 
      </plugins> 
     </pluginManagement> 
</build> 


</project> 

Any advice on how I can fix this error would be very helpful, thanks.

Answer


Try to always pull in everything from the org.apache.spark group at the same version.

<dependency> 
     <groupId>org.apache.spark</groupId> 
     <artifactId>spark-sql_2.10</artifactId> 
     <version>1.6.0</version> 
    </dependency> 
    <dependency> 
     <groupId>org.apache.spark</groupId> 
     <artifactId>spark-mllib_2.10</artifactId> 
     <version>2.0.1</version> 
    </dependency> 

Here some artifacts are on version 1.6.0 and others on 2.0.1 - the Spark artifacts all need to be on the same version. The org.apache.spark.internal package that PipelineStage.class refers to only exists in Spark 2.x, so compiling against spark-mllib 2.0.1 while spark-core/spark-sql 1.6.0 are on the classpath produces exactly this "bad symbolic reference" error.
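For example, an aligned set of dependencies could look like the sketch below. It assumes you stay on Spark 1.6.0 with Scala 2.10; spark-mllib-local is dropped because that artifact only exists from Spark 2.0 onwards, and spark-csv is a separate Databricks artifact that keeps its own version:

    <!-- all org.apache.spark artifacts pinned to the same Spark version and Scala suffix -->
    <dependency>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-core_2.10</artifactId>
     <version>1.6.0</version>
    </dependency>
    <dependency>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-sql_2.10</artifactId>
     <version>1.6.0</version>
    </dependency>
    <dependency>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-mllib_2.10</artifactId>
     <version>1.6.0</version>
    </dependency>
    <!-- spark-csv is not an org.apache.spark artifact, so it is versioned independently -->
    <dependency>
     <groupId>com.databricks</groupId>
     <artifactId>spark-csv_2.10</artifactId>
     <version>1.2.0</version>
    </dependency>

Alternatively, raise spark-core and spark-sql to 2.0.1 to match spark-mllib; the key point is that every org.apache.spark artifact uses the same version and the same Scala suffix (here _2.10).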
