
Spark: value reduceByKey is not a member

After clustering some sparse vectors, I need to find the intersection vector within each cluster. To do this, I try to reduce the MLlib vectors as in the following example:

import org.apache.spark.SparkConf 
import org.apache.spark.SparkContext 
import org.apache.spark.mllib.clustering.KMeans 
import org.apache.spark.mllib.linalg.Vectors 

//For Sparse Vector 
import org.apache.spark.mllib.regression.LabeledPoint 
import org.apache.spark.mllib.util.MLUtils 
import org.apache.spark.rdd.RDD 
import org.apache.spark.mllib.linalg.{Vector, Vectors} 

object Recommend { 

    def main(args: Array[String]) { 
    // set up environment 
    val conf = new SparkConf() 
     .setAppName("Test") 
     .set("spark.executor.memory", "2g") 
    val sc = new SparkContext(conf) 

    // Some vectors 
    val vLen = 1800 
    val sv11: Vector = Vectors.sparse(vLen,Seq((100,1.0), (110,1.0), (120,1.0), (130, 1.0))) 
    val sv12: Vector = Vectors.sparse(vLen,Seq((100,1.0), (110,1.0), (120,1.0), (130, 1.0), (140, 1.0) )) 
    val sv13: Vector = Vectors.sparse(vLen,Seq((100,1.0), (120,1.0), (130,1.0))) 
    val sv14: Vector = Vectors.sparse(vLen,Seq((110,1.0), (130, 1.0))) 
    val sv15: Vector = Vectors.sparse(vLen,Seq((140, 1.0))) 

    val sv21: Vector = Vectors.sparse(vLen,Seq((200,1.0), (210,1.0), (220,1.0), (230, 1.0))) 
    val sv22: Vector = Vectors.sparse(vLen,Seq((200,1.0), (210,1.0), (220,1.0), (230, 1.0), (240, 1.0) )) 
    val sv23: Vector = Vectors.sparse(vLen,Seq((200,1.0), (220,1.0), (230,1.0))) 
    val sv24: Vector = Vectors.sparse(vLen,Seq((210,1.0), (230, 1.0))) 
    val sv25: Vector = Vectors.sparse(vLen,Seq((240, 1.0))) 

    val sv31: Vector = Vectors.sparse(vLen,Seq((300,1.0), (310,1.0), (320,1.0), (330, 1.0))) 
    val sv32: Vector = Vectors.sparse(vLen,Seq((300,1.0), (310,1.0), (320,1.0), (330, 1.0), (340, 1.0) )) 
    val sv33: Vector = Vectors.sparse(vLen,Seq((300,1.0), (320,1.0), (330,1.0))) 
    val sv34: Vector = Vectors.sparse(vLen,Seq((310,1.0), (330, 1.0))) 
    val sv35: Vector = Vectors.sparse(vLen,Seq((340, 1.0))) 

    val sparseData = sc.parallelize(Seq(
     sv11, sv12, sv13, sv14, sv15, 
     sv21, sv22, sv23, sv24, sv25, 
     sv31, sv32, sv33, sv34, sv35 
     )) 

    // Cluster the data into three clusters using KMeans 
    val numClusters = 3 
    val numIterations = 20 

    test(numClusters, numIterations, sparseData) 
    } 

    def test(numClusters:Int, numIterations:Int, 
     data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]) = { 

    val clusters = KMeans.train(data, numClusters, numIterations) 

    val predictions = data.map(v => (clusters.predict(v), v)) 

    predictions.reduceByKey((v1, v2) => v1) 

    } 
} 

The line predictions.reduceByKey((v1, v2) => v1) causes the error:

value reduceByKey is not a member of org.apache.spark.rdd.RDD[(Int, org.apache.spark.mllib.linalg.Vector)] 

What is the reason for this?

possible duplicate of [reduceByKey method not being found in Scala Spark](http://stackoverflow.com/questions/23943852/reducebykey-method-not-being-found-in-scala-spark) – 2015-03-03 14:13:15

Thank you for the fix)) – zork 2015-03-03 14:24:41

Answer


As you have already guessed, your code needs this import added:

import org.apache.spark.SparkContext._ 

Why? Because it brings in a number of implicit conversions; the important one for your case is the conversion to PairRDDFunctions. When the elements of your RDD are Tuples, Spark treats the left-hand component as the key, which gives you access to convenient pair transformations and actions such as reduceByKey.
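For the original goal of intersecting the vectors within each cluster, here is a minimal sketch of how the fixed code could look once the import is in scope. intersectSparse is a hypothetical helper assumed here (not part of the question or this answer), and predictions is the RDD[(Int, Vector)] built in the question's test method:

import org.apache.spark.SparkContext._ 
import org.apache.spark.mllib.linalg.{SparseVector, Vector, Vectors} 

// Hypothetical helper: keep only the indices that appear in both sparse vectors. 
def intersectSparse(a: Vector, b: Vector): Vector = { 
  val ia = a.asInstanceOf[SparseVector].indices.toSet 
  val ib = b.asInstanceOf[SparseVector].indices.toSet 
  val common = ia.intersect(ib).toSeq.sorted.map(i => (i, 1.0)) 
  Vectors.sparse(a.size, common) 
} 

// With the import above, predictions: RDD[(Int, Vector)] gains reduceByKey. 
val clusterIntersections = predictions.reduceByKey(intersectSparse) 
clusterIntersections.collect().foreach { case (cluster, v) => println(s"$cluster -> $v") } 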

Regards,
