/**
 * Extracts the text content of every file under an HDFS directory with Apache Tika
 * and prints one line per file in the form "<file name> <extracted text>".
 */
object newpdf {

  /**
   * Entry point: reads the files under the configured HDFS path as binary streams,
   * runs [[tikaFunc]] on each of them and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "appName")
    try {
      val path = "hdfs://namenode2.aibl.net:8020/ABDF/akhilaajith/PF_knnmodel_1231480046927236/visualise1/model_points"
      val data = sc.binaryFiles(path)
      val rdd = data.map(x => tikaFunc(x))
      rdd.foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

  /**
   * Parses one binary file with Tika's auto-detecting parser.
   *
   * The content is read through the `PortableDataStream` supplied by `binaryFiles`.
   * The original code opened `new FileInputStream(a._1)`, but `a._1` is an
   * `hdfs://...` URI and `java.io.FileInputStream` can only read the local
   * filesystem, which produced `FileNotFoundException: hdfs:/... (No such file or directory)`.
   *
   * @param a pair of (file URI, lazily opened stream) as produced by `SparkContext.binaryFiles`
   * @return the file name and the trimmed extracted text, separated by a single space
   */
  def tikaFunc(a: (String, PortableDataStream)): String = {
    val (uri, portableStream) = a
    // Last path segment of the URI; works regardless of the scheme prefix.
    val fileName = uri.substring(uri.lastIndexOf('/') + 1)

    val myparser = new AutoDetectParser()
    // -1 is passed as the write limit (per the Tika docs this disables the limit).
    val handler = new WriteOutContentHandler(-1)
    val metadata = new Metadata()
    val context = new ParseContext()

    // open() goes through the Hadoop FileSystem API, so HDFS paths are readable.
    val stream: InputStream = portableStream.open()
    try {
      myparser.parse(stream, handler, metadata, context)
    } finally {
      // Close the stream even if parsing throws.
      stream.close()
    }

    val delimiter = " "
    Array(fileName, handler.toString.trim).mkString(delimiter)
  }
}
运行时在创建 InputStream 处报错。如何使用 Scala 在 Apache Tika 中传入 HDFS 路径?
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 1, localhost): java.io.FileNotFoundException: hdfs:/namenode2.aibl.net:8020/ABDF/akhilaajith/PF_knnmodel_1231480046927236/visualise1/model_points/part-00000 (No such file or directory)
怎么才能解决这个问题呢?
它说找不到 'hdfs:/namenode2.aibl.net:8020/ABDF/akhilaajith/PF_knnmodel_1231480046927236/visualise1/model_points/part-00000'。也许路径不正确? – dk14
路径是正确的。我在 HDFS 上检查过该文件,并且其中有内容。 – AkhilaV