Apache Nutch crawl problem: I am trying to fetch Google results for a search term given through the front end, crawl all the resulting URLs with Apache Nutch 1.13, and index them into Solr 6.5.
When I run this as a normal Java program, taking the input from the console, it works fine: I can see the crawled data in the index and can search it. But when I run the same code from a Java servlet, taking the input from the UI, I get the error below and cannot figure out what the problem is.
The error is thrown when my servlet runs the following script:
#!/bin/bash
NUTCH_HOME="/home/nutch1.13"
SOLR_HOME="/home/solr-6.5.0"
urls="/home/nutch1.13/urls/seed.txt"
crawldir="/home/nutch1.13/crawl"
NumRound=1
#clean the crawls
echo "Cleaning up..."
# bash check if directory exists
if [ -d "$crawldir" ]; then
    echo "crawldir Directory exists"
    rm -rf "$crawldir/crawldb"
    rm -rf "$crawldir/linkdb"
    rm -rf "$crawldir/segments"
else
    echo "Directory does not exist"
fi
#crawl the urls
echo "----- crawling urls-----"
#$NUTCH_HOME/bin/crawl $urls $crawldir $NumRound
#start the solr
#$SOLR_HOME/bin/solr start
#if [ -d "$SOLR_HOME/server/solr/$1" ]; then
#    echo "Core already exists"
#else
#    #create collection/core for solr
#    echo "----- create solr core-----"
#    $SOLR_HOME/bin/solr create -c $1
#fi
#index the crawl data
#echo "----- Index to solr-----"
#$NUTCH_HOME/bin/nutch solrindex http://localhost:8983/solr/$1 \
#    $crawldir/crawldb -linkdb $crawldir/linkdb $crawldir/segments/*
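For reference, the console run that works is just these same steps by hand; a sketch with the same paths as the script above (<corename> is a placeholder for the core name the script receives as $1):

/home/nutch1.13/bin/crawl /home/nutch1.13/urls/seed.txt /home/nutch1.13/crawl 1
/home/nutch1.13/bin/nutch solrindex http://localhost:8983/solr/<corename> /home/nutch1.13/crawl/crawldb -linkdb /home/nutch1.13/crawl/linkdb /home/nutch1.13/crawl/segments/*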
Also, my servlet class is as follows:
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Servlet implementation class
 */
@WebServlet("/UrlMapping")
public class Driver extends HttpServlet {
    private static final long serialVersionUID = 1L;
    public static final String GOOGLE_SEARCH_URL = "https://www.google.com/search";

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        response.setContentType("text/html");
        PrintWriter out = response.getWriter();
        String word = request.getParameter("search");
        String url = request.getParameter("urlcount");
        String core = request.getParameter("solrcore");
        out.println("Entered search term ->" + word);
        out.println("Number of url's to be crawled -> " + url);
        out.println("Solr core name -> " + core);
        try {
            search(word, Integer.parseInt(url));
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        // execute apache nutch script to crawl the url's and index in solr
        try {
            executeProcess(core);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    }

    public void search(String searchterm, int num) throws IOException, InterruptedException {
        String gsearchURL = GOOGLE_SEARCH_URL + "?q=" + searchterm + "&num=" + num;
        Document doc = Jsoup.connect(gsearchURL)
                .userAgent("Chrome/41.0.2228.0 Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36")
                .get();
        Elements results = doc.select("h3.r > a");
        try (FileWriter fw = new FileWriter(new File("/home/sukesh/nutch1.13/urls/seed.txt"), false)) {
            for (Element result : results) {
                String linkHref = result.attr("href");
                String linkText = result.text();
                System.out.println("Text::" + linkText + ", URL::" + linkHref.substring(6, linkHref.indexOf("&")));
                fw.write(linkHref.substring(7, linkHref.indexOf("&")) + "\n");
            }
        }
    }

    public void executeProcess(String arg) throws IOException, InterruptedException {
        //String scriptPath = getServletContext().getRealPath("/sukicrawl.sh");
        String scriptPath = "/home/elicpse_j2ee/eclipse/workspace/GoogleAnalytics/NutchScript/sukicrawl.sh";
        Process p = new ProcessBuilder(scriptPath, arg).start();
        InputStream ip = p.getInputStream();
        int i = 0;
        StringBuffer sb = new StringBuffer();
        while ((i = ip.read()) != -1) {
            sb.append((char) i);
        }
        System.out.println(sb.toString());
    }
}
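One thing worth flagging in executeProcess: the ProcessBuilder is started without an explicit working directory, so the script inherits whatever directory Tomcat was launched from, and stderr is never read, so any error output from the script is lost. A minimal sketch of a more defensive launcher (the /home/nutch1.13 working directory is an assumption; substitute any directory the Tomcat user can write to):

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

public class ScriptLauncher {
    public static int runScript(String scriptPath, String arg) throws IOException, InterruptedException {
        ProcessBuilder pb = new ProcessBuilder("/bin/bash", scriptPath, arg);
        // Nutch's generate step builds its generate-temp-* path relative to the
        // process working directory, so pin it to a writable location.
        pb.directory(new File("/home/nutch1.13"));   // assumption: writable by the Tomcat user
        pb.redirectErrorStream(true);                // merge stderr into stdout so errors are visible
        Process p = pb.start();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);            // surface script output in the container log
            }
        }
        return p.waitFor();                          // block until the crawl finishes, return its exit code
    }
}

The waitFor() call also means the servlet cannot return before the crawl script has actually finished.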
Log output:
java.lang.Exception: java.io.IOException: Mkdirs failed to create file:/generate-temp-b42b2b91-e1e5-4e82-8861-881a7a607bd9/_temporary/0/_temporary/attempt_local2075293294_0001_r_000000_0/fetchlist-1 (exists=false, cwd=file:/)
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:529)
Caused by: java.io.IOException: Mkdirs failed to create file:/generate-temp-b42b2b91-e1e5-4e82-8861-881a7a607bd9/_temporary/0/_temporary/attempt_local2075293294_0001_r_000000_0/fetchlist-1 (exists=false, cwd=file:/)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:450)
at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:435)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:909)
at org.apache.hadoop.io.SequenceFile$Writer.<init>(SequenceFile.java:1135)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:273)
at org.apache.hadoop.io.SequenceFile.createWriter(SequenceFile.java:530)
at org.apache.hadoop.mapred.SequenceFileOutputFormat.getRecordWriter(SequenceFileOutputFormat.java:64)
at org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat.getBaseRecordWriter(MultipleSequenceFileOutputFormat.java:51)
at org.apache.hadoop.mapred.lib.MultipleOutputFormat$1.write(MultipleOutputFormat.java:104)
at org.apache.hadoop.mapred.ReduceTask$OldTrackingRecordWriter.write(ReduceTask.java:493)
at org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:422)
at org.apache.nutch.crawl.Generator$Selector.reduce(Generator.java:344)
at org.apache.nutch.crawl.Generator$Selector.reduce(Generator.java:112)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:444)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:392)
at org.apache.hadoop.mapred.LocalJobRunner$Job$ReduceTaskRunnable.run(LocalJobRunner.java:319)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
2017-04-21 15:13:21,356 ERROR crawl.Generator - Generator:
java.io.IOException: Job failed!
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:865)
at org.apache.nutch.crawl.Generator.generate(Generator.java:591)
at org.apache.nutch.crawl.Generator.run(Generator.java:766)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.nutch.crawl.Generator.main(Generator.java:719)
Error on the console:
generator: starting at 2017-04-21 15:31:22
Generator: Selecting best-scoring urls due for fetch.
Generator: filtering: false
Generator: normalizing: true
Generator: topN: 50000
Generator: java.io.IOException: Job failed!
at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:865)
at org.apache.nutch.crawl.Generator.generate(Generator.java:591)
at org.apache.nutch.crawl.Generator.run(Generator.java:766)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at org.apache.nutch.crawl.Generator.main(Generator.java:719)
Error running:
/home/nutch1.13/bin/nutch generate -D mapreduce.job.reduces=2 -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true /home/nutch1.13/crawl/crawldb /home/nutch1.13/crawl/segments -topN 50000 -numFetchers 1 -noFilter
Failed with exit value 255.
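The telling detail in the trace is (exists=false, cwd=file:/): Nutch's generate step builds its generate-temp-* path from mapred.temp.dir, which defaults to "." (the process working directory), and here that directory is /, where the Tomcat user has no write permission. A quick way to reproduce the failure outside the servlet (the tomcat user name is an assumption for whatever account runs the container):

# hypothetical reproduction: run the same generate step as the container's user
# from an unwritable working directory
sudo -u tomcat sh -c 'cd / && /home/nutch1.13/bin/nutch generate /home/nutch1.13/crawl/crawldb /home/nutch1.13/crawl/segments -topN 50'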
Thanks for the response... I got it working. I changed my Tomcat installation from /opt/tomcat to a custom install location, and now it works... – sukesh
OK, glad it's working –