2012-03-14 61 views
1

我修改了下面的代码,想让它只输出至少出现过十次的单词。但它不起作用——输出文件完全没有变化。我需要怎么做才能让它正常工作?(问题标题:Hadoop WordCount 的意外输出)

import java.io.IOException; 
import java.util.*; 

import org.apache.hadoop.fs.Path; 
import org.apache.hadoop.conf.*; 
import org.apache.hadoop.io.*; 
import org.apache.hadoop.mapreduce.*; 
import org.apache.hadoop.mapreduce.lib.input.*; 
import org.apache.hadoop.mapreduce.lib.output.*; 
import org.apache.hadoop.util.*; 
// ... 
public class WordCount extends Configured implements Tool { 
// ... 
public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> { 
    private final static IntWritable one = new IntWritable(1); 
    private Text word = new Text(); 

    public void map(LongWritable key, Text value, Context context) 
      throws IOException, InterruptedException { 
     String line = value.toString(); 
     StringTokenizer tokenizer = new StringTokenizer(line); 
     while (tokenizer.hasMoreTokens()) { 
      word.set(tokenizer.nextToken()); 
      context.write(word, one); 
     } 
    } 
} 

public static class Reduce extends 
     Reducer<Text, IntWritable, Text, IntWritable> { 
    public void reduce(Text key, Iterable<IntWritable> values, 
      Context context) throws IOException, InterruptedException { 

     int sum = 0; 
     for (IntWritable val : values) { 
      sum += val.get(); 
     } 
        // where I modified, but not working, the output file didnt change 
     if(sum >= 10) 
     { 
      context.write(key, new IntWritable(sum)); 
     } 
    } 
} 

public int run(String[] args) throws Exception { 
    Job job = new Job(getConf()); 
    job.setJarByClass(WordCount.class); 
    job.setJobName("wordcount"); 

    job.setOutputKeyClass(Text.class); 
    job.setOutputValueClass(IntWritable.class); 

    job.setMapperClass(Map.class); 
    //job.setCombinerClass(Reduce.class); 
    job.setReducerClass(Reduce.class); 

    job.setInputFormatClass(TextInputFormat.class); 
    job.setOutputFormatClass(TextOutputFormat.class); 

    FileInputFormat.setInputPaths(job, new Path(args[0])); 
    FileOutputFormat.setOutputPath(job, new Path(args[1])); 

    boolean success = job.waitForCompletion(true); 
    return success ? 0 : 1; 
} 

public static void main(String[] args) throws Exception { 
    int ret = ToolRunner.run(new WordCount(), args); 
    System.exit(ret); 
} 
} 

回答

1

代码看起来完全正确。会不会是你的数据集足够大,以至于所有单词恰好都出现了10次以上?请确认你查看的确实是新生成的结果。

0

您可以查看Hadoop的默认计数器,从中了解实际发生了什么。

+0

具体来说,你可以贴出实际的输出吗?如果 Reduce 输入组数(Reduce input groups)等于 Reduce 输出记录数(Reduce output records),并且您已确认所有输出的计数都大于等于10,那么这将支持@David留下的评论 – 2012-03-22 02:09:44

0

代码是绝对正确的,也许你正在读取修改代码之前生成的输出。或者,也许你在修改代码之后没有更新之前使用的jar文件?

0

该代码看起来有效。为了能够帮助您,我们至少需要您运行此作业时所用的命令行。如果你给它提供一个像下面这样的输入文件,也会有所帮助:

one 
two two 
three three three 

以此类推,直到20