2011-11-21 83 views
5

我基本上是 Lucene 的新手。我用 70 个电子邮件文档创建了索引:第一个索引用前 29 个文档创建,其余 41 个文档用于创建另一个索引。

主题:Lucene 中的合并索引

我尝试用 Lucene 在第一个索引上进行搜索,得到了我想要的结果……但每当我尝试合并这两个索引时,合并始终不成功。

创建索引的代码:

import java.io.BufferedReader; 
import java.io.File; 
import java.io.FileFilter; 
import java.io.FileReader; 
import java.io.IOException; 

import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.FSDirectory; 

/**
 * Builds a Lucene index with one document per file found directly inside a
 * data directory.
 *
 * <p>Each document carries three fields: {@code subject} (stored and
 * tokenized), {@code filename} and {@code fullpath} (both stored, not
 * indexed). The subject is the first line of the file that starts with
 * "SUBJECT", compared case-insensitively.
 *
 * <p>The writer is opened with the create flag set, so an index that already
 * exists in the target directory is replaced rather than appended to.
 */
public class Indexer {

    /** Index directory used when no command-line arguments are supplied. */
    private static final String DEFAULT_INDEX_DIR = "docsOPDir";

    /** Data directory used when no command-line arguments are supplied. */
    private static final String DEFAULT_DATA_DIR = "docsDir";

    /**
     * Command-line entry point.
     *
     * <p>With no arguments the built-in default directories are used; with
     * exactly two arguments they are taken as {@code <index dir> <data dir>},
     * matching the usage message.
     *
     * @param args either empty, or {@code <index dir> <data dir>}
     * @throws IllegalArgumentException for any other argument count
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 0 && args.length != 2) {
            throw new IllegalArgumentException("Usage: java "
                    + Indexer.class.getName() + " <index dir> <data dir>");
        }
        String indexDir = args.length == 2 ? args[0] : DEFAULT_INDEX_DIR;
        String dataDir = args.length == 2 ? args[1] : DEFAULT_DATA_DIR;

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            // Always release the writer, even when indexing fails part-way.
            indexer.close();
        }
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    private IndexWriter writer;

    /**
     * Opens a writer on {@code indexDir}, creating the directory if needed.
     *
     * @param indexDir path of the directory that will hold the index
     * @throws IOException if the index cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        File indexLocation = new File(indexDir);

        // mkdirs (not mkdir) so that missing parent directories are created too.
        indexLocation.mkdirs();
        Directory dir = FSDirectory.getDirectory(indexLocation);
        // The trailing 'true' is the create flag: any existing index is overwritten.
        writer = new IndexWriter(dir, new StandardAnalyzer(), true);
        writer.setMergeFactor(1000);
        writer.setRAMBufferSizeMB(50);
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if the writer fails to close
     */
    public void close() throws IOException {
        writer.close();
    }

    /**
     * Indexes every regular, visible, readable file directly inside
     * {@code dataDir} that the filter accepts. Sub-directories are skipped,
     * not descended into.
     *
     * @param dataDir directory whose files are indexed
     * @param filter  optional filter; {@code null} accepts every file
     * @return the number of files added to the index by this call
     * @throws IOException if {@code dataDir} cannot be listed
     * @throws Exception   if reading or indexing a file fails
     */
    public int index(String dataDir, FileFilter filter) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing path, a non-directory or an I/O error.
            throw new IOException("Cannot list data directory: " + dataDir);
        }

        // Count locally: numRamDocs() only reports documents still buffered in
        // RAM, so it under-reports as soon as the writer flushes.
        int indexed = 0;
        for (File f : files) {
            System.out.println("Reading File:" + f);
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
                indexed++;
            }
        }
        return indexed;
    }

    /**
     * Accepts every file whose name does not start with "541".
     */
    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
            return !path.getName().toLowerCase()
                    .startsWith("541");
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * @param f file to describe
     * @return a document with {@code subject}, {@code filename} and
     *         {@code fullpath} fields
     * @throws Exception if the file cannot be read
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        // Only the subject is searchable; the other two fields are stored for display.
        doc.add(new Field("subject", getSubject(f), Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field("filename", f.getName(),
                Field.Store.YES, Field.Index.NO));
        doc.add(new Field("fullpath", f.getCanonicalPath(),
                Field.Store.YES, Field.Index.NO));
        return doc;
    }

    /**
     * Returns the first line of {@code f} that starts with "SUBJECT"
     * (case-insensitive), or a placeholder when the file has no such line.
     *
     * @param f file to scan
     * @return the whole matching line, or {@code "NO Subject Found"}
     * @throws Exception if the file cannot be read
     */
    private String getSubject(File f) throws Exception {
        BufferedReader br = new BufferedReader(new FileReader(f));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.toUpperCase().startsWith("SUBJECT")) {
                    return line;
                }
            }
            return "NO Subject Found";
        } finally {
            // Close on every path; otherwise each indexed file leaks a handle.
            br.close();
        }
    }

    /**
     * Adds one file to the index.
     *
     * @param f file to index
     * @throws Exception if the file cannot be read or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
    }
}

合并索引的代码:

// Merges every entry found directly under INDEXES_DIR into one new index
// written to INDEX_DIR, then optimizes and closes that index.
//
// NOTE(review): "\\docsOP2" is the path \docsOP2 (leading backslash), whereas
// the Indexer class writes to the relative path "docsOPDir" -- confirm that
// these locations are the intended ones.
File INDEXES_DIR = new File("\\docsOP2"); 
     File INDEX_DIR = new File("\\docs"); 

     INDEX_DIR.mkdir(); 

     Date start = new Date(); 

     // NOTE(review): this try block is not closed within the visible snippet;
     // the catch/finally part is missing here.
     try { 

      // The trailing 'true' is presumably the create flag, i.e. the target
      // index is built from scratch -- confirm against the Lucene version in use.
      IndexWriter writer = new IndexWriter(INDEX_DIR, 
               new StandardAnalyzer(), 
               true); 
      writer.setMergeFactor(1000); 
      writer.setRAMBufferSizeMB(50); 

      // One Directory slot per entry listed under INDEXES_DIR.
      // File.list() returns null when the path is not a directory, in which
      // case this line throws a NullPointerException.
      Directory indexes[] = new Directory[INDEXES_DIR.list().length]; 

      // Each listed name is opened as its own index at INDEXES_DIR/<name>.
      // NOTE(review): this assumes INDEXES_DIR holds one sub-directory per
      // index. If INDEXES_DIR is itself an index, the names listed would be
      // index files rather than index directories -- verify the layout.
      // list() is re-evaluated on every use inside the loop.
      for (int i = 0; i < INDEXES_DIR.list().length; i++) { 
       System.out.println("Adding: " + INDEXES_DIR.list()[i]); 
       indexes[i] = FSDirectory.getDirectory(INDEXES_DIR.getAbsolutePath() 
                + "/" + INDEXES_DIR.list()[i]); 
       System.out.println(indexes[i]); 
      } 

      System.out.print("Merging added indexes..."); 
      writer.addIndexes(indexes); 
      System.out.println("done"); 

      System.out.print("Optimizing index..."); 
      writer.optimize(); 
      // The writer is closed only on the success path shown here.
      writer.close(); 
      System.out.println("done"); 

      // Elapsed time in whole seconds, printed followed by a double-quote character.
      Date end = new Date(); 
      System.out.println("It took: "+((end.getTime() - start.getTime())/1000) 
              + "\""); 

回答

3

代码看起来是正确的。为了帮助您追踪问题,请转储新索引,查看其中包含的内容。

下面是一段代码 Gist:Dump a Lucene index as an XML document

+0

<?xml version="1.0"?><field name="subject" value="Subject: Re: FAKE GOD, 圣谎言">

+0

输出是否包含您期望/查找的值?如果是这样,那么查询索引的代码是错误的。如果不是,那么合并代码中有一个错误。 –