
I am trying to access a folder on my system called dataset. I am accessing the files/directories in Java through NetBeans.

My main class, IR.java, is in a package named IR; it calls another class, DocumentParser, which is in a separate package called svm_ir. The code for IR.java is given below:

public static void main(String args[]) throws Exception { 
     DocumentParser dp = new DocumentParser(); 

     String folderName = "dataset"; 
     String lang = "english"; 
//  String lang = "hindi"; 
     String queryPath = "query"; 

     dp.parseFiles(folderName, lang);//location of your source files, only text file 
     dp.buildTFIDForCorpus(); //calculates tfidf   
     dp.queryVectors = dp.parseQuery(queryPath, lang); 
     Score.cosineSimilarity(dp.queryVectors, dp.tfidfDocumentVector, dp.fileNames, dp.fileParseCount); 
     dp.output(); 
    } 

The code for DocumentParser.java is given below:

package svm_ir; 

import java.io.BufferedReader; 
import java.io.BufferedWriter; 
import java.io.File; 
import java.io.FileInputStream; 
import java.io.FileNotFoundException; 
import java.io.FileOutputStream; 
import java.io.FileReader; 
import java.io.FileWriter; 
import java.io.IOException; 
import java.io.InputStreamReader; 
import java.io.OutputStreamWriter; 
import java.util.ArrayList; 
import java.util.Arrays; 
import java.util.List; 

public class DocumentParser { 

    public static final String ENGLISH = "english"; 
    public static final String HINDI = "hindi"; 

    //This variable will hold all terms of each document in an array. 
    public List<String[]> termsDocsArray = new ArrayList<String[]>(); 
    public List<String> allTerms = new ArrayList<String>(); //to hold all terms 
    public List<Double[]> tfidfDocsVector = new ArrayList<Double[]>(); //one tf-idf vector per document 

    public Object arrayOfTermDocumentVectors[]; 
    public String vocabulary[]; 
    public Double tfidfDocumentVector[][]; 

    public String queryVector[]; 
    public Double queryVectors[][]; 

    public int corpusSize = 0; 
    public static int fileParseCount = 0; 
    public String[] fileNames; 
    public static DocumentScore documentScores[][]; 
    public static int documentsScored = 0; 
    public static File queryFiles[]; 
    public static String ext = ".txt"; 
    public static String tag = "content"; 
    public static int queryCount = 0; 

    @SuppressWarnings("unchecked") 
    public void parseFiles(String filePath, String lang) throws Exception { 
     System.out.println(filePath); 
     File[] allfiles = new File(filePath).listFiles(); 
     if (allfiles == null) { //listFiles() returns null if filePath is not a readable directory 
      throw new FileNotFoundException(filePath + " is not a directory or cannot be read"); 
     } 
     corpusSize = allfiles.length; 

     lang = lang.toLowerCase(); 
     String[] tokenizedTerms; 
     List<String> fileNames = new ArrayList<String>(); 

     for (File f : allfiles) { 
      if (f.getName().endsWith(ext) && !(f.getName().contains("index"))) { 
       tokenizedTerms = tokenize(f, lang, "d"); 

       for (String term : tokenizedTerms) { 
        if (!allTerms.contains(term) && !StopWords.hindi.contains(term)) { //avoid duplicate entry 
         allTerms.add(term); 
        } 
       } 
       termsDocsArray.add(tokenizedTerms); 
       fileNames.add(f.getName()); 
       System.out.println("Total documents parsed: " + (++fileParseCount)); 
      } 
     } 
     arrayOfTermDocumentVectors = termsDocsArray.toArray(new Object[termsDocsArray.size()]); 
     vocabulary = new String[allTerms.size()]; 
     System.out.println("Building Vocabulary"); 
     vocabulary = (String[]) allTerms.toArray(vocabulary); 
     System.out.println("Vocabulary built"); 

     String vocab = ""; 
     for (String word : vocabulary) { 
      vocab += word + "\n"; 
     } 
     BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("vocabulary\\vocab.txt"), "UTF-8")); 
     bw.write(vocab); 
     bw.close(); 

     this.fileNames = new String[fileParseCount]; 
     this.fileNames = (String[]) fileNames.toArray(this.fileNames); 

    } 

    public Double[][] parseQuery(String queryPath, String lang) throws Exception { 
     File[] allfiles = new File(queryPath).listFiles(); //List all queries 
     if (allfiles == null) { //same null guard as in parseFiles() 
      throw new FileNotFoundException(queryPath + " is not a directory or cannot be read"); 
     } 
     queryFiles = allfiles; 
     String[] tokenizedTerms; 
     Double[] tfidfQueryVector = null; 
     Double[][] tfidfQueryVectors = null; 
     List<Double[]> tfidfQVectors = new ArrayList<Double[]>(); 
     for (File f : allfiles) { 
      if (f.getName().endsWith(ext)) { 
       tokenizedTerms = tokenize(f, lang, "q"); //Builds a vector for the document by tokenizing it's words. 
       queryVector = tokenizedTerms; 
       tfidfQueryVector = getTFIDFVector(queryVector); 
       tfidfQVectors.add(tfidfQueryVector); 
      } 
      System.out.println("Building query tfidf vector " + (++queryCount)); 
     } 
//  documentScores = new DocumentScore[queryCount][]; 
     tfidfQueryVectors = (Double[][]) tfidfQVectors.toArray(new Double[tfidfQVectors.size()][vocabulary.length]); 
     return tfidfQueryVectors; 
    } 

    public String[] tokenize(File f, String lang, String typeOfDoc) throws Exception { 
     String s = null; 
     s = TagParser.parse(f, tag); 
     String[] tokenizedTerms; 
     if (lang.equals(ENGLISH)) { //compare strings with equals(), not == 
      tokenizedTerms = s.replaceAll("[\"\'\\.,\"\':;<>\\-\n\t\\(\\)0-9\\?]+", " ").trim().split("\\s+"); 
     } else { //note: the same regex is currently applied for Hindi as well 
      tokenizedTerms = s.replaceAll("[\"\'\\.,\"\':;<>\\-\n\t\\(\\)0-9\\?]+", " ").trim().split("\\s+"); 
     } 
     return tokenizedTerms; 
    } 

    @SuppressWarnings("unchecked") 
    public void buildTFIDForCorpus() { 
     int docVectorCount = 0; 
     Double[] tfidfvector = new Double[allTerms.size()]; 
     String[] unfilteredTDStringArray; 
     for (Object unfilteredTermDocumentVector : arrayOfTermDocumentVectors) { 
      tfidfvector = new Double[allTerms.size()]; 
      unfilteredTDStringArray = (String[]) unfilteredTermDocumentVector; 
      tfidfvector = getTFIDFVector(unfilteredTDStringArray); 
      tfidfDocsVector.add(tfidfvector); //storing document vectors; 
      System.out.println("Total document tfidf vectors created: " + (++docVectorCount) + "/" + corpusSize); 
     } 
     tfidfDocumentVector = (Double[][]) tfidfDocsVector.toArray(new Double[tfidfDocsVector.size()][tfidfvector.length]); 
    } 

    public Double[] getTFIDFVector(String[] unfilteredTDStringArray) { 
     Double tf; //term frequency 
     Double idf; //inverse document frequency 
     Double tfidf; //term frequency inverse document frequency 
     Double[] tfidfvector = new Double[allTerms.size()]; 
     int count = 0; 
     for (String term : vocabulary) { 
      tf = TfIdf.tfCalculator(unfilteredTDStringArray, term); 
      idf = TfIdf.idfCalculator(arrayOfTermDocumentVectors, term); 
      tfidf = tf * idf; 
      tfidfvector[count] = tfidf; 
      count++; 
     } 
     return tfidfvector; 
    } 

    public static void output() throws IOException { 
     File runFile = new File("results\\run-tfid.txt"); 
     String results = ""; 
     runFile.createNewFile(); 
     File queryFile; 
     for (int i = 0; i < queryFiles.length; i++) { 
      queryFile = queryFiles[i]; 
      for (int rank = 0; rank < Math.min(DocumentParser.fileParseCount, 100); rank++) { 
       results += queryFile.getName() + " Q0 " + documentScores[i][rank].fileName + " " + (rank + 1) + " " + documentScores[i][rank].score + "\n"; 
      } 
     } 
     FileWriter fw = new FileWriter(runFile.getAbsoluteFile()); 
     BufferedWriter bw = new BufferedWriter(fw); 
     bw.write(results); 
     bw.close(); 
     System.out.println(results); 
    } 
} 

I want to know where I should put the dataset folder so that NetBeans accesses it correctly. Do I need to change my code? I would appreciate a prompt reply.


Accessing a directory on your system is completely independent of the IDE you use. Also, this code won't compile, and '.listFiles()' will return 'null' if your file is not a directory. You would be better off using 'Files.newDirectoryStream()', or, with Java 8, 'Files.list()' – fge 2015-02-08 20:36:46
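
A minimal sketch of the alternative suggested above, using 'Files.list()' from Java 8. The class name ListDataset is made up for illustration, and "dataset" is the folder from the question:

import java.io.IOException; 
import java.nio.file.Files; 
import java.nio.file.Path; 
import java.nio.file.Paths; 
import java.util.stream.Stream; 

public class ListDataset { 
    public static void main(String[] args) throws IOException { 
        Path dir = Paths.get("dataset"); 
        //unlike listFiles(), Files.list() throws a clear exception 
        //(NoSuchFileException/NotDirectoryException) instead of returning null 
        try (Stream<Path> entries = Files.list(dir)) { 
            entries.filter(p -> p.toString().endsWith(".txt")) 
                   .forEach(System.out::println); 
        } 
    } 
} 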


Please guide me in fixing my mistake, I would be thankful to you. I have updated the file – 2015-02-08 20:47:04

Answer

You can check your IDE's default path using the following:

String filepath = "dataset"; 
File inputFile = new File(filepath); 
inputPath = inputFile.getAbsolutePath(); 
System.out.println("dataset path "+inputPath); 

You can also configure the working directory in NetBeans itself: Project -> Properties -> Build -> Run -> Working Directory.

If you leave that field empty, the default working directory should be under your project folder.
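
If you are unsure which directory that is at run time, a one-line check using the standard 'user.dir' system property prints the working directory against which relative paths like "dataset" are resolved:

System.out.println("working dir: " + System.getProperty("user.dir")); 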


Thanks brother.. it worked for me – 2015-02-08 20:56:30