2014-10-03 72 views
0

我有一个函数,先从某个目录获取文件列表,然后在该列表中查找匹配的文件名。它的性能很糟糕(LINQ 查询的性能问题)。

下面是函数:

/// <summary>
/// Reports, for each permit number, whether a file whose name starts with that
/// number exists anywhere under a directory tree.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; each is compared against the leading characters of every file name.</param>
/// <param name="serverDirectory">Root directory; it is searched recursively (SearchOption.AllDirectories).</param>
/// <param name="type">Selects the prefix length used for matching (5 characters for Well and DrillerLog, 14 for RasterLog) and is passed to every result.</param>
/// <returns>
/// The entries for matching files (constructed with 1 as third argument) followed by
/// the entries for permit numbers without a match (constructed with 0 and an empty
/// file name). The list is empty when the directory does not exist, when it contains
/// no files, when permitNumbers is empty, or when type hits the default case.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type) 
    { 
     // Logging configuration is re-applied on every call (presumably log4net's XmlConfigurator - confirm). 
     XmlConfigurator.Configure(); 
     log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString())); 
     List<fileStatus> results = new List<fileStatus>(); 
     DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory); 
     if (dirInfo.Exists) 
     { 
      // Get the full paths of all files in the directory and its subdirectories. 
      string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories); 

      // The joined string of every path is built before log.Debug is called. 
      log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files))); 


      if (files.Length > 0 && permitNumbers.Count > 0) 
      { 
       log.Debug("Checking for matching files"); 
       // Check for matching files; the three cases differ only in prefix length and fileType tag. 
       switch (type) 
       { 
        case fileType.Well: 

         // Deferred query: nothing runs here. Each enumeration walks every path, 
         // recomputes the file-name substring several times, and does a linear 
         // List<string>.Contains over permitNumbers. 
         var matchingFiles = (from f in files 
              where f.Substring(f.LastIndexOf("\\") + 1).Length > 4 
              where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5)) 
              select new fileStatus(fileType.Well, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1))); 


         // Also deferred; enumerating it re-runs matchingFiles. 
         var permitNumbersWithMatches = (from x in matchingFiles 
                 select x.PermitNumber); 

         // Contains on a deferred sequence re-enumerates it, so the whole 
         // matchingFiles query is re-run once per permit number. 
         var nonMatchingFiles = (from p in permitNumbers 
               where !permitNumbersWithMatches.Contains(p) 
               select new fileStatus(fileType.Well, p, 0, string.Empty)); 

         // AddRange is what actually executes the queries above. 
         results.AddRange(matchingFiles); 
         results.AddRange(nonMatchingFiles); 

         break; 
        case fileType.DrillerLog: 
         // Same logic as Well (5-character prefix); the variables declared with var 
         // in the first case are reused because all cases share the switch scope. 
         matchingFiles = (from f in files 
             where f.Substring(f.LastIndexOf("\\") + 1).Length > 4 
             where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5)) 
             select new fileStatus(fileType.DrillerLog, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1))); 

         permitNumbersWithMatches = (from x in matchingFiles 
                 select x.PermitNumber); 

         nonMatchingFiles = (from p in permitNumbers 
               where !permitNumbersWithMatches.Contains(p) 
              select new fileStatus(fileType.DrillerLog, p, 0, string.Empty)); 


         results.AddRange(matchingFiles); 
         results.AddRange(nonMatchingFiles); 

         break; 
        case fileType.RasterLog: 

         // Same logic again, but the file name must have at least 14 characters 
         // and its first 14 characters are compared against the permit numbers. 
         matchingFiles = (from f in files 
             where f.Substring(f.LastIndexOf("\\") + 1).Length > 13 
             where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 14)) 
             select new fileStatus(fileType.RasterLog, f.Substring(f.LastIndexOf("\\") + 1, 14), 1, f.Substring(f.LastIndexOf("\\") + 1))); 

         permitNumbersWithMatches = (from x in matchingFiles 
                 select x.PermitNumber); 

         nonMatchingFiles = (from p in permitNumbers 
               where !permitNumbersWithMatches.Contains(p) 
              select new fileStatus(fileType.RasterLog, p, 0, string.Empty)); 



         results.AddRange(matchingFiles); 
         results.AddRange(nonMatchingFiles); 
         break; 
        default: 
         // Any other fileType adds nothing to the results. 
         break; 
       } 
       log.Debug("Done checking for matching files"); 
      } 
     } 
     return results; 

    } 

一旦执行到为 matchingFiles 赋值的那个 LINQ 查询,程序就会挂起。permitNumbers 集合很大(例如 5000 个),files 集合也很大。

我能做些什么来加快速度?

根据下面给出的建议,我把函数修改成了下面这样,现在性能符合预期。非常感谢! =)

/// <summary>
/// Reports, for each permit number, whether a file whose name starts with that
/// number exists anywhere under a directory tree.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; duplicates are collapsed before matching.</param>
/// <param name="serverDirectory">Root directory; it is searched recursively.</param>
/// <param name="type">Selects the prefix length used for matching (5 characters for Well and DrillerLog, 14 for RasterLog) and is passed to every result.</param>
/// <returns>
/// The entries for matching file names (constructed with 1 as third argument) followed
/// by the entries for permit numbers without a match (constructed with 0 and an empty
/// file name). The list is empty when the directory does not exist, when it contains
/// no files, when permitNumbers is empty, or when type is not one of the handled values.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type)
{
    // Hash set so that every membership test below is O(1) instead of a linear list scan.
    HashSet<string> numbers = new HashSet<string>(permitNumbers);
    XmlConfigurator.Configure();
    log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString()));

    List<fileStatus> results = new List<fileStatus>();
    DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory);
    if (!dirInfo.Exists)
    {
        return results;
    }

    // Get all files in the directory tree and reduce each path to its file name.
    // The set collapses identical names that live in different subdirectories.
    string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories);
    HashSet<string> fileNames = new HashSet<string>(files.Select(f => Path.GetFileName(f)));

    log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files)));

    if (fileNames.Count == 0 || numbers.Count == 0)
    {
        return results;
    }

    log.Debug("Checking for matching files");

    // The handled file types differ only in how many leading characters of the
    // file name form the permit number; 0 means "type not handled".
    int prefixLength;
    switch (type)
    {
        case fileType.Well:
        case fileType.DrillerLog:
            prefixLength = 5;
            break;
        case fileType.RasterLog:
            prefixLength = 14;
            break;
        default:
            prefixLength = 0;
            break;
    }

    if (prefixLength > 0)
    {
        // Materialize the matches once. As a deferred query this was executed twice
        // (once by AddRange, once through Except), repeating the substring work and
        // creating a second set of fileStatus objects.
        List<fileStatus> matchingFiles = fileNames
            .Where(name => name.Length >= prefixLength && numbers.Contains(name.Substring(0, prefixLength)))
            .Select(name => new fileStatus(type, name.Substring(0, prefixLength), 1, name))
            .ToList();

        // Except performs a set difference, so each permit number is checked in O(1).
        IEnumerable<fileStatus> nonMatchingFiles = numbers
            .Except(matchingFiles.Select(match => match.PermitNumber))
            .Select(number => new fileStatus(type, number, 0, string.Empty));

        results.AddRange(matchingFiles);
        results.AddRange(nonMatchingFiles);
    }

    log.Debug("Done checking for matching files");
    return results;
}
+0

“一旦它到达LINQ查询” 哪一个?你有几个。另外,5000并不是一个“非常大的集合”。 – 2014-10-03 15:06:56

+1

你是否分析了代码?哪个linq查询很慢? – 2014-10-03 15:07:06

+0

就是执行到为 matchingFiles 赋值的那个 LINQ 查询的时候。

回答

2

你创建了一个查询 matchingFiles:每次迭代它时,它都会遍历你的所有文件,以多种方式处理它们,并且还要对许可证号码集合做线性搜索。然后,对于每一个许可证号码,你都会重新执行一遍这个查询(除非缓存足够多,否则还需要反复从磁盘读取大量数据),也就是把上述线性搜索再做一遍。这导致 O(N^2 * M) 的渐近复杂度,其中 N 是许可证号码的数目,M 是文件的数目。这……非常糟糕。

这里的关键是避免:1)进行线性搜索;2)多次迭代复杂查询,尤其要避免针对另一个序列中的每个元素都把它迭代一遍。

对于 #1,只需把 permitNumbers 改成 HashSet<string> 而不是 List,这样检查某个元素是否包含在其中就变成了 O(1) 操作。

// Set difference: every permit number that has no matching file, enumerating the source only once.
var nonMatchingFiles = from p in permitNumbers.Except(permitNumbersWithMatches)
                       select new fileStatus(fileType.Well, p, 0, string.Empty);
1

我会去掉所有重复出现的 f.Substring(f.LastIndexOf("\\") + 1) 调用,改为只调用一次 Path.GetFileName(f)。

(前一个回答的 #2:用一个只需要迭代源序列一次的操作来取代第三个查询,即上面使用 Except 的代码片段。)

例如

// Strip the directory part once per path instead of repeating Substring/LastIndexOf in every clause.
var fileNames = files.Select(f => Path.GetFileName(f));

// Keep names of at least 5 characters whose 5-character prefix is a known permit number.
var matchingFiles = (from fname in fileNames
        where fname.Length > 4
        where permitNumbers.Contains(fname.Substring(0, 5))
        select new fileStatus(fileType.Well, fname.Substring(0, 5), 1, fname));
+0

这会提高代码的可读性,但它不可能对性能产生有意义的影响。 – Servy 2014-10-03 16:06:45