2012-01-10 57 views
6

首先请原谅这个问题篇幅较长,但我确实想尝试用函数式的方式来思考,而这是我遇到过的较有挑战性的问题之一。如何编写一个函数式的文件“扫描器”

我想获得一些建议,了解如何以函数式的方式处理这个问题,特别是在 F# 中。我正在编写一个程序,它遍历一个目录列表,用一组正则表达式过滤从这些目录中检索到的文件列表,再用第二组正则表达式在检索到的文件的文本中查找匹配项。我希望它能为每一处匹配给定正则表达式模式的文本返回文件名、行索引、列索引、模式和匹配值。另外,异常需要被记录下来,可能的异常情况有 3 种:无法打开目录、无法打开文件、从文件中读取内容失败。最后一个要求是,被“扫描”以查找匹配的文件数量可能非常大,所以整个过程需要是惰性的。我并不太在意解决方案是否是“纯”函数式的,我更关心的是一个“好”的解决方案:读起来清晰,运行也良好。最后一个挑战是让它能与 C# 互操作,因为我想用 WinForms 工具为这个算法加上 UI。这是我的第一次尝试,希望它能把问题说明清楚:

open System.Text.RegularExpressions 
open System.IO 

/// A reader computation: a function from a shared environment 't to a result 'a.
type Reader<'t, 'a> = 't -> 'a

/// Lifts a plain value into a reader that ignores its environment.
let returnM x = fun _ -> x

/// Transforms the result of reader `m` with `f`.
let map f m =
    fun t -> f (m t)

/// Runs `m` and `f` against the same environment, then applies the function
/// produced by `f` to the value produced by `m`.
let apply f m =
    fun t ->
        // `m` is evaluated before `f`, matching the pipeline order.
        let value = m t
        let fn = f t
        fn value

/// Passes the result of `m` to `f`, then runs the reader returned by `f`
/// against the same environment.
let bind f m =
    fun t ->
        let next = f (m t)
        next t

/// Builds a lazy scanner over the directories in `dirs`.
///
/// The result is curried over its remaining inputs, in this order:
///   Scanner dirs dirExHandler (filenamepatterns, lineExHandler, fileExHandler) contentpatterns
/// and produces a sequence of
///   (filename, lineIndex, columnIndex, pattern, matchedText)
/// tuples, one per regex match found in a line of a file whose name matches
/// at least one of `filenamepatterns`.
///
/// Failures are reported through the handlers instead of being raised:
///   dirExHandler  e directory       - listing a directory failed
///   fileExHandler e filename        - opening a file failed
///   lineExHandler e filename index  - reading a line failed
let Scanner dirs = 
    // Lazily reads (line, index) pairs from `filename`.
    // The reader is opened and disposed inside the sequence itself, so it is
    // still alive while the lines are being enumerated. Binding it with `use`
    // outside the lazy sequence would dispose it before the first line is read.
    let readLines lineExHandler fileExHandler (filename : string) = seq { 
        let opened = 
            try Some ((new FileInfo(filename)).OpenText()) 
            with e -> fileExHandler e filename; None 
        match opened with 
        | None -> () 
        | Some stream -> 
            use reader = stream 
            // The end-of-stream check is inside the try because it can touch
            // the underlying stream as well.
            let next index = 
                try 
                    if reader.EndOfStream then None 
                    else Some((reader.ReadLine(), index), index + 1) 
                with e -> 
                    lineExHandler e filename index 
                    None 
            yield! Seq.unfold next 0 } 

    returnM dirs 
    |> apply (fun dirExHandler -> 
        Seq.collect (fun directory -> 
            try 
                Directory.GetFiles(directory, "*", SearchOption.AllDirectories) 
            with e -> 
                dirExHandler e directory 
                Array.empty)) 
    |> map (fun filenames -> 
        returnM filenames 
        |> apply (fun (filenamepatterns, lineExHandler, fileExHandler) -> 
            // Compile each file-name pattern once instead of once per file.
            let filenameRegexes = 
                filenamepatterns 
                |> Seq.map (fun (pattern : string) -> new Regex(pattern)) 
                |> Seq.cache 
            Seq.filter (fun (filename : string) -> 
                filenameRegexes |> Seq.exists (fun (regex : Regex) -> regex.IsMatch(filename))) 
            >> Seq.map (fun filename -> 
                (filename, readLines lineExHandler fileExHandler filename)) 
            >> (fun files -> 
                returnM files 
                |> apply (fun contentpatterns -> 
                    // Keep the pattern text next to its compiled regex because
                    // the pattern text is part of every result tuple.
                    let contentRegexes = 
                        contentpatterns 
                        |> Seq.map (fun (pattern : string) -> (pattern, new Regex(pattern))) 
                        |> Seq.cache 
                    Seq.collect (fun (filename, lines) -> 
                        lines 
                        |> Seq.collect (fun (content : string, index) -> 
                            contentRegexes 
                            |> Seq.collect (fun (pattern, regex : Regex) -> 
                                regex.Matches(content) 
                                |> Seq.cast<Match> 
                                |> Seq.map (fun contentmatch -> 
                                    (filename, 
                                     index, 
                                     contentmatch.Index, 
                                     pattern, 
                                     contentmatch.Value)))))))) 

感谢您的任何输入。

更新 - 以下是我根据收到的反馈更新后的解决方案:

open System.Text.RegularExpressions 
open System.IO 

/// Settings consumed by `scanner`: which files to look at, what to look for,
/// and where to report failures.
type ScannerConfiguration =
    {
        /// Regular expressions tested against each file path.
        FileNamePatterns : seq<string>
        /// Regular expressions searched for in each line of a selected file.
        ContentPatterns : seq<string>
        /// Receives an exception together with the file name it relates to.
        FileExceptionHandler : exn -> string -> unit
        /// Receives an exception together with a file name and a line index.
        LineExceptionHandler : exn -> string -> int -> unit
        /// Receives an exception together with the directory it relates to.
        DirectoryExceptionHandler : exn -> string -> unit
    }

/// Lazily scans every file under each of `specifiedDirectories` (recursively).
///
/// For each file whose path matches at least one of
/// `configuration.FileNamePatterns`, every line is searched with each of
/// `configuration.ContentPatterns`. One tuple is yielded per match:
///   (file, lineIndex, columnIndex, pattern, matchedText)
///
/// Failures to list a directory, open a file, or read a line are passed to
/// the corresponding handler in `configuration`, and scanning continues with
/// the next item.
let scanner specifiedDirectories (configuration : ScannerConfiguration) = seq {
    let toCachedRegexList =
        Seq.map (fun (pattern : string) -> new Regex(pattern)) >> Seq.cache

    let contentRegexes = configuration.ContentPatterns |> toCachedRegexList

    let filenameRegexes = configuration.FileNamePatterns |> toCachedRegexList

    // Opens `file` for reading, or reports the failure and returns None.
    let tryOpen (file : string) =
        try Some ((new FileInfo(file)).OpenText())
        with e -> configuration.FileExceptionHandler e file; None

    // Lazily produces (line, index) pairs from `reader`.
    // The end-of-stream check is inside the try because it can touch the
    // underlying stream as well.
    let getLines exHandler (reader : StreamReader) =
        let next index =
            try
                if reader.EndOfStream then None
                else Some((reader.ReadLine(), index), index + 1)
            with e ->
                exHandler e index
                None
        Seq.unfold next 0

    for specifiedDirectory in specifiedDirectories do
        let files =
            try Directory.GetFiles(specifiedDirectory, "*", SearchOption.AllDirectories)
            with e -> configuration.DirectoryExceptionHandler e specifiedDirectory; [||]
        for file in files do
            if filenameRegexes |> Seq.exists (fun (regex : Regex) -> regex.IsMatch(file)) then
                match tryOpen file with
                | None -> ()
                | Some opened ->
                    // `use` is scoped to the sequence expression, so the
                    // reader stays open until all lines of this file have
                    // been consumed. Disposing it before the lazy line
                    // sequence is enumerated would break every read.
                    use reader = opened
                    let lines =
                        reader
                        |> getLines (fun e index -> configuration.LineExceptionHandler e file index)
                    for (content, index) in lines do
                        for contentregex in contentRegexes do
                            for mmatch in content |> contentregex.Matches do
                                yield (file, index, mmatch.Index, contentregex.ToString(), mmatch.Value) }

同样,欢迎任何意见和建议。

+2

你见过像Parsec这样的函数解析器吗? – 2012-01-10 15:58:39

+1

这是很多文字。试着把它分解成更容易阅读。 – Marcin 2012-01-10 16:02:22

+0

我只是简单地使用接口和对象表达式来创建一个实例并将其暴露给C#代码。 – 2012-01-10 18:02:47

回答

8

我认为最好的方法是从最简单的解决方案开始,然后再对其进行扩展。在我看来,您目前的写法相当难读,原因有两个:

  • 这段代码大量使用了组合子和函数组合,这种模式在 F# 中并不太常见。其中一些处理用序列表达式来写会更容易。

  • 这段代码全部写成单个函数,但它相当复杂,如果它被分成多个函数,它会更具可读性。

我可能会先把代码拆分成一个测试单个文件的函数(比如 `fileMatches`)和一个遍历所有文件并调用 `fileMatches` 的函数。主要的迭代可以很好地用 F# 序列表达式来编写:

// Checks whether a file name matches a filename pattern 
// and a content matches a content pattern.
//
// Returns true when `file` matches at least one of `fileNamePatterns` AND
// at least one line of the file matches at least one of `contentPatterns`.
// The file is not opened at all when its name does not match.
//
// A failure while opening the file is reported through `fileExHandler`, and
// a failure while reading its lines through `lineExHandler`; both are called
// as `handler e file`. In either case the result is false.
// An invalid regular expression is a caller error and is raised, not routed
// to the handlers.
let fileMatches fileNamePatterns contentPatterns 
                (fileExHandler, lineExHandler) file = 
    // Compile eagerly so an invalid pattern fails here, before any file I/O.
    let compile patterns = 
        patterns |> Seq.map (fun (pattern : string) -> new Regex(pattern)) |> Seq.toList 
    let matchesAny (regexes : Regex list) (input : string) = 
        regexes |> List.exists (fun regex -> regex.IsMatch(input)) 
    if not (matchesAny (compile fileNamePatterns) file) then 
        false 
    else 
        let contentRegexes = compile contentPatterns 
        // File.ReadLines returns a lazy sequence of lines.
        let lines = 
            try Some (File.ReadLines(file)) 
            with e -> fileExHandler e file; None 
        match lines with 
        | None -> false 
        | Some lines -> 
            // Seq.exists stops at the first matching line.
            try lines |> Seq.exists (matchesAny contentRegexes) 
            with e -> lineExHandler e file; false 


// Iterates over all the files and calls 'fileMatches'.
//
// Lazily yields the path of every file, found recursively under each of
// `specifiedDirectories`, for which 'fileMatches' returns true. A failure
// to list a directory is passed to `dirExHandler` and that directory is
// treated as containing no files.
let scanner specifiedDirectories fileNamePatterns contentPatterns 
            (dirExHandler, fileExHandler, lineExHandler) = seq { 
    // Iterate over all the specified directories 
    for specifiedDir in specifiedDirectories do 
        // Find all files in the directories (and handle exceptions) 
        let files = 
            try Directory.GetFiles(specifiedDir, "*", SearchOption.AllDirectories) 
            with e -> dirExHandler e specifiedDir; [||] 
        // Iterate over all files and report those that match 
        for file in files do 
            if fileMatches fileNamePatterns contentPatterns (fileExHandler, lineExHandler) file then 
                // Matches! Return this file as part of the result. 
                yield file } 

这些函数仍然相当复杂,因为需要到处传递很多参数。把这些参数包装成一个简单的类型或记录,可能是个好主意:

/// Bundles the scanner's inputs so they can be passed as a single value.
type ScannerArguments = 
    { /// Pattern tested against file names.
      FileNamePatterns : string 
      /// Pattern tested against file contents.
      ContentPatterns : string 
      /// Receives an exception together with a string identifying the file.
      FileExceptionHandler : exn -> string -> unit 
      /// Receives an exception together with a string argument.
      /// NOTE(review): unlike a handler that also takes a line index, this
      /// one takes only a string; confirm which value callers pass.
      LineExceptionHandler : exn -> string -> unit 
      /// Receives an exception together with a string identifying the directory.
      DirectoryExceptionHandler : exn -> string -> unit } 

然后你就可以把 `fileMatches` 和 `scanner` 都定义为只接受两个参数的函数,这会让你的代码可读性好很多。例如:

// Iterates over all the files and calls 'fileMatches'.
//
// Lazily yields the path of every file, found recursively under each of
// `specifiedDirectories`, for which `fileMatches args file` is true. A
// failure to list a directory is passed to `args.DirectoryExceptionHandler`
// and that directory is treated as containing no files.
// NOTE(review): this relies on a two-argument `fileMatches args file`;
// confirm that such a definition is in scope.
let scanner specifiedDirectories (args:ScannerArguments) = seq { 
    for specifiedDir in specifiedDirectories do 
        let files = 
            try Directory.GetFiles(specifiedDir, "*", SearchOption.AllDirectories) 
            with e -> args.DirectoryExceptionHandler e specifiedDir; [||] 
        for file in files do 
            // No need to propagate all arguments explicitly to other functions 
            if fileMatches args file then yield file }