2015-04-06 69 views
1

我试图比较2个单词文档。我的目标是获取包含两个文件中每行的列表以及它们是否匹配。我有那部分工作。我的问题是,如果一个文件比另一个文件大,那么来自较大文本的文本不会被添加到上述列表中。有些情况下一个文件可能比另一个文件大。它用于文档修订系统。当前修订版中可能会有比以前更多的文字,反之亦然。比较2个不同长度的Word文档

到目前为止,我有这个代码。我修改了发现的示例here

这里是我一直在使用这两个示例文件(Word文档)
Test1.docx:

Test 

This is a test document. It was created May 31. 
The contents of this document are: 
Unknown 

Test2.docx:

Test 

This is a test document. It was created Apr 1. 
The contents of this document are: 
Test Item 1 
Test Item 2 

这里是我的Program.cs文件,这是我编辑的地方。在CompareDocuments方法:

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Xml.Linq; 
using DocumentFormat.OpenXml.Packaging; 

namespace DocxDiff 
{ 
    public class Program 
    { 
     private static List<DocumentCompare> _differences = new List<DocumentCompare>(); 

     public static string GetParagraphText(XElement p) 
     { 
      return p.Descendants(W.r) 
       .Where(e => e.Parent.Name != W.del && e.Parent.Name != W.moveFrom) 
       .Descendants(W.t) 
       .Select(t => (string) t) 
       .StringConcatenate(); 
     } 

     public static List<DocumentCompare> CompareDocuments(WordprocessingDocument doc1, WordprocessingDocument doc2) 
     { 
      XDocument xDoc1 = doc1.MainDocumentPart.GetXDocument(); 
      XDocument xDoc2 = doc2.MainDocumentPart.GetXDocument(); 

      var doc1Elements = xDoc1 
       .Descendants() 
       .Where(e => e.Name != W.commentRangeStart 
          && e.Name != W.commentRangeEnd 
          && e.Name != W.proofErr 
          && !e.Ancestors(W.p).Any()); 
      var doc2Elements = xDoc2 
       .Descendants() 
       .Where(e => e.Name != W.commentRangeStart 
          && e.Name != W.commentRangeEnd 
          && e.Name != W.proofErr 
          && !e.Ancestors(W.p).Any()); 

      List<DocumentCompare> differences = new List<DocumentCompare>(); 

      IEnumerable<bool> correspondingElementEquivalency = doc1Elements.Zip(doc2Elements, (e1, e2) => 
      { 
       // if the lines are different, set to true 
       bool difference = false; 
       if (e1.Name != e2.Name) 
       { 
        return false; 
       } 

       if (e1.Name == W.p && e2.Name == W.p) 
       { 
        if ((GetParagraphText(e1) != GetParagraphText(e2))) 
        { 
         // there is a difference between the documents 
         difference = true; 
        } 

        // record lines 
        differences.Add(new DocumentCompare() { Document1Text = e1.Value, Document2Text = e2.Value, Difference = difference }); 
       } 

       // this is from the code in the link above 
       // this method does not return a bool, it returns the list of differences 
       return true; 
      }); 

      // determine if the documents are equivalent 
      // this has to be here to run the code above 
      bool test = correspondingElementEquivalency.Any(e => e != true); 

      return differences; 
     } 

     public static void Main(string[] args) 
     { 
      var doc1Path = @"C:\Diff\Test1.docx"; 
      var doc2Path = @"C:\Diff\Test2.docx"; 

      using(WordprocessingDocument doc1 = WordprocessingDocument.Open(doc1Path, false)) 
      using(WordprocessingDocument doc2 = WordprocessingDocument.Open(doc2Path, false)) 
      { 
       _differences = CompareDocuments(doc1, doc2); 

       foreach (var t in _differences) 
       { 
        Console.WriteLine("Difference: {0}\nDoc 1: {1}\nDoc 2: {2}", t.Difference, t.Document1Text, t.Document2Text); 
       } 
      } 

      Console.Read(); 
     } 
    } 
} 

这是我用于存储比较的文档类:

public class DocumentCompare 
{ 
    public string Document1Text { get; set; } 
    public string Document2Text { get; set; } 
    public bool Difference { get; set; } 
} 

这里是我的Extensions.cs文件(从教程,不能修改):

using System; 
using System.Collections.Generic; 
using System.IO; 
using System.Linq; 
using System.Text; 
using System.Xml; 
using System.Xml.Linq; 
using DocumentFormat.OpenXml.Packaging; 

namespace DocxDiff 
{ 
    public static class Extensions 
    { 
     public static XDocument GetXDocument(this OpenXmlPart part) 
     { 
      XDocument xdoc = part.Annotation<XDocument>(); 

      if (xdoc != null) 
       return xdoc; 

      using (StreamReader streamReader = new StreamReader(part.GetStream())) 
       xdoc = XDocument.Load(XmlReader.Create(streamReader)); 

      part.AddAnnotation(xdoc); 
      return xdoc; 
     } 

     public static string StringConcatenate(this IEnumerable<string> source) 
     { 
      StringBuilder sb = new StringBuilder(); 
      foreach (var s in source) 
       sb.Append(s); 
      return sb.ToString(); 
     } 

     public static IEnumerable<TResult> Zip<TFirst, TSecond, TResult>(this IEnumerable<TFirst> first, IEnumerable<TSecond> second, Func<TFirst, TSecond, TResult> func) 
     { 
      var ie1 = first.GetEnumerator(); 
      var ie2 = second.GetEnumerator(); 

      while (ie1.MoveNext() && ie2.MoveNext()) 
      { 
       yield return func(ie1.Current, ie2.Current); 
      } 
     } 
    } 
} 

这是本教程也给出的W.cs类:

public static class W 
{ 
    public static XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; 

    public static XName p = w + "p"; 
    public static XName r = w + "r"; 
    public static XName t = w + "t"; 
    public static XName commentRangeStart = w + "commentRangeStart"; 
    public static XName commentRangeEnd = w + "commentRangeEnd"; 
    public static XName proofErr = w + "proofErr"; 
    public static XName del = w + "del"; 
    public static XName moveFrom = w + "moveFrom"; 
} 

编辑:我很确定我需要修改Zip方法来添加大文件中的行,并为其他文件添加一个空字符串。我试过(不成功)来修改这(从here)工作:

static void Main() { 
    var a = new List<int> { 1, 2, 3 }; 
    var b = new List<int> { 1, 2, 3, 4, 5 }; 
    foreach (var c in a.Merge(b, (x, y) => x + y)) { 
     Console.WriteLine(c); 
    } 
} 
static IEnumerable<T> Merge<T>(this IEnumerable<T> first, 
     IEnumerable<T> second, Func<T, T, T> operation) { 
    using (var iter1 = first.GetEnumerator()) 
    using (var iter2 = second.GetEnumerator()) { 
     while (iter1.MoveNext()) { 
      if (iter2.MoveNext()) { 
       yield return operation(iter1.Current, iter2.Current); 
      } else { 
       yield return iter1.Current; 
      } 
     } 
     while (iter2.MoveNext()) { 
      yield return iter2.Current; 
     } 
    } 
} 

我敢肯定,我必须做这样的事情。合并文件。不知何故,我需要添加一个空的列表项到较小的文档。

编辑:这就是我刚刚提出的,它的工作,只有它不会显示在列表(单词列表)的文档中的项目。我把它放在Program.cs中DocumentCompare方法bool test...return differences;前右:

// get document sizes 
     var largerDoc = doc1Elements.Count() > doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? doc1Elements : doc2Elements; 
     var smallerDocCount = doc1Elements.Count() < doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? doc1Elements.Count() : doc2Elements.Count(); 
     var doc1Larger = doc1Elements.Count() > doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? true : false; 
     var doc1Arr = doc1Elements.ToArray(); 
     var doc2Arr = doc2Elements.ToArray(); 

     // add in the remaining text for the larger document 
     for (var i = smallerDocCount; i < largerDoc.Count(); i++) 
     { 
      // if doc1 is larger, add doc 1 and null for doc 2 
      if (doc1Larger) 
      { 
       Console.WriteLine("doc1 Text: {0}", doc1Arr[i].Value); 
       differences.Add(new DocumentComparison() { Document1Text = doc1Arr[i].Value, Document2Text = "", Difference = true }); 
      } 
      else if(!doc1Larger) { 
       Console.WriteLine("doc2 Text: {0}", doc2Arr[i].Value); 
       differences.Add(new DocumentComparison() { Document1Text = "", Document2Text = doc2Arr[i].Value, Difference = true }); 
      } 
     } 

回答

0

好吧,这是我想出来的。 (这可能不是最高效的,但我相信这是该项目的一个很好的解决方案)。下面是代码(解释如下):

public static List<DocumentCompare> CompareDocuments(WordprocessingDocument doc1, WordprocessingDocument doc2) 
    { 
     XDocument xDoc1 = doc1.MainDocumentPart.GetXDocument(); 
     XDocument xDoc2 = doc2.MainDocumentPart.GetXDocument(); 

     // these queries return the elements that contain text in the word documents 
     var doc1Elements = xDoc1 
      .Descendants() 
      .Where(e => e.Name != W.commentRangeStart 
         && e.Name != W.commentRangeEnd 
         && e.Name != W.proofErr 
         && !e.Ancestors(W.p).Any()); 
     var doc2Elements = xDoc2 
      .Descendants() 
      .Where(e => e.Name != W.commentRangeStart 
         && e.Name != W.commentRangeEnd 
         && e.Name != W.proofErr 
         && !e.Ancestors(W.p).Any()); 

     List<DocumentCompare> differences = new List<DocumentCompare>(); 

     IEnumerable<bool> correspondingElementEquivalency = doc1Elements.Zip(doc2Elements, (e1, e2) => 
     { 
      bool difference = false; 
      if (e1.Name != e2.Name) 
      { 
       return false; 
      } 

      if (e1.Name == W.p && e2.Name == W.p) 
      { 
       // e1.Name == W.p && 
       if ((GetParagraphText(e1) != GetParagraphText(e2))) 
       { 
        // there is a difference between the documents 
        difference = true; 
       } 

       // record lines 
       differences.Add(new DocumentCompare() { Document1Text = e1.Value, Document2Text = e2.Value, Difference = difference }); 
      } 
      return true; 
     }); 

     // determine if the documents are equivalent 
     bool test = correspondingElementEquivalency.Any(e => e != true); 

     var doc1Values = (from ie1 in doc1Elements where ie1.Name == W.p select ie1.Value).ToList(); 
     var doc2Values = (from ie2 in doc2Elements where ie2.Name == W.p select ie2.Value).ToList(); 

     // determine the larger document to add the remainder of that document to the list 
     var largerDoc = doc1Values.Count() > doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? doc1Values : doc2Values; 
     var smallerDocCount = doc1Values.Count() < doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? doc1Values.Count() : doc2Values.Count(); 
     var doc1Larger = doc1Values.Count() > doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? true : false; 
     var doc1Arr = doc1Values.ToArray(); 
     var doc2Arr = doc2Values.ToArray(); 

     // add in the remaining text for the larger document 
     for (var i = smallerDocCount; i < largerDoc.Count(); i++) 
     { 
      // if doc1 is larger, add doc 1 and null for doc 2 
      if (doc1Larger) 
      { 
       Console.WriteLine("doc1 Text: {0}", doc1Arr[i]); 
       differences.Add(new DocumentCompare() { Document1Text = doc1Arr[i], Document2Text = "", Difference = true }); 
      } 
      else if(!doc1Larger) { 
       Console.WriteLine("doc2 Text: {0}", doc2Arr[i]); 
       differences.Add(new DocumentCompare() { Document1Text = "", Document2Text = doc2Arr[i], Difference = true }); 
      } 
     } 

     return differences; 
    } 

好吧,IEnumerable完成到最小文档结束的位置。然后,我得到大文档的其余部分(从较小文档的末尾开始),并将其附加到列表的末尾。我仍然需要收紧一些代码,但它的工作原理。