比较2个不同长度的Word文档

我试图比较2个单词文档。我的目标是获取包含两个文件中每行的列表以及它们是否匹配。我有那部分工作。我的问题是，如果一个文件比另一个文件大，那么来自较大文本的文本不会被添加到上述列表中。有些情况下一个文件可能比另一个文件大。它用于文档修订系统。当前修订版中可能会有比以前更多的文字，反之亦然。比较2个不同长度的Word文档

到目前为止，我有这个代码。我修改了发现的示例here。

这里是我一直在使用这两个示例文件（Word文档）
Test1.docx：

Test 

This is a test document. It was created May 31. 
The contents of this document are: 
Unknown

Test2.docx：

Test 

This is a test document. It was created Apr 1. 
The contents of this document are: 
Test Item 1 
Test Item 2

这里是我的Program.cs文件，这是我编辑的地方。在CompareDocuments方法：

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using System.Xml.Linq; 
using DocumentFormat.OpenXml.Packaging; 

namespace DocxDiff 
{ 
    public class Program 
    { 
     private static List<DocumentCompare> _differences = new List<DocumentCompare>(); 

     public static string GetParagraphText(XElement p) 
     { 
      return p.Descendants(W.r) 
       .Where(e => e.Parent.Name != W.del && e.Parent.Name != W.moveFrom) 
       .Descendants(W.t) 
       .Select(t => (string) t) 
       .StringConcatenate(); 
     } 

     public static List<DocumentCompare> CompareDocuments(WordprocessingDocument doc1, WordprocessingDocument doc2) 
     { 
      XDocument xDoc1 = doc1.MainDocumentPart.GetXDocument(); 
      XDocument xDoc2 = doc2.MainDocumentPart.GetXDocument(); 

      var doc1Elements = xDoc1 
       .Descendants() 
       .Where(e => e.Name != W.commentRangeStart 
          && e.Name != W.commentRangeEnd 
          && e.Name != W.proofErr 
          && !e.Ancestors(W.p).Any()); 
      var doc2Elements = xDoc2 
       .Descendants() 
       .Where(e => e.Name != W.commentRangeStart 
          && e.Name != W.commentRangeEnd 
          && e.Name != W.proofErr 
          && !e.Ancestors(W.p).Any()); 

      List<DocumentCompare> differences = new List<DocumentCompare>(); 

      IEnumerable<bool> correspondingElementEquivalency = doc1Elements.Zip(doc2Elements, (e1, e2) => 
      { 
       // if the lines are different, set to true 
       bool difference = false; 
       if (e1.Name != e2.Name) 
       { 
        return false; 
       } 

       if (e1.Name == W.p && e2.Name == W.p) 
       { 
        if ((GetParagraphText(e1) != GetParagraphText(e2))) 
        { 
         // there is a difference between the documents 
         difference = true; 
        } 

        // record lines 
        differences.Add(new DocumentCompare() { Document1Text = e1.Value, Document2Text = e2.Value, Difference = difference }); 
       } 

       // this is from the code in the link above 
       // this method does not return a bool, it returns the list of differences 
       return true; 
      }); 

      // determine if the documents are equivalent 
      // this has to be here to run the code above 
      bool test = correspondingElementEquivalency.Any(e => e != true); 

      return differences; 
     } 

     public static void Main(string[] args) 
     { 
      var doc1Path = @"C:\Diff\Test1.docx"; 
      var doc2Path = @"C:\Diff\Test2.docx"; 

      using(WordprocessingDocument doc1 = WordprocessingDocument.Open(doc1Path, false)) 
      using(WordprocessingDocument doc2 = WordprocessingDocument.Open(doc2Path, false)) 
      { 
       _differences = CompareDocuments(doc1, doc2); 

       foreach (var t in _differences) 
       { 
        Console.WriteLine("Difference: {0}\nDoc 1: {1}\nDoc 2: {2}", t.Difference, t.Document1Text, t.Document2Text); 
       } 
      } 

      Console.Read(); 
     } 
    } 
}

这是我用于存储比较的文档类：

public class DocumentCompare 
{ 
    public string Document1Text { get; set; } 
    public string Document2Text { get; set; } 
    public bool Difference { get; set; } 
}

这里是我的Extensions.cs文件（从教程，不能修改）：

using System; 
using System.Collections.Generic; 
using System.IO; 
using System.Linq; 
using System.Text; 
using System.Xml; 
using System.Xml.Linq; 
using DocumentFormat.OpenXml.Packaging; 

namespace DocxDiff 
{ 
    public static class Extensions 
    { 
     public static XDocument GetXDocument(this OpenXmlPart part) 
     { 
      XDocument xdoc = part.Annotation<XDocument>(); 

      if (xdoc != null) 
       return xdoc; 

      using (StreamReader streamReader = new StreamReader(part.GetStream())) 
       xdoc = XDocument.Load(XmlReader.Create(streamReader)); 

      part.AddAnnotation(xdoc); 
      return xdoc; 
     } 

     public static string StringConcatenate(this IEnumerable<string> source) 
     { 
      StringBuilder sb = new StringBuilder(); 
      foreach (var s in source) 
       sb.Append(s); 
      return sb.ToString(); 
     } 

     public static IEnumerable<TResult> Zip<TFirst, TSecond, TResult>(this IEnumerable<TFirst> first, IEnumerable<TSecond> second, Func<TFirst, TSecond, TResult> func) 
     { 
      var ie1 = first.GetEnumerator(); 
      var ie2 = second.GetEnumerator(); 

      while (ie1.MoveNext() && ie2.MoveNext()) 
      { 
       yield return func(ie1.Current, ie2.Current); 
      } 
     } 
    } 
}

这是本教程也给出的W.cs类：

public static class W 
{ 
    public static XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; 

    public static XName p = w + "p"; 
    public static XName r = w + "r"; 
    public static XName t = w + "t"; 
    public static XName commentRangeStart = w + "commentRangeStart"; 
    public static XName commentRangeEnd = w + "commentRangeEnd"; 
    public static XName proofErr = w + "proofErr"; 
    public static XName del = w + "del"; 
    public static XName moveFrom = w + "moveFrom"; 
}

编辑：我很确定我需要修改Zip方法来添加大文件中的行，并为其他文件添加一个空字符串。我试过（不成功）来修改这（从here）工作：

static void Main() { 
    var a = new List<int> { 1, 2, 3 }; 
    var b = new List<int> { 1, 2, 3, 4, 5 }; 
    foreach (var c in a.Merge(b, (x, y) => x + y)) { 
     Console.WriteLine(c); 
    } 
} 
static IEnumerable<T> Merge<T>(this IEnumerable<T> first, 
     IEnumerable<T> second, Func<T, T, T> operation) { 
    using (var iter1 = first.GetEnumerator()) 
    using (var iter2 = second.GetEnumerator()) { 
     while (iter1.MoveNext()) { 
      if (iter2.MoveNext()) { 
       yield return operation(iter1.Current, iter2.Current); 
      } else { 
       yield return iter1.Current; 
      } 
     } 
     while (iter2.MoveNext()) { 
      yield return iter2.Current; 
     } 
    } 
}

我敢肯定，我必须做这样的事情。合并文件。不知何故，我需要添加一个空的列表项到较小的文档。

编辑：这就是我刚刚提出的，它的工作，只有它不会显示在列表（单词列表）的文档中的项目。我把它放在Program.cs中DocumentCompare方法bool test...后return differences;前右：

// get document sizes 
     var largerDoc = doc1Elements.Count() > doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? doc1Elements : doc2Elements; 
     var smallerDocCount = doc1Elements.Count() < doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? doc1Elements.Count() : doc2Elements.Count(); 
     var doc1Larger = doc1Elements.Count() > doc2Elements.Count() && doc1Elements.Count() != doc2Elements.Count() ? true : false; 
     var doc1Arr = doc1Elements.ToArray(); 
     var doc2Arr = doc2Elements.ToArray(); 

     // add in the remaining text for the larger document 
     for (var i = smallerDocCount; i < largerDoc.Count(); i++) 
     { 
      // if doc1 is larger, add doc 1 and null for doc 2 
      if (doc1Larger) 
      { 
       Console.WriteLine("doc1 Text: {0}", doc1Arr[i].Value); 
       differences.Add(new DocumentComparison() { Document1Text = doc1Arr[i].Value, Document2Text = "", Difference = true }); 
      } 
      else if(!doc1Larger) { 
       Console.WriteLine("doc2 Text: {0}", doc2Arr[i].Value); 
       differences.Add(new DocumentComparison() { Document1Text = "", Document2Text = doc2Arr[i].Value, Difference = true }); 
      } 
     }

来源

2015-04-06 user

好吧，这是我想出来的。（这可能不是最高效的，但我相信这是该项目的一个很好的解决方案）。下面是代码（解释如下）：

public static List<DocumentCompare> CompareDocuments(WordprocessingDocument doc1, WordprocessingDocument doc2) 
    { 
     XDocument xDoc1 = doc1.MainDocumentPart.GetXDocument(); 
     XDocument xDoc2 = doc2.MainDocumentPart.GetXDocument(); 

     // these queries return the elements that contain text in the word documents 
     var doc1Elements = xDoc1 
      .Descendants() 
      .Where(e => e.Name != W.commentRangeStart 
         && e.Name != W.commentRangeEnd 
         && e.Name != W.proofErr 
         && !e.Ancestors(W.p).Any()); 
     var doc2Elements = xDoc2 
      .Descendants() 
      .Where(e => e.Name != W.commentRangeStart 
         && e.Name != W.commentRangeEnd 
         && e.Name != W.proofErr 
         && !e.Ancestors(W.p).Any()); 

     List<DocumentCompare> differences = new List<DocumentCompare>(); 

     IEnumerable<bool> correspondingElementEquivalency = doc1Elements.Zip(doc2Elements, (e1, e2) => 
     { 
      bool difference = false; 
      if (e1.Name != e2.Name) 
      { 
       return false; 
      } 

      if (e1.Name == W.p && e2.Name == W.p) 
      { 
       // e1.Name == W.p && 
       if ((GetParagraphText(e1) != GetParagraphText(e2))) 
       { 
        // there is a difference between the documents 
        difference = true; 
       } 

       // record lines 
       differences.Add(new DocumentCompare() { Document1Text = e1.Value, Document2Text = e2.Value, Difference = difference }); 
      } 
      return true; 
     }); 

     // determine if the documents are equivalent 
     bool test = correspondingElementEquivalency.Any(e => e != true); 

     var doc1Values = (from ie1 in doc1Elements where ie1.Name == W.p select ie1.Value).ToList(); 
     var doc2Values = (from ie2 in doc2Elements where ie2.Name == W.p select ie2.Value).ToList(); 

     // determine the larger document to add the remainder of that document to the list 
     var largerDoc = doc1Values.Count() > doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? doc1Values : doc2Values; 
     var smallerDocCount = doc1Values.Count() < doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? doc1Values.Count() : doc2Values.Count(); 
     var doc1Larger = doc1Values.Count() > doc2Values.Count() && doc1Values.Count() != doc2Values.Count() ? true : false; 
     var doc1Arr = doc1Values.ToArray(); 
     var doc2Arr = doc2Values.ToArray(); 

     // add in the remaining text for the larger document 
     for (var i = smallerDocCount; i < largerDoc.Count(); i++) 
     { 
      // if doc1 is larger, add doc 1 and null for doc 2 
      if (doc1Larger) 
      { 
       Console.WriteLine("doc1 Text: {0}", doc1Arr[i]); 
       differences.Add(new DocumentCompare() { Document1Text = doc1Arr[i], Document2Text = "", Difference = true }); 
      } 
      else if(!doc1Larger) { 
       Console.WriteLine("doc2 Text: {0}", doc2Arr[i]); 
       differences.Add(new DocumentCompare() { Document1Text = "", Document2Text = doc2Arr[i], Difference = true }); 
      } 
     } 

     return differences; 
    }

好吧，IEnumerable完成到最小文档结束的位置。然后，我得到大文档的其余部分（从较小文档的末尾开始），并将其附加到列表的末尾。我仍然需要收紧一些代码，但它的工作原理。

来源

2015-04-07 11:53:50 user

比较2个不同长度的Word文档

回答

相关问题