2017-07-04 94 views
0

我正在使用Microsoft DocumentFormat.OpenXml SDK从Excel文件中读取数据。 虽然这样做,我正在考虑如果一个单元格有空白值(如果是,也请阅读)。使用Microsoft DocumentFormat.OpenXml SDK读取c#中的excel文件

现在,面临着其中workSheet.SheetDimension为空的Excel表之一的问题,因此代码抛出异常。使用

代码:

类OpenXMLHelper { //一个辅助函数来打开使用的OpenXML Excel文件,并且从一个 工作表//返回包含所有的数据的数据表。 // //我们在使用OLEDB读取Excel数据时遇到了很多问题(例如,ACE驱动程序不再存在于新服务器上, // OLEDB由于安全问题而不工作,并且公然忽略空白行工作表顶部),所以这是一个更稳定的数据读取方法。 //

public static DataTable ExcelWorksheetToDataTable(string pathFilename) 
    { 
     try 
     { 
      DataTable dt = new DataTable(); 
      string dimensions = string.Empty; 

      using (FileStream fs = new FileStream(pathFilename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) 
      { 
       using (SpreadsheetDocument document = SpreadsheetDocument.Open(fs, false)) 
       { 
        // Find the sheet with the supplied name, and then use that 
        // Sheet object to retrieve a reference to the first worksheet. 
        //Sheet theSheet = document.WorkbookPart.Workbook.Descendants<Sheet>().Where(s => s.Name == worksheetName).FirstOrDefault(); 
        //--Sheet theSheet = document.WorkbookPart.Workbook.Descendants<Sheet>().FirstOrDefault(); 

        //--if (theSheet == null) 
        //-- throw new Exception("Couldn't find the worksheet: "+ theSheet.Id); 

        // Retrieve a reference to the worksheet part. 
        //WorksheetPart wsPart = (WorksheetPart)(document.WorkbookPart.GetPartById(theSheet.Id)); 
        //--WorksheetPart wsPart = (WorksheetPart)(document.WorkbookPart.GetPartById(theSheet.Id)); 

        WorkbookPart workbookPart = document.WorkbookPart; 
        WorksheetPart wsPart = workbookPart.WorksheetParts.FirstOrDefault(); 
        Worksheet workSheet = wsPart.Worksheet; 

        dimensions = workSheet.SheetDimension.Reference.InnerText;  // Get the dimensions of this worksheet, eg "B2:F4" 

        int numOfColumns = 0; 
        int numOfRows = 0; 
        CalculateDataTableSize(dimensions, ref numOfColumns, ref numOfRows); 
        //System.Diagnostics.Trace.WriteLine(string.Format("The worksheet \"{0}\" has dimensions \"{1}\", so we need a DataTable of size {2}x{3}.", worksheetName, dimensions, numOfColumns, numOfRows)); 

        SheetData sheetData = workSheet.GetFirstChild<SheetData>(); 
        IEnumerable<Row> rows = sheetData.Descendants<Row>(); 

        string[,] cellValues = new string[numOfColumns, numOfRows]; 

        int colInx = 0; 
        int rowInx = 0; 
        string value = ""; 
        SharedStringTablePart stringTablePart = document.WorkbookPart.SharedStringTablePart; 

        // Iterate through each row of OpenXML data, and store each cell's value in the appropriate slot in our [,] string array. 
        foreach (Row row in rows) 
        { 
         for (int i = 0; i < row.Descendants<Cell>().Count(); i++) 
         { 
          // *DON'T* assume there's going to be one XML element for each column in each row... 
          Cell cell = row.Descendants<Cell>().ElementAt(i); 
          if (cell.CellValue == null || cell.CellReference == null) 
           continue;      // eg when an Excel cell contains a blank string 

          // Convert this Excel cell's CellAddress into a 0-based offset into our array (eg "G13" -> [6, 12]) 
          colInx = GetColumnIndexByName(cell.CellReference);    // eg "C" -> 2 (0-based) 
          rowInx = GetRowIndexFromCellAddress(cell.CellReference) - 1;  // Needs to be 0-based 

          // Fetch the value in this cell 
          value = cell.CellValue.InnerXml; 
          if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString) 
          { 
           value = stringTablePart.SharedStringTable.ChildElements[Int32.Parse(value)].InnerText; 
          } 

          cellValues[colInx, rowInx] = value; 
         } 
        } 

        // Copy the array of strings into a DataTable. 
        // We don't (currently) make any attempt to work out which columns should be numeric, rather than string. 
        for (int col = 0; col < numOfColumns; col++) 
        { 
         //dt.Columns.Add("Column_" + col.ToString()); 
         dt.Columns.Add(cellValues[col, 0]); 
        } 

        //foreach (Cell cell in rows.ElementAt(0)) 
        //{ 
        // dt.Columns.Add(GetCellValue(doc, cell)); 
        //} 


        for (int row = 0; row < numOfRows; row++) 
        { 
         DataRow dataRow = dt.NewRow(); 
         for (int col = 0; col < numOfColumns; col++) 
         { 
          dataRow.SetField(col, cellValues[col, row]); 
         } 
         dt.Rows.Add(dataRow); 
        } 

        dt.Rows.RemoveAt(0); 
        //#if DEBUG 
        //    // Write out the contents of our DataTable to the Output window (for debugging) 
        //    string str = ""; 
        //    for (rowInx = 0; rowInx < maxNumOfRows; rowInx++) 
        //    { 
        //     for (colInx = 0; colInx < maxNumOfColumns; colInx++) 
        //     { 
        //      object val = dt.Rows[rowInx].ItemArray[colInx]; 
        //      str += (val == null) ? "" : val.ToString(); 
        //      str += "\t"; 
        //     } 
        //     str += "\n"; 
        //    } 
        //    System.Diagnostics.Trace.WriteLine(str); 
        //#endif 
        return dt; 
       } 
      } 
     } 
     catch (Exception ex) 
     { 
      return null; 
     } 

    } 

    public static void CalculateDataTableSize(string dimensions, ref int numOfColumns, ref int numOfRows) 
    { 
     // How many columns & rows of data does this Worksheet contain ? 
     // We'll read in the Dimensions string from the Excel file, and calculate the size based on that. 
     //  eg "B1:F4" -> we'll need 6 columns and 4 rows. 
     // 
     // (We deliberately ignore the top-left cell address, and just use the bottom-right cell address.) 
     try 
     { 
      string[] parts = dimensions.Split(':');  // eg "B1:F4" 
      if (parts.Length != 2) 
       throw new Exception("Couldn't find exactly *two* CellAddresses in the dimension"); 

      numOfColumns = 1 + GetColumnIndexByName(parts[1]);  // A=1, B=2, C=3 (1-based value), so F4 would return 6 columns 
      numOfRows = GetRowIndexFromCellAddress(parts[1]); 
     } 
     catch 
     { 
      throw new Exception("Could not calculate maximum DataTable size from the worksheet dimension: " + dimensions); 
     } 
    } 

    public static int GetRowIndexFromCellAddress(string cellAddress) 
    { 
     // Convert an Excel CellReference column into a 1-based row index 
     // eg "D42" -> 42 
     //  "F123" -> 123 
     string rowNumber = System.Text.RegularExpressions.Regex.Replace(cellAddress, "[^0-9 _]", ""); 
     return int.Parse(rowNumber); 
    } 

    public static int GetColumnIndexByName(string cellAddress) 
    { 
     // Convert an Excel CellReference column into a 0-based column index 
     // eg "D42" -> 3 
     //  "F123" -> 5 
     var columnName = System.Text.RegularExpressions.Regex.Replace(cellAddress, "[^A-Z_]", ""); 
     int number = 0, pow = 1; 
     for (int i = columnName.Length - 1; i >= 0; i--) 
     { 
      number += (columnName[i] - 'A' + 1) * pow; 
      pow *= 26; 
     } 
     return number - 1; 
    } 
}[enter image description here][1] 

回答

0

SheetDimension部分是可选的(并且为此你不能总是依赖于它是最新的)。看到的OpenXML说明书的以下部分:

18.3.1.35尺寸(工作单尺寸)

该元素指定的工作表的使用范围。它指定工作表中 已用单元格的行和列边界。 这是可选的,不是必需的。 已使用的单元格包括具有公式,文本内容和单元格格式的单元格。当整列被格式化时,只有 该列中的第一个单元被认为被使用。

因此,没有任何SheetDimension部分的Excel文件是完全有效的,所以您不应该依赖它存在于Excel文件中。

为此我建议简单地解析包含在SheetData部分的所有元素,和“算”的行数(而不是阅读SheetDimensions部分获得的行/列数)。这样您还可以考虑到Excel文件可能包含数据之间的完全空白行。

+0

只分析行不会获取所需的结果。 Issue: 行1:5单元格(全部具有值) 行2:6单元格(单元格1,2为空白)。 所以发生了什么是放置数据时,第1行将被打印,但第2行有2个空白单元格,在插入到数据表中时会向左移动。正因为如此,我考虑了SheetDimension。 任何建议如何解决。 代码片段可以更清晰。 –