2016-09-24 47 views
0

我用 Groovy 从 XLSX 文件生成了一个 ARFF 文件,但是当我尝试用 Weka 打开这个文件时,出现了下面的错误:标称值未在头部声明(nominal value not declared in header)

File "..." not recognised as an 'Arff data files' file. Reason: nominal value not declared in header, read Token[Ativo], line 16

我无法理解为什么会出现这个错误。有人可以帮我解决这个错误,并解释它为什么会发生吗?

生成的文件

@relation kd-itempedido 
@attribute tipopedido {Assistencia,Recompra,Venda,Troca} 
@attribute aprovado {0.0,1.0} 
@attribute fasepedido {Aprovado,Cancelado,EmAprovacao,Liberado,Novo} 
@attribute statusinternopedido {NegociarPagamento,PedidosDeTeste,AguardandoOcorrencia,Nada,AguardandoBoletoDeposito,PedidoDuplicado,SuspeitaDeFraude} 
@attribute canal {Marketplace,Desktop} 
@attribute origem {LojasAmericanas,Optimise,MercadoLivre,Cityads,Zanox,Zoom,Rakuten,Lomadee,Facebook,Viptarget,Submarino,Criteo,Muccashop,Chaordic,Walmart,Googlead,Nada,Extra,Lojaskd,Shopback,Afilio,Shoptime,Nextperformance,CarrinhoAbandonado,Bing} 
@attribute mercado {S,N} 
@attribute cluster {EntregaImediata,Fiprec,Icconv,Esgotado} 
@attribute statusitem {Ativo} 
@attribute statusproduto {Inativo,Ativo,AtivoSemEstoque,ForaDeLinha} 
@attribute polo {Polo1,Polo3,Polo2} 
@data 
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Fiprec,Ativo,Ativo,Polo2 
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Fiprec,Ativo,Ativo,Polo2 
Venda,0.0,Novo,Nada,Desktop,Googlead,S,Ativo,Inativo,Polo2 
Venda,0.0,Novo,Nada,Desktop,Muccashop,N,Ativo,Ativo,Polo3 

Groovy 脚本(VM 参数:-Dfile.encoding=ASCII UTF8 UTF8)

@Grapes([ 
     @Grab('org.apache.poi:poi:3.10.1'), 
     @Grab('org.apache.poi:poi-ooxml:3.10.1')]) 
import org.apache.poi.xssf.usermodel.XSSFWorkbook 
import java.text.Normalizer 
import static org.apache.poi.ss.usermodel.Cell.* 
import java.nio.file.Paths 

// Conversion settings: output ARFF file, relation name, source workbook and
// the header names (after normalisation) of the columns to export.
def outputFile = "ItemPedido.arff"
def relationName = "kd-itempedido"
def workbookPath = "/home/eric/Documents/development/ufpr/Solid Eric/ItemPedido1000.xlsx"
def wantedColumns = [
    "tipopedido", "aprovado", "fasepedido", "statusinternopedido", "canal", "origem",
    "mercado", "cluster", "statusitem", "statusproduto", "polo"
]

// Constructing the parser runs the whole pipeline: load the workbook, pick the
// attributes, collect the data rows and write the ARFF file.
new XslxToArffParser(workbookPath, relationName, wantedColumns, outputFile)

/**
 * In-memory table of data rows destined for the ARFF "@data" section.
 * Each entry of {@code rows} is a list holding one value per exported column.
 */
class Data {
    def rows = new ArrayList<List>();

    /**
     * Renders every row as one comma-separated line terminated by "\n".
     *
     * @return the concatenated lines, or an empty string when there are no rows
     */
    @Override
    String toString() {
        def out = new StringBuilder()
        for (row in rows) {
            // join() places separators by position. The earlier indexOf()-based
            // check emitted a trailing comma whenever the last value of a row
            // also occurred earlier in that row (e.g. [A, B, A] -> "A,B,A,").
            out.append(row.join(",")).append("\n")
        }
        return out.toString()
    }
}



/**
 * One nominal ARFF attribute: its name, the set of distinct values seen for it
 * and the index of the spreadsheet column it was read from.
 */
class Atributo {
    /** Attribute name written after "@attribute". */
    def descricao;
    /** Distinct values collected for this attribute (iteration order is that of the HashSet). */
    def possibilidades = new HashSet<Object>();
    /** Column index of the attribute in the source sheet. */
    def index;

    /**
     * Builds the ARFF header line for this attribute.
     *
     * @return "@attribute <name> {v1,v2,...}" followed by "\n"
     */
    @Override
    String toString() {
        // A single pass over the set; the previous possibilidades[i] lookup
        // re-walked the HashSet from the start for every index.
        return "@attribute " + descricao + " {" + possibilidades.join(",") + "}" + "\n"
    }
}

/**
 * Converts the first sheet of an XLSX workbook into an ARFF file.
 *
 * The first row of the sheet is treated as the header. Only columns whose
 * normalised header text appears in the requested column list are exported,
 * each as a nominal attribute whose value set is collected from the data rows.
 * The whole conversion runs from the constructor.
 */
class XslxToArffParser {
    /** Marker written for a cell that has no value (ARFF missing-value token). */
    private static final String MISSING_VALUE = "?"

    /** Selected attributes keyed by sheet column index, in header iteration order. */
    def attributes = [:];
    /** Rows collected for the "@data" section. */
    def data = new Data();
    /** First sheet of the loaded workbook. */
    def sheet = null;

    /**
     * Loads the workbook and writes the ARFF file.
     *
     * @param path     location of the XLSX workbook
     * @param relation name written after "@relation"
     * @param columns  normalised (lower-case, ASCII, alphanumeric) header names to export
     * @param arffPath location of the ARFF file to write
     */
    XslxToArffParser(path, relation, columns, arffPath) {
        load(path)
        getAttributes(columns)
        collectData()
        saveArff(relation, arffPath)
    }

    /**
     * Normalises free text into a token made only of ASCII letters and digits:
     * diacritics are stripped, the text is split on non-word characters, each
     * piece is lower-cased and capitalised, and the pieces are concatenated.
     *
     * @param s text to normalise
     * @return the normalised token, or "Nada" when nothing is left
     */
    String parse(String s) {
        s = Normalizer.normalize(s, Normalizer.Form.NFD)
        s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}]", "")
        s = s.split(/[^\w]/).collect { it.toLowerCase().capitalize() }.join("")
        // \w still admits "_"; drop anything that is not a plain letter or digit.
        s = s.replaceAll("[^A-Za-z0-9]", "")
        return s.isEmpty() ? "Nada" : s
    }

    /**
     * Opens the workbook at {@code path} and keeps a reference to its first sheet.
     */
    def load(path) {
        Paths.get(path).withInputStream { input ->
            def workbook = new XSSFWorkbook(input)
            sheet = workbook.getSheetAt(0)
        }
    }

    /**
     * Scans the header row (row 0) and registers an {@code Atributo} for every
     * cell whose normalised, lower-cased text is contained in {@code columns}.
     */
    def getAttributes(columns) {
        for (cell in sheet.getRow(0).cellIterator()) {
            def index = cell.columnIndex
            def description = parse(cell.stringCellValue).toLowerCase()
            if (columns.contains(description)) {
                attributes << [(index): new Atributo(descricao: description, index: index)]
            }
        }
    }

    /**
     * Reads every row after the header and appends one entry per selected
     * attribute to {@code data}, also recording each value in the attribute's
     * value set.
     *
     * Cells are looked up by column index rather than through
     * {@code row.cellIterator()}: the iterator skips cells that are not defined,
     * which made rows with an empty cell shorter and shifted the remaining
     * values onto the wrong attributes. Absent or blank cells are written as
     * the missing-value marker and are not added to the attribute's value set.
     */
    def collectData() {
        for (row in sheet.rowIterator()) {
            // Row 0 is the header consumed by getAttributes().
            if (row.rowNum == 0) {
                continue
            }
            def values = []
            for (attr in attributes.values()) {
                def cell = row.getCell(attr.index)
                if (cell == null || cell.cellType == CELL_TYPE_BLANK) {
                    values << MISSING_VALUE
                    continue
                }
                def value = cell.cellType == CELL_TYPE_STRING ? parse(cell.stringCellValue) : cell.numericCellValue
                attr.possibilidades.add(value)
                values << value
            }
            data.rows.add(values)
        }
    }

    /**
     * Writes the "@relation" line, one "@attribute" line per selected
     * attribute, the "@data" marker and all collected rows to {@code path}.
     */
    def saveArff(relation, path) {
        Paths.get(path).withWriter { writer ->
            writer.write "@relation " + relation
            writer.write "\n"
            for (a in attributes.values()) {
                writer.write a.toString()
            }
            writer.write "@data"
            writer.write "\n"
            writer.write data.toString()
        }
    }
}

已解决。“row.cellIterator()”不会遍历为 null 或空白的单元格。

回答

0

我已经有一段时间没用 Weka 了,不过从你给出的文件和错误消息来看,我怀疑问题出在文件的最后两行数据上:它们缺少属性“cluster”的值。

在 S 或 N(属性“mercado”的值)之后,这两行紧接着就是“Ativo”。而“Ativo”并没有被声明为标称属性 cluster 的可能取值之一。Weka 读到了“Ativo”(这就是错误消息中出现“read Token[Ativo]”的原因),但此时它期望读到的是 cluster 属性的值,而不是 statusitem 属性的值。