2013-03-25 68 views
8

我一直在解析这样的XML很多年,我不得不承认,当不同元素的数量变得更大时,我发现它有点无聊和耗尽,这里是我的意思,样本虚拟XML:更好的解析xml的方法

<?xml version="1.0"?> 
<Order> 
    <Date>2003/07/04</Date> 
    <CustomerId>123</CustomerId> 
    <CustomerName>Acme Alpha</CustomerName> 
    <Item> 
     <ItemId> 987</ItemId> 
     <ItemName>Coupler</ItemName> 
     <Quantity>5</Quantity> 
    </Item> 
    <Item> 
     <ItemId>654</ItemId> 
     <ItemName>Connector</ItemName> 
     <Quantity unit="12">3</Quantity> 
    </Item> 
    <Item> 
     <ItemId>579</ItemId> 
     <ItemName>Clasp</ItemName> 
     <Quantity>1</Quantity> 
    </Item> 
</Order> 

这是(使用SAX)相关部分:

/**
 * SAX handler that fills an {@code Order} from the sample order document.
 *
 * <p>The text of the {@code Date} and {@code CustomerId} elements is collected
 * in a buffer and only interpreted in {@link #endElement}, because a SAX parser
 * is allowed to deliver the text of one element through several
 * {@link #characters} calls. The state flags are cleared again when the
 * corresponding element ends, so whitespace between elements is never
 * mistaken for a date or a customer id.
 *
 * <p>Element names are matched case-insensitively on {@code localName}, as in
 * the original handler.
 */
public class SaxParser extends DefaultHandler {

    /** True while the parser is inside an {@code Item} element. */
    boolean isItem = false;
    /** Not read or written by this class; kept for compatibility. */
    boolean isOrder = false;
    /** True while the parser is inside a {@code Date} element. */
    boolean isDate = false;
    /** True while the parser is inside a {@code CustomerId} element. */
    boolean isCustomerId = false;
    private Order order;
    private Item item;
    /** Text collected so far for the Date/CustomerId element being read. */
    private final StringBuilder text = new StringBuilder();

    /**
     * Records which element has been entered and prepares the state for it.
     *
     * @param namespaceURI namespace of the element (unused)
     * @param localName    local element name, compared case-insensitively
     * @param qName        qualified element name (unused)
     * @param atts         element attributes (unused)
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
        if (localName.equalsIgnoreCase("ORDER")) {
            order = new Order();
        }

        if (localName.equalsIgnoreCase("DATE")) {
            isDate = true;
            text.setLength(0);
        }

        if (localName.equalsIgnoreCase("CUSTOMERID")) {
            isCustomerId = true;
            text.setLength(0);
        }

        if (localName.equalsIgnoreCase("ITEM")) {
            item = new Item();
            isItem = true;
        }
    }

    /**
     * Buffers character data while inside an element whose text is needed.
     * Nothing is parsed here: the chunk may be only part of the element text.
     */
    @Override
    public void characters(char ch[], int start, int length) throws SAXException {
        if (isDate || isCustomerId) {
            text.append(ch, start, length);
        }
    }

    /**
     * Interprets the buffered text of the element that just ended and clears
     * the matching state flag.
     *
     * @throws NumberFormatException if the customer id is not an integer
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        if (isDate && localName.equalsIgnoreCase("DATE")) {
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
            try {
                order.setDate(formatter.parse(text.toString().trim()));
            } catch (ParseException e) {
                // Best effort, as in the original: report and leave the date unset.
                e.printStackTrace();
            }
            isDate = false;
        }

        if (isCustomerId && localName.equalsIgnoreCase("CUSTOMERID")) {
            order.setCustomerId(Integer.valueOf(text.toString().trim()));
            isCustomerId = false;
        }

        if (localName.equalsIgnoreCase("ITEM")) {
            isItem = false;
        }
    }

}

我想知道有没有办法摆脱这些可怕的布尔值——它们会随着元素数量的增加而不断增多。一定有更好的方法来解析这种相对简单的XML。光是看完成这项任务所需的代码行数,就觉得很难看。

目前我正在使用SAX解析器,但我愿意接受任何其他建议(DOM除外:我的XML文件很大,负担不起把整个文档载入内存的解析器)。

+4

你可以试试StAX – 2013-03-25 23:33:04

+0

如果你有一个用来生成XML的具体(concrete)数据模型,我会看看XStream(http://xstream.codehaus.org/)。它在把数据序列化为XML以及从XML反序列化回来这两方面都做得非常好。 – 2013-03-25 23:43:49

+1

关于主题,我喜欢从XSD开始并使用XmlBeans。稍微OT,XML标签应该是区分大小写的,这个代码打破了这一点。 – 2013-03-26 00:17:10

回答

5

下面是使用JAXB和StAX的示例。

输入文档:

<?xml version="1.0" encoding="UTF-8"?> 
<Personlist xmlns="http://example.org"> 
    <Person> 
     <Name>Name 1</Name> 
     <Address> 
      <StreetAddress>Somestreet</StreetAddress> 
      <PostalCode>00001</PostalCode> 
      <CountryName>Finland</CountryName> 
     </Address> 
    </Person> 
    <Person> 
     <Name>Name 2</Name> 
     <Address> 
      <StreetAddress>Someotherstreet</StreetAddress> 
      <PostalCode>43400</PostalCode> 
      <CountryName>Sweden</CountryName> 
     </Address> 
    </Person> 
</Personlist> 

Person.java:

/**
 * JAXB-annotated class for one {@code Person} element in the
 * {@code http://example.org} namespace.
 *
 * <p>The mapping is declared on the private fields; only getters are exposed,
 * so instances are read-only for callers. (Presumably JAXB fills the fields
 * reflectively during unmarshalling; no setter or constructor does so here.)
 */
@XmlRootElement(name = "Person", namespace = "http://example.org") 
public class Person { 
    // Mapped to the <Name> child element.
    @XmlElement(name = "Name", namespace = "http://example.org") 
    private String name; 
    // Mapped to the nested <Address> child element.
    @XmlElement(name = "Address", namespace = "http://example.org") 
    private Address address; 

    /** Returns the value bound to the {@code Name} element. */
    public String getName() { 
     return name; 
    } 

    /** Returns the object bound to the {@code Address} element. */
    public Address getAddress() { 
     return address; 
    } 
} 

Address.java:

/**
 * JAXB-annotated class for the {@code Address} element of a person, in the
 * {@code http://example.org} namespace.
 *
 * <p>Each private field is mapped to one child element; only getters are
 * exposed.
 */
public class Address { 
    // Mapped to the <StreetAddress> child element.
    @XmlElement(name = "StreetAddress", namespace = "http://example.org") 
    private String streetAddress; 
    // Mapped to the <PostalCode> child element; kept as a String (the sample value is "00001").
    @XmlElement(name = "PostalCode", namespace = "http://example.org") 
    private String postalCode; 
    // Mapped to the <CountryName> child element.
    @XmlElement(name = "CountryName", namespace = "http://example.org") 
    private String countryName; 

    /** Returns the value bound to the {@code StreetAddress} element. */
    public String getStreetAddress() { 
     return streetAddress; 
    } 

    /** Returns the value bound to the {@code PostalCode} element. */
    public String getPostalCode() { 
     return postalCode; 
    } 

    /** Returns the value bound to the {@code CountryName} element. */
    public String getCountryName() { 
     return countryName; 
    } 
} 

PersonlistProcessor.java:

/**
 * Streams a {@code Personlist} document with StAX and unmarshals one
 * {@code Person} at a time with JAXB, so only a single person is held in
 * memory at any moment.
 */
public class PersonlistProcessor {
    /** Namespace of every element in the person list document. */
    private static final String NAMESPACE = "http://example.org";

    /**
     * Processes the {@code personlist.xml} resource located next to this class.
     *
     * @throws IllegalStateException if the resource cannot be found
     */
    public static void main(String[] args) throws Exception {
        try (InputStream inputStream = PersonlistProcessor.class
                .getResourceAsStream("personlist.xml")) {
            if (inputStream == null) {
                throw new IllegalStateException("Resource personlist.xml not found on the classpath");
            }
            new PersonlistProcessor().processPersonlist(inputStream);
        }
    }

    // TODO: Instead of throws Exception, all exceptions should be wrapped
    // inside runtime exception
    /**
     * Reads the person list from the given stream and hands every person to
     * {@link #processPerson(Person)}.
     *
     * @param inputStream XML document whose root is {@code Personlist}; it is
     *                    not closed by this method
     * @throws Exception if the document is not well-formed, does not have the
     *                   expected structure, or cannot be unmarshalled
     */
    public void processPersonlist(InputStream inputStream) throws Exception {
        JAXBContext jaxbContext = JAXBContext.newInstance(Person.class);
        Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
        XMLStreamReader xss = XMLInputFactory.newFactory().createXMLStreamReader(inputStream);
        try {
            // Move to the root element and make sure it is Personlist.
            xss.nextTag();
            xss.require(XMLStreamReader.START_ELEMENT, NAMESPACE, "Personlist");
            // Move to the first Person, or to </Personlist> if the list is empty.
            xss.nextTag();
            while (xss.isStartElement()) {
                xss.require(XMLStreamReader.START_ELEMENT, NAMESPACE, "Person");
                Person person = (Person) unmarshaller.unmarshal(xss);
                processPerson(person);
                // unmarshal() leaves the reader on the event right after
                // </Person>. That may already be the next <Person> (or
                // </Personlist>) when there is no whitespace in between, so
                // only advance if the reader is not on a tag yet.
                if (!xss.isStartElement() && !xss.isEndElement()) {
                    xss.nextTag();
                }
            }
            xss.require(XMLStreamReader.END_ELEMENT, NAMESPACE, "Personlist");
        } finally {
            // Releases the reader's resources; does not close inputStream.
            xss.close();
        }
    }

    /** Prints the person's name and the country of the person's address. */
    private void processPerson(Person person) {
        System.out.println(person.getName());
        System.out.println(person.getAddress().getCountryName());
    }
}
0

在SAX中,解析器把事件“推送”给您的处理程序,因此所有的状态管理(housekeeping)工作都得像您现在这样自己完成。另一种选择是StAX(javax.xml.stream包),它仍然是流式处理,但由您的代码负责从解析器中“拉取”事件。这样一来,“预期哪些元素、按什么顺序出现”的逻辑就体现在程序的控制流中,而不必用布尔值显式表示。

根据XML的精确结构,可能存在一个使用工具包(如XOM)的“中间路径”,该工具包具有将文档的子树解析为类似DOM的对象模型的操作模式,然后把它扔掉,解析下一个。这对于具有许多类似元素的重复性文档是很好的,每个元素都可以独立处理 - 您可以轻松地在每个树枝中编写基于树的API,但仍具有流式行为,可以让您高效地解析大文档。

/**
 * Node factory that handles each {@code Item} element as soon as it has been
 * built and then discards it, so processed items are not kept in the tree.
 */
public class ItemProcessor extends NodeFactory {
    private Nodes emptyNodes = new Nodes();

    /**
     * Prints "ItemId: ItemName" for {@code Item} elements and returns an empty
     * node list for them; every other element is delegated to the superclass.
     */
    public Nodes finishMakingElement(Element elt) {
        boolean isItem = "Item".equals(elt.getLocalName());
        if (!isItem) {
            return super.finishMakingElement(elt);
        }

        // Process the finished Item element.
        String itemId = elt.getFirstChildElement("ItemId").getValue();
        String itemName = elt.getFirstChildElement("ItemName").getValue();
        System.out.println(itemId + ": " + itemName);

        // Returning no nodes throws the element away.
        return emptyNodes;
    }
}

您也可以达到类似的事情:使用StAX和JAXB的组合 - 定义JAXB注释类代表您的重复元素(在这个例子中项目),然后创建一个StAX的解析器,导航到第一Item开始标签,然后你可以从XMLStreamReader一次解组一个完整的Item

-1
import java.io.File; 
import java.io.FileOutputStream; 
import java.io.InputStream; 
import java.io.OutputStream; 
import java.util.ArrayList; 
import javax.xml.parsers.DocumentBuilder; 
import javax.xml.parsers.DocumentBuilderFactory; 
import javax.xml.transform.Transformer; 
import javax.xml.transform.TransformerFactory; 
import javax.xml.transform.dom.DOMSource; 
import javax.xml.transform.stream.StreamResult; 
import javax.xml.xpath.XPath; 
import javax.xml.xpath.XPathConstants; 
import javax.xml.xpath.XPathExpression; 
import javax.xml.xpath.XPathFactory; 
import org.w3c.dom.Document; 
import org.w3c.dom.NodeList; 

/**
 * Small DOM/XPath helper that selects node values from an XML file and
 * updates nodes in place, writing the result back to the same file.
 *
 * <p>The file is re-read on every query, so each call sees the current
 * content on disk. The whole document is loaded into memory (DOM).
 *
 * <p>NOTE(review): the parser is created with default settings; if the file
 * can come from an untrusted source, DTDs/external entities should be
 * disabled on the factory.
 */
public class JXML {
    private DocumentBuilder builder;
    private Document doc = null;
    private DocumentBuilderFactory factory;
    private XPathExpression expr = null;
    private XPathFactory xFactory;
    private XPath xpath;
    private String xmlFile;
    /** Not used by this class; kept because it is part of the public surface. */
    public static ArrayList<String> XMLVALUE;

    /**
     * @param xmlFile file system path of the XML file to query and update
     */
    public JXML(String xmlFile) {
        this.xmlFile = xmlFile;
    }

    /**
     * (Re)creates the XPath engine and loads the document from disk.
     *
     * @return {@code true} if the document was loaded; {@code false} if
     *         loading failed (the error is printed and {@code doc} is null)
     */
    private boolean xmlFileSettings() {
        doc = null;
        try {
            factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);
            xFactory = XPathFactory.newInstance();
            xpath = xFactory.newXPath();
            builder = factory.newDocumentBuilder();
            // Parse through File so the path is interpreted the same way as
            // when the document is written back in updateQuery (a plain
            // String argument would be treated as a URI).
            doc = builder.parse(new File(xmlFile));
            return true;
        } catch (Exception e) {
            System.out.println(e);
            return false;
        }
    }

    /**
     * Evaluates an XPath expression and returns the node value of every
     * selected node.
     *
     * @param query XPath expression selecting a node set
     * @return the node values in document order; an empty array if the file
     *         cannot be loaded or the query is invalid. Entries are
     *         {@code null} for nodes without a node value (e.g. elements).
     */
    public String[] selectQuery(String query) {
        ArrayList<String> records = new ArrayList<String>();
        if (!xmlFileSettings()) {
            // The load error has already been reported; it is not a query error.
            return new String[0];
        }
        try {
            expr = xpath.compile(query);
            NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
            for (int i = 0; i < nodes.getLength(); i++) {
                records.add(nodes.item(i).getNodeValue());
            }
        } catch (Exception e) {
            System.out.println("There is error in query string");
        }
        return records.toArray(new String[records.size()]);
    }

    /**
     * Sets the text content of every node selected by the XPath expression
     * and writes the document back to the file.
     *
     * @param query XPath expression selecting the nodes to change
     * @param value new text content
     * @return {@code true} if the file was rewritten; {@code false} if the
     *         file could not be loaded, the query failed, or writing failed
     */
    public boolean updateQuery(String query, String value) {
        if (!xmlFileSettings()) {
            // Never write back when the document could not be loaded:
            // that would replace the file with an empty result.
            return false;
        }
        try {
            NodeList nodes = (NodeList) xpath.evaluate(query, doc, XPathConstants.NODESET);
            for (int idx = 0; idx < nodes.getLength(); idx++) {
                nodes.item(idx).setTextContent(value);
            }
            Transformer xformer = TransformerFactory.newInstance().newTransformer();
            xformer.transform(new DOMSource(doc), new StreamResult(new File(this.xmlFile)));
            return true;
        } catch (Exception e) {
            System.out.println(e);
            return false;
        }
    }

    /** Demo: updates the customer id and prints the text of all item children. */
    public static void main(String args[]) {
        JXML jxml = new JXML("c://user.xml");
        jxml.updateQuery("//Order/CustomerId/text()", "222");
        String result[] = jxml.selectQuery("//Order/Item/*/text()");
        for (int i = 0; i < result.length; i++) {
            System.out.println(result[i]);
        }
    }

}

+0

该OP明确表示,他们并不想使用DOM(或者涉及将整个文档解析为内存中的树结构的任何其他模型) – 2013-03-25 23:58:55

6

如果你控制了XML的定义,你可以使用XML绑定工具,例如JAXB(Java体系XML绑定)。在JAXB你可以定义一个架构XML结构(支持XSD和其他语言)或注释Java类以定义序列化规则。一旦在XML和Java之间有明确的声明性映射,对XML进行编组和解组就变得微不足道了。

使用JAXB确实需要比SAX处理程序更多的内存,但存在按部分处理XML文档的方法:Dealing with large documents

JAXB page from Oracle

0

我一直在使用XStream把自己的对象序列化成XML,然后再加载回来成为Java对象。如果您可以把所有东西(everything)都表示为POJO,并且正确地给POJO加上注解以匹配XML文件中的类型,您可能会发现它更易于使用。

当一个字符串表示XML对象,你可以这样写:

Order theOrder = (Order)xstream.fromXML(xmlString);

我一直用它的对象加载到内存在一个单一的线,但如果你需要流呢并按照步骤进行处理,您应该可以使用HierarchicalStreamReader来遍历文档。这可能与@Dave建议的简单类似。

0

正如其他人所建议的那样,StAX模型是一种更好的方法,可以最大限度地减少内存占用,因为它是基于拉取(pull)的流式模型。我个人使用过Axiom(Apache Axis中使用的对象模型),并用XPath表达式来解析元素,这比像您提供的代码片段那样逐个处理节点元素要简洁得多。

0

还有另一个支持更紧凑的XML解析的库:RTXML。该库及其文档在rasmustorkel.com。我用它实现了对原问题中那个文件的解析,下面是完整的程序:

package for_so; 

import java.io.File; 
import java.util.ArrayList; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 

import rasmus_torkel.xml_basic.read.TagNode; 
import rasmus_torkel.xml_basic.read.XmlReadOptions; 
import rasmus_torkel.xml_basic.read.impl.XmlReader; 

/**
 * Reads the order document from the question with the RTXML {@code TagNode}
 * API. Each nested class builds itself from the tag node of its element by
 * consuming the children in document order.
 */
public class Q15626686_ReadOrder
{
    /** Accepts dates of the form {@code yyyy/MM/dd}; groups are year, month, day. */
    public static final Pattern DATE_PATTERN = Pattern.compile("^(\\d\\d\\d\\d)\\/(\\d\\d)\\/(\\d\\d)$");

    /** The whole order: date, customer data and the list of items. */
    public static class Order
    {
        public final Date            _date;
        public final int             _customerId;
        public final String          _customerName;
        public final ArrayList<Item> _itemAl;

        /**
         * Consumes Date, CustomerId, CustomerName and then every Item child
         * of {@code node}, and finally checks that no child is left over.
         */
        public
        Order(TagNode node)
        {
            _date = (Date)node.nextStringMappedFieldE("Date", Date.class);
            _customerId = (int)node.nextIntFieldE("CustomerId");
            _customerName = node.nextTextFieldE("CustomerName");
            _itemAl = new ArrayList<Item>();

            // nextChildN yields null once there is no further Item child.
            TagNode itemNode;
            while ((itemNode = node.nextChildN("Item")) != null)
            {
                _itemAl.add(new Item(itemNode));
            }
            node.verifyNoMoreChildren();
        }
    }

    /** A date split into year, month and day; the original text is kept too. */
    public static class Date
    {
        public final String _dateString;
        public final int    _year;
        public final int    _month;
        public final int    _day;

        /**
         * @throws RuntimeException if {@code dateString} does not match
         *         {@link #DATE_PATTERN}
         */
        public
        Date(String dateString)
        {
            _dateString = dateString;
            Matcher parts = DATE_PATTERN.matcher(dateString);
            boolean wellFormed = parts.matches();
            if (!wellFormed)
            {
                throw new RuntimeException(dateString + " does not match pattern " + DATE_PATTERN.pattern());
            }
            _year = Integer.parseInt(parts.group(1));
            _month = Integer.parseInt(parts.group(2));
            _day = Integer.parseInt(parts.group(3));
        }
    }

    /** One order line: id, name and quantity. */
    public static class Item
    {
        public final int      _itemId;
        public final String   _itemName;
        public final Quantity _quantity;

        /** Consumes ItemId, ItemName and Quantity and rejects any further child. */
        public
        Item(TagNode node)
        {
            _itemId = node.nextIntFieldE("ItemId");
            _itemName = node.nextTextFieldE("ItemName");
            TagNode quantityNode = node.nextChildE("Quantity");
            _quantity = new Quantity(quantityNode);
            node.verifyNoMoreChildren();
        }
    }

    /** Quantity of an item: the number of units and the size of one unit. */
    public static class Quantity
    {
        public final int _unitSize;
        public final int _unitQuantity;

        /** Reads the {@code unit} attribute (default 1) and the integer content. */
        public
        Quantity(TagNode node)
        {
            _unitSize = node.attributeIntD("unit", 1);
            _unitQuantity = node.onlyInt();
        }
    }

    /**
     * Reads the order file named by the first argument and prints a summary.
     */
    public static void
    main(String[] args)
    {
        TagNode orderNode = XmlReader.xmlFileToRoot(new File(args[0]), "Order", XmlReadOptions.DEFAULT);
        Order order = new Order(orderNode);
        String customer = order._customerName;
        int itemCount = order._itemAl.size();
        System.out.println("Read order for " + customer + " which has " + itemCount + " items");
    }
}

你会发现,这些检索函数的名称以N、E或D结尾。这个后缀表示当所需的数据项不存在时该怎么办:N表示返回null,E表示抛出异常,D表示使用默认值(Default)。

0

一种无需使用外部包、甚至无需XPath的解决方案:使用一个枚举“PARSE_MODE”,并且可以再结合一个Stack<PARSE_MODE>。

1)基本解决方案:

a)字段

// Mode of the element currently being parsed; __UNDEFINED__ while outside every tracked element.
private PARSE_MODE parseMode = PARSE_MODE.__UNDEFINED__; 
// NB: essential that all these enum values are upper case, but this is the convention anyway 
// (element names are upper-cased before they are compared with these names).
private enum PARSE_MODE { 
    __UNDEFINED__, ORDER, DATE, CUSTOMERID, ITEM }; 
// Names of all PARSE_MODE values, used to test whether an element name is a tracked mode.
private List<String> parseModeStrings = new ArrayList<String>(); 
// Modes of the tracked elements that are currently open, innermost on top.
private Stack<PARSE_MODE> modeBreadcrumbs = new Stack<PARSE_MODE>(); 

b)填充你的List<String>,可以放在构造函数(constructor)中:

// Copy the name of every PARSE_MODE constant into parseModeStrings.
for(PARSE_MODE pm : PARSE_MODE.values()){ 
     // might want to check here that these are indeed upper case 
     parseModeStrings.add(pm.name()); 
    } 

c)startElement 和 endElement:

/**
 * Switches to the parse mode named after the element, if the upper-cased
 * local name is one of the known mode names, and records it on the
 * breadcrumb stack. Elements with any other name are ignored.
 */
@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
    final String upperName = localName.toUpperCase();
    // Pushing "__UNDEFINED__" would corrupt the mode tracking; it is an
    // unlikely element name, so this is only guarded by an assertion.
    assert ! upperName.equals("__UNDEFINED__");

    if (!parseModeStrings.contains(upperName)) {
        // Not a tracked element: its start is typically ignored.
        return;
    }

    parseMode = PARSE_MODE.valueOf(upperName);
    // Any checks on which mode may follow which other mode belong here;
    // e.g. create the new Order here when parseMode == ORDER.
    modeBreadcrumbs.push(parseMode);
}

/**
 * Leaves the parse mode of the element that just ended and restores the
 * mode of the enclosing tracked element, or {@code __UNDEFINED__} when no
 * tracked element is open any more. Elements whose upper-cased local name
 * is not a known mode name are ignored.
 *
 * <p>The method must be {@code public} and may only declare
 * {@code SAXException} to be a valid override of the handler callback.
 */
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
    String localNameUC = localName.toUpperCase();
    if(parseModeStrings.contains(localNameUC)){
        // Pop unconditionally: the pop must not live inside the assert,
        // otherwise it is skipped whenever assertions are disabled.
        PARSE_MODE closedMode = modeBreadcrumbs.pop();
        // Will not fail unless the XML structure is malformed in some way
        // or there is a coding error in the use of the Stack.
        assert closedMode == parseMode;
        if(modeBreadcrumbs.empty()){
            parseMode = PARSE_MODE.__UNDEFINED__;
        }
        else {
            parseMode = modeBreadcrumbs.peek();
        }
    }
    else {
        // typically ignore the end of this element...
    }

}

...所以这是什么意思呢?在任何时候,您都了解您所在的“解析模式”,并且您还可以查看Stack<PARSE_MODE> modeBreadcrumbs,如果您需要了解您通过其他解析模式进入此处...

这样一来,characters方法就变得简洁多了:

// Sketch of the characters callback: dispatches on the current parseMode instead of
// testing one boolean per element. The "..." parts are elided and must be filled in
// (they correspond to the date and customer-id handling of the question's handler).
public void characters(char[] ch, int start, int length) throws SAXException { 
    switch(parseMode){ 
    case DATE: 
     // PS - this SimpleDateFormat object can be a field: it doesn't need to be created hundreds of times 
     SimpleDateFormat formatter. ... 
     String value = ... 
     ... 
     break; 

    case CUSTOMERID: 
     order.setCustomerId(... 
     break; 

    case ITEM: 
     item = new Item(); 
     // this next line probably won't be needed: when you get to endElement, if 
     // parseMode is ITEM, the previous mode will be restored automatically 
     // isItem = false ; 
    } 

} 

2)更“专业”的解决方案:
使用一个abstract类,具体类必须继承它,并且具体类无法修改Stack等内容。注意:这里检查的是qName而不是localName。因此:

/**
 * Base class for SAX handlers that track "where they are" in the document as
 * a stack of parse modes instead of a set of boolean flags.
 *
 * <p>A concrete subclass supplies the enum constants that act as modes via
 * {@link #getPossibleModes()}. When an element starts whose upper-cased
 * {@code qName} equals the name of one of these constants, that constant is
 * pushed on the breadcrumb stack; it is popped again when the element ends.
 * Subclasses can read the stack but cannot modify it.
 */
public abstract class AbstractSAXHandler extends DefaultHandler {
    /** Mode reported while no tracked element is open. */
    protected enum PARSE_MODE implements SAXHandlerParseMode {
        __UNDEFINED__
    };
    // abstract: the concrete subclasses must populate...
    /**
     * Returns every enum constant that may be used as a parse mode. Constants
     * that are pushed as modes must implement {@link SAXHandlerParseMode}.
     */
    abstract protected Collection<Enum<?>> getPossibleModes();
    // Modes of the tracked elements that are currently open, innermost on top.
    private Stack<SAXHandlerParseMode> modeBreadcrumbs = new Stack<SAXHandlerParseMode>();
    private Collection<Enum<?>> possibleModes;
    private Map<String, Enum<?>> nameToEnumMap;

    /** Returns the constant-name to constant lookup, building it on first use. */
    private Map<String, Enum<?>> getNameToEnumMap(){
        // lazy creation and population of map
        if(nameToEnumMap == null){
            if(possibleModes == null){
                possibleModes = getPossibleModes();
            }
            nameToEnumMap = new HashMap<String, Enum<?>>();
            for(Enum<?> possibleMode : possibleModes){
                nameToEnumMap.put(possibleMode.name(), possibleMode);
            }
        }
        return nameToEnumMap;
    }

    /** Returns whether {@code name} is exactly the name of a possible mode. */
    protected boolean isLegitimateModeName(String name){
        return getNameToEnumMap().containsKey(name);
    }

    /** Returns the innermost open mode, or {@code __UNDEFINED__} if none is open. */
    protected SAXHandlerParseMode getParseMode() {
        return modeBreadcrumbs.isEmpty()? PARSE_MODE.__UNDEFINED__ : modeBreadcrumbs.peek();
    }

    /**
     * Delegates to {@link #_startElement}; any exception thrown there is
     * rethrown wrapped in a {@code RuntimeException}.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        try {
            _startElement(uri, localName, qName, attributes);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    // override in subclasses (NB I think caught Exceptions are not a brilliant design choice in Java)
    /**
     * Pushes the mode whose name equals the upper-cased {@code qName}, if any.
     * The name is upper-cased with {@code Locale.ROOT} so the lookup does not
     * depend on the default locale (e.g. Turkish dotted/dotless i).
     */
    protected void _startElement(String uri, String localName, String qName, Attributes attributes)
            throws Exception {
        String qNameUC = qName.toUpperCase(java.util.Locale.ROOT);
        // very undesirable ever to push "UNDEFINED"! But unlikely name for an XML element
        assert !qNameUC.equals("__UNDEFINED__") : "Encountered XML element with qName \"__UNDEFINED__\"!";
        if(getNameToEnumMap().containsKey(qNameUC)){
            Enum<?> newMode = getNameToEnumMap().get(qNameUC);
            modeBreadcrumbs.push((SAXHandlerParseMode)newMode);
        }
    }

    /**
     * Delegates to {@link #_endElement}; any exception thrown there is
     * rethrown wrapped in a {@code RuntimeException}.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        try {
            _endElement(uri, localName, qName);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    // override in subclasses
    /** Pops the innermost mode if the upper-cased {@code qName} names a possible mode. */
    protected void _endElement(String uri, String localName, String qName) throws Exception {
        String qNameUC = qName.toUpperCase(java.util.Locale.ROOT);
        if(getNameToEnumMap().containsKey(qNameUC)){
            modeBreadcrumbs.pop();
        }
    }

    /**
     * Returns a read-only view of the open modes, outermost first. Uses the
     * JDK's own unmodifiable wrapper, so no third-party library is required.
     */
    public List<?> showModeBreadcrumbs(){
        return java.util.Collections.unmodifiableList(modeBreadcrumbs);
    }

}

/** Marker interface for enum types whose constants can be used as parse modes. */
interface SAXHandlerParseMode {

}

然后,具体子类的显着部分:

/** Parse modes of this handler; each name is an upper-cased element name. */
private enum PARSE_MODE implements SAXHandlerParseMode {
    ORDER, DATE, CUSTOMERID, ITEM
};

private Collection<Enum<?>> possibleModes;

/**
 * Returns this handler's own modes followed by the superclass's
 * {@code __UNDEFINED__} mode. The collection is built on the first call and
 * reused afterwards.
 */
@Override
protected Collection<Enum<?>> getPossibleModes() {
    if (possibleModes != null) {
        return possibleModes;
    }
    possibleModes = new ArrayList<Enum<?>>();
    for (PARSE_MODE ownMode : PARSE_MODE.values()) {
        possibleModes.add(ownMode);
    }
    // __UNDEFINED__ mode (from abstract superclass) must be added afterwards
    possibleModes.add(AbstractSAXHandler.PARSE_MODE.__UNDEFINED__);
    return possibleModes;
}

PS:这只是更复杂做法的起点:例如,您可以维护一个与Stack<PARSE_MODE>保持同步的List<Object>:这些Object可以是您想要的任何东西,使您能够“回溯”到当前正在处理的XML节点的各级上层节点。但不要使用Map:Stack中可能不止一次包含同一个PARSE_MODE对象。这实际上体现了所有树状结构的一个基本特征:没有任何节点(这里指解析模式)是孤立存在的,它的身份总是由通向它的整条路径来定义。