解析时从xml过滤元素

我使用 Node.js 的 XML 解析器 sax-js 从 XML 中获取内容。该 XML 的结构如下：

<item> 
    <title>Some title</title> 
    <guid isPermaLink="false">http://example.com</guid> 
</item> 
<item> 
    <title>VIDEO: Some title</title> 
    <guid isPermaLink="false">http://example1.com</guid> 
</item>

我想获取所有 title 不以 VIDEO 开头的条目的 URL。

目前，它给了我所有的网址。

我的代码目前是：

// Streams an XML document through a SAX parser and collects the text that
// follows every <guid isPermaLink="false"> open tag into `urls`.
// Note: the <title> element is never inspected here, so nothing is filtered
// out -- every matching guid ends up in `urls`.
'use strict'; 
var sax = require('sax-js'); 
var request = require('request'); 

var href = 'http://some-xml-url.xml'; 

// Collected guid texts.
var urls = []; 
// True between a matching <guid> open tag and the next text event.
var isTextPending = false; 

// `true` is passed as the first argument (presumably strict mode -- confirm
// against the sax documentation).
var saxStream = sax.createStream(true); 
saxStream.on('error', function (e) { 
    console.error(e); 
}); 

// Stores the first text event seen after a matching <guid> open tag, then
// clears the flag so later text is ignored until the next matching <guid>.
saxStream.ontext = function (text) { 
    if(isTextPending) { 
     urls.push(text); 
     isTextPending = false; 
    } 
}; 
// Arms the flag only for <guid> elements whose isPermaLink attribute is the
// string 'false'; no other element (including <title>) is looked at.
saxStream.on('opentag', function (node) { 
    if(node.name === 'guid' && node.attributes.isPermaLink === 'false') { 
     isTextPending = true; 
    } 
}); 
// Replaces the stream's `end` property with an empty function.
// NOTE(review): this presumably shadows the stream's own end() method, so
// whatever it normally does when the piped source finishes is skipped -- confirm.
saxStream.end = function() { 
} 
request(href).pipe(saxStream);

来源

2014-09-25 Palak Arora

如果您确定 XML 的结构，可以使用正则表达式来提取 URL。否则，您可以[将 XML 转换为 JSON](https://www.npmjs.com/package/fast-xml-parser)，然后遍历 `item`，检查 `item[i].title` 是否不以 `'VIDEO'` 开头，再将 `item[i].guid` 存入某个数组。 – 2017-11-30 13:58:12

您需要处理更多的状态，而不仅仅是 `isTextPending`。

下面是一个示例（注意，它还处理了 `closetag` 事件，以便在解析过程中排除标签之间的文本）。

// Streams an XML feed through a SAX parser and collects the guid URL of
// every <item> whose <title> does not start with 'VIDEO'.
//
// Per-item state is gathered while the item is open and the keep/skip
// decision is taken exactly once, when </item> is reached. This makes the
// result independent of the order of <title> and <guid> inside the item and
// prevents the same URL from being pushed more than once.
'use strict';
var sax = require('sax-js');
var request = require('request');

var href = 'http://some-xml-url.xml';

// Items whose (trimmed) title begins with this prefix are skipped.
var EXCLUDED_TITLE_PREFIX = 'VIDEO';

// guid URLs of the accepted items, in document order.
var urls = [];
// Name of the element whose character data is currently being read;
// undefined while between elements.
var tagName = undefined;
// True while inside a <guid> whose isPermaLink attribute is 'false'.
var isValidGuid = false;
// Character data gathered for the current item; undefined until seen.
var titleText = undefined;
var guidUrl = undefined;

/** Clears everything remembered about the current <item>. */
function resetItemState() {
    isValidGuid = false;
    titleText = undefined;
    guidUrl = undefined;
}

/**
 * Appends character data to the field of the element currently open.
 * Character data for one element can arrive in several events (for example
 * plain text mixed with a CDATA section), so it is accumulated rather than
 * overwritten.
 *
 * @param {string} text - character data reported by the parser
 */
function onCharacters(text) {
    if (tagName === 'guid' && isValidGuid) {
        guidUrl = (guidUrl === undefined ? '' : guidUrl) + text;
    } else if (tagName === 'title') {
        titleText = (titleText === undefined ? '' : titleText) + text;
    }
}

var saxStream = sax.createStream(true);
saxStream.on('error', function (e) {
    console.error(e);
});

saxStream.on('text', onCharacters);
// Feed titles are frequently wrapped in <![CDATA[...]]>, which the parser
// reports through 'cdata' instead of 'text'.
saxStream.on('cdata', onCharacters);

saxStream.on('opentag', function (node) {
    tagName = node.name;
    switch (node.name) {
    case 'guid':
        isValidGuid = (node.attributes.isPermaLink === 'false');
        break;
    case 'item':
        resetItemState();
        break;
    }
});

saxStream.on('closetag', function (name) {
    // Text between elements (indentation, newlines) belongs to no field.
    tagName = undefined;
    if (name !== 'item') {
        return;
    }
    // As before, an item is kept only if it has a guid and a title with
    // text; leading whitespace in the title is ignored for the prefix test.
    if (guidUrl !== undefined && titleText !== undefined &&
            titleText.trim().indexOf(EXCLUDED_TITLE_PREFIX) !== 0) {
        urls.push(guidUrl);
    }
    resetItemState();
});

// Listen for the 'end' event instead of overwriting saxStream.end: the
// stream's own end() must still run when the piped source finishes so the
// parser is closed properly.
saxStream.on('end', function () {
    console.log('Result: ' + JSON.stringify(urls));
});

request(href)
    // pipe() does not forward source errors, so report them here.
    .on('error', function (e) {
        console.error(e);
    })
    .pipe(saxStream);

来源

2014-09-25 14:30:30 Ceredig

解析时从xml过滤元素

回答

相关问题