2011-09-25 78 views
9

我目前正在创建一个 Node.js 网页抓取器/代理,但我无法解析源代码脚本部分中的相对 URL。我想正则表达式(REGEX)应该可以解决这个问题,只是我不知道该如何实现。(标题:JavaScript:用正则表达式把所有相对 URL 改为绝对 URL)

无论如何,我可以去做这件事吗?

另外,我这样做是不是最简便的办法?我不太清楚其他代理是如何解析网站的。我认为大多数代理只是被美化了的网页抓取器:读取网站的源代码,再把所有链接/表单重新指向代理。

+1

我会使用真正的解析器,而不是正则表达式。Node 有现成的 HTML 解析器。 – thejh

回答

38

高级HTML字符串替换功能

给提问者(OP)的说明(因为他需要这样的功能):把 base_url 改为你的代理的基础 URL,即可达到预期效果。

下面会显示两个功能(使用指南包含在代码中)。确保你不要跳过这个答案的任何部分,以充分理解函数的行为。

  • rel_to_abs(url) - 此函数返回绝对URL。当传入的是使用常见受信任协议的绝对URL时,会立即原样返回该URL;否则,将根据base_url和函数参数生成一个绝对URL。相对URL会被正确解析(../; ./; .; //)。
  • replace_all_rel_by_abs - 此函数会解析HTML中所有有实际意义的URL,例如CSS url()、链接和外部资源。完整的解析对象列表请查看代码。若要清理来自外部来源(嵌入文档)的HTML字符串,请参阅 this answer 对实现进行调整。
  • 测试用例(在答案的底部):为了测试该功能的有效性,只需将小书签粘贴到位置栏即可。


rel_to_abs - 解析相对URL

/**
 * Resolve a possibly-relative URL against the current page (`location`).
 *
 * URLs that already start with a commonly trusted scheme (http(s), file,
 * ftp(s), mailto, javascript) or that are `data:image/...;` URIs are returned
 * untouched. Everything else - including unknown schemes - is treated as a
 * relative path, so untrusted protocols never pass through unchanged.
 *
 * Reads `location.href`, `location.protocol` and `location.host`.
 *
 * @param {String} url  URL as found in the document
 * @return {String} absolute URL, or "" when `url` is empty / whitespace only
 */
function rel_to_abs(url){ 
    /* Only accept commonly trusted protocols: 
    * Only data-image URLs are accepted, Exotic flavours (escaped slash, 
    * html-entitied characters) are not supported to keep the function fast. 
    * The scheme names need a trailing ":", the data-image prefix ends in ";". */ 
    if(/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url)) 
        return url; //Url is already absolute 

    if(url.substring(0,2) === "//") 
        return location.protocol + url; //Protocol-relative 
    if(url.charAt(0) === "/") 
        return location.protocol + "//" + location.host + url; //Root-relative 
    if(/^\s*$/.test(url)) 
        return ""; //Empty = Return nothing 

    /* Base directory = page URL without query/fragment, cut after its last 
    * slash. Query and fragment never take part in relative resolution. */ 
    var page = location.href.replace(/[?#].*$/, ""); 
    if(/^[^\/]*\/\/[^\/]*$/.test(page)) 
        page += "/"; //Bare origin such as "http://host" 
    var base_url = page.substring(0, page.lastIndexOf("/") + 1); 

    url = base_url + url; 

    /* Collapse "dir/../" pairs; stop as soon as a pass changes nothing so a 
    * malformed input can never keep the loop spinning. */ 
    var collapsed; 
    while(/\/\.\.\//.test(url)){ 
        collapsed = url.replace(/[^\/]+\/+\.\.\//g,""); 
        if(collapsed === url) break; 
        url = collapsed; 
    } 

    /* Drop a trailing dot and whole "/." segments (but keep dot-files such as 
    * "/.hidden"), then escape certain characters to prevent XSS */ 
    url = url.replace(/\.$/,"").replace(/\/\.(?=\/|$)/g,"").replace(/"/g,"%22") 
      .replace(/'/g,"%27").replace(/</g,"%3C").replace(/>/g,"%3E"); 
    return url; 
} 

例/例子:

  • http://foo.bar。已经是绝对URL,因此立即返回。
  • /doo相对于根目录:返回当前根目录+提供的相对URL。
  • ./meh相对于当前目录。
  • ../booh相对于父目录。

该函数把相对路径转换为以 ../ 开头的形式,然后执行搜索和替换(把 http://domain/sub/anything-but-a-slash/../me 替换为 http://domain/sub/me)。


replace_all_rel_by_abs - 转换所有相关的URL出现位置
脚本中的URL(<script>、事件处理程序)不会被替换,因为几乎不可能写出一个既快速又安全的过滤器来解析JavaScript。

这个脚本内部带有一些注释。正则表达式是动态创建的,因为单个正则表达式可能非常长(以字符数计)。<meta http-equiv=refresh content=.. > 可以通过多种方式被混淆,这正是正则表达式如此之长的原因。

/*
 Rewrites URLs inside an HTML string by passing each one through rel_to_abs().
 The following locations are handled, in this order (see the calls at the
 bottom of the function): <meta http-equiv=refresh content="..url=..">,
 href=, src=, <object data=>, <applet codebase=>, <param name=movie value=>,
 url(...) inside <style> elements, and url(...) inside style= attributes.
 All regular expressions are assembled from string fragments at call time.

 @param String html  HTML source to process
 @return String      the HTML with the matched URLs replaced
 */
function replace_all_rel_by_abs(html){ 
    /*HTML/XML Attribute may not be prefixed by these characters (common 
     attribute chars. This list is not complete, but will be sufficient 
     for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */ 
    var att = "[^-a-z0-9:._]"; 

    /* Terminates a numeric character reference: either ";" or a position that 
     is not followed by another digit */ 
    var entityEnd = "(?:;|(?!\\d))"; 
    /* Pre-built patterns matching a character literally or as an HTML entity. 
     ae() also stores its results in this object, keyed by the input string. */ 
    var ents = {" ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")", 
       "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")", 
       ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")", 
       ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"}; 
       /* Placeholders to filter obfuscations */ 
    /* Per-character pattern cache filled by ae(), keyed by lowercase char */ 
    var charMap = {}; 
    var s = ents[" "]+"*"; //Short-hand for common use 
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*"; 
    /*^Important: Must be pre- and postfixed by <and>. 
    * This RE should match anything within a tag! */ 

    /* 
     @name ae 
     @description Converts a given string in a sequence of the original 
         input and the HTML entity. For every character the pattern 
         accepts the lowercase literal or its decimal / hexadecimal 
         numeric entity (lowercase and, if different, uppercase code). 
         Results are cached in `ents` (whole string) and `charMap` 
         (single characters). 
         NOTE(review): the literal character is inserted into the 
         pattern unescaped; only the characters predefined in `ents` 
         are escaped. Confirm before calling with other RegExp 
         metacharacters. 
     @param String string String to convert 
     @return String RegExp source fragment 
     */ 
    function ae(string){ 
     var all_chars_lowercase = string.toLowerCase(); 
     if(ents[string]) return ents[string]; 
     var all_chars_uppercase = string.toUpperCase(); 
     var RE_res = ""; 
     for(var i=0; i<string.length; i++){ 
      var char_lowercase = all_chars_lowercase.charAt(i); 
      if(charMap[char_lowercase]){ 
       RE_res += charMap[char_lowercase]; 
       continue; 
      } 
      var char_uppercase = all_chars_uppercase.charAt(i); 
      var RE_sub = [char_lowercase]; 
      RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd); 
      RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd); 
      if(char_lowercase != char_uppercase){ 
       /* Note: RE ignorecase flag has already been activated */ 
       RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd); 
       RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd); 
      } 
      RE_sub = "(?:" + RE_sub.join("|") + ")"; 
      RE_res += (charMap[char_lowercase] = RE_sub); 
     } 
     return(ents[string] = RE_res); 
    } 

    /* 
     @name by 
     @description 2nd argument for replace(). Keeps the captured prefix 
         (group1) and suffix (group3) and replaces the URL (group2) 
         with the result of rel_to_abs(). 
     */ 
    function by(match, group1, group2, group3){ 
     /* Note that this function can also be used to remove links: 
     * return group1 + "javascript://" + group3; */ 
     return group1 + rel_to_abs(group2) + group3; 
    } 
    /* 
     @name by2 
     @description 2nd argument for replace(). Parses relevant HTML entities: 
         entity-encoded slashes and dots in the URL are decoded 
         before the URL is handed to rel_to_abs(). 
     */ 
    var slashRE = new RegExp(ae("/"), 'g'); 
    var dotRE = new RegExp(ae("."), 'g'); 
    function by2(match, group1, group2, group3){ 
     /*Note that this function can also be used to remove links: 
     * return group1 + "javascript://" + group3; */ 
     group2 = group2.replace(slashRE, "/").replace(dotRE, "."); 
     return group1 + rel_to_abs(group2) + group3; 
    } 
    /* 
     @name cr 
     @description   Selects a HTML element and performs a 
           search-and-replace on attributes. Three patterns are 
           tried on every selector match: double-quoted, 
           single-quoted and unquoted values. The result is 
           written back to the outer `html` variable. 
     @param String selector HTML substring to match 
     @param String attribute RegExp-escaped; HTML element attribute to match 
     @param String marker Optional RegExp-escaped; marks the prefix 
     @param String delimiter Optional RegExp escaped; non-quote delimiters 
     @param String end  Optional RegExp-escaped; forces the match to end 
           before an occurrence of <end> 
    */ 
    function cr(selector, attribute, marker, delimiter, end){ 
     if(typeof selector == "string") selector = new RegExp(selector, "gi"); 
     attribute = att + attribute; 
     marker = typeof marker == "string" ? marker : "\\s*=\\s*"; 
     delimiter = typeof delimiter == "string" ? delimiter : ""; 
     /* With <end>: the value group becomes lazy and <end> is captured as 
      group 3. Without it: group 3 is an empty group. */ 
     end = typeof end == "string" ? "?)("+end : ")("; 
     var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')', 'gi'); 
     var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")", 'gi'); 
     var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')', 'gi'); 
     html = html.replace(selector, function(match){ 
      return match.replace(re1, by).replace(re2, by).replace(re3, by); 
     }); 
    } 
    /* 
     @name cri 
     @description   Selects an attribute of a HTML element, and 
           performs a search-and-replace on certain values. 
           re1/re2 isolate the quoted attribute value; at1/at2 
           (and at3 when <delimiter> is given) are then applied 
           inside that value. The result is written back to the 
           outer `html` variable. 
     @param String selector HTML element to match 
     @param String attribute RegExp-escaped; HTML element attribute to match 
     @param String front  RegExp-escaped; attribute value, prefix to match 
     @param String flags  Optional RegExp flags, default "gi" 
     @param String delimiter Optional RegExp-escaped; non-quote delimiters 
     @param String end  Optional RegExp-escaped; forces the match to end 
           before an occurrence of <end> 
    */ 
    function cri(selector, attribute, front, flags, delimiter, end){ 
     if(typeof selector == "string") selector = new RegExp(selector, "gi"); 
     attribute = att + attribute; 
     flags = typeof flags == "string" ? flags : "gi"; 
     var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)', 'gi'); 
     var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)", 'gi'); 
     var at1 = new RegExp('('+front+')([^"]+)(")', flags); 
     var at2 = new RegExp("("+front+")([^']+)(')", flags); 
     if(typeof delimiter == "string"){ 
      end = typeof end == "string" ? end : ""; 
      /* NOTE(review): <delimiter> is interpolated inside a character class 
       ([^...]), so a multi-character pattern passed by the caller is read 
       as a set of single characters there - confirm this is intended. */ 
      var at3 = new RegExp("("+front+")([^\"'][^"+delimiter+"]*" + (end?"?)("+end+")":")()"), flags); 
      var handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2).replace(at3, by2)}; 
     } else { 
      var handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2)}; 
    } 
     html = html.replace(selector, function(match){ 
      return match.replace(re1, handleAttr).replace(re2, handleAttr); 
     }); 
    } 

    /* <meta http-equiv=refresh content=" ; url= " > */ 
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))", "content", ae("url")+s+ae("=")+s, "i"); 

    cr("<"+any+att+"href\\s*="+any+">", "href"); /* Linked elements */ 
    cr("<"+any+att+"src\\s*="+any+">", "src"); /* Embedded elements */ 

    cr("<object"+any+att+"data\\s*="+any+">", "data"); /* <object data= > */ 
    cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); /* <applet codebase= > */ 

    /* <param name=movie value= >*/ 
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))", "value"); 

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); /* <style> */ 
    cri("<"+any+att+"style\\s*="+any+">", "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")")); /*< style=" url(...) " > */ 
    return html; 
} 

私有函数的简短摘要:

  • rel_to_abs(url) - 相对/未知的URL转换为绝对URL
  • replace_all_rel_by_abs(html) - 把HTML字符串中所有相关的相对URL替换为绝对URL。
    1. ae - Any Entity - 返回用于处理HTML实体的正则表达式模式。
    2. by - replace by - 这个简短的函数负责实际的URL替换(调用rel_to_abs)。该函数可能会被调用数百次甚至上千次,因此(自定义时)请注意不要在其中加入缓慢的算法。
    3. cr - Create Replace - 创建并执行搜索和替换。
      例如:href="..."(在任何HTML标签内)。
    4. cri - Create Replace Inline - 创建并执行搜索和替换。
      例如:url(..)位于HTML标签的style属性内。

测试用例

打开任意页面,并将以下小书签粘贴到地址栏中:

javascript:void function(){var tag=document.createElement("script");tag.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(tag)}(); 

注入的代码包含上面定义的两个函数,外加如下所示的测试用例。注意:测试用例不会修改页面的HTML,而是(可选地)在textarea中显示解析结果。

/* Time one full rewrite of the current document's HTML, then offer to show
 * the rewritten markup in a full-page textarea (double-click removes it).
 * The page itself is not modified by the rewrite. */
var t = new Date().getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
if (confirm((new Date().getTime() - t) + " milliseconds to execute\n\nPut results in new textarea?")) {
    var txt = document.createElement("textarea");
    txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    txt.ondblclick = function () {
        this.parentNode.removeChild(this);
    };
    txt.value = result;
    document.body.appendChild(txt);
}

参见:

+0

谢谢,你知道我有什么方法可以匹配脚本中的所有相关url吗? – Trevor

+0

在你的代码中包含我的函数,每当需要把一个可能是相对的URL转换为绝对URL时,就调用 rel_to_abs,例如:var some_url = ".././callback/xhr.php"; rel_to_abs(some_url); –

+0

是的,我明白。但我的意思是,我将如何扫描我代理的网站才能找到这些网址? – Trevor

-1

如果您使用正则表达式来查找所有非绝对URL,则只需在它们前面加上当前URL即可。

需要修复的是那些既不以 / 开头、也不以 http(s):// 开头的URL(如果你关心其他协议标记,也包括它们)。

举个例子,假设你正在抓取 http://www.example.com/。如果遇到一个相对URL,比如 foo/bar,只需把正在抓取的网址加在它前面,像这样:http://www.example.com/foo/bar

至于用来从页面中抓取URL的正则表达式,只要稍微搜索一下就能找到很多现成的,所以我就不在这里重新发明一个蹩脚的了 :)

2

把URL从相对转换为绝对的可靠方法是使用内置的 url 模块(url module)。

例子:

// url.resolve() is a legacy API. The WHATWG URL class does the same job; it is
// a global in Node.js and browsers, and is also exported by the built-in
// 'url' module as require('url').URL.
var resolved = new URL("../baz/qux.html", "http://www.example.org/foo/bar/").href; 

>> gives 'http://www.example.org/foo/baz/qux.html' 
+0

“require”来自哪里? –

+0

@ajkochanowicz:问题是关于一个Node.js应用程序。 'require()'是C'#include <...>'的Node.js等价物。 (嗯,不完全是。)所以,在编写JS代码在浏览器中运行时,我的答案是无法使用的。 – tuomassalo

0

这是当前帖子中 Rob W 的回答 "Advanced HTML string replacement functions",加上我为了让 JSLint 满意而做的一些代码重构。

我应该发布它作为答案的评论,但我没有足够的声望点。

/*jslint browser: true */ 
 
/*jslint regexp: true */ 
 
/*jslint unparam: true*/ 
 
/*jshint strict: false */ 
 

 
/**
 * convertRelToAbsUrl
 *
 * https://stackoverflow.com/a/7544757/1983903
 *
 * Resolves a possibly-relative URL against the current page (`location`).
 * URLs that already start with a commonly trusted scheme (http(s), file,
 * ftp(s), mailto, javascript) or that are `data:image/...;` URIs are returned
 * untouched; everything else, including unknown schemes, is treated as a
 * relative path. Reads `location.href`, `location.protocol`, `location.host`.
 *
 * @param {String} url
 * @return {String} updated url ('' when url is empty or whitespace only)
 */
function convertRelToAbsUrl(url) {
    var page = null,
        baseUrl = null,
        collapsed = null;

    // scheme names need a trailing ':', the data-image prefix ends in ';'
    if (/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url)) {
        return url; // url is already absolute
    }

    if (url.substring(0, 2) === '//') {
        return location.protocol + url; // protocol-relative
    }
    if (url.charAt(0) === '/') {
        return location.protocol + '//' + location.host + url; // root-relative
    }
    if (/^\s*$/.test(url)) {
        return ''; // empty = return nothing
    }

    // base directory = page url without query/fragment, cut after its last
    // slash; query and fragment never take part in relative resolution
    page = location.href.replace(/[?#].*$/, '');
    if (/^[^\/]*\/\/[^\/]*$/.test(page)) {
        page += '/'; // bare origin such as 'http://host'
    }
    baseUrl = page.substring(0, page.lastIndexOf('/') + 1);

    url = baseUrl + url;

    // collapse 'dir/../' pairs; stop when a pass changes nothing so that a
    // malformed input can never keep the loop spinning
    while (/\/\.\.\//.test(url)) {
        collapsed = url.replace(/[^\/]+\/+\.\.\//g, '');
        if (collapsed === url) {
            break;
        }
        url = collapsed;
    }

    // drop a trailing dot and whole '/.' segments (dot-files such as
    // '/.hidden' are kept), then escape characters that could enable XSS
    url = url.replace(/\.$/, '').replace(/\/\.(?=\/|$)/g, '').replace(/"/g, '%22')
        .replace(/'/g, '%27').replace(/</g, '%3C').replace(/>/g, '%3E');

    return url;
}
 

 
/**
 * convertAllRelativeToAbsoluteUrls
 *
 * https://stackoverflow.com/a/7544757/1983903
 *
 * Rewrites URLs inside an HTML string by passing each one through
 * convertRelToAbsUrl. Handled locations, in order: <meta http-equiv=refresh
 * content="..url=..">, href=, src=, <object data=>, <applet codebase=>,
 * <param name=movie value=>, url(...) inside <style> elements and url(...)
 * inside style= attributes. All regular expressions are assembled from
 * string fragments at call time.
 *
 * The resolver is taken from `this.convertRelToAbsUrl` when the function is
 * invoked as a method (or with `this` being the global object that owns it);
 * otherwise the convertRelToAbsUrl function in scope is used, so the function
 * also works in strict mode and in modules where `this` is undefined.
 *
 * @param {String} html
 * @return {String} updated html
 */
function convertAllRelativeToAbsoluteUrls(html) {
    var me = this,
        // an attribute name may not be directly preceded by these characters
        att = '[^-a-z0-9:._]',
        // end of a numeric character reference: ';' or "no further digit"
        entityEnd = '(?:;|(?!\\d))',
        // patterns matching a character literally or as an HTML entity;
        // ae() stores its results here as well, keyed by the input string
        ents = {
            ' ' : '(?:\\s|&nbsp;?|&#0*32' + entityEnd + '|&#x0*20' + entityEnd + ')',
            '(' : '(?:\\(|&#0*40' + entityEnd + '|&#x0*28' + entityEnd + ')',
            ')' : '(?:\\)|&#0*41' + entityEnd + '|&#x0*29' + entityEnd + ')',
            '.' : '(?:\\.|&#0*46' + entityEnd + '|&#x0*2e' + entityEnd + ')'
        },
        // per-character pattern cache filled by ae(), keyed by lowercase char
        charMap = {},
        s = ents[' '] + '*', // short-hand for common use
        // anything within a tag; must be wrapped in '<' ... '>' by the caller
        any = '(?:[^>\"\']*(?:\"[^\"]*\"|\'[^\']*\'))*?[^>]*',
        slashRE = null,
        dotRE = null,
        // `this` is undefined in strict mode / ES modules: fall back to the
        // function that is visible in scope instead of throwing
        resolveUrl = (me && typeof me.convertRelToAbsUrl === 'function')
            ? function (url) { return me.convertRelToAbsUrl(url); }
            : function (url) { return convertRelToAbsUrl(url); };

    // Builds a pattern accepting every character of `string` literally or as
    // a decimal/hex numeric entity (lower- and uppercase code points).
    // NOTE(review): the literal character is inserted unescaped; only the
    // characters predefined in `ents` are escaped.
    function ae(string) {
        var allCharsLowerCase = string.toLowerCase(),
            allCharsUpperCase = string.toUpperCase(),
            reRes = '',
            charLowerCase = null,
            charUpperCase = null,
            reSub = null,
            i = null;

        if (ents[string]) {
            return ents[string];
        }

        for (i = 0; i < string.length; i++) {
            charLowerCase = allCharsLowerCase.charAt(i);
            if (charMap[charLowerCase]) {
                reRes += charMap[charLowerCase];
                continue;
            }
            charUpperCase = allCharsUpperCase.charAt(i);
            reSub = [charLowerCase];
            reSub.push('&#0*' + charLowerCase.charCodeAt(0) + entityEnd);
            reSub.push('&#x0*' + charLowerCase.charCodeAt(0).toString(16) + entityEnd);

            if (charLowerCase !== charUpperCase) {
                reSub.push('&#0*' + charUpperCase.charCodeAt(0) + entityEnd);
                reSub.push('&#x0*' + charUpperCase.charCodeAt(0).toString(16) + entityEnd);
            }
            reSub = '(?:' + reSub.join('|') + ')';
            reRes += (charMap[charLowerCase] = reSub);
        }
        return (ents[string] = reRes);
    }

    // replace() callback: keep prefix (group1) and suffix (group3), resolve
    // the URL in between
    function by(match, group1, group2, group3) {
        return group1 + resolveUrl(group2) + group3;
    }

    slashRE = new RegExp(ae('/'), 'g');
    dotRE = new RegExp(ae('.'), 'g');

    // like by(), but first decodes entity-encoded slashes and dots in the URL
    function by2(match, group1, group2, group3) {
        group2 = group2.replace(slashRE, '/').replace(dotRE, '.');
        return group1 + resolveUrl(group2) + group3;
    }

    // Selects elements with `selector` and rewrites `attribute` values
    // (double-quoted, single-quoted and unquoted); updates the outer `html`.
    function cr(selector, attribute, marker, delimiter, end) {
        var re1 = null,
            re2 = null,
            re3 = null;

        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }

        attribute = att + attribute;
        marker = typeof marker === 'string' ? marker : '\\s*=\\s*';
        delimiter = typeof delimiter === 'string' ? delimiter : '';
        // with `end`: value group becomes lazy and `end` is captured as
        // group 3; without it group 3 is an empty group
        end = typeof end === 'string' ? '?)(' + end : ')(';

        re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
        re2 = new RegExp('(' + attribute + marker + '\')([^\'' + delimiter + ']+' + end + ')', 'gi');
        re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');

        html = html.replace(selector, function (match) {
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }

    // Selects `attribute` of the elements matched by `selector` and rewrites
    // the URLs that follow `front` inside its value; updates the outer `html`.
    function cri(selector, attribute, front, flags, delimiter, end) {
        var re1 = null,
            re2 = null,
            at1 = null,
            at2 = null,
            at3 = null,
            handleAttr = null;

        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }

        attribute = att + attribute;
        flags = typeof flags === 'string' ? flags : 'gi';
        re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
        re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
        at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
        at2 = new RegExp("(" + front + ")([^']+)(')", flags);

        if (typeof delimiter === 'string') {
            end = typeof end === 'string' ? end : '';
            // NOTE(review): `delimiter` lands inside a character class, so a
            // multi-character pattern is read as a set of single characters
            // there - confirm this is intended.
            at3 = new RegExp('(' + front + ')([^\"\'][^' + delimiter + ']*' + (end ? '?)(' + end + ')' : ')()'), flags);
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
            };
        } else {
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2);
            };
        }
        html = html.replace(selector, function (match) {
            return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content=" ; url= " > */
    cri('<meta' + any + att + 'http-equiv\\s*=\\s*(?:\"' + ae('refresh')
        + '\"' + any + '>|\'' + ae('refresh') + '\'' + any + '>|' + ae('refresh')
        + '(?:' + ae(' ') + any + '>|>))', 'content', ae('url') + s + ae('=') + s, 'i');

    cr('<' + any + att + 'href\\s*=' + any + '>', 'href'); /* Linked elements */
    cr('<' + any + att + 'src\\s*=' + any + '>', 'src'); /* Embedded elements */

    cr('<object' + any + att + 'data\\s*=' + any + '>', 'data'); /* <object data= > */
    cr('<applet' + any + att + 'codebase\\s*=' + any + '>', 'codebase'); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr('<param' + any + att + 'name\\s*=\\s*(?:\"' + ae('movie') + '\"' + any + '>|\''
        + ae('movie') + '\'' + any + '>|' + ae('movie') + '(?:' + ae(' ') + any + '>|>))', 'value');

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi,
        'url', '\\s*\\(\\s*', '', '\\s*\\)'); /* <style> */
    cri('<' + any + att + 'style\\s*=' + any + '>', 'style',
        ae('url') + s + ae('(') + s, 0, s + ae(')'), ae(')')); /*< style=" url(...) " > */

    return html;
}

0

根据 Rob W 关于 base 标签的评论,我写了一个注入函数:

/**
 * Put `base` (the markup of a <base> element) at the end of the document's
 * <head>, replacing any <base> elements that are already in there.
 *
 * Only the <base> tags are removed; every other child of <head> is kept.
 * The tag names are matched exactly, so <header>, <thead> or <basefont> are
 * never touched. `base` is inserted verbatim ("$" sequences are not treated
 * as replacement patterns). HTML without a <head>...</head> pair is returned
 * unchanged.
 *
 * @param {String} html  HTML source
 * @param {String} base  markup to insert, e.g. '<base href="http://host/">'
 * @return {String} updated html
 */
function injectBase(html, base) { 
    return html.replace(/(<head\b[^>]*>)([\s\S]*?)(<\/head\s*>)/i, function (match, open, content, close) { 
        // Remove any <base> elements inside <head> 
        var cleaned = content.replace(/<base\b[^>]*>/gi, ""); 
        // Add <base> just before </head> 
        return open + cleaned + " " + base + " " + close; 
    }); 
}