2010-02-09 62 views
2

说到网站开发(而不是一般的编程),我基本上是个新手,所以请原谅任何不正确的术语。通过JavaScript在文档中检测希伯来文单词

我想要构建一个脚本,当它添加到HTML页面中时,它会检测页面中的每个希伯来单词并将该单词转换为HTML元素,例如,转换为标题的超链接。

因此,以下内容:

<p>ראש הלשכה</p> 

转化为:

<p><a title="word 1" href="#">הלשכה</a> <a title="word 2" href="#">ראש</a></p> 

有意义吗?

因此,我想首要任务是检测页面中的希伯来文单词。我该怎么做?除了翻阅jQuery文档之外,我不知道从哪里入手。

+0

HTML是Unicode吗? UTF-8,还是它可以是任何编码? – 2010-02-09 02:58:19

+0

好问题。简单起见,就假定是UTF-8吧。我希望它能在像 http://www.haaretz.co.il 这样的页面上工作。 – 2010-02-09 03:03:58

回答

4

在字符串中搜索希伯来字很简单。使用符合希伯来码点的连续序列的正则表达式:

/[\u05D0-\u05FF]+/ 

由于JS支持函数式编程,我们可以很容易地编写自己的函数来遍历文档树,并对每个文本节点调用指定的函数。首先是一些脚手架代码。

/*
    Debug-time assertion scaffold. Installed only when the page has not
    already defined window.assert.

    window.dbgLvl: a non-zero value enables assertions; 0 turns every
     assert call into a no-op.
    window.assert(succeeded, msg): when assertions are enabled and
     'succeeded' is falsy, throws an Error whose message is 'msg'
     (or 'assertion failed' when no message is given).
*/
if (! window.assert) {
    window.dbgLvl = 1; // change this to 0 for production release
    window.assert = function (succeeded, msg) {
        if (window.dbgLvl && !succeeded) {
            // Throw an Error object rather than a bare string so that the
            // failure carries a stack trace and a proper error type.
            throw new Error(msg || 'assertion failed');
        }
    };
}

接下来,我们定义一个方法将字符串拆分为数组,包括输出中的分隔符。

/* String.separate is like String.split, but the result includes the
    separators, e.g. 'a-b'.separate('(-)') gives ['a', '-', 'b']. The
    result alternates non-matching and matching chunks, so matching chunks
    sit at the odd indices.

    Arguments:
     separator (RegExp || string): a string is compiled to a global RegExp.

    These implementations of 'String.separate' will work for our purposes,
    but are buggy in general, due to differences in the implementation of
    String.split.

    The two misbehaviors we correct are including neither grouped patterns
    nor empty strings in the result, though the latter is only corrected
    when the missing empty string is at the start or the end.

    Which implementation gets installed is decided once, by probing how the
    host's String.split treats capture groups and empty strings.
*/
if ('-'.split(/(-)/).length & 1) {
    assert('a'.split(/a/).length, 'split includes grouping but not empty strings');
    // split includes groups in result
    String.prototype.separate = function (separator) {
        if (typeof separator == 'string') {
            // Make sure the pattern is grouped, so that split returns the
            // separators as well.
            if (separator.charAt(0) != '('
                || separator.charAt(separator.length-1) != ')')
            {
                separator = new RegExp('(' + separator + ')', 'g');
            } else {
                separator = new RegExp(separator, 'g');
            }
        }
        return this.split(separator);
    };
} else {
    if ('a'.split(/a/).length) {
        // empty strings included, grouped aren't
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                separator = new RegExp(separator, 'g');
            }
            var fence = this.match(separator);
            if (!fence) {
                // String(this): hand back a primitive string, like split
                // does, not a String wrapper object.
                return [String(this)];
            }
            var posts = this.split(separator);
            // Compare, don't assign: the previous 'posts.length = ...'
            // overwrote the array length instead of checking it.
            assert(posts.length == fence.length+1);
            // Interleave: post, fence, post, ..., fence, post.
            var result = [], i;
            for (i=0; i<fence.length; ++i) {
                result.push(posts[i]);
                result.push(fence[i]);
            }
            result.push(posts[i]);
            return result;
        };
    } else {
        // neither empty strings nor groups are included. IE, you suck.
        String.prototype.separate = function (separator) {
            if (typeof separator == 'string') {
                separator = new RegExp(separator, 'g');
            }
            var fence = this.match(separator);
            if (!fence) {
                return [String(this)];
            }
            var posts = this.split(separator);
            if (posts.length <= fence.length) {
                /* missing some posts. Assume that they are the first or
                    last, though this won't be true in general.
                */
                if (posts.length < fence.length) {
                    posts.unshift('');
                    posts.push('');
                } else {
                    // Exactly one post is missing: it is the leading one
                    // if the string starts with the first separator.
                    if (this.substring(0, fence[0].length) == fence[0]) {
                        posts.unshift('');
                    } else {
                        posts.push('');
                    }
                }
            }
            // Interleave: post, fence, post, ..., fence, post.
            var result = [], i;
            for (i=0; i<fence.length; ++i) {
                result.push(posts[i]);
                result.push(fence[i]);
            }
            result.push(posts[i]);
            return result;
        };
    }
}

接下来是一些节点谓词。

// Guarantee that Node.TEXT_NODE can be read: create a minimal Node object
// when the host has none, or add just the missing constant when Node
// exists without it.
if (window.Node) {
    if (typeof Node.TEXT_NODE == 'undefined') {
        Node.TEXT_NODE = 3;
    }
} else {
    window.Node = {TEXT_NODE: 3};
}

/* Node predicate: true when node's nodeType equals Node.TEXT_NODE. */
function isTextNode(node) {
    var kind = node.nodeType;
    return kind == Node.TEXT_NODE;
}
/* Node predicate: truthy when node has a non-empty childNodes list.
    Returns the (falsy) childNodes value itself when there is no list,
    otherwise the number of children. */
function hasKids(node) {
    var kids = node.childNodes;
    if (!kids) {
        return kids;
    }
    return kids.length;
}
/* Node predicate that accepts every node; the argument is ignored. */
function allNodes(node) {
    return true;
}

现在是遍历DOM的函数。

/*
    forEachChild: pre-order traversal of document tree. Applies a function
    to some nodes, determined by the 'which' and 'descendInto' arguments.
    The starting node itself is not visited, only its descendants.

    'action' may detach or replace the node it is given: the following
    sibling is looked up before 'action' runs, so the rest of the siblings
    are still visited. Nodes that 'action' inserts in place of the current
    node are not visited.

Arguments:
    which (function): Returns true if 'action' should be applied to a node.
    action (function): Takes a node and does something to it.
    parent (Node): The node to start from.
    descendInto (function, optional): By default, forEachChild will descend
     into every child that itself has children. Place additional
     restrictions by passing this argument.
*/
var forEachChild = (function() {
    /* the actual implementation is made a local function so that the
        optional parameter can be handled efficiently.
    */
    function _forEachChild(which, action, node, descendInto) {
        var child = node.firstChild, next;
        while (child) {
            // Grab the sibling first: once 'action' has replaced or removed
            // 'child', child.nextSibling is null and the walk would stop.
            next = child.nextSibling;
            if (which(child)) {
                action(child);
            }
            if (hasKids(child) && descendInto(child)) {
                _forEachChild(which, action, child, descendInto);
            }
            child = next;
        }
    }
    return function (which, action, node, descendInto) {
        if (!descendInto) {
            descendInto = allNodes;
        }
        _forEachChild(which, action, node, descendInto);
    };
})();

/* Runs forEachChild over the whole document: 'action' is applied to every
    node below 'document' that 'which' accepts. 'descendInto' is optional
    and is passed through unchanged. Returns whatever forEachChild returns. */
function forEachNode(which, action, descendInto) {
    var root = document;
    return forEachChild(which, action, root, descendInto);
}

/* Applies 'action' to every text node of the document, by handing the
    isTextNode predicate to forEachNode. 'descendInto' is optional and is
    passed through unchanged. Returns whatever forEachNode returns. */
function forEachTextNode(action, descendInto) {
    var onlyText = isTextNode;
    return forEachNode(onlyText, action, descendInto);
}

/* Applies 'action' to every text node below document.body, by handing the
    isTextNode predicate to forEachChild. 'descendInto' is optional and is
    passed through unchanged. Returns whatever forEachChild returns. */
function forEachTextNodeInBody(action, descendInto) {
    var body = document.body;
    return forEachChild(isTextNode, action, body, descendInto);
}

最后一组函数把文本节点中与模式匹配的文本替换为您选择的新节点。这组函数(确切地说,是wrapText返回的函数)尚未经过完整的跨浏览器兼容性测试,包括它是否能正确处理文本方向。

/*
    wrapText replaces substrings in a text node with new nodes.

Arguments:
    pattern (RegExp || string): If a RegExp, must be of the form: '/(...)/g'.
    replace (function): Takes a string and returns a Node to replace the string.

Returns a function that takes a text node. That function splits the node's
text with String.separate, builds a document fragment in which every
matching chunk is replaced by replace(chunk), and swaps the fragment in for
the text node. A node without any match is left untouched. Empty
non-matching chunks produce no text node at all.
*/
function wrapText(pattern, replace) {
    return function (node) {
        var chunks = node.nodeValue.separate(pattern);
        if (chunks.length < 2) {
            // nothing matched; leave the node alone.
            return;
        }
        /* separate() returns alternating non-matching / matching chunks,
         * beginning and ending with a non-matching one, so the count must
         * be odd. Check this before any node is created.
         */
        assert(chunks.length & 1, 'even number of chunks in ['+chunks+'] when it should be odd.');
        var fragment = document.createDocumentFragment();
        var i;
        for (i = 0; i < chunks.length; ++i) {
            if (i & 1) {
                // odd index: a chunk that matched the pattern.
                fragment.appendChild(replace(chunks[i]));
            } else if (chunks[i].length) {
                // even index: plain text; don't bother adding empty chunks.
                fragment.appendChild(document.createTextNode(chunks[i]));
            }
        }
        node.parentNode.replaceChild(fragment, node);
    };
}

/*
    createAnchorWrap builds the wrapper that turns a word into an anchor
    element whose text is the word and whose title comes from 'title'.

Arguments:
    title (string || function, optional): Source of the anchor's title.
     A function is called with the word being wrapped and its result
     becomes the title. A string is handed to createWordCounter and the
     resulting counter supplies the titles. When omitted (or otherwise
     falsy), a counter from createWordCounter() is used.

Returns a function that takes a string and returns an anchor element.
*/
function createAnchorWrap(title) {
    var makeTitle = title;
    if (typeof title == 'string') {
        makeTitle = createWordCounter(title);
    } else if (!title) {
        makeTitle = createWordCounter();
    }
    return function (word) {
        var anchor = document.createElement('a');
        anchor.title = makeTitle(word);
        var label = document.createTextNode(word);
        anchor.appendChild(label);
        return anchor;
    };
}

/*
    createWordCounter creates a word counter, which returns the number of
    times it's been called (including the current call), prefixed by a string.
    Each counter created keeps its own count, starting at 1 on the first call.

Arguments:
    pre (string, optional): prefix for return value. Trailing spaces are
     replaced by exactly one space. Defaults to 'word '.

Returns a function that takes a string (ignored) and returns a string.

*/
function createWordCounter(pre) {
    var wordCount = 0;
    if (pre) {
        // leave exactly one space between the prefix and the number.
        pre = pre.replace(/ *$/, ' ');
    } else {
        pre = 'word ';
    }
    return function (text) {
        // Count this call; without the increment every result was 'word 0'.
        wordCount += 1;
        return pre + wordCount;
    };
}

最后要做的是启动整个处理过程,例如在load事件处理程序中,或在页面底部的脚本中。

// Start-up: in every text node below document.body, wrap each run of
// characters in the range U+05D0-U+05FF in an anchor produced by
// createAnchorWrap().
forEachTextNodeInBody(
    wrapText(
        /([\u05D0-\u05FF]+)/g,
        createAnchorWrap()
    )
);

如果你想更改标题的前缀,请把createWordCounter(...)的结果传递给createAnchorWrap。

+0

这样的文件上工作好吧,那是一个开始。所以,Javascript内置了对RegEx的支持。好,很好。现在,关于在HTML文档中查找文本的那一点...... – 2010-02-09 03:18:05

+0

好的,所以现在您已经编写了一些JavaScript函数来遍历树。看起来我可以使用forEachTextNode(action)以某种方式将文本节点替换为anchor元素。好的。我会看看我能做什么。感谢你目前的帮助。 – 2010-02-09 03:28:49

+0

请注意,使用JS库(jQuery,Prototype,MooTools ...)可能仍然是一个好主意。 – outis 2010-02-09 03:33:54