2014-09-01 85 views
-1

我的HTML页面中嵌有一段JavaScript。我想找出以“Doc”结尾的链接并继续访问它们。在这个HTML中,只有一个这样的链接,名为SunnydataDoc。所以我想在这个页面上搜索这个字符串;如果存在以“Doc”结尾的链接,我想进一步访问这些链接所指向的页面。你能帮我解决吗?我听说可以把正则表达式及其匹配(Matcher)方法与Jsoup结合使用。下面是我的代码(Jsoup和JavaScript代码)。

<script> 
    var data = {"totalRecords": 2, "sort": "name", "startIndex": 0, "dir": "asc", "records": [{"raw_name": "samia/export/sunnydata", "last_changeset": "\n <div>\n  <pre><a title=\"ownerID:\n\nAdded tag V2.11.d50.mkt.001 for changeset 56e10a4864ff\" class=\"tooltip\" href=\"/samia/export/sunnydata/changeset/f602409eba261d749d23dc75551b2959425dfa8d\">r17:f602409eba26</a></pre>\n </div>\n", "atom": "\n <a title=\"Subscribe to samia/export/sunnydata atom feed\" href=\"/samia/export/sunnydata/feed/atom?api_key=e214ebea2335318bee1460a1fd33725ab3e1002e\"><i class=\"icon-rss-sign\" style=\"color: #fa9b39\"></i></a>\n", "owner": "ownerID (Owner)", "rss": "\n <a title=\"Subscribe to samia/export/sunnydata rss feed\" href=\"/samia/export/sunnydata/feed/rss?api_key=e214ebea2335318bee1460a1fd33725ab3e1002e\"><i class=\"icon-rss-sign\" style=\"color: #fa9b39\"></i></a>\n", "name": "\n \n <div style=\"white-space: nowrap; }\">\n  <a href=\"/samia/export/sunnydata\">\n\n  <span title=\"Mercurial repository\"><i class=\"icon-hg\" style=\"color: #316293; font-size: 14px;\"></i></span>\n\n  <span style=\"margin: 0px 8px 0px 8px\"></span>\n Sunnydata\n </a>\n </div>\n", "last_rev_raw": 17, "state": "\n <div>\n  <div class=\"btn btn-mini btn-success disabled\">Created</div>\n </div>\n", "menu": "\n <ul class=\"menu_items hidden\">\n\n <li style=\"border-top:1px solid #003367;margin-left:18px;padding-left:-99px\"></li>\n <li>\n  <a title=\"Summary\" href=\"/samia/export/sunnydata\">\n  <span class=\"icon\">\n   <i class=\"icon-file-text\"></i>\n  </span>\n  <span>Summary</span>\n  </a>\n </li>\n <li>\n  <a title=\"Changelog\" href=\"/samia/export/sunnydata/changelog\">\n  <span class=\"icon\">\n   <i class=\"icon-list-alt\"></i>\n  </span>\n  <span>Changelog</span>\n  </a>\n </li>\n <li>\n  <a title=\"Files\" href=\"/samia/export/sunnydata/files/tip/\">\n  <span class=\"icon\">\n   <i class=\"icon-file-alt\"></i>\n  </span>\n  <span>Files</span>\n  </a>\n </li>\n <li>\n  <a 
title=\"Fork\" href=\"/samia/export/sunnydata/fork\">\n  <span class=\"icon\">\n   <i class=\"icon-code-fork\"></i>\n  </span>\n  <span>Fork</span>\n  </a>\n </li>\n </ul>\n", "desc": "GHU Sunnydataimport", "last_change": "\n <span class=\"tooltip\" date=\"2014-08-21 18:49:50\" title=\"Thu, 21 Aug 2014 18:49:50\">10 days and 16 hours ago</span>\n"}, {"raw_name": "samia/export/sunnydatadoc", "last_changeset": "\n <div>\n  <pre><a title=\"ownerID;lt;owneremail;gt;:\n\nChangedokumentation\" class=\"tooltip\" href=\"/samia/export/sunnydataDoc/changeset/9ed1679c7a35b76e1402b540cee38000461fdfdd\">r0:9ed1679c7a35</a></pre>\n </div>\n", "atom": "\n <a title=\"Subscribe to samia/export/sunnydataDoc atom feed\" href=\"/samia/export/sunnydataDoc/feed/atom?api_key=e214ebea2335318bee1460a1fd33725ab3e1002e\"><i class=\"icon-rss-sign\" style=\"color: #fa9b39\"></i></a>\n", "owner": "ownerID (Owner)", "rss": "\n <a title=\"Subscribe to samia/export/sunnydataDoc rss feed\" href=\"/samia/export/sunnydataDoc/feed/rss?api_key=e214ebea2335318bee1460a1fd33725ab3e1002e\"><i class=\"icon-rss-sign\" style=\"color: #fa9b39\"></i></a>\n", "name": "\n \n <div style=\"white-space: nowrap; }\">\n  <a href=\"/samia/export/sunnydataDoc\">\n\n  <span title=\"Mercurial repository\"><i class=\"icon-hg\" style=\"color: #316293; font-size: 14px;\"></i></span>\n\n  <span style=\"margin: 0px 8px 0px 8px\"></span>\n SunnydataDoc\n </a>\n </div>\n", "last_rev_raw": 0, "state": "\n <div>\n  <div class=\"btn btn-mini btn-success disabled\">Created</div>\n </div>\n", "menu": "\n <ul class=\"menu_items hidden\">\n\n <li style=\"border-top:1px solid #003367;margin-left:18px;padding-left:-99px\"></li>\n <li>\n  <a title=\"Summary\" href=\"/samia/export/sunnydataDoc\">\n  <span class=\"icon\">\n   <i class=\"icon-file-text\"></i>\n  </span>\n  <span>Summary</span>\n  </a>\n </li>\n <li>\n  <a title=\"Changelog\" href=\"/samia/export/sunnydataDoc/changelog\">\n  <span class=\"icon\">\n   <i 
class=\"icon-list-alt\"></i>\n  </span>\n  <span>Changelog</span>\n  </a>\n </li>\n <li>\n  <a title=\"Files\" href=\"/samia/export/sunnydataDoc/files/tip/\">\n  <span class=\"icon\">\n   <i class=\"icon-file-alt\"></i>\n  </span>\n  <span>Files</span>\n  </a>\n </li>\n <li>\n  <a title=\"Fork\" href=\"/samia/export/sunnydataDoc/fork\">\n  <span class=\"icon\">\n   <i class=\"icon-code-fork\"></i>\n  </span>\n  <span>Fork</span>\n  </a>\n </li>\n </ul>\n", "desc": "GHU Sunnydataimport (Dokumentation)", "last_change": "\n <span class=\"tooltip\" date=\"2014-04-25 11:03:45\" title=\"Fri, 25 Apr 2014 11:03:45\">4 months and 6 days ago</span>\n"}]}; 
    var myDataSource = new YAHOO.util.DataSource(data); 
    myDataSource.responseType = YAHOO.util.DataSource.TYPE_JSON; 

所以在这个例子中我有这个链接:href=\"/samia/export/sunnydataDoc\"。我想提取这个链接,并用我的代码进入它所指向的页面。

这是我的Java代码。

public class JScripttest { 

/**
 * Logs in with a POST request, re-uses the returned session cookies to fetch a page,
 * then scans the HTML of the last element matched by the selector for {@code href}
 * values ending in {@code Doc}, printing each full match followed by capture group 1.
 *
 * <p>NOTE(review): as written this method does not compile; see the note on the
 * {@code Pattern.compile} line below.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared by this method; presumably raised by the Jsoup
 *     {@code execute()} / {@code get()} network calls - confirm against the jsoup API
 */
public static void main(String[] args) throws IOException { 

    // Step 1: submit the login form. "url", "username" and "password" are placeholders.
    Response res = Jsoup 
      .connect(
        "url") 
      .data("username", "username", "password", "password") 
      .method(Method.POST).execute(); 
    // Step 2: carry the cookies from the login response over to the page request.
    Map<String, String> loginCookies = res.cookies(); 
    Document doc = Jsoup.connect("url") 
      .cookies(loginCookies).get(); 


    // NOTE(review): "href" is passed as a CSS selector, i.e. it looks for elements whose
    // tag name is <href>, not for href attributes. The variable name suggests "script"
    // was intended - confirm. If nothing matches, last() presumably returns null and
    // script.html() below would throw a NullPointerException - verify against jsoup docs.
    Element script = doc.select("href").last(); 

    // NOTE(review): this string literal is malformed Java. The inner double quotes are
    // not escaped, so the literal ends right after "href\s=\s" and the rest of the line
    // is a syntax error. Inside a Java string the regex needs doubled backslashes and
    // escaped quotes, e.g.  "href\\s*=\\s*\"([^\"]+Doc)\""
    // If the searched text is a JSON string inside a <script>, the quotes may appear as
    // \" there, in which case the pattern must also allow a backslash - TODO confirm.
    Pattern p = Pattern.compile("href\s=\s"([^"]+Doc)""); // Regex for the value of the href 
    Matcher m = p.matcher(script.html()); // you have to use html here and NOT text! Text will drop the 'href' part 

    // Print every occurrence: group() is the whole match, group(1) the captured value.
    while(m.find()) 
    { 
     System.out.println(m.group()); 
     System.out.println(m.group(1)); 
    } 

    } 



/**
 * Renders {@code msg} as a {@link String#format(String, Object...)} template using
 * {@code args} and writes the result, followed by a line break, to standard output.
 *
 * @param msg  format string
 * @param args values substituted into {@code msg}
 */
private static void print(String msg, Object... args) {
    final String rendered = String.format(msg, args);
    System.out.println(rendered);
}

我在“Pattern ...”这一行得到了错误。

感谢您的关注。

回答

0

这个正则表达式可以让你获得以Doc结尾的链接。我不确定你所说的“进入”是什么意思,但这应该可以帮到你。捕获组1包含该URL。

href\s*=\s*"([^"]+Doc)" 

Regular expression visualization

​​

在Java中经过正确转义后的写法:

Pattern p = Pattern.compile("href\\s*=\\s*\"([^\"]+Doc)\""); 
+0

我使用了这个Pattern和Matcher。这样对吗?我收到了错误。`Pattern p = Pattern.compile(href\s*=\s*"([^"]+Doc)"); Matcher m = p.matcher(script.html());` – michaelsteven 2014-09-01 10:38:04

+0

您似乎混淆了Java和JavaScript;尽管它们的名字相似,但两者并没有什么关系,你的'