2016-12-15 74 views
1

我正在尝试使用Jsoup来阅读论坛页面,但我无法这样做。我已成功登录,比我能够阅读第一页或列表页面。但是,当我去线程页,它给我403下面的代码:成功登录JSOUP后无法读取线程页面

Connection.Response loginForm = Jsoup.connect("http://picturepub.net/index.php?login/login").method(Connection.Method.GET) 
    .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").timeout(0).execute(); 

Document doc = Jsoup.connect("http://picturepub.net/index.php?login/login").data("cookieexists", "false").data("cookie_check", "1").data("login", "swordblazer") 
    .data("password", "picturepub").data("register", "0").data("redirect", "/index.php").cookies(loginForm.cookies()) 
    .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").post(); 

doc = loginForm.parse(); 

Map<String, String> cookies = loginForm.cookies(); 

List<String> urls = new ArrayList<String>(); 
List<String> threadUrls = new ArrayList<String>(); 
int h = 0; 
for (int i = 1; i < 20; i++) { 
    if (i == 1) 
    doc = Jsoup.connect("http://picturepub.net/index.php?forums/photoshoots-magazines.51/") 
     .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").cookies(cookies).get(); 
    else 
    doc = Jsoup.connect("http://picturepub.net/index.php?forums/photoshoots-magazines.51/page-" + i) 
     .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0").cookies(cookies).get(); 

    // get all links 
    Elements links = doc.select("a[href]"); 
    System.out.println(doc.title()); 
    for (Element element : links) { 
    if (element.absUrl("href").contains("threads")) { 
     String linkImage = element.absUrl("href"); 
     Document document = Jsoup.connect(linkImage).cookies(cookies).get(); 

     if (!threadUrls.contains(linkImage)) { 
     threadUrls.add(linkImage); 
     h++; 
     } 

    } 
    } 
} 
+0

你得到'403'可能是因为你缺少一些参数/ cookie。如果您已经想出如何登录,而不是使用相同的方法来监控浏览器与网站之间的流量并查看您的浏览器正在发送的内容。 – TDG

+0

我做到了。除了需要发送给服务器的cookie以外,还有其他什么吗? – user236928

+0

Cookie和所需的参数。 – TDG

回答

0

JSoup连接是彼此无关,所以他们不共享“登入状态/会话”。你必须仔细地复制它们之间的状态。你得到HTTP 403,因为有几个原因:

  • loginForm响应不返回身份验证cookie,你不能使用它们授权的资源,但是您以后使用这些Cookie。
  • 要获取身份验证Cookie,您必须从POST http://picturepub.net/index.php?login/login响应中获取Cookie,并且不要将其转换为文档。必须使用method(POST)将第二个请求声明为POST请求。
  • 失败的请求,Jsoup.connect(linkImage).cookies(cookies).get();未命中User-Agent

为了使你的代码不易出错,你应该考虑重构为的事情,这将使你的代码更健壮之一。

private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"; 
private static final String BASE_URL = "http://picturepub.net/index.php"; 
private static final int PAGE_COUNT = 20; 

static void grab() 
     throws IOException { 
    out.println("Getting the login form..."); 
    final Response getLoginFormResponse = prepareConnection(GET, "?login/login", emptyMap()) 
      .execute(); 
    out.println("Posting the login data..."); 
    // Avoid converting to document when it's unnecessary and use `execute()` 
    final Response postLoginFormResponse = prepareConnection(POST, "?login/login", getLoginFormResponse.cookies()) 
      .data("cookieexists", "false") 
      .data("cookie_check", "1") 
      .data("login", ...YOUR USERNAME...) 
      .data("password", ...YOUR PASSWORD...) 
      .data("register", "0") 
      .data("redirect", "/index.php") 
      .execute(); 
    // Obtain the authentication cookies 
    final Map<String, String> cookies = postLoginFormResponse.cookies(); 
    // If you want to discard duplicates, just don't use lists -- sets are designed for unique elements. 
    // The `h` is unnecessary because you can query the collection for its size: threadUrls.size() 
    final Collection<String> threadUrls = new LinkedHashSet<>(); 
    for (int i = 1; i <= PAGE_COUNT; i++) { 
     out.printf("Page #%d...\n", i); 
     final Document getPageDocument = prepareConnection(GET, "?forums/photoshoots-magazines.51/" + (i == 1 ? "" : "page-" + i), cookies) 
       .execute() 
       .parse(); 
     out.printf("Page #%d: %s\n", i, getPageDocument.title()); 
     // `a[href*=threads/]` is a selector to obtain all links having the "threads/" in <A> element URLs -- no need to check for substring later 
     // The following code uses Java 8 streams to filter out duplicate links on the page 
     final Iterable<String> hrefs = getPageDocument.select("a[href*=threads/]") 
       .stream() 
       .map(e -> e.absUrl("href")) 
       .collect(toSet()); 
     for (final String href : hrefs) { 
      out.printf("Probing: %s ... ", href); 
      final Response analyzeMeResponse = prepareConnection(GET, stripBaseUrl(href), cookies) 
        .execute(); 
      threadUrls.add(href); 
      out.println("Done!"); 
     } 
    } 
    out.println(threadUrls); 
} 

private static String stripBaseUrl(final String url) 
     throws IllegalArgumentException { 
    if (!url.startsWith(BASE_URL)) { 
     // This must not happen for a well-written parser 
     throw new IllegalArgumentException(url); 
    } 
    return url.substring(BASE_URL.length()); 
} 

// Just make sure that a particular connection is: 
// * bound to the BASE_URL defined above 
// * bound to a specific HTTP method 
// * follows redirects 
// * User-Agent is set 
// * cookies are always set 
private static Connection prepareConnection(final Method method, final String url, final Map<String, String> cookies) { 
    return connect(BASE_URL + url) 
      .followRedirects(true) 
      .method(method) 
      .userAgent(USER_AGENT) 
      .cookies(cookies); 
} 

上面的代码是基于org.jsoup:json:1.10.1自上次JSoup版本无法处理HTTP 307 Temporary Redirect所使用的那个网址。

相关问题