2009-09-04 634 views
3

我已经读过关于“How to download and save a file from internet using Java”的精彩讨论。但是,如果我执行下面的代码,得到的却是一个损坏的PDF。有人知道这是为什么吗?即:用Java下载的PDF文件为何会损坏?

import java.io.*; 
import java.net.*; 

/**
 * Downloads one file from a URL and stores it on the local disk.
 *
 * <p>The bytes are written exactly as the server sends them. This code sets no
 * request headers and does not inspect the response, so whatever the server
 * returns for the plain request (a PDF, an HTML page, ...) ends up in the file.
 */
public class PDFDownload {
    /**
     * Base address the file name is appended to. Note that this field shares its
     * simple name with the {@code java.net.URL} type used in {@link #saveUrl}.
     */
    public static String URL = "http://www.nbc.com/Heroes/novels/downloads/";

    /** Local directory (with trailing slash) the downloaded file is written into. */
    public static String FOLDER = "C:/Users/sdelamo/workspace/SandBox/HeroesNovel/";

    /**
     * Downloads {@code Heroes_novel_001.pdf} from {@link #URL} into {@link #FOLDER}
     * and prints the exception type name if the download fails.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String filename = "Heroes_novel_001.pdf";
        try {
            saveUrl(FOLDER + filename, URL + filename);
        } catch (MalformedURLException e) {
            System.out.println("MalformedURLException");
        } catch (IOException e) {
            System.out.println("IOException");
        }
    }

    /**
     * Copies everything readable from {@code urlString} into the file {@code filename}.
     *
     * <p>The URL is parsed and opened before the target file is created, so a
     * malformed or unreachable URL leaves an existing target file untouched.
     *
     * @param filename  path of the file to create or overwrite
     * @param urlString address to read from
     * @throws MalformedURLException if {@code urlString} cannot be parsed as a URL
     * @throws IOException           if opening, reading, writing or closing fails
     */
    public static void saveUrl(String filename, String urlString) throws MalformedURLException, IOException {
        URL url = new URL(urlString);
        InputStream in = new BufferedInputStream(url.openStream());
        try {
            OutputStream fout = new FileOutputStream(filename);
            try {
                byte[] data = new byte[1024];
                int count;
                while ((count = in.read(data, 0, data.length)) != -1) {
                    fout.write(data, 0, count);
                }
            } finally {
                // Closed in its own finally so the file handle is released even
                // when reading fails part-way through.
                fout.close();
            }
        } finally {
            // Runs regardless of what happened to the output stream, so neither
            // resource can be leaked by a failure while closing the other.
            in.close();
        }
    }
}

上面的代码下载到的是HTML,而不是PDF。输出如下:

<?xml version="1.0" encoding="UTF-8" ?> 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML Basic 1.1//EN" 
    "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd"> 

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 
<head> 

<meta name="viewport" content="width=240, user-scalable=yes" /> 
<HTTP-EQUIV="PRAGMA" CONTENT="NO-CACHE"> 
<META HTTP-EQUIV="Expires" CONTENT="-1"> 
<meta http-equiv="Cache-control" content="no-cache"> 
<meta http-equiv="Cache-control" content="must-revalidate"> 
<meta http-equiv="Cache-control" content="max-age=0"> 
<meta http-equiv="refresh" content="200"> 

<title>NBC.com: Heroes</title> 
<link rel="stylesheet" type="text/css" href="/style/default.css?sid=8a9212f822e1c675330ec418bc531169" /> 
<link rel="stylesheet" type="text/css" href="/style/hro.css?sid=8a9212f822e1c675330ec418bc531169" /> 

</head> 
<body> 
<center><img src="http://oimg.nbcuni.com/b/ss/nbcunbcnetworkwapbu,nbcuwapsitebu/5/H.8--WAP/4aa0e4cb8b448?vid=8a9212f822e1c675330ec418bc531169&gn=NBC.com Front Door&c2=&c3=Miscellaneous&c4=&c6=m.nbc.com/show/hro&c8=TV Entertainment&c9=NBC Network&c10=&c11= | &c12= | &c25=offdeck&c27=internal&c29=&c44=D=User-Agent&r=" width="5" height="5" border="0" /></center> 
<h1 id="fHeader"> 
<a href="/?sid=8a9212f822e1c675330ec418bc531169"> 
<img src="/images/nbc_logo.gif" alt="NBC : logo" border="0" /> 
</a> 
</h1> 

<h2> 
<a href="/show/hro?sid=8a9212f822e1c675330ec418bc531169"> 
<img src="/images/shows/1221684699_Heroes_WAP_166x54.jpg" alt="Heroes : showheader" border="0" /> 
</a> 
</h2> 
<div id="tunein_nexton"> 
    <span id="tunein">Mondays 9/8c</span> 
</div><!--end #tunein_nexton--> 
<div id="tunein_nexton"> 
    <!--<span id="tunein">Mondays 8/7c</span>--> 

    <p id="nexton"><span class="sectiontitle"></span></p> 
</div><!--end #tunein_nexton--> 
<div id="featuredcontent"> 
    <h3>FEATURED CONTENT</h3> 
    <table id="featuredItemsTable"> 

     <tr> 
      <td><a href="/show/hro/videos.html?sid=8a9212f822e1c675330ec418bc531169"><img src="/images/hro/nbc_hro_pro_040X921HRO120FLYPSIDE_exp921_20090_543_large.jpg" alt="featured" /></a> 
      </td> 
      <td> 
       <span class="ftitle">Dreams</span> 
       <span class="fdesc">Heroes premieres Mon., Sept. 21s...</span> 
      </td> 
     </tr> 
             <tr> 
      <td><a href="/show/hro/recaps.html?sid=8a9212f822e1c675330ec418bc531169"><img src="http://origin-www.nbc.com/Heroes/images/episodes/season3/325/hro_325_01.jpg" alt="featured" height="45" width="80"/></a> 
      </td> 
      <td> 
       <span class="ftitle">Recap:</span> 
       <span class="fdesc">Season 3 Episode An Invisible Thread</span> 
      </td> 
     </tr> 
             <tr> 
      <td><a href="/show/hro/photos.html?sid=8a9212f822e1c675330ec418bc531169"><img src="http://origin-www.nbc.com/app2/img/200x200xS/scet/photos/51/3736/NUP_110031_0323.JPG" alt="featured" height="45" width="80"/></a> 
      </td> 
      <td class="finfo"> 
       <span class="ftitle">Photo:</span> 
       <span class="fdesc">Heroes "Cast Photos"</span> 
      </td> 
     </tr> 
        </table> 


</div><!--end #featuredcontent--> 

<h3>HEROES</h3> 
<table class="showNav"> 
    <tr><td><a href="/show/hro/about.html?sid=8a9212f822e1c675330ec418bc531169" accesskey="1">About</a></td></tr> 
     <tr><td><a href="/show/hro/videos.html?sid=8a9212f822e1c675330ec418bc531169" accesskey="2">Videos</a></td></tr> 
       <tr><td><a href="/show/hro/recaps.html?sid=8a9212f822e1c675330ec418bc531169" accesskey="3">Episode Recaps</a></td></tr> 
        <tr><td><a href="/show/hro/photos.html?sid=8a9212f822e1c675330ec418bc531169" accesskey="4">Photos</a></td></tr> 
       <tr><td><a href="/show/hro/community.html?sid=8a9212f822e1c675330ec418bc531169" accesskey="5">Community</a></td></tr> 
    <tr><td><a href="/shows.shtml?sid=8a9212f822e1c675330ec418bc531169" accesskey="6">Shows List</a></td></tr> 
</table> 
<!-- <a href="http://www.insightexpress.com/ix/Survey.aspx?id=151580&accessCode=3161643404&sid=8a9212f822e1c675330ec418bc531169" ><img src="/images/mNBCcom_166x54.jpg" border="0"></a> --> 



<div class="footer" align="center"><a href="http://m.nbc.com?sid=8a9212f822e1c675330ec418bc531169"><strong>NBC Mobile Main</strong></a> | <a href="/terms.shtml?sid=8a9212f822e1c675330ec418bc531169"><strong>Terms of Use</strong></a> | <a href="/privacy.shtml?sid=8a9212f822e1c675330ec418bc531169"><strong>Privacy</strong></a></div><div class="cpyrt" align="center">&#169; NBC Universal, Inc.</div> 

</body> 
</html> 

有人知道该如何下载这个PDF吗?

解决方案

在连接前设置User-Agent。

// Open the connection explicitly so that request headers can be set before connecting.
// `urlString` and `in` are declared outside this fragment.
URL u = new URL(urlString); 
HttpURLConnection huc = (HttpURLConnection) u.openConnection(); 
huc.setRequestMethod("GET"); 
// Send a browser-like User-Agent header with the request.
// NOTE(review): the header value starts with a space -- presumably unintentional; confirm.
huc.setRequestProperty("User-Agent", " Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"); 
huc.connect();   

// Read the response body through a buffered stream.
in = new BufferedInputStream(huc.getInputStream()); 

回答

1

这与您的另一个问题是同一个问题。如果NBC.com认为你是一个爬虫(scraper),它就不会把PDF发回给你 :)

相同的小把戏,

conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.13) Gecko/2009073021 Firefox/3.0.13"); 
3

您是否尝试过使用例如文本编辑器查看下载的文件?

您会看到它包含一个HTML页面,而不是PDF。可能URL没有指向PDF,或者有一些重定向正在进行,标准java.net类默认不支持。

确保URL正确指向PDF。您可以使用Apache HttpClient用HTTP做更复杂的事情,包括自动处理HTTP重定向。

注意:您发布的代码无法编译,因为有一个 } 放错了位置。

+0

该代码*不*点到PDF,我相信。他将URL附加到URL。 – 2009-09-04 09:56:13

+0

现在,它编译 – 2009-09-04 09:58:17

+0

我用一个编辑器打开了PDF,里面有一个html文件 – 2009-09-04 10:00:48

1

检查一下得到的文件——我猜它是一个HTML文件。如果请求中没有Referer,该网站可能会返回错误;它也可能使用JavaScript重定向页面,或者做了其他处理。您可以使用HttpURLConnection类来检查服务器返回的HTTP标头。

// Diagnostic fragment: send a HEAD request and print every response header.
// Uses Map and List from java.util in addition to the java.net classes.
URL url = new URL(
    "http://www.nbc.com/Heroes/novels/downloads/Heroes_novel_001.pdf"); 
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); 
// HEAD requests the headers only, without a response body.
conn.setRequestMethod("HEAD"); 
try { 
    // getHeaderFields() yields each header name together with its list of values.
    for (Map.Entry<String, List<String>> header : conn.getHeaderFields() 
     .entrySet()) { 
    System.out.println(header.getKey() + "=" + header.getValue()); 
    } 
} finally { 
    // Always release the connection, even if reading the headers fails.
    conn.disconnect(); 
} 

上面的代码返回的Content-Type是text/html。

+0

你是对的。我用一个编辑器打开它,里面有一个html – 2009-09-04 10:06:12

1

对于这种探索,我强烈建议Jython(或Groovy,或...)。例如:

 
C:\Users\Vinay>jython 
Jython 2.5.0 (Release_2_5_0:6476, Jun 16 2009, 13:33:26) 
[Java HotSpot(TM) Client VM (Sun Microsystems Inc.)] on java1.6.0_16 
Type "help", "copyright", "credits" or "license" for more information. 
>>> s = "http://www.nbc.com/Heroes/novels/downloads/Heroes_novel_001.pdf" 
>>> import java.net 
>>> import jarray 
>>> u = java.net.URL(s) 
>>> os = u.openStream() 
>>> buffer = jarray.zeros(1024, 'b') 
>>> n = os.read(buffer, 0, 1024) 
>>> java.lang.String(buffer) 
<?xml version="1.0" encoding="UTF-8" ?> 
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML Basic 1.1//EN" 
    "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd"> 

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 
<head> 

<meta name="viewport" content="width=240, user-scalable=yes" /> 
<HTTP-EQUIV="PRAGMA" CONTENT="NO-CACHE"> 
<META HTTP-EQUIV="Expires" CONTENT="-1"> 
<meta http-equiv="Cache-control" content="no-cache"> 
<meta http-equiv="Cache-control" content="must-revalidate"> 
<meta http-equiv="Cache-control" content="max-age=0"> 
<meta http-equiv="refresh" content="200"> 
<title>NBC.com: Heroes</title> 
<link rel="stylesheet" type="text/css" href="/style/default.css?sid=c67ddc30f79 
ec4cc811f6e67e383fed7" /> 
<link rel="stylesheet" type="text/css" href="/style/hro.css?sid=c67ddc30f79ec4c 
c811f6e67e383fed7" /> 

</head> 
<body> 
<center><img src="http://oimg.nbcuni.com/b/ss/nbcunbcnetworkwapbu,nbcuwapsitebu/ 
5/H.8--WAP/4aa0e7ce2535c?vid=c67ddc30f79ec4cc811f6e67e383fed7&gn=NBC.com Front 
>>> 

这印证了你的发现,而且无需经过编辑/编译的循环就能得到结果。这只是我的一点浅见……

至于如何获取数据——你可能需要伪造User-Agent头。用Firefox访问同一个URL时,返回的Content-Type是application/pdf,内容是PDF文件。

更新:以下Jython脚本:

# Jython script: request the PDF URL with a browser-like User-Agent header
# and print the first BUFLEN bytes of the response.
import java.net 
import jarray 

s = "http://www.nbc.com/Heroes/novels/downloads/Heroes_novel_001.pdf" 
u = java.net.URL(s) 
c = u.openConnection() 
# The request header is set before connect() is called.
c.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.2) Gecko/20090810 Ubuntu/9.10 (karmic) Firefox/3.5.2") 
# Only the first few bytes are needed to see what kind of content came back.
BUFLEN = 4 
# Zero-filled array of BUFLEN elements ('b' is the byte typecode).
buffer = jarray.zeros(BUFLEN, 'b') 
c.connect() 
stream = c.getInputStream() 
# The number of bytes actually read is not checked here.
stream.read(buffer, 0, BUFLEN) 
data = java.lang.String(buffer) 
print data 

打印

%PDF

所以该网站会检查User-Agent头。

+0

如何欺骗用户代理头? – 2009-09-04 10:29:26

+1

如果您坚持使用Java的`HttpURLConnection`,请在连接之前将其设置为请求属性。(请注意,伪造User-Agent在这种情况下可能有效,但这只是Web服务器用来区分真实浏览器与机器人/爬虫等的若干手段之一。) – McDowell 2009-09-04 10:46:07

0

如果设置User-Agent并没有解决这个问题,那么可能是Cookie的问题。安装一个简单的浏览器插件(如EditThisCookie、HTTP Spy for Chrome),检查请求和响应标头。获取这些cookie值,并在同一个HttpURLConnection上设置它们。

代码:(在Sergio del Amo发布的解决方案基础上扩展)

// Sends a GET request with a browser-like User-Agent plus a Cookie header,
// for sites that also check cookies. `urlString` and `in` are declared
// outside this fragment.
URL u = new URL(urlString); 
HttpURLConnection huc = (HttpURLConnection) u.openConnection(); 
huc.setRequestMethod("GET"); 
huc.setRequestProperty("User-Agent", " Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729)"); 

// Placeholder name=value pairs; replace them with the values observed in the
// browser. Pairs are separated by "; " (semicolon followed by a space).
String myCookies = "cookie_name_1=cookie_value_1; cookie_name_2=cookie_value_2"; 
huc.setRequestProperty("Cookie", myCookies); 

// Request properties are set before connect() is called.
huc.connect();   

// Read the response body through a buffered stream.
in = new BufferedInputStream(huc.getInputStream());