
I am running RStudio on an EC2 instance that has an IAM role attached granting it full S3 access. I want to read a file from S3 into RStudio. How do I access S3 data from RStudio on EC2 using IAM role authentication?

I tried to do this via sparklyr as follows:

spark_install(version = "2.1.0") 
sc <- spark_connect(master = "local") 

ctx <- sparklyr::spark_context(sc) 

# Use the context above to obtain the Java Spark context 
jsc <- invoke_static( 
    sc, 
    "org.apache.spark.api.java.JavaSparkContext", 
    "fromSparkContext", 
    ctx 
) 

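# Grab the Hadoop configuration and point both the s3:// and s3a:// schemes at the S3A filesystem 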
hconf <- jsc %>% invoke("hadoopConfiguration") 
hconf %>% invoke("set","fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 
hconf %>% invoke("set","fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 
usercsv_tbl <- spark_read_csv(sc,name = "temp",path = "s3a://<bucket>/filename.csv") 

I get the following error:

Error: java.lang.RuntimeException: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found 
    at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2195) 
    at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2654) 
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667) 
    at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94) 
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703) 
    at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685) 
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373) 
    at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295) 
    at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:372) 
    at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:370) 
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241) 
    at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241) 
    at scala.collection.immutable.List.foreach(List.scala:381) 
    at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241) 
    at scala.collection.immutable.List.flatMap(List.scala:344) 
    at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:370) 
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:152) 
    at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:415) 
    at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:352) 
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 
    at java.lang.reflect.Method.invoke(Method.java:498) 
    at sparklyr.Invoke$.invoke(invoke.scala:102) 
    at sparklyr.StreamHandler$.handleMethodCall(stream.scala:97) 
    at sparklyr.StreamHandler$.read(stream.scala:62) 
    at sparklyr.BackendHandler.channelRead0(handler.scala:52) 
    at sparklyr.BackendHandler.channelRead0(handler.scala:14) 
    at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 
    at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 
    at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:293) 
    at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:267) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 
    at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:346) 
    at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1294) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:367) 
    at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:353) 
    at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:911) 
    at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131) 
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:652) 
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:575) 
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:489) 
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:451) 
    at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140) 
    at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:144) 
    at java.lang.Thread.run(Thread.java:748) 
Caused by: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found 
    at org.apache.hadoop.conf.Configuration.getClassByName(Configuration.java:2101) 
    at org.apache.hadoop.conf.Configuration.getClass(Configuration.java:2193) 
    ... 52 more 

I also tried the cloudyr aws.s3 package, as follows:

library(aws.s3) 
get_bucket(bucket = "<bucketname>") 
I got the following error:
List of 4 
$ Code  : chr "AccessDenied" 
$ Message : chr "Access Denied" 
$ RequestId: chr "CF4041D52D7523D2" 
$ HostId : chr "vtkUIF7qsUwlGxBUaDpfXk9f6QHIelLxcsV0Nigla9yJicBl1YpxtrgGr82IoMyYPu6uvDSpAGI=" 
- attr(*, "headers")=List of 6 
    ..$ x-amz-request-id : chr "CF4041D52D7523D2" 
    ..$ x-amz-id-2  : chr "vtkUIF7qsUwlGxBUaDpfXk9f6QHIelLxcsV0Nigla9yJicBl1YpxtrgGr82IoMyYPu6uvDSpAGI=" 
    ..$ content-type  : chr "application/xml" 
    ..$ transfer-encoding: chr "chunked" 
    ..$ date    : chr "Mon, 28 Aug 2017 17:49:48 GMT" 
    ..$ server   : chr "AmazonS3" 
    ..- attr(*, "class")= chr [1:2] "insensitive" "list" 
- attr(*, "class")= chr "aws_error" 
NULL 
Error in parse_aws_s3_response(r, Sig, verbose = verbose) : 
    Forbidden (HTTP 403). 

How can I use the EC2 instance's IAM role so that I don't have to manually supply my credentials to access S3 data?


For cloudyr, you have to install the **aws.ec2metadata** package in order for it to pick up the EC2 instance metadata. You can check by calling 'aws.signature::locate_credentials()' – Thomas
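
A minimal sketch of that suggestion (assuming the instance's IAM role grants S3 access; "<bucketname>" is a placeholder):

install.packages("aws.ec2metadata") # once installed, aws.signature can discover credentials via the EC2 instance metadata service 
library(aws.s3) 

aws.signature::locate_credentials() # should now report credentials sourced from the instance's IAM role 
get_bucket(bucket = "<bucketname>") # succeeds via the IAM role, with no keys set manually 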


Thank you! That worked! – charmander

Answers


When you launch Spark, you need to include the org.apache.hadoop:hadoop-aws:2.7.3 package; it is what allows Spark to talk to S3. The absence of this package is why you get the Class org.apache.hadoop.fs.s3a.S3AFileSystem not found error.

For me, the following worked:

config <- spark_config() 
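# pull in the hadoop-aws artifact so the S3AFileSystem class is on Spark's classpath 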
config$sparklyr.defaultPackages <- "org.apache.hadoop:hadoop-aws:2.7.3" 
Sys.setenv(AWS_ACCESS_KEY_ID="") 
Sys.setenv(AWS_SECRET_ACCESS_KEY="") # setting these blank ensures AWS uses the IAM roles associated with the cluster to define S3 permissions 

sc <- spark_connect(master='xxxxx', config=config) 
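
Once connected this way, the s3a:// read from the question should go through, with the instance's IAM role supplying the credentials:

usercsv_tbl <- spark_read_csv(sc, name = "temp", path = "s3a://<bucket>/filename.csv") 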

There is a good guide to this here.
