
Answer


Assuming you already have Druid and YARN/MapReduce set up, you can launch an index_hadoop task that does what you're asking.

There is a Druid ORC extension that allows reading ORC files. I don't think it ships with the standard distribution, so you will have to get it in somehow (we built it from source).

(Extension list: http://druid.io/docs/latest/development/extensions.html)
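
Once built, the extension is loaded like any other. A minimal sketch, assuming the build produces the standard druid-orc-extensions directory under extensions/ (adjust to whatever your build actually outputs); this goes in common.runtime.properties on the nodes involved in indexing:

druid.extensions.loadList=["druid-orc-extensions"]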


Here is an example that ingests a bunch of ORC files and appends an interval to the datasource. POST it to the overlord over HTTP at http://overlord:8090/druid/indexer/v1/task.

(Docs: http://druid.io/docs/latest/ingestion/batch-ingestion.html)

You may have to adjust it depending on your distribution; I remember we had class-not-found problems on Hortonworks (classpathPrefix will help adjust the MapReduce classpath).
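
As an illustration (a hedged sketch; the HDP path below is only an example, check where your distribution puts the Hadoop client jars), classpathPrefix goes at the top level of the task, next to "type":

{
  "type": "index_hadoop",
  "classpathPrefix": "/usr/hdp/current/hadoop-client/*",
  "spec": { ... }
}

The full task spec we submitted: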

{
  "type": "index_hadoop",
  "spec": {
    "ioConfig": {
      "type": "hadoop",
      "inputSpec": {
        "type": "granularity",
        "inputFormat": "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat",
        "dataGranularity": "hour",
        "inputPath": "/apps/hive/warehouse/table1",
        "filePattern": ".*",
        "pathFormat": "'partition='yyyy-MM-dd'T'HH"
      }
    },
    "dataSchema": {
      "dataSource": "cube_indexed_from_orc",
      "parser": {
        "type": "orc",
        "parseSpec": {
          "format": "timeAndDims",
          "timestampSpec": {
            "column": "timestamp",
            "format": "nano"
          },
          "dimensionsSpec": {
            "dimensions": ["cola", "colb", "colc"],
            "dimensionExclusions": [],
            "spatialDimensions": []
          }
        },
        "typeString": "struct<timestamp:bigint,cola:bigint,colb:string,colc:string,cold:bigint>"
      },
      "metricsSpec": [{
        "type": "count",
        "name": "count"
      }],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "DAY",
        "queryGranularity": "HOUR",
        "intervals": ["2017-06-14T00:00:00.000Z/2017-06-15T00:00:00.000Z"]
      }
    },
    "tuningConfig": {
      "type": "hadoop",
      "partitionsSpec": {
        "type": "hashed",
        "targetPartitionSize": 5000000
      },
      "leaveIntermediate": false,
      "forceExtendableShardSpecs": true
    }
  }
}
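
For completeness, a minimal sketch of submitting the spec above to the overlord (assuming it is saved as task.json, the overlord answers at overlord:8090 as above, and Python with the requests library is available; any HTTP client works):

import json
import requests

OVERLORD = "http://overlord:8090"  # replace with your overlord host

# Load the index_hadoop spec shown above.
with open("task.json") as f:
    spec = json.load(f)

# Submit the task; the overlord responds with {"task": "<task id>"}.
resp = requests.post(OVERLORD + "/druid/indexer/v1/task", json=spec)
resp.raise_for_status()
task_id = resp.json()["task"]

# Poll the task status endpoint to follow the ingestion.
status = requests.get(OVERLORD + "/druid/indexer/v1/task/" + task_id + "/status")
print(status.json())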

Thanks a lot for your feedback, Pierre – user2359902
