Hi, I have a MapReduce application that bulk loads data into HBase. I have 142 text files, about 200 GB in total. My mappers finish within 5 minutes and all of my reducers reach 100%, but the last one stays stuck; it has been running for the past 24 hours. I have a single column family. My row keys look like this:
48433197315 | 1972-03-31T00:00:00Z | 4
48433197315 | 1972-03-31T00:00:00Z | 38
48433197315 | 1972-03-31T00:00:00Z | 41
48433197315 | 1972-03-31T00:00:00Z | 23
48433197315 | 1972-03-31T00:00:00Z | 7
48433336118 | 1972-03-31T00:00:00Z | 17
48433197319 | 1972-03-31T00:00:00Z | 64
48433197319 | 1972-03-31T00:00:00Z | 58
48433197319 | 1972-03-31T00:00:00Z | 61
48433197319 | 1972-03-31T00:00:00Z | 73
48433197319 | 1972-03-31T00:00:00Z | 97
48433336119 | 1972-03-31T00:00:00Z | 7
I created the table like this:
private static Configuration getHbaseConfiguration() {
try {
if (hbaseConf == null) {
System.out.println(
"UserId= " + USERID + " \t keytab file =" + KEYTAB_FILE + " \t conf =" + KRB5_CONF_FILE);
hbaseConf = HBaseConfiguration.create();
hbaseConf.set("mapreduce.job.queuename", "root.fricadev");
hbaseConf.set("mapreduce.child.java.opts", "-Xmx6553m");
hbaseConf.set("mapreduce.map.memory.mb", "8192");
hbaseConf.setInt(MAX_FILES_PER_REGION_PER_FAMILY, 1024);
System.setProperty("java.security.krb5.conf", KRB5_CONF_FILE);
UserGroupInformation.loginUserFromKeytab(USERID, KEYTAB_FILE);
}
} catch (Exception e) {
e.printStackTrace();
}
return hbaseConf;
}
/**
* HBase bulk import example Data preparation MapReduce job driver
*
* args[0]: HBase table name
* args[1]: HDFS input path
* args[2]: HDFS output path
*
* @throws Exception
*
*/
public static void main(String[] args) throws Exception {
if (hbaseConf == null)
hbaseConf = getHbaseConfiguration();
String outputPath = args[2];
hbaseConf.set("data.seperator", DATA_SEPERATOR);
hbaseConf.set("hbase.table.name", args[0]);
hbaseConf.setInt(MAX_FILES_PER_REGION_PER_FAMILY, 1024);
Job job = new Job(hbaseConf);
job.setJarByClass(HBaseBulkLoadDriver.class);
job.setJobName("Bulk Loading HBase Table::" + args[0]);
job.setInputFormatClass(TextInputFormat.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapperClass(HBaseBulkLoadMapperUnzipped.class);
// job.getConfiguration().set("mapreduce.job.acl-view-job",
// "bigdata-app-fricadev-sdw-u6034690");
if (HbaseBulkLoadMapperConstants.FUNDAMENTAL_ANALYTIC.equals(args[0])) {
HTableDescriptor descriptor = new HTableDescriptor(Bytes.toBytes(args[0]));
descriptor.addFamily(new HColumnDescriptor(COLUMN_FAMILY));
HBaseAdmin admin = new HBaseAdmin(hbaseConf);
byte[] startKey = new byte[16];
Arrays.fill(startKey, (byte) 0);
byte[] endKey = new byte[16];
Arrays.fill(endKey, (byte) 255);
admin.createTable(descriptor, startKey, endKey, REGIONS_COUNT);
admin.close();
// HColumnDescriptor hcd = new
// HColumnDescriptor(COLUMN_FAMILY).setMaxVersions(1);
// createPreSplitLoadTestTable(hbaseConf, descriptor, hcd);
}
job.getConfiguration().setBoolean("mapreduce.compress.map.output", true);
job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", true);
job.getConfiguration().setClass("mapreduce.map.output.compression.codec",
org.apache.hadoop.io.compress.GzipCodec.class, org.apache.hadoop.io.compress.CompressionCodec.class);
job.getConfiguration().set("hfile.compression", Compression.Algorithm.LZO.getName());
// Connection connection =
// ConnectionFactory.createConnection(hbaseConf);
// Table table = connection.getTable(TableName.valueOf(args[0]));
FileInputFormat.setInputPaths(job, args[1]);
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.setMapOutputValueClass(Put.class);
HFileOutputFormat.configureIncrementalLoad(job, new HTable(hbaseConf, args[0]));
boolean success = job.waitForCompletion(true);
if (success) {
    System.out.println("job is successful..........");
    // LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hbaseConf);
    // loader.doBulkLoad(new Path(outputPath), (HTable) table);
    HBaseBulkLoad.doBulkLoad(outputPath, args[0]);
}
System.exit(success ? 0 : -1);
}
/**
* Enum of counters.
* Used to collect statistics.
*/
public static enum Counters {
/**
* Counts data format errors.
*/
WRONG_DATA_FORMAT_COUNTER
}
}
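
Note: HBaseBulkLoad.doBulkLoad at the end of main() is a custom helper that the post does not show. Assuming it simply wraps HBase's LoadIncrementalHFiles, as the commented-out loader lines suggest, a minimal sketch might look like this (the class and method names come from the post; the body is an assumption):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class HBaseBulkLoad {
    // Hypothetical wrapper: hands the HFiles produced by HFileOutputFormat
    // over to the region servers of the target table.
    public static void doBulkLoad(String pathToHFiles, String tableName) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, tableName);
        try {
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(new Path(pathToHFiles), table);
        } finally {
            table.close();
        }
    }
}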
My code has only a mapper and no reducer of my own (the reduce phase comes from HFileOutputFormat.configureIncrementalLoad, which installs HBase's own sorting reducer and a TotalOrderPartitioner). My mapper code looks like this:
public class FundamentalAnalyticLoader implements TableLoader {
private ImmutableBytesWritable hbaseTableName;
private Text value;
private Mapper<LongWritable, Text, ImmutableBytesWritable, Put>.Context context;
private String strFileLocationAndDate;
@SuppressWarnings("unchecked")
public FundamentalAnalyticLoader(ImmutableBytesWritable hbaseTableName, Text value, Context context,
String strFileLocationAndDate) {
//System.out.println("Constructing Fundalmental Analytic Load");
this.hbaseTableName = hbaseTableName;
this.value = value;
this.context = context;
this.strFileLocationAndDate = strFileLocationAndDate;
}
@SuppressWarnings("deprecation")
public void load() {
if (!HbaseBulkLoadMapperConstants.FF_ACTION.contains(value.toString())) {
String[] values = value.toString().split(HbaseBulkLoadMapperConstants.DATA_SEPERATOR);
String[] strArrFileLocationAndDate = strFileLocationAndDate
.split(HbaseBulkLoadMapperConstants.FIELD_SEPERATOR);
if (17 == values.length) {
String strKey = values[5].trim() + "|" + values[0].trim() + "|" + values[3].trim() + "|"
+ values[4].trim() + "|" + values[14].trim() + "|" + strArrFileLocationAndDate[0].trim() + "|"
+ strArrFileLocationAndDate[2].trim();
//String strRowKey=StringUtils.leftPad(Integer.toString(Math.abs(strKey.hashCode() % 470)), 3, "0") + "|" + strKey;
byte[] hashedRowKey = HbaseBulkImportUtil.getHash(strKey);
Put put = new Put((hashedRowKey));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FUNDAMENTAL_SERIES_ID),
Bytes.toBytes(values[0].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FUNDAMENTAL_SERIES_ID_OBJECT_TYPE_ID),
Bytes.toBytes(values[1].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FUNDAMENTAL_SERIES_ID_OBJECT_TYPE),
Bytes.toBytes(values[2]));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FINANCIAL_PERIOD_END_DATE),
Bytes.toBytes(values[3].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FINANCIAL_PERIOD_TYPE),
Bytes.toBytes(values[4].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.LINE_ITEM_ID), Bytes.toBytes(values[5].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_ITEM_INSTANCE_KEY),
Bytes.toBytes(values[6].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_VALUE), Bytes.toBytes(values[7].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_CONCEPT_CODE),
Bytes.toBytes(values[8].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_VALUE_CURRENCY_ID),
Bytes.toBytes(values[9].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_IS_ESTIMATED),
Bytes.toBytes(values[10].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_AUDITABILITY_EQUATION),
Bytes.toBytes(values[11].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FINANCIAL_PERIOD_TYPE_ID),
Bytes.toBytes(values[12].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_CONCEPT_ID),
Bytes.toBytes(values[13].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.ANALYTIC_LINE_ITEM_IS_YEAR_TO_DATE),
Bytes.toBytes(values[14].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.IS_ANNUAL), Bytes.toBytes(values[15].trim()));
// put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
// Bytes.toBytes(HbaseBulkLoadMapperConstants.TAXONOMY_ID),
// Bytes.toBytes(values[16].trim()));
//
// put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
// Bytes.toBytes(HbaseBulkLoadMapperConstants.INSTRUMENT_ID),
// Bytes.toBytes(values[17].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FF_ACTION),
Bytes.toBytes(values[16].substring(0, values[16].length() - 3)));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FILE_PARTITION),
Bytes.toBytes(strArrFileLocationAndDate[0].trim()));
put.add(Bytes.toBytes(HbaseBulkLoadMapperConstants.COLUMN_FAMILY),
Bytes.toBytes(HbaseBulkLoadMapperConstants.FILE_PARTITION_DATE),
Bytes.toBytes(strArrFileLocationAndDate[2].trim()));
try {
context.write(hbaseTableName, put);
} catch (IOException e) {
context.getCounter(Counters.WRONG_DATA_FORMAT_COUNTER).increment(1);
} catch (InterruptedException e) {
    // restore the interrupt flag rather than swallowing it
    Thread.currentThread().interrupt();
    e.printStackTrace();
}
} else {
System.out.println("Values length is less 15 and value is " + value.toString());
}
}
}
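
Note: HbaseBulkImportUtil.getHash is also not shown. Since the driver pre-splits a 16-byte key space via the startKey/endKey arrays, one plausible implementation is an MD5 digest of the composite key; the sketch below is that assumption, not the poster's actual helper:

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public final class HbaseBulkImportUtil {
    // Assumed implementation: MD5 yields exactly 16 bytes, matching the
    // 16-byte startKey/endKey used when the table is pre-split.
    public static byte[] getHash(String key) {
        try {
            return MessageDigest.getInstance("MD5")
                    .digest(key.getBytes(StandardCharsets.UTF_8));
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 is a required JDK algorithm", e);
        }
    }
}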
I can't decide how to provide splitKeys in createTable(). Could you give an example? – SUDARSHAN
@SUDARSHAN See the edit to my answer. – gudok
Thanks for your effort. I changed it exactly the way you explained line by line, but the same thing still happens; I killed the job after 3 hours. My code works fine on smaller datasets: I tried a 16 GB dataset, and I am going to try 50 GB next, but on the full dataset it behaves the same. – SUDARSHAN
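
Note: gudok's edited answer is not reproduced here, but for reference, the splitKeys variant being discussed is the HBaseAdmin.createTable(HTableDescriptor, byte[][]) overload, which takes explicit region boundaries. A minimal sketch, assuming uniformly hashed row keys and REGIONS_COUNT <= 256; the boundaries are illustrative only:

// REGIONS_COUNT regions need REGIONS_COUNT - 1 internal boundaries.
byte[][] splitKeys = new byte[REGIONS_COUNT - 1][];
for (int i = 1; i < REGIONS_COUNT; i++) {
    // a single leading byte is a valid boundary for lexicographically
    // ordered row keys; HBase compares key bytes as unsigned
    splitKeys[i - 1] = new byte[] { (byte) (i * 256 / REGIONS_COUNT) };
}
admin.createTable(descriptor, splitKeys);

With keys that are already uniformly hashed (for example MD5), boundaries like these spread rows evenly across regions, and configureIncrementalLoad's TotalOrderPartitioner then spreads them evenly across reducers.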