最近在只用 Hadoop 自带的 DBOutputFormat 时总是内存溢出,问题出现在写入数据库时,原因是由于 mapreduce 的结果集非常大,而 DBOutputFormat 在写入数据库时使用了 batch 方式,在最后 close 时做的 executeBatch,最终导致内存溢出。
因此写了一个自定义的 OutputFormat,记录一下,主要是备忘
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
* @author Bruce.W
* @date 2021/7/28 9:45 上午
*/
@Slf4j
public class PostgreOutputFormat extends OutputFormat<PostgreDBWritable, NullWritable> {
private static final String tableName = "tb";
private static final String[] fieldNames = {"id", "segment", "count", "date", "source_id"};
private FileOutputCommitter committer = null;
public PostgreOutputFormat() {
log.info("construct postgreOutputFormat object");
}
@Override
public RecordWriter<PostgreDBWritable, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
DBConfiguration dbConf = new DBConfiguration(context.getConfiguration());
try {
Connection connection = dbConf.getConnection();
PreparedStatement statement = null;
statement = connection.prepareStatement(constructInsert(tableName, fieldNames));
return new PostgreRecordWriter(connection, statement);
} catch (Exception ex) {
throw new IOException(ex.getMessage());
}
}
/**
* 构造一个用于插入数据的 SQL 语句
*
* @param table
* 要插入的表
* @param fieldNames
* 要插入的字段。 如果字段名称未知,则提供一个空数组.
*/
public String constructInsert(String table, String[] fieldNames) {
if(fieldNames == null) {
throw new IllegalArgumentException("Field names may not be null");
}
StringBuilder query = new StringBuilder();
query.append("INSERT INTO ").append(table);
if (fieldNames.length > 0 && fieldNames[0] != null) {
query.append(" (");
for (int i = 0; i < fieldNames.length; i++) {
query.append(fieldNames[i]);
if (i != fieldNames.length - 1) {
query.append(",");
}
}
query.append(")");
}
query.append(" VALUES (");
for (int i = 0; i < fieldNames.length; i++) {
query.append("?");
if(i != fieldNames.length - 1) {
query.append(",");
}
}
query.append(");");
return query.toString();
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
if (committer == null) {
Path output = getOutputPath(context);
committer = new FileOutputCommitter(output, context);
}
return committer;
}
public static Path getOutputPath(JobContext job) {
String name = job.getConfiguration().get(FileOutputFormat.OUTDIR);
return name == null ? null : new Path(name);
}
class PostgreRecordWriter extends RecordWriter<PostgreDBWritable, NullWritable> {
private int batchSize = 10000;
private int batchCount = 0;
private Connection connection;
private PreparedStatement statement;
public PostgreRecordWriter(Connection connection
, PreparedStatement statement) throws SQLException {
this.connection = connection;
this.statement = statement;
this.connection.setAutoCommit(false);
}
@Override
public void write(PostgreDBWritable key, NullWritable value) throws IOException, InterruptedException {
try {
key.write(this.statement);
this.statement.addBatch();
this.batchCount++;
} catch (SQLException e) {
e.printStackTrace();
log.warn("add batch error ", e);
}
// 按照 batch size 执行 SQL,而不是最后一次性执行
// 保证不会因为 batch 过大而导致 out of memory
if (this.batchCount == this.batchSize) {
reset();
try {
log.info("execute batch............");
this.statement.executeBatch();
this.connection.commit();
// 提交成功后,清楚之前的内容,后续重新增加
this.statement.clearBatch();
} catch (SQLException e) {
try {
this.connection.rollback();
}
catch (SQLException ex) {
log.warn(StringUtils.stringifyException(ex));
}
throw new IOException(e.getMessage());
}
}
}
@Override
public void close(TaskAttemptContext context) throws IOException {
try {
statement.executeBatch();
connection.commit();
} catch (SQLException e) {
try {
connection.rollback();
}
catch (SQLException ex) {
log.warn(StringUtils.stringifyException(ex));
}
throw new IOException(e.getMessage());
} finally {
try {
statement.close();
connection.close();
}
catch (SQLException ex) {
throw new IOException(ex.getMessage());
}
}
}
private void reset() {
this.batchCount = 0;
}
}
}
说明一下,PostgreDBWritable 和 NullWritable 是 reduce 的 outkey 和 out value,实际中可以中 Java 的泛型 K,V 代替。
上面代码主要是在 write 中增加 batchSize 和 batchCount,用来当 addBatch 达到 batchSize 时提交数据;未被 batchSize 覆盖的部分,最后会在 close 时统一提交。
其实上面的代码还是有一些可能存在的 bug,但考虑到几率较小,进一步优化的性价比不高。


710

被折叠的 条评论
为什么被折叠?



