HBase Mapreduce 自定义 DBOutputFormat

最新推荐文章于 2022-12-14 11:21:33 发布

原创最新推荐文章于 2022-12-14 11:21:33 发布 · 205 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#xcode #application #ios

HBase 同时被 2 个专栏收录

1 篇文章

订阅专栏

大数据

1 篇文章

订阅专栏

最近在只用 Hadoop 自带的 DBOutputFormat 时总是内存溢出，问题出现在写入数据库时，原因是由于 mapreduce 的结果集非常大，而 DBOutputFormat 在写入数据库时使用了 batch 方式，在最后 close 时做的 executeBatch，最终导致内存溢出。

因此写了一个自定义的 OutputFormat，记录一下，主要是备忘

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * @author Bruce.W
 * @date 2021/7/28 9:45 上午
 */
@Slf4j
public class PostgreOutputFormat extends OutputFormat<PostgreDBWritable, NullWritable> {

    private static final String tableName = "tb";
    private static final String[] fieldNames = {"id", "segment", "count", "date", "source_id"};

    private FileOutputCommitter committer = null;

    public PostgreOutputFormat() {
        log.info("construct postgreOutputFormat object");
    }

    @Override
    public RecordWriter<PostgreDBWritable, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

        DBConfiguration dbConf = new DBConfiguration(context.getConfiguration());

        try {
            Connection connection = dbConf.getConnection();
            PreparedStatement statement = null;

            statement = connection.prepareStatement(constructInsert(tableName, fieldNames));
            return new PostgreRecordWriter(connection, statement);
        } catch (Exception ex) {
            throw new IOException(ex.getMessage());
        }
    }

    /**
     * 构造一个用于插入数据的 SQL 语句
     *
     * @param table
     *          要插入的表
     * @param fieldNames
     *          要插入的字段。 如果字段名称未知，则提供一个空数组.
     */
    public String constructInsert(String table, String[] fieldNames) {
        if(fieldNames == null) {
            throw new IllegalArgumentException("Field names may not be null");
        }

        StringBuilder query = new StringBuilder();
        query.append("INSERT INTO ").append(table);

        if (fieldNames.length > 0 && fieldNames[0] != null) {
            query.append(" (");
            for (int i = 0; i < fieldNames.length; i++) {
                query.append(fieldNames[i]);
                if (i != fieldNames.length - 1) {
                    query.append(",");
                }
            }
            query.append(")");
        }
        query.append(" VALUES (");

        for (int i = 0; i < fieldNames.length; i++) {
            query.append("?");
            if(i != fieldNames.length - 1) {
                query.append(",");
            }
        }

        query.append(");");

        return query.toString();
    }

    @Override
    public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {}

    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
        if (committer == null) {
            Path output = getOutputPath(context);
            committer = new FileOutputCommitter(output, context);
        }
        return committer;
    }

    public static Path getOutputPath(JobContext job) {
        String name = job.getConfiguration().get(FileOutputFormat.OUTDIR);
        return name == null ? null : new Path(name);
    }


    class PostgreRecordWriter extends RecordWriter<PostgreDBWritable, NullWritable> {

        private int batchSize = 10000;
        private int batchCount = 0;
        private Connection connection;
        private PreparedStatement statement;

        public PostgreRecordWriter(Connection connection
                , PreparedStatement statement) throws SQLException {
            this.connection = connection;
            this.statement = statement;
            this.connection.setAutoCommit(false);
        }

        @Override
        public void write(PostgreDBWritable key, NullWritable value) throws IOException, InterruptedException {
            try {
                key.write(this.statement);
                this.statement.addBatch();
                this.batchCount++;
            } catch (SQLException e) {
                e.printStackTrace();
                log.warn("add batch error ", e);
            }

            // 按照 batch size 执行 SQL，而不是最后一次性执行
            // 保证不会因为 batch 过大而导致 out of memory
            if (this.batchCount == this.batchSize) {
                reset();

                try {
                    log.info("execute batch............");
                    this.statement.executeBatch();
                    this.connection.commit();
                    // 提交成功后，清楚之前的内容，后续重新增加
                    this.statement.clearBatch();
                } catch (SQLException e) {
                    try {
                        this.connection.rollback();
                    }
                    catch (SQLException ex) {
                        log.warn(StringUtils.stringifyException(ex));
                    }
                    throw new IOException(e.getMessage());
                }
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException {
            try {
                statement.executeBatch();
                connection.commit();
            } catch (SQLException e) {
                try {
                    connection.rollback();
                }
                catch (SQLException ex) {
                    log.warn(StringUtils.stringifyException(ex));
                }
                throw new IOException(e.getMessage());
            } finally {
                try {
                    statement.close();
                    connection.close();
                }
                catch (SQLException ex) {
                    throw new IOException(ex.getMessage());
                }
            }
        }

        private void reset() {
            this.batchCount = 0;
        }
    }
}

说明一下，PostgreDBWritable 和 NullWritable 是 reduce 的 outkey 和 out value，实际中可以中 Java 的泛型 K，V 代替。

上面代码主要是在 write 中增加 batchSize 和 batchCount，用来当 addBatch 达到 batchSize 时提交数据；未被 batchSize 覆盖的部分，最后会在 close 时统一提交。

其实上面的代码还是有一些可能存在的 bug，但考虑到几率较小，进一步优化的性价比不高。