使用 MultipleOutputs设置多路径输出

chengjianxiaoxue

浏览: 1325235 次
性别:
来自: 北京

最近访客更多访客>>

liu_shui8

happy2012

nddht

yhtppp

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

hadoop1

工作中需要根据不同的值来设置不同的输出目录，有两个点需要注意：

1  在 run 方法中必须先用 MultipleOutputs.addNamedOutput 注册每个输出名，即它的参数 2（namedOutput）必须设置：
 MultipleOutputs.addNamedOutput(job, a, TextOutputFormat.class,
                    NullWritable.class, Text.class);


2 
 mos.write(keyStr, NullWritable.get(), valText, keyStr + "/");  // 第 1 个参数 namedOutput 必须与 run 方法中 addNamedOutput 的参数 2 保持一致，否则会报 Named output 'xxx' not defined 的错。

具体代码如下；mos.write 中 baseOutputPath 取不同值时对应的 HDFS 输出目录，见代码注释：

import com.alibaba.fastjson.JSON;
import com.mydb.bigdata.config.Config;
import com.mydb.bigdata.xetl.model.Constant;
import com.mydb.bigdata.xetl.utils.AutoActLogParseUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by pc on 2017/9/21.
    
./log/data_log_parse_to_hdfs/data_log_parse_to_hdfs.sh:66:hadoop jar ${com_jar_bigdata_common_mongo_batch_get} com.mydb.bigdata.xetl.mr.AutoActLogParseMr 
 /collect_data/userlog/20170902/*userlog*.log.gz    ---->  源头数据    args[0]
 /log_data/2018-04-03/        ----> 解析后的文件夹                     args[1]
 'stg_log_'     ------>  解析后的埋点表名                              args[2]
 /include/xetl.properties    ------>  hive源数据库的连接信息           args[3]
  * 
  *  create external table stg_log_1900039(...) PARTITIONED BY (day STRING)  ROW FORMAT DELIMITED  FIELDS TERMINATED BY '\001'  LINES TERMINATED BY '\n'   STORED AS TEXTFILE  LOCATION '/log_data/stg_log_1900039';
  * 
  */
public class AutoActLogParseMr extends Configured implements Tool {
    private static Logger logger = LoggerFactory.getLogger(AutoActLogParseMr.class);
    // public static String day="";

    public static String acts = "";

    /**
     * Command-line entry point: runs the job through {@link ToolRunner} and terminates the JVM
     * with the job's status code.
     *
     * @param args positional arguments, all four required:
     *             <ol>
     *             <li>input path (may be a glob)</li>
     *             <li>job output directory</li>
     *             <li>table-name prefix, e.g. {@code stg_log_} or {@code stg_log_class_perform_}</li>
     *             <li>path of the properties file, e.g. {@code xetl.properties}</li>
     *             </ol>
     */
    public static void main(String[] args) {
        // run() reads params[0] through params[3], so fewer than four arguments can never succeed.
        if (args.length < 4) {
            System.err.println("usage: AutoActLogParseMr <input> <output> <tablePrefix> <propertiesFile>");
            System.exit(2);
        }

        int exitCode;
        try {
            exitCode = ToolRunner.run(new Configuration(), new AutoActLogParseMr(), args);
        } catch (Exception e) {
            logger.error("AutoActLogParseMr failed", e);
            // A failed run must not look successful to the calling shell script.
            exitCode = 1;
        }
        System.exit(exitCode);
    }


    /**
     * Configures and submits the MapReduce job, then waits for it to finish.
     *
     * <p>Steps performed here:
     * <ol>
     * <li>list {@code /log_data/} and derive the act ids to parse from the table directories
     *     whose names start with the prefix in {@code params[2]};</li>
     * <li>publish those ids (key {@code fs.allActs}) and the MySQL connection settings read from
     *     the properties file to the job configuration, where the mapper reads them;</li>
     * <li>register one {@link MultipleOutputs} named output per act id.</li>
     * </ol>
     *
     * @param params {@code [0]} input path, {@code [1]} output directory, {@code [2]} table-name
     *               prefix, {@code [3]} properties file path
     * @return 0 when the job succeeds, 1 when it fails or no act id could be discovered
     * @throws IllegalArgumentException if fewer than four parameters are supplied
     * @throws Exception                on any failure while configuring or running the job
     */
    @Override
    public int run(String[] params) throws Exception {
        if (params == null || params.length < 4) {
            throw new IllegalArgumentException(
                    "expected 4 arguments: <input> <output> <tablePrefix> <propertiesFile>");
        }

        Configuration conf = getConf();
        // NOTE(review): only the codec is chosen here; output compression itself is not switched on
        // in this method - presumably it is enabled by cluster/site configuration. Confirm.
        conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        // Discover the act ids (joined with Constant.MARK_AITE) from the table directories.
        Path tableRoot = new Path("/log_data/");
        String tablePrefix = params[2];
        try {
            FileSystem hdfs = tableRoot.getFileSystem(conf);
            acts = getOutPutName(hdfs, tableRoot, conf, tablePrefix);
        } catch (Exception e) {
            logger.error("cannot discover table directories under " + tableRoot, e);
            return 1;
        }
        // Without at least one act id there is no named output to register; stop with a clear
        // message instead of failing later inside MultipleOutputs.addNamedOutput("").
        if (StringUtils.isBlank(acts)) {
            logger.error("no table directory matching prefix '{}' found under {}", tablePrefix, tableRoot);
            return 1;
        }
        conf.set("fs.allActs", acts);

        // Forward the database settings to the tasks through the job configuration.
        // NOTE(review): this places mysqlPassword in the job configuration in plain text.
        Config propertiesConfig = new Config();
        propertiesConfig.init(params[3]);
        for (String property : new String[] {"mysqlUser", "mysqlUrl", "mysqlPassword", "dbname"}) {
            conf.set(property, propertiesConfig.getValue(property));
        }

        // The configuration must be complete before this point: the job is created from it.
        Job job = Job.getInstance(conf);
        job.setJarByClass(AutoActLogParseMr.class);
        job.setMapperClass(AutoActLogParseMr.AutoActLogParseMaper.class);
        job.setReducerClass(AutoActLogParseMr.AutoActLogParseReducer.class);

        FileInputFormat.setInputPaths(job, new Path(params[0]));
        FileOutputFormat.setOutputPath(job, new Path(params[1]));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(3);

        // Every name later passed to mos.write(namedOutput, ...) in the reducer has to be
        // registered here first.
        for (String actId : acts.split(Constant.MARK_AITE)) {
            MultipleOutputs.addNamedOutput(job, actId, TextOutputFormat.class,
                    NullWritable.class, Text.class);
        }
        logger.info("---excuter---");

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static class AutoActLogParseMaper extends Mapper<LongWritable, Text, Text, Text> {

        public static Map<String, List> actMap = new HashMap();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            try {
                String mysqlUser = context.getConfiguration().get("mysqlUser");
                String mysqlUrl = context.getConfiguration().get("mysqlUrl");
                String mysqlPassword = context.getConfiguration().get("mysqlPassword");
                String dbname = context.getConfiguration().get("dbname");



                String string = context.getConfiguration().get("fs.allActs");  // 1900039@1900038  要解析的actid
                actMap = AutoActLogParseUtil.getHiveStaticConf(string,mysqlUrl,mysqlUser,mysqlPassword,dbname); // actids,mysql链接信息  得到  <actid, actid所在的表的所有列>


            } catch (SQLException e) {
                e.printStackTrace();
            }
            super.setup(context);
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {

        }

        protected void map(LongWritable key, Text val, Context context) {
            try {
                String jsonStr = val.toString();
                if (StringUtils.isEmpty(jsonStr)) {
                    return;
                }
                /**
                 * 要解析的一行内容：
                 * 2017-09-02 00:00:17.017 172.16.0.8 openApi -- {"actId":"1712012","classId":"f17ee6fd1fcb4f1fb3e1688f5cd364bd","className":"九年级(1)班","countyId":"445381","countyName":"罗定市","gradeId":"9","originCode":"4","role":"STUDENT","schoolId"
:"18738","schoolName":"罗定第二中学","time":1504281617255,"userIcon":"d355db9797f854e1805e653b95555001","userId":"37628410","userIp":"113.101.248.138","userName":"陈嘉怡","data":{"subjectName":"数学","subjectId":"020","rankChallengeTime
":"201709","rankScope":"2"}}
                 */
                if (jsonStr.indexOf(Constant.SPLIT_MAKER) == -1) {  //  -- 
                    logger.info("--split error--" + jsonStr);
                    return;
                }
                //AutoActLogParseUtil autoActLogParseUtil = new AutoActLogParseUtil();
                jsonStr = jsonStr.split(Constant.SPLIT_MAKER,2)[1]; // 得到要解析的字符串

                jsonStr = jsonStr.replaceAll("\\\\\\\\n", " ");
                jsonStr = jsonStr.replaceAll("\\\\n", " ");
                jsonStr = jsonStr.replaceAll("\n", " ");
                jsonStr = jsonStr.replaceAll("\\\\\\\\r", " ");
                jsonStr = jsonStr.replaceAll("\\\\r", " ");
                jsonStr = jsonStr.replaceAll("\r", " ");


/**
{
    "actId":"1712012",
    "classId":"f17ee6fd1fcb4f1fb3e1688f5cd364bd",
    "className":"九年级(1)班",
    "countyId":"445381",
    "countyName":"罗定市",
    "gradeId":"9",
    "originCode":"4",
    "role":"STUDENT",
    "schoolId":"18738",
    "schoolName":"罗定第二中学",
    "time":1504281617255,
    "userIcon":"d355db9797f854e1805e653b95555001",
    "userId":"37628410",
    "userIp":"113.101.248.138",
    "userName":"陈嘉怡",
    "data":{
        "subjectName":"数学",
        "subjectId":"020",
        "rankScope":"2",
        "rankChallengeTime":"201709"
    }
}
 */

                // logger.info("解析数据为：" + jsonStr);

                JSONObject jsonObject = null;
                try {
                    jsonObject = new JSONObject(jsonStr);
                } catch (Exception e) {
                    logger.info("失败，无效的json格式；解析数据为：" + jsonStr);
                    return;

                }
                String actId ="";
                if(jsonObject.has("actId")) {
                    actId = jsonObject.get("actId").toString();
                    if (actId.length() < 1) {
                        logger.info("失败，没有找到actID，解析数据为：" + jsonStr);
                        return;
                    }
                }
                else
                    {
                        logger.info("失败，打点数据没有actid key，解析数据为：" + jsonStr);
                    }


              //  logger.info("开始解析，解析数据为：" + jsonStr);
                Object jsonObj = JSON.parse(jsonStr);

                String sbff = AutoActLogParseUtil.jsonParse(jsonObj, actId, actMap); // 行数据 ，  里面代码写的太乱 实在看不懂 ....    预先建表的列顺序和json的行的列的存储顺序一致

                if (StringUtils.isBlank(sbff)) {
                    logger.info("解析失败，解析数据为：" + jsonStr);
                    return;
                }
                Text outKey = new Text();
                Text outValue = new Text();

                outKey.set(actId);
                outValue.set(sbff);
                context.write(outKey, outValue);
            } catch (IOException e) {
                logger.error("IO错误", e);
            } catch (JSONException e) {
                logger.error("JSON格式不对", e);

                // e.printStackTrace();
            } catch (InterruptedException e) {
                logger.error("JSON格式不对", e);
            }
        }


    }

    /**
     * Reducer that writes every parsed row into a per-act-id sub-directory of the job output
     * directory, using {@link MultipleOutputs}. Nothing is written to the regular reducer output.
     */
    public static class AutoActLogParseReducer extends Reducer<Text, Text, NullWritable, Text> {

        /** Key/value types match the named outputs registered by the driver. */
        private MultipleOutputs<NullWritable, Text> mos;

        // Reused for every row written.
        private final Text outValue = new Text();

        @Override
        protected void setup(Reducer<Text, Text, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            mos = new MultipleOutputs<NullWritable, Text>(context);
        }

        @Override
        protected void cleanup(
                Reducer<Text, Text, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // Closing MultipleOutputs is what closes the files opened through mos.write.
            mos.close();
        }

        /**
         * Splits each incoming value on {@code Constant.MARK_LINE} and writes every resulting row
         * to the named output of the current act id.
         *
         * <p>How {@code baseOutputPath} shapes the file name, as observed by the original author
         * for act id {@code 1100001} and output directory {@code /log_data/2017-10-27/}:
         * <ul>
         * <li>{@code actId + "hello"}  gives {@code /log_data/2017-10-27/1100001hello-r-00001}</li>
         * <li>{@code actId + "/hello"} gives {@code /log_data/2017-10-27/1100001/hello-r-00001}</li>
         * <li>{@code actId + "/"}      gives {@code /log_data/2017-10-27/1100001/-r-00001}, which
         *     is the layout wanted here</li>
         * </ul>
         *
         * @param key     the act id; it is also the named output, so it must be one of the names
         *                registered with {@code MultipleOutputs.addNamedOutput} in the driver,
         *                otherwise the write fails with "Named output 'xxx' not defined"
         * @param values  parsed rows, several of which may be packed into one value
         * @param context reducer context (output goes through {@link MultipleOutputs} instead)
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String actId = key.toString();
            String baseOutputPath = actId + "/";

            for (Text packedRows : values) {
                for (String row : packedRows.toString().split(Constant.MARK_LINE)) {
                    outValue.set(row);
                    // Per-record diagnostics belong at debug level, not on the task's stdout.
                    if (logger.isDebugEnabled()) {
                        logger.debug("keyStr=" + actId + "valText" + outValue);
                    }
                    mos.write(actId, NullWritable.get(), outValue, baseOutputPath);
                }
            }
        }

    }



    /**
     * Derives the act ids to parse from the table directories that exist under {@code path}.
     *
     * <p>Each new tracking point gets its own external Hive table whose location is a directory
     * named {@code <tableType><7 digits>}, for example
     * {@code LOCATION '/log_data/stg_log_1900039'}. This method lists the direct children of
     * {@code path}, keeps those whose name is exactly the prefix followed by seven digits, and
     * joins the digit parts with {@code Constant.MARK_AITE}, e.g. {@code 1900039@1900038}.
     *
     * @param hdfs      file system to list; may be {@code null}
     * @param path      directory whose children are inspected, e.g. {@code /log_data/};
     *                  may be {@code null}
     * @param conf      unused; kept so existing callers keep compiling
     * @param tableType table-name prefix such as {@code stg_log_}; it is matched literally
     * @return the joined act ids, or an empty string when an argument is {@code null}, nothing
     *         matches, or the directory cannot be listed (the failure is logged)
     */
    public static String getOutPutName(FileSystem hdfs, Path path, Configuration conf, String tableType) {
        if (hdfs == null || path == null || tableType == null) {
            return "";
        }

        // Compiled once per call; the prefix is quoted so it can never act as regex syntax, and
        // the capture group yields the act id directly.
        Pattern tableDirName = Pattern.compile(Pattern.quote(tableType) + "(\\d{7})");

        StringBuilder actIds = new StringBuilder();
        try {
            FileStatus[] children = hdfs.listStatus(path);
            if (children == null) {
                return "";
            }
            for (FileStatus child : children) {
                Matcher matcher = tableDirName.matcher(child.getPath().getName());
                if (!matcher.matches()) {
                    continue;
                }
                if (actIds.length() > 0) {
                    actIds.append(Constant.MARK_AITE);
                }
                actIds.append(matcher.group(1));
            }
        } catch (IOException e) {
            logger.error("failed to list " + path, e);
        }
        return actIds.toString();
    }
}
}

分享到：

mysql 常用命令总结备份 | 自定义DBInputFormat,抽取mysql表存储在分 ...

2018-08-29 18:15
浏览 893
评论(0)
分类:互联网
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用 MultipleOutputs设置多路径输出

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用 MultipleOutputs设置多路径输出

评论

发表评论

相关推荐

hadoop fs命令详解链接

自定义DBInputFormat,抽取mysql表存储在分库中

64位linux系统编译hadoop源码 native库

YARN内存使用优化配置

mapreduce-chain TODO

hadoop mr实现单表列转行--mr system.out数据位置

google mapreduce杂谈

gfs杂谈

hadoop调度算法

hadoop优化0

hadoop压缩

hadoop博客整理

hadoop fsimage edits关系

hadoop合并小文件的一些说说

ma-hadoop脚本命令 hadoop-hadoop dfs-hdfs dfs区别

ma-hadoop1 集群内存设置

ma-hadoop集群-配置文件-进程地址端口-和hive hbase关系

ma-大数据HDFS

ma-大数据mapreduce思想和数据切割

ma-云计算 大数据 mapreduce概念和关系

最近访客更多访客>>

ma-云计算大数据 mapreduce概念和关系