`

读取 Hive 中一张表的所有字段,并对每个字段分别做分组计数(GROUP BY + count)

    博客分类:
  • Hive
 
阅读更多

脚本的第一个命令行参数是数据库名。

脚本的第二个命令行参数是表名。

 

 

#!/bin/python 

#[START] DESCRIPTION
# For every column of one Hive table, run a "GROUP BY <column>" / count(1)
# query through the Hive CLI and print the grouped counts.
#@param the first argument is the name of database
#@param the second argument is the name of table
#[END] DESCRIPTION

#[START]conf
# Path of the Hive CLI executable; run_hive_cmd() invokes it with "-e".
hive_bin = "/home/work/lib/hive_udw2/bin/hive"
#[END]conf

import sys
import subprocess
# Validate the command line: both <database> and <table> are required.
arglen = len(sys.argv)
if arglen < 3:
    # Report on stderr so the message does not mix with query output, and
    # use sys.exit(): the bare exit() helper is only injected by the `site`
    # module and is not guaranteed to exist (e.g. under `python -S`).
    sys.stderr.write("too less arguments\n")
    sys.stderr.write("usage: %s <database> <table>\n" % sys.argv[0])
    sys.exit(1)
db = sys.argv[1]      # name of the Hive database to USE
table = sys.argv[2]   # name of the table whose columns are grouped
def run_hive_cmd(hive_cmd):
    """Run a HiveQL string through the Hive CLI and return its stdout lines.

    The statement is handed to ``<hive_bin> -e`` as a single argv entry, so
    quotes or shell metacharacters inside ``hive_cmd`` are not interpreted
    by a shell.

    Args:
        hive_cmd: HiveQL text to execute.

    Returns:
        A list with one string per line the command wrote to stdout, each
        without its trailing newline. Empty when nothing was printed.

    A non-zero exit status does not raise; the captured stderr is echoed to
    this process's stderr and whatever stdout was produced is still returned.
    """
    import shlex  # local import: only this function needs it

    # Split hive_bin shell-style so a configured value that carries extra
    # options keeps working now that no shell is involved.
    argv = shlex.split(hive_bin) + ["-e", hive_cmd]

    # Human-readable form of the invocation, kept for the log line only.
    cmd = "%s -e '%s'" % (hive_bin, hive_cmd)
    print("run sehll command : %s" % (cmd))

    proc = subprocess.Popen(
        argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        close_fds=True,
        universal_newlines=True,
    )
    # communicate() drains stdout and stderr together. Reading only stdout
    # while stderr is also a pipe can block the child forever once the
    # stderr pipe buffer fills up.
    out, err = proc.communicate()
    if proc.returncode != 0:
        sys.stderr.write("hive exited with status %d\n" % proc.returncode)
        sys.stderr.write(err)

    # Split on "\n" only (like readlines()), then drop the empty tail that a
    # final newline leaves behind.
    lines = out.split("\n")
    if lines and lines[-1] == "":
        lines.pop()

    print("output of shell command is ")
    print(lines)
    return lines

def get_fields(rows):
    """Extract column names from the output lines of ``DESC <table>``.

    The column name is taken as the first whitespace-delimited token of each
    line, so both tab-separated and space-padded rows are handled.

    Lines that carry no column are skipped: blank lines and lines whose
    first token starts with ``#`` (presumably the "# Partition Information"
    style headers Hive prints for partitioned tables -- verify against the
    Hive version in use). A name that appears more than once is returned
    only once, in first-seen order, so callers do not query it twice.

    Args:
        rows: iterable of text lines, one per ``DESC`` output row.

    Returns:
        List of unique column names in order of first appearance.
    """
    ans = []
    seen = set()
    for row in rows:
        tokens = row.split()  # any run of spaces/tabs separates columns
        if not tokens or tokens[0].startswith("#"):
            continue
        name = tokens[0]
        if name not in seen:
            seen.add(name)
            ans.append(name)
    return ans

def _split_group_row(row):
    """Split one result line into ``(value, num)``; None if no count column."""
    if "\t" in row:
        # Tab-separated output: the count is the last column, and the value
        # may itself be empty or contain spaces.
        value, _, num = row.rpartition("\t")
        return value, num
    parts = row.rsplit(None, 1)  # fall back to the last whitespace gap
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


def get_group(db, table, field):
    """Print how many rows of ``db.table`` fall under each value of ``field``.

    Runs ``SELECT field, count(1) ... GROUP BY field`` via run_hive_cmd()
    and prints one ``value / num`` line per result row between [START] and
    [END] marker lines.

    Result lines that cannot be split into a value and a count (for example
    blank lines) are skipped instead of raising IndexError.

    Args:
        db: database name, interpolated into ``USE``.
        table: table name, interpolated into ``FROM``.
        field: column name, interpolated into ``SELECT`` and ``GROUP BY``.

    Returns:
        None; the result is written to stdout.
    """
    hive_cmd = "USE %s ; SELECT %s, count(1) FROM %s GROUP BY %s" % (db, field, table, field)
    rows = run_hive_cmd(hive_cmd)
    context = (field, db, table)
    print("[START] output group of field %s (database is %s, table is %s)" % context)
    print("%s                               %s" % ('value', 'num'))
    for row in rows:
        parsed = _split_group_row(row)
        if parsed is None:
            continue
        print("%s                               %s" % parsed)
    print("[END] output group of field %s (database is %s, table is %s)" % context)

# Driver: list the table's columns with DESC, then print the grouped
# counts for each column in turn.
cmd = "USE %s ; DESC %s" % (db, table)
fields = get_fields(run_hive_cmd(cmd))
for field_name in fields:
    get_group(db, table, field_name)
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics