【HiveSQL】查询练习(pandas连接hadoop, jupyter 输出)

连接参数设置
启动集群
- 启动hadoop: myhadoop.sh start
- 启动hive: hiveservice.sh start (四个参数:start|stop|restart|status)
- 查看所有服务器进程: jpsall
- 文件管理系统:http://hadoop102:9870/explorer.html#/user/hive/warehouse/hive.db
建表导入数据
创建原始数据表 gulivideo_ori, gulivideo_user_ori

    -- gulivideo_ori: plain-text staging table for the raw video data.
    -- Columns are tab-separated; elements of the array columns are separated by '&'.
        create table if not exists gulivideo_ori
            (
            videoid   string,
            uploader  string,
            age       int,
            category  array<string>,
            length    int,
            views     int,
            rate      int,
            ratings   int,
            comments  int,
            relatedid array<string>
            )
        row format delimited
        fields terminated by '\t'
        collection items terminated by '&'
        stored as textfile;
    -- gulivideo_user_ori: staging table for the raw user data; columns are tab-separated.
        create table if not exists gulivideo_user_ori
            (
                uploader string,
                videos   int,
                friends  int
            )
        row format delimited
        fields terminated by '\t';
    -- gulivideo_orc: ORC-format table with the same columns as gulivideo_ori.
        create table if not exists gulivideo_orc
            (
            videoid   string,
            uploader  string,
            age       int,
            category  array<string>,
            length    int,
            views     int,
            rate      int,
            ratings   int,
            comments  int,
            relatedid array<string>
            )
        stored as orc;
    -- gulivideo_user_orc: ORC-format table with the same columns as gulivideo_user_ori.
        create table if not exists gulivideo_user_orc
            (
                uploader string,
                videos   int,
                friends  int
            )
        stored as orc;

向 ori 表插入数据

    -- Load the local source files into the two text staging tables.
    -- "into table" without "overwrite" appends, so re-running these statements loads the data again.
    load data local inpath '/opt/module/hive/data/video'
        into table gulivideo_ori;
    load data local inpath '/opt/module/hive/data/user'
        into table gulivideo_user_ori;

向 orc 表插入数据

    -- Copy the staged rows into the ORC tables.
    -- Columns are listed explicitly so the copy does not depend on both tables
    -- keeping the same column order. "insert into" appends to the target table.
    insert into table gulivideo_orc
    select
        videoid,
        uploader,
        age,
        category,
        length,
        views,
        rate,
        ratings,
        comments,
        relatedid
    from gulivideo_ori;

    insert into table gulivideo_user_orc
    select
        uploader,
        videos,
        friends
    from gulivideo_user_ori;

导入模块

import pandas as pd

# python 连 hive 专用包
from pyhive import hive

# Connection parameters (host, port and credentials are specific to this cluster).
eng = hive.Connection(
    host='hadoop102',
    port=10000,
    username='root',
    password='root',
    database='default',
    auth='LDAP',
)
cursor = eng.cursor()

# Connection test: fetch a few rows from the ORC table and preview them.
gulivideo_orc = pd.read_sql(
    '''
select * from gulivideo_orc limit 20
''',
    con=eng,
)
gulivideo_orc.head()

	gulivideo_orc.videoid	gulivideo_orc.uploader	gulivideo_orc.age	gulivideo_orc.category	gulivideo_orc.length	gulivideo_orc.views	gulivideo_orc.rate	gulivideo_orc.ratings	gulivideo_orc.comments	gulivideo_orc.relatedid
0	52K6NEXxjrg	Goodygums	631.0	["Gadgets","Games"]	216	2450	4	14	11	["typDFieUR-o","THA9tDiD0l8","VovC3GS8jzc","JV...
1	qraNEktJfVI	darkdragonprincesss	583.0	["Gadgets","Games"]	557	2750	4	10	19	["64FB-g8eggM","AkXmgNPOa74","67iHfqQRT14","zB...
2	vDUElhHlQD8	kneesuko	577.0	["Entertainment"]	140	755	3	2	2	["zhqbB1kDBvo","mT-iSTmsLZk","5ZZpEj55CcE","50...
3	265TFhvWZ4o	manhhakchareun	400.0	["Comedy"]	196	78496	2	155	56	["nQhH7HLsv2I","4y7WMrU0H64","pcOmdb8AaNM","Vn...
4	6Xv4FHZHnMk	timmytutu	646.0	["Comedy"]	33	3565	3	5	7	["Uep7rDOdz0A","GXOyvzDnVO8","XN0okKMpPDo","41...

1.观看人数前 Top10

排序后 limit 10

        -- Ten most-viewed videos: sort by view count, highest first, and keep the first ten rows.
        select videoid,
               views
        from gulivideo_orc
        order by views desc
        limit 10;

# Q1: ten most-viewed videos, fetched into a DataFrame.
sql_1 = (
    'select videoid, views '
    'from gulivideo_orc '
    'order by views desc '
    'limit 10'
)
view_top10 = pd.read_sql(sql_1, con=eng)
view_top10

	videoid	views
0	dMH0bHeiRNg	42513417
1	0XxI-hvPRRA	20282464
2	1dmVU08zVpA	16087899
3	RB-wUgnyGv0	15712924
4	QjA5faZF1A8	15256922
5	-_CSo1gOd48	13199833
6	49IDp76kjPw	11970018
7	tYnn51C3X_w	11823701
8	pv5zWaTEVkI	11672017
9	D2kJZOfq7zk	11184051

窗口函数方法(运行时间快2.5秒)

        -- Same top-10 lookup with a window function: rank every video by views, keep ranks 1-10.
        -- rank() gives tied rows the same rank, so ties at the cut-off can return more than ten rows.
        select *
        from (
            select videoid,
                   views,
                   rank() over (order by views desc) as rank
            from gulivideo_orc
        ) t1
        where t1.rank <= 10

# Q1 (window-function variant). Note that sql_1_rank is first the query text
# and is then rebound to the resulting DataFrame.
sql_1_rank = (
    'select * from ('
    'select videoid, views, rank()over(order by views desc) rank '
    'from gulivideo_orc'
    ') t1 '
    'where t1.rank <=10'
)
sql_1_rank = pd.read_sql(sql_1_rank, con=eng)
sql_1_rank

	t1.videoid	t1.views	t1.rank
0	dMH0bHeiRNg	42513417	1
1	0XxI-hvPRRA	20282464	2
2	1dmVU08zVpA	16087899	3
3	RB-wUgnyGv0	15712924	4
4	QjA5faZF1A8	15256922	5
5	-_CSo1gOd48	13199833	6
6	49IDp76kjPw	11970018	7
7	tYnn51C3X_w	11823701	8
8	pv5zWaTEVkI	11672017	9
9	D2kJZOfq7zk	11184051	10

2.统计视频类别热度 Top10

        -- Number of videos per category, ten largest categories first.
        select category_name,
               count(*) as video_num
        from (
            -- one row per (video, category) pair
            select videoid,
                   category_name
            from gulivideo_orc
            lateral view explode(category) t1_tmp as category_name
        ) t1
        group by category_name
        order by video_num desc
        limit 10

# Q2: video count per category, top 10 categories.
sql_2 = (
    'select category_name, count(*) as video_num '
    'from ('
    'select videoid, category_name '
    'from gulivideo_orc '
    'lateral view explode(category) t1_tmp as category_name '
    ') t1 '
    'group by category_name '
    'order by video_num desc '
    'limit 10'
)
cate_top10 = pd.read_sql(sql_2, con=eng)
cate_top10

	category_name	video_num
0	Music	179049
1	Entertainment	127674
2	Comedy	87818
3	Animation	73293
4	Film	73293
5	Sports	67329
6	Gadgets	59817
7	Games	59817
8	People	48890
9	Blogs	48890

3.统计出视频观看数最高的 20 个视频的所属类别以及类别包含 Top20 视频的个数

        -- Category distribution of the 20 most-viewed videos.
        select category_name,
               count(1) as video_count
        from (
            -- expand each of the top-20 videos into one row per category
            select category_name,
                   t1.videoid
            from (
                -- the 20 videos with the highest view counts
                select videoid,
                       category,
                       views
                from gulivideo_orc
                order by views desc
                limit 20
            ) t1
            lateral view explode(t1.category) t1_tmp as category_name
        ) t2
        group by category_name

# Q3: categories of the 20 most-viewed videos and how many of those videos each category holds.
sql_3 = (
    'select category_name, count(1) as video_count '
    'from ( '
    'select category_name, t1.videoid '
    'from ( '
    'select videoid, category, views '
    'from gulivideo_orc '
    'order by views desc limit 20 '
    ') t1 '
    'lateral view explode(t1.category) t1_tmp as category_name '
    ') t2 '
    'group by category_name'
)
quest_3 = pd.read_sql(sql_3, con=eng)
quest_3

	category_name	video_count
0	Blogs	2
1	Comedy	6
2	Entertainment	6
3	Music	5
4	People	2
5	UNA	1

4.统计视频观看数 Top50 所关联视频的所属类别排序

MySQL 中 order by 后的字段不需要出现在 select 里，但 Hive 这里不行，有 distinct 的情况也不行
这个查询涉及两个大表 join, 要设置 set hive.auto.convert.join = false. 原因是:mapJoin在处理一张小表和一张大表,新版的hive会自动优化把小表存入内存中进行缓存, 如果表的数据过大的时候,内存吃不消进而报错
注意:做完这个查询后，要把该参数还原为 true:set hive.auto.convert.join = true

        -- Categories of the videos related to the 50 most-viewed videos, ranked by how often they occur.
        -- dense_rank() sorts video_sum ascending here, so rank 1 is the least frequent category.
        select category_name,
               video_sum,
               dense_rank() over (order by video_sum) as rank
        from (
            select category_name,
                   count(*) as video_sum
            from (
                -- one row per (related video, category) pair
                select t3.related_id,
                       category_name
                from (
                    -- look up the category array of every related video
                    select t2.related_id,
                           t1.category
                    from gulivideo_orc t1
                    join (
                        -- one row per related-video id of the top-50 videos
                        select related_id
                        from (
                            -- the 50 videos with the highest view counts
                            -- (this inner t1 is a separate alias, scoped to this subquery)
                            select relatedid,
                                   `views`
                            from gulivideo_orc
                            order by `views` desc
                            limit 50
                        ) t1
                        lateral view explode(relatedid) t1_tmp as related_id
                    ) t2
                        on t1.videoid = t2.related_id
                ) t3
                lateral view explode(t3.category) t3_tmp as category_name
            ) t4
            group by t4.category_name
        ) t5

# Q4: categories of the videos related to the 50 most-viewed videos, with a dense rank on their counts.
sql_4 = (
    'select category_name, video_sum, dense_rank()over(order by video_sum) as rank '
    'from ('
    'select category_name, count(*) as video_sum '
    'from ( '
    'select t3.related_id, category_name '
    'from ( '
    'select t2.related_id, t1.category '
    'from gulivideo_orc t1 '
    'join ( '
    'select related_id '
    'from ( '
    'select relatedid, `views` from gulivideo_orc order by `views` desc limit 50'
    ') t1 '
    'lateral view explode(relatedid) t1_tmp as related_id '
    ') t2 on t1.videoid = t2.related_id '
    ') t3 '
    'lateral view explode(t3.category) t3_tmp as category_name '
    ') t4 '
    'group by t4.category_name '
    ') t5'
)
quest_4 = pd.read_sql(sql_4, con=eng)
quest_4

	category_name	video_sum	rank
0	Vehicles	4	1
1	Autos	4	1
2	Animals	11	2
3	Pets	11	2
4	Places	12	3
5	Travel	12	3
6	UNA	13	4
7	Howto	14	5
8	DIY	14	5
9	Sports	19	6
10	Games	22	7
11	Gadgets	22	7
12	News	24	8
13	Politics	24	8
14	Film	47	9
15	Animation	47	9
16	Blogs	51	10
17	People	51	10
18	Music	195	11
19	Entertainment	216	12
20	Comedy	237	13

5.统计每个类别中的视频热度 Top10，以 Music 为例

方法一

        -- Method 1: filter the exploded rows to 'Music' in a subquery, then sort and cut to ten rows.
        select *
        from (
            select category_name,
                   videoid,
                   views
            from gulivideo_orc
            lateral view explode(category) t1_tmp as category_name
            where category_name = 'Music'
        ) t1
        order by views desc
        limit 10

方法二, 结果一样

        -- Method 2: same filter, sort and limit in a single query without a subquery.
        select category_name,
               videoid,
               views
        from gulivideo_orc
        lateral view explode(category) tmp as category_name
        where category_name = 'Music'
        order by views desc
        limit 10

# Q5: ten most-viewed videos of the 'Music' category (subquery variant).
sql_5 = (
    'select * from ( '
    'select category_name,videoid,views '
    'from gulivideo_orc '
    'lateral view explode(category) t1_tmp as category_name '
    'where category_name="Music"'
    ') t1 '
    'order by views DESC limit 10 '
)
quest_5 = pd.read_sql(sql_5, con=eng)
quest_5

	t1.category_name	t1.videoid	t1.views
0	Music	QjA5faZF1A8	15256922
1	Music	tYnn51C3X_w	11823701
2	Music	pv5zWaTEVkI	11672017
3	Music	8bbTtPL1jRs	9579911
4	Music	UMf40daefsI	7533070
5	Music	-xEzGIuY7kw	6946033
6	Music	d6C0bNDqf3Y	6935578
7	Music	HSoVKUVOnfQ	6193057
8	Music	3URfWTEPmtE	5581171
9	Music	thtmaZnxk_0	5142238

6.统计每个类别视频观看数 Top10

        -- Top 10 videos by view count within every category.
        -- The window is partitioned by the exploded category_name (one category per row),
        -- not by the original category array, so the ranking restarts for each category.
        select *
        from (
            select category_name,
                   videoid,
                   views,
                   rank() over (partition by category_name order by views desc) as rank
            from gulivideo_orc
            lateral view explode(category) tmp as category_name
        ) t1
        where t1.rank <= 10
        order by t1.category_name, t1.rank

# Q6: top 10 videos by views inside each category, ordered by category and rank.
sql_6 = (
    'select * from ('
    'select category_name,videoid,VIEWS,'
    'rank()over(partition by category_name order by views desc) as rank '
    'from gulivideo_orc '
    'lateral view explode(category) tmp as category_name'
    ') t1 '
    'where t1.rank <=10 '
    'order by t1.category_name, t1.rank'
)
quest_6 = pd.read_sql(sql_6, con=eng)
quest_6

	t1.category_name	t1.videoid	t1.views	t1.rank
0	Animals	2GWPOPSXGYI	3660009	1
1	Animals	xmsV9R8FsDA	3164582	2
2	Animals	12PsUW-8ge4	3133523	3
3	Animals	OeNggIGSKH8	2457750	4
4	Animals	WofFb_eOxxA	2075728	5
...	...	...	...	...
205	Vehicles	46LQd9dXFRU	1262173	6
206	Vehicles	pdiuDXwgrjQ	1013697	7
207	Vehicles	kY_cDpENQLE	956665	8
208	Vehicles	YtxfbxGz1u4	942604	9
209	Vehicles	aCamHfJwSGU	847442	10

210 rows × 4 columns

7.统计上传视频最多的用户 Top10 以及他们上传的视频观看次数在前 20 的视频

# Q7: take the 10 uploaders with the most videos, join their videos,
# and keep the 20 most-viewed videos across all of those uploaders.
sql_7 = (
    'select t2.videoid, views '
    'from ( '
    'select uploader, videos from gulivideo_user_orc order by videos desc limit 10 '
    ') t1 '
    'join gulivideo_orc t2 on t1.uploader = t2.uploader '
    'order by t2.views desc limit 20'
)
quest_7 = pd.read_sql(sql_7, con=eng)
quest_7

	t2.videoid	views
0	-IxHBW0YpZw	39059
1	BU-fT5XI_8I	29975
2	ADOcaBYbMl0	26270
3	yAqsULIDJFE	25511
4	vcm-t0TJXNg	25366
5	0KYGFawp14c	24659
6	j4DpuPvMLF4	22593
7	Msu4lZb2oeQ	18822
8	ZHZVj44rpjE	16304
9	foATQY3wovI	13576
10	-UnQ8rcBOQs	13450
11	crtNd46CDks	11639
12	D1leA0JKHhE	11553
13	NJu2oG1Wm98	11452
14	CapbXdyv4j4	10915
15	epr5erraEp4	10817
16	IyQoDgaLM7U	10597
17	tbZibBnusLQ	10402
18	_GnCHodc7mk	9422
19	hvEYlSlRitU	7123

【HiveSQL】查询练习(pandas连接hadoop, jupyter 输出)

1.观看人数前 Top10

2.统计视频类别热度 Top10

3.统计出视频观看数最高的 20 个视频的所属类别以及类别包含 Top20 视频的个数

4.统计视频观看数 Top50 所关联视频的所属类别排序

5.统计每个类别中的视频热度 Top10，以 Music 为例

6.统计每个类别视频观看数 Top10

7.统计上传视频最多的用户 Top10 以及他们上传的视频 观看次数在前 20 的视频

7.统计上传视频最多的用户 Top10 以及他们上传的视频观看次数在前 20 的视频