【HiveSQL】查询练习(pandas连接hadoop, jupyter 输出)

  • 连接参数设置

  • 启动集群

    • 启动hadoop: myhadoop.sh start
    • 启动hive: hiveservice.sh start (四个参数:start|stop|resart|status)
    • 查看所有服务器进程: jpsall
    • 文件管理系统:http://hadoop102:9870/explorer.html#/user/hive/warehouse/hive.db
  • 建表导入数据

  • 创建原始数据表 gulivideo_ori, gulivideo_user_ori

    #--gulivideo_ori
        create table if not exists gulivideo_ori
            (
            videoid string,
            uploader string,
            age int,
            category array<string>,
            length int,
            views int,
            rate int,
            ratings int,
            comments int,
            relatedid array<string>
            )
        row format delimited
        fields terminated by "\t"
        collection items terminated by "&"
        stored as textfile;
    #--gulivideo_user_ori
        create table if not exists gulivideo_user_ori
            (
                uploader string,
                videos int,
                friends int
            )
        row format delimited
        fields terminated by '\t';
    #--gulivideo_orc
        create table if not exists gulivideo_orc
            (
            videoid string,
            uploader string,
            age int,
            category array<string>,
            length int,
            views int,
            rate int,
            ratings int,
            comments int,
            relatedid array<string>
            )
        stored as orc;
    #--gulivideo_user_orc
        create table if not exists gulivideo_user_orc
            (
                uploader string,
                videos int,
                friends int
            )
        stored as orc;
  • 向 ori 表插入数据
    load data local inpath '/opt/module/hive/data/video' into table gulivideo_ori;
    load data local inpath '/opt/module/hive/data/user' into table gulivideo_user_ori;
  • 向 orc 表插入数据
    insert into table gulivideo_orc select * from gulivideo_ori;
    insert into table gulivideo_user_orc select * from gulivideo_user_ori;
  • 导入模块
import pandas as pd

# python 连 hive 专用包
from pyhive import hive

# 连接参数
eng = hive.Connection(host='hadoop102', port=10000, username='root', password='root', database='default', auth='LDAP')
cursor = eng.cursor()
# 连接测试
gulivideo_orc = pd.read_sql('''
select * from gulivideo_orc limit 20
''', eng)
gulivideo_orc.head()
gulivideo_orc.videoidgulivideo_orc.uploadergulivideo_orc.agegulivideo_orc.categorygulivideo_orc.lengthgulivideo_orc.viewsgulivideo_orc.rategulivideo_orc.ratingsgulivideo_orc.commentsgulivideo_orc.relatedid
052K6NEXxjrgGoodygums631.0["Gadgets","Games"]216245041411["typDFieUR-o","THA9tDiD0l8","VovC3GS8jzc","JV...
1qraNEktJfVIdarkdragonprincesss583.0["Gadgets","Games"]557275041019["64FB-g8eggM","AkXmgNPOa74","67iHfqQRT14","zB...
2vDUElhHlQD8kneesuko577.0["Entertainment"]140755322["zhqbB1kDBvo","mT-iSTmsLZk","5ZZpEj55CcE","50...
3265TFhvWZ4omanhhakchareun400.0["Comedy"]19678496215556["nQhH7HLsv2I","4y7WMrU0H64","pcOmdb8AaNM","Vn...
46Xv4FHZHnMktimmytutu646.0["Comedy"]333565357["Uep7rDOdz0A","GXOyvzDnVO8","XN0okKMpPDo","41...
1.观看人数前top20
  • 排序后 limit 10
        SELECT
            videoid,
            views
        FROM gulivideo_orc 
        ORDER BY views
        DESC LIMIT 10;
sql_1 = 'select videoid, views from gulivideo_orc order by views desc limit 10'
view_top10 = pd.read_sql(sql_1, eng)
view_top10
videoidviews
0dMH0bHeiRNg42513417
10XxI-hvPRRA20282464
21dmVU08zVpA16087899
3RB-wUgnyGv015712924
4QjA5faZF1A815256922
5-_CSo1gOd4813199833
649IDp76kjPw11970018
7tYnn51C3X_w11823701
8pv5zWaTEVkI11672017
9D2kJZOfq7zk11184051
  • 窗口函数方法(运行时间快2.5秒)
        select
            *
        from
            (
                select
                    videoid,
                    views,
                    rank()over(order by views desc) rank
                from gulivideo_orc
            ) t1
        where t1.rank <=10
sql_1_rank = 'select * from (select videoid, views, rank()over(order by views desc) rank from gulivideo_orc) t1 where t1.rank <=10'
sql_1_rank = pd.read_sql(sql_1_rank, eng)
sql_1_rank
t1.videoidt1.viewst1.rank
0dMH0bHeiRNg425134171
10XxI-hvPRRA202824642
21dmVU08zVpA160878993
3RB-wUgnyGv0157129244
4QjA5faZF1A8152569225
5-_CSo1gOd48131998336
649IDp76kjPw119700187
7tYnn51C3X_w118237018
8pv5zWaTEVkI116720179
9D2kJZOfq7zk1118405110
2.统计视频类别热度 Top10
        select
            category_name,
            count(*) as video_num
        from
            (
                select
                    videoid,
                    category_name
                from gulivideo_orc
                lateral view explode(category) t1_tmp as category_name
            ) t1
        group by category_name
        order by video_num desc
        limit 10
sql_2 = 'select category_name, count(*) as video_num from (select videoid, category_name from gulivideo_orc lateral view explode(category) t1_tmp as category_name ) t1 group by category_name order by video_num desc limit 10'
cate_top10 = pd.read_sql(sql_2, eng)
cate_top10
category_namevideo_num
0Music179049
1Entertainment127674
2Comedy87818
3Animation73293
4Film73293
5Sports67329
6Gadgets59817
7Games59817
8People48890
9Blogs48890
3.统计出视频观看数最高的 20 个视频的所属类别以及类别包含 Top20 视频的个数
        select
            category_name,
            count(1) as video_count
        from
            (
                select
                    category_name,
                    t1.videoid
                from
                    (
                        select
                            videoid,
                            category,
                            views
                        from gulivideo_orc
                        order by views desc limit 20
                    ) t1
                lateral view explode(t1.category) t1_tmp as category_name
            ) t2
        group by category_name
sql_3 = 'select category_name, count(1) as video_count from ( select category_name, t1.videoid from ( select videoid, category, views from gulivideo_orc order by views desc limit 20 ) t1 lateral view explode(t1.category) t1_tmp as category_name ) t2 group by category_name'
quest_3 = pd.read_sql(sql_3, eng)
quest_3
category_namevideo_count
0Blogs2
1Comedy6
2Entertainment6
3Music5
4People2
5UNA1
4.统计视频观看数 Top50 所关联视频的所属类别排序
  • mydql order by 后的字段是不需要出现在 select 里的, 但这里不行,有 distinct 的情况也不行
  • 这个查询涉及两个大表 join, 要设置 set hive.auto.convert.join = false. 原因是:mapJoin在处理一张小表和一张大表,新版的hive会自动优化把小表存入内存中进行缓存, 如果表的数据过大的时候,内存吃不消进而报错
  • 注意做完这个查询里要把还原为true set hive.auto.convert.join = true
        select
            category_name, 
            video_sum, 
            dense_rank()over(order by video_sum) as rank
        from
            (
                select
                    category_name,
                    count(*) as video_sum
                from
                    (
                        select 
                            t3.related_id,
                            category_name
                        from
                            (
                                select    
                                    t2.related_id,
                                    t1.category
                                from gulivideo_orc t1
                                join
                                    (
                                        select
                                            related_id
                                        from
                                            (
                                                select
                                                    relatedid,
                                                    `views`
                                                from gulivideo_orc
                                                order by `views` desc
                                                limit 50        
                                            ) t1
                                        lateral view explode(relatedid) t1_tmp as related_id
                                    ) t2 on t1.videoid = t2.related_id
                            ) t3
                        lateral view explode(t3.category) t3_tmp as category_name
                    ) t4
                group by t4.category_name
            ) t5
sql_4 = 'select category_name, video_sum, dense_rank()over(order by video_sum) as rank from (select category_name, count(*) as video_sum from ( select t3.related_id, category_name from ( select t2.related_id, t1.category from gulivideo_orc t1 join ( select related_id from ( select relatedid, `views` from gulivideo_orc order by `views` desc limit 50) t1 lateral view explode(relatedid) t1_tmp as related_id ) t2 on t1.videoid = t2.related_id ) t3 lateral view explode(t3.category) t3_tmp as category_name ) t4 group by t4.category_name ) t5'
quest_4 = pd.read_sql(sql_4, eng)
quest_4
category_namevideo_sumrank
0Vehicles41
1Autos41
2Animals112
3Pets112
4Places123
5Travel123
6UNA134
7Howto145
8DIY145
9Sports196
10Games227
11Gadgets227
12News248
13Politics248
14Film479
15Animation479
16Blogs5110
17People5110
18Music19511
19Entertainment21612
20Comedy23713
5.统计每个类别中的视频热度 Top10,以 Music 为例
  • 方法一
        select
            *
        from
            (
                select
                    category_name,	
                    videoid,
                    views
                from gulivideo_orc 
                lateral view explode(category) t1_tmp as category_name
                where category_name = 'Music'
            ) t1
        order by views DESC limit 10
  • 方法二, 结果一样
        select
            category_name,
            videoid,
            VIEWS
        from gulivideo_orc 
        lateral view explode(category) tmp as category_name
        where category_name = 'Music'
        order by views desc
        limit 10
sql_5 = 'select * from ( select category_name,videoid,views from gulivideo_orc lateral view explode(category) t1_tmp as category_name where category_name="Music") t1 order by views DESC limit 10 '
quest_5 = pd.read_sql(sql_5, eng)
quest_5
t1.category_namet1.videoidt1.views
0MusicQjA5faZF1A815256922
1MusictYnn51C3X_w11823701
2Musicpv5zWaTEVkI11672017
3Music8bbTtPL1jRs9579911
4MusicUMf40daefsI7533070
5Music-xEzGIuY7kw6946033
6Musicd6C0bNDqf3Y6935578
7MusicHSoVKUVOnfQ6193057
8Music3URfWTEPmtE5581171
9MusicthtmaZnxk_05142238
6.统计每个类别视频观看数 Top10
        select
            *
        from
        (
        select 
            category_name,
            videoid,
            VIEWS,
            rank()over(partition by category order by views desc) as rank
        from gulivideo_orc 
        lateral view explode(category) tmp as category_name
        ) t1
        where t1.rank <=10
sql_6 = 'select * from (select category_name,videoid,VIEWS,rank()over(partition by category_name order by views desc) as rank from gulivideo_orc lateral view explode(category) tmp as category_name) t1 where t1.rank <=10 order by t1.category_name, t1.rank'
quest_6 = pd.read_sql(sql_6, eng)
quest_6
t1.category_namet1.videoidt1.viewst1.rank
0Animals2GWPOPSXGYI36600091
1AnimalsxmsV9R8FsDA31645822
2Animals12PsUW-8ge431335233
3AnimalsOeNggIGSKH824577504
4AnimalsWofFb_eOxxA20757285
...............
205Vehicles46LQd9dXFRU12621736
206VehiclespdiuDXwgrjQ10136977
207VehicleskY_cDpENQLE9566658
208VehiclesYtxfbxGz1u49426049
209VehiclesaCamHfJwSGU84744210

210 rows × 4 columns

7.统计上传视频最多的用户 Top10 以及他们上传的视频 观看次数在前 20 的视频
sql_7 = 'select t2.videoid, views from ( select uploader, videos from gulivideo_user_orc order by videos desc limit 10 ) t1 join gulivideo_orc t2 on t1.uploader = t2.uploader order by t2.views desc limit 20'
quest_7 = pd.read_sql(sql_7, eng)
quest_7
t2.videoidviews
0-IxHBW0YpZw39059
1BU-fT5XI_8I29975
2ADOcaBYbMl026270
3yAqsULIDJFE25511
4vcm-t0TJXNg25366
50KYGFawp14c24659
6j4DpuPvMLF422593
7Msu4lZb2oeQ18822
8ZHZVj44rpjE16304
9foATQY3wovI13576
10-UnQ8rcBOQs13450
11crtNd46CDks11639
12D1leA0JKHhE11553
13NJu2oG1Wm9811452
14CapbXdyv4j410915
15epr5erraEp410817
16IyQoDgaLM7U10597
17tbZibBnusLQ10402
18_GnCHodc7mk9422
19hvEYlSlRitU7123
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值