sql_2 ='select category_name, count(*) as video_num from (select videoid, category_name from gulivideo_orc lateral view explode(category) t1_tmp as category_name ) t1 group by category_name order by video_num desc limit 10'
cate_top10 = pd.read_sql(sql_2, eng)
cate_top10
sql_3 ='select category_name, count(1) as video_count from ( select category_name, t1.videoid from ( select videoid, category, views from gulivideo_orc order by views desc limit 20 ) t1 lateral view explode(t1.category) t1_tmp as category_name ) t2 group by category_name'
quest_3 = pd.read_sql(sql_3, eng)
quest_3
category_name
video_count
0
Blogs
2
1
Comedy
6
2
Entertainment
6
3
Music
5
4
People
2
5
UNA
1
4.统计视频观看数 Top50 所关联视频的所属类别排序
mydql order by 后的字段是不需要出现在 select 里的, 但这里不行,有 distinct 的情况也不行
这个查询涉及两个大表 join, 要设置 set hive.auto.convert.join = false. 原因是:mapJoin在处理一张小表和一张大表,新版的hive会自动优化把小表存入内存中进行缓存, 如果表的数据过大的时候,内存吃不消进而报错
注意做完这个查询里要把还原为true set hive.auto.convert.join = true
sql_4 ='select category_name, video_sum, dense_rank()over(order by video_sum) as rank from (select category_name, count(*) as video_sum from ( select t3.related_id, category_name from ( select t2.related_id, t1.category from gulivideo_orc t1 join ( select related_id from ( select relatedid, `views` from gulivideo_orc order by `views` desc limit 50) t1 lateral view explode(relatedid) t1_tmp as related_id ) t2 on t1.videoid = t2.related_id ) t3 lateral view explode(t3.category) t3_tmp as category_name ) t4 group by t4.category_name ) t5'
quest_4 = pd.read_sql(sql_4, eng)
quest_4
category_name
video_sum
rank
0
Vehicles
4
1
1
Autos
4
1
2
Animals
11
2
3
Pets
11
2
4
Places
12
3
5
Travel
12
3
6
UNA
13
4
7
Howto
14
5
8
DIY
14
5
9
Sports
19
6
10
Games
22
7
11
Gadgets
22
7
12
News
24
8
13
Politics
24
8
14
Film
47
9
15
Animation
47
9
16
Blogs
51
10
17
People
51
10
18
Music
195
11
19
Entertainment
216
12
20
Comedy
237
13
5.统计每个类别中的视频热度 Top10,以 Music 为例
方法一
select*from(select
category_name,
videoid,
views
from gulivideo_orc
lateral view explode(category) t1_tmp as category_name
where category_name ='Music') t1
orderby views DESClimit10
方法二, 结果一样
select
category_name,
videoid,
VIEWS
from gulivideo_orc
lateral view explode(category) tmp as category_name
where category_name ='Music'orderby views desclimit10
sql_5 ='select * from ( select category_name,videoid,views from gulivideo_orc lateral view explode(category) t1_tmp as category_name where category_name="Music") t1 order by views DESC limit 10 '
quest_5 = pd.read_sql(sql_5, eng)
quest_5
t1.category_name
t1.videoid
t1.views
0
Music
QjA5faZF1A8
15256922
1
Music
tYnn51C3X_w
11823701
2
Music
pv5zWaTEVkI
11672017
3
Music
8bbTtPL1jRs
9579911
4
Music
UMf40daefsI
7533070
5
Music
-xEzGIuY7kw
6946033
6
Music
d6C0bNDqf3Y
6935578
7
Music
HSoVKUVOnfQ
6193057
8
Music
3URfWTEPmtE
5581171
9
Music
thtmaZnxk_0
5142238
6.统计每个类别视频观看数 Top10
select*from(select
category_name,
videoid,
VIEWS,
rank()over(partitionby category orderby views desc)as rank
from gulivideo_orc
lateral view explode(category) tmp as category_name
) t1
where t1.rank <=10
sql_6 ='select * from (select category_name,videoid,VIEWS,rank()over(partition by category_name order by views desc) as rank from gulivideo_orc lateral view explode(category) tmp as category_name) t1 where t1.rank <=10 order by t1.category_name, t1.rank'
quest_6 = pd.read_sql(sql_6, eng)
quest_6
t1.category_name
t1.videoid
t1.views
t1.rank
0
Animals
2GWPOPSXGYI
3660009
1
1
Animals
xmsV9R8FsDA
3164582
2
2
Animals
12PsUW-8ge4
3133523
3
3
Animals
OeNggIGSKH8
2457750
4
4
Animals
WofFb_eOxxA
2075728
5
...
...
...
...
...
205
Vehicles
46LQd9dXFRU
1262173
6
206
Vehicles
pdiuDXwgrjQ
1013697
7
207
Vehicles
kY_cDpENQLE
956665
8
208
Vehicles
YtxfbxGz1u4
942604
9
209
Vehicles
aCamHfJwSGU
847442
10
210 rows × 4 columns
7.统计上传视频最多的用户 Top10 以及他们上传的视频 观看次数在前 20 的视频
sql_7 ='select t2.videoid, views from ( select uploader, videos from gulivideo_user_orc order by videos desc limit 10 ) t1 join gulivideo_orc t2 on t1.uploader = t2.uploader order by t2.views desc limit 20'
quest_7 = pd.read_sql(sql_7, eng)
quest_7