一、字节跳动
最高峰同时直播人数
https://blog.csdn.net/SHWAITME/article/details/135918264
0 问题描述
有如下数据记录直播平台主播上播及下播时间,根据该数据计算出平台最高峰同时直播人数。
+----------+----------------------+----------------------+
| user_id | start_time | end_time |
+----------+----------------------+----------------------+
| 1 | 2024-04-29 01:00:00 | 2024-04-29 02:01:05 |
| 2 | 2024-04-29 01:05:00 | 2024-04-29 02:03:18 |
| 3 | 2024-04-29 02:00:00 | 2024-04-29 04:03:22 |
| 4 | 2024-04-29 03:15:07 | 2024-04-29 04:33:21 |
| 5 | 2024-04-29 03:34:16 | 2024-04-29 06:10:45 |
| 6 | 2024-04-29 05:22:00 | 2024-04-29 07:01:08 |
| 7 | 2024-04-29 06:11:03 | 2024-04-29 09:26:05 |
| 3 | 2024-04-29 08:00:00 | 2024-04-29 12:34:27 |
| 1 | 2024-04-29 11:00:00 | 2024-04-29 16:03:18 |
| 8 | 2024-04-29 15:00:00 | 2024-04-29 17:01:05 |
+----------+----------------------+----------------------+
1 数据准备
CREATE TABLE IF NOT EXISTS t1_livestream_log (
user_id INT, -- 主播ID
start_time STRING, -- 开始时间
end_time STRING -- 结束时间
);
insert overwrite table t1_livestream_log values
(1,'2024-04-29 01:00:00','2024-04-29 02:01:05'),
(2,'2024-04-29 01:05:00','2024-04-29 02:03:18'),
(3,'2024-04-29 02:00:00','2024-04-29 04:03:22'),
(4,'2024-04-29 03:15:07','2024-04-29 04:33:21'),
(5,'2024-04-29 03:34:16','2024-04-29 06:10:45'),
(6,'2024-04-29 05:22:00','2024-04-29 07:01:08'),
(7,'2024-04-29 06:11:03','2024-04-29 09:26:05'),
(3,'2024-04-29 08:00:00','2024-04-29 12:34:27'),
(1,'2024-04-29 11:00:00','2024-04-29 16:03:18'),
(8,'2024-04-29 15:00:00','2024-04-29 17:01:05');
2 数据分析
with t1 as(
select
user_id,
start_time as action_time,
--开播记录,标记 1
1 as change_cnt
from t1_livestream_log
union all
select
user_id,
end_time as action_time,
--下播记录,标记 1
-1 as change_cnt
from t1_livestream_log
)
select
max(online_cnt) as max_online_cnt
from
(select
user_id,
action_time,
change_cnt,
sum(change_cnt)over(order by action_time asc) as online_cnt
from t1)t2;

思路分析:
- step1: 首先对原始数据进行处理,生成主播上下播的日志数据,同时增加人数变化字段,主播上播为1,主播下播-1;
- step2:对操作日志按照操作时间进行累积求和
- step3:求取累计求和中的最大值,即为当天最高峰同时直播人数
3 小结
该题的关键点在于:对每个用户进入/退出直播间的行为进行打标签,再利用sum()over + max聚合函数计算最终的数值。
==========================*****==========================
股票波峰波谷
https://blog.csdn.net/SHWAITME/article/details/135902998
0 问题描述
有如下数据,记录每天每只股票的收盘价格,请计算 每只股票的波峰和波谷的日期和价格;
波峰:当天的股票价格大于前一天和后一天
波谷:当天的股票价格小于前一天和后一天
1 数据准备
create table t3_stock_test(
ts_code string comment '股票代码',
trade_date string comment '交易日期',
close float comment '收盘价'
);
insert overwrite table `t3_stock_test` values
('000001.SZ','20220104',16.66),
('000002.SZ','20220104',20.49),
('000001.SZ','20220105',17.15),
('000002.SZ','20220105',21.17),
('000001.SZ','20220106',17.12),
('000002.SZ','20220106',21.05),
('000001.SZ','20220107',17.2),
('000002.SZ','20220107',21.89),
('000001.SZ','20220110',17.19),
('000002.SZ','20220110',22.16),
('000001.SZ','20220111',17.41),
('000002.SZ','20220111',22.3),
('000001.SZ','20220112',17),
('000002.SZ','20220112',22.05),
('000001.SZ','20220113',16.98),
('000002.SZ','20220113',21.53),
('000001.SZ','20220114',16.33),
('000002.SZ','20220114',20.7),
('000001.SZ','20220117',16.22),
('000002.SZ','20220117',20.87);
2 数据分析
利用lag函数和lead函数,对每支股票分组,开窗计算出每天股票记录的前一天和后一天记录中的价格。
select
ts_code,
trade_date,
close,
point_type
from
(
select
ts_code,
trade_date,
close,
lastday_close,
nextday_close,
case when close > lastday_close and close > nextday_close then '波峰'
when close < lastday_close and close < nextday_close then '波谷'
else '其他' end as `point_type`
from
(
select
ts_code,
trade_date,
close,
lag(close,1)over(partition by ts_code order by trade_date ) as lastday_close,
lead(close,1)over(partition by ts_code order by trade_date ) as nextday_close
from t3_stock_test
) t1
) t2
where t2.point_type in('波峰','波谷')
3 小结
lead和lag函数一般用于计算当前行与上一行,或者当前行与下一行之间的差值。
-- 用于统计窗口内往上第n行。参数1为列名,参数2为往上第n行(可选,默认为1),参数3为默认值(当往上第n行为NULL时候,取默认值,如不指定,则为NULL)
lag(col,n,DEFAULT) over(partition by xx order by xx)
-- 用于统计窗口内往下第n行。参数1为列名,参数2为往下第n行(可选,默认为1),参数3为默认值(当往下第n行为NULL时候,取默认值,如不指定,则为NULL)
lead(col,n,DEFAULT) over(partition by xx order by xx)
==========================*****==========================
合并日期重叠的活动
0 问题描述
已知有表记录了每个大厅的活动开始日期和结束日期,每个大厅可以有多个活动。请编写一个SQL查询合并在同一个大厅举行的所有重叠的活动,如果两个活动至少有一天相同,那他们就是重叠的
样例数据
+----------+-------------+-------------+
| hall_id | start_date | end_date |
+----------+-------------+-------------+
| 1 | 2023-01-13 | 2023-01-20 |
| 1 | 2023-01-14 | 2023-01-17 |
| 1 | 2023-01-14 | 2023-01-16 |
| 1 | 2023-01-18 | 2023-01-25 |
| 1 | 2023-01-20 | 2023-01-26 |
| 2 | 2022-12-09 | 2022-12-23 |
| 2 | 2022-12-13 | 2022-12-17 |
| 2 | 2022-12-20 | 2022-12-24 |
| 2 | 2022-12-25 | 2022-12-30 |
| 3 | 2022-12-01 | 2023-01-30 |
+----------+-------------+-------------+
结果
+----------+-------------+-------------+
| hall_id | start_date | end_date |
+----------+-------------+-------------+
| 1 | 2023-01-13 | 2023-01-26 |
| 2 | 2022-12-09 | 2022-12-24 |
| 2 | 2022-12-25 | 2022-12-30 |
| 3 | 2022-12-01 | 2023-01-30 |
+----------+-------------+-------------+
解释:
大厅 1:
两个活动["2823-01-13","2023-01-20"]和[“2023-01-14","2023-01-17"]重叠,需求将它们合并到一个活动中[“2023-01-13","2023-01-20"]。
1 数据准备
CREATE TABLE IF NOT EXISTS t4_hall_event (
hall_id STRING, --大厅ID
start_date STRING, -- 营销活动开始日期
end_date STRING -- 营销活动结束日期
);
--数据插入
insert overwrite table t4_hall_event values
('1','2023-01-13','2023-01-20'),
('1','2023-01-14','2023-01-17'),
('1','2023-01-14','2023-01-16'),
('1','2023-01-18','2023-01-25'),
('1','2023-01-20','2023-01-26'),
('2','2022-12-09','2022-12-23'),
('2','2022-12-13','2022-12-17'),
('2','2022-12-20','2022-12-24'),
('2','2022-12-25','2022-12-30'),
('3','2022-12-01','2023-01-30');
2 数据分析
select
hall_id,
min(start_date) as start_date,
max(end_date) as end_date
from (select
hall_id,
start_date,
end_date,
max_end_date,
is_merge,
sum(is_merge) over (partition by hall_id order by start_date ) as group_id
from (select
hall_id,
start_date,
end_date,
max_end_date,
if(start_date <= max_end_date, 0, 1) as is_merge
--0:日期重叠,需要合并,1:日期没重叠,不用合并
from (select
hall_id,
start_date,
end_date,
-- step1: 1.使用max()函数开窗,获得截止到当前行之前的活动最后日期
max(end_date)
over (partition by hall_id order by start_date rows between unbounded preceding and 1 preceding) as max_end_date
from t4_hall_event) t1
) t2
) t3
group by hall_id, group_id --注意这里的分组,有group_id
思路分析:
- step1: 使用max()函数开窗,获得截止到当前行之前的活动最后日期
- step2: 对当前行的start_date 和截止到上一行的最大end_date进行比较,如果当前行的start_date 小于等于截止到前一行最大end_date 代表有交叉,可以合并,否则代表不可合并。 if(start_date <= max_end_date, 0, 1) as is_merge
- step3:连续问题,使用sum()over()进行分组
- step4:对hall_id+group_id分组,取每个组内的start_day 的最小值作为活动开始日期,end_day的最大值作为活动结束日期,得到最终结果。
题意转换: 统计每个大厅开展的营销活动总天数(日期有重叠的地方需要去重)
HiveSQL题——炸裂函数(explode/posexplode)_hive explode-CSDN博客
select
hall_id,
sum( datediff(end_date,new_start_date)+ 1) as day_cnt
from (select hall_id,
start_date,
end_date,
max_end_date,
new_start_date
from (select hall_id,
start_date,
end_date,
max_end_date,
if(max_end_date is null, start_date,
if(start_date > max_end_date, start_date, date_add(max_end_date, 1))) new_start_date
from (select hall_id,
start_date,
end_date,
max(end_date)
over (partition by hall_id order by
start_date ,end_date rows between unbounded preceding and 1 preceding) as max_end_date
from t4_hall_event) t1) t2
where new_start_date <= end_date) t3
group by hall_id;
3 小结
处理的关键思路:当营销活动的上一个日期区间A 与 当前活动的日期区间B出现重叠(日期交叉,有重复数据)时,需要将区间B的起始时间改成区间A的结束时间。(注意:修改之后需要保证B区间的结束时间> 开始时间)。
==========================*****==========================
查询最近一笔有效订单
0 问题描述
现有订单表t5_order,包含订单ID,订单时间,下单用户,当前订单是否有效
+---------+----------------------+----------+-----------+
| ord_id | ord_time | user_id | is_valid |
+---------+----------------------+----------+-----------+
| 1 | 2023-12-11 12:01:03 | a | 1 |
| 2 | 2023-12-11 12:02:06 | a | 0 |
| 3 | 2023-12-11 12:03:15 | a | 0 |
| 4 | 2023-12-11 12:04:20 | a | 1 |
| 5 | 2023-12-11 12:05:03 | a | 1 |
| 6 | 2023-12-11 12:01:02 | b | 1 |
| 7 | 2023-12-11 12:03:03 | b | 0 |
| 8 | 2023-12-11 12:04:01 | b | 1 |
| 9 | 2023-12-11 12:07:03 | b | 1 |
+---------+----------------------+----------+-----------+
请查询出每笔订单的上一笔有效订单,期望查询结果如下:
+---------+----------------------+----------+-----------+--------------------+
| ord_id | ord_time | user_id | is_valid | last_valid_ord_id |
+---------+----------------------+----------+-----------+--------------------+
| 1 | 2023-12-11 12:01:03 | a | 1 | NULL |
| 2 | 2023-12-11 12:02:06 | a | 0 | 1 |
| 3 | 2023-12-11 12:03:15 | a | 0 | 1 |
| 4 | 2023-12-11 12:04:20 | a | 1 | 1 |
| 5 | 2023-12-11 12:05:03 | a | 1 | 4 |
| 6 | 2023-12-11 12:01:02 | b | 1 | NULL |
| 7 | 2023-12-11 12:03:03 | b | 0 | 6 |
| 8 | 2023-12-11 12:04:01 | b | 1 | 6 |
| 9 | 2023-12-11 12:07:03 | b | 1 | 8 |
+---------+----------------------+----------+-----------+--------------------+
1 数据准备
create table t5_order
(
ord_id bigint COMMENT '订单ID',
ord_time string COMMENT '订单时间',
user_id string COMMENT '用户',
is_valid bigint COMMENT '订单是否有效'
);
-- 数据插入
insert overwrite table t5_order values
(1,'2023-12-11 12:01:03','a',1),
(2,'2023-12-11 12:02:06','a',0),
(3,'2023-12-11 12:03:15','a',0),
(4,'2023-12-11 12:04:20','a',1),
(5,'2023-12-11 12:05:03','a',1),
(6,'2023-12-11 12:01:02','b',1),
(7,'2023-12-11 12:03:03','b',0),
(8,'2023-12-11 12:04:01','b',1),
(9,'2023-12-11 12:07:03','b',1);
2 数据分析
with tmp as
(select
ord_id,
ord_time,
user_id,
is_valid,
lag (ord_id,1,null) over(partition by user_id order by ord_time) last_valid_ord_id
from t5_order
where is_valid = 1)
select
ord_id,
ord_time,
user_id,
is_valid,
last_valid_ord_id
from
(
select
t5.*,
tmp.last_valid_ord_id,
row_number()over(partition by t5.ord_id,t5.user_id order by tmp.ord_time ) rn
from t5_order t5
left join tmp
on t5.user_id = tmp.user_id
where tmp.ord_time >= t5.ord_time
)t6
where rn = 1;
思路分析:
- step1: 查询出有效订单,以及 每笔有效订单的上一单有效订单
- step2: 原始的明细数据与step1步的有效订单表按照用户user_id进行关联,筛选条件:有效订单表的订单时间大于等于原始订单表
- step3:使用row_number,对原始订单记录表的user_id、ord_id进行分组,对有效订单表的时间排序
- step4:筛选rn=1 的记录
3 小结
略
==========================*****==========================
共同使用ip用户检测
0 问题描述
现有用户登录日志表,记录了每个用户登录的IP地址,请查询共同使用过3个及以上IP的用户对;
+----------+-----------------+----------------------+
| user_id | ip | time_stamp |
+----------+-----------------+----------------------+
| 2 | 223.104.41.101 | 2023-08-24 07:00:00 |
| 4 | 223.104.41.122 | 2023-08-24 10:00:00 |
| 5 | 223.104.41.126 | 2023-08-24 11:00:00 |
| 4 | 223.104.41.126 | 2023-08-24 13:00:00 |
| 1 | 223.104.41.101 | 2023-08-24 16:00:00 |
| 3 | 223.104.41.101 | 2023-08-24 16:02:00 |
| 2 | 223.104.41.104 | 2023-08-24 16:30:00 |
| 1 | 223.104.41.121 | 2023-08-24 17:00:00 |
| 2 | 223.104.41.122 | 2023-08-24 17:05:00 |
| 3 | 223.104.41.103 | 2023-08-24 18:11:00 |
| 2 | 223.104.41.103 | 2023-08-24 19:00:00 |
| 1 | 223.104.41.104 | 2023-08-24 19:00:00 |
| 3 | 223.104.41.122 | 2023-08-24 19:07:00 |
| 1 | 223.104.41.122 | 2023-08-24 21:00:00 |
+----------+-----------------+----------------------+
1 数据准备
CREATE TABLE t6_login_log (
user_id bigint COMMENT '用户ID',
ip string COMMENT '用户登录ip地址',
time_stamp string COMMENT '登录时间'
) COMMENT '用户登录记录表';
-- 插入数据
insert overwrite table t6_login_log
values
(1,'223.104.41.101','2023-08-24 16:00:00'),
(1,'223.104.41.121','2023-08-24 17:00:00'),
(1,'223.104.41.104','2023-08-24 19:00:00'),
(1,'223.104.41.122','2023-08-24 21:00:00'),
(1,'223.104.41.122','2023-08-24 22:00:00'),
(2,'223.104.41.101','2023-08-24 07:00:00'),
(2,'223.104.41.103','2023-08-24 19:00:00'),
(2,'223.104.41.104','2023-08-24 16:30:00'),
(2,'223.104.41.122','2023-08-24 17:05:00'),
(3,'223.104.41.103','2023-08-24 18:11:00'),
(3,'223.104.41.122','2023-08-24 19:07:00'),
(3,'223.104.41.101','2023-08-24 16:02:00'),
(4,'223.104.41.126','2023-08-24 13:00:00'),
(5,'223.104.41.126','2023-08-24 11:00:00'),
(4,'223.104.41.122','2023-08-24 10:00:00');
2 数据分析
-- step1: 针对用户登录记录,按照用户ID和登录IP去重
with tmp as
(
select
user_id,
ip
from t6_login_log
group by user_id,ip
)
s





1169

被折叠的 条评论
为什么被折叠?



