Table of Contents
一.需求
求每个用户的最长连续登录天数,两个日期的间隔小于或等于 3 均视为连续登录。比如 01-01 号登录,最近的下一次登录是 01-04 号,两个日期的间隔等于 3 天,因此这两个日期之间的天数都算作活跃天数,一共 4 天。
因为考虑MySQL版本问题,暂时不能使用分析函数。
二.解决方案
测试数据:
create table test_login(user_id int,login_date timestamp);
insert into test_login values (1,'2020-01-01 00:01:00');
insert into test_login values (1,'2020-01-01 00:02:00');
insert into test_login values (1,'2020-01-01 00:03:00');
insert into test_login values (1,'2020-01-02 00:03:00');
insert into test_login values (1,'2020-01-05 00:03:00');
insert into test_login values (1,'2020-01-07 00:03:00');
insert into test_login values (1,'2020-01-11 00:03:00');
insert into test_login values (1,'2020-01-12 00:03:00');
insert into test_login values (1,'2020-01-13 00:03:00');
insert into test_login values (1,'2020-01-14 00:03:00');
insert into test_login values (1,'2020-01-17 00:03:00');
insert into test_login values (1,'2020-01-18 00:03:00');
insert into test_login values (1,'2020-01-19 00:03:00');
insert into test_login values (1,'2020-01-22 00:03:00');
insert into test_login values (2,'2020-01-01 00:04:00');
insert into test_login values (2,'2020-01-01 00:05:00');
insert into test_login values (2,'2020-01-01 00:06:00');
insert into test_login values (2,'2020-01-02 00:06:00');
insert into test_login values (2,'2020-01-07 00:06:00');
insert into test_login values (2,'2020-01-11 00:06:00');
insert into test_login values (2,'2020-01-12 00:06:00');
insert into test_login values (2,'2020-01-14 00:06:00');
insert into test_login values (2,'2020-01-18 00:06:00');
insert into test_login values (2,'2020-01-19 00:06:00');
insert into test_login values (2,'2020-01-22 00:06:00');
代码
SELECT user_id,
max(diff_days) + 1 max_diff_days
from
(
SELECT user_id,
flag2,
min(login_date) min_login_date,
max(login_date) max_login_date,
datediff(max(login_date),min(login_date)) diff_days
from
(
SELECT tmp3.user_id,
tmp3.login_date,
tmp3.flag1,
sum(tmp4.flag1) flag2
from
(
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
) tmp3 -- 打标记 如上下间隔<=3 则为1 否则为0
left join
(
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
) tmp4 -- 功能同tmp3
on tmp3.user_id = tmp4.user_id
and tmp4.login_date <= tmp3.login_date
group by tmp3.user_id,tmp3.login_date,tmp3.flag1
) tmp5 -- 每一个用户 每一个连续区间设置为不同的flag 标记为flag2
group by user_id,flag2
) tmp6 -- 计算出每个连续区间 最大值和最小值 以及连续间隔天数
group by user_id
这样看起来是不是觉得非常的复杂,下面我们拆解开
2.1 同一天多次登陆去重
考虑同一天会有多次登陆的,此处用group by语句进行去重,同一天只保留一条记录
代码:
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
测试记录:
mysql> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d');
+---------+------------+
| user_id | login_date |
+---------+------------+
| 1 | 2020-01-01 |
| 1 | 2020-01-02 |
| 1 | 2020-01-05 |
| 1 | 2020-01-07 |
| 1 | 2020-01-11 |
| 1 | 2020-01-12 |
| 1 | 2020-01-13 |
| 1 | 2020-01-14 |
| 1 | 2020-01-17 |
| 1 | 2020-01-18 |
| 1 | 2020-01-19 |
| 1 | 2020-01-22 |
| 2 | 2020-01-01 |
| 2 | 2020-01-02 |
| 2 | 2020-01-07 |
| 2 | 2020-01-11 |
| 2 | 2020-01-12 |
| 2 | 2020-01-14 |
| 2 | 2020-01-18 |
| 2 | 2020-01-19 |
| 2 | 2020-01-22 |
+---------+------------+
21 rows in set (0.00 sec)
mysql>
2.2 求上一次的登陆日期
我们需要找到上一次登陆日期,与此次的登陆日期对比,才可以判断是否符合小于等于3天的条件
此处,我们可以使用标量子查询来实现
需要记录的每个用户的第一条登陆信息上一天为null
代码:
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
测试记录:
mysql> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ;
+---------+------------+---------------+
| user_id | login_date | up_login_date |
+---------+------------+---------------+
| 1 | 2020-01-01 | NULL |
| 1 | 2020-01-02 | 2020-01-01 |
| 1 | 2020-01-05 | 2020-01-02 |
| 1 | 2020-01-07 | 2020-01-05 |
| 1 | 2020-01-11 | 2020-01-07 |
| 1 | 2020-01-12 | 2020-01-11 |
| 1 | 2020-01-13 | 2020-01-12 |
| 1 | 2020-01-14 | 2020-01-13 |
| 1 | 2020-01-17 | 2020-01-14 |
| 1 | 2020-01-18 | 2020-01-17 |
| 1 | 2020-01-19 | 2020-01-18 |
| 1 | 2020-01-22 | 2020-01-19 |
| 2 | 2020-01-01 | NULL |
| 2 | 2020-01-02 | 2020-01-01 |
| 2 | 2020-01-07 | 2020-01-02 |
| 2 | 2020-01-11 | 2020-01-07 |
| 2 | 2020-01-12 | 2020-01-11 |
| 2 | 2020-01-14 | 2020-01-12 |
| 2 | 2020-01-18 | 2020-01-14 |
| 2 | 2020-01-19 | 2020-01-18 |
| 2 | 2020-01-22 | 2020-01-19 |
+---------+------------+---------------+
21 rows in set (0.00 sec)
2.3 打标记判断是否间隔天数<=3
求出上一个登陆日期,与本次登陆日期比较,如果<=3,则标记为0,否则标记为1
需要注意的是上一步的空值直接标记为1
这样通过flag1 我们就可以看到连续登陆的区间了
代码:
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
测试记录:
mysql> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ;
+---------+------------+-------+
| user_id | login_date | flag1 |
+---------+------------+-------+
| 1 | 2020-01-01 | 1 |
| 1 | 2020-01-02 | 0 |
| 1 | 2020-01-05 | 0 |
| 1 | 2020-01-07 | 0 |
| 1 | 2020-01-11 | 1 |
| 1 | 2020-01-12 | 0 |
| 1 | 2020-01-13 | 0 |
| 1 | 2020-01-14 | 0 |
| 1 | 2020-01-17 | 0 |
| 1 | 2020-01-18 | 0 |
| 1 | 2020-01-19 | 0 |
| 1 | 2020-01-22 | 0 |
| 2 | 2020-01-01 | 1 |
| 2 | 2020-01-02 | 0 |
| 2 | 2020-01-07 | 1 |
| 2 | 2020-01-11 | 1 |
| 2 | 2020-01-12 | 0 |
| 2 | 2020-01-14 | 0 |
| 2 | 2020-01-18 | 1 |
| 2 | 2020-01-19 | 0 |
| 2 | 2020-01-22 | 0 |
+---------+------------+-------+
21 rows in set (0.01 sec)
2.4 打分组的标记
因为同一个用户存在多个连续登陆的区间,根据上一个步骤的flag1没办法区分开,此时需要区分开同一个用户的不同连续登陆区间。
因为MySQL版本不支持分析函数,只能通过临时表表连接的方式实现,再考虑去除重复,需要用到分组语句进行去重。
其实flag2是这个解决方案核心所在,同一个用户第一次连续登陆区间标记为1,第二次则累加为2,以此类推。
代码:
SELECT tmp3.user_id,
tmp3.login_date,
tmp3.flag1,
sum(tmp4.flag1) flag2
from
(
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
) tmp3 -- 打标记 如上下间隔<=3 则为1 否则为0
left join
(
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
) tmp4 -- 功能同tmp3
on tmp3.user_id = tmp4.user_id
and tmp4.login_date <= tmp3.login_date
group by tmp3.user_id,tmp3.login_date,tmp3.flag1
测试记录:
mysql> SELECT tmp3.user_id,
-> tmp3.login_date,
-> tmp3.flag1,
-> sum(tmp4.flag1) flag2
-> from
-> (
-> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ) tmp3 -- 打标记 如上下间隔<=3 则为1 否则为0
-> left join
-> (
-> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ) tmp4 -- 功能同tmp3
-> on tmp3.user_id = tmp4.user_id
-> and tmp4.login_date <= tmp3.login_date
-> group by tmp3.user_id,tmp3.login_date,tmp3.flag1;
+---------+------------+-------+-------+
| user_id | login_date | flag1 | flag2 |
+---------+------------+-------+-------+
| 1 | 2020-01-01 | 1 | 1 |
| 1 | 2020-01-02 | 0 | 1 |
| 1 | 2020-01-05 | 0 | 1 |
| 1 | 2020-01-07 | 0 | 1 |
| 1 | 2020-01-11 | 1 | 2 |
| 1 | 2020-01-12 | 0 | 2 |
| 1 | 2020-01-13 | 0 | 2 |
| 1 | 2020-01-14 | 0 | 2 |
| 1 | 2020-01-17 | 0 | 2 |
| 1 | 2020-01-18 | 0 | 2 |
| 1 | 2020-01-19 | 0 | 2 |
| 1 | 2020-01-22 | 0 | 2 |
| 2 | 2020-01-01 | 1 | 1 |
| 2 | 2020-01-02 | 0 | 1 |
| 2 | 2020-01-07 | 1 | 2 |
| 2 | 2020-01-11 | 1 | 3 |
| 2 | 2020-01-12 | 0 | 3 |
| 2 | 2020-01-14 | 0 | 3 |
| 2 | 2020-01-18 | 1 | 4 |
| 2 | 2020-01-19 | 0 | 4 |
| 2 | 2020-01-22 | 0 | 4 |
+---------+------------+-------+-------+
21 rows in set (0.01 sec)
2.5 分组求最大的间隔天数
其实有了上一个步骤的flag2,就可以判断区分开每一个用户的每一个连续登陆区间,直接进行分组就可以求出该连续区间 最大和最小的登陆日期,两个日期差值就是间隔天数。最后根据user_id进行分组,求最大的间隔天数就是最终结果。
需要审题 01-01到01-04 间隔是3天,但是算4天连续登陆 所以datediff函数的结果要+1
代码:
SELECT user_id,
max(diff_days) +1 max_diff_days
from
(
SELECT user_id,
flag2,
min(login_date) min_login_date,
max(login_date) max_login_date,
datediff(max(login_date),min(login_date)) diff_days
from
(
SELECT tmp3.user_id,
tmp3.login_date,
tmp3.flag1,
sum(tmp4.flag1) flag2
from
(
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
) tmp3 -- 打标记 如上下间隔<=3 则为1 否则为0
left join
(
SELECT user_id,
login_date,
case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
from
(
SELECT user_id,
login_date,
(select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
from
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
) tmp2 -- 通过标量子查询实现上一天的登陆日期
) tmp4 -- 功能同tmp3
on tmp3.user_id = tmp4.user_id
and tmp4.login_date <= tmp3.login_date
group by tmp3.user_id,tmp3.login_date,tmp3.flag1
) tmp5 -- 每一个用户 每一个连续区间设置为不同的flag 标记为flag2
group by user_id,flag2
) tmp6 -- 计算出每个连续区间 最大值和最小值 以及连续间隔天数
group by user_id
测试记录:
mysql> SELECT user_id,
-> flag2,
-> min(login_date) min_login_date,
-> max(login_date) max_login_date,
-> datediff(max(login_date),min(login_date)) diff_days
-> from
-> (
-> SELECT tmp3.user_id,
-> tmp3.login_date,
-> tmp3.flag1,
-> sum(tmp4.flag1) flag2
-> from
-> (
-> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ) tmp3 -- 打标记 如上下间隔<=3 则为1 否则为0
-> left join
-> (
-> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ) tmp4 -- 功能同tmp3
-> on tmp3.user_id = tmp4.user_id
-> and tmp4.login_date <= tmp3.login_date
-> group by tmp3.user_id,tmp3.login_date,tmp3.flag1
-> ) tmp5 -- 每一个用户 每一个连续区间设置为不同的flag 标记为flag2
-> group by user_id,flag2;
+---------+-------+----------------+----------------+-----------+
| user_id | flag2 | min_login_date | max_login_date | diff_days |
+---------+-------+----------------+----------------+-----------+
| 1 | 1 | 2020-01-01 | 2020-01-07 | 6 |
| 1 | 2 | 2020-01-11 | 2020-01-22 | 11 |
| 2 | 1 | 2020-01-01 | 2020-01-02 | 1 |
| 2 | 2 | 2020-01-07 | 2020-01-07 | 0 |
| 2 | 3 | 2020-01-11 | 2020-01-14 | 3 |
| 2 | 4 | 2020-01-18 | 2020-01-22 | 4 |
+---------+-------+----------------+----------------+-----------+
6 rows in set (0.01 sec)
mysql>
mysql> SELECT user_id,
-> max(diff_days) +1 max_diff_days
-> from
-> (
-> SELECT user_id,
-> flag2,
-> min(login_date) min_login_date,
-> max(login_date) max_login_date,
-> datediff(max(login_date),min(login_date)) diff_days
-> from
-> (
-> SELECT tmp3.user_id,
-> tmp3.login_date,
-> tmp3.flag1,
-> sum(tmp4.flag1) flag2
-> from
-> (
-> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ) tmp3 -- 打标记 如上下间隔<=3 则为1 否则为0
-> left join
-> (
-> SELECT user_id,
-> login_date,
-> case when up_login_date is null then 1 else case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end end as flag1
-> from
-> (
-> SELECT user_id,
-> login_date,
-> (select max(date_format(login_date,'%Y-%m-%d')) from test_login t2 where t2.user_id = tmp1.user_id and t2.login_date < tmp1.login_date ) as up_login_date
-> from
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ) tmp1 -- tmp1表 分组去重 一天多次登陆的算一条
-> ) tmp2 -- 通过标量子查询实现上一天的登陆日期
-> ) tmp4 -- 功能同tmp3
-> on tmp3.user_id = tmp4.user_id
-> and tmp4.login_date <= tmp3.login_date
-> group by tmp3.user_id,tmp3.login_date,tmp3.flag1
-> ) tmp5 -- 每一个用户 每一个连续区间设置为不同的flag 标记为flag2
-> group by user_id,flag2
-> ) tmp6 -- 计算出每个连续区间 最大值和最小值 以及连续间隔天数
-> group by user_id
-> ;
+---------+---------------+
| user_id | max_diff_days |
+---------+---------------+
| 1 | 12 |
| 2 | 5 |
+---------+---------------+
2 rows in set (0.00 sec)
mysql>
三.MySQL 8.0写法
MySQL 8.0的with语句以及分析窗口函数,可以使上面的解决方案的代码简洁度大大提升
代码:
with tmp1 AS
-- tmp1临时表 一天多次登陆算一次
(
SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
from test_login
group by user_id,date_format(login_date,'%Y-%m-%d')
),
tmp2 as
-- tmp2临时表 通过lag获取上一次登陆时间 如是第一条给默认值
(
select user_id,
login_date,
lag(login_date,1,'1900-01-01') over(partition by user_id order by login_date) up_login_date
from tmp1
),
tmp3 AS
-- 判断是否符合3天内标准 打标记flag1
(
select user_id,
login_date,
case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end as flag1
from tmp2
),
tmp4 AS
-- 通过分析函数将每个用户 每一个连续登陆期间进行标记 flag2
(
select user_id,
login_date,
flag1,
sum(flag1) over(partition by user_id order by login_date) as flag2
from tmp3
),
tmp5 AS
-- 通过user_id flag2进行分组
(
SELECT user_id,
flag2,
min(login_date) min_login_date,
max(login_date) max_login_date,
datediff(max(login_date),min(login_date)) diff_days
from tmp4
group by user_id,flag2
)
SELECT user_id,
max(diff_days) + 1 as diff_days
from tmp5
group by user_id;
测试记录:
mysql> select version();
+-----------+
| version() |
+-----------+
| 8.0.19 |
+-----------+
1 row in set (0.00 sec)
mysql> with tmp1 AS
-> -- tmp1临时表 一天多次登陆算一次
-> (
-> SELECT user_id,date_format(login_date,'%Y-%m-%d') login_date
-> from test_login
-> group by user_id,date_format(login_date,'%Y-%m-%d')
-> ),
-> tmp2 as
-> -- tmp2临时表 通过lag获取上一次登陆时间 如是第一条给默认值
-> (
-> select user_id,
-> login_date,
-> lag(login_date,1,'1900-01-01') over(partition by user_id order by login_date) up_login_d
-> from tmp1
-> ),
-> tmp3 AS
-> -- 判断是否符合3天内标准 打标记flag1
-> (
-> select user_id,
-> login_date,
-> case when DATEDIFF(login_date,up_login_date) <= 3 then 0 else 1 end as flag1
-> from tmp2
-> ),
-> tmp4 AS
-> -- 通过分析函数将每个用户 每一个连续登陆期间进行标记 flag2
-> (
-> select user_id,
-> login_date,
-> flag1,
-> sum(flag1) over(partition by user_id order by login_date) as flag2
-> from tmp3
-> ),
-> tmp5 AS
-> -- 通过user_id flag2进行分组
-> (
-> SELECT user_id,
-> flag2,
-> min(login_date) min_login_date,
-> max(login_date) max_login_date,
-> datediff(max(login_date),min(login_date)) diff_days
-> from tmp4
-> group by user_id,flag2
-> )
-> SELECT user_id,
-> max(diff_days) + 1 as diff_days
-> from tmp5
-> group by user_id;
+---------+-----------+
| user_id | diff_days |
+---------+-----------+
| 1 | 12 |
| 2 | 5 |
+---------+-----------+
2 rows in set (0.00 sec)