# SQL重叠时段的合并与跨天日期的拆分
假如我们有一张用户行为记录的日志表,表中记录有时间段信息,该如何**消除重复数据**并**统计日活**呢?
现有**书籍阅读日志表**如下,字段含义分别为**用户id**,**书籍id**以及**阅读记录的起止时间戳(毫秒值)**。
| user_id | book_id | start_time | end_time |
| ------- | ------- | ------------- | ------------- |
| 1 | 1 | 1614007373869 | 1614009602660 |
| 1 | 1 | 1614009607510 | 1614009910194 |
| 1 | 2 | 1614009409869 | 1614009604402 |
| 1 | 3 | 1614009536068 | 1614009606827 |
| 2 | 1 | 1614009603253 | 1614009998379 |
| 2 | 2 | 1614009602662 | 1614009999169 |
| 3 | 3 | 1614009409869 | 1614009504402 |
| 3 | 3 | 1614009500069 | 1614009600020 |
| 3 | 3 | 1614009459869 | 1614009604402 |
| 3 | 4 | 1614009636068 | 1614010606827 |
| 4 | 5 | 1614009687535 | 1614011620832 |
| 5 | 5 | 1614007310446 | 1614009622109 |
<br/>
## 准备数据
```sql
CREATE TABLE logs
(
`book_id` INT comment '用户ID',
`user_id` INT comment '书品ID',
`start_time` BIGINT comment '该次阅读起始毫秒时间',
`end_time` BIGINT comment '该次阅读结束毫秒时间'
)
comment '阅读日志表';
```
```sql
INSERT INTO logs
VALUES (1, 1, 1614007373869, 1614009602660),
(1, 1, 1614009607510, 1614009910194),
(1, 2, 1614009409869, 1614009604402),
(1, 3, 1614009536068, 1614009606827),
(2, 1, 1614009603253, 1614009998379),
(2, 2, 1614009602662, 1614009999169),
(3, 3, 1614009409869, 1614009504402),
(3, 3, 1614009500069, 1614009600020),
(3, 3, 1614009459869, 1614009604402),
(3, 4, 1614009636068, 1614010606827),
(4, 5, 1614009687535, 1614011620832),
(5, 5, 1614007310446, 1614009622109);
```
查询数据
```sql
SELECT `user_id`,
`book_id`,
From_unixtime(`start_time` / 1000) start_time,
From_unixtime(`end_time` / 1000) end_time
FROM logs;
```
![原始数据.png](https://www.concoding.com/upload/2021/03/%E5%8E%9F%E5%A7%8B%E6%95%B0%E6%8D%AE-adbd2e8c6b564790b59231023916a21e.png)
<br/>
## 合并重叠时间段
每个用户可能存在多条记录重叠的情况,第一步将可能存在的重叠情况进行合并。
query c1
```sql
--合并重复记录结果集记为C1
SELECT `user_id`, -- 按分组查询时间段边界值
`book_id`,
MIN(`start_time`) start_time,
MIN(`end_time`) end_time
FROM (SELECT `user_id`, -- 对原始数据集按是否为重叠的同一时间段进行分组,标记flag
`book_id`,
`start_time`,
`end_time`,
SUM(`broken`) OVER (PARTITION BY `user_id`, `book_id` ORDER BY `start_time`, `end_time`) flag
FROM (SELECT t.*, -- 按是否包含时间段对原始数据划分(类似于打断点,可以单独执行此query查看结果集)
(CASE
WHEN `start_time` <= MAX(`end_time`)
OVER (PARTITION BY `user_id`, `book_id` ORDER BY `start_time`,
`end_time` ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING)
THEN 0
ELSE 1
END) AS broken
FROM (SELECT `user_id`, -- 原始数据
`book_id`,
`start_time`,
`end_time`
FROM logs) t) t) t
GROUP BY `user_id`,
`book_id`,
`flag`
```
## 按天拆分数据
query c2
```sql
--拆分C1跨天日期
SELECT `user_id`, `book_id`, `start_time`, `end_time` -- union结果
FROM c1
WHERE date(from_unixtime(`start_time` / 1000)) = date(from_unixtime(`end_time` / 1000)) -- 不跨天
UNION ALL
SELECT `user_id`, -- 对子查询结果集进行进行起始日期的更新
`book_id`,
CASE
WHEN `id` = 1 THEN `start_time`
ELSE (unix_timestamp(date(from_unixtime(`start_time` / 1000) + `id` - 1)) * 1000)
END,
CASE
WHEN `id` = `m2` THEN `end_time`
ELSE ((unix_timestamp(date(from_unixtime(`end_time` / 1000)) + `id`) - 1) * 1000)
END
FROM (
SELECT `user_id`, -- 拆分步骤,拆分并标记C1中跨天的每条记录所在跨越的第几天(从该记录起始Date开始)
`book_id`,
`start_time`,
`end_time`,
`id`,
MAX(`id`) OVER (PARTITION BY `user_id`, `book_id`, `start_time` ) AS m2 -- 跨越最长的天数
FROM c1,
nums -- 辅助表 意为跨越天数
WHERE date(from_unixtime(`start_time` / 1000)) <> date(from_unixtime(`end_time` / 1000)) -- 跨天
AND `id` <=
datediff(date(from_unixtime(`end_time` / 1000)), date(from_unixtime(`start_time` / 1000))) +
1 -- 跨了几天
) t1
```
## 查询连续阅读时长数据
```sql
SELECT c2.`user_id`,
c2.`book_id`,
sum(c2.`end_time` - c2.`start_time`) `read_time(ms)`,
date(from_unixtime(`end_time` / 1000)) date
FROM c2
GROUP BY c2.`user_id`,
c2.`book_id`,
date(from_unixtime(`end_time` / 1000));
```
结果集:
![结果集.png](https://www.concoding.com/upload/2021/03/%E7%BB%93%E6%9E%9C%E9%9B%86-38b4a8686fca45f88de571640d68437b.png)
> 文中所用nums表为辅助表。
> create table nums(id);
> insert into nums values (1),(2),(3),(4),(5),(6),(7),(8),(9),(10)……;
> 注意:不同数据库中使用的时间日期处理函数可能不同,本文测试环境为MySQL;Hive中所用函数略有不同。
### 完整SQL
```sql
WITH c1 AS
(
SELECT `user_id`, -- 按分组查询时间段边界值
`book_id`,
min(`start_time`) start_time,
min(`end_time`) end_time
FROM (
SELECT `user_id`, -- 对原始数据集按是否为重叠的同一时间段进行分组,标记flag
`book_id`,
`start_time`,
`end_time`,
SUM(`broken`) over (PARTITION BY `user_id`, `book_id` ORDER BY `start_time`, `end_time`) flag
FROM (
SELECT t.*, -- 按是否包含时间段对原始数据划分(类似于打断点,可以单独执行此query查看结果集)
(
CASE
WHEN `start_time` <= max(`end_time`) over (PARTITION BY `user_id`, `book_id` ORDER BY `start_time`, `end_time` ROWS BETWEEN unbounded preceding AND 1 preceding) THEN 0
ELSE 1
END) AS broken
FROM (
SELECT `user_id`, -- 原始数据
`book_id`,
`start_time`,
`end_time`
FROM logs) t) t) t
GROUP BY `user_id`,
`book_id`,
`flag`), c2 AS
(
SELECT *
FROM (
SELECT `user_id`,
`book_id`,
`start_time`,
`end_time` -- union结果
FROM c1
WHERE DATE(from_unixtime(`start_time` / 1000)) = DATE(from_unixtime(`end_time` / 1000)) -- 不跨天
UNION ALL
SELECT `user_id`, -- 对子查询结果集进行进行起始日期的更新
`book_id`,
CASE
WHEN `id` = 1 THEN `start_time`
ELSE (unix_timestamp(DATE(from_unixtime(`start_time` / 1000) + `id` - 1)) * 1000)
END,
CASE
WHEN `id` = `m2` THEN `end_time`
ELSE ((unix_timestamp(DATE(from_unixtime(`end_time` / 1000)) + `id`) - 1) * 1000)
END
FROM (
SELECT `user_id`, -- 拆分步骤,拆分并标记C1中跨天的每条记录所在跨越的第几天(从该记录起始Date开始)
`book_id`,
`start_time`,
`end_time`,
`id`,
max(`id`) over (PARTITION BY `user_id`, `book_id`, `start_time` ) AS m2 -- 跨越最长的天数
FROM c1,
nums -- 辅助表 意为跨越天数
WHERE DATE(from_unixtime(`start_time` / 1000)) <> DATE(from_unixtime(`end_time` / 1000)) -- 跨天
AND `id` <= datediff(DATE(from_unixtime(`end_time` / 1000)), DATE(from_unixtime(`start_time` / 1000))) + 1 -- 跨了几天
) t1 ) t1)
SELECT c2.`user_id`,
c2.`book_id`,
SUM(c2.`end_time` - c2.`start_time`) `read_time(ms)`,
DATE(from_unixtime(`end_time` / 1000)) DATE
FROM c2
GROUP BY c2.`user_id`,
c2.`book_id`,
DATE(from_unixtime(`end_time` / 1000));
```
SQL重叠时段的合并与跨天日期的拆分问题