通过间隔分析查看转化时长的分布情况。
假设用户在过去某个时间段内行为序列是:A1 -> A2 -> A3 -> B4 -> B5 -> B6 -> A7 -> B8
,
初始行为 A 事件,后续行为 B 事件,
以距离初始行为事件最近的规则得到,(A1 -> B4), (A7 -> B8)
以距离后续行为事件最近的规则得到,(A3 -> B4), (A7 -> B8)
如何用 Clickhouse 语言实现间隔分析?
可以使用 ASOF JOIN 找出配对记录,再对配对记录进行排序去重后得到结果。测试语句如下所示,
-- 建表语句
CREATE TABLE IF NOT EXISTS interval_test (
time DateTime,
event String,
user_id UInt64,
amount Int64,
log_id String
) ENGINE = Memory;
--- 插入数据
INSERT INTO interval_test VALUES
('2024-08-01 00:00:00', 'A', 1, 0, '1-1'),
('2024-08-01 01:00:00', 'A', 1, 0, '1-2'),
('2024-08-01 02:00:00', 'B', 1, 1, '1-3'),
('2024-08-02 03:00:00', 'B', 1, 2, '1-4'),
('2024-08-05 01:00:00', 'A', 1, 3, '1-5'),
('2024-08-05 02:00:00', 'A', 1, 4, '1-6'),
('2024-08-05 03:00:00', 'A', 1, 5, '1-7'),
('2024-08-05 04:00:00', 'B', 1, 5, '1-8'),
('2024-08-05 05:00:00', 'B', 1, 5, '1-9'),
('2024-08-05 06:00:00', 'B', 1, 5, '1-10'),
('2024-08-01 01:00:00', 'A', 2, 0, '2-1'),
('2024-08-02 02:00:00', 'A', 2, 1, '2-2'),
('2024-08-02 03:00:00', 'B', 2, 1, '2-3'),
('2024-08-10 04:00:00', 'B', 2, 2, '2-4'),
('2024-08-02 00:00:00', 'A', 3, 0, '3-1'),
('2024-08-03 01:00:00', 'B', 3, 1, '3-2'),
('2024-08-06 02:00:00', 'B', 3, 2, '3-3'),
('2024-08-10 03:00:00', 'B', 3, 3, '3-4'),
('2024-08-01 04:00:00', 'A', 4, 0, '4-1');
--- 以距离初始行为事件最近的规则
select distinct on (t2.log_id) *, date_diff('second', time, t2.time) as diff_time from (
select date_trunc('day', time) as date, 0 as event_type, * from interval_test where event = 'A'
) as t1 global asof join (
select date_trunc('day', time) as date, 1 as event_type, * from interval_test where event = 'B'
) as t2 on t1.date = t2.date and t1.user_id = t2.user_id and t1.time < t2.time
order by time
┌────────────────date─┬─event_type─┬────────────────time─┬─event─┬─user_id─┬─amount─┬─log_id─┬─────────────t2.date─┬─t2.event_type─┬─────────────t2.time─┬─t2.event─┬─t2.user_id─┬─t2.amount─┬─t2.log_id─┬─diff_time─┐
1. │ 2024-08-01 00:00:00 │ 0 │ 2024-08-01 00:00:00 │ A │ 1 │ 0 │ 1-1 │ 2024-08-01 00:00:00 │ 1 │ 2024-08-01 02:00:00 │ B │ 1 │ 1 │ 1-3 │ 7200 │
2. │ 2024-08-02 00:00:00 │ 0 │ 2024-08-02 02:00:00 │ A │ 2 │ 1 │ 2-2 │ 2024-08-02 00:00:00 │ 1 │ 2024-08-02 03:00:00 │ B │ 2 │ 1 │ 2-3 │ 3600 │
3. │ 2024-08-05 00:00:00 │ 0 │ 2024-08-05 01:00:00 │ A │ 1 │ 3 │ 1-5 │ 2024-08-05 00:00:00 │ 1 │ 2024-08-05 04:00:00 │ B │ 1 │ 5 │ 1-8 │ 10800 │
└─────────────────────┴────────────┴─────────────────────┴───────┴─────────┴────────┴────────┴─────────────────────┴───────────────┴─────────────────────┴──────────┴────────────┴───────────┴───────────┴───────────┘
--- 以距离后续行为事件最近的规则
select distinct on (t2.log_id) *, date_diff('second', time, t2.time) as diff_time from (
select date_trunc('day', time) as date, 0 as event_type, * from interval_test where event = 'A'
) as t1 global asof join (
select date_trunc('day', time) as date, 1 as event_type, * from interval_test where event = 'B'
) as t2 on t1.date = t2.date and t1.user_id = t2.user_id and t1.time < t2.time
order by time desc
┌────────────────date─┬─event_type─┬────────────────time─┬─event─┬─user_id─┬─amount─┬─log_id─┬─────────────t2.date─┬─t2.event_type─┬─────────────t2.time─┬─t2.event─┬─t2.user_id─┬─t2.amount─┬─t2.log_id─┬─diff_time─┐
1. │ 2024-08-05 00:00:00 │ 0 │ 2024-08-05 03:00:00 │ A │ 1 │ 5 │ 1-7 │ 2024-08-05 00:00:00 │ 1 │ 2024-08-05 04:00:00 │ B │ 1 │ 5 │ 1-8 │ 3600 │
2. │ 2024-08-02 00:00:00 │ 0 │ 2024-08-02 02:00:00 │ A │ 2 │ 1 │ 2-2 │ 2024-08-02 00:00:00 │ 1 │ 2024-08-02 03:00:00 │ B │ 2 │ 1 │ 2-3 │ 3600 │
3. │ 2024-08-01 00:00:00 │ 0 │ 2024-08-01 01:00:00 │ A │ 1 │ 0 │ 1-2 │ 2024-08-01 00:00:00 │ 1 │ 2024-08-01 02:00:00 │ B │ 1 │ 1 │ 1-3 │ 3600 │
└─────────────────────┴────────────┴─────────────────────┴───────┴─────────┴────────┴────────┴─────────────────────┴───────────────┴─────────────────────┴──────────┴────────────┴───────────┴───────────┴───────────┘