通过间隔分析查看转化时长的分布情况。

假设用户在过去某个时间段内行为序列是:A1 -> A2 -> A3 -> B4 -> B5 -> B6 -> A7 -> B8
初始行为 A 事件,后续行为 B 事件,
以距离初始行为事件最近的规则得到,(A1 -> B4), (A7 -> B8)
以距离后续行为事件最近的规则得到,(A3 -> B4), (A7 -> B8)

如何用 Clickhouse 语言实现间隔分析?

可以使用 ASOF JOIN 找出配对记录,再对配对记录进行排序去重后得到结果。测试语句如下所示,

-- 建表语句
CREATE TABLE IF NOT EXISTS interval_test (
  time DateTime,
  event String,
  user_id UInt64,
  amount Int64,
  log_id String
) ENGINE = Memory;


--- 插入数据
INSERT INTO interval_test VALUES
('2024-08-01 00:00:00', 'A', 1, 0, '1-1'),
('2024-08-01 01:00:00', 'A', 1, 0, '1-2'),
('2024-08-01 02:00:00', 'B', 1, 1, '1-3'),
('2024-08-02 03:00:00', 'B', 1, 2, '1-4'),
('2024-08-05 01:00:00', 'A', 1, 3, '1-5'),
('2024-08-05 02:00:00', 'A', 1, 4, '1-6'),
('2024-08-05 03:00:00', 'A', 1, 5, '1-7'),
('2024-08-05 04:00:00', 'B', 1, 5, '1-8'),
('2024-08-05 05:00:00', 'B', 1, 5, '1-9'),
('2024-08-05 06:00:00', 'B', 1, 5, '1-10'),
('2024-08-01 01:00:00', 'A', 2, 0, '2-1'),
('2024-08-02 02:00:00', 'A', 2, 1, '2-2'),
('2024-08-02 03:00:00', 'B', 2, 1, '2-3'),
('2024-08-10 04:00:00', 'B', 2, 2, '2-4'),
('2024-08-02 00:00:00', 'A', 3, 0, '3-1'),
('2024-08-03 01:00:00', 'B', 3, 1, '3-2'),
('2024-08-06 02:00:00', 'B', 3, 2, '3-3'),
('2024-08-10 03:00:00', 'B', 3, 3, '3-4'),
('2024-08-01 04:00:00', 'A', 4, 0, '4-1');

--- 以距离初始行为事件最近的规则
select distinct on (t2.log_id) *, date_diff('second', time, t2.time) as diff_time from (
  select date_trunc('day', time) as date, 0 as event_type, * from interval_test where event = 'A'
) as t1 global asof join (
  select date_trunc('day', time) as date, 1 as event_type, * from interval_test where event = 'B'
) as t2 on t1.date = t2.date and t1.user_id = t2.user_id and t1.time < t2.time
order by time
   ┌────────────────date─┬─event_type─┬────────────────time─┬─event─┬─user_id─┬─amount─┬─log_id─┬─────────────t2.date─┬─t2.event_type─┬─────────────t2.time─┬─t2.event─┬─t2.user_id─┬─t2.amount─┬─t2.log_id─┬─diff_time─┐
1. │ 2024-08-01 00:00:0002024-08-01 00:00:00 │ A     │       101-12024-08-01 00:00:0012024-08-01 02:00:00 │ B        │          111-372002. │ 2024-08-02 00:00:0002024-08-02 02:00:00 │ A     │       212-22024-08-02 00:00:0012024-08-02 03:00:00 │ B        │          212-336003. │ 2024-08-05 00:00:0002024-08-05 01:00:00 │ A     │       131-52024-08-05 00:00:0012024-08-05 04:00:00 │ B        │          151-810800 │
   └─────────────────────┴────────────┴─────────────────────┴───────┴─────────┴────────┴────────┴─────────────────────┴───────────────┴─────────────────────┴──────────┴────────────┴───────────┴───────────┴───────────┘

--- 以距离后续行为事件最近的规则
select distinct on (t2.log_id) *, date_diff('second', time, t2.time) as diff_time from (
  select date_trunc('day', time) as date, 0 as event_type, * from interval_test where event = 'A'
) as t1 global asof join (
  select date_trunc('day', time) as date, 1 as event_type, * from interval_test where event = 'B'
) as t2 on t1.date = t2.date and t1.user_id = t2.user_id and t1.time < t2.time
order by time desc
   ┌────────────────date─┬─event_type─┬────────────────time─┬─event─┬─user_id─┬─amount─┬─log_id─┬─────────────t2.date─┬─t2.event_type─┬─────────────t2.time─┬─t2.event─┬─t2.user_id─┬─t2.amount─┬─t2.log_id─┬─diff_time─┐
1. │ 2024-08-05 00:00:0002024-08-05 03:00:00 │ A     │       151-72024-08-05 00:00:0012024-08-05 04:00:00 │ B        │          151-836002. │ 2024-08-02 00:00:0002024-08-02 02:00:00 │ A     │       212-22024-08-02 00:00:0012024-08-02 03:00:00 │ B        │          212-336003. │ 2024-08-01 00:00:0002024-08-01 01:00:00 │ A     │       101-22024-08-01 00:00:0012024-08-01 02:00:00 │ B        │          111-33600 │
   └─────────────────────┴────────────┴─────────────────────┴───────┴─────────┴────────┴────────┴─────────────────────┴───────────────┴─────────────────────┴──────────┴────────────┴───────────┴───────────┴───────────┘