某个项目下,以 角色创建 为初始事件,角色登录 为回访事件,同时展示回访用户的 付费 事件的 阶段累计人均 充值金额,且仅显示
1日,2日,6日,13日,29日,59日,119日,179日,239日关键日期 的 SQL 查询示例
旧查询
WITH range(-1, 239 + 1) AS date_interval_nums,
arrayEnumerate(date_interval_nums) AS date_interval_nums_enumerate
SELECT
init_date,
toString(init_date) AS init_date_str,
loss_data,
array(
retention_data_all [1],
retention_data_all [3],
retention_data_all [4],
retention_data_all [8],
retention_data_all [15],
retention_data_all [31],
retention_data_all [61],
retention_data_all [121],
retention_data_all [181],
retention_data_all [241]
) AS retention_data,
array(
retention_amount_all [1],
retention_amount_all [3],
retention_amount_all [4],
retention_amount_all [8],
retention_amount_all [15],
retention_amount_all [31],
retention_amount_all [61],
retention_amount_all [121],
retention_amount_all [181],
retention_amount_all [241]
) AS retention_amount,
array(
total_retention_amount_all [1],
total_retention_amount_all [3],
total_retention_amount_all [4],
total_retention_amount_all [8],
total_retention_amount_all [15],
total_retention_amount_all [31],
total_retention_amount_all [61],
total_retention_amount_all [121],
total_retention_amount_all [181],
total_retention_amount_all [241]
) AS total_retention_amount,
toString(retention_data) AS retention_data_str,
toJSONString(loss_data) AS loss_data_str,
toJSONString(retention_amount) AS retention_amount_str,
toJSONString(total_retention_amount) AS total_retention_amount_str
FROM
(
SELECT
init_date,
retention_data_all,
loss_data,
arrayMap(
x -> round(
x / if(
retention_data_all [1] = 0,
1,
retention_data_all [1]
),
2
),
arrayCumSum(base_retention_amount_all)
) AS retention_amount_all,
arrayCumSum(base_retention_amount_all) AS total_retention_amount_all
FROM
(
SELECT
init_date,
groupArray(r) AS r_list,
groupArray(a) AS a_list,
arrayMap(
i -> arraySum(arrayMap(x -> x [i], r_list)),
range(1, length(r_list [1]) + 1)
) AS retention_data_all,
arrayMap(
i -> round(arraySum(arrayMap(x -> x [i], a_list)), 2),
range(1, length(a_list [1]) + 1)
) AS base_retention_amount_all,
arrayReduce(
'sumMap',
arrayMap(x -> map(x, 1), groupArray(loss_date_count))
) AS loss_data
FROM
(
SELECT
t1.date_time AS init_date,
t1.virtual_user_id AS virtual_user_id,
mapFromArrays(
return_dates,
arrayWithConstant(length(return_dates), 1)
) AS return_dates_map,
arrayMap(
x -> if(
x == -1,
1,
mapContains(return_dates_map, date_add(DAY, x, init_date))
),
date_interval_nums
) AS r,
coalesce(
date_diff(
'day',
init_date,
arrayFirstOrNull(x -> x > init_date, return_dates)
),
999999
) - 1 AS loss_date_count,
date_amounts_map,
arrayMap(
x -> multiIf(
x == 1,
0,
r [x],
date_amounts_map [date_add(DAY, date_interval_nums[x],init_date ) ],
0
),
date_interval_nums_enumerate
) AS a
FROM
(
SELECT
DISTINCT ON (date_time, virtual_user_id) date_time,
virtual_user_id
FROM
(
SELECT
toDateTime(date_trunc('day', `vpc@timezone_#time`)) AS date_time,
toString(`#user_id`) AS virtual_user_id,
`#time`
FROM
(
SELECT
*
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
`vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
timestamp_add(
fromUnixTimestamp64Milli(`#time`),
interval 8 * 60 minute
) AS `vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
events
WHERE
(`#event` IN ('#user_create'))
AND (
(
`#dt` BETWEEN '2025-11-07'
AND '2025-12-08'
)
)
) AS event
)
WHERE
(
(
`#dt` BETWEEN '2025-11-07'
AND '2025-12-08'
AND `vpc@timezone_#time` BETWEEN '2025-11-08 00:00:00.000'
AND '2025-12-07 23:59:59.999'
)
)
AND (`#data_lifecycle` = '0')
AND (`#event` IN ('#user_create'))
)
WHERE
virtual_user_id != ''
AND virtual_user_id IS NOT NULL
)
ORDER BY
`#time`
) AS t1 GLOBAL
LEFT JOIN (
SELECT
toString(`#user_id`) AS virtual_user_id,
arraySort(
groupUniqArray(
toDateTime(date_trunc('day', `vpc@timezone_#time`))
)
) AS return_dates
FROM
(
SELECT
*
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
`vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
timestamp_add(
fromUnixTimestamp64Milli(`#time`),
interval 8 * 60 minute
) AS `vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
events
WHERE
(`#event` IN ('#user_login'))
AND (
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
)
)
) AS event
)
WHERE
(
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
AND `vpc@timezone_#time` BETWEEN '2025-11-08 00:00:00.000'
AND '2026-08-04 23:59:59.999'
)
)
AND (`#data_lifecycle` = '0')
AND (`#event` IN ('#user_login'))
)
WHERE
virtual_user_id != ''
AND virtual_user_id IS NOT NULL
GROUP BY
virtual_user_id
) AS t2 ON t1.virtual_user_id = t2.virtual_user_id GLOBAL
LEFT JOIN (
SELECT
virtual_user_id,
cast(
(groupArray(date_time), groupArray(amount)) AS Map(Datetime, Float64)
) AS date_amounts_map
FROM
(
SELECT
toDateTime(date_trunc('day', `vpc@timezone_#time`)) AS date_time,
toString(`#user_id`) AS virtual_user_id,
toFloat64(coalesce(SUM(`#vp@amount_yuan`), 0)) AS amount
FROM
(
SELECT
*,
CAST((`#amount` / 100) AS Nullable(Float64)) AS `#vp@amount_yuan`
FROM
(
SELECT
`#amount`,
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
`vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
(
SELECT
`#amount`,
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
timestamp_add(
fromUnixTimestamp64Milli(`#time`),
interval 8 * 60 minute
) AS `vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
events
WHERE
(`#event` IN ('#charge'))
AND (
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
)
)
) AS event
)
WHERE
(
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
AND `vpc@timezone_#time` BETWEEN '2025-11-08 00:00:00.000'
AND '2026-08-04 23:59:59.999'
)
)
AND (`#data_lifecycle` = '0')
AND (`#event` IN ('#charge'))
)
WHERE
virtual_user_id != ''
AND virtual_user_id IS NOT NULL
GROUP BY
date_time,
virtual_user_id
)
GROUP BY
virtual_user_id
) AS t3 ON t1.virtual_user_id = t3.virtual_user_id
)
GROUP BY
init_date
)
)
ORDER BY
init_date,
retention_data [1] DESC
LIMIT
100000
settings use_query_cache = 0, enable_writes_to_query_cache = 0;慢的原因
聚合阶段使用了太多中间数组,导致处理很慢。
在最小改动的情况下,去掉中间层数组聚合处理,查询可以从 600 秒提升到 6 秒内。
新查询
WITH range(-1, 239 + 1) AS date_interval_nums,
arrayEnumerate(date_interval_nums) AS date_interval_nums_enumerate
SELECT
init_date,
toString(init_date) AS init_date_str,
loss_data,
retention_data_all AS retention_data,
retention_amount_all AS retention_amount,
total_retention_amount_all AS total_retention_amount,
toString(retention_data) AS retention_data_str,
toJSONString(loss_data) AS loss_data_str,
toJSONString(retention_amount) AS retention_amount_str,
toJSONString(total_retention_amount) AS total_retention_amount_str
FROM
(
SELECT
init_date,
retention_data_all,
loss_data,
arrayMap(
x -> round(
x / if(
retention_data_all [1] = 0,
1,
retention_data_all [1]
),
2
),
base_retention_amount_all
) AS retention_amount_all,
base_retention_amount_all AS total_retention_amount_all
FROM
(
SELECT
init_date,
[
count(),
countIf(mapContains(return_dates_map, date_add(DAY, 1, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 2, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 6, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 13, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 29, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 59, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 119, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 179, init_date))),
countIf(mapContains(return_dates_map, date_add(DAY, 239, init_date)))
] AS retention_data_all,
[
0,
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 1, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 2, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 6, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 13, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 29, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 59, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 119, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 179, init_date), date_amounts_map)))),
sum(arraySum(mapValues(mapFilter((k, v) -> k >= init_date AND k <= date_add(DAY, 239, init_date), date_amounts_map))))
] AS base_retention_amount_all,
arrayReduce(
'sumMap',
arrayMap(x -> map(x, 1), groupArray(loss_date_count))
) AS loss_data
FROM
(
SELECT
t1.date_time AS init_date,
t1.virtual_user_id AS virtual_user_id,
return_dates,
mapFromArrays(return_dates, arrayWithConstant(length(return_dates), 1)) AS return_dates_map,
coalesce(
date_diff(
'day',
init_date,
arrayFirstOrNull(x -> x > init_date, return_dates)
),
999999
) - 1 AS loss_date_count,
date_amounts_map
FROM
(
SELECT
DISTINCT ON (date_time, virtual_user_id) date_time,
virtual_user_id
FROM
(
SELECT
toDateTime(date_trunc('day', `vpc@timezone_#time`)) AS date_time,
toString(`#user_id`) AS virtual_user_id,
`#time`
FROM
(
SELECT
*
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
`vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
timestamp_add(
fromUnixTimestamp64Milli(`#time`),
interval 8 * 60 minute
) AS `vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
events
WHERE
(`#event` IN ('#user_create'))
AND (
(
`#dt` BETWEEN '2025-11-07'
AND '2025-12-08'
)
)
) AS event
)
WHERE
(
(
`#dt` BETWEEN '2025-11-07'
AND '2025-12-08'
AND `vpc@timezone_#time` BETWEEN '2025-11-08 00:00:00.000'
AND '2025-12-07 23:59:59.999'
)
)
AND (`#data_lifecycle` = '0')
AND (`#event` IN ('#user_create'))
)
WHERE
virtual_user_id != ''
AND virtual_user_id IS NOT NULL
)
ORDER BY
`#time`
) AS t1 GLOBAL
LEFT JOIN (
SELECT
toString(`#user_id`) AS virtual_user_id,
arraySort(
groupUniqArray(
toDateTime(date_trunc('day', `vpc@timezone_#time`))
)
) AS return_dates
FROM
(
SELECT
*
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
`vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
(
SELECT
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
timestamp_add(
fromUnixTimestamp64Milli(`#time`),
interval 8 * 60 minute
) AS `vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
events
WHERE
(`#event` IN ('#user_login'))
AND (
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
)
)
) AS event
)
WHERE
(
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
AND `vpc@timezone_#time` BETWEEN '2025-11-08 00:00:00.000'
AND '2026-08-04 23:59:59.999'
)
)
AND (`#data_lifecycle` = '0')
AND (`#event` IN ('#user_login'))
)
WHERE
virtual_user_id != ''
AND virtual_user_id IS NOT NULL
GROUP BY
virtual_user_id
) AS t2 ON t1.virtual_user_id = t2.virtual_user_id GLOBAL
LEFT JOIN (
SELECT
virtual_user_id,
cast(
(groupArray(date_time), groupArray(amount)) AS Map(Datetime, Float64)
) AS date_amounts_map
FROM
(
SELECT
toDateTime(date_trunc('day', `vpc@timezone_#time`)) AS date_time,
toString(`#user_id`) AS virtual_user_id,
toFloat64(coalesce(SUM(`#vp@amount_yuan`), 0)) AS amount
FROM
(
SELECT
*,
CAST((`#amount` / 100) AS Nullable(Float64)) AS `#vp@amount_yuan`
FROM
(
SELECT
`#amount`,
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
`vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
(
SELECT
`#amount`,
`#data_lifecycle`,
`#dt`,
`#event`,
`#time`,
timestamp_add(
fromUnixTimestamp64Milli(`#time`),
interval 8 * 60 minute
) AS `vpc@timezone_#time`,
`#user_id`,
`#zone_offset`
FROM
events
WHERE
(`#event` IN ('#charge'))
AND (
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
)
)
) AS event
)
WHERE
(
(
`#dt` BETWEEN '2025-11-07'
AND '2026-08-05'
AND `vpc@timezone_#time` BETWEEN '2025-11-08 00:00:00.000'
AND '2026-08-04 23:59:59.999'
)
)
AND (`#data_lifecycle` = '0')
AND (`#event` IN ('#charge'))
)
WHERE
virtual_user_id != ''
AND virtual_user_id IS NOT NULL
GROUP BY
date_time,
virtual_user_id
)
GROUP BY
virtual_user_id
) AS t3 ON t1.virtual_user_id = t3.virtual_user_id
)
GROUP BY
init_date
)
)
ORDER BY
init_date,
retention_data [1] DESC
LIMIT
100000
format Vertical
settings use_query_cache = 0, enable_writes_to_query_cache = 0;