以Postgresql为主,再聊聊资料库应用递回加快 count distinct 的等效查询


create table it201011a (
  gal text not null
);

-- 输入一亿笔
insert into it201011a
select (array['小岛南', '初川南', '相沢南', '架乃ゆら', '山岸逢花', '枫カレン', '坂道みる', '桥本ありな', '葵つかさ', '天使もえ', '水卜さくら'])[floor(random() * 11) + 1::int]
  from generate_series(1, 1e8);

create index on it201011a(gal);
-- 
要找出栏位distinct , 一般直接的方式.

select distinct gal
  from it201011a;

这样会很慢....

我们只看一下 评估成本,就不用 timing.
explain (costs)
select distinct gal
  from it201011a;
+------------------------------------------------------------------------------+
|                                  QUERY PLAN                                  |
+------------------------------------------------------------------------------+
| HashAggregate  (cost=1790541.00..1790541.11 rows=11 width=12)                |
|   Group Key: gal                                                             |
|   ->  Seq Scan on it201011a  (cost=0.00..1540541.00 rows=100000000 width=12) |
+------------------------------------------------------------------------------+

--
with recursive t1 as (
select min(t.gal) gal
  from it201011a t
 where t.gal is not null
union all
select (select min(gal) 
          from it201011a a
        where a.gal > b.gal
          and a.gal is not null)
  from t1 b
 where b.gal is not null
)
select *
  from t1
 where gal is not null;

+------------+
|    gal     |
+------------+
| 初川南     |
| 小岛南     |
| 相沢南     |
| 坂道みる   |
| 天使もえ   |
| 山岸逢花   |
| 架乃ゆら   |
| 枫カレン   |
| 葵つかさ   |
| 桥本ありな |
| 水卜さくら |
+------------+
(11 rows)

Time: 4.377 ms
速度超快!! 来看一下执行计画

explain (analyze,timing,costs)
with recursive t1 as (
select min(t.gal) gal
  from it201011a t
 where t.gal is not null
union all
select (select min(gal) 
          from it201011a a
        where a.gal > b.gal
          and a.gal is not null)
  from t1 b
 where b.gal is not null
)
select *
  from t1
 where gal is not null;
 
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                       QUERY PLAN                                                                                       |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| CTE Scan on t1  (cost=241.33..243.35 rows=100 width=32) (actual time=0.147..3.688 rows=11 loops=1)                                                                                     |
|   Filter: (gal IS NOT NULL)                                                                                                                                                            |
|   Rows Removed by Filter: 1                                                                                                                                                            |
|   CTE t1                                                                                                                                                                               |
|     ->  Recursive Union  (cost=2.33..241.33 rows=101 width=32) (actual time=0.141..3.666 rows=12 loops=1)                                                                              |
|           ->  Result  (cost=2.33..2.34 rows=1 width=32) (actual time=0.107..0.109 rows=1 loops=1)                                                                                      |
|                 InitPlan 3 (returns $1)                                                                                                                                                |
|                   ->  Limit  (cost=0.57..2.33 rows=1 width=12) (actual time=0.100..0.101 rows=1 loops=1)                                                                               |
|                         ->  Index Only Scan using it201011a_gal_idx on it201011a t  (cost=0.57..176009683.02 rows=100000000 width=12) (actual time=0.098..0.099 rows=1 loops=1)        |
|                               Index Cond: (gal IS NOT NULL)                                                                                                                            |
|                               Heap Fetches: 1                                                                                                                                          |
|           ->  WorkTable Scan on t1 b  (cost=0.00..23.70 rows=10 width=32) (actual time=0.291..0.292 rows=1 loops=12)                                                                   |
|                 Filter: (gal IS NOT NULL)                                                                                                                                              |
|                 Rows Removed by Filter: 0                                                                                                                                              |
|                 SubPlan 2                                                                                                                                                              |
|                   ->  Result  (cost=2.34..2.35 rows=1 width=32) (actual time=0.315..0.315 rows=1 loops=11)                                                                             |
|                         InitPlan 1 (returns $3)                                                                                                                                        |
|                           ->  Limit  (cost=0.57..2.34 rows=1 width=12) (actual time=0.313..0.313 rows=1 loops=11)                                                                      |
|                                 ->  Index Only Scan using it201011a_gal_idx on it201011a a  (cost=0.57..59072943.18 rows=33333333 width=12) (actual time=0.312..0.312 rows=1 loops=11) |
|                                       Index Cond: ((gal > b.gal) AND (gal IS NOT NULL))                                                                                                |
|                                       Heap Fetches: 10                                                                                                                                 |
| Planning Time: 0.437 ms                                                                                                                                                                |
| Execution Time: 3.823 ms                                                                                                                                                               |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

其实就一个loop,依次比较,且逐步减少需要聚合的量.
-----------
接着来看第二个例子. 有一种情境是 一个 table 存放 device 相关资料,device 可以是 sensor, 车辆 等等;
另一个 table 存放 device 的纪录, 例如感应数据,或是行车状况等等.
为了简便起见,以下的例子就不加上日期.

create table itdevice (
 id int not null primary key
);

insert into itdevice
select generate_series(1,1000);

create table itdevval (
  id int generated always as identity
, devid int not null
, ts timestamp not null
);

输入600万笔资料

insert into itdevval(devid, ts)
select b.devid
     , clock_timestamp()
  from (select generate_series(1,1e4)) a
     , (select generate_series(1,600) devid) b;

建立 itdevval 中 devid 栏位的 index.

create index on itdevval(devid);

----
当我们想要查询哪些device,没有数据资料,例如设备故障等情况.
因为两边都有 device id 的 index, 一般直接查询.

基本的 not in

select *
  from itdevice
 where id not in (
       select distinct devid
         from itdevval);
Time: 1372.260 ms (00:01.372)

select *
  from itdevice
 where id not in (
       select devid
         from itdevval);
Time: 249832.184 ms (04:09.832)

来看一下执行计画

explain (costs)
select *
  from itdevice
 where id not in (
       select distinct devid
         from itdevval);
+--------------------------------------------------------------------------------+
|                                   QUERY PLAN                                   |
+--------------------------------------------------------------------------------+
| Seq Scan on itdevice  (cost=107440.50..107458.00 rows=500 width=4)             |
|   Filter: (NOT (hashed SubPlan 1))                                             |
|   SubPlan 1                                                                    |
|     ->  HashAggregate  (cost=107433.00..107439.00 rows=600 width=4)            |
|           Group Key: itdevval.devid                                            |
|           ->  Seq Scan on itdevval  (cost=0.00..92433.00 rows=6000000 width=4) |
+--------------------------------------------------------------------------------+

explain (costs)
select *
  from itdevice
 where id not in (
       select devid
         from itdevval);
+--------------------------------------------------------------------------------+
|                                   QUERY PLAN                                   |
+--------------------------------------------------------------------------------+
| Seq Scan on itdevice  (cost=0.00..80435517.50 rows=500 width=4)                |
|   Filter: (NOT (SubPlan 1))                                                    |
|   SubPlan 1                                                                    |
|     ->  Materialize  (cost=0.00..145871.00 rows=6000000 width=4)               |
|           ->  Seq Scan on itdevval  (cost=0.00..92433.00 rows=6000000 width=4) |
+--------------------------------------------------------------------------------+

distinct 有聚合, HashAggregate  Group Key: itdevval.devid 
但是都是 Seq Scan

使用 outer join 的方式

select a.id
  from itdevice a
  left join itdevval b
    on (a.id = b.devid)
 where b.* is null;
Time: 2218.486 ms (00:02.218)

explain (costs)
select a.id
  from itdevice a
  left join itdevval b
    on (a.id = b.devid)
 where b.* is null;
+---------------------------------------------------------------------------+
|                                QUERY PLAN                                 |
+---------------------------------------------------------------------------+
| Hash Right Join  (cost=27.50..108277.25 rows=30000 width=4)               |
|   Hash Cond: (b.devid = a.id)                                             |
|   Filter: (b.* IS NULL)                                                   |
|   ->  Seq Scan on itdevval b  (cost=0.00..92433.00 rows=6000000 width=44) |
|   ->  Hash  (cost=15.00..15.00 rows=1000 width=4)                         |
|         ->  Seq Scan on itdevice a  (cost=0.00..15.00 rows=1000 width=4)  |
+---------------------------------------------------------------------------+

递回的方式

with recursive t1 as (
select min(devid) devid
  from itdevval
 where devid is not null
union all
select (select min(v.devid)
          from itdevval v
         where v.devid > t1.devid
           and v.devid is not null)
  from t1
 where t1.devid is not null
), t2 as (
select devid
  from t1
 where devid is not null
 -- t2 是有 itdevval 有资料的 devid
)
select id
  from itdevice
 where id not in (
       select devid
         from t2);
         
Time: 11.715 ms

原理与第一个例子相同.速度比一般方法快多了.

杂谈