以Postgresql为主,再聊聊资料库 PostgreSQL复制schema下table结构到另一schema的procedure及取样的研讨

在资料库的日常使用,会有将schema下的table结构复制到另一schema下,供做测试,或是做移转等需求.

为方便说明,将建立三个 schema

myfunc 用来放我们的 function , procedure
s0822 用来放source tables
t0822 用来放target tables

复制的方法

create schema myfunc;
create schema s0822;
create schema t0822;

建立 source tables

create table s0822.t001 (
  id int generated always as identity primary key
, txt text not null
);

create table s0822.t002 (
  id int generated always as identity primary key
, t001_id int not null references s0822.t001(id)
, txt text not null
);

建立一个 function 产生指定 schema 下一般tables 的名称

create or replace function myfunc.f_get_schema_tables (
  in in_schema text
, out relname text
)
returns setof text
language sql as
$code$
  select c.relname::text
    from pg_class c
    join pg_namespace n
      on c.relnamespace = n.oid
   where n.nspname::text = in_schema
     and c.relkind = 'r'
   order by 1;
$code$;

建立复制结构的 procedure

create or replace procedure myfunc.p_clone_schmea_tables (
  in source_schema text
, in target_schema text
)
language plpgsql as
$code$
  declare
    curs cursor for select relname from myfunc.f_get_schema_tables(source_schema);
    rel text;
   
begin
  open curs;
  loop
    fetch curs into rel;
    exit when not found;
      execute 'create table if not exists '
                 ||  target_schema || '.' || rel
                 || ' (like ' || source_schema || '.' || rel
                 || ' including all)';
  end loop;
  close curs;
end;
$code$;

使用 procedure 来复制

call myfunc.p_clone_schmea_tables('s0822', 't0822');

观察结果

\dt s0822.*
       List of relations
 Schema | Name | Type  | Owner
--------+------+-------+-------
 s0822  | t001 | table | nana
 s0822  | t002 | table | nana
(2 rows)

\dt t0822.*
       List of relations
 Schema | Name | Type  | Owner
--------+------+-------+-------
 t0822  | t001 | table | nana
 t0822  | t002 | table | nana
(2 rows)



\d s0822.t001
                           Table "s0822.t001"
 Column |  Type   | Collation | Nullable |           Default
--------+---------+-----------+----------+------------------------------
 id     | integer |           | not null | generated always as identity
 txt    | text    |           | not null |
Indexes:
    "t001_pkey" PRIMARY KEY, btree (id)
Referenced by:
    TABLE "s0822.t002" CONSTRAINT "t002_t001_id_fkey" FOREIGN KEY (t001_id) REFERENCES s0822.t001(id)



\d t0822.t001
                           Table "t0822.t001"
 Column |  Type   | Collation | Nullable |           Default
--------+---------+-----------+----------+------------------------------
 id     | integer |           | not null | generated always as identity
 txt    | text    |           | not null |
Indexes:
    "t001_pkey" PRIMARY KEY, btree (id)


\d s0822.t002

                          Table "s0822.t002"
 Column  |  Type   | Collation | Nullable |           Default
---------+---------+-----------+----------+------------------------------
 id      | integer |           | not null | generated always as identity
 t001_id | integer |           | not null |
 txt     | text    |           | not null |
Indexes:
    "t002_pkey" PRIMARY KEY, btree (id)
Foreign-key constraints:
    "t002_t001_id_fkey" FOREIGN KEY (t001_id) REFERENCES s0822.t001(id)
    
\d t0822.t002

                           Table "t0822.t002"
 Column  |  Type   | Collation | Nullable |           Default
---------+---------+-----------+----------+------------------------------
 id      | integer |           | not null | generated always as identity
 t001_id | integer |           | not null |
 txt     | text    |           | not null |
Indexes:
    "t002_pkey" PRIMARY KEY, btree (id)

注意到除了 Foreign key constraints 是都复制了.
因为我们是使用了 like INCLUDING ALL
细节可以参考官网文件
https://www.postgresql.org/docs/current/sql-createtable.html

取样的方法

先建立一个放乱数资料的table

create table s0822.t (
  a integer
, b text
, c boolean
);

select setseed(0.5);

insert into s0822.t
with r as (
select id
     , chr((32+random()*94)::integer)
     , random() < 0.01
  from generate_series(1, 1e5) as s(id)
)
select *
  from r
 order by random();
 
analyze s0822.t;

select attname
     , correlation
  from pg_stats
 where schemaname = 's0822'
   and tablename = 't';
   
 attname | correlation
---------+--------------
 a       | 0.0044361027
 b       | 0.0063023297
 c       |    0.9826269
(3 rows)

建立四个 table 用来放取样资料

使用 like 方法, 但不使用 including all

create table t0822.t1 (
like s0822.t
);

create table t0822.t2 (
like s0822.t
);

create table t0822.t3 (
like s0822.t
);

create table t0822.t4 (
like s0822.t
);

取样的方式在 select 的 tablesample 子句

https://www.postgresql.org/docs/current/sql-select.html

TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ]

取样10% 使用 system method

insert into t0822.t1
select *
  from s0822.t
 tablesample system(10);

使用 bernoulli method

insert into t0822.t2
select *
  from s0822.t
 tablesample bernoulli(10);

INSERT 0 9987
Time: 17.633 ms

比较

analyze t0822.t1;
analyze t0822.t2;


select attname
     , correlation
  from pg_stats
 where schemaname = 't0822'
   and tablename = 't1'
 order by 1;

 attname |  correlation
---------+---------------
 a       | -0.0044052443
 b       |  0.0011416585
 c       |     0.9800325
(3 rows)

select attname
     , correlation
  from pg_stats
 where schemaname = 't0822'
   and tablename = 't2'
 order by 1;

 attname |  correlation
---------+---------------
 a       | 0.00066867325
 b       |   0.023340156
 c       |    0.98007065
(3 rows)

a 是 id,递增, 取样後相关性就差异很大,是正常现象.
b, c 栏位的相关性, c 原本是 random() < 0.01 方式产生,
b 原本是 chr((32+random()*94)::integer) 方式产生,
可以观察到两种取样方式与原始table在统计值上的接近程度.
但是要知道 pg_stat 的相关性,也是抽样的,并不是 100%.
以官方文件的说法, bernoulli method 会慢一些,但是效果会
好一些.另外可以参考官网文件以及古博士的分享.

https://island.postgresql.tw/2018/07/24/tablesample.html

另外还有 REPEATABLE, 这是让我们可以设定取样时的乱数种子,进而可以重现.

使用 bernoulli取样 20%,乱数种子为 100

insert into t0822.t3
select *
  from s0822.t
 tablesample bernoulli(20) REPEATABLE(100);
 
INSERT 0 20075
Time: 23.151 ms

insert into t0822.t4
select *
  from s0822.t
 tablesample bernoulli(20) REPEATABLE(100);

INSERT 0 20075
Time: 22.981 ms

---
analyze t0822.t3;
analyze t0822.t4;

select attname
     , correlation
  from pg_stats
 where schemaname = 't0822'
   and tablename = 't3'
 order by 1;

 attname | correlation
---------+--------------
 a       | 0.0013938936
 b       |  0.002332309
 c       |   0.98037195
(3 rows)


select attname
     , correlation
  from pg_stats
 where schemaname = 't0822'
   and tablename = 't4'
  order by 1;
  
 attname | correlation
---------+--------------
 a       | 0.0013938936
 b       |  0.002332309
 c       |   0.98037195
(3 rows)

可以观察到,使用了同样乱数种子,会取样相同.