PostgreSQL删除表中重复数据

原创阎书利 2022-01-19

1234

一、测试数据创建

postgres=# create table test_chongfu(id int not null default null,name char(50) default null,class int  default null);
CREATE TABLE

postgres=# insert into test_chongfu values(1,'AA',1);
INSERT 0 1
postgres=# insert into test_chongfu values(2,'AB',8);
INSERT 0 1
postgres=# insert into test_chongfu values(3,'AB',8);
INSERT 0 1
postgres=# insert into test_chongfu values(4,'BB',5);
INSERT 0 1
postgres=# insert into test_chongfu values(5,'BD',7);
INSERT 0 1
postgres=# insert into test_chongfu values(6,'BD',7);
INSERT 0 1

postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  3 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
  6 | BD                                                 |     7
(6 rows)


postgres=# alter table test_chongfu add constraint pk1 primary key ("id");
ALTER TABLE

postgres=# \d test_chongfu
                 Table "public.test_chongfu"
 Column |     Type      | Collation | Nullable |   Default
--------+---------------+-----------+----------+--------------
 id     | integer       |           | not null |
 name   | character(50) |           |          | NULL::bpchar
 class  | integer       |           |          |
Indexes:
    "pk1" PRIMARY KEY, btree (id)
复制

二、存在唯一标识情况

如果删除重复数据的表里有除ctid外的唯一标识，可以利用这一列，我这里为id列。

postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  3 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
  6 | BD                                                 |     7
(6 rows)

postgres=# select * from test_chongfu where id not in (select min(id)  from test_chongfu group by name,class);
 id |                        name                        | class
----+----------------------------------------------------+-------
  3 | AB                                                 |     8
  6 | BD                                                 |     7
(2 rows)
复制

通过id列，查看根据name和class两列匹配的重复列。执行如下语句删除重复数据。

postgres=# delete from test_chongfu where id not in (select min(id)  from test_chongfu group by name,class);
DELETE 2
postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
(4 rows)
复制

三、不存在唯一标识情况

恢复初始测试环境。如果表里不存在唯一标识列，可以通过ctid来进行。ctid类似于oracle的rowid，但是形式不同。

postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  3 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
  6 | BD                                                 |     7
(6 rows)

postgres=# select * from test_chongfu where ctid not in( select min(ctid) from test_chongfu group by name,class);
 id |                        name                        | class
----+----------------------------------------------------+-------
  3 | AB                                                 |     8
  6 | BD                                                 |     7
(2 rows)


postgres=# delete from test_chongfu where ctid not in( select min(ctid) from test_chongfu group by name,class);
DELETE 2
postgres=# select * from test_chongfu;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
(4 rows)
复制

四、not in优化

in本来效率就很低，not in的效率更加低，如果数据量很大的情况，可能执行时间会特别长。因此可以尝试把查询到的唯一数据重新插入到一张新表里，然后把原来旧表删掉，最后修改去重数据后的新表名为旧表名。

postgres=# create table tab_new as select * from test_chongfu where ctid in(select min(ctid) from test_chongfu group by name,class);
SELECT 4
postgres=# select * from tab_new;
 id |                        name                        | class
----+----------------------------------------------------+-------
  1 | AA                                                 |     1
  2 | AB                                                 |     8
  4 | BB                                                 |     5
  5 | BD                                                 |     7
(4 rows)
复制