PostgreSQL行列转换介绍

原创贺晓群 2021-09-08

5333

在分析数据时经常需要对数据表进行行列转换，本文针对PostgreSQL的行列转换作一个总结分享。

一列拆成多行，列值中通过指定的符号进行分隔

postgres=# SELECT regexp_split_to_table('hello world    joan', '\s+');
 regexp_split_to_table 
-----------------------
 hello
 world
 joan
(3 rows)

postgres=# SELECT regexp_split_to_table('数学，语文，英语，物理和化学', '，|和');
 regexp_split_to_table 
-----------------------
 数学
 语文
 英语
 物理
 化学
(5 rows)
复制

regexp_split_to_table函数接收两个参数，第一个参数为被拆分的string，第二个参数为分隔符（支持正则表达式）

多行聚合到同一列

postgres=# WITH tmp_table (classname, subject) AS (
postgres(#   VALUES 
postgres(#      ('一班', '语文'),
postgres(#      ('一班', '语文'),
postgres(#      ('一班', '英语'),
postgres(#      ('二班', '物理'),
postgres(#      ('二班', '化学'),
postgres(#      ('二班', '数学')
postgres(# )
postgres-# SELECT classname,string_agg(subject, ',')
postgres-#   FROM tmp_table
postgres-#  GROUP BY classname;
 classname |   string_agg   
-----------+----------------
 二班      | 物理,化学,数学
 一班      | 语文,语文,英语
(2 rows)

--上面语句聚合后有重复数据怎么办？处理语句如下：
postgres=# WITH tmp_table (classname, subject) AS (
postgres(#   VALUES 
postgres(#      ('一班', '语文'),
postgres(#      ('一班', '语文'),
postgres(#      ('一班', '英语'),
postgres(#      ('二班', '物理'),
postgres(#      ('二班', '化学'),
postgres(#      ('二班', '数学')
postgres(# )
postgres-# SELECT classname,string_agg(DISTINCT subject, ',')
postgres-#   FROM tmp_table
postgres-#  GROUP BY classname;
 classname |   string_agg   
-----------+----------------
 二班      | 化学,数学,物理
 一班      | 英语,语文
(2 rows)

--上面语句对于聚合后的列值如果要排序怎么办？处理语句如下：
postgres=# WITH tmp_table (classname, subject) AS (
postgres(#   VALUES 
postgres(#      ('一班', '语文'),
postgres(#      ('一班', '语文'),
postgres(#      ('一班', '英语'),
postgres(#      ('二班', '物理'),
postgres(#      ('二班', '化学'),
postgres(#      ('二班', '数学')
postgres(# )
postgres-# SELECT classname,string_agg(DISTINCT subject, ',' ORDER BY subject)
postgres-#   FROM tmp_table
postgres-#  GROUP BY classname;
 classname |   string_agg   
-----------+----------------
 二班      | 化学,数学,物理
 一班      | 英语,语文
(2 rows)
复制

行转换到多列

--构造测试数据
postgres=# CREATE TABLE test(name VARCHAR(32),subject VARCHAR(10),score numeric(5,2));
CREATE TABLE

postgres=# INSERT INTO test
postgres-#     (name, subject, score)
postgres-# VALUES
postgres-#     ('Joan', '语文', 90),
postgres-#     ('Joan', '数学', 99),
postgres-#     ('Joan', '英语', 80),
postgres-#     ('Joan', '物理', 96),
postgres-#     ('Jack', '语文', 91),
postgres-#     ('Jack', '物理', 92),
postgres-#     ('Jack', '数学', 93),
postgres-#     ('Jack', '英语', 94);
INSERT 0 8

postgres=# SELECT * FROM test;
 name | subject | score 
------+---------+-------
 Joan | 语文    | 90.00
 Joan | 数学    | 99.00
 Joan | 英语    | 80.00
 Joan | 物理    | 96.00
 Jack | 语文    | 91.00
 Jack | 物理    | 92.00
 Jack | 数学    | 93.00
 Jack | 英语    | 94.00
(8 rows)

--常用方法是对生成的列做条件判断，如下：
postgres=# SELECT name,
postgres-#        SUM(score) filter(WHERE subject = '语文') AS "语文",
postgres-#        SUM(score) filter(WHERE subject = '数学') AS "数学",
postgres-#        SUM(score) filter(WHERE subject = '英语') AS "英语",
postgres-#        SUM(score) filter(WHERE subject = '物理') AS "物理"
postgres-#   FROM test
postgres-#  GROUP BY name;
 name | 语文  | 数学  | 英语  | 物理  
------+-------+-------+-------+-------
 Jack | 91.00 | 93.00 | 94.00 | 92.00
 Joan | 90.00 | 99.00 | 80.00 | 96.00
(2 rows)

--但是这样如果需要转换的列很多，代码量比较大，太繁琐，怎么简化此语句？
--我们可以借助tablefunc插件来简化语句
--先创建插件tablefunc
postgres=# CREATE EXTENSION tablefunc;
CREATE EXTENSION

--使用单参数的crosstab函数，列的次序受限于ORDER BY中的第二列，像这个例子列名就没有像我们习惯的方式排序
--crosstab(text sql)
postgres=# SELECT *
postgres-#   FROM crosstab('SELECT NAME,subject,score
postgres'#                    FROM test
postgres'#                   ORDER BY 1,2'
postgres(#                 ) AS tb(NAME VARCHAR, "数学" NUMERIC(5, 2), "物理" NUMERIC(5, 2), "英语" NUMERIC(5, 2), "语文" NUMERIC(5, 2));
 name | 数学  | 物理  | 英语  | 语文  
------+-------+-------+-------+-------
 Jack | 93.00 | 92.00 | 94.00 | 91.00
 Joan | 99.00 | 96.00 | 80.00 | 90.00
(2 rows)

--既然排序的第二列无法满足我们要求，可以对排序列做自定义排序（按照我们的次序进行设定：语文、数学、英语...）
postgres=# SELECT *
postgres-#   FROM crosstab('SELECT NAME,subject,score
postgres'#                    FROM test
postgres'#                   ORDER BY 1,POSITION(CONCAT(subject,'','') IN CONCAT(''语文'','','',''数学'','','',''英语'','','',''物理'','',''))'
postgres(#                 ) AS tb(NAME VARCHAR, "语文" NUMERIC(5, 2), "数学" NUMERIC(5, 2), "英语" NUMERIC(5, 2), "物理" NUMERIC(5, 2));
 name | 语文  | 数学  | 英语  | 物理  
------+-------+-------+-------+-------
 Jack | 91.00 | 93.00 | 94.00 | 92.00
 Joan | 90.00 | 99.00 | 80.00 | 96.00
(2 rows)

--如果觉得上面自定义排序太复杂，也可以在外层调整查询的列次序
postgres=# SELECT name,category_4 AS "语文",category_1 AS "数学",category_3 AS "英语",category_2 AS "物理"
postgres-#  FROM crosstab('SELECT NAME,subject,score
postgres'#                   FROM test
postgres'#                  ORDER BY 1,2'
postgres(#                ) AS tb(NAME VARCHAR, category_1 NUMERIC(5, 2), category_2 NUMERIC(5, 2), category_3 NUMERIC(5, 2), category_4 NUMERIC(5, 2));
 name | 语文  | 数学  | 英语  | 物理  
------+-------+-------+-------+-------
 Jack | 91.00 | 93.00 | 94.00 | 92.00
 Joan | 90.00 | 99.00 | 80.00 | 96.00
(2 rows)

--上面使用crosstab函数时，在FROM中都定义了输出列，比较繁琐，那如何避免？
--使用插件tablefunc中的预定义函数crosstab4
postgres=# SELECT row_name,category_4 AS "语文",category_1 AS "数学",category_3 AS "英语",category_2 AS "物理"
postgres-#  FROM crosstab4('SELECT NAME::text,subject,score::text
postgres'#                   FROM test
postgres'#                  ORDER BY 1,2'
postgres(#                );
 row_name | 语文  | 数学  | 英语  | 物理  
----------+-------+-------+-------+-------
 Jack     | 91.00 | 93.00 | 94.00 | 92.00
 Joan     | 90.00 | 99.00 | 80.00 | 96.00
(2 rows)

--其实crosstab4函数内部还是调用了crosstab(text sql)函数，只是在返回值上使用了自定义类型tablefunc_crosstab_4，插件预定义的有：crosstab2、crosstab3和crosstab4，但是我们也可以创建额外的crosstabN函数
--下面是预定义的crosstab4函数
CREATE TYPE public.tablefunc_crosstab_4 AS
(
	row_name text,
	category_1 text,
	category_2 text,
	category_3 text,
	category_4 text
);

CREATE OR REPLACE FUNCTION public.crosstab4(
	text)
    RETURNS SETOF tablefunc_crosstab_4 
    LANGUAGE 'c'
    COST 1
    STABLE STRICT PARALLEL UNSAFE
    ROWS 1000
AS '$libdir/tablefunc', 'crosstab';

--我们可以自定义crosstabN，例如crosstab5
CREATE TYPE public.tablefunc_crosstab_num_5 AS
(
	name text,
	chinese numeric(5,2),
	maths   numeric(5,2),
	english numeric(5,2),
	science numeric(5,2),
	music   numeric(5,2)
);

CREATE OR REPLACE FUNCTION public.crosstab5(
	text)
    RETURNS SETOF tablefunc_crosstab_num_5
    LANGUAGE 'c'
    COST 1
    STABLE STRICT PARALLEL UNSAFE
    ROWS 1000
AS '$libdir/tablefunc', 'crosstab';

--上面单参数的函数crosstab，使用也有限制条件，如下：
--删除一个学生的英语科目
postgres=# DELETE FROM test WHERE name = 'Joan' AND subject = '英语';
DELETE 1
--这个时候语句就产生了错误，我们删除的是英语，但是Joan行的指标数据出现了错乱
postgres=# SELECT row_name,category_4 AS "语文",category_1 AS "数学",category_3 AS "英语",category_2 AS "物理"
 FROM crosstab4('SELECT NAME::text,subject,score::text
                  FROM test
                 ORDER BY 1,2'
               );
 row_name | 语文  | 数学  | 英语  | 物理  
----------+-------+-------+-------+-------
 Jack     | 91.00 | 93.00 | 94.00 | 92.00
 Joan     |       | 99.00 | 90.00 | 96.00
(2 rows)

--要解决上面的问题，我们得使用双参数的 crosstab(text source_sql, text category_sql) 函数
--crosstab的单参数形式的主要限制是它把一个组中的所有值都视作相似，并且把每一个值插入到第一个可用的列中。如果你想要值列对应于特定的数据分类，并且某些分组可能没有关于某些分类的数据，这样的形式就无法工作。crosstab的双参数形式通过提供一个对应于输出列的显式分类列表来处理这种情况。
postgres=# SELECT *
postgres-#   FROM crosstab('SELECT name,subject,score FROM test ORDER BY 1',
postgres(#                 'SELECT regexp_split_to_table(''语文,数学,英语,物理'', '','')'  --可以显示指定列，会自动匹配对应的列值
postgres(#                 ) AS tb(NAME VARCHAR, "语文" NUMERIC(5, 2), "数学" NUMERIC(5, 2), "英语" NUMERIC(5, 2), "物理" NUMERIC(5, 2));
 name | 语文  | 数学  | 英语  | 物理  
------+-------+-------+-------+-------
 Jack | 91.00 | 93.00 | 94.00 | 92.00
 Joan | 90.00 | 99.00 |       | 96.00
(2 rows)

postgres=# SELECT name,category_4 AS "语文",category_1 AS "数学",category_3 AS "英语",category_2 AS "物理"
postgres-#   FROM crosstab('SELECT name,subject,score FROM test ORDER BY 1',
postgres(#                 'SELECT DISTINCT subject FROM test ORDER BY 1' --可以动态生成列，但是列的次序可能不确定，依赖于排序
postgres(#                 ) AS tb(NAME VARCHAR, category_1 NUMERIC(5, 2), category_2 NUMERIC(5, 2), category_3 NUMERIC(5, 2), category_4 NUMERIC(5, 2));
 name | 语文  | 数学  | 英语  | 物理  
------+-------+-------+-------+-------
 Jack | 91.00 | 93.00 | 94.00 | 92.00
 Joan | 90.00 | 99.00 |       | 96.00
(2 rows)

--双参数的crosstab函数，我们也可以自定义crosstabN
postgres=# CREATE OR REPLACE FUNCTION crosstab_student_score(
postgres(#     IN text,
postgres(#     IN text,
postgres(#     OUT name varchar,
postgres(#     OUT category_1 numeric(5,2),
postgres(#     OUT category_2 numeric(5,2),
postgres(#     OUT category_3 numeric(5,2),
postgres(#     OUT category_4 numeric(5,2))
postgres-#   RETURNS setof record
postgres-#   AS '$libdir/tablefunc', 'crosstab_hash' LANGUAGE C STABLE STRICT;
CREATE FUNCTION

--使用自定义的crosstab_student_score函数
postgres=# SELECT name,category_4 AS "语文",category_1 AS "数学",category_3 AS "英语",category_2 AS "物理"
postgres-#   FROM crosstab_student_score('SELECT name,subject,score FROM test ORDER BY 1',
postgres(#                               'SELECT DISTINCT subject FROM test ORDER BY 1');
 name | 语文  | 数学  | 英语  | 物理  
------+-------+-------+-------+-------
 Jack | 91.00 | 93.00 | 94.00 | 92.00
 Joan | 90.00 | 99.00 |       | 96.00
(2 rows)

--如下是官网给出的 crosstab(text source_sql, text category_sql) 函数的实例
postgres=# CREATE TABLE sales(year int, month int, qty int);
CREATE TABLE
postgres=# INSERT INTO sales VALUES (2007, 1, 1000);
INSERT 0 1
postgres=# INSERT INTO sales VALUES (2007, 2, 1500);
INSERT 0 1
postgres=# INSERT INTO sales VALUES (2007, 7, 500);
INSERT 0 1
postgres=# INSERT INTO sales VALUES (2007, 11, 1500);
INSERT 0 1
postgres=# INSERT INTO sales VALUES (2007, 12, 2000);
INSERT 0 1
postgres=# INSERT INTO sales VALUES (2008, 1, 1000);
INSERT 0 1
postgres=# select * from crosstab(
postgres(#   'select year, month, qty from sales order by 1',
postgres(#   'select m from generate_series(1,12) m'
postgres(# ) as (
postgres(#   year int,
postgres(#   "Jan" int,
postgres(#   "Feb" int,
postgres(#   "Mar" int,
postgres(#   "Apr" int,
postgres(#   "May" int,
postgres(#   "Jun" int,
postgres(#   "Jul" int,
postgres(#   "Aug" int,
postgres(#   "Sep" int,
postgres(#   "Oct" int,
postgres(#   "Nov" int,
postgres(#   "Dec" int
postgres(# );
 year | Jan  | Feb  | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov  | Dec  
------+------+------+-----+-----+-----+-----+-----+-----+-----+-----+------+------
 2007 | 1000 | 1500 |     |     |     |     | 500 |     |     |     | 1500 | 2000
 2008 | 1000 |      |     |     |     |     |     |     |     |     |      |     
(2 rows)

postgres=# CREATE TABLE cth(rowid text, rowdt timestamp, attribute text, val text);
CREATE TABLE
postgres=# INSERT INTO cth VALUES('test1','01 March 2003','temperature','42');
INSERT 0 1
postgres=# INSERT INTO cth VALUES('test1','01 March 2003','test_result','PASS');
INSERT 0 1
postgres=# INSERT INTO cth VALUES('test1','01 March 2003','volts','2.6987');
INSERT 0 1
postgres=# INSERT INTO cth VALUES('test2','02 March 2003','temperature','53');
INSERT 0 1
postgres=# INSERT INTO cth VALUES('test2','02 March 2003','test_result','FAIL');
INSERT 0 1
postgres=# INSERT INTO cth VALUES('test2','02 March 2003','test_startdate','01 March 2003');
INSERT 0 1
postgres=# INSERT INTO cth VALUES('test2','02 March 2003','volts','3.1234');
INSERT 0 1
postgres=# SELECT * FROM crosstab
postgres-# (
postgres(#   'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
postgres(#   'SELECT DISTINCT attribute FROM cth ORDER BY 1'
postgres(# )
postgres-# AS
postgres-# (
postgres(#        rowid text,
postgres(#        rowdt timestamp,
postgres(#        temperature int4,
postgres(#        test_result text,
postgres(#        test_startdate timestamp,
postgres(#        volts float8
postgres(# );
 rowid |        rowdt        | temperature | test_result |   test_startdate    | volts  
-------+---------------------+-------------+-------------+---------------------+--------
 test1 | 2003-03-01 00:00:00 |          42 | PASS        |                     | 2.6987
 test2 | 2003-03-02 00:00:00 |          53 | FAIL        | 2003-03-01 00:00:00 | 3.1234
(2 rows)

以上对各种场景的行列转换作了简单的介绍，详细信息请参考官方文档。

参考：https://www.postgresql.org/docs/13/tablefunc.html
复制

postgresql

最后修改时间：2021-09-09 11:25:48

「喜欢这篇文章，您的关注和赞赏是给作者最好的鼓励」

关注作者

1人已赞赏

PostgreSQL行列转换介绍

评论