绿色分布

发布于 2025-01-23 19:54:52 字数 750 浏览 3 评论 0原文

有一个具有随机分布的表，

CREATE TABLE schema.table (
    col1 int4 NULL,
    col2 int4 NULL,
    col3 int4 NULL
)
WITH (
    appendonly=true,
    compresstype=zstd,
    orientation=column
)
DISTRIBUTED RANDOMLY;

我们需要在一个字段上最佳（最小偏斜）分配行。为此，我们可以创建测试表

CREATE TABLE schema.test_table (
    col_1 int4 NULL,
    col_2 int4 NULL,
    col_3 int4 NULL
)
WITH (
    appendonly=true,
    compresstype=zstd,
    orientation=column
)
DISTRIBUTED BY (col_i); 
INSERT INTO schema.test_table SELECT * FROM schema.table;

，然后通过偏斜进行检查，例如，通过

select * from gp_toolkit.gp_skew_coefficient('schema.test_table'::regclass);

，问题是我们要在不创建测试表的情况下检查偏斜的表格。可以做到这一点，如果是的话，如何？

原文

There is a table with a random distribution

CREATE TABLE schema.table (
    col1 int4 NULL,
    col2 int4 NULL,
    col3 int4 NULL
)
WITH (
    appendonly=true,
    compresstype=zstd,
    orientation=column
)
DISTRIBUTED RANDOMLY;

We need to optimally (with minimal skew) distribute rows over one field. For this we can create test tables

CREATE TABLE schema.test_table (
    col_1 int4 NULL,
    col_2 int4 NULL,
    col_3 int4 NULL
)
WITH (
    appendonly=true,
    compresstype=zstd,
    orientation=column
)
DISTRIBUTED BY (col_i); 
INSERT INTO schema.test_table SELECT * FROM schema.table;

And then check them against skew, for example via

select * from gp_toolkit.gp_skew_coefficient('schema.test_table'::regclass);

The problem is that we want to check the table for skew without creating test tables. Can this be done, and if so, how?

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

若有似无的小暗淡 2025-01-30 19:54:52

如果您真的不想创建一个新表，那么这样的事情将起作用，但是我不知道在不实际（重新）分发数据的情况下建模分布的方法。

foo=# create table foo(a int, b int, c int);
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE

foo=# insert into foo values (generate_series(1,100), generate_series(101,200), generate_series(2001, 2100));
INSERT 0 100

foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 18.460769214742921763000
(1 row)

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    17
             1 |    18
             2 |    23
             3 |    17
             4 |    15
             5 |    10
(6 rows)


foo=# ALTER TABLE foo SET
foo-# WITH (REORGANIZE=true)
foo-# DISTRIBUTED BY (a);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 18.460769214742921763000
(1 row)

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    21
             1 |    18
             2 |    12
             3 |    15
             4 |    18
             5 |    16
(6 rows)

foo=#

foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (b);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 27.011108825814611346000
(1 row)

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    12
             1 |    14
             2 |    20
             3 |    24
             4 |    16
             5 |    14
(6 rows)

foo=#


foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (c);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 30.983866769659334938000
(1 row)

foo=#

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    19
             1 |    10
             2 |    20
             3 |    23
             4 |    11
             5 |    17
(6 rows)

foo=#

Something like this will work if you really do not want to create a new table, but I do not know of a way to model the distribution without actually (re)distributing the data.

foo=# create table foo(a int, b int, c int);
NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE

foo=# insert into foo values (generate_series(1,100), generate_series(101,200), generate_series(2001, 2100));
INSERT 0 100

foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 18.460769214742921763000
(1 row)

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    17
             1 |    18
             2 |    23
             3 |    17
             4 |    15
             5 |    10
(6 rows)


foo=# ALTER TABLE foo SET
foo-# WITH (REORGANIZE=true)
foo-# DISTRIBUTED BY (a);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 18.460769214742921763000
(1 row)

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    21
             1 |    18
             2 |    12
             3 |    15
             4 |    18
             5 |    16
(6 rows)

foo=#

foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (b);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 27.011108825814611346000
(1 row)

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    12
             1 |    14
             2 |    20
             3 |    24
             4 |    16
             5 |    14
(6 rows)

foo=#


foo=# ALTER TABLE foo SET
WITH (REORGANIZE=true)
DISTRIBUTED BY (c);
ALTER TABLE
foo=# select * from gp_toolkit.gp_skew_coefficient('public.foo'::regclass);
 skcoid |         skccoeff
--------+--------------------------
  76788 | 30.983866769659334938000
(1 row)

foo=#

foo=# select gp_segment_id, count(*) from foo group by 1 order by 1;
 gp_segment_id | count
---------------+-------
             0 |    19
             1 |    10
             2 |    20
             3 |    23
             4 |    11
             5 |    17
(6 rows)

foo=#

回复收藏 0 原文

~没有更多了~