SQL sorting before copy performance


Raw or sorted copy

Raw copy is definitely the fastest. But sorted copy can be reasonably fast, provided you sort by the “most sortable” columns first. This is most likely to be big

create table r1(t timestamp, s1 text, s2 text, n int, b boolean);
insert into r1
select timestamp '2000-01-10 20:00:00' +
       random() * (timestamp '2000-01-20 20:00:00' -
                   timestamp '2021-01-10 10:00:00'),
        substr(md5(random()::text), 0, 25),
        substr(md5(random()::text), 0, 250),
        trunc(random() * 10000000),
        trunc(random() * 2)::int % 2 = 0
from generate_series(1, 100000);

copy r1 to '/dev/null'; -- 80ms
copy (select * from r1 order by t, n, b, s1, s2) to '/dev/null'; -- 160ms
copy (select * from r1 order by b, t, n, s1, s2) to '/dev/null'; -- 200ms
copy (select * from r1 order by s2, s1, t, n, b) to '/dev/null'; -- 2000ms

Sorted primary key copy

Primary keys make everything much faster.

create table r1(t timestamp, s1 text, s2 text primary key, n int);
insert into r1
select timestamp '2000-01-10 20:00:00' +
       random() * (timestamp '2000-01-20 20:00:00' -
                   timestamp '2021-01-10 10:00:00'),
        substr(md5(random()::text), 0, 25),
        substr(md5(random()::text), 0, 250),
        trunc(random() * 10000000)
  from generate_series(1, 100000);

copy (select * from r1 order by s2, n, t, s1) to '/dev/null'; -- 220ms
copy (select * from r1 order by s2, s1, n, t) to '/dev/null'; -- 240ms

Your normal serial primary key is very fast to sort on, pratically as fast as raw copy.

create table r1(t timestamp, s1 text, s2 text, n serial primary key);
insert into r1(t, s1, s2)
select timestamp '2000-01-10 20:00:00' +
       random() * (timestamp '2000-01-20 20:00:00' -
                   timestamp '2021-01-10 10:00:00'),
        substr(md5(random()::text), 0, 25),
        substr(md5(random()::text), 0, 250)
  from generate_series(1, 100000);

copy (select * from r1 order by n, t, s1, s2) to '/dev/null'; -- 110ms

-- ordering of the underlying table is not important
create table r2(t timestamp, s1 text, s2 text, n int primary key);
insert into r2 select * from r1 order by s2;
copy (select * from r1 order by n, t, s1, s2) to '/dev/null'; -- 110ms

Sorting on multicolumn primary key

Does order matter? Yes, multicolumn sort performance depends on the multicolumn index. But if you’re leveraging the multicolumn index correctly, it’s again almost as fast as a raw copy.

create table r3(t timestamp, s1 text, s2 text, n int, primary key (s2, n));
insert into r3
select timestamp '2000-01-10 20:00:00' +
       random() * (timestamp '2000-01-20 20:00:00' -
                   timestamp '2021-01-10 10:00:00'),
        substr(md5(random()::text), 0, 25),
        substr(md5(random()::text), 0, 50),
        trunc(random() * 10000)
from generate_series(0, 100000);

-- matching the sort order of the multicolumn primary key
copy (select * from r3 order by s2, n) to '/dev/null';  -- 120ms

-- swapped the order
copy (select * from r3 order by n, s2) to '/dev/null'; -- 450ms

Column ordering

Column ordering has a small, but not critical, effect on copy speed.

create table r3(t timestamp, s1 text, s2 text, s3 text, s4 text, n int, primary key (s2, n));
insert into r3
select timestamp '2000-01-10 20:00:00' +
       random() * (timestamp '2000-01-20 20:00:00' -
                   timestamp '2021-01-10 10:00:00'),
        substr(md5(random()::text), 0, 25),
        substr(md5(random()::text), 0, 50),
        substr(md5(random()::text), 0, 75),
        substr(md5(random()::text), 0, 100),
        trunc(random() * 10000)
from generate_series(0, 100000);

copy (select * from r3 order by s2, n) to '/tmp/r3.csv'; -- 190ms
copy (select t, s1, s2, s3, s4, n from r3 order by s2, n) to '/tmp/r3.csv'; -- 200ms
copy (select n, s4, s3, s2, s1, t from r3 order by s2, n) to '/tmp/r3.csv'; -- 210ms