PostgreSQL: to sort or not to sort for GROUP BY?
Background
A case from a friend: why does GROUP BY use a Sort in one of his queries?
This is simply the optimizer making a cost-based choice: whichever strategy has the lowest estimated cost can be picked, so a sort is entirely possible.
Naturally, if hashagg is estimated to be cheaper, hashagg is chosen instead.
CASE:
postgres=# create table t1(c1 int,c2 int,c3 int,c4 int);
postgres=# insert into t1 select generate_series(1,100000),1,1,1;
postgres=# show work_mem;
work_mem
----------
4MB
(1 row)
postgres=# explain (analyze,verbose,buffers,costs,timing) select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------
 Group  (cost=9845.82..11095.82 rows=100000 width=16) (actual time=340.382..384.324 rows=100000 loops=1)
   Output: c1, c2, c3, c4
   Group Key: t1.c1, t1.c2, t1.c3, t1.c4
   Buffers: shared hit=544, temp read=318 written=318
   ->  Sort  (cost=9845.82..10095.82 rows=100000 width=16) (actual time=340.379..353.887 rows=100000 loops=1)
         Output: c1, c2, c3, c4
         Sort Key: t1.c1, t1.c2, t1.c3, t1.c4
         Sort Method: external sort  Disk: 2544kB
         Buffers: shared hit=544, temp read=318 written=318
         ->  Seq Scan on public.t1  (cost=0.00..1541.00 rows=100000 width=16) (actual time=0.025..26.641 rows=100000 loops=1)
               Output: c1, c2, c3, c4
               Buffers: shared hit=541
 Planning time: 0.079 ms
 Execution time: 392.131 ms
(14 rows)
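Why didn't the planner pick HashAggregate here, even though (as the next plan shows) its estimated cost is far lower? Before comparing costs at all, the planner estimates whether the hash table would fit in work_mem; with roughly 100000 groups at several tens of bytes of per-entry overhead each, the table was judged too big for 4MB, so the hashed plan was rejected up front and sorted grouping was the only strategy left.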
postgres=# set work_mem='1GB';
SET
postgres=# explain (analyze,verbose,buffers,costs,timing) select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
 HashAggregate  (cost=2541.00..3541.00 rows=100000 width=16) (actual time=74.786..97.568 rows=100000 loops=1)
   Output: c1, c2, c3, c4
   Group Key: t1.c1, t1.c2, t1.c3, t1.c4
   Buffers: shared hit=541
   ->  Seq Scan on public.t1  (cost=0.00..1541.00 rows=100000 width=16) (actual time=0.037..16.179 rows=100000 loops=1)
         Output: c1, c2, c3, c4
         Buffers: shared hit=541
 Planning time: 0.128 ms
 Execution time: 104.705 ms
(9 rows)
postgres=# set enable_hashagg=off;
SET
postgres=# explain (analyze,verbose,buffers,costs,timing) select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------
 Group  (cost=9845.82..11095.82 rows=100000 width=16) (actual time=28.217..62.442 rows=100000 loops=1)
   Output: c1, c2, c3, c4
   Group Key: t1.c1, t1.c2, t1.c3, t1.c4
   Buffers: shared hit=541
   ->  Sort  (cost=9845.82..10095.82 rows=100000 width=16) (actual time=28.214..35.161 rows=100000 loops=1)
         Output: c1, c2, c3, c4
         Sort Key: t1.c1, t1.c2, t1.c3, t1.c4
         Sort Method: quicksort  Memory: 7760kB
         Buffers: shared hit=541
         ->  Seq Scan on public.t1  (cost=0.00..1541.00 rows=100000 width=16) (actual time=0.010..9.235 rows=100000 loops=1)
               Output: c1, c2, c3, c4
               Buffers: shared hit=541
 Planning time: 0.039 ms
 Execution time: 68.409 ms
(14 rows)
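With enough work_mem the decision really is cost-based: HashAggregate totals 3541.00 versus 11095.82 for Group + Sort, so hashagg wins whenever it is allowed. The cost functions below show exactly where these numbers come from.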
Related parameters:
#enable_hashagg = on
#enable_sort = on
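A quick way to inspect these knobs together with work_mem (which, as shown above, also steers the decision):

select name, setting, unit
from pg_settings
where name in ('enable_hashagg', 'enable_sort', 'work_mem');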
How the costs are computed:
Note that for a sort-based aggregate (AGG_SORTED), the agg's startup cost is its input's startup cost; since that input is a Sort node, which cannot return its first tuple until the whole input is sorted, this equals the full cost of the sort.
src/backend/optimizer/path/costsize.c
/*
* cost_agg
* Determines and returns the cost of performing an Agg plan node,
* including the cost of its input.
*
* aggcosts can be NULL when there are no actual aggregate functions (i.e.,
* we are using a hashed Agg node just to do grouping).
*
* Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
* are for appropriately-sorted input.
*/
void
cost_agg(Path *path, PlannerInfo *root,
         AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
         int numGroupCols, double numGroups,
         Cost input_startup_cost, Cost input_total_cost,
         double input_tuples)
{
    double      output_tuples;
    Cost        startup_cost;
    Cost        total_cost;
    AggClauseCosts dummy_aggcosts;

    /* Use all-zero per-aggregate costs if NULL is passed */
    if (aggcosts == NULL)
    {
        Assert(aggstrategy == AGG_HASHED);
        MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
        aggcosts = &dummy_aggcosts;
    }

    /*
     * The transCost.per_tuple component of aggcosts should be charged once
     * per input tuple, corresponding to the costs of evaluating the aggregate
     * transfns and their input expressions (with any startup cost of course
     * charged but once). The finalCost component is charged once per output
     * tuple, corresponding to the costs of evaluating the finalfns.
     *
     * If we are grouping, we charge an additional cpu_operator_cost per
     * grouping column per input tuple for grouping comparisons.
     *
     * We will produce a single output tuple if not grouping, and a tuple per
     * group otherwise. We charge cpu_tuple_cost for each output tuple.
     *
     * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
     * same total CPU cost, but AGG_SORTED has lower startup cost. If the
     * input path is already sorted appropriately, AGG_SORTED should be
     * preferred (since it has no risk of memory overflow). This will happen
     * as long as the computed total costs are indeed exactly equal --- but if
     * there's roundoff error we might do the wrong thing. So be sure that
     * the computations below form the same intermediate values in the same
     * order.
     */
    if (aggstrategy == AGG_PLAIN)
    {
        startup_cost = input_total_cost;
        startup_cost += aggcosts->transCost.startup;
        startup_cost += aggcosts->transCost.per_tuple * input_tuples;
        startup_cost += aggcosts->finalCost;
        /* we aren't grouping */
        total_cost = startup_cost + cpu_tuple_cost;
        output_tuples = 1;
    }
    else if (aggstrategy == AGG_SORTED)
    {
        /* Here we are able to deliver output on-the-fly */
        startup_cost = input_startup_cost;
        total_cost = input_total_cost;
        /* calcs phrased this way to match HASHED case, see note above */
        total_cost += aggcosts->transCost.startup;
        total_cost += aggcosts->transCost.per_tuple * input_tuples;
        total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
        total_cost += aggcosts->finalCost * numGroups;
        total_cost += cpu_tuple_cost * numGroups;
        output_tuples = numGroups;
    }
    else
    {
        /* must be AGG_HASHED */
        startup_cost = input_total_cost;
        startup_cost += aggcosts->transCost.startup;
        startup_cost += aggcosts->transCost.per_tuple * input_tuples;
        startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
        total_cost = startup_cost;
        total_cost += aggcosts->finalCost * numGroups;
        total_cost += cpu_tuple_cost * numGroups;
        output_tuples = numGroups;
    }

    path->rows = output_tuples;
    path->startup_cost = startup_cost;
    path->total_cost = total_cost;
}
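Plugging the example's numbers into the AGG_HASHED branch reproduces the HashAggregate cost line exactly. A back-of-envelope check, assuming the default cost parameters (cpu_operator_cost = 0.0025, cpu_tuple_cost = 0.01) and zero transCost/finalCost since the query has no aggregate functions:

-- startup = input_total_cost + cpu_operator_cost * numGroupCols * input_tuples
--         = 1541.00 + 0.0025 * 4 * 100000 = 2541.00
-- total   = startup + cpu_tuple_cost * numGroups
--         = 2541.00 + 0.01 * 100000 = 3541.00
select 1541.00 + 0.0025 * 4 * 100000                 as hashagg_startup,
       1541.00 + 0.0025 * 4 * 100000 + 0.01 * 100000 as hashagg_total;

The sorted plans above show a plain Group node rather than GroupAggregate because there are no aggregate functions; Group is costed by the neighboring cost_group(), which charges the input cost plus one cpu_operator_cost per grouping column per input tuple but no per-output-tuple cost: 10095.82 + 0.0025 * 4 * 100000 = 11095.82, again matching the plan.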
/*
* cost_sort
* Determines and returns the cost of sorting a relation, including
* the cost of reading the input data.
*
* If the total volume of data to sort is less than sort_mem, we will do
* an in-memory sort, which requires no I/O and about t*log2(t) tuple
* comparisons for t tuples.
*
* If the total volume exceeds sort_mem, we switch to a tape-style merge
* algorithm. There will still be about t*log2(t) tuple comparisons in
* total, but we will also need to write and read each tuple once per
* merge pass. We expect about ceil(logM(r)) merge passes where r is the
* number of initial runs formed and M is the merge order used by tuplesort.c.
* Since the average initial run should be about twice sort_mem, we have
* disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))
* cpu = comparison_cost * t * log2(t)
*
* If the sort is bounded (i.e., only the first k result tuples are needed)
* and k tuples can fit into sort_mem, we use a heap method that keeps only
* k tuples in the heap; this will require about t*log2(k) tuple comparisons.
*
* The disk traffic is assumed to be 3/4ths sequential and 1/4th random
* accesses (XXX can't we refine that guess?)
*
* By default, we charge two operator evals per tuple comparison, which should
* be in the right ballpark in most cases. The caller can tweak this by
* specifying nonzero comparison_cost; typically that's used for any extra
* work that has to be done to prepare the inputs to the comparison operators.
*
* 'pathkeys' is a list of sort keys
* 'input_cost' is the total cost for reading the input data
* 'tuples' is the number of tuples in the relation
* 'width' is the average tuple width in bytes
* 'comparison_cost' is the extra cost per comparison, if any
* 'sort_mem' is the number of kilobytes of work memory allowed for the sort
* 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
*
* NOTE: some callers currently pass NIL for pathkeys because they
* can't conveniently supply the sort keys. Since this routine doesn't
* currently do anything with pathkeys anyway, that doesn't matter...
* but if it ever does, it should react gracefully to lack of key data.
* (Actually, the thing we'd most likely be interested in is just the number
* of sort keys, which all callers *could* supply.)
*/
void
cost_sort(Path *path, PlannerInfo *root,
          List *pathkeys, Cost input_cost, double tuples, int width,
          Cost comparison_cost, int sort_mem,
          double limit_tuples)
{
    Cost        startup_cost = input_cost;
    Cost        run_cost = 0;
    double      input_bytes = relation_byte_size(tuples, width);
    double      output_bytes;
    double      output_tuples;
    long        sort_mem_bytes = sort_mem * 1024L;

    if (!enable_sort)
        startup_cost += disable_cost;

    path->rows = tuples;

    /*
     * We want to be sure the cost of a sort is never estimated as zero, even
     * if passed-in tuple count is zero. Besides, mustn't do log(0)...
     */
    if (tuples < 2.0)
        tuples = 2.0;

    /* Include the default cost-per-comparison */
    comparison_cost += 2.0 * cpu_operator_cost;

    /* Do we have a useful LIMIT? */
    if (limit_tuples > 0 && limit_tuples < tuples)
    {
        output_tuples = limit_tuples;
        output_bytes = relation_byte_size(output_tuples, width);
    }
    else
    {
        output_tuples = tuples;
        output_bytes = input_bytes;
    }

    if (output_bytes > sort_mem_bytes)
    {
        /*
         * We'll have to use a disk-based sort of all the tuples
         */
        double      npages = ceil(input_bytes / BLCKSZ);
        double      nruns = (input_bytes / sort_mem_bytes) * 0.5;
        double      mergeorder = tuplesort_merge_order(sort_mem_bytes);
        double      log_runs;
        double      npageaccesses;

        /*
         * CPU costs
         *
         * Assume about N log2 N comparisons
         */
        startup_cost += comparison_cost * tuples * LOG2(tuples);

        /* Disk costs */

        /* Compute logM(r) as log(r) / log(M) */
        if (nruns > mergeorder)
            log_runs = ceil(log(nruns) / log(mergeorder));
        else
            log_runs = 1.0;
        npageaccesses = 2.0 * npages * log_runs;
        /* Assume 3/4ths of accesses are sequential, 1/4th are not */
        startup_cost += npageaccesses *
            (seq_page_cost * 0.75 + random_page_cost * 0.25);
    }
    else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
    {
        /*
         * We'll use a bounded heap-sort keeping just K tuples in memory, for
         * a total number of tuple comparisons of N log2 K; but the constant
         * factor is a bit higher than for quicksort. Tweak it so that the
         * cost curve is continuous at the crossover point.
         */
        startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
    }
    else
    {
        /* We'll use plain quicksort on all the input tuples */
        startup_cost += comparison_cost * tuples * LOG2(tuples);
    }

    /*
     * Also charge a small amount (arbitrarily set equal to operator cost) per
     * extracted tuple. We don't charge cpu_tuple_cost because a Sort node
     * doesn't do qual-checking or projection, so it has less overhead than
     * most plan nodes. Note it's correct to use tuples not output_tuples
     * here --- the upper LIMIT will pro-rate the run cost so we'd be double
     * counting the LIMIT otherwise.
     */
    run_cost += cpu_operator_cost * tuples;

    path->startup_cost = startup_cost;
    path->total_cost = startup_cost + run_cost;
}
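The Sort node's cost=9845.82..10095.82 in the plans falls out of the plain-quicksort branch above. A sketch assuming the default cost parameters (comparison_cost defaults to 2 * cpu_operator_cost = 0.005; input_cost is the Seq Scan total of 1541.00):

-- startup = input_cost + comparison_cost * tuples * log2(tuples)
--         = 1541.00 + 0.005 * 100000 * log2(100000) ≈ 9845.82
-- total   = startup + cpu_operator_cost * tuples ≈ 9845.82 + 250 = 10095.82
select 1541.00 + 0.005 * 100000 * log(2, 100000)                   as sort_startup,
       1541.00 + 0.005 * 100000 * log(2, 100000) + 0.0025 * 100000 as sort_total;

Note the first plan shows the same estimate even though it actually spilled to disk ("external sort Disk: 2544kB"): the planner sized the input at roughly 100000 * 40 bytes ≈ 4MB (16 bytes of data plus tuple-header overhead per row, see relation_byte_size), which just squeaks under the 4MB work_mem, so it costed an in-memory quicksort; the runtime spill never feeds back into the estimate.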