PostgreSQL: to sort or not to sort for GROUP BY?

Background

A friend hit this case: why does GROUP BY use a Sort in one of his queries?

It is really just the optimizer making a cost-based choice: whichever plan is estimated to be cheapest wins, so a sort may well be picked.

And of course, if the hash aggregate is estimated to be cheaper, HashAggregate is chosen instead.

CASE:

postgres=# create table t1(c1 int,c2 int,c3 int,c4 int);  
postgres=# insert into t1 select generate_series(1,100000),1,1,1;  
postgres=# show work_mem;  
 work_mem   
----------  
 4MB  
(1 row)  
  
postgres=# explain (analyze,verbose,buffers,costs,timing) select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;  
                                                          QUERY PLAN                                                            
------------------------------------------------------------------------------------------------------------------------------  
 Group  (cost=9845.82..11095.82 rows=100000 width=16) (actual time=340.382..384.324 rows=100000 loops=1)  
   Output: c1, c2, c3, c4  
   Group Key: t1.c1, t1.c2, t1.c3, t1.c4  
   Buffers: shared hit=544, temp read=318 written=318  
   ->  Sort  (cost=9845.82..10095.82 rows=100000 width=16) (actual time=340.379..353.887 rows=100000 loops=1)  
         Output: c1, c2, c3, c4  
         Sort Key: t1.c1, t1.c2, t1.c3, t1.c4  
         Sort Method: external sort  Disk: 2544kB  
         Buffers: shared hit=544, temp read=318 written=318  
         ->  Seq Scan on public.t1  (cost=0.00..1541.00 rows=100000 width=16) (actual time=0.025..26.641 rows=100000 loops=1)  
               Output: c1, c2, c3, c4  
               Buffers: shared hit=541  
 Planning time: 0.079 ms  
 Execution time: 392.131 ms  
(14 rows)  
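
Why was HashAggregate not even considered here? Before PostgreSQL 13 the planner normally only considers hashed grouping when it estimates that the hash table for all groups fits in work_mem. With 100000 distinct groups plus per-entry overhead that estimate is well above 4MB, so only the sort-based plan was costed. A very rough sanity check (the ~100-byte per-entry hash-table overhead is an assumption; the real figure is version dependent):

postgres=# -- rough hash table estimate: groups * (16B of data + ~16B minimal tuple header + ~100B assumed entry overhead)  
postgres=# select pg_size_pretty((100000 * (16 + 16 + 100))::bigint);   -- about 13 MB, far more than work_mem = 4MB  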
  
postgres=# set work_mem='1GB';  
SET  
postgres=# explain (analyze,verbose,buffers,costs,timing) select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;  
                                                       QUERY PLAN                                                         
------------------------------------------------------------------------------------------------------------------------  
 HashAggregate  (cost=2541.00..3541.00 rows=100000 width=16) (actual time=74.786..97.568 rows=100000 loops=1)  
   Output: c1, c2, c3, c4  
   Group Key: t1.c1, t1.c2, t1.c3, t1.c4  
   Buffers: shared hit=541  
   ->  Seq Scan on public.t1  (cost=0.00..1541.00 rows=100000 width=16) (actual time=0.037..16.179 rows=100000 loops=1)  
         Output: c1, c2, c3, c4  
         Buffers: shared hit=541  
 Planning time: 0.128 ms  
 Execution time: 104.705 ms  
(9 rows)  
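
With enough work_mem the hash table fits, both strategies get costed, and HashAggregate wins (total 3541 vs 11095.82). The estimates above can be reproduced by hand from the cost model (see cost_agg() below), assuming the default cost parameters seq_page_cost=1, cpu_tuple_cost=0.01, cpu_operator_cost=0.0025; this is just a back-of-envelope check:

postgres=# -- Seq Scan: pages * seq_page_cost + tuples * cpu_tuple_cost  
postgres=# select 541 * 1.0 + 100000 * 0.01;            -- 1541  
postgres=# -- HashAggregate startup: input total + grouping comparisons (4 columns)  
postgres=# select 1541 + 0.0025 * 4 * 100000;           -- 2541  
postgres=# -- HashAggregate total: startup + cpu_tuple_cost per output group  
postgres=# select 2541 + 0.01 * 100000;                 -- 3541  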
  
postgres=# set enable_hashagg=off;  
SET  
postgres=# explain (analyze,verbose,buffers,costs,timing) select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;  
                                                         QUERY PLAN                                                            
-----------------------------------------------------------------------------------------------------------------------------  
 Group  (cost=9845.82..11095.82 rows=100000 width=16) (actual time=28.217..62.442 rows=100000 loops=1)  
   Output: c1, c2, c3, c4  
   Group Key: t1.c1, t1.c2, t1.c3, t1.c4  
   Buffers: shared hit=541  
   ->  Sort  (cost=9845.82..10095.82 rows=100000 width=16) (actual time=28.214..35.161 rows=100000 loops=1)  
         Output: c1, c2, c3, c4  
         Sort Key: t1.c1, t1.c2, t1.c3, t1.c4  
         Sort Method: quicksort  Memory: 7760kB  
         Buffers: shared hit=541  
         ->  Seq Scan on public.t1  (cost=0.00..1541.00 rows=100000 width=16) (actual time=0.010..9.235 rows=100000 loops=1)  
               Output: c1, c2, c3, c4  
               Buffers: shared hit=541  
 Planning time: 0.039 ms  
 Execution time: 68.409 ms  
(14 rows)  

Related parameters:

#enable_hashagg = on  
#enable_sort = on  
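
Both switches can also be flipped for a single transaction with SET LOCAL, which is handy when you only want to look at the alternative plan without affecting the rest of the session. For example:

postgres=# begin;  
postgres=# set local enable_hashagg = off;   -- only visible inside this transaction  
postgres=# explain select c1,c2,c3,c4 from t1 group by c1,c2,c3,c4;  
postgres=# rollback;                         -- the setting reverts automatically  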

How the cost is calculated:

Note that for a sort-based aggregate (AGG_SORTED), the Agg node's startup cost is the startup cost of the already-sorted input, i.e. the cost after the sort.

src/backend/optimizer/path/costsize.c

/*  
 * cost_agg  
 *              Determines and returns the cost of performing an Agg plan node,  
 *              including the cost of its input.  
 *  
 * aggcosts can be NULL when there are no actual aggregate functions (i.e.,  
 * we are using a hashed Agg node just to do grouping).  
 *  
 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs  
 * are for appropriately-sorted input.  
 */  
void  
cost_agg(Path *path, PlannerInfo *root,  
                 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,  
                 int numGroupCols, double numGroups,  
                 Cost input_startup_cost, Cost input_total_cost,  
                 double input_tuples)  
{  
        double          output_tuples;  
        Cost            startup_cost;  
        Cost            total_cost;  
        AggClauseCosts dummy_aggcosts;  
  
        /* Use all-zero per-aggregate costs if NULL is passed */  
        if (aggcosts == NULL)  
        {  
                Assert(aggstrategy == AGG_HASHED);  
                MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));  
                aggcosts = &dummy_aggcosts;  
        }  
  
        /*  
         * The transCost.per_tuple component of aggcosts should be charged once  
         * per input tuple, corresponding to the costs of evaluating the aggregate  
         * transfns and their input expressions (with any startup cost of course  
         * charged but once).  The finalCost component is charged once per output  
         * tuple, corresponding to the costs of evaluating the finalfns.  
         *  
         * If we are grouping, we charge an additional cpu_operator_cost per  
         * grouping column per input tuple for grouping comparisons.  
         *  
         * We will produce a single output tuple if not grouping, and a tuple per  
         * group otherwise.  We charge cpu_tuple_cost for each output tuple.  
         *  
         * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the  
         * same total CPU cost, but AGG_SORTED has lower startup cost.  If the  
         * input path is already sorted appropriately, AGG_SORTED should be  
         * preferred (since it has no risk of memory overflow).  This will happen  
         * as long as the computed total costs are indeed exactly equal --- but if  
         * there's roundoff error we might do the wrong thing.  So be sure that  
         * the computations below form the same intermediate values in the same  
         * order.  
         */  
        if (aggstrategy == AGG_PLAIN)  
        {  
                startup_cost = input_total_cost;  
                startup_cost += aggcosts->transCost.startup;  
                startup_cost += aggcosts->transCost.per_tuple * input_tuples;  
                startup_cost += aggcosts->finalCost;  
                /* we aren't grouping */  
                total_cost = startup_cost + cpu_tuple_cost;  
                output_tuples = 1;  
        }  
        else if (aggstrategy == AGG_SORTED)  
        {  
                /* Here we are able to deliver output on-the-fly */  
                startup_cost = input_startup_cost;  
                total_cost = input_total_cost;  
                /* calcs phrased this way to match HASHED case, see note above */  
                total_cost += aggcosts->transCost.startup;  
                total_cost += aggcosts->transCost.per_tuple * input_tuples;  
                total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;  
                total_cost += aggcosts->finalCost * numGroups;  
                total_cost += cpu_tuple_cost * numGroups;  
                output_tuples = numGroups;  
        }  
        else  
        {  
                /* must be AGG_HASHED */  
                startup_cost = input_total_cost;  
                startup_cost += aggcosts->transCost.startup;  
                startup_cost += aggcosts->transCost.per_tuple * input_tuples;  
                startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;  
                total_cost = startup_cost;  
                total_cost += aggcosts->finalCost * numGroups;  
                total_cost += cpu_tuple_cost * numGroups;  
                output_tuples = numGroups;  
        }  
  
        path->rows = output_tuples;  
        path->startup_cost = startup_cost;  
        path->total_cost = total_cost;  
}  
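
Plugging the example into this model: the HashAggregate plan above is exactly the AGG_HASHED branch (startup 2541 = 1541 + 0.0025 * 4 * 100000, total 3541 = 2541 + 0.01 * 100000). The Group plan contains no aggregate functions, so it is costed by the closely related cost_group() (not shown here), but that charges the same per-tuple grouping-comparison term, and like AGG_SORTED its startup cost is just the sorted input's startup cost, which is why the Group node starts at 9845.82. A quick cross-check:

postgres=# -- Group total = sorted input's total + cpu_operator_cost * 4 grouping columns * 100000 tuples  
postgres=# select 10095.82 + 0.0025 * 4 * 100000;       -- 11095.82; startup stays at the Sort's 9845.82  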
  
  
/*  
 * cost_sort  
 *        Determines and returns the cost of sorting a relation, including  
 *        the cost of reading the input data.  
 *  
 * If the total volume of data to sort is less than sort_mem, we will do  
 * an in-memory sort, which requires no I/O and about t*log2(t) tuple  
 * comparisons for t tuples.  
 *  
 * If the total volume exceeds sort_mem, we switch to a tape-style merge  
 * algorithm.  There will still be about t*log2(t) tuple comparisons in  
 * total, but we will also need to write and read each tuple once per  
 * merge pass.  We expect about ceil(logM(r)) merge passes where r is the  
 * number of initial runs formed and M is the merge order used by tuplesort.c.  
 * Since the average initial run should be about twice sort_mem, we have  
 *              disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))  
 *              cpu = comparison_cost * t * log2(t)  
 *  
 * If the sort is bounded (i.e., only the first k result tuples are needed)  
 * and k tuples can fit into sort_mem, we use a heap method that keeps only  
 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.  
 *  
 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random  
 * accesses (XXX can't we refine that guess?)  
 *  
 * By default, we charge two operator evals per tuple comparison, which should  
 * be in the right ballpark in most cases.  The caller can tweak this by  
 * specifying nonzero comparison_cost; typically that's used for any extra  
 * work that has to be done to prepare the inputs to the comparison operators.  
 *  
 * 'pathkeys' is a list of sort keys  
 * 'input_cost' is the total cost for reading the input data  
 * 'tuples' is the number of tuples in the relation  
 * 'width' is the average tuple width in bytes  
 * 'comparison_cost' is the extra cost per comparison, if any  
 * 'sort_mem' is the number of kilobytes of work memory allowed for the sort  
 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound  
 *  
 * NOTE: some callers currently pass NIL for pathkeys because they  
 * can't conveniently supply the sort keys.  Since this routine doesn't  
 * currently do anything with pathkeys anyway, that doesn't matter...  
 * but if it ever does, it should react gracefully to lack of key data.  
 * (Actually, the thing we'd most likely be interested in is just the number  
 * of sort keys, which all callers *could* supply.)  
 */  
void  
cost_sort(Path *path, PlannerInfo *root,  
                  List *pathkeys, Cost input_cost, double tuples, int width,  
                  Cost comparison_cost, int sort_mem,  
                  double limit_tuples)  
{  
        Cost            startup_cost = input_cost;  
        Cost            run_cost = 0;  
        double          input_bytes = relation_byte_size(tuples, width);  
        double          output_bytes;  
        double          output_tuples;  
        long            sort_mem_bytes = sort_mem * 1024L;  
  
        if (!enable_sort)  
                startup_cost += disable_cost;  
  
        path->rows = tuples;  
  
        /*  
         * We want to be sure the cost of a sort is never estimated as zero, even  
         * if passed-in tuple count is zero.  Besides, mustn't do log(0)...  
         */  
        if (tuples < 2.0)  
                tuples = 2.0;  
  
        /* Include the default cost-per-comparison */  
        comparison_cost += 2.0 * cpu_operator_cost;  
  
        /* Do we have a useful LIMIT? */  
        if (limit_tuples > 0 && limit_tuples < tuples)  
        {  
                output_tuples = limit_tuples;  
                output_bytes = relation_byte_size(output_tuples, width);  
        }  
        else  
        {  
                output_tuples = tuples;  
                output_bytes = input_bytes;  
        }  
  
        if (output_bytes > sort_mem_bytes)  
        {  
                /*  
                 * We'll have to use a disk-based sort of all the tuples  
                 */  
                double          npages = ceil(input_bytes / BLCKSZ);  
                double          nruns = (input_bytes / sort_mem_bytes) * 0.5;  
                double          mergeorder = tuplesort_merge_order(sort_mem_bytes);  
                double          log_runs;  
                double          npageaccesses;  
  
                /*  
                 * CPU costs  
                 *  
                 * Assume about N log2 N comparisons  
                 */  
                startup_cost += comparison_cost * tuples * LOG2(tuples);  
  
                /* Disk costs */  
  
                /* Compute logM(r) as log(r) / log(M) */  
                if (nruns > mergeorder)  
                        log_runs = ceil(log(nruns) / log(mergeorder));  
                else  
                        log_runs = 1.0;  
                npageaccesses = 2.0 * npages * log_runs;  
                /* Assume 3/4ths of accesses are sequential, 1/4th are not */  
                startup_cost += npageaccesses *  
                        (seq_page_cost * 0.75 + random_page_cost * 0.25);  
        }  
        else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)  
        {  
                /*  
                 * We'll use a bounded heap-sort keeping just K tuples in memory, for  
                 * a total number of tuple comparisons of N log2 K; but the constant  
                 * factor is a bit higher than for quicksort.  Tweak it so that the  
                 * cost curve is continuous at the crossover point.  
                 */  
                startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);  
        }  
        else  
        {  
                /* We'll use plain quicksort on all the input tuples */  
                startup_cost += comparison_cost * tuples * LOG2(tuples);  
        }  
  
        /*  
         * Also charge a small amount (arbitrarily set equal to operator cost) per  
         * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node  
         * doesn't do qual-checking or projection, so it has less overhead than  
         * most plan nodes.  Note it's correct to use tuples not output_tuples  
         * here --- the upper LIMIT will pro-rate the run cost so we'd be double  
         * counting the LIMIT otherwise.  
         */  
        run_cost += cpu_operator_cost * tuples;  
  
        path->startup_cost = startup_cost;  
        path->total_cost = startup_cost + run_cost;  
}  
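
Feeding the example into this model (comparison_cost defaults to 2 * cpu_operator_cost = 0.005): the planner sizes the input at roughly 100000 * (24-byte heap tuple header + 16 bytes of data), about 3.9MB, just under work_mem = 4MB, so even in the first run it priced a plain in-memory quicksort and charged no disk I/O; that is why the Sort node costs 9845.82..10095.82 in both plans. At execution time the real tuplesort footprint is larger (about 7.7MB, as the quicksort case shows), which is why the 4MB run spilled to disk anyway. A quick cross-check (the 24-byte per-tuple header is the planner's own assumption inside relation_byte_size()):

postgres=# -- quicksort branch: startup = input total + 2 * cpu_operator_cost * N * log2(N)  
postgres=# select 1541 + 2 * 0.0025 * 100000 * log(2, 100000);   -- about 9845.82  
postgres=# -- total = startup + cpu_operator_cost per extracted tuple  
postgres=# select 9845.82 + 0.0025 * 100000;                     -- 10095.82  
postgres=# -- planner's input-size estimate vs work_mem  
postgres=# select pg_size_pretty((100000 * (16 + 24))::bigint);  -- 3906 kB, under 4MB  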

