PostgreSQL hll (HyperLogLog) extension for State of The Art Cardinality Estimation Algorithm - 1

7 minute read

背景

安装

1. 下载 :

https://github.com/aggregateknowledge/postgresql-hll/archive/master.zip

2. 修改Makefile, 指定PG_CONFIG路径

vi Makefile  
PG_CONFIG = /home/ocz/pgsql9.2.1/bin/pg_config  

3. 编译. (测试数据库版本PostgreSQL 9.2.1)

安装时有个BUG, 少包含了1个头文件. 可以通过修改hll.c来修复建后面. 已经发邮件给作者了.

root@db-172-16-3-150-> make clean  
rm -f hll.so   libhll.a   
rm -f hll.o MurmurHash3.o   
rm -rf -r   
root@db-172-16-3-150-> make  
gcc -O2 -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -g -fpic -fPIC -std=c99 -I. -I. -I/home/ocz/pgsql9.2.1/include/server -I/home/ocz/pgsql9.2.1/include/internal -D_GNU_SOURCE -I/usr/include/libxml2   -c -o hll.o hll.c  
hll.c: In function ‘hll_hashval_in’:  
hll.c:1532: error: ‘int8in’ undeclared (first use in this function)  
hll.c:1532: error: (Each undeclared identifier is reported only once  
hll.c:1532: error: for each function it appears in.)  
hll.c: In function ‘hll_hashval_out’:  
hll.c:1541: error: ‘int8out’ undeclared (first use in this function)  
make: *** [hll.o] Error 1  

原因是int8in和int8out未定义. 把头文件放进来就可以了.

修复如下 :

root@db-172-16-3-150-> vi hll.c  

增加以下头文件信息.

#include "utils/int8.h"  

重新编译通过 :

su - root  
. /home/ocz/.bash_profile  
gmake clean  
gmake  
gmake install  

使用

1. 创建extension

ocz@db-172-16-3-150-> psql digoal postgres  
psql (9.2.1)  
Type "help" for help.  
digoal=# create extension hll;  
CREATE EXTENSION  

2. 新增类型, 函数, 操作符如下.

CREATE TYPE hll;  
  
CREATE FUNCTION hll_in(cstring, oid, integer)  
RETURNS hll  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_out(hll)  
RETURNS cstring  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_typmod_in(cstring[])  
RETURNS integer  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_typmod_out(integer)  
RETURNS cstring  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll(hll, integer, boolean)  
RETURNS hll  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE TYPE hll (  
        INTERNALLENGTH = variable,  
        INPUT = hll_in,  
        OUTPUT = hll_out,  
        TYPMOD_IN = hll_typmod_in,  
        TYPMOD_OUT = hll_typmod_out,  
        STORAGE = external  
);  
  
CREATE CAST (hll AS hll) WITH FUNCTION hll(hll, integer, boolean) AS IMPLICIT;  
  
CREATE CAST (bytea AS hll) WITHOUT FUNCTION;  
  
-- ----------------------------------------------------------------  
-- Hashed value type  
-- ----------------------------------------------------------------  
  
CREATE TYPE hll_hashval;  
  
CREATE FUNCTION hll_hashval_in(cstring, oid, integer)  
RETURNS hll_hashval  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_hashval_out(hll_hashval)  
RETURNS cstring  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE TYPE hll_hashval (  
        INTERNALLENGTH = 8,  
        PASSEDBYVALUE,  
        ALIGNMENT = double,  
        INPUT = hll_hashval_in,  
        OUTPUT = hll_hashval_out  
);  
  
CREATE FUNCTION hll_hashval_eq(hll_hashval, hll_hashval)  
RETURNS bool  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_hashval_ne(hll_hashval, hll_hashval)  
RETURNS bool  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_hashval(bigint)  
RETURNS hll_hashval  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_hashval_int4(integer)  
RETURNS hll_hashval  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
CREATE OPERATOR = (  
        LEFTARG = hll_hashval, RIGHTARG = hll_hashval,  
                PROCEDURE = hll_hashval_eq,  
        COMMUTATOR = '=', NEGATOR = '<>',  
        RESTRICT = eqsel, JOIN = eqjoinsel,  
        MERGES  
);  
  
CREATE OPERATOR <> (  
        LEFTARG = hll_hashval, RIGHTARG = hll_hashval,  
                PROCEDURE = hll_hashval_ne,  
        COMMUTATOR = '<>', NEGATOR = '=',  
        RESTRICT = neqsel, JOIN = neqjoinsel  
);  
  
-- Only allow explicit casts.  
CREATE CAST (bigint AS hll_hashval) WITHOUT FUNCTION;  
CREATE CAST (integer AS hll_hashval) WITH FUNCTION hll_hashval_int4(integer);  
  
-- ----------------------------------------------------------------  
-- Functions  
-- ----------------------------------------------------------------  
  
-- Equality of multisets.  
--  
CREATE FUNCTION hll_eq(hll, hll)  
RETURNS bool  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
-- Inequality of multisets.  
--  
CREATE FUNCTION hll_ne(hll, hll)  
RETURNS bool  
AS 'MODULE_PATHNAME'  
LANGUAGE C STRICT IMMUTABLE;  
  
-- Cardinality of a multiset.  
--  
CREATE FUNCTION hll_cardinality(hll)  
     RETURNS double precision  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Union of a pair of multisets.  
--  
CREATE FUNCTION hll_union(hll, hll)  
     RETURNS hll  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Adds an integer hash to a multiset.  
--  
CREATE FUNCTION hll_add(hll, hll_hashval)  
     RETURNS hll  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Adds a multiset to an integer hash.  
--  
CREATE FUNCTION hll_add_rev(hll_hashval, hll)  
     RETURNS hll  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Pretty-print a multiset.  
--  
CREATE FUNCTION hll_print(hll)  
     RETURNS cstring  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Create an empty multiset with parameters.  
--  
-- NOTE - we create multiple signatures to avoid coding the defaults  
-- in this sql file.  This allows the defaults to changed at runtime.  
--  
CREATE FUNCTION hll_empty()  
     RETURNS hll  
     AS 'MODULE_PATHNAME', 'hll_empty0'  
     LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_empty(integer)  
     RETURNS hll  
     AS 'MODULE_PATHNAME', 'hll_empty1'  
     LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_empty(integer, integer)  
     RETURNS hll  
     AS 'MODULE_PATHNAME', 'hll_empty2'  
     LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_empty(integer, integer, bigint)  
     RETURNS hll  
     AS 'MODULE_PATHNAME', 'hll_empty3'  
     LANGUAGE C STRICT IMMUTABLE;  
  
CREATE FUNCTION hll_empty(integer, integer, bigint, integer)  
     RETURNS hll  
     AS 'MODULE_PATHNAME', 'hll_empty4'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Returns the schema version of an hll.  
--  
CREATE FUNCTION hll_schema_version(hll)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Returns the type of an hll.  
--  
CREATE FUNCTION hll_type(hll)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Returns the log2m value of an hll.  
--  
CREATE FUNCTION hll_log2m(hll)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Returns the register width of an hll.  
--  
CREATE FUNCTION hll_regwidth(hll)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Returns the maximum explicit threshold of an hll.  
--  
CREATE FUNCTION hll_expthresh(hll, OUT specified bigint, OUT effective bigint)  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Returns the sparse enabled value of an hll.  
--  
CREATE FUNCTION hll_sparseon(hll)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Set output version.  
--  
CREATE FUNCTION hll_set_output_version(integer)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Set sparse to full compressed threshold to fixed value.  
--  
CREATE FUNCTION hll_set_max_sparse(integer)  
     RETURNS integer  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Change the default type modifier, empty and add aggregate defaults.  
CREATE FUNCTION hll_set_defaults(IN i_log2m integer,  
                                 IN i_regwidth integer,  
                                 IN i_expthresh bigint,  
                                 IN i_sparseon integer,  
                                 OUT o_log2m integer,  
                                 OUT o_regwidth integer,  
                                 OUT o_expthresh bigint,  
                                 OUT o_sparseon integer)  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- ----------------------------------------------------------------  
-- Murmur Hashing  
-- ----------------------------------------------------------------  
  
-- Hash a boolean.  
--  
CREATE FUNCTION hll_hash_boolean(boolean, integer default 0)  
     RETURNS hll_hashval  
     AS 'MODULE_PATHNAME', 'hll_hash_1byte'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Hash a smallint.  
--  
CREATE FUNCTION hll_hash_smallint(smallint, integer default 0)  
     RETURNS hll_hashval  
     AS 'MODULE_PATHNAME', 'hll_hash_2byte'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Hash an integer.  
--  
CREATE FUNCTION hll_hash_integer(integer, integer default 0)  
     RETURNS hll_hashval  
     AS 'MODULE_PATHNAME', 'hll_hash_4byte'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Hash a bigint.  
--  
CREATE FUNCTION hll_hash_bigint(bigint, integer default 0)  
     RETURNS hll_hashval  
     AS 'MODULE_PATHNAME', 'hll_hash_8byte'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Hash a byte array.  
--  
CREATE FUNCTION hll_hash_bytea(bytea, integer default 0)  
     RETURNS hll_hashval  
     AS 'MODULE_PATHNAME', 'hll_hash_varlena'  
     LANGUAGE C STRICT IMMUTABLE;  
  
-- Hash a text.  
--  
CREATE FUNCTION hll_hash_text(text, integer default 0)  
     RETURNS hll_hashval  
     AS 'MODULE_PATHNAME', 'hll_hash_varlena'  
     LANGUAGE C STRICT IMMUTABLE;  
  
  
-- ----------------------------------------------------------------  
-- Operators  
-- ----------------------------------------------------------------  
  
CREATE OPERATOR = (  
        LEFTARG = hll, RIGHTARG = hll, PROCEDURE = hll_eq,  
        COMMUTATOR = '=', NEGATOR = '<>',  
        RESTRICT = eqsel, JOIN = eqjoinsel,  
        MERGES  
);  
  
CREATE OPERATOR <> (  
        LEFTARG = hll, RIGHTARG = hll, PROCEDURE = hll_ne,  
        COMMUTATOR = '<>', NEGATOR = '=',  
        RESTRICT = neqsel, JOIN = neqjoinsel  
);  
  
CREATE OPERATOR || (  
       LEFTARG = hll, RIGHTARG = hll, PROCEDURE = hll_union  
);  
  
CREATE OPERATOR || (  
       LEFTARG = hll, RIGHTARG = hll_hashval, PROCEDURE = hll_add  
);  
  
CREATE OPERATOR || (  
       LEFTARG = hll_hashval, RIGHTARG = hll, PROCEDURE = hll_add_rev  
);  
  
CREATE OPERATOR # (  
       RIGHTARG = hll, PROCEDURE = hll_cardinality  
);  
  
-- ----------------------------------------------------------------  
-- Aggregates  
-- ----------------------------------------------------------------  
  
-- Union aggregate transition function, first arg internal data  
-- structure, second arg is a packed multiset.  
--  
CREATE FUNCTION hll_union_trans(internal, hll)  
     RETURNS internal  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
-- NOTE - unfortunately aggregate functions don't support default  
-- arguments so we need to declare 5 signatures.  
  
-- Add aggregate transition function, first arg internal data  
-- structure, second arg is a hashed value.  Remaining args are log2n,  
-- regwidth, expthresh, sparseon.  
--  
CREATE FUNCTION hll_add_trans4(internal,  
                               hll_hashval,  
                               integer,  
                               integer,  
                               bigint,  
                               integer)  
     RETURNS internal  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
CREATE FUNCTION hll_add_trans3(internal,  
                               hll_hashval,  
                               integer,  
                               integer,  
                               bigint)  
     RETURNS internal  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
CREATE FUNCTION hll_add_trans2(internal,  
                               hll_hashval,  
                               integer,  
                               integer)  
     RETURNS internal  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
CREATE FUNCTION hll_add_trans1(internal,  
                               hll_hashval,  
                               integer)  
     RETURNS internal  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
CREATE FUNCTION hll_add_trans0(internal,  
                               hll_hashval)  
     RETURNS internal  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
  
-- Converts internal data structure into packed multiset.  
--  
CREATE FUNCTION hll_pack(internal)  
     RETURNS hll  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
-- Computes cardinality of internal data structure.  
--  
CREATE FUNCTION hll_card_unpacked(internal)  
     RETURNS double precision  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
-- Computes floor(cardinality) of internal data structure.  
--  
CREATE FUNCTION hll_floor_card_unpacked(internal)  
     RETURNS int8  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
-- Computes ceil(cardinality) of internal data structure.  
--  
CREATE FUNCTION hll_ceil_card_unpacked(internal)  
     RETURNS int8  
     AS 'MODULE_PATHNAME'  
     LANGUAGE C;  
  
-- Union aggregate function, returns hll.  
--  
CREATE AGGREGATE hll_union_agg (hll) (  
       SFUNC = hll_union_trans,  
       STYPE = internal,  
       FINALFUNC = hll_pack  
);  
  
-- NOTE - unfortunately aggregate functions don't support default  
-- arguments so we need to declare 5 signatures.  
  
-- Add aggregate function, returns hll.  
CREATE AGGREGATE hll_add_agg (hll_hashval) (  
       SFUNC = hll_add_trans0,  
       STYPE = internal,  
       FINALFUNC = hll_pack  
);  
  
-- Add aggregate function, returns hll.  
CREATE AGGREGATE hll_add_agg (hll_hashval, integer) (  
       SFUNC = hll_add_trans1,  
       STYPE = internal,  
       FINALFUNC = hll_pack  
);  
  
-- Add aggregate function, returns hll.  
CREATE AGGREGATE hll_add_agg (hll_hashval, integer, integer) (  
       SFUNC = hll_add_trans2,  
       STYPE = internal,  
       FINALFUNC = hll_pack  
);  
  
-- Add aggregate function, returns hll.  
CREATE AGGREGATE hll_add_agg (hll_hashval, integer, integer, bigint) (  
       SFUNC = hll_add_trans3,  
       STYPE = internal,  
       FINALFUNC = hll_pack  
);  
  
-- Add aggregate function, returns hll.  
CREATE AGGREGATE hll_add_agg (hll_hashval, integer, integer, bigint, integer) (  
       SFUNC = hll_add_trans4,  
       STYPE = internal,  
       FINALFUNC = hll_pack  
);  

参考

1. http://tapoueh.org/blog/2013/02/25-postgresql-hyperloglog.html

2. http://blog.aggregateknowledge.com/2013/02/04/open-source-release-postgresql-hll/

3. https://github.com/aggregateknowledge/postgresql-hll

4. http://blog.aggregateknowledge.com/author/wwkae/

5. http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf

Flag Counter

digoal’s 大量PostgreSQL文章入口