PostgreSQL hll (HyperLogLog) extension for State of The Art Cardinality Estimation Algorithm - 1
背景
安装
1. 下载 :
https://github.com/aggregateknowledge/postgresql-hll/archive/master.zip
2. 修改Makefile, 指定PG_CONFIG路径
vi Makefile
PG_CONFIG = /home/ocz/pgsql9.2.1/bin/pg_config
3. 编译. (测试数据库版本PostgreSQL 9.2.1)
安装时有个BUG, 少包含了1个头文件. 可以通过修改hll.c来修复建后面. 已经发邮件给作者了.
root@db-172-16-3-150-> make clean
rm -f hll.so libhll.a
rm -f hll.o MurmurHash3.o
rm -rf -r
root@db-172-16-3-150-> make
gcc -O2 -Wall -Wmissing-prototypes -Wpointer-arith -Wdeclaration-after-statement -Wendif-labels -Wmissing-format-attribute -Wformat-security -fno-strict-aliasing -fwrapv -g -fpic -fPIC -std=c99 -I. -I. -I/home/ocz/pgsql9.2.1/include/server -I/home/ocz/pgsql9.2.1/include/internal -D_GNU_SOURCE -I/usr/include/libxml2 -c -o hll.o hll.c
hll.c: In function ‘hll_hashval_in’:
hll.c:1532: error: ‘int8in’ undeclared (first use in this function)
hll.c:1532: error: (Each undeclared identifier is reported only once
hll.c:1532: error: for each function it appears in.)
hll.c: In function ‘hll_hashval_out’:
hll.c:1541: error: ‘int8out’ undeclared (first use in this function)
make: *** [hll.o] Error 1
原因是int8in和int8out未定义. 把头文件放进来就可以了.
修复如下 :
root@db-172-16-3-150-> vi hll.c
增加以下头文件信息.
#include "utils/int8.h"
重新编译通过 :
su - root
. /home/ocz/.bash_profile
gmake clean
gmake
gmake install
使用
1. 创建extension
ocz@db-172-16-3-150-> psql digoal postgres
psql (9.2.1)
Type "help" for help.
digoal=# create extension hll;
CREATE EXTENSION
2. 新增类型, 函数, 操作符如下.
CREATE TYPE hll;
CREATE FUNCTION hll_in(cstring, oid, integer)
RETURNS hll
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_out(hll)
RETURNS cstring
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_typmod_in(cstring[])
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_typmod_out(integer)
RETURNS cstring
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll(hll, integer, boolean)
RETURNS hll
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE TYPE hll (
INTERNALLENGTH = variable,
INPUT = hll_in,
OUTPUT = hll_out,
TYPMOD_IN = hll_typmod_in,
TYPMOD_OUT = hll_typmod_out,
STORAGE = external
);
CREATE CAST (hll AS hll) WITH FUNCTION hll(hll, integer, boolean) AS IMPLICIT;
CREATE CAST (bytea AS hll) WITHOUT FUNCTION;
-- ----------------------------------------------------------------
-- Hashed value type
-- ----------------------------------------------------------------
CREATE TYPE hll_hashval;
CREATE FUNCTION hll_hashval_in(cstring, oid, integer)
RETURNS hll_hashval
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_hashval_out(hll_hashval)
RETURNS cstring
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE TYPE hll_hashval (
INTERNALLENGTH = 8,
PASSEDBYVALUE,
ALIGNMENT = double,
INPUT = hll_hashval_in,
OUTPUT = hll_hashval_out
);
CREATE FUNCTION hll_hashval_eq(hll_hashval, hll_hashval)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_hashval_ne(hll_hashval, hll_hashval)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_hashval(bigint)
RETURNS hll_hashval
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_hashval_int4(integer)
RETURNS hll_hashval
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE OPERATOR = (
LEFTARG = hll_hashval, RIGHTARG = hll_hashval,
PROCEDURE = hll_hashval_eq,
COMMUTATOR = '=', NEGATOR = '<>',
RESTRICT = eqsel, JOIN = eqjoinsel,
MERGES
);
CREATE OPERATOR <> (
LEFTARG = hll_hashval, RIGHTARG = hll_hashval,
PROCEDURE = hll_hashval_ne,
COMMUTATOR = '<>', NEGATOR = '=',
RESTRICT = neqsel, JOIN = neqjoinsel
);
-- Only allow explicit casts.
CREATE CAST (bigint AS hll_hashval) WITHOUT FUNCTION;
CREATE CAST (integer AS hll_hashval) WITH FUNCTION hll_hashval_int4(integer);
-- ----------------------------------------------------------------
-- Functions
-- ----------------------------------------------------------------
-- Equality of multisets.
--
CREATE FUNCTION hll_eq(hll, hll)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Inequality of multisets.
--
CREATE FUNCTION hll_ne(hll, hll)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Cardinality of a multiset.
--
CREATE FUNCTION hll_cardinality(hll)
RETURNS double precision
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Union of a pair of multisets.
--
CREATE FUNCTION hll_union(hll, hll)
RETURNS hll
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Adds an integer hash to a multiset.
--
CREATE FUNCTION hll_add(hll, hll_hashval)
RETURNS hll
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Adds a multiset to an integer hash.
--
CREATE FUNCTION hll_add_rev(hll_hashval, hll)
RETURNS hll
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Pretty-print a multiset.
--
CREATE FUNCTION hll_print(hll)
RETURNS cstring
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Create an empty multiset with parameters.
--
-- NOTE - we create multiple signatures to avoid coding the defaults
-- in this sql file. This allows the defaults to changed at runtime.
--
CREATE FUNCTION hll_empty()
RETURNS hll
AS 'MODULE_PATHNAME', 'hll_empty0'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_empty(integer)
RETURNS hll
AS 'MODULE_PATHNAME', 'hll_empty1'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_empty(integer, integer)
RETURNS hll
AS 'MODULE_PATHNAME', 'hll_empty2'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_empty(integer, integer, bigint)
RETURNS hll
AS 'MODULE_PATHNAME', 'hll_empty3'
LANGUAGE C STRICT IMMUTABLE;
CREATE FUNCTION hll_empty(integer, integer, bigint, integer)
RETURNS hll
AS 'MODULE_PATHNAME', 'hll_empty4'
LANGUAGE C STRICT IMMUTABLE;
-- Returns the schema version of an hll.
--
CREATE FUNCTION hll_schema_version(hll)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Returns the type of an hll.
--
CREATE FUNCTION hll_type(hll)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Returns the log2m value of an hll.
--
CREATE FUNCTION hll_log2m(hll)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Returns the register width of an hll.
--
CREATE FUNCTION hll_regwidth(hll)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Returns the maximum explicit threshold of an hll.
--
CREATE FUNCTION hll_expthresh(hll, OUT specified bigint, OUT effective bigint)
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Returns the sparse enabled value of an hll.
--
CREATE FUNCTION hll_sparseon(hll)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Set output version.
--
CREATE FUNCTION hll_set_output_version(integer)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Set sparse to full compressed threshold to fixed value.
--
CREATE FUNCTION hll_set_max_sparse(integer)
RETURNS integer
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- Change the default type modifier, empty and add aggregate defaults.
CREATE FUNCTION hll_set_defaults(IN i_log2m integer,
IN i_regwidth integer,
IN i_expthresh bigint,
IN i_sparseon integer,
OUT o_log2m integer,
OUT o_regwidth integer,
OUT o_expthresh bigint,
OUT o_sparseon integer)
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
-- ----------------------------------------------------------------
-- Murmur Hashing
-- ----------------------------------------------------------------
-- Hash a boolean.
--
CREATE FUNCTION hll_hash_boolean(boolean, integer default 0)
RETURNS hll_hashval
AS 'MODULE_PATHNAME', 'hll_hash_1byte'
LANGUAGE C STRICT IMMUTABLE;
-- Hash a smallint.
--
CREATE FUNCTION hll_hash_smallint(smallint, integer default 0)
RETURNS hll_hashval
AS 'MODULE_PATHNAME', 'hll_hash_2byte'
LANGUAGE C STRICT IMMUTABLE;
-- Hash an integer.
--
CREATE FUNCTION hll_hash_integer(integer, integer default 0)
RETURNS hll_hashval
AS 'MODULE_PATHNAME', 'hll_hash_4byte'
LANGUAGE C STRICT IMMUTABLE;
-- Hash a bigint.
--
CREATE FUNCTION hll_hash_bigint(bigint, integer default 0)
RETURNS hll_hashval
AS 'MODULE_PATHNAME', 'hll_hash_8byte'
LANGUAGE C STRICT IMMUTABLE;
-- Hash a byte array.
--
CREATE FUNCTION hll_hash_bytea(bytea, integer default 0)
RETURNS hll_hashval
AS 'MODULE_PATHNAME', 'hll_hash_varlena'
LANGUAGE C STRICT IMMUTABLE;
-- Hash a text.
--
CREATE FUNCTION hll_hash_text(text, integer default 0)
RETURNS hll_hashval
AS 'MODULE_PATHNAME', 'hll_hash_varlena'
LANGUAGE C STRICT IMMUTABLE;
-- ----------------------------------------------------------------
-- Operators
-- ----------------------------------------------------------------
CREATE OPERATOR = (
LEFTARG = hll, RIGHTARG = hll, PROCEDURE = hll_eq,
COMMUTATOR = '=', NEGATOR = '<>',
RESTRICT = eqsel, JOIN = eqjoinsel,
MERGES
);
CREATE OPERATOR <> (
LEFTARG = hll, RIGHTARG = hll, PROCEDURE = hll_ne,
COMMUTATOR = '<>', NEGATOR = '=',
RESTRICT = neqsel, JOIN = neqjoinsel
);
CREATE OPERATOR || (
LEFTARG = hll, RIGHTARG = hll, PROCEDURE = hll_union
);
CREATE OPERATOR || (
LEFTARG = hll, RIGHTARG = hll_hashval, PROCEDURE = hll_add
);
CREATE OPERATOR || (
LEFTARG = hll_hashval, RIGHTARG = hll, PROCEDURE = hll_add_rev
);
CREATE OPERATOR # (
RIGHTARG = hll, PROCEDURE = hll_cardinality
);
-- ----------------------------------------------------------------
-- Aggregates
-- ----------------------------------------------------------------
-- Union aggregate transition function, first arg internal data
-- structure, second arg is a packed multiset.
--
CREATE FUNCTION hll_union_trans(internal, hll)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- NOTE - unfortunately aggregate functions don't support default
-- arguments so we need to declare 5 signatures.
-- Add aggregate transition function, first arg internal data
-- structure, second arg is a hashed value. Remaining args are log2n,
-- regwidth, expthresh, sparseon.
--
CREATE FUNCTION hll_add_trans4(internal,
hll_hashval,
integer,
integer,
bigint,
integer)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;
CREATE FUNCTION hll_add_trans3(internal,
hll_hashval,
integer,
integer,
bigint)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;
CREATE FUNCTION hll_add_trans2(internal,
hll_hashval,
integer,
integer)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;
CREATE FUNCTION hll_add_trans1(internal,
hll_hashval,
integer)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;
CREATE FUNCTION hll_add_trans0(internal,
hll_hashval)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- Converts internal data structure into packed multiset.
--
CREATE FUNCTION hll_pack(internal)
RETURNS hll
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- Computes cardinality of internal data structure.
--
CREATE FUNCTION hll_card_unpacked(internal)
RETURNS double precision
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- Computes floor(cardinality) of internal data structure.
--
CREATE FUNCTION hll_floor_card_unpacked(internal)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- Computes ceil(cardinality) of internal data structure.
--
CREATE FUNCTION hll_ceil_card_unpacked(internal)
RETURNS int8
AS 'MODULE_PATHNAME'
LANGUAGE C;
-- Union aggregate function, returns hll.
--
CREATE AGGREGATE hll_union_agg (hll) (
SFUNC = hll_union_trans,
STYPE = internal,
FINALFUNC = hll_pack
);
-- NOTE - unfortunately aggregate functions don't support default
-- arguments so we need to declare 5 signatures.
-- Add aggregate function, returns hll.
CREATE AGGREGATE hll_add_agg (hll_hashval) (
SFUNC = hll_add_trans0,
STYPE = internal,
FINALFUNC = hll_pack
);
-- Add aggregate function, returns hll.
CREATE AGGREGATE hll_add_agg (hll_hashval, integer) (
SFUNC = hll_add_trans1,
STYPE = internal,
FINALFUNC = hll_pack
);
-- Add aggregate function, returns hll.
CREATE AGGREGATE hll_add_agg (hll_hashval, integer, integer) (
SFUNC = hll_add_trans2,
STYPE = internal,
FINALFUNC = hll_pack
);
-- Add aggregate function, returns hll.
CREATE AGGREGATE hll_add_agg (hll_hashval, integer, integer, bigint) (
SFUNC = hll_add_trans3,
STYPE = internal,
FINALFUNC = hll_pack
);
-- Add aggregate function, returns hll.
CREATE AGGREGATE hll_add_agg (hll_hashval, integer, integer, bigint, integer) (
SFUNC = hll_add_trans4,
STYPE = internal,
FINALFUNC = hll_pack
);
参考
1. http://tapoueh.org/blog/2013/02/25-postgresql-hyperloglog.html
2. http://blog.aggregateknowledge.com/2013/02/04/open-source-release-postgresql-hll/
3. https://github.com/aggregateknowledge/postgresql-hll
4. http://blog.aggregateknowledge.com/author/wwkae/
5. http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf