PostgreSQL CSVLOG monitor script for nagios
背景
postgresql.conf配置 :
要求开启csvlog
log_destination = 'csvlog'
logging_collector = on
log_directory = 'pg_log'
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
log_file_mode = 0600
log_truncate_on_rotation = on
log_rotation_age = 1d
log_rotation_size = 10MB
脚本如下 :
vi /usr/local/nagios/libexec/check_pg_csvlog.sh
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGPASSFILE=/etc/.pgpass
if [ $# -ne 1 ]; then
echo "USE $0 summary|detail"
exit 0
else
PAR1=$1
if [ $PAR1 != "summary" ] && [ $PAR1 != "detail" ]; then
echo "USE $0 summary|detail"
exit 0
fi
fi
create_csvlogtable() {
echo $CONN_INFO
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
# PostgreSQL 9+
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
psql -q -t $CONN_INFO <<EOF
do language plpgsql \$\$
declare
begin
perform 1 from pg_tables where tablename='postgres_log';
if not found then
CREATE TABLE postgres_log
(
log_time timestamp(3) with time zone,
user_name text,
database_name text,
process_id integer,
connection_from text,
session_id text,
session_line_num bigint,
command_tag text,
session_start_time timestamp with time zone,
virtual_transaction_id text,
transaction_id bigint,
error_severity text,
sql_state_code text,
message text,
detail text,
hint text,
internal_query text,
internal_query_pos integer,
context text,
query text,
query_pos integer,
location text,
application_name text,
PRIMARY KEY (session_id, session_line_num)
);
end if;
end;
\$\$;
EOF
else
# PostgreSQL 8+
# 不支持inline language, 建议预先创建
echo "PostgreSQL 8.x"
fi
}
load_csvlog() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
if [ $VER -ge 1 ]; then
psql -q -t $CONN_INFO <<EOF
do language plpgsql \$\$
declare
v_dir text;
v_filename text;
v_sql text;
begin
truncate postgres_log;
select setting into v_dir from pg_settings where name='log_directory';
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)
where filename ~ 'csv\$' and filename ~ to_char(current_date,'yyyy-mm-dd') loop
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;
raise notice '%', v_sql;
execute v_sql;
end loop;
end;
\$\$;
EOF
else
# PostgreSQL 8.x 不支持do language, 这里忽略导入
echo "PostgreSQL 8.x"
fi
}
analyze_csvlog() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
CNT=`psql -q -t $CONN_INFO -c "select error_severity from postgres_log where error_severity<>'LOG' group by error_severity;"|grep -c -E "FATAL|ERROR"`
if [ $CNT -ge 1 ]; then
result=2
if [ $PAR1 == "summary" ]; then
psql -q -t $CONN_INFO <<EOF
select inet_server_addr() as server_ip,
inet_server_port() as server_port,
error_severity,
sql_state_code,
count(1) from postgres_log
where error_severity<>'LOG'
group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code
order by count(1) desc;
EOF
fi
if [ $PAR1 == "detail" ]; then
# detail的建议按照单库输出,否则太多
psql -x -q -t $CONN_INFO <<EOF
select inet_server_addr() as server_ip,
inet_server_port() as server_port,
error_severity,
connection_from,
sql_state_code,
message,
detail,
hint,
internal_query,
query,
location
from postgres_log
where error_severity<>'LOG'
order by log_time desc limit $OLINE;
EOF
fi
else
return 0
fi
}
# global result
result=0
# set env
. /usr/local/nagios/etc/check_pg_env.sh
# monitor
# detail监控输出最近20条详细信息.
OLINE=20
echo $OLINE
for i in "HZ1_CHECK_PG_CSVLOG_1"
do
eval CONN_INFO="$"$i
create_csvlogtable
load_csvlog
analyze_csvlog
done
# exit global proc
exit $result
修改执行权限 :
chmod 555 /usr/local/nagios/libexec/check_pg_csvlog.sh
修改nagios配置 :
vi /usr/local/nagios/etc/nrpe.cfg
command[check_pg_csvlog]=/usr/local/nagios/libexec/check_pg_csvlog.sh summary
重启xinetd服务 :
service xinetd restart
直接执行的结果如下 :
/usr/local/nagios/libexec/check_pg_csvlog.sh summary
其他
1. 由于数据需要从csvlog导入数据库表, 所以不推荐频繁调用这个脚本.
2. 对于9.1以上的系统, 可以考虑使用file_fdw创建外部表, 省去导入过程.
3. 所有8.4以上系统都支持的直接读取文件的函数, pg_read_file(text, bigint, bigint).
4. 这个脚本更偏向查看前一天的统计.
如果要做成实时告警的方式, 需要调整一下策略.
例如取最近1个小时的ERROR和FATAL记录数作为告警阈值.
每个系统的频繁程度不一样, 阈值也不一样.
修改后的nagios check脚本如下 :
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGPASSFILE=/etc/.pgpass
if [ $# -ne 0 ]; then
echo "USE $0"
exit 0
fi
create_csvlogtable() {
# debug
# echo $CONN_INFO
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
# PostgreSQL 9+
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
psql -q -t $CONN_INFO <<EOF
do language plpgsql \$\$
declare
begin
perform 1 from pg_tables where tablename='postgres_log';
if not found then
CREATE TABLE postgres_log
(
log_time timestamp(3) with time zone,
user_name text,
database_name text,
process_id integer,
connection_from text,
session_id text,
session_line_num bigint,
command_tag text,
session_start_time timestamp with time zone,
virtual_transaction_id text,
transaction_id bigint,
error_severity text,
sql_state_code text,
message text,
detail text,
hint text,
internal_query text,
internal_query_pos integer,
context text,
query text,
query_pos integer,
location text,
application_name text,
PRIMARY KEY (session_id, session_line_num)
);
end if;
end;
\$\$;
EOF
else
# PostgreSQL 8+
# 不支持inline language, 建议预先创建
echo "PostgreSQL 8.x"
fi
}
load_csvlog() {
# 需要用到BOND变量, 减少导入日志数量
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
if [ $VER -ge 1 ]; then
psql -q -t $CONN_INFO <<EOF
do language plpgsql \$\$
declare
v_dir text;
v_filename text;
v_sql text;
begin
set client_min_messages=warning;
truncate postgres_log;
select setting into v_dir from pg_settings where name='log_directory';
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)
where filename ~ 'csv\$' and (pg_stat_file(v_dir||'/'||filename)).modification+interval $BOND >= now() loop
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;
raise notice '%', v_sql;
execute v_sql;
end loop;
end;
\$\$;
EOF
else
# PostgreSQL 8.x 不支持do language, 这里忽略导入
echo "PostgreSQL 8.x"
fi
}
analyze_csvlog() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
# 严重警告
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $CRIT_CNT;"|grep -c "GET_IT"`
if [ $CNT -ge 1 ]; then
cresult=1
psql -F ',' -A -q -t $CONN_INFO <<EOF
select inet_server_addr() as server_ip,
inet_server_port() as server_port,
error_severity,
sql_state_code,
count(1) from postgres_log
where error_severity<>'LOG'
group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code
order by count(1) desc;
EOF
return 0
fi
# 警告
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $WARN_CNT;"|grep -c "GET_IT"`
if [ $CNT -ge 1 ]; then
wresult=1
psql -F ',' -A -q -t $CONN_INFO <<EOF
select inet_server_addr() as server_ip,
inet_server_port() as server_port,
error_severity,
sql_state_code,
count(1) from postgres_log
where error_severity<>'LOG'
group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code
order by count(1) desc;
EOF
return 0
fi
}
# global result
wresult=0
cresult=0
# set env
. /usr/local/nagios/etc/check_pg_env.sh
# monitor
# 错误阈值.
BOND="'1 hour'"
WARN_CNT=100
CRIT_CNT=200
# 本脚本不支持8.x的版本
for i in "HZXY_CHECK_PG_LOCK_1" "HZXY_CHECK_PG_LOCK_5"
do
eval CONN_INFO="$"$i
create_csvlogtable
load_csvlog
analyze_csvlog
done
# exit global proc
if [ $cresult -eq 1 ]; then
exit 2
fi
if [ $wresult -eq 1 ]; then
exit 1
fi
exit 0
5. 由于nagios check有超时机制, 建议做成异步(如crontab). 也就是说本地定时CHECK. 将数据写入LOG中.
nagios的check脚本检查LOG的输出.
修改后的两个脚本如下 :
-- 放在crontab 中定期执行的脚本 :
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGPASSFILE=/etc/.pgpass
if [ $# -ne 0 ]; then
echo "USE $0"
exit 0
fi
create_csvlogtable() {
# debug
# echo $CONN_INFO
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
# PostgreSQL 9+
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
psql -q -t $CONN_INFO <<EOF
do language plpgsql \$\$
declare
begin
perform 1 from pg_tables where tablename='postgres_log';
if not found then
CREATE TABLE postgres_log
(
log_time timestamp(3) with time zone,
user_name text,
database_name text,
process_id integer,
connection_from text,
session_id text,
session_line_num bigint,
command_tag text,
session_start_time timestamp with time zone,
virtual_transaction_id text,
transaction_id bigint,
error_severity text,
sql_state_code text,
message text,
detail text,
hint text,
internal_query text,
internal_query_pos integer,
context text,
query text,
query_pos integer,
location text,
application_name text,
PRIMARY KEY (session_id, session_line_num)
);
end if;
end;
\$\$;
EOF
else
# PostgreSQL 8+
# 不支持inline language, 建议预先创建表.
echo "PostgreSQL 8.x : $CONN_INFO"
fi
}
load_csvlog() {
# 需要用到BOND变量, 减少导入日志数量
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
if [ $VER -ge 1 ]; then
psql -q -t $CONN_INFO <<EOF
do language plpgsql \$\$
declare
v_dir text;
v_filename text;
v_sql text;
begin
set client_min_messages=warning;
truncate postgres_log;
select setting into v_dir from pg_settings where name='log_directory';
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)
where filename ~ 'csv\$' and (pg_stat_file(v_dir||'/'||filename)).modification+interval $BOND >= now() loop
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;
raise notice '%', v_sql;
execute v_sql;
end loop;
end;
\$\$;
EOF
else
# PostgreSQL 8.x 不支持do language, 所以使用函数.
echo "PostgreSQL 8.x : $CONN_INFO"
psql -q -t $CONN_INFO <<EOF
create or replace function load_csvlog() returns void as \$\$
declare
v_dir text;
v_filename text;
v_sql text;
begin
set client_min_messages=warning;
truncate postgres_log;
select setting into v_dir from pg_settings where name='log_directory';
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)
where filename ~ 'csv\$' and (pg_stat_file(v_dir||'/'||filename)).modification+interval $BOND >= now() loop
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;
raise notice '%', v_sql;
execute v_sql;
end loop;
return;
end;
\$\$ language plpgsql;
select * from load_csvlog();
EOF
fi
}
analyze_csvlog() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
# 严重警告
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $CRIT_CNT;"|grep -c "GET_IT"`
if [ $CNT -ge 1 ]; then
cresult=1
echo -e "CHECK ERRORs>$CRIT_CNT. return 2\n"
# nagios 将|视为status和performance info的分隔符, 所以需要调整一下psql的输出格式
psql -F ',' -A -q -t $CONN_INFO <<EOF
select inet_server_addr() as server_ip,
inet_server_port() as server_port,
error_severity,
sql_state_code,
count(1) from postgres_log
where error_severity<>'LOG'
group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code
having count(1)>100
order by count(1) desc;
EOF
return 0
fi
# 警告
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $WARN_CNT;"|grep -c "GET_IT"`
if [ $CNT -ge 1 ]; then
wresult=1
echo -e "CHECK ERRORs>$WARN_CNT. return 1\n"
psql -F ',' -A -q -t $CONN_INFO <<EOF
select inet_server_addr() as server_ip,
inet_server_port() as server_port,
error_severity,
sql_state_code,
count(1) from postgres_log
where error_severity<>'LOG'
group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code
having count(1)>100
order by count(1) desc;
EOF
return 0
fi
}
# global result
wresult=0
cresult=0
# set env
. /usr/local/nagios/etc/check_pg_env.sh
# monitor
# 错误阈值.
BOND="'1 hour'"
WARN_CNT=500
CRIT_CNT=1000
LOGFILE=/tmp/check_pg_csvlog.log
TMPLOGFILE=/tmp/check_pg_csvlog.log.tmp
# 清除tmp数据
echo "" >$TMPLOGFILE
for i in "HZ_CHECK_PG_LOCK_1" "HZ_CHECK_PG_LOCK_2"
do
eval CONN_INFO="$"$i
create_csvlogtable
load_csvlog
analyze_csvlog >>$TMPLOGFILE 2>&1
done
# 替换数据
cat $TMPLOGFILE >$LOGFILE
# exit global proc
if [ $cresult -eq 1 ]; then
exit 2
fi
if [ $wresult -eq 1 ]; then
exit 1
fi
exit 0
nagios的check脚本 :
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
LOGFILE=/tmp/check_pg_csvlog.log
CNT=`cat $LOGFILE|grep -c -E "ERROR|FATAL"`
if [ $CNT -ge 1 ]; then
RETVAL2=`cat $LOGFILE|grep -c "return 2"`
if [ $RETVAL2 -ge 1 ]; then
# summary
echo -e "CHECK ERRORs DB IP:"
cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"|awk -F ',' '{print $1}'|sort|uniq
# detail
echo -e "\n DETAIL:"
cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"
exit 2
fi
RETVAL1=`cat $LOGFILE|grep -c "return 1"`
if [ $RETVAL1 -ge 1 ]; then
# summary
echo -e "CHECK ERRORs DB IP:"
cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"|awk -F ',' '{print $1}'|sort|uniq
# detail
echo -e "\n DETAIL:"
cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"
exit 1
fi
cat $LOGFILE
exit 4
fi
参考
1. http://blog.163.com/digoal@126/blog/static/1638770402013121102459107/