PostgreSQL CSVLOG monitor script for nagios

9 minute read

背景

postgresql.conf配置 :

要求开启csvlog  
log_destination = 'csvlog'  
logging_collector = on  
log_directory = 'pg_log'  
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'  
log_file_mode = 0600  
log_truncate_on_rotation = on  
log_rotation_age = 1d  
log_rotation_size = 10MB  

脚本如下 :

vi /usr/local/nagios/libexec/check_pg_csvlog.sh  
  
#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGPASSFILE=/etc/.pgpass  
  
if [ $# -ne 1 ]; then  
  echo "USE $0 summary|detail"  
  exit 0  
else  
  PAR1=$1  
  if [ $PAR1 != "summary" ] && [ $PAR1 != "detail" ]; then  
    echo "USE $0 summary|detail"  
    exit 0  
  fi  
fi  
  
create_csvlogtable() {  
echo $CONN_INFO  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
# PostgreSQL 9+  
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
if [ $IS_STDBY -ge 1 ]; then  
  return 0  
fi  
psql -q -t $CONN_INFO <<EOF  
do language plpgsql \$\$  
declare  
begin  
perform 1 from pg_tables where tablename='postgres_log';  
if not found then  
CREATE TABLE postgres_log  
(  
  log_time timestamp(3) with time zone,  
  user_name text,  
  database_name text,  
  process_id integer,  
  connection_from text,  
  session_id text,  
  session_line_num bigint,  
  command_tag text,  
  session_start_time timestamp with time zone,  
  virtual_transaction_id text,  
  transaction_id bigint,  
  error_severity text,  
  sql_state_code text,  
  message text,  
  detail text,  
  hint text,  
  internal_query text,  
  internal_query_pos integer,  
  context text,  
  query text,  
  query_pos integer,  
  location text,  
  application_name text,  
  PRIMARY KEY (session_id, session_line_num)  
);  
end if;  
end;  
\$\$;  
EOF  
else  
# PostgreSQL 8+  
# 不支持inline language, 建议预先创建  
echo "PostgreSQL 8.x"  
fi  
}  
  
load_csvlog() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
if [ $VER -ge 1 ]; then  
psql -q -t $CONN_INFO <<EOF  
do language plpgsql \$\$  
declare  
v_dir text;  
v_filename text;  
v_sql text;  
begin  
truncate postgres_log;  
select setting into v_dir from pg_settings where name='log_directory';  
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)   
  where filename ~ 'csv\$' and filename ~ to_char(current_date,'yyyy-mm-dd') loop  
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;  
raise notice '%', v_sql;  
execute v_sql;  
end loop;  
end;  
\$\$;  
EOF  
else  
# PostgreSQL 8.x 不支持do language, 这里忽略导入  
echo "PostgreSQL 8.x"  
fi  
}  
  
analyze_csvlog() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
CNT=`psql -q -t $CONN_INFO -c "select error_severity from postgres_log where error_severity<>'LOG' group by error_severity;"|grep -c -E "FATAL|ERROR"`  
if [ $CNT -ge 1 ]; then  
  result=2  
  if [ $PAR1 == "summary" ]; then  
    psql -q -t $CONN_INFO <<EOF  
      select inet_server_addr() as server_ip,  
      inet_server_port() as server_port,  
      error_severity,  
      sql_state_code,  
      count(1) from postgres_log   
      where error_severity<>'LOG'   
      group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code   
      order by count(1) desc;  
EOF  
  fi  
  if [ $PAR1 == "detail" ]; then  
# detail的建议按照单库输出,否则太多  
    psql -x -q -t $CONN_INFO <<EOF  
      select inet_server_addr() as server_ip,  
      inet_server_port() as server_port,  
      error_severity,  
      connection_from,  
      sql_state_code,  
      message,  
      detail,  
      hint,  
      internal_query,  
      query,  
      location  
      from postgres_log   
      where error_severity<>'LOG'   
      order by log_time desc limit $OLINE;  
EOF  
  fi  
else  
  return 0  
fi  
}  
  
# global result  
result=0  
  
# set env  
. /usr/local/nagios/etc/check_pg_env.sh  
  
# monitor  
# detail监控输出最近20条详细信息.  
OLINE=20  
echo $OLINE  
for i in "HZ1_CHECK_PG_CSVLOG_1"  
do  
  eval CONN_INFO="$"$i  
  create_csvlogtable  
  load_csvlog  
  analyze_csvlog  
done  
  
# exit global proc  
exit $result  

修改执行权限 :

chmod 555 /usr/local/nagios/libexec/check_pg_csvlog.sh  

修改nagios配置 :

vi /usr/local/nagios/etc/nrpe.cfg  
command[check_pg_csvlog]=/usr/local/nagios/libexec/check_pg_csvlog.sh summary  

重启xinetd服务 :

service xinetd restart  

直接执行的结果如下 :

/usr/local/nagios/libexec/check_pg_csvlog.sh summary  

其他

1. 由于数据需要从csvlog导入数据库表, 所以不推荐频繁调用这个脚本.

2. 对于9.1以上的系统, 可以考虑使用file_fdw创建外部表, 省去导入过程.

3. 所有8.4以上系统都支持的直接读取文件的函数, pg_read_file(text, bigint, bigint).

4. 这个脚本更偏向查看前一天的统计.

如果要做成实时告警的方式, 需要调整一下策略.

例如取最近1个小时的ERROR和FATAL记录数作为告警阈值.

每个系统的频繁程度不一样, 阈值也不一样.

修改后的nagios check脚本如下 :

#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGPASSFILE=/etc/.pgpass  
  
if [ $# -ne 0 ]; then  
  echo "USE $0"  
  exit 0  
fi  
  
create_csvlogtable() {  
# debug  
# echo $CONN_INFO  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
# PostgreSQL 9+  
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
if [ $IS_STDBY -ge 1 ]; then  
  return 0  
fi  
psql -q -t $CONN_INFO <<EOF  
do language plpgsql \$\$  
declare  
begin  
perform 1 from pg_tables where tablename='postgres_log';  
if not found then  
CREATE TABLE postgres_log  
(  
  log_time timestamp(3) with time zone,  
  user_name text,  
  database_name text,  
  process_id integer,  
  connection_from text,  
  session_id text,  
  session_line_num bigint,  
  command_tag text,  
  session_start_time timestamp with time zone,  
  virtual_transaction_id text,  
  transaction_id bigint,  
  error_severity text,  
  sql_state_code text,  
  message text,  
  detail text,  
  hint text,  
  internal_query text,  
  internal_query_pos integer,  
  context text,  
  query text,  
  query_pos integer,  
  location text,  
  application_name text,  
  PRIMARY KEY (session_id, session_line_num)  
);  
end if;  
end;  
\$\$;  
EOF  
else  
# PostgreSQL 8+  
# 不支持inline language, 建议预先创建  
echo "PostgreSQL 8.x"  
fi  
}  
  
load_csvlog() {  
# 需要用到BOND变量, 减少导入日志数量  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
if [ $VER -ge 1 ]; then  
psql -q -t $CONN_INFO <<EOF  
do language plpgsql \$\$  
declare  
v_dir text;  
v_filename text;  
v_sql text;  
begin  
set client_min_messages=warning;  
truncate postgres_log;  
select setting into v_dir from pg_settings where name='log_directory';  
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)   
  where filename ~ 'csv\$' and (pg_stat_file(v_dir||'/'||filename)).modification+interval $BOND >= now() loop  
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;  
raise notice '%', v_sql;  
execute v_sql;  
end loop;  
end;  
\$\$;  
EOF  
else  
# PostgreSQL 8.x 不支持do language, 这里忽略导入  
echo "PostgreSQL 8.x"  
fi  
}  
  
analyze_csvlog() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
# 严重警告  
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $CRIT_CNT;"|grep -c "GET_IT"`  
if [ $CNT -ge 1 ]; then  
  cresult=1  
    psql -F ',' -A -q -t $CONN_INFO <<EOF  
      select inet_server_addr() as server_ip,  
      inet_server_port() as server_port,  
      error_severity,  
      sql_state_code,  
      count(1) from postgres_log   
      where error_severity<>'LOG'   
      group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code   
      order by count(1) desc;  
EOF  
return 0  
fi  
# 警告  
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $WARN_CNT;"|grep -c "GET_IT"`  
if [ $CNT -ge 1 ]; then  
  wresult=1  
    psql -F ',' -A -q -t $CONN_INFO <<EOF  
      select inet_server_addr() as server_ip,  
      inet_server_port() as server_port,  
      error_severity,  
      sql_state_code,  
      count(1) from postgres_log   
      where error_severity<>'LOG'   
      group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code   
      order by count(1) desc;  
EOF  
return 0  
fi  
}  
  
# global result  
wresult=0  
cresult=0  
  
# set env  
. /usr/local/nagios/etc/check_pg_env.sh  
  
# monitor  
# 错误阈值.  
BOND="'1 hour'"  
WARN_CNT=100  
CRIT_CNT=200  
# 本脚本不支持8.x的版本  
for i in "HZXY_CHECK_PG_LOCK_1" "HZXY_CHECK_PG_LOCK_5"  
do  
  eval CONN_INFO="$"$i  
  create_csvlogtable  
  load_csvlog  
  analyze_csvlog  
done  
  
# exit global proc  
if [ $cresult -eq 1 ]; then  
exit 2  
fi  
  
if [ $wresult -eq 1 ]; then  
exit 1  
fi  
  
exit 0  

5. 由于nagios check有超时机制, 建议做成异步(如crontab). 也就是说本地定时CHECK. 将数据写入LOG中.

nagios的check脚本检查LOG的输出.

修改后的两个脚本如下 :

-- 放在crontab 中定期执行的脚本  :   
#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGPASSFILE=/etc/.pgpass  
  
if [ $# -ne 0 ]; then  
  echo "USE $0"  
  exit 0  
fi  
  
create_csvlogtable() {  
# debug  
# echo $CONN_INFO  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
# PostgreSQL 9+  
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
if [ $IS_STDBY -ge 1 ]; then  
  return 0  
fi  
psql -q -t $CONN_INFO <<EOF  
do language plpgsql \$\$  
declare  
begin  
perform 1 from pg_tables where tablename='postgres_log';  
if not found then  
CREATE TABLE postgres_log  
(  
  log_time timestamp(3) with time zone,  
  user_name text,  
  database_name text,  
  process_id integer,  
  connection_from text,  
  session_id text,  
  session_line_num bigint,  
  command_tag text,  
  session_start_time timestamp with time zone,  
  virtual_transaction_id text,  
  transaction_id bigint,  
  error_severity text,  
  sql_state_code text,  
  message text,  
  detail text,  
  hint text,  
  internal_query text,  
  internal_query_pos integer,  
  context text,  
  query text,  
  query_pos integer,  
  location text,  
  application_name text,  
  PRIMARY KEY (session_id, session_line_num)  
);  
end if;  
end;  
\$\$;  
EOF  
else  
# PostgreSQL 8+  
# 不支持inline language, 建议预先创建表.   
echo "PostgreSQL 8.x : $CONN_INFO"  
fi  
}  
  
load_csvlog() {  
# 需要用到BOND变量, 减少导入日志数量  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
if [ $VER -ge 1 ]; then  
psql -q -t $CONN_INFO <<EOF  
do language plpgsql \$\$  
declare  
v_dir text;  
v_filename text;  
v_sql text;  
begin  
set client_min_messages=warning;  
truncate postgres_log;  
select setting into v_dir from pg_settings where name='log_directory';  
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)   
  where filename ~ 'csv\$' and (pg_stat_file(v_dir||'/'||filename)).modification+interval $BOND >= now() loop  
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;  
raise notice '%', v_sql;  
execute v_sql;  
end loop;  
end;  
\$\$;  
EOF  
else  
# PostgreSQL 8.x 不支持do language, 所以使用函数.  
echo "PostgreSQL 8.x : $CONN_INFO"  
psql -q -t $CONN_INFO <<EOF  
create or replace function load_csvlog() returns void as \$\$  
declare  
v_dir text;  
v_filename text;  
v_sql text;  
begin  
set client_min_messages=warning;  
truncate postgres_log;  
select setting into v_dir from pg_settings where name='log_directory';  
for v_filename in select filename from pg_ls_dir(v_dir) t(filename)   
  where filename ~ 'csv\$' and (pg_stat_file(v_dir||'/'||filename)).modification+interval $BOND >= now() loop  
v_sql := \$_\$COPY postgres_log FROM '\$_\$||v_dir||'/'||v_filename||\$_\$' WITH csv\$_\$;  
raise notice '%', v_sql;  
execute v_sql;  
end loop;  
return;  
end;  
\$\$ language plpgsql;  
select * from load_csvlog();  
EOF  
fi  
}  
  
analyze_csvlog() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
# 严重警告  
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $CRIT_CNT;"|grep -c "GET_IT"`  
if [ $CNT -ge 1 ]; then  
  cresult=1  
  echo -e "CHECK ERRORs>$CRIT_CNT. return 2\n"  
# nagios 将|视为status和performance info的分隔符, 所以需要调整一下psql的输出格式  
    psql -F ',' -A -q -t $CONN_INFO <<EOF  
      select inet_server_addr() as server_ip,  
      inet_server_port() as server_port,  
      error_severity,  
      sql_state_code,  
      count(1) from postgres_log   
      where error_severity<>'LOG'   
      group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code   
      having count(1)>100  
      order by count(1) desc;  
EOF  
return 0  
fi  
# 警告  
CNT=`psql -q -t $CONN_INFO -c "select 'GET_IT' from postgres_log where error_severity<>'LOG' having count(*) >= $WARN_CNT;"|grep -c "GET_IT"`  
if [ $CNT -ge 1 ]; then  
  wresult=1  
  echo -e "CHECK ERRORs>$WARN_CNT. return 1\n"  
    psql -F ',' -A -q -t $CONN_INFO <<EOF  
      select inet_server_addr() as server_ip,  
      inet_server_port() as server_port,  
      error_severity,  
      sql_state_code,  
      count(1) from postgres_log   
      where error_severity<>'LOG'   
      group by inet_server_addr(),inet_server_port(),error_severity,sql_state_code   
      having count(1)>100  
      order by count(1) desc;  
EOF  
return 0  
fi  
}  
  
# global result  
wresult=0  
cresult=0  
  
# set env  
. /usr/local/nagios/etc/check_pg_env.sh  
  
# monitor  
# 错误阈值.  
BOND="'1 hour'"  
WARN_CNT=500  
CRIT_CNT=1000  
LOGFILE=/tmp/check_pg_csvlog.log  
TMPLOGFILE=/tmp/check_pg_csvlog.log.tmp  
  
# 清除tmp数据  
echo "" >$TMPLOGFILE  
  
for i in "HZ_CHECK_PG_LOCK_1" "HZ_CHECK_PG_LOCK_2"  
do  
  eval CONN_INFO="$"$i  
  create_csvlogtable  
  load_csvlog  
  analyze_csvlog >>$TMPLOGFILE 2>&1  
done  
  
# 替换数据  
cat $TMPLOGFILE >$LOGFILE  
  
# exit global proc  
if [ $cresult -eq 1 ]; then  
exit 2  
fi  
  
if [ $wresult -eq 1 ]; then  
exit 1  
fi  
  
exit 0  

nagios的check脚本 :

#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
  
LOGFILE=/tmp/check_pg_csvlog.log  
  
CNT=`cat $LOGFILE|grep -c -E "ERROR|FATAL"`  
if [ $CNT -ge 1 ]; then  
  RETVAL2=`cat $LOGFILE|grep -c "return 2"`  
  if [ $RETVAL2 -ge 1 ]; then  
    # summary  
    echo -e "CHECK ERRORs DB IP:"  
    cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"|awk -F ',' '{print $1}'|sort|uniq  
    # detail  
    echo -e "\n DETAIL:"  
    cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"  
    exit 2  
  fi  
  RETVAL1=`cat $LOGFILE|grep -c "return 1"`  
  if [ $RETVAL1 -ge 1 ]; then  
    # summary  
    echo -e "CHECK ERRORs DB IP:"  
    cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"|awk -F ',' '{print $1}'|sort|uniq  
    # detail  
    echo -e "\n DETAIL:"  
    cat $LOGFILE|grep -v "^\$"|grep -v "CHECK ERRORs"  
    exit 1  
  fi  
  cat $LOGFILE  
  exit 4  
fi  

参考

1. http://blog.163.com/digoal@126/blog/static/1638770402013121102459107/

Flag Counter

digoal’s 大量PostgreSQL文章入口