PostgreSQL LOCK WAITING monitor script for nagios

5 minute read

背景

数据库锁等待监控脚本.

提供给nagios使用 :

本例环境 :

被监控的数据库DATABASEs

1. 192.168.2.2 : 1921  
2. 192.168.21.3 : 1921  

监控服务器monitor server

192.168.100.2  

nagios 服务器

192.168.10.2  

首先要在被监控的数据库中创建1个超级用户, 用于查询pg_stat_activity表. 此例该用户名为digoal.

create role digoal superuser login connection limit 32 encrypted password 'DIGOAL';  

接下来需要配置被监控的数据库的pg_hba.conf, 允许192.168.100.2通过digoal用户访问postgres数据库.

vi pg_hba.conf  
host postgres digoal 192.168.100.2/32 md5  
pg_ctl reload  

然后需要在monitor server新建PGPASSFILE

vi /etc/.pgpass  
*:*:postgres:digoal:DIGOAL  
chmod 400 /etc/.pgpass  

.pgpass的owner必须改成调用nagios脚本的用户. 该用户配置在/etc/xinetd.d/nrpe.

在监控服务器上新建检测脚本 :

vi /usr/local/nagios/libexec/check_pg_lock.sh  
  
#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGPASSFILE=/etc/.pgpass  
  
create_func() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
  
echo $CONN_INFO  
  
psql -q -t $CONN_INFO <<EOF  
create or replace function lock_wait (i_lockwait interval) returns void as \$\$  
declare  
  v_dbip inet;  
  v_dbport int;  
  v_datname name;  
  v_usename name;  
  v_client_addr inet;  
  v_client_port int;  
  v_query_start timestamp with time zone;  
  v_current_query text;  
  v_ver text;  
begin  
  set client_min_messages = notice;  
  select substr(version(),12,3) into v_ver;  
  select inet_server_addr() into v_dbip;  
  select inet_server_port() into v_dbport;  
  if v_ver >= '9.2' then  
    for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in   
      select datname,usename,client_addr,client_port,query_start,query from pg_stat_activity where waiting and now()-query_start > i_lockwait  
    loop  
      raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'  
      ,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;  
    end loop;  
    return;  
  else  
    for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in   
      select datname,usename,client_addr,client_port,query_start,current_query from pg_stat_activity where waiting and now()-query_start > i_lockwait  
    loop  
      raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'  
      ,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;  
    end loop;  
    return;  
  end if;  
  return;  
end;  
\$\$ language plpgsql;  
EOF  
}  
  
mon() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
  
echo $CONN_INFO  
  
psql -q -t $CONN_INFO -c "$SQL"  
}  
  
# global result  
result=0  
  
# set env  
. /usr/local/nagios/etc/check_pg_env.sh  
  
# monitor  
LOG="/tmp/check_pg_lock.log"  
SQL="select lock_wait(interval '5 second')"  
for i in "HZ1_CHECK_PG_LOCK_DIGOAL" "HZ2_CHECK_PG_LOCK_DIGOAL"  
do  
  eval CONN_INFO="$"$i  
  create_func  
  cnt=`mon 2>&1|grep -c -E "NOTICE|ERROR"`  
  if [ $cnt -ge 1 ]; then  
    result=2  
    mon 2>&1|tee $LOG  
  fi  
done  
  
# exit global proc  
exit $result  

配置脚本权限 :

chmod 555 /usr/local/nagios/libexec/check_pg_lock.sh  

配置连接文件 :

vi /usr/local/nagios/etc/check_pg_env.sh  
# CHECK_PG_LOCK env  
# 杭州某IDC  
HZ1_CHECK_PG_LOCK_DIGOAL="-h 10.1.110.40 -p 1921 -U digoal postgres"  
HZ2_CHECK_PG_LOCK_DIGOAL="-h 10.1.123.212 -p 1921 -U digoal postgres"  

修改权限:

chmod 444 /usr/local/nagios/etc/check_pg_env.sh  

修改监控服务器的nagios客户端配置文件 :

vi /usr/local/nagios/etc/nrpe.cfg   
command[check_pg_lock]=/usr/local/nagios/libexec/check_pg_lock.sh  

重启xinetd服务

service xinetd restart  
Stopping xinetd: [  OK  ]  
Starting xinetd: [  OK  ]  

配置nagios服务端, 增加check_pg_lock的检测.

如果发现锁等待超时, 可以使用如下3个视图进行分析 :

CREATE VIEW lockview AS  
SELECT pid,virtualtransaction AS vxid, locktype AS lock_type,  
mode AS lock_mode,granted,  
CASE  
WHEN virtualxid IS NOT NULL AND transactionid IS NOT NULL  
THEN virtualxid||' ' || transactionid  
WHEN virtualxid::text IS NOT NULL  
THEN virtualxid  
ELSE transactionid::text  
END AS xid_lock, relname,  
page, tuple,classid,objid,objsubid  
FROM pg_locks LEFT OUTER JOIN pg_class ON (pg_locks.relation= pg_class.oid)  
WHERE -- donot showour view'slocks  
pid != pg_backend_pid()AND  
--no needtoshowself-vxid locks  
virtualtransaction IS DISTINCT FROM virtualxid  
--granted isordered earlier  
ORDER BY 1, 2,5 DESC, 6, 3,4, 7;  
  
CREATE VIEW lockview1 AS  
SELECT pid,vxid, lock_type,lock_mode,granted,xid_lock, relname  
FROM lockview  
--granted isordered earlier  
ORDER BY 1,2,5 DESC, 6,3,4,7;  
  
CREATE VIEW lockview2 AS  
SELECT pid,vxid, lock_type,page, tuple,classid,objid,objsubid  
FROM lockview  
--granted isfirst  
--add non-display columns tomatch orderingoflockview  
ORDER BY 1,2,granted DESC, vxid, xid_lock::text, 3,4,5,6,7,8;  

例如 :

[root@db-digoal ~]# /usr/local/nagios/libexec/check_pg_lock.sh  
NOTICE:  dbip:10.1.173.212, dbport:1921, datname:postgres, usename:postgres, client_addr:<NULL>, client_port:-1, query_start:2013-02-21 21:35:23.245022+08, current_query:update test set id=1;  
   
  
NOTICE:  dbip:10.1.100.40, dbport:1921, datname:android_market, usename:postgres, client_addr:<NULL>, client_port:-1, query_start:2013-02-21 21:30:47.899411+08, current_query:update test set id=3;  
NOTICE:  dbip:10.1.100.40, dbport:1921, datname:android_market, usename:postgres, client_addr:<NULL>, client_port:-1, query_start:2013-02-21 21:31:22.157254+08, current_query:update test set id=3;  

返回值, nagios要用到这个值判断检查状态.

[root@db-digoal ~]# echo $?  
2  

分析10.1.100.40的NOTICE :

select * from lockview1 where relname='test';  
  pid  |    vxid    | lock_type |    lock_mode     | granted | xid_lock | relname   
-------+------------+-----------+------------------+---------+----------+---------  
 12396 | 219/122462 | relation  | RowExclusiveLock | t       |          | test  
 12396 | 219/122462 | tuple     | ExclusiveLock    | t       |          | test  
 21011 | 234/100082 | relation  | RowExclusiveLock | t       |          | test  
 21011 | 234/100082 | tuple     | ExclusiveLock    | f       |          | test  
 27207 | 236/256259 | relation  | RowExclusiveLock | t       |          | test  
(5 rows)  

从以上锁信息分析,

12396正是21011和27207等待的会话.

KILL掉即可.

digoal=# select pg_terminate_backend(12396);  
 pg_terminate_backend   
----------------------  
 t  
(1 row)  
  
digoal=# select * from lockview1 where relname='test';  
  pid  |    vxid    | lock_type |    lock_mode     | granted | xid_lock | relname   
-------+------------+-----------+------------------+---------+----------+---------  
 21011 | 234/100082 | relation  | RowExclusiveLock | t       |          | test  
 21011 | 234/100082 | tuple     | ExclusiveLock    | t       |          | test  
 27207 | 236/256259 | relation  | RowExclusiveLock | t       |          | test  
(3 rows)  

此时21011获得了tuple exclusivelock, 而27207还未获得, 它在等21011释放.

KILL 21011即可.

digoal=# select pg_terminate_backend(21011);  

小结

1. 程序BUG可能带来持有锁, 但是不释放的问题. 例如持有锁后啥都不干, 也不结束事务, 也就是pg_stat_activity.current_query看到的IDLE in transaction.

\所以数据库的锁等待超时监控显得尤为重要.

2. 由于集中监控, 可能因为网络的延时导致执行时间过长, nagios有超时机制, 容易导致不稳定.

可以考虑将监控放到crontab中, nagios调用的脚本通过检测输出日志. 分出如下两部分 :

crontab执行脚本 :

#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
export PGPASSFILE=/etc/.pgpass  
  
create_func() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
  
#DEBUG switch  
#echo $CONN_INFO  
  
psql -q -t $CONN_INFO <<EOF  
create or replace function lock_wait (i_lockwait interval) returns void as \$\$  
declare  
  v_dbip inet;  
  v_dbport int;  
  v_datname name;  
  v_usename name;  
  v_client_addr inet;  
  v_client_port int;  
  v_query_start timestamp with time zone;  
  v_current_query text;  
  v_ver text;  
begin  
  set client_min_messages = notice;  
  select substr(version(),12,3) into v_ver;  
  select inet_server_addr() into v_dbip;  
  select inet_server_port() into v_dbport;  
  if v_ver >= '9.2' then  
    for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in   
      select datname,usename,client_addr,client_port,query_start,query from pg_stat_activity where waiting and now()-query_start > i_lockwait  
    loop  
      raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'  
      ,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;  
    end loop;  
    return;  
  else  
    for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in   
      select datname,usename,client_addr,client_port,query_start,current_query from pg_stat_activity where waiting and now()-query_start > i_lockwait  
    loop  
      raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'  
      ,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;  
    end loop;  
    return;  
  end if;  
  return;  
end;  
\$\$ language plpgsql;  
EOF  
}  
  
mon() {  
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`  
if [ $VER -ge 1 ]; then  
  IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`  
  if [ $IS_STDBY -ge 1 ]; then  
    return 0  
  fi  
fi  
  
#DEBUG switch  
#echo $CONN_INFO  
  
psql -q -t $CONN_INFO -c "$SQL"  
}  
  
# global result  
result=0  
  
# set env  
. /usr/local/nagios/etc/check_pg_env.sh  
  
# monitor  
LOGFILE="/tmp/check_pg_lock.log"  
TMPLOGFILE="/tmp/check_pg_lock.log.tmp"  
SQL="select lock_wait(interval '5 second')"  
# 清除临时日志数据  
echo "" >$TMPLOGFILE  
  
for i in "HZ1_CHECK_PG_LOCK_DIGOAL" "HZ2_CHECK_PG_LOCK_DIGOAL"  
do  
  eval CONN_INFO="$"$i  
  create_func  
  cnt=`mon 2>&1|grep -c -E "NOTICE|ERROR"`  
  if [ $cnt -ge 1 ]; then  
    result=2  
    mon >>$TMPLOGFILE 2>&1  
  fi  
done  
  
echo -e "return 2" >>$TMPLOGFILE 2>&1  
# 替换数据  
cat $TMPLOGFILE >$LOGFILE  
  
# exit global proc  
exit $result  

nagios脚本 :

#!/bin/bash  
# 环境变量  
export LANG=en_US.utf8  
export PGHOME=/opt/pgsql  
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib  
export DATE=`date +"%Y%m%d%H%M"`  
export PATH=$PGHOME/bin:$PATH:.  
export MANPATH=$PGHOME/share/man:$MANPATH  
  
LOGFILE=/tmp/check_pg_lock.log  
  
CNT=`cat $LOGFILE|grep -c -E "NOTICE|ERROR"`  
if [ $CNT -ge 1 ]; then  
    # summary  
    echo -e "ERROR: SQL lock timeout."  
    cat $LOGFILE|grep -v "^\$"|awk '{print $2}'|uniq -c  
    # detail  
    echo -e "\nDETAIL:"  
    cat $LOGFILE|grep -v "^\$"  
    exit 2  
fi  
exit 0  

其他

1. 现成的监控脚本可以参考bucardo的check_postgres.pl

支持nagios, mrtg, cacti等

http://bucardo.org/wiki/Check_postgres

2. csvlog监控

http://blog.163.com/digoal@126/blog/static/163877040201312241028667/

Flag Counter

digoal’s 大量PostgreSQL文章入口