#!/usr/bin/ksh
###########################################################
#10/01/2003
###########################################################
#load-mon.sh command to monitor load from cron TEMPORARY
#
#This script checks whether idle has been under 10% for the last
#200 seconds and raises an alert; it runs at minutes 11,21,31,41,51.
#This should eventually be done through RMC.
#
# ARGUMENTS:
#   none
#
# QUIRKS:
#   it is just an improvisation ...
#
# FUNCTIONS:
#   mesgLoad   - log and mail a high-load alert
#   mesgHog    - log a possible CPU hog (mail is disabled)
#   eventLoad  - print the sampled sar line when idle is below $IDLE
#   eventHog   - print oracle processes whose CPU value exceeds $HOGL
#
###########################################################
#
# REVISIONS
# Wed Oct 1 10:53:07 DFT 2003
# 1.0 First version, tested and rushed into production
# 1.1 speed up - load was oscillating
# Fri Oct 24 09:49:59 DFT 2003
# 1.1 hog monitoring too - CPU cumulative for oracle >4%
#     just not a very good fix
# 1.2 Mon Jan 26 08:25:22 NFT 2004
#     L modified to be able to fire oracle snap
#
###########################################################

#action on load over limit event
# Globals: L (read), HST (read), MAILS (read)
# Returns: 0 always, so the caller can continue with the Oracle snapshot.
#          (An `exit` here would end the whole script before the snapshot.)
mesgLoad () {
  logger "Load $L"
  # $MAILS is intentionally unquoted: it may hold several recipients.
  mailx -s "Load on $HST over 90%" $MAILS << EOF
Load on $HST over 90%
EOF
  return 0
}

#action on hog event
#mail is commented out
# Globals: H (read)
# Returns: 0 always, so the caller can continue with the Oracle snapshot.
mesgHog () {
  logger "Hog $H"
  #mailx -s "Possible CPU hog on $HST " $MAILS << EOF
  #Possible CPU hog on $HST
  #$H
  #EOF
  return 0
}

#load detection
# Globals: TIME (read) - sar interval in seconds
#          IDLE (read) - threshold compared against field 5 of sar output
# Outputs: the third-from-last line of `sar -u` output when its field 5
#          (presumably %idle - confirm against local sar layout) is
#          numerically below $IDLE; nothing otherwise.
eventLoad () {
  sar -u -i"$TIME" \
    | tail -3 \
    | head -1 \
    | awk -v idle="$IDLE" '$5+0 < idle+0 { print }'
}

#hog detection
# Globals: HOGL (read) - threshold compared against field 3 (%C) of ps output
# Outputs: ps lines, sorted descending from field 3, whose field 3 is
#          numerically above $HOGL and whose user field matches "oracle",
#          with trailing blanks stripped.
eventHog () {
  ps -ekF "%u %p %C %x %c %a" \
    | sort -rn -k 3 \
    | awk -v hogl="$HOGL" '$3+0 > hogl+0 && $1 ~ /oracle/ { print }' \
    | sed 's/ *$//'
}

##########################################################
#main part
PATH=/usr/bin:/etc:/usr/sbin:/usr/ucb:/usr/bin/X11:/sbin
PATH=$PATH:/usr/java131/jre/bin:/usr/java131/bin:/usr/lib/instl
PATH=$PATH:/usr/local/bin:/usr/symcli/bin:/usr/lpp/Symmetrix/bin
export PATH

##########################################################
### configuration part
VERSION="1.2"

#where messages are going
MAILS="root"

#if monitoring is enabled
/rootop/control_enabled.sh || exit 0

DT=$(date "+%y.%m.%d.%H:%M:%S")
HST=$(hostname)

#sar with 10 minutes resolution -i600, changed to 5 minutes
#load idle less than 10%
IDLE=10    #idle
TIME=200   #200 seconds

#long term hog - CPU over 7 - by experience
HOGL=7     #cumulative CPU load

## sensor part and condition part - there is flooding control here
L=$(eventLoad)

## event and action part
#load event: send message, fire oracle snapshot, then exit because all is done
if [[ -n "$L" ]]; then
  mesgLoad
  /rootop/fireOracleSnap.sh
  exit 0
fi

#hog event: send message and fire oracle snapshot
#(only evaluated when there was no load event)
H=$(eventHog)
if [[ -n "$H" ]]; then
  mesgHog
  /rootop/fireOracleSnap.sh
fi

exit 0