#!/bin/bash
#
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
#

# Handle "pbs_diag --version": print the version string and stop before any
# other processing takes place.  "$1" is quoted so that an empty or multi-word
# sole argument is compared as a plain string instead of breaking the test
# expression and printing a shell error.
if [ $# -eq 1 ] && [ "$1" = "--version" ]; then
   echo pbs_version = 18.1.1
   exit 0
fi

#                                   pbs_diag
#
# File: pbs_diag
# Summary: captures PBS Professional files and information for technical support
# Date: May 19, 2005
#
#  pbs_diag script is an interactive script that customers who need help can run
#  to supply support engineers with the information they need to diagnose a problem.
#
#  The script can be run with the -f option to force non-interactive behavior.  If no
#  other options are given, only basic configuration information will be collected.
#  The other options can be used in conjunction with -f to collect job specific
#  information, log files from particular dates, etc.
#
#  Specifically, this script will extract the following information:
#
#  qmgr settings for server, queues, and nodes
#  pbs_probe information about file permissions
#  pbs.conf master configuration information
#  pbsnodes node configuration/state information
#  qstat information about current state of the queues and server
#  information about existing reservations
#  pbs_hostn name resolution information
#  operating system version information
#  server, scheduler, and mom configuration files
#  tracejob and logging information for jobs specified by the user
#  server, scheduler, and mom logs for dates specified by the user
#  cpuset configuration information and current state if on a cpuset aware system
#  vnode definition files
#
#  The options for pbs_diag are documented here:
#
#  -o dirname
#     This overrides $HOME as the default location for the output .tar.gz file.
#
#  -c
#     Use this option to run ONLY the cpuset information gathering portion of the script.
#     This is important if there is a cpuset aware system that we need information about
#     that is not running as the PBS server/scheduler.
#
#  -f
#     Use this option for non-interactive mode.  The greeting will be suppressed, as
#     will the command execution confirmation.  Only the basic information gathering
#     will take place, no additional information will be prompted for.  You can specify
#     that more specific information (logs, job specific information, etc.) be gathered by using
#     the other options.
#
#     Note that the -u username option (and -j jobidlist) must be used if pbs_dtj information is
#     to be collected.  If -f is specified without -u username, this step is skipped.
#
#  -j jobidlist
#     Use this option to specify the job ids you want to collect information on the
#     command line, rather than being prompted during the running of the script.
#
#  -d daterange
#     Use this option to specify a single date, a date range, or a single integer representing the number
#     of days in the past to retrieve server and scheduler logs.  Dates and date ranges must be of
#     the form MM/DD/YYYY-MM/DD/YYYY
#
#  -g core_file
#     Use this option and specify a core file produced by one of the PBS daemons (server, scheduler, mom, or comm) to easily obtain a
#     stack trace.  This will only work on Linux systems where gdb (the GNU debugger) is installed.
#
#     If the -g option is specified, no other data collection will take place.
#
#  -p daemon_name
#     Use this option to gather OS level data about a misbehaving (e.g., hung) daemon process.
#     Daemon PID will be obtained via lock file.  Use -i to optionally override.
#
#     If the -p option is specified, no other data collection will take place.
#
#  -i daemon_pid
#     In the rare case that the PID stored in the PBS lock file is incorrect, use this option to specify
#     the correct PID.
#
#  -u username
#  -r rcommand
#  -n x
#     These options are used only during the pbs_dtj portion of the script and map directly to the options
#     in that script.  See the pbs_dtj header for details.  Note that -u username is particularly useful if
#     you wish to run pbs_diag completely non-interactively.
#
#
#
# NOTE that this is a bash script, and will not run on a system without this shell
# present.


#CAPS for environment variables, lowercase for script vars, CamelBack for function names


#Note from http://www.lawbiz.ch/spahni/programs/programs.html
#
#Applies only to jd and delay
#
#Disclaimer: These programs are in the public domain (with the
#exception of dbtool and timon which are distributed under the
#GPL2 GNU Public Licence). All others may be freely copied,
#distributed and changed. However, no guarantee is given that
#they are useful for any purpose or that they will run on any
#particular machine without damaging hard- and software.
#Enjoy at your own risk!

jd()
{
# jd  Calculate the Julian Day number (JD) of a calendar date.
#
#     The JD is a consecutive count of days from the beginning
#     of the year -4712 onwards. Following an astronomical tradition
#     Julian Day numbers are counted from noon of each day,
#     thus ending in 'nnnn.5' at midnight. The Gregorian
#     calendar reform during October 1582 is taken into account.
#     The method is valid for negative years but not for
#     negative JD numbers.
#
#     usage:  'jd  <dd> <mm> <yyyy>'
#     or      'jd - '                (read "dd mm yyyy" from standard input)
#
#     Output: the JD of 0h on the given date, written to stdout as 'nnnn.5'.
#     Errors: a message on stderr followed by 'exit 1' (callers in this file
#             invoke jd inside a command substitution).
#
# Method from: Jean Meeus, Astronomical Formulae for Calculators,
#              3rd Edition, 1985, Willmann-Bell, Inc., Richmond
#              ISBN 0-943396-09-3
#

# declare inside a function makes these integer variables local to jd
declare -i DAY MONTH YEAR
declare -i VY VM VA VB
declare -i DSTRING JD

if [ "$1" = "-" ] ; then
   # $STDINP is expanded unquoted on purpose: word splitting collapses runs
   # of blanks so that cut sees exactly one separator between the fields
   STDINP="$(cat -)"
   DAY=$(echo $STDINP | cut -d ' ' -f 1)
   MONTH=$(echo $STDINP | cut -d ' ' -f 2)
   YEAR=$(echo $STDINP | cut -d ' ' -f 3)
elif [ $# -lt 3 ] ; then
   echo 'Usage: jd <dd> <mm> <yyyy>' 1>&2
   echo '       jd -  (use standard input)' 1>&2
   exit 1
else
   # 10# forces base 10 so that "08" and "09" are not rejected as octal
   DAY=$(( 10#$1 ))
   MONTH=$(( 10#$2 ))
   YEAR=$(( $3 ))
fi

if (( DAY < 1 || DAY > 31 )) ; then
   echo "jd: parameter #1 for day out of range" 1>&2
   exit 1
fi

if (( MONTH < 1 || MONTH > 12 )) ; then
   echo "jd: parameter #2 for month out of range" 1>&2
   exit 1
fi

# 5-14 October 1582 were dropped by the calendar reform; warn but carry on
if (( YEAR == 1582 && MONTH == 10 && DAY > 4 && DAY < 15 )) ; then
   echo "jd: Warning: Date ${DAY} ${MONTH} ${YEAR} does not exist in Gregorian calendar" 1>&2
fi

# This is the start of the calculation

# January and February are counted as months 13 and 14 of the previous year
if (( MONTH > 2 )) ; then
   VY=$YEAR
   VM=$MONTH
else
   VY=$(( YEAR - 1 ))
   VM=$(( MONTH + 12 ))
fi

# The century correction only applies from 15 October 1582 onwards
DSTRING=$(( YEAR * 10000 + MONTH * 100 + DAY ))
if (( DSTRING >= 15821015 )) ; then
   VA=$(( VY / 100 ))
   VB=$(( VA / 4 + 2 - VA ))
else
   VA=0
   VB=0
fi

VM=$(( VM + 1 ))
if (( VY < 0 )) ; then
   JD=$(( (36525 * VY - 75) / 100 ))
else
   JD=$(( 36525 * VY / 100 ))
fi
JD=$(( JD + 306001 * VM / 10000 ))
JD=$(( JD + DAY + VB + 1720994 ))

echo "${JD}.5"
}



delay()
{
#
# delay    calculate the number of days between two calendar dates
#
# Usage:   delay  dd1 mm1 yyyy1   dd2 mm2 yyyy2
#
# Output:  (date2 - date1) in whole days on stdout; negative when date2 is
#          the earlier date.  The function then ends with 'exit 0', so it is
#          meant to be called inside a command substitution.
# Errors:  usage and range problems are reported on stderr, followed by
#          'exit 1'.  They are kept off stdout so that a caller capturing the
#          result never receives message text in place of a number.
#
# NOTE: the header of the original program also lists a
#       'delay dd1 mm1 yyyy1 [-]number_of_days' form.  That form is not
#       implemented here: with fewer than six arguments the first date is
#       validated and nothing is printed.
#

# declare inside a function makes these integer variables local to delay
declare -i DAY_A MONTH_A YEAR_A
declare -i DAY_B MONTH_B YEAR_B
declare -i JD_A JD_B DIFF
local jd_text

if [ $# -lt 3 ] ; then
   echo 'Usage: delay <d1> <m1> <yyy1>  <d2> <m2> <yyy2>' >&2
   echo '             <d1> <m1> <yyy1>  <[-]#of_days>' >&2
   exit 1
fi

# 10# forces base 10 so that "08" and "09" are not rejected as octal
DAY_A=$(( 10#$1 ))
MONTH_A=$(( 10#$2 ))
YEAR_A=$(( $3 ))

if (( DAY_A < 1 || DAY_A > 31 )) ; then
   echo "delay: parameter #1 for day out of range" >&2
   exit 1
fi

if (( MONTH_A < 1 || MONTH_A > 12 )) ; then
   echo "delay: parameter #2 for month out of range" >&2
   exit 1
fi

# jd prints 'nnnn.5'; keep only the integer part
jd_text=$(jd "$1" "$2" "$3")
JD_A=${jd_text%.*}

if [ $# -ge 6 ] ; then
   DAY_B=$(( 10#$4 ))
   MONTH_B=$(( 10#$5 ))
   YEAR_B=$(( $6 ))
   if (( DAY_B < 1 || DAY_B > 31 )) ; then
      echo "delay: parameter #4 for day out of range" >&2
      exit 1
   fi
   if (( MONTH_B < 1 || MONTH_B > 12 )) ; then
      echo "delay: parameter #5 for month out of range" >&2
      exit 1
   fi
   jd_text=$(jd "$4" "$5" "$6")
   JD_B=${jd_text%.*}
   DIFF=$(( JD_B - JD_A ))
   echo "$DIFF"
   exit 0
fi

}

#generates the staging directory name
MakeStageName () {
# Sets two globals used by the rest of the script:
#   dirname - "pbs_diag_" followed by a YYMMDD_HHMMSS time stamp
#   stage   - $outdir/$dirname, the staging directory path
# Nothing is created on disk here.
local stamp
stamp=$(date +%y%m%d_%H%M%S)
dirname="pbs_diag_${stamp}"
stage="${outdir}/${dirname}"
}


#function to package up resulting directory and provide delivery instructions and clean out directory
TarAndClean () {
# Packs the staging directory into $outdir/$dirname.tar.gz, prints delivery
# instructions and removes the staging directory.
#
# Globals read:    outdir, dirname, stage, p_input, dtj_user_warning
# Globals written: tarname, gzname, fsize
# Side effects:    changes the working directory to $outdir
# Returns:         1 (leaving the staging directory untouched) when $outdir
#                  cannot be entered or no archive was produced
echo
echo "Running tar and gzip on output, please wait."
echo

tarname=$dirname.tar
gzname=$tarname.gz

if ! cd "$outdir" ; then
	Warning " Could not change to directory $outdir, $stage has been left in place."
	return 1
fi

# From here on everything is addressed relative to $outdir (the current
# directory), so a relative -o path keeps working after the cd above.
tar -cf "$tarname" "$dirname"
gzip "$tarname"

if [ ! -s "$gzname" ] ; then
	rm -f -- "$tarname"
	Warning " Could not create $outdir/$gzname, $stage has been left in place."
	return 1
fi

# size of the archive in bytes; strip the padding some wc versions add
fsize=$(wc -c < "$gzname")
fsize=${fsize//[[:space:]]/}

if [ -z "$p_input" ] ; then
	clear
fi

#this goes back to the JobId function, I wanted to output the warning at the end, so here it is
if [ -n "$dtj_user_warning" ] ; then
	Warning " "
	Warning " The -f option was used, but the -u option was not."
	Warning " You must use -u username if you want to collect"
	Warning " pbs_dtj information when in non-interactive mode (-f)."
	Warning " "
fi

#see if the file is larger than 5MB, if so, give different instructions
if [ "${fsize:-0}" -gt 5242880 ] ; then
	echo
	echo " File $outdir/$gzname created. "
	echo
	echo " The file is over 5mb in size.  Please upload it to "
	echo " ftp.altair.com/pub/incoming, then send an email to"
	echo " the support engineer who requested it that includes"
	echo " the filename.  If you are sending this to "
	echo " pbssupport@altair.com as a new issue, please be "
	echo " sure to include a detailed description of the "
	echo " problem you are having."
	echo
	echo " Thanks!  -The PBS Professional Support Team"
else
	echo
	echo " File $outdir/$gzname created."
	echo
	echo " Please send it as an attachment to "
	echo " pbssupport@altair.com, or the support engineer"
	echo " who requested it.  If you are sending this to "
	echo " pbssupport@altair.com as a new issue, please be"
	echo " sure to include a detailed description of the"
	echo " problem you are having."
	echo
	echo " Thanks!  -The PBS Professional Support Team"
fi

#clean up after ourselves (gzip normally removes the .tar already)
rm -f -- "$tarname"
if [ -n "$dirname" ] ; then
	# the staging directory is ./$dirname now that we are inside $outdir
	rm -fr -- "./$dirname"
fi
}


# Simple Warning message to STDERR
Warning () {
  # Writes the message in $1 to stderr and, once the staging directory exists
  # (global stage_exists is non-empty), appends it to $stage/diag_log too.
  # printf is used so that a message such as "-n" is printed literally, and
  # the log path is quoted so that a staging path containing blanks works.
  local msg=$1
  printf '%s\n' "$msg" >&2
  if [ -n "$stage_exists" ] ; then
    printf '%s\n' "$msg" >> "$stage/diag_log"
  fi
}


#function to test the validity of the characters in a Job ID; may need to be extended if other characters are allowed besides letters, digits, ., -, _, [ and ]
IsValidJobID()
{
# Returns 0 when $1 is non-empty and consists only of letters, digits and
# the characters _ . - [ ] ; returns 1 otherwise.
local candidate=$1

[ -n "$candidate" ] || return 1

case $candidate in
	*[!a-zA-Z0-9_\.\-\[\]]*)
		return 1 ;;
esac

return 0
}


#This function is called with a date, date range, or single int as an argument.  It will collect the appropriate
#server and sched logs
# Helper for PBSLog: copies one selection of log files from
# $PBS_HOME/<subdir> to $stage/<subdir>, creating the target directory first.
#   $1 - log directory name (server_logs, sched_logs or comm_logs)
#   $2 - selection mode:
#          last   - the $3 most recently modified files
#          range  - files whose (all-digit) name lies between $3 and $4
#          single - the one file named $3
_PBSLogCollect ()
{
	local subdir=$1 mode=$2 lower=$3 upper=$4
	local logname

	if [ ! -d "$stage/$subdir" ] ; then
		RunAndOutput "mkdir $stage/$subdir"
	fi

	case $mode in
	last)
		#list the log files oldest first, one per line, and keep the newest $lower
		for logname in $(ls -rtA1 "$PBS_HOME/$subdir" | tail -n "$lower") ; do
			RunAndOutput "cp -p $PBS_HOME/$subdir/$logname $stage/$subdir/"
		done
		;;
	range)
		for logname in $(ls "$PBS_HOME/$subdir") ; do
			#log files are named YYYYMMDD; anything that is not all digits
			#cannot be in the range and would make the numeric test fail
			case $logname in
			''|*[!0-9]*) continue ;;
			esac
			if [ "$logname" -ge "$lower" ] && [ "$logname" -le "$upper" ] ; then
				RunAndOutput "cp -p $PBS_HOME/$subdir/$logname $stage/$subdir/"
			fi
		done
		;;
	single)
		RunAndOutput "cp -p $PBS_HOME/$subdir/$lower $stage/$subdir/"
		;;
	esac
}

#This function is called with a date, date range, or single int as an argument.  It will collect the appropriate
#server, sched and comm logs.
#   $1 - MM/DD/YYYY, MM/DD/YYYY-MM/DD/YYYY, or a number N meaning "the N most recent log files"
#Server logs are always collected; sched and comm logs only when PBS_START_SCHED / PBS_START_COMM is "1",
#otherwise a warning is issued for that daemon.
PBSLog ()
{
	local mode lower upper

	date_range=$1

	#this is a test to see if a date (or range) was specified, or just a number
	test_date=$(echo $date_range | awk -F / '{print $2}')

	if [ -z "$test_date" ] ; then
		#no "/" in the argument: it is a count of log files
		mode=last
		lower=$date_range
		upper=
	else
		#separate the dates and store them in separate vars
		date1=$(echo $date_range | awk -F - '{print $1}')
		date2=$(echo $date_range | awk -F - '{print $2}')

		#translate the first date to the log file name format (YYYYMMDD)
		date_log1=$(echo $date1 | awk -F / '{print $3 $1 $2 }')
		lower=$date_log1

		if [ -n "$date2" ] ; then
			#translate the second date to the log file name format
			date_log2=$(echo $date2 | awk -F / '{print $3 $1 $2 }')
			mode=range
			upper=$date_log2
		else
			#just one date: copy that one log file
			mode=single
			upper=
		fi
	fi

	_PBSLogCollect server_logs "$mode" "$lower" "$upper"

	#do the same for the scheduler, if it is configured to run on this system
	if [ "$PBS_START_SCHED" = "1" ] ; then
		_PBSLogCollect sched_logs "$mode" "$lower" "$upper"
	else
		Warning " This is not the PBS scheduler system, skipping sched log collection."
	fi

	#do the same for pbs_comm, if it is configured to run on this system
	if [ "$PBS_START_COMM" = "1" ] ; then
		_PBSLogCollect comm_logs "$mode" "$lower" "$upper"
	else
		Warning " This is not a PBS comm system, skipping comm log collection."
	fi
}





# function to properly execute and log commands.
RunAndOutput() {
# Runs the shell command given as a single string in $1.
#   - "running <command>" is echoed to the screen and appended to
#     $stage/diag_log
#   - the command's stdout goes to the screen, its stderr to $stage/diag_log
# Returns the exit status of the evaluated command.
local cmd=$1

echo "running $cmd"

#echo command to the logfile
echo "running $cmd" >> "$stage/diag_log"

#run the command, stdout to screen, stderror to logfile
#NOTE: $cmd is deliberately left unquoted.  Some callers in this file build
#command strings that embed multi-line command output; word splitting joins
#those lines with blanks before eval parses the result.
eval $cmd 2>> "$stage/diag_log"
}



#these functions add the commands to collect the *_priv files to the commands array.  The commands are not actually executed here
get_sched_priv(){
# Queues (does not run) one "cp -p" command per scheduler configuration file
# that exists as a regular file under $PBS_HOME/sched_priv.  The commands are
# stored in the global array 'commands' starting at index 200; the global
# 'cc' is left pointing at the next free index.
local candidate

cc=200
sched_priv="sched_config resource_group holidays dedicated_time sched.lock sched_formula"

for candidate in $sched_priv ; do
  [ -f "$PBS_HOME/sched_priv/$candidate" ] || continue
  commands[$cc]="cp -p $PBS_HOME/sched_priv/$candidate $stage/sched_priv/$candidate"
  cc=$(( cc + 1 ))
done
}


get_server_priv(){
# Queues (does not run) one "cp -rp" command per entry that exists (file or
# directory) under $PBS_HOME/server_priv.  The commands are stored in the
# global array 'commands' starting at index 300; the global 'cc' is left
# pointing at the next free index.
local candidate

cc=300
server_priv="hooks resourcedef server.lock topology"

for candidate in $server_priv ; do
  [ -e "$PBS_HOME/server_priv/$candidate" ] || continue
  commands[$cc]="cp -rp $PBS_HOME/server_priv/$candidate $stage/server_priv/$candidate"
  cc=$(( cc + 1 ))
done
}


get_mom_priv(){
# Queues (does not run) the commands that collect the MoM configuration:
#   - one "cp -p" per file that exists under $PBS_HOME/mom_priv
#   - one loop that dumps every vnode definition file reported by
#     "pbs_mom -s list" into $stage/mom_priv/vnodedefs
# The commands are stored in the global array 'commands' starting at index
# 400; the global 'cc' is left pointing at the next free index.

cc=400
mom_priv="config prologue epilogue mom.lock"
for mom_priv_file in $mom_priv ; do
  if [ -f "$PBS_HOME"/mom_priv/"$mom_priv_file" ]; then
     commands[$cc]="cp -p $PBS_HOME/mom_priv/$mom_priv_file $stage/mom_priv/$mom_priv_file"
     cc=$(($cc+1))
  fi
done

# The command substitution, the inner quotes and \$x are all escaped so that
# they end up literally in the stored string.  "pbs_mom -s list" is therefore
# run only when the queued command is executed (and logged), not while the
# command list is being built.
commands[$cc]="for x in \$($PBS_EXEC/sbin/pbs_mom -s list) ; do echo ; echo \"---\$x---\" ; echo ; $PBS_EXEC/sbin/pbs_mom -s show \$x ; done >> $stage/mom_priv/vnodedefs"
cc=$(($cc+1))
}




#*******************************
# FUNCTION: JobId()
# SUMMARY:  populates job ids
#*******************************
JobId ()
{
# Collects the job specific information for the job id given in $1:
#   - tracejob output, stored in $stage/<jobid>/<jobid>_tracejob
#   - server, sched and comm logs whose names fall between the first and the
#     last date found in that tracejob output
#   - qstat -f output when qstat still succeeds for the job
#   - pbs_dtj output, stored in $stage/<jobid>/<jobid>_pbs_dtj
# Globals read:    stage, PBS_HOME, PBS_EXEC, PBS_START_SCHED, PBS_START_COMM,
#                  PBS_SCP, n_value_in, force, dtj_user, rcommand, once
# Globals written: pbs_jobid, first_date, first_log, last_log, today, n_value,
#                  dtj_user_warning, dtj_user, once, rcommand, plus the
#                  *_log_list variables and loop variable c
# An invalid job id only prints a message; nothing is collected.

pbs_jobid=$1

#make sure the entry is not blank and does not contain anything other than the characters accepted by IsValidJobID
if IsValidJobID $pbs_jobid ; then

        echo " Job id $pbs_jobid processing ..."

        RunAndOutput "mkdir $stage/$pbs_jobid"

	#run a tracejob, if the file is empty abort, if not collect more information
	#NOTE(review): when n_value_in is empty this expands to a bare "-n" directly
	#followed by the job id - confirm that the caller always sets a default.

        RunAndOutput "$PBS_EXEC/bin/tracejob -n$n_value_in $pbs_jobid > $stage/$pbs_jobid/${pbs_jobid}_tracejob"

	#if the tracejob was empty we bail, if not we collect more information
	if [ -s $stage/$pbs_jobid/${pbs_jobid}_tracejob ] ; then

		#obtain date of first tracejob entry, store the logfilename
		#(assumes the first line containing a "/" starts with a MM/DD/YYYY date - TODO confirm)
		#first_date is rearranged to "DD MM YYYY", the argument order used by delay
		first_date=`grep -m1 \/ $stage/$pbs_jobid/${pbs_jobid}_tracejob | cut -c 1-10 | awk -F\/ '{print $2 " "  $1 " " $3}'`
		#first_log is rearranged to YYYYMMDD, the form compared against the log file names
		first_log=`grep -m1 \/ $stage/$pbs_jobid/${pbs_jobid}_tracejob | cut -c 1-10 | awk -F\/ '{print $3 $1 $2}'`
		#find the date on the last line of the tracejob output, store the information in logfile format
		last_log=`tail -n1 $stage/$pbs_jobid/${pbs_jobid}_tracejob | cut -c 1-10 | awk -F\/ '{print $3 $1 $2}'`

		#collect log files based on first and last dates appearing in tracejob output
		#read in entire ls output and then sort out what I want

		server_log_list=`ls $PBS_HOME/server_logs`

		if [ ! -d "$stage"/server_logs ] ; then
		        RunAndOutput "mkdir $stage/server_logs"
		fi

		#the comparison is numeric, so a file name that is not a number makes
		#the test print an error and that file is not copied
		for c in $server_log_list
		do
			if [ "$c" -ge "$first_log" ] && [ "$c" -le "$last_log" ] ; then
				RunAndOutput "cp -p $PBS_HOME/server_logs/$c $stage/server_logs/"
			fi
		done

		#same selection for the scheduler logs, only when the scheduler is configured to start on this host
		if [ "$PBS_START_SCHED" = "1" ] ; then
			sched_log_list=`ls $PBS_HOME/sched_logs`
			if [ ! -d "$stage"/sched_logs ] ; then

        	        	RunAndOutput "mkdir $stage/sched_logs"
			fi

                	for c in $sched_log_list
	                do
	                        if [ "$c" -ge "$first_log" ] && [ "$c" -le "$last_log" ] ; then
	                                RunAndOutput "cp  -p $PBS_HOME/sched_logs/$c $stage/sched_logs/"
	                        fi
	                done
		else
		Warning " This is not the PBS scheduler system, skipping sched log collection."
		fi

                #same selection for the pbs_comm logs, only when pbs_comm is configured to start on this host
                if [ "$PBS_START_COMM" = "1" ] ; then
                        comm_log_list=`ls $PBS_HOME/comm_logs`
                        if [ ! -d "$stage"/comm_logs ] ; then

                                RunAndOutput "mkdir $stage/comm_logs"
                        fi

                        for c in $comm_log_list
                        do
                                if [ "$c" -ge "$first_log" ] && [ "$c" -le "$last_log" ] ; then
                                        RunAndOutput "cp  -p $PBS_HOME/comm_logs/$c $stage/comm_logs/"
                                fi
                        done
                else
                Warning " This is not a PBS comm scheduler system, skipping comm log collection."
                fi



                #qstat succeeding for the job id is taken to mean the job still exists on the server
                if $PBS_EXEC/bin/qstat $pbs_jobid &> /dev/null ; then
                    echo " Job $pbs_jobid is still active, obtaining qstat -f output"
                    RunAndOutput "$PBS_EXEC/bin/qstat -f $pbs_jobid > $stage/$pbs_jobid/${pbs_jobid}_qstat_f"
                fi


	        #delay finds the number of days between two dates
	        #delay d m y d m y
	        today=`date +%d" "%m" "%Y`
	        n_value=`delay $first_date $today`
		       #tracejob on -n 0 is useless, so let's have at least 1.  Also gives one day "fuzz zone" for clusters without date and times exactly matched
        	n_value=$(($n_value+1))

		#THIS IS THE PBS_DTJ STUFF
		#I want to print a warning that we are skipping the pbs_dtj steps, but I need the warning to go at the end.
		# I am setting a flag to be used later (in TarAndClean)
		#NOTE(review): only the flag is set here; pbs_dtj is still invoked below (without -u)
		if [ -n "$force" ] && [ -z "$dtj_user" ] ; then
			dtj_user_warning=1
		fi

		#if they didn't force non-interactive, and didn't give a username, and this hasn't been done once yet
		if [ -z "$force" ] && [ -z "$dtj_user" ] && [ -z "$once" ] ; then
			clear
			echo " The script will now attempt to collect tracejob "
			echo " information from all of the hosts that the PBS"
			echo " job ran on.  To do this requires passwordless ssh"
			echo " or rsh from the server to the execution hosts. "
			echo
			echo " If the root user cannot login to these systems "
			echo " passwordlessly, please supply the username of a "
			echo " user who can.  Press enter to run it as root. "
			echo

			read dtj_user
			once=1

		fi
		#no -r given: pick the remote command from the PBS_SCP setting
		if [ -z "$rcommand" ] ; then
		        #see if they are using ssh or rsh
		        if [ -z $PBS_SCP ] ; then
		                rcommand="rsh"
		        else
		                rcommand="ssh"
		        fi
		fi
		#-u is passed to pbs_dtj only when a user name is available
		if [ -z "$dtj_user" ] ; then
			RunAndOutput " $PBS_EXEC/unsupported/pbs_dtj -n $n_value -r $rcommand $pbs_jobid > $stage/$pbs_jobid/${pbs_jobid}_pbs_dtj"
		else
			RunAndOutput " $PBS_EXEC/unsupported/pbs_dtj -n $n_value -u $dtj_user -r $rcommand $pbs_jobid > $stage/$pbs_jobid/${pbs_jobid}_pbs_dtj"
		fi
	else
		#empty tracejob output: remove the empty file and the per-job directory again
		Warning "Job id not found in tracejob for the last 30 days, no information will be collected for this job."
		RunAndOutput "rm $stage/$pbs_jobid/${pbs_jobid}_tracejob"
		RunAndOutput "rmdir $stage/$pbs_jobid"
	fi

	    echo

else
    echo
    echo " Not a valid job id.  Please try again. "
fi


}

CpusetInformation()
{
# Collects cpuset configuration into $stage/cpusetinfo_PP<n>, where <n> is the
# first character of the third field of /etc/sgi-release.
# Unless -c (cpusetonly) or -f (force) was given, the user is asked for
# permission first; an empty reply counts as "y".
# Globals read:    cpusetonly, force, stage
# Globals written: answer, ppv, setlist
local one_set

answer="y"
if [ -z "$cpusetonly" ] && [ -z "$force" ] ; then
	clear
	cat <<'EOF'
 The script has detected that you are running on an
 Altix system with the cpuset mom.  It is often
 helpful to the support team if they can view the
 configuration of your current cpusets.

 Is it OK to collect information about all of the
 cpusets currently created on your system?
 ([y]|n)
EOF
	read answer
	answer=${answer:-y}
fi

#anything but an explicit yes means: collect nothing
case $answer in
	Y|y|yes|Yes|YES) ;;
	*) return 0 ;;
esac

#find the ProPack version
ppv=$(cat /etc/sgi-release | awk -F " " '{print $3}' | cut -c 1)

#the [ ] numeric tests below are kept as they are: with an empty or
#non-numeric ppv they fail, which selects the else branch here and skips
#the version specific sections further down
if [ "$ppv" -le "1" ] ; then
	Warning "ProPack versions 2.4 and lower are not supported."
else
	mkdir $stage/cpusetinfo_PP$ppv
	cp -p /etc/sgi-release $stage/cpusetinfo_PP$ppv
fi

if [ "$ppv" -eq "2" ] || [ "$ppv" -eq "3" ] ; then
	#get the list of current cpusets and store them in setlist
	setlist=$(cpuset -Q | awk -F '[' '{print $2}' | tr ']' ' ')
	#go through setlist and run cpuset -q ... -p
	for one_set in $setlist ; do
		RunAndOutput "cpuset -q $one_set -p >> $stage/cpusetinfo_PP$ppv/cpuset_q_p.out"
	done
fi

if [ "$ppv" -ge "4" ] ; then
	#minimal listing of all cpusets that exist
	RunAndOutput "cp -p /proc/sgi_sn/sn_topology $stage/cpusetinfo_PP$ppv/sn_topology"
	RunAndOutput "cpuset -r -s / > $stage/cpusetinfo_PP$ppv/cpuset_rs.out "

	#ls -l of everything below /dev/cpuset; the find runs while the command string is built
	RunAndOutput "ls -l $(find /dev/cpuset/ -name \* ) > $stage/cpusetinfo_PP$ppv/ls_l_cpuset.out"

	#Get the contents of all of the files under /dev/cpuset into one file
	RunAndOutput "head -n-0 $(find /dev/cpuset/ -name \* ) > $stage/cpusetinfo_PP$ppv/cpuset_details.out "
fi

}



#***********************************************************
#****************** START OF MAIN **************************
#***********************************************************


#process options
# Parse the command line.  The leading ":" in the option string selects
# getopts' silent mode: an unknown option arrives as "?" and an option that
# is missing its argument arrives as ":" (with the option letter in OPTARG).
while getopts ":cu:r:n:o:fj:d:g:p:i:" opt; do
    case $opt in
        u  )
        #this is so that the su - dtj_user commands are not used if they decided to enter their current user name into the -u option
        #(both sides are quoted so an empty LOGNAME cannot break the comparison)
        if [ "$OPTARG" == "$LOGNAME" ] ; then
                continue
        fi
        dtj_user=$OPTARG ;;
	#this is here so that a user can specify rsh or ssh (or whatever)
        r  )  rcommand=$OPTARG ;;
	#n is just used for the initial server tracejobs.  The pbs_dtj commands will get the calculated n value based on the output of tracejob
        n  )  n_value_in=$OPTARG ;;
	#c is for users who only want to collect cpuset information
	c ) cpusetonly=1 ;;
	o ) outdir=$OPTARG ;;
	f ) force=yes ;;
       	j ) jlist=$OPTARG ;;
       	d ) date_in=$OPTARG ;;
	g ) core=$OPTARG ;;
	p ) p_input=$OPTARG ;;
	i ) daemon_pid_in=$OPTARG ;;
	#an option that needs an argument was given without one
	: ) Warning "pbs_diag: option -$OPTARG requires an argument"
	     exit 1 ;;
	\? ) Warning "usage: pbs_diag [-c] [-u user] [-r remote_command] [-n xxx] [-o outdir] [-f] [-j jlist] [-d daterange] [-p daemon_name [-i daemon_pid]]"

		Warning " -o dirname"
		Warning " This overrides \$HOME as the default location for"
		Warning " the output .tar.gz file."
		Warning " "
		Warning	" -c"
		Warning " Use this option to run ONLY the cpuset"
		Warning " information gathering portion of the script."
		Warning " This is important if there is a cpuset aware"
		Warning " system that we need information about that is"
		Warning " not running as the PBS server/scheduler."
		Warning " "
		Warning " -f"
		Warning " Use this option for non-interactive mode."
		Warning " The greeting will be supressed, as will the"
		Warning " command execution confirmation and prompts for"
		Warning " additional parameters."
		Warning " "
		Warning " -j jobidlist"
		Warning " Use this option to specify the job ids you want"
		Warning " to collect information on the command line, "
		Warning " rather than being prompted during the running of"
		Warning " the script. "
		Warning " "
		Warning " -d daterange"
		Warning " Use this option to specify a date range or a"
		Warning " single integer representing the number of days in "
		Warning " the past to retrieve server and scheduler logs."
		Warning " Date ranges must be of the form MM/DD/YYYY-MM/DD/YYYY"
		Warning " "
		Warning " -g core_file"
		Warning " Use this option and specify a core file produced by "
		Warning " one of the three PBS daemons to easily obtain a"
		Warning " stack trace.  This will only work on Linux systems "
		Warning " where gdb (the GNU debugger) is installed."
		Warning " "
		Warning " If the -g option is specified, no other data "
		Warning " collection will take place. "
		Warning " "
		Warning " -p daemon_name"
		Warning " Use this option to gather OS level data about a"
		Warning " misbehaving (e.g., hung) daemon process."
		Warning " "
		Warning " If the -p option is specified, no other data collection"
		Warning "  will take place."
		Warning " "
		Warning " -i daemon_pid"
		Warning " In the rare case that the PID stored in the PBS"
		Warning " lock file is incorrect, use this option to specify"
		Warning " the correct PID."
		Warning " "
		Warning " -u username"
		Warning " -r rcommand"
		Warning " -n x"
		Warning " These options are used only during the pbs_dtj"
		Warning " portion of the script and map directly to the"
		Warning " options in that script.  See the pbs_dtj header"
		Warning " for details."
		Warning " "
		Warning " The -u option must be specified for non-interactive use."
		Warning " "
	     exit 1 ;;
    esac
done
# drop the parsed options so that only operands remain in "$@"
shift $(($OPTIND - 1))

# checking if pbs is installed on local machine

# Locate pbs.conf: $PBS_CONF_FILE when set and non-empty, /etc/pbs.conf
# otherwise.  The file is sourced so that its settings become shell variables
# in this script.
conf=${PBS_CONF_FILE:-/etc/pbs.conf}

if [ -f "$conf" ] ; then
    # quoted so that a configuration path containing blanks is read as one file
    source "$conf"
else
    Warning " "
    Warning "The pbs.conf file cannot be found, which probably means that"
    Warning "PBS Professional is not installed on this machine.  Please "
    Warning "run this scrip on a machine that has PBS Professional installed."
    Warning " "
    Warning "If you have pbs.conf in a directory other than /etc, please"
    Warning "make sure the PBS_CONF_FILE is set to point at it."
    Warning " "
    exit 102
fi

# Core-file mode (-g core_file): when $core is set, print gdb stack traces
# for that core file and exit; no other data collection takes place.
#
# The daemon binary handed to gdb is chosen from the output of file(1) on
# the core file.  gdb is run twice: a first pass whose output is only used
# to count the "[New ..." lines it reports, and a second pass that prints
# "info threads" plus a backtrace for each counted thread.
#
# Exit status: 0 after the traces are printed; 1 if the file(1) output
# names no PBS daemon or a scratch file cannot be created.

if [ -n "$core" ] ; then
	# Run file(1) once; the first matching daemon name wins, checked in
	# this order.
	core_desc=$(file "$core" 2>&1)
	case "$core_desc" in
		*pbs_server.bin*) binary=pbs_server.bin ;;
		*pbs_sched*)      binary=pbs_sched ;;
		*pbs_mom*)        binary=pbs_mom ;;
		*pbs_comm*)       binary=pbs_comm ;;
		*)
			echo "Error: core file does not appear to have been generated by a PBS Professional daemon." >&2
			exit 1
			;;
	esac

	# Scratch files get unpredictable names: fixed names under /tmp could
	# be pre-created (e.g. as symlinks) by another local user.
	cmd_file=$(mktemp /tmp/pbs_core_cmd_file.XXXXXX) || {
		echo "Error: could not create a temporary file under /tmp." >&2
		exit 1
	}
	tmp_file=$(mktemp /tmp/pbs_core_tmp_output.XXXXXX) || {
		rm -f "$cmd_file"
		echo "Error: could not create a temporary file under /tmp." >&2
		exit 1
	}

	# Pass 1: plain backtrace.  Both stdout and stderr are captured (the
	# file redirection must come before 2>&1 for that) so the thread
	# announcements are counted whichever stream gdb writes them to.
	echo "bt" > "$cmd_file"
	gdb "$PBS_EXEC/sbin/$binary" "$core" < "$cmd_file" > "$tmp_file" 2>&1
	thread_count=$(grep -c "\[New " "$tmp_file")
	thread_count=${thread_count:-0}

	# Pass 2: thread overview followed by a backtrace of every thread.
	{
		echo "info threads"
		for ((x = 1; x <= thread_count; x++)) ; do
			echo "thread $x"
			echo "bt"
		done
	} > "$cmd_file"
	gdb "$PBS_EXEC/sbin/$binary" "$core" < "$cmd_file"

	rm -f "$tmp_file" "$cmd_file"
	echo
	exit 0
fi



# No output directory chosen (outdir empty or unset): fall back to $HOME.

: "${outdir:=$HOME}"


# Process-information mode (-p daemon_name): gather OS-level data about one
# PBS daemon process into the staging directory, hand it to TarAndClean and
# exit.  Nothing else is collected in this mode.
#
# Inputs:  p_input        daemon name given with -p
#          daemon_pid_in  optional PID given with -i; overrides the lock file
# Exit status: 0 when collection finished; 1 on an unrecognised daemon
# name, an unusable PID, or failure to create the staging directory.

if [ -n "$p_input" ] ; then

	# Validate the daemon name first so that a usage error exits before
	# anything has been created on disk.
	case $p_input in
		*server*)
			daemon="server"
			daemon_priv="server_priv"
			;;
		*sched)
			daemon="sched"
			daemon_priv="sched_priv"
			;;
		*mom)
			daemon="mom"
			daemon_priv="mom_priv"
			;;
		*comm)
			daemon="comm"
			daemon_priv="server_priv"
			;;
		*)
			echo "usage: $0 -p daemon_name"
			exit 1
			;;
	esac

	# Resolve the PID: an explicit -i value wins, otherwise take the first
	# word of the daemon's lock file.
	daemon_pid=
	if [ -n "$daemon_pid_in" ] ; then
		daemon_pid=${daemon_pid_in}
	else
		read -r daemon_pid _ < "${PBS_HOME}/${daemon_priv}/${daemon}.lock"
	fi

	# Every command below is pointless (or misparsed) without a numeric
	# PID, so stop here instead of collecting garbage.
	case $daemon_pid in
		''|*[!0-9]*)
			Warning "Could not determine a valid PID for pbs_${daemon}."
			Warning "Use the -i option to specify the correct PID."
			exit 1
			;;
	esac

	MakeStageName

	if ! mkdir "$stage" ; then
		Warning "Could not create $stage   exiting"
		exit 1
	fi

	stage_exists=1

	echo "pbs_diag running in process information gathering mode."
	echo "This will likely take about 40 seconds."

	# top runs in the background for the duration of the collection.
	top -b -n10 -p "$daemon_pid" > "${stage}/top" &
	ps -efj | grep pbs | grep -v grep > "${stage}/ps_efj_grep_pbs"

	# Session view: for every pbs_<daemon> process whose third ps column
	# is "1" (PPID in Linux "ps -efj" output), take its fifth column (SID)
	# and list all processes matching any of those values.
	sid_args=()
	for sid in $(ps -efj | grep "pbs_${daemon}" | grep -v grep | awk '$3 == "1" {print $5}') ; do
		sid_args+=(-e "$sid")
	done
	if [ ${#sid_args[@]} -gt 0 ] ; then
		ps -efj | grep "${sid_args[@]}" | grep -v "top -b " > "${stage}/ps_efj_grep_sid"
	else
		# No matching process: leave an empty file rather than running
		# grep without a pattern.
		: > "${stage}/ps_efj_grep_sid"
	fi

	ps -efj | grep "$daemon_pid" | grep -v grep | grep -v "top -b " > "${stage}/ps_efj_grep_pid"
	netstat -anp | grep pbs > "${stage}/netstat_anp_grep_pbs"
	cat "/proc/$daemon_pid/limits" > "${stage}/proc_limits"
	cat "/proc/$daemon_pid/environ" > "${stage}/proc_environ"
	cat "/proc/$daemon_pid/stat" > "${stage}/proc_stat"
	ls -l "/proc/$daemon_pid/cwd" > "${stage}/proc_cwd"
	lsof | grep pbs > "${stage}/lsof_grep_pbs"
	ls -l "/proc/$daemon_pid/fd" > "${stage}/proc_fd"

	#unfortunately I can't strace and gstack at the same time, pthreads prevents it I think
	# so strace gets 30 seconds on its own and is stopped before gstack runs.
	strace -tt -T -ff -s 1024 -o "${stage}/strace.out" -p "$daemon_pid" &
	strace_pid=$!
	sleep 30
	kill "${strace_pid}"

	# Ten stack snapshots, one second apart, each followed by a timestamp.
	for ((x = 1; x <= 10; x++)) ; do
		gstack "$daemon_pid" >> "${stage}/gstack.out"
		sleep 1
		echo "------ $(date) -----" >> "${stage}/gstack.out"
	done

	#just gathering this to cross reference with lsof output
	cat /etc/services > "${stage}/etc_services"
	# Today's log file of the daemon.
	cp "${PBS_HOME}/${daemon}_logs/$(date +%Y%m%d)" "${stage}/"
	gcore -o "${stage}/core" "$daemon_pid"

	# Let the background jobs (top, strace) finish before archiving.
	wait

	TarAndClean

	exit 0

fi

# -c mode: collect only cpuset-related information, then exit.
if [ -n "$cpusetonly" ] ; then

	# A pbs_mom binary linked against libcpuset is what identifies a
	# cpuset system here; anything else is rejected with exit status 1.
	# (grep is not silenced, so a matching ldd line is printed.)
	if ! ldd $PBS_EXEC/sbin/pbs_mom 2> /dev/null | grep libcpuset ; then
		Warning "Not a cpuset system!"
		exit 1
	fi

	MakeStageName
	mkdir $stage || {
		Warning "Could not create $stage   exiting"
		exit 1
	}
	# remember that the staging directory now exists
	stage_exists=1
	mkdir $stage/mom_priv

	# Copy the interesting mom_priv files directly; this mode does not go
	# through the commands array used by the full collection.
	mom_priv="config prologue epilogue mom.lock"

	for mom_priv_file in $mom_priv ; do
		[ -f "$PBS_HOME"/mom_priv/"$mom_priv_file" ] || continue
		RunAndOutput "cp -p $PBS_HOME/mom_priv/$mom_priv_file $stage/mom_priv/$mom_priv_file"
	done

	# The "pbs_mom -s list" output is expanded right now, while the command
	# string is built; \$x stays literal for whatever runs the string later.
	RunAndOutput "for x in $($PBS_EXEC/sbin/pbs_mom -s list) ; do echo ; echo ---\$x--- ; echo ; $PBS_EXEC/sbin/pbs_mom -s show \$x ; done >> $stage/mom_priv/vnodedefs"

	CpusetInformation
	TarAndClean
	exit 0
fi #cpusetonly

# n_value_in empty or unset: use 30 as the default.
n_value_in=${n_value_in:-30}



# Interactive runs only (force empty): show the welcome screen and wait for
# the user to submit a line of input before continuing.
if [ -z "$force" ] ; then
	clear
	printf '%s\n' \
		"" \
		"     Welcome to PBS Professional Technical Support" \
		"" \
		" This script is intended to provide PBS technical support" \
		" engineers necessary information such as server settings," \
		" custom scripts, and logs files to be able to provide help " \
		" more efficiently." \
		"" \
		" Note that this script must be run as root (UID 0)." \
		" You are welcome to examine the contents of the resulting" \
		" $outdir/pbs_diag_xxx.tar.gz file before you send it to PBS support." \
		"" \
		" Please press any key to continue." \
		""

	# The input itself is discarded; it only acts as a pause.
	read wait

	clear
fi

#************************************************************
# Preconditions for the full collection run
#************************************************************

# Only UID 0 may continue; anyone else gets exit status 101.
[ "$UID" = 0 ] || {
	echo
	Warning " User 'root' (or UID 0) needs to run this script.  Please rerun"
	Warning " this script as 'root'. "
	exit 101
}

# PBS_START_SERVER equal to "0" means this host is not set up to run the
# PBS server; stop with exit status 103.
case "$PBS_START_SERVER" in
	0)
		Warning " "
		Warning "The pbs_diag script must be running on the PBS server/scheduler"
		Warning "system and the pbs_server daemon must be running."
		Warning "Please run this script on the PBS server system, with the"
		Warning "pbs_server daemon running."
		Warning " "
		exit 103
		;;
esac



# Choose the staging area name used to collect the files.

MakeStageName

#*********************************************
# Get all the commands to run into an array
#*********************************************

# The commands are stored as strings in the "commands" array instead of
# being run directly.  That way they can be shown to the user for approval
# and later executed one by one through a function that also does logging.
# The strings are expanded here, so $stage, $PBS_EXEC, $PBS_HOME and
# $PBS_SERVER must already hold their final values.
#
# Array indexes are explicit and need not be contiguous.

#NOTE we cannot add "mkdir $stage" to the array, since the command is echoed to the logfile in that directory
commands[1]="mkdir $stage/server_priv"
commands[2]="mkdir $stage/sched_priv"
commands[3]="mkdir $stage/mom_priv"

#platform independent information

commands[4]="$PBS_EXEC/sbin/pbs_probe -v > $stage/pbs_probe.out"
commands[6]="$PBS_EXEC/bin/qmgr -c \" print server\" > $stage/qmgr_ps.out"
commands[7]="$PBS_EXEC/bin/qmgr -c \" print node @$PBS_SERVER\" > $stage/qmgr_pn.out"
# Copy the configuration file that was actually loaded (PBS_CONF_FILE may
# point away from /etc); it is always stored as pbs.conf in the staging area.
commands[8]="cp ${conf:-/etc/pbs.conf} $stage/pbs.conf"
commands[9]="$PBS_EXEC/bin/pbsnodes -a > $stage/pbsnodes_a.out"
commands[10]="$PBS_EXEC/bin/pbsnodes -va > $stage/pbsnodes_va.out"
commands[11]="$PBS_EXEC/bin/qstat -Bf > $stage/qstat_Bf.out"
commands[12]="$PBS_EXEC/bin/qstat -Qf > $stage/qstat_Qf.out"
commands[13]="$PBS_EXEC/bin/qstat -ns > $stage/qstat_ns.out"
commands[14]="$PBS_EXEC/bin/qstat -f  > $stage/qstat_f.out"  #should I remove this?  I am getting qstat -f for any jobId they enter, and this could be big...
commands[15]="$PBS_EXEC/bin/pbs_rstat -f > $stage/pbs_rstat_f.out"
commands[16]="$PBS_EXEC/bin/pbs_rstat > $stage/pbs_rstat.out"
# hostname is resolved now, while the command string is being built.
commands[17]="$PBS_EXEC/bin/pbs_hostn -v $(hostname) > $stage/pbs_hostn.out"
commands[18]="$PBS_EXEC/bin/qstat -t > $stage/qstat_t.out"
commands[19]="$PBS_EXEC/bin/qstat -tf > $stage/qstat_tf.out"
commands[20]="$PBS_EXEC/bin/qstat -x > $stage/qstat_x.out"
commands[21]="$PBS_EXEC/bin/qmgr -c \" p h @default\" > $stage/qmgr_ph.out"
commands[22]="$PBS_EXEC/bin/qmgr -c \" p sched\" > $stage/qmgr_psched.out"
commands[23]="$PBS_EXEC/bin/qstat -xf > $stage/qstat_xf.out"
commands[24]="$PBS_EXEC/bin/qmgr -c \" list pbshook\" > $stage/qmgr_lpbshook.out"
commands[25]="cp -r $PBS_HOME/datastore/pg_log/ $stage/pg_log/"
commands[26]="cp $PBS_HOME/pbs_environment $stage/."
commands[27]="$PBS_EXEC/bin/pbsnodes -avSj > $stage/pbsnodes_avSj.out"
commands[28]="$PBS_EXEC/bin/pbsnodes -aSj > $stage/pbsnodes_aSj.out"
commands[29]="$PBS_EXEC/bin/pbsnodes -avS > $stage/pbsnodes_avS.out"
commands[30]="$PBS_EXEC/bin/pbsnodes -aS > $stage/pbsnodes_aS.out"
commands[31]="$PBS_EXEC/bin/pbsnodes -aFdsv > $stage/pbsnodes_aFdsv.out"
commands[32]="$PBS_EXEC/bin/pbsnodes -avFdsv > $stage/pbsnodes_avFdsv.out"

#platform specific commands

# Work out the platform name: the machine name from "uname -m" is kept only
# when it starts with CRAY; in every other case the plain "uname" output
# replaces it.
ostype=$(uname -m 2>/dev/null)
if [[ $ostype != CRAY* ]] ; then
	ostype=$(uname 2>/dev/null)
fi


#TESTING  unalias uname on all systems when testing

# Only Linux and Darwin add commands (OS release, PBS process list, uname).
case "$ostype" in
	Linux)
		commands[100]="cat /etc/*release* > $stage/OSrelease"
		commands[101]="ps -ef | grep pbs | grep -v grep > $stage/ps_ef.out"
		commands[102]="uname -a > $stage/uname.out"
		;;
	Darwin)
		commands[100]="system_profiler SPSoftwareDataType |grep Ver > $stage/OSrelease"
		commands[101]="ps -ax | grep pbs | grep -v grep > $stage/ps_ef.out"
		commands[102]="uname -a > $stage/uname.out"
		;;
	*)
		# IRIX*, SunOS, CRAY*, AIX* (the AIX pattern was marked as
		# possibly wrong) and anything else: nothing platform specific.
		;;
esac

# Run the three per-daemon helpers before the list is shown.
# NOTE(review): presumably they add the *_priv file copies to the commands
# array -- confirm against their definitions.
get_server_priv
get_sched_priv
get_mom_priv

# Forced (non-interactive) runs are approved implicitly; otherwise list
# everything that is about to run and ask for confirmation.
if [ -n "$force" ] ; then
	answer="y"
else
	echo "This script will now run these commands:"
	echo

	# "mkdir $stage" is not part of the array, so show it separately
	echo "mkdir $stage"

	for queued in "${commands[@]}" ; do echo "$queued" ; done

	echo
	echo "Is it OK to run the above commands? ([y]|n)"
	read answer

	# an empty reply counts as yes
	answer=${answer:-y}
fi

# Anything other than an accepted spelling of "yes" ends the script here.
case "$answer" in
	Y|y|yes|Yes|YES)
		clear
		;;
	*)
		echo " exiting"
		exit 0
		;;
esac

# Approval given: create the staging directory, then execute the commands.
mkdir $stage || {
	Warning "Could not create $stage   exiting"
	exit 1
}

echo " Created $stage directory."

# flag that the staging directory now exists
stage_exists=1

# Execute every collected command through RunAndOutput, in array order.
for c in "${commands[@]}" ; do RunAndOutput "$c" ; done


# A pbs_mom linked against libcpuset marks a cpuset-aware mom system; only
# then is CpusetInformation called.
if ldd $PBS_EXEC/sbin/pbs_mom 2> /dev/null | grep -q libcpuset
then
	CpusetInformation
fi

# Per-job collection.
#
# Interactive runs without a job list prompt for job ids, one per line, and
# call JobId for each until "q" or "Q" is entered.  Otherwise (forced run,
# or a job list was supplied) JobId is called for every entry of the
# comma-separated list in $jlist; an empty list collects nothing.
if [ -z "$force" ] && [ -z "$jlist" ] ; then
	clear
	echo
	echo " If you are experiencing problems relating to"
	echo " specific PBS jobs, please enter the appropriate"
	echo " job id(s)."
	echo
	echo " In addition to gathering the information about a"
	echo " problematic job, it may be helpful to enter the"
	echo " job id of a typical job that does not exhibit the"
	echo " problematic behavior for comparison."
	echo
	echo " Please input one job id per line."
	echo
	echo " When finished, please enter q."
	echo

	#see if peer scheduling is enabled
	# (an uncommented peer_queue line in sched_config; matching lines are
	# printed because grep is not silenced)
	if grep peer_queue "$PBS_HOME/sched_priv/sched_config" | grep -v "^\ *\#" ; then
		echo
		echo " PEER SCHEDULING IS ENABLED"
		echo " Enter the full PBS job id, including server name"
		echo " Ex. 1234.server1"

	else
		echo
		echo " Enter numerical portion of job id: "

	fi

	while read -r pbs_jobid
	do
		# ignore empty lines and keep waiting for input
		if [ -z "$pbs_jobid" ] ; then
			continue
		fi

		if [ "$pbs_jobid" == "q" ] || [ "$pbs_jobid" == "Q" ] ; then
			break
		fi

		# Left unquoted on purpose: the entered line is passed on
		# word by word.
		JobId $pbs_jobid
		clear
		echo " Enter the next job id or 'q' to continue: "
	done
else

	# Turn the comma-separated list into a space-separated one.
	jlist=$(echo $jlist | tr ',' ' ')
	# NOTE(review): jlistct is not read anywhere in this section; it is
	# still set in case a helper defined elsewhere uses it -- confirm and
	# drop it if not.
	jlistct=$(echo $jlist | tr ' ' '\n' | wc -l)

	# One JobId call per entry.  The script's positional parameters are
	# deliberately left alone: the list comes from $jlist, not from "$@".
	for j in $jlist ; do
		JobId $j
	done

fi

# Log collection.
#
# Forced runs, and runs where a date specification was supplied in
# $date_in, make a single PBSLog call: with $date_in when it is set,
# otherwise with 1.  All other runs prompt for dates, date ranges or a
# number of days and call PBSLog once per entry until "q" or "Q".
if [ -n "$force" ] || [ -n "$date_in" ] ; then

	PBSLog ${date_in:-1}

else

	clear
	printf '%s\n' \
		"" \
		" Did you experience strange PBS behavior on a" \
		" specific date or range of dates?  If so, please" \
		" enter the dates below in the form " \
		" MM/DD/YYYY-MM/DD/YYYY" \
		"" \
		" You may also enter a single number indicating the" \
		" number of days in the past you wish to collect" \
		" logs from" \
		"" \
		" When finished, please enter q." \
		""

	while read date_range ; do

		case "$date_range" in
			"")
				# nothing entered: keep waiting for input
				continue
				;;
			q|Q)
				# the user is done
				break
				;;
		esac

		PBSLog $date_range

		clear
		printf '%s\n' \
			"" \
			" Enter another date, date range, or number of days." \
			"" \
			" Please enter the dates below in the form " \
			" MM/DD/YYYY-MM/DD/YYYY" \
			"" \
			" When done, please enter q."

	done
fi


# End of the full collection path: call TarAndClean, then exit with status 0.
# NOTE(review): TarAndClean is defined elsewhere in this file; presumably it
# archives the staging directory and removes it -- confirm there.
TarAndClean

exit 0
