#!/bin/bash
#
# Copyright (C) 1994-2018 Altair Engineering, Inc.
# For more information, contact Altair at www.altair.com.
#
# This file is part of the PBS Professional ("PBS Pro") software.
#
# Open Source License Information:
#
# PBS Pro is free software. You can redistribute it and/or modify it under the
# terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# PBS Pro is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Commercial License Information:
#
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.
#                                   pbs_dtj
#
# File: pbs_dtj
# Summary: gathers log information from all nodes that a job ran on
# Date: May 19, 2005
#
#
#
#  pbs_dtj (Distributed TraceJob) is a command that enables a user to gather
#  tracejob information from ALL of the nodes where a PBS Professional job ran.
#  The script uses rsh to connect to the nodes by default, though it will check
#  the pbs.conf file to see if PBS_SCP is set, and use ssh in that case.  Because it
#  uses rsh or ssh to connect, it is assumed that the user running the script has
#  passwordless remote access to the nodes.
#
#  The script first performs a tracejob on the PBS server system.  It then extracts
#  from that information which nodes the job actually ran on, then performs the
#  tracejob on those nodes.  All output goes to stdout.
#
#  pbs_dtj accepts these options:
#
#  -u username
#    This option is useful if you are running the script as root, but you do not have
#    passwordless remote access set up for the root account.  You can specify a different
#    username here to be used when connecting to the remote nodes.
#
#  -r rcommand
#    This option can be used to override the default choice of rsh or ssh (based on
#    PBS_SCP in pbs.conf).  Anything could actually be used here as long as it allows
#    "rcommand username@remotehost command" rsh-like syntax.
#
#  -n days
#    This option specifies the number of days ago the job ran.  It is analogous to the -n
#    option to the tracejob command.  Default value is 3.
#
#  -s server_name
#    Optionally override pbs.conf to specify server name to use.
#
#


# IsValidJobID: check that a job id is non-empty and built only from the
# characters a-z, A-Z, 0-9, "_", ".", "-", "[" and "]".  The allowed set may
# need extending if other characters turn out to be legal in a job id.
# Arguments: $1 - candidate job id
# Returns:   0 if the id is acceptable, 1 otherwise
IsValidJobID()
{
        local candidate=$1

        # An empty id is never acceptable.
        if [ -z "$candidate" ] ; then
                return 1
        fi

        # Reject the id as soon as one character outside the allowed set shows up.
        case $candidate in
                *[!a-zA-Z0-9_\.\-\[\]]*)
                        return 1 ;;
        esac

        return 0
}


# PerformTrace: this is where the actual tracing is done.  Runs tracejob for
# $pbs_jobid on the PBS server, prints that output, then runs tracejob on every
# other host named in the server's "Job Run at request of" records.
#
# Globals read:
#   rcommand        - remote shell command (rsh, ssh, ...)
#   server_hostname - host on which the first tracejob is run
#   PBS_EXEC        - PBS install prefix used to locate tracejob on the server
#                     (presumably set by the sourced pbs.conf - confirm)
#   n_value_in      - value handed to tracejob's -n option
#   pbs_jobid       - job id to trace
#   run_as_user     - when non-empty, remote commands are run via "su - user -c"
# Outputs: tracejob output on stdout; diagnostics on stderr through Warning.
# Exits the whole script with status 1 if a temporary file cannot be created
# or the tracejob on the server fails.
PerformTrace()
{
        local tmpfile tmpscript hosts host server_short

#FUTURE: I need to do a timeout to determine if passwordless ssh is working, do on a date command so I can timeout more reliably (as opposed to a tracejob on the server).  This should not be in the PerformTrace function though

        # Temporary file holding the tracejob output of the server.  mktemp gives
        # an unpredictable, exclusively created name, so concurrent runs cannot
        # collide and a pre-planted symlink in /tmp cannot be followed.
        if ! tmpfile=$(mktemp /tmp/pbs_dtj_XXXXXX) ; then
                Warning "Unable to create a temporary file in /tmp"
                exit 1
        fi

        # If we are running as a different user, wrap the command in su - username -c
        if [ -n "$run_as_user" ] ; then

#DEBUG          echo "su - $run_as_user -c "$rcommand $server_hostname $PBS_EXEC/bin/tracejob -n$n_value_in $pbs_jobid""
#DEBUG          echo
                # Do the tracejob of the server, store it in the tempfile
                if ! su - "$run_as_user" -c "$rcommand $server_hostname $PBS_EXEC/bin/tracejob -n$n_value_in $pbs_jobid" > "$tmpfile" ; then
                        Warning "This user cannot connect to $server_hostname using $rcommand "
                        Warning "passwordlessly.  Please try again."
                        rm -f -- "$tmpfile"
                        exit 1
                fi

        # Not running as another user: skip su so we are not prompted for a password
        else
#DEBUG          echo "$rcommand $server_hostname $PBS_EXEC/bin/tracejob -n$n_value_in $pbs_jobid"
#DEBUG          echo

                # Do the tracejob of the server, store it in the tempfile.
                # $rcommand stays unquoted on purpose: a value given with -r may
                # carry its own options (e.g. "ssh -o BatchMode=yes").
                # shellcheck disable=SC2086
                if ! $rcommand "$server_hostname" "$PBS_EXEC/bin/tracejob" "-n$n_value_in" "$pbs_jobid" > "$tmpfile" ; then
                        Warning "This user cannot connect to $server_hostname using $rcommand "
                        Warning "passwordlessly.  Please try again."
                        rm -f -- "$tmpfile"
                        exit 1
                fi
        fi

        # Grab the list of hosts that the job ran on, as a newline delimited list.
#for versions 7.1 and before
#        hosts=`grep "Job Run at request of" $tmpfile | awk -F "on hosts" '{print $2}' | tr '+' '\n' | awk -F : '{print $1}'`

#This is where we get the list of hosts that a job ran on.  This is not the ideal way to do this,
#as what we are really seeing here is exec_vnode, not exec_host.  We could get exec_host from the
#tracejob, but it is only displayed if the tracejob command is run as root (accounting log)...

#In any case, this is ok for >90% of the time, as by default on all multi vnoded systems, we name the vnodes host[xxx].

#The alternative to making the host[xxx] naming convention assumption would be to get the vnode name,
#then find its mapping from the mom_priv/vnodemap file.  The downfall here is that this file is
#not present on a "commands only" system.

        # Pipeline: take the text after "on exec_vnode", drop the parentheses,
        # put one vnode chunk per line, cut each chunk at its first ":" and "[",
        # split on any remaining blanks, drop empty lines, and de-duplicate so
        # the same host is never traced more than once.
        hosts=$(grep "Job Run at request of" "$tmpfile" \
                | awk -F "on exec_vnode" '{print $2}' \
                | tr -d '()' \
                | tr '+' '\n' \
                | sed -e 's/:.*//' -e 's/\[.*//' \
                | tr -s ' \t' '\n' \
                | sed '/^$/d' \
                | sort -u)

        # Output tracejob from the server first
        echo
        echo "------------------$server_hostname------------------"
        cat "$tmpfile"
        echo
        echo

        if ! tmpscript=$(mktemp /tmp/pbs_dtj_script_XXXXXX) ; then
                Warning "Unable to create a temporary script in /tmp"
                rm -f -- "$tmpfile"
                exit 1
        fi

        # Short (unqualified) name of the server, computed once for the loop below.
        server_short=${server_hostname%%.*}

        # We have to build a script that can be run under su if necessary, so all
        # of the per-host commands are written to $tmpscript, then the script is run.
        {
                printf '%s\n' '#!/bin/sh'
                while IFS= read -r host ; do
                        # An empty host list still yields one empty line here.
                        [ -n "$host" ] || continue

                        # The server's tracejob was already printed; do not repeat
                        # it even if the job actually ran on the server.
                        if [ "$host" = "$server_short" ] ; then
                                continue
                        fi

                        printf 'echo "------------------%s------------------"\n' "$host"
#DEBUG                  echo "echo"
#DEBUG                  echo "echo "$rcommand $host \$pbs_exec_local/bin/tracejob -n$n_value_in $pbs_jobid""
#DEBUG                  echo "echo"
                        # Look up PBS_EXEC in the remote host's own pbs.conf so that
                        # tracejob is called by full path there.  The escaped
                        # backticks and dollars are written literally into the
                        # script and only evaluated by the remote shell.
                        echo "$rcommand $host \"localpath=\\\`grep PBS_EXEC \\\${PBS_CONF_FILE:-/etc/pbs.conf} | awk -F= {'print \\\$2'}\\\` ; \\\$localpath/bin/tracejob -n$n_value_in $pbs_jobid\""
                        echo "echo"
                        echo "echo"
                done <<< "$hosts"
        } > "$tmpscript"

        # mktemp creates the file owner-only; the -u user must be able to read
        # and execute it.
        chmod 755 "$tmpscript"

        # Don't su execute the script if no -u argument was supplied
        # (su - user1 executed as user1 still requires a passwd)
        if [ -n "$run_as_user" ] ; then
                su - "$run_as_user" -c "$tmpscript"
        else
                "$tmpscript"
        fi

        rm -f -- "$tmpfile" "$tmpscript"
}


# Warning: write one message line to STDERR.
# Arguments: $1 - message text
# Outputs:   the message followed by a newline on stderr; nothing on stdout
Warning () {
  local msg=$1
  # printf rather than echo, so a message such as "-n" or "-e" is printed
  # literally instead of being taken as an echo option.
  printf '%s\n' "$msg" >&2
}



###############START OF MAIN##################


# Parse the command-line options.  The leading ':' in the optstring selects
# silent error reporting: getopts then reports an unknown option as '?' and a
# missing option argument as ':', and both are handled explicitly below.
usage="usage: pbs_dtj [-u user] [-r remote_command] [-n xxx] [-s server_name] pbs_jobid"
while getopts ":u:r:n:s:" opt; do
    case $opt in
        u  )
            # Ignore -u when it names the current login, so that the
            # su - run_as_user commands are not used for our own account.
            if [ "$OPTARG" != "$LOGNAME" ] ; then
                    run_as_user=$OPTARG
            fi
            ;;
        r  )  rcommand=$OPTARG ;;
        n  )  n_value_in=$OPTARG ;;
        s  )  server_hostname_in=$OPTARG ;;
        :  )  Warning "option -$OPTARG requires an argument"
              Warning "$usage"
              exit 1 ;;
        \? )  Warning "$usage"
              exit 1 ;;
    esac
done
shift $((OPTIND - 1))

#if -n was not specified we set it to 3 as default
if [ -z "$n_value_in" ] ; then
        n_value_in=3
fi

# The -n value is pasted into command lines that are run by remote shells, so
# accept nothing but digits.
case $n_value_in in
        *[!0-9]*)
                Warning "The -n option requires a non-negative whole number of days."
                exit 1 ;;
esac

#see if job_id was given on command line
if [ -n "$1" ] ; then
        pbs_jobid=$1
        non_interactive=1
fi


if [ -n "$non_interactive" ] ; then
        # Quoted so that the WHOLE id is validated, not just its first word.
        if ! IsValidJobID "$pbs_jobid" ; then
                Warning "Not a valid job id!"
                exit 1
        fi
fi


#source the pbs.conf file
conf=${PBS_CONF_FILE:-/etc/pbs.conf}
if [ -r "$conf" ] ; then
        # shellcheck disable=SC1090
        source "$conf"
else
        Warning "Unable to read PBS configuration file $conf"
fi

# Pick the server host: PBS_PRIMARY wins, then PBS_SERVER_HOST_NAME, then PBS_SERVER.
if [ -n "$PBS_PRIMARY" ] ; then
        server_hostname=$PBS_PRIMARY
elif [ -n "$PBS_SERVER_HOST_NAME" ] ; then
        server_hostname=$PBS_SERVER_HOST_NAME
else
        server_hostname=$PBS_SERVER
fi

# A server name given with -s overrides whatever pbs.conf provided.
if [ -n "$server_hostname_in" ] ; then
        server_hostname=$server_hostname_in
fi


#       echo "This tool must be run as a user who has passwordless ssh or rsh access"
#       echo "to all of the execution hosts in the cluster."
#       echo

#       echo "This process may slow down systems that are currently running PBS jobs."
#       echo "This is because we will need to parse the logs on any systems where a job ran"
#       echo "to collect information about it."



#see if remote command was supplied on command line via -r
if [ -z "$rcommand" ] ; then
        #see if they are using ssh or rsh
        if [ -z "$PBS_SCP" ] ; then
                rcommand="rsh"
        else
                rcommand="ssh"
        fi
fi


#if this is a non interactive run, then call PerformTrace now.

if [ -n "$non_interactive" ] ; then

        PerformTrace
else
           echo " Enter the PBS job id.  Enter q to quit."
           echo
           echo " If peer scheduling is enabled please be sure to  "
           echo " enter the full PBS job id, including server name"
           echo " Ex. 1234.server1"


        # read still trims leading/trailing blanks (default IFS); -r keeps
        # backslashes literal so they reach the validity check unchanged.
        while read -r pbs_jobid
        do

        if [ -z "$pbs_jobid" ] ; then
                continue
        fi

        case $pbs_jobid in
                q|Q|quit|QUIT)
                        exit 0 ;;
        esac

        #make sure the entry contains only characters accepted by IsValidJobID
        if ! IsValidJobID "$pbs_jobid" ; then
                Warning "Not a valid job id, please try again."
                continue
        fi

        PerformTrace

        echo "Enter another job id, enter \"q\" to quit."

        done # done reading pbs_jobid
fi



exit 0

