Softpanorama

May the source be with you, but remember the KISS principle ;-)
Contents Bulletin Scripting in shell and Perl Network troubleshooting History Humor

ql command

News Enterprise Job schedulers Recommended Links qb ql qh  jobpar
Parallel Environment Client Commands Monitoring Queues        
Installation of SCE on a small set of multicore servers Usage of NFS Installation of the Master Host Installation of the Execution Hosts Creating and modifying SGE Queues Submitting Jobs To Queue Instance Monitoring and Controlling Jobs
qconf qstat qmod qalter -- Change Job Priority qsub -- Submitting Jobs To Queue Instance qacct command MPI
Troubleshooting Gridengine diag tool Slot limits and restricting number of slots per server Resource Quotas Perl Admin Tools and Scripts Humor Etc

The ql command parses a lot of SGE data and outputs an overall picture of the cluster load:

?
jbp@head1 [ 82 ] % ql 

      128 total nodes 

       12 nodes are down 

      232 total number of cpus 

      102 total load (rounded up: 141)   

       54 nodes have a load avg of  0.00 to  0.50

       10 nodes have a load avg of  0.50 to  1.00

       20 nodes have a load avg of  1.00 to  1.50

       14 nodes have a load avg of  1.50 to  2.00

       30 nodes have a load avg of  2.00 to  2.50

In this case, the system is about half-loaded: there are 232 CPUs available in the system and 102 of them are being used. Since the 'rounded up' number is slightly higher than the total load, this indicates that some of the jobs are running at higher load than they "should" be. If we look at the next block of information, we see that 30 nodes have a load of 2.00 to 2.50, but we know that each machine has only 2 CPUs, so some of those machines are actually over-loaded. However, looking at this information, we can see that 130 CPUs are unused and thus if we wanted to launch a large parallel job, it is likely that we would be able to do so without waiting in the queue for very long.

 
#!/usr/bin/perl
#
# (C) 2004-2009, John Pormann, Duke University
#      jbp1@duke.edu
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# RCSID: $Id: ql,v 1.15 2006/08/09 14:50:39 jbp1 Exp jbp1 $
#
# ql - parse SGE queue info into overall system load info

use Getopt::Std;

# Command-line switches; getopts() stores each one in the package global
# $opt_<letter>.  Switches followed by ':' in the spec take a value.
getopts('vVr:hHjT:');

if( defined($opt_h) ) {
	# This text must list exactly the switches named in the getopts() spec.
	print "usage: ql [-h] [-j] [-H] [-r #] [-T #] [-v] [-V]\n"
	  .  "  -h    show this help\n"
	  .  "  -j    show job-load\n"
	  .  "  -H    show histogram\n"
	  .  "  -r #  sets rounding parameter\n"
	  .  "  -T #  list hosts running at least # jobs\n"
	  .  "  -v    verbose output\n"
	  .  "  -V    really verbose output\n";
	exit( 1 );
}

# $round_l is the width of one load-average bucket (default 0.5).
if( not defined($opt_r) ) {
	$round_l = 0.5;
} else {
	$round_l = $opt_r;
}

# The reciprocal is taken just below, so a zero, negative or non-numeric
# width (which evaluates to 0) has to be refused here rather than crash
# with "Illegal division by zero".
if( ($round_l + 0) <= 0 ) {
	print STDERR "ql: rounding parameter (-r) must be a number greater than 0\n";
	exit( 1 );
}
$inv_round_l = 1.0/$round_l;

# Tables filled in by process_sge_info():
#   %hostinfo  host name -> '!'-joined host columns, then one "|job!prio"
#              piece for every job seen on that host
#   %jobinfo   job id    -> '|'-separated "host!prio!..." records
%jobinfo = ();
%hostinfo = ();
process_sge_info();

# -V: print both tables in raw form
if( defined($opt_V) ) {
	for $k ( keys(%hostinfo) ) {
		$v = $hostinfo{$k};
		# the pieces after the first '|' are the jobs, so the job count is
		# the index of the last piece
		@jlist = split( /\|/, $v );
		$x = $#jlist;
		print "host [$k]  [$x][$v]\n";
	}
	for $k ( keys(%jobinfo) ) {
		$v = $jobinfo{$k};
		print "job [$k]  [$v]\n";
	}
}

# Histograms; every element counts 'up' hosts.
@loadavg_hist = ();	# index = load-average bucket (see _load_bucket)
@loadpct_hist = ();	# not filled in by this section
@jobload_hist = ();	# index = number of jobs on the host
$tot_mach = 0;
$tot_cpus = 0;
# sum of the loads on the machines
$tot_load = 0;
# sum of the loads on the machines (rounded up)
$tot_rload = 0;
# sum of the job-loads on the machines
$tot_jload = 0;
# any 'down' machines?
$tot_down = 0;
# how many are up?
$tot_up   = 0;

# _load_bucket( $load, $inv_width ) - histogram bucket for a load average.
#
#   $load       load average of one host
#   $inv_width  reciprocal of the bucket width (1/$round_l)
#
# Returns the index i such that i*width <= load < (i+1)*width, which is the
# range the load-average report prints for bucket i.  Rounding to the nearest
# bucket instead would file a load of 0.30 under "0.50 to 1.00".  The tiny
# offset keeps products such as 0.57*100 (56.999...) in the right bucket.
sub _load_bucket {
	my ($load, $inv_width) = @_;
	return int( $load*$inv_width + 1e-9 );
}

foreach $host ( keys(%hostinfo) ) {
	$data = $hostinfo{$host};
	# host columns with the host name already removed; going by the qhost
	# header (HOSTNAME ARCH NCPU ...) that makes [0] the architecture,
	# [1] the cpu count and [2] the load
	@list = split( '!', $data );

	$tot_mach++;

	if( ($list[0] eq '-') or ($list[2] eq '-') ) {
		# a '-' in either column is taken to mean the host is not reporting
		if( defined($opt_V) ) {
			print "* host [$host] is down\n";
		}
		$tot_down++;
		$list[1] = 0;
		$list[2] = 0;
	} else {

		$tot_up++;
		$tot_cpus += $list[1];
		$tot_load += $list[2];
		$tot_rload += int( $list[2] + 0.99 );

		# each job appended one '|'-separated piece to the host record
		@jlist = split( /\|/, $data );
		$x = scalar(@jlist) - 1;
		$tot_jload += $x;
		$jobload_hist[$x]++;

		# load as a percentage of the cpu count, rounded to a multiple of
		# $round_l; nothing in this section reads $lp afterwards
		if( ($list[1]+0) > 0 ) {
			$lp = 100 * $list[2] / $list[1];
			$lp = int( $lp/$round_l + 0.5 )*$round_l;
		} else {
			$lp = 'inf';
		}
		$la = _load_bucket( $list[2], $inv_round_l );
		$loadavg_hist[$la]++;

		# -T #: report hosts running at least that many jobs
		if( defined($opt_T) ) {
			if( $x >= ($opt_T+0) ) {
				print "host [$host] has [$x] jobs [$data]\n";
			}
		}
	}
}

# Cluster-wide summary; the "down" line appears only when a host is down.
printf( "\t%4d total hosts\n", $tot_mach );
printf( "\t%4d hosts are down\n", $tot_down ) if $tot_down > 0;
printf( "\t%4d total number of cpus\n", $tot_cpus );
printf( "\t%4d total load (rounded up: %d)\n", $tot_load, $tot_rload );
print "\n";

# Detailed report: -j selects the job-load reports, -H the histogram form.
#   -j -H  job-load histogram      -j  job-load table
#   -H     load histogram          (neither)  load-average table
if( defined($opt_j) ) {
	if( defined($opt_H) ) {
		print_jobhist();
	} else {
		print_jobload();
	}
} else {
	if( defined($opt_H) ) {
		print_loadhist();
	} else {
		print_loadavg();
	}
}

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# process_sge_info - run "/usr/bin/qhost -q -j" and load its output into the
# package-level hashes %hostinfo and %jobinfo.
#
#   %hostinfo{host}  '!'-joined columns of the host line (host name removed),
#                    followed by one "|job!prio" piece per job line seen
#                    under that host
#   %jobinfo{job}    '|'-separated "host!prio!<job columns>" records
#
# Takes no arguments and returns nothing.  Dies if qhost cannot be started;
# warns if it ends with a non-zero status.
sub process_sge_info {
	my ($host,$job,$prio);
	my @list;
	my $line;

	# list-form pipe open: the command is run directly, without a shell
	open( my $fp, '-|', '/usr/bin/qhost', '-q', '-j' )
		or die "ql: cannot run /usr/bin/qhost: $!\n";

	# values used for lines that show up before any host / job / priority
	# line has been seen
	$host = '';
	$job	= -1;
	$prio = '';

	while( defined($line = <$fp>) ) {
		chomp( $line );
		if( $line =~ m/HOSTNAME\s+ARCH\s+NCPU/ ) {
			# header line, skip it
		} elsif( $line =~ m/job\-ID\s+prior\s+name/ ) {
			# header line, skip it
		} elsif( $line =~ m/\-{20,}/ ) {
			# separator line, skip it
		} elsif( $line =~ m/^\w/ ) {
			# first char is a letter, so it is a host
			@list = split( /\s+/, $line );
			$host = shift( @list );
			$hostinfo{$host} = join( '!', @list );
		} elsif( $line =~ m/^\s+\d/ ) {
			# first chars are spaces, then digit, so it is a job num
			$line =~ s/^\s+//;
			@list = split( /\s+/, $line );
			$job = shift( @list );
			if( exists($jobinfo{$job}) ) {
				$jobinfo{$job} .= "|$host!$prio!" . join( '!', @list );
			} else {
				$jobinfo{$job} = "$host!$prio!" . join( '!', @list );
			}
			$hostinfo{$host} .= "|$job!$prio";
		} elsif( $line =~ m/^\s+high/ ) {
			# presumably the line of a queue whose name starts with "high";
			# jobs listed after it are tagged 'hi' -- confirm against the
			# local queue names
			$prio = 'hi';
		} elsif( $line =~ m/^\s+low/ ) {
			# same for a queue whose name starts with "low"
			$prio = 'lo';
		} elsif( $line =~ m/^\s+\w/ ) {
			# first chars are spaces, then letter, so it is a continuation of a job
			$line =~ s/^\s+//;
			$line =~ s/\s+/!/g;
			@list = split( /!/, $line );
			$jobinfo{$job} .= "|$host!$prio!" . join( '!', @list );
			$hostinfo{$host} .= "|$job!$prio";
		} else {
			# anything else (e.g. an empty line) is ignored
		}
	}

	# all done, rest of lines have pending job info
	# close() on a pipe also reports the exit status of the command
	close( $fp )
		or warn "ql: /usr/bin/qhost ended abnormally (status $?)\n";

	# get rid of 'global' item
	delete( $hostinfo{global} );

	return;
}

# print_jobload - print one line per occupied job-load bucket.
#
# Reads the package-level @jobload_hist, whose element i is the number of
# hosts running i jobs; empty buckets are skipped.  Takes no arguments and
# returns nothing.
sub print_jobload {
	my ($i,$x,$y);

	for( $i=0; $i<scalar(@jobload_hist); $i++ ) {
		# buckets that were never incremented are undef, which counts as empty
		if( $jobload_hist[$i] and $jobload_hist[$i] > 0 ) {
			$x = $i;
			$y = $x + 0.99;
			printf "\t%4d nodes have a job-load of %5.2f to %5.2f\n",
					$jobload_hist[$i],$x,$y;
		}
	}
	return;
}

# NOTE(review): in the copy this was restored from, everything between the
# header lines of print_jobhist and the body of the load-average table had
# been lost.  The bar-drawing loops and the whole of print_loadhist below are
# a reconstruction -- compare with a pristine copy of ql if one is available.

# print_jobhist - bar chart of hosts per job count.
#
# Reads the package-level @jobload_hist (element i = number of hosts running
# i jobs) and $tot_up (number of hosts that are up).  The bar area is 50
# characters wide and stands for 100% of the up hosts, so one '#' is 2%.
# Prints nothing when no host is up.  Takes no arguments, returns nothing.
sub print_jobhist {
	# no hosts, no chart -- and no division by zero below
	return if( $tot_up <= 0 );

	my $ppp = 50 / $tot_up;		# '#' characters per host
	my $xxx = ' ' x 9 . '|';	# one 10% segment of the scale
	my $y = 1/$ppp;			# hosts per '#'
	printf "                            one '#' is 2%% or %.1f nodes\n", $y;
	print  "   nds :   job-load  : |$xxx$xxx$xxx$xxx${xxx}100%\n";
	for my $i ( 0 .. $#jobload_hist ) {
		my $n = $jobload_hist[$i];
		next unless( $n and $n > 0 );
		printf "  %4d : %5.2f-%5.2f : |%s\n",
				$n, $i, $i + 0.99, '#' x int( $n*$ppp + 0.5 );
	}
	return;
}

# print_loadhist - bar chart of hosts per load-average bucket.
#
# Reads the package-level @loadavg_hist (element i = number of hosts in
# bucket i), $round_l (bucket width) and $tot_up.  Same scale as
# print_jobhist.  Takes no arguments, returns nothing.
sub print_loadhist {
	return if( $tot_up <= 0 );

	my $ppp = 50 / $tot_up;		# '#' characters per host
	my $xxx = ' ' x 9 . '|';
	my $y = 1/$ppp;			# hosts per '#'
	printf "                            one '#' is 2%% or %.1f nodes\n", $y;
	print  "   nds :   load-avg  : |$xxx$xxx$xxx$xxx${xxx}100%\n";
	for my $i ( 0 .. $#loadavg_hist ) {
		my $n = $loadavg_hist[$i];
		next unless( $n and $n > 0 );
		my $lo = $i * $round_l;
		printf "  %4d : %5.2f-%5.2f : |%s\n",
				$n, $lo, $lo + $round_l, '#' x int( $n*$ppp + 0.5 );
	}
	return;
}

# print_loadavg - print one line per occupied load-average bucket.
#
# Reads the package-level @loadavg_hist and $round_l; bucket i is reported
# as the range i*$round_l .. (i+1)*$round_l.  Takes no arguments, returns
# nothing.
sub print_loadavg {
	for my $i ( 0 .. $#loadavg_hist ) {
		# buckets that were never incremented are undef, which counts as empty
		next unless( $loadavg_hist[$i] and $loadavg_hist[$i] > 0 );
		my $x = $i * $round_l;
		my $y = $x + $round_l;
		printf "\t%4d nodes have a load avg of %5.2f to %5.2f\n",$loadavg_hist[$i],$x,$y;
	}
	return;
}