[gridengine users] All queues dropped because of overload or full

Pat Haley phaley at mit.edu
Wed May 25 14:58:54 UTC 2016


Hi All,

We've just upgraded our cluster from Rocks 6.0 to Rocks 6.2.  One issue 
that has come up is that SGE is not scheduling any jobs; they sit in the 
queue in a wait state.  Checking with "qstat -j" gives the following

scheduling info:            All queues dropped because of overload or full

We have also noticed that there are no SGE daemons running on any of the 
execution nodes (I don't know if that is normal or not).  We have also 
collected the information below from qconf.  Any help in resolving this 
would be greatly appreciated.

Thanks

-------------------------------------------------

[root at mseas ~]# qconf -sql
no cqueue list defined

-------------------------------------------------
[root at mseas ~]# qconf -tsm
root at mseas.local triggers scheduler monitoring
[root at mseas common]# more schedd_runlog
Wed May 25 09:22:56 2016|-------------START-SCHEDULER-RUN-------------
Wed May 25 09:22:56 2016|All queues dropped because of overload or full
Wed May 25 09:22:56 2016|--------------STOP-SCHEDULER-RUN-------------

-------------------------------------------------
[root at mseas ~]# qconf -sconf
#global:
execd_spool_dir              /opt/gridengine/default/spool
mailer                       /bin/mail
xterm                        /usr/bin/X11/xterm
load_sensor                  none
prolog                       none
epilog                       none
shell_start_mode             posix_compliant
login_shells                 sh,bash,ksh,csh,tcsh
min_uid                      0
min_gid                      0
user_lists                   none
xuser_lists                  none
projects                     none
xprojects                    none
enforce_project              false
enforce_user                 auto
load_report_time             00:00:40
max_unheard                  00:05:00
reschedule_unknown           02:00:00
loglevel                     log_warning
administrator_mail           none
set_token_cmd                none
pag_cmd                      none
token_extend_time            none
shepherd_cmd                 none
qmaster_params               none
execd_params                 none
reporting_params             accounting=true reporting=true \
                              flush_time=00:00:15 joblog=true 
sharelog=00:00:00
finished_jobs                100
gid_range                    20000-20100
qlogin_command               builtin
qlogin_daemon                builtin
rlogin_command               builtin
rlogin_daemon                builtin
rsh_command                  builtin
rsh_daemon                   builtin
max_aj_instances             2000
max_aj_tasks                 75000
max_u_jobs                   0
max_jobs                     0
max_advance_reservations     0
auto_user_oticket            0
auto_user_fshare             0
auto_user_default_project    none
auto_user_delete_time        86400
delegated_file_staging       false
reprioritize                 0
jsv_url                      none
jsv_allowed_mod              ac,h,i,e,o,j,M,N,p,w

-------------------------------------------------

[root at mseas ~]# qconf -se compute-0-0
hostname              compute-0-0.local
load_scaling          NONE
complex_values        NONE
load_values arch=linux-x64,num_proc=4,mem_total=63802.578125M, \
swap_total=996.210938M,virtual_total=64798.789062M, \
                       load_avg=0.060000,load_short=0.040000, \
                       load_medium=0.060000,load_long=0.000000, \
                       mem_free=63263.187500M,swap_free=996.210938M, \
virtual_free=64259.398438M,mem_used=539.390625M, \
                       swap_used=0.000000M,virtual_used=539.390625M, \
cpu=0.000000,m_topology=SCCSCC,m_topology_inuse=SCCSCC, \
                       m_socket=2,m_core=4,np_load_avg=0.015000, \
np_load_short=0.010000,np_load_medium=0.015000, \
                       np_load_long=0.000000
processors            4
user_lists            NONE
xuser_lists           NONE
projects              NONE
xprojects             NONE
usage_scaling         NONE
report_variables      NONE

-------------------------------------------------
[root at mseas ~]# qconf -ssconf
algorithm                         default
schedule_interval                 0:0:15
maxujobs                          0
queue_sort_method                 load
job_load_adjustments              np_load_avg=0.50
load_adjustment_decay_time        0:7:30
load_formula                      np_load_avg
schedd_job_info                   true
flush_submit_sec                  0
flush_finish_sec                  0
params                            none
reprioritize_interval             0:0:0
halftime                          168
usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
compensation_factor               5.000000
weight_user                       0.250000
weight_project                    0.250000
weight_department                 0.250000
weight_job                        0.250000
weight_tickets_functional         0
weight_tickets_share              0
share_override_tickets            TRUE
share_functional_shares           TRUE
max_functional_jobs_to_schedule   200
report_pjob_tickets               TRUE
max_pending_tasks_per_job         50
halflife_decay_list               none
policy_hierarchy                  OFS
weight_ticket                     0.500000
weight_waiting_time               0.278000
weight_deadline                   3600000.000000
weight_urgency                    0.500000
weight_priority                   0.000000
max_reservation                   0
default_duration                  INFINITY

-------------------------------------------------

-- 

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Pat Haley                          Email:  phaley at mit.edu
Center for Ocean Engineering       Phone:  (617) 253-6824
Dept. of Mechanical Engineering    Fax:    (617) 253-8125
MIT, Room 5-213                    http://web.mit.edu/phaley/www/
77 Massachusetts Avenue
Cambridge, MA  02139-4301



More information about the users mailing list