[gridengine users] All queues dropped because of overload or full

Pat Haley phaley at mit.edu
Wed May 25 15:59:30 UTC 2016


It looks similar, but one big difference is that when I run "qconf -sh" I see 
all my compute nodes listed along with my frontend.  However, the output of 
"qconf -sql" is empty.

Thanks

On 05/25/2016 11:12 AM, MacMullan IV, Hugh wrote:
> I'm no Rocks guy, but maybe this thread will help: https://lists.sdsc.edu/pipermail/npaci-rocks-discussion/2013-January/060918.html
>
>
>
> -----Original Message-----
> From: users-bounces at gridengine.org [mailto:users-bounces at gridengine.org] On Behalf Of Pat Haley
> Sent: Wednesday, May 25, 2016 10:59 AM
> To: users at gridengine.org
> Subject: [gridengine users] All queues dropped because of overload or full
>
>
> Hi All,
>
> We've just upgraded our cluster from Rocks 6.0 to Rocks 6.2.  One issue
> that has come up is that SGE is not scheduling any jobs; they sit in the
> queue in a wait state.  Checking with "qstat -j" gives the following
>
> scheduling info:            All queues dropped because of overload or full
>
> We have also noticed that there are no SGE daemons running on any of the
> execution nodes (I don't know if that is normal or not).  We have also
> collected the information below from qconf.  Any help in resolving this
> would be greatly appreciated.
>
> Thanks
>
> -------------------------------------------------
>
> [root at mseas ~]# qconf -sql
> no cqueue list defined
>
> -------------------------------------------------
> [root at mseas ~]# qconf -tsm
> root at mseas.local triggers scheduler monitoring
> [root at mseas common]# more schedd_runlog
> Wed May 25 09:22:56 2016|-------------START-SCHEDULER-RUN-------------
> Wed May 25 09:22:56 2016|All queues dropped because of overload or full
> Wed May 25 09:22:56 2016|--------------STOP-SCHEDULER-RUN-------------
>
> -------------------------------------------------
> [root at mseas ~]# qconf -sconf
> #global:
> execd_spool_dir              /opt/gridengine/default/spool
> mailer                       /bin/mail
> xterm                        /usr/bin/X11/xterm
> load_sensor                  none
> prolog                       none
> epilog                       none
> shell_start_mode             posix_compliant
> login_shells                 sh,bash,ksh,csh,tcsh
> min_uid                      0
> min_gid                      0
> user_lists                   none
> xuser_lists                  none
> projects                     none
> xprojects                    none
> enforce_project              false
> enforce_user                 auto
> load_report_time             00:00:40
> max_unheard                  00:05:00
> reschedule_unknown           02:00:00
> loglevel                     log_warning
> administrator_mail           none
> set_token_cmd                none
> pag_cmd                      none
> token_extend_time            none
> shepherd_cmd                 none
> qmaster_params               none
> execd_params                 none
> reporting_params             accounting=true reporting=true \
>                                flush_time=00:00:15 joblog=true
> sharelog=00:00:00
> finished_jobs                100
> gid_range                    20000-20100
> qlogin_command               builtin
> qlogin_daemon                builtin
> rlogin_command               builtin
> rlogin_daemon                builtin
> rsh_command                  builtin
> rsh_daemon                   builtin
> max_aj_instances             2000
> max_aj_tasks                 75000
> max_u_jobs                   0
> max_jobs                     0
> max_advance_reservations     0
> auto_user_oticket            0
> auto_user_fshare             0
> auto_user_default_project    none
> auto_user_delete_time        86400
> delegated_file_staging       false
> reprioritize                 0
> jsv_url                      none
> jsv_allowed_mod              ac,h,i,e,o,j,M,N,p,w
>
> -------------------------------------------------
>
> [root at mseas ~]# qconf -se compute-0-0
> hostname              compute-0-0.local
> load_scaling          NONE
> complex_values        NONE
> load_values arch=linux-x64,num_proc=4,mem_total=63802.578125M, \
> swap_total=996.210938M,virtual_total=64798.789062M, \
>                         load_avg=0.060000,load_short=0.040000, \
>                         load_medium=0.060000,load_long=0.000000, \
>                         mem_free=63263.187500M,swap_free=996.210938M, \
> virtual_free=64259.398438M,mem_used=539.390625M, \
>                         swap_used=0.000000M,virtual_used=539.390625M, \
> cpu=0.000000,m_topology=SCCSCC,m_topology_inuse=SCCSCC, \
>                         m_socket=2,m_core=4,np_load_avg=0.015000, \
> np_load_short=0.010000,np_load_medium=0.015000, \
>                         np_load_long=0.000000
> processors            4
> user_lists            NONE
> xuser_lists           NONE
> projects              NONE
> xprojects             NONE
> usage_scaling         NONE
> report_variables      NONE
>
> -------------------------------------------------
> [root at mseas ~]# qconf -ssconf
> algorithm                         default
> schedule_interval                 0:0:15
> maxujobs                          0
> queue_sort_method                 load
> job_load_adjustments              np_load_avg=0.50
> load_adjustment_decay_time        0:7:30
> load_formula                      np_load_avg
> schedd_job_info                   true
> flush_submit_sec                  0
> flush_finish_sec                  0
> params                            none
> reprioritize_interval             0:0:0
> halftime                          168
> usage_weight_list cpu=1.000000,mem=0.000000,io=0.000000
> compensation_factor               5.000000
> weight_user                       0.250000
> weight_project                    0.250000
> weight_department                 0.250000
> weight_job                        0.250000
> weight_tickets_functional         0
> weight_tickets_share              0
> share_override_tickets            TRUE
> share_functional_shares           TRUE
> max_functional_jobs_to_schedule   200
> report_pjob_tickets               TRUE
> max_pending_tasks_per_job         50
> halflife_decay_list               none
> policy_hierarchy                  OFS
> weight_ticket                     0.500000
> weight_waiting_time               0.278000
> weight_deadline                   3600000.000000
> weight_urgency                    0.500000
> weight_priority                   0.000000
> max_reservation                   0
> default_duration                  INFINITY
>
> -------------------------------------------------
>

-- 

-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Pat Haley                          Email:  phaley at mit.edu
Center for Ocean Engineering       Phone:  (617) 253-6824
Dept. of Mechanical Engineering    Fax:    (617) 253-8125
MIT, Room 5-213                    http://web.mit.edu/phaley/www/
77 Massachusetts Avenue
Cambridge, MA  02139-4301



More information about the users mailing list