[gridengine dev] [DRAFT PATCH] Enhancement: exempt certain programs from execd control

Mark Dixon m.c.dixon at leeds.ac.uk
Thu Nov 10 12:44:24 UTC 2011


In an attempt to lighten people's mood today...

Please find a draft patch attached, allowing a GE admin to specify a list 
of programs that are not counted against a job's resource limits. The 
patch has been prepared against a vanilla ge-6.2u5 and only gives this 
feature to Linux execd's.

This is to address a problem where tightly-integrated parallel jobs can be 
killed due to all the instances of qrsh exceeding h_vmem on the MASTER, as 
documented here:

https://arc.liv.ac.uk/trac/SGE/ticket/694

Is this something the major forks would be interested in?

This is my first GE patch and I'm very rusty (it's been over decade since 
I last worked on a big C code), so any and all 
help/comments/insults/rotten fruit would be welcomed.


How to use once applied and built:

"qconf -mconf" - add EXEMPT_PROGRAMS=<filename> to the execd_params line 
(use ":" as a delimiter, if more than one filename is desired).

Setting this to contain whatever filenames `ls $SGE_ROOT/bin/*/qsh` 
expands to on your system should fix the mpirun-qrsh problem (qrsh is 
symlinked to qsh).


Major concerns yet to resolve:

1) I'm not happy with how I'm configuring this feature. Using execd_params 
doesn't seem to fit nicely, so am still looking for the best way to do 
this (a line in "qconf -se", allowing per-execd config, perhaps? Or a 
separate line in "qconf -sconf"?)

2) Using execd_params also means that simply removing the variable doesn't 
actually turn off this feature - you need to set "EXEMPT_PROGRAMS=" and 
wait a bit for the config to propogate to the execds, before removing the 
reference completely. Ugh.

3) I think it will break running a standalone "pdc" binary, as pt_open now 
refers to the cluster configuration. Don't know how big an issue this is.

4) I don't know what "touch_time_stamp" is supposed to achieve, but other 
(similar) bits use it. Commented-out for now.

5) The name of the featire. I don't like the string "exempt_programs" I 
keep using. "resource_exempt" instead?

6) I assume this needs to be submitted under the SISSL 1.2?

7) I'm not happy about the amount of memory allocation error checking 
going on... but seems to be on par with similar routines.


Any guidance would be wonderful :)

Thanks,

Mark
-- 
-----------------------------------------------------------------
Mark Dixon                       Email    : m.c.dixon at leeds.ac.uk
HPC/Grid Systems Support         Tel (int): 35429
Information Systems Services     Tel (ext): +44(0)113 343 5429
University of Leeds, LS2 9JT, UK
-----------------------------------------------------------------
-------------- next part --------------
--- gridengine/source/daemons/common/procfs.c	2011-11-10 11:29:37.654547000 +0000
+++ gridengine/source/daemons/common/procfs.c	2011-11-10 11:28:31.901153000 +0000
@@ -80,7 +80,8 @@
 #include "sge_log.h"
 #include "msg_sge.h"
 #include "sgermon.h"
-#include "basis_types.h"
+#include "sge_conf.h"
+#include "sge_all_listsL.h"
 #include "sgedefs.h"
 #include "exec_ifm.h"
 #include "pdc.h"
@@ -104,6 +105,7 @@
 
 #endif
 
+static lList* exempt_programs = NULL;
 
 /*-----------------------------------------------------------------------*/
 #if defined(LINUX) || defined(ALPHA) || defined(SOLARIS)
@@ -388,10 +390,21 @@
 int pt_open(void)
 {
    cwd = opendir(PROC_DIR);
+
+   if (cwd) {
+      /* storage free'd in pt_close */
+      exempt_programs = mconf_get_exempt_programs();
+   }
+
    return !cwd;
 }
 void pt_close(void)
 {
+   /* free storage assigned in pt_open */
+   if (exempt_programs) {
+      lFreeList(&exempt_programs);
+   }
+
    closedir(cwd);
 }
 
@@ -515,6 +528,58 @@
             continue;
          }
 
+         /**
+          ** get program filename to decide if this
+          ** process should be ignored.
+          **/
+#  if defined(LINUX)
+         /* 
+          * Read proc symlink containing exe filename
+          */
+         sprintf(procnam, PROC_DIR "/%s/exe", dent->d_name);
+         if ((ret = readlink(procnam, buffer, BIGLINE-1))<=0) {
+            close(fd);
+            if (ret == -1 && errno != ENOENT) {
+#ifdef MONITOR_PDC
+               INFO((SGE_EVENT, "could not read %s: %s\n", procnam, strerror(errno)));
+#endif
+               touch_time_stamp(dent->d_name, time_stamp, job_list);
+            }
+            continue;
+         } else {
+            /* readlink doesn't NULL-terminate */
+            buffer[ret] = '\0';
+         }
+         buffer[BIGLINE-1] = '\0';
+
+         /*
+          * Compare against list of exe filenames to ignore
+          */
+         lListElem *exempt_ep = lFirst(exempt_programs);
+         {
+            int found_it = 0;
+            do {
+               const char *exempt_program = NULL;
+
+               if (exempt_ep != NULL) {
+                   exempt_program = lGetString(exempt_ep, ST_name);
+               }
+
+               if (exempt_program && ! strcmp(exempt_program, buffer)) {
+//               touch_time_stamp(dent->d_name, time_stamp, job_list); // Do we need to do this? Not sure what it does...
+                  found_it = 1;
+                  break;
+               }
+            } while((exempt_ep = lNext(exempt_ep)));
+
+            if (found_it == 1) {
+                continue;
+            }
+         }
+#  else
+         /* Not supported yet - to do */
+#  endif
+
          /** 
           ** get a list of supplementary group ids to decide
           ** whether this process will be needed;
--- gridengine/source/libs/sgeobj/sge_conf.c	2009-10-20 08:19:19.000000000 +0100
+++ gridengine/source/libs/sgeobj/sge_conf.c	2011-11-10 10:44:28.000000000 +0000
@@ -63,7 +63,7 @@
 #include "sgeobj/msg_sgeobjlib.h"
 
 #include "sge.h"
-#include "basis_types.h"
+#include "sge_all_listsL.h"
 #include "sge_conf.h"
 #include "sge_feature.h"
 #include "sge_usage.h"
@@ -174,6 +174,7 @@
 static bool enable_windomacc = false;
 static bool enable_binding = false;
 static bool enable_addgrp_kill = false;
+static char* exempt_programs = NULL;
 static u_long32 pdc_interval = 1;
 static char s_descriptors[100];
 static char h_descriptors[100];
@@ -989,6 +990,13 @@
             sge_strlcpy(h_locks, s+sizeof("H_LOCKS"), 100);
             continue;
          }
+         if (!strncasecmp(s, "EXEMPT_PROGRAMS", sizeof("EXEMPT_PROGRAMS")-1)) {
+            if (exempt_programs) {
+               free(exempt_programs);
+            }
+            exempt_programs = sge_strdup(NULL, &(s[sizeof("EXEMPT_PROGRAMS")]));
+            continue;
+         }
       }
       SGE_UNLOCK(LOCK_MASTER_CONF, LOCK_WRITE);
       sge_free_saved_vars(conf_context);
@@ -2048,6 +2056,28 @@
    DRETURN(ret);
 }
 
+/* returned pointer needs to be freed */
+lList* mconf_get_exempt_programs(void) {
+   lList* exempt_list = NULL;
+
+   DENTER(BASIS_LAYER, "mconf_get_exempt_programs");
+   SGE_LOCK(LOCK_MASTER_CONF, LOCK_READ);
+
+   exempt_list = lCreateList("exempt_programs", ST_Type);
+
+   struct saved_vars_s *conf_context = NULL;
+   char *s;
+   for(s=sge_strtok_r(exempt_programs, ":", &conf_context); s; s=sge_strtok_r(NULL, ":", &conf_context)) {
+       lListElem *element;
+       element = lCreateElem(ST_Type);
+       lSetString(element, ST_name, s);
+       lAppendElem(exempt_list, element);
+   }
+
+   SGE_UNLOCK(LOCK_MASTER_CONF, LOCK_READ);
+   DRETURN(exempt_list);
+}
+
 u_long32 mconf_get_pdc_interval(void) {
    u_long32 ret;
 
--- gridengine/source/libs/sgeobj/sge_conf.h	2009-10-20 08:19:20.000000000 +0100
+++ gridengine/source/libs/sgeobj/sge_conf.h	2011-11-08 16:42:46.000000000 +0000
@@ -146,6 +146,7 @@
 bool mconf_get_enable_test_sleep_after_request(void);
 int mconf_get_max_job_deletion_time(void);
 bool mconf_get_enable_addgrp_kill(void);
+lList* mconf_get_exempt_programs(void);
 u_long32 mconf_get_pdc_interval(void);
 bool mconf_get_enable_reschedule_kill(void);
 bool mconf_get_enable_reschedule_slave(void);


More information about the dev mailing list