[gridengine dev] [PATCH] Fix PE task array job failure due to missing job script on execd

Mark Dixon m.c.dixon at leeds.ac.uk
Fri Dec 9 11:56:43 UTC 2011


Commit message from the attached patch (prepared against 6.2u5) reproduced 
below... please let me know if this is useful and/or if there are any 
problems with it :)

Cheers,

Mark
-- 
-----------------------------------------------------------------
Mark Dixon                       Email    : m.c.dixon at leeds.ac.uk
HPC/Grid Systems Support         Tel (int): 35429
Information Systems Services     Tel (ext): +44(0)113 343 5429
University of Leeds, LS2 9JT, UK
-----------------------------------------------------------------

This patch resolves an issue when the execd attempted to minimise the
number of times a job script is copied; it did not correctly count how
many MASTERs for a job were running on that instance at job start and end.

For tightly-integrated parallel task array jobs where a job script
(not binary) was submitted, this often resulted in task failure, queue
instances being placed into an errored state and emails sent to the
admin containing an error from the shepherd "unable to find job file".

This was relatively important to fix, as a "-tc 1" option to qsub and
friends make this a tidy way to chain runs of a checkpointing application
without needing to muck about with job dependencies.

Summary:

* Introduced a new routine, count_master_tasks, which returns the number
of MASTERs running on the execd, for a single job

* Edited handle_job to use count_master_tasks. Previous logic had an
incorrect assumption that JAT_granted_destin_identifier_list contained
a full list of hosts included in the task, whereas on a SLAVE actually
only contains themselves: this resulted in the job script not being
copied to the execd when it should.

* Edited remove_acked_job_exit to use count_master_tasks, and only when a
MASTER exits. Previous logic was flawed and over-aggressive when deleting
job scripts and often did this when a SLAVE exited.
-------------- next part --------------
From 812b0510bcc4c453a56c9aff1bc112e797ac8023 Mon Sep 17 00:00:00 2001
From: Mark Dixon <m.c.dixon at leeds.ac.uk>
Date: Fri, 9 Dec 2011 11:17:06 +0000
Subject: [PATCH] Fix PE task array job failure due to missing job script on execd

This patch resolves an issue when the execd attempted to minimise the
number of times a job script is copied; it did not correctly count how
many MASTERs for a job were running on that instance at job start and end.

For tightly-integrated parallel task array jobs where a job script
(not binary) was submitted, this often resulted in task failure, queue
instances being placed into an errored state and emails sent to the
admin containing an error from the shepherd "unable to find job file".

This was relatively important to fix, as a "-tc 1" option to qsub and
friends make this a tidy way to chain runs of a checkpointing application
without needing to muck about with job dependencies.

Summary:

* Introduced a new routine, count_master_tasks, which returns the number
of MASTERs running on the execd, for a single job

* Edited handle_job to use count_master_tasks. Previous logic had an
incorrect assumption that JAT_granted_destin_identifier_list contained
a full list of hosts included in the task, whereas on a SLAVE actually
only contains themselves: this resulted in the job script not being
copied to the execd when it should.

* Edited remove_acked_job_exit to use count_master_tasks, and only when a
MASTER exits. Previous logic was flawed and over-aggressive when deleting
job scripts and often did this when a SLAVE exited.
---
 source/daemons/execd/execd_job_exec.c |   31 +-----------
 source/daemons/execd/reaper_execd.c   |   85 +++++++++++++++++++++++++++-----
 source/daemons/execd/reaper_execd.h   |    2 +
 3 files changed, 75 insertions(+), 43 deletions(-)

diff --git a/source/daemons/execd/execd_job_exec.c b/source/daemons/execd/execd_job_exec.c
index d9fb7b2..094c4b0 100644
--- a/source/daemons/execd/execd_job_exec.c
+++ b/source/daemons/execd/execd_job_exec.c
@@ -330,19 +330,8 @@ static int handle_job(sge_gdi_ctx_class_t *ctx, lListElem *jelem, lListElem *jat
       /* interactive jobs and slave jobs do not have a script file */
       if (!slave && lGetString(jelem, JB_script_file)) {
          int nwritten;
-         int found_script = 0;
-         const void *iterator = NULL;
-         lListElem *tmp_job = NULL;
-         lListElem *next_tmp_job = NULL;
          u_long32 job_id = lGetUlong(jelem, JB_job_number);
-         lList *gdi_list = NULL;
-         lList *ja_task_list = NULL;   
-         const char *hostname = NULL;
-         const char *tmp_hostname = NULL;
 
-         gdi_list = lGetList(jatep, JAT_granted_destin_identifier_list);
-         hostname = lGetHost(lFirst(gdi_list), JG_qhostname);
-         
          if (!mconf_get_simulate_jobs()) {
             /*
              * Is another array task of the same job already here?
@@ -352,23 +341,7 @@ static int handle_job(sge_gdi_ctx_class_t *ctx, lListElem *jelem, lListElem *jat
              * check wether there is another master task of the same job running
              * on this host. This is important in case of array pe-jobs.
              */
-            next_tmp_job = lGetElemUlongFirst(*(object_type_get_master_list(SGE_TYPE_JOB)),
-                                              JB_job_number, job_id, &iterator);
-            while ((tmp_job = next_tmp_job) != NULL) {
-               next_tmp_job = lGetElemUlongNext(*(object_type_get_master_list(SGE_TYPE_JOB)),
-                                              JB_job_number, job_id, &iterator);
-              
-               ja_task_list = lGetList(tmp_job,JB_ja_tasks);
-               gdi_list = lGetList(lFirst(ja_task_list), JAT_granted_destin_identifier_list);
-               tmp_hostname = lGetHost(lFirst(gdi_list), JG_qhostname);
-              
-               if (sge_hostcmp(hostname, tmp_hostname) == 0) {
-                  found_script = 1;
-                  break;
-               }
-            }
-
-            if (!found_script) {
+            if (count_master_tasks(*(object_type_get_master_list(SGE_TYPE_JOB)), job_id) == 0) {
                int fd;
 
                /* We are root. Make the scriptfile readable for the jobs submitter,
diff --git a/source/daemons/execd/reaper_execd.c b/source/daemons/execd/reaper_execd.c
index 3dcfb28..2b7084b 100644
--- a/source/daemons/execd/reaper_execd.c
+++ b/source/daemons/execd/reaper_execd.c
@@ -847,7 +847,6 @@ void remove_acked_job_exit(sge_gdi_ctx_class_t *ctx, u_long32 job_id, u_long32 j
    SGE_STRUCT_STAT statbuf;
    lListElem *jep = NULL, *petep = NULL, *jatep = NULL;
    const char *pe_task_id_str; 
-   const void *iterator;
    const char *sge_root = ctx->get_sge_root(ctx);
 
    DENTER(TOP_LAYER, "remove_acked_job_exit");
@@ -950,23 +949,21 @@ void remove_acked_job_exit(sge_gdi_ctx_class_t *ctx, u_long32 job_id, u_long32 j
       }
 
       if (pe_task_id_str == NULL) {
+         /*
+          * Clean-up the job container (master and slave tasks)
+          */
          if (!mconf_get_simulate_jobs()) {
             job_remove_spool_file(job_id, ja_task_id, NULL, SPOOL_WITHIN_EXECD);
 
             if (!JOB_TYPE_IS_BINARY(lGetUlong(jep, JB_type)) &&
-                lGetString(jep, JB_exec_file)) {
-               int task_number = 0;
-               lListElem *tmp_job = NULL;
-
-               /* it is possible to remove the exec_file if
-                  less than one task of a job is running */
-               tmp_job = lGetElemUlongFirst(*(object_type_get_master_list(SGE_TYPE_JOB)), JB_job_number, job_id, &iterator);
-               while (tmp_job != NULL && task_number <= 2) {
-                  task_number++;
-                  tmp_job = lGetElemUlongNext(*(object_type_get_master_list(SGE_TYPE_JOB)), JB_job_number, job_id, &iterator);
-               }
-               
-               if (task_number <= 1) {
+                lGetString(jep, JB_exec_file) &&
+                lGetUlong(jatep, JAT_status) != JSLAVE) {
+
+               /* 
+                * Unlink job_script if there are no other master tasks
+                * (e.g. task arrays) for this job present on execd
+                */
+               if (count_master_tasks(*(object_type_get_master_list(SGE_TYPE_JOB)), job_id) <= 1) {
                   DPRINTF(("unlinking script file %s\n", lGetString(jep, JB_exec_file)));
                   unlink(lGetString(jep, JB_exec_file));
                }
@@ -977,6 +974,10 @@ void remove_acked_job_exit(sge_gdi_ctx_class_t *ctx, u_long32 job_id, u_long32 j
       }
 
       if (pe_task_id_str != NULL) {
+         /*
+          * Clean-up the PE task (a second pass cleans the container)
+          */
+
          /* unchain pe task element from task list */
          lRemoveElem(lGetList(jatep, JAT_task_list), &petep);
 
@@ -2262,4 +2263,60 @@ static void clean_up_binding(char* binding)
    DRETURN_VOID;
 }
 
+/****** reaper_execd/count_master_tasks() ****************************************
+*  NAME
+*     count_master_tasks() -- Counts the number of master tasks present for a job
+*
+*  SYNOPSIS
+*     int count_master_tasks(const lList *lp, u_long32 job_id)
+*
+*  FUNCTION
+*     Counts the number of master tasks for a specific job that are present
+*     on the execd.
+*
+*  INPUTS
+*     const lList *lp - Pointer to the job list
+*     u_long32 job_id - Job number to count master tasks for
+*
+*  RESULT
+*     int             - Number of master tasks found
+*
+*
+*  NOTES
+*
+*  SEE ALSO
+*     ???/???
+*******************************************************************************/
+int count_master_tasks(const lList *lp, u_long32 job_id) {
+   const void *iterator = NULL;
+   int         master_jobs = 0;
+
+   DENTER(TOP_LAYER, "count_master_jobs");
+
+   lListElem *jep = lGetElemUlongFirst(lp, JB_job_number, job_id, &iterator);
+   while (jep != NULL) {
+
+      lListElem *jatep = lFirst(lGetList(jep, JB_ja_tasks));
+      while (jatep != NULL) {
+         lList *gdi_list = lGetList(jatep, JAT_granted_destin_identifier_list);
+
+         /*
+          * Don't need to loop over gdi_list. This is either:
+          * 1) A complete list, with the first entry as the master
+          * 2) A single entry, containing the slave record for this host
+          */
+         if (lGetUlong(lFirst(gdi_list), JG_tag_slave_job) == 0) {
+            master_jobs++;
+         }
+
+         jatep = lNext(jatep);
+      }
+
+      jep = lGetElemUlongNext(lp, JB_job_number, job_id, &iterator);
+   }
+
+   DPRINTF(("Found %d master jobs for "sge_u32, master_jobs, job_id));
+
+   return master_jobs;
+}
 
diff --git a/source/daemons/execd/reaper_execd.h b/source/daemons/execd/reaper_execd.h
index bf7e4ee..b335778 100644
--- a/source/daemons/execd/reaper_execd.h
+++ b/source/daemons/execd/reaper_execd.h
@@ -43,4 +43,6 @@ void remove_acked_job_exit(sge_gdi_ctx_class_t *ctx, u_long32 job_id, u_long32 j
 void reaper_sendmail(sge_gdi_ctx_class_t *ctx, lListElem *jep, lListElem *jr);
 
 void execd_slave_job_exit(u_long32 job_id, u_long32 ja_task_id);
+
+int count_master_tasks(const lList *lp, u_long32 job_id);
 #endif
-- 
1.7.3.1



More information about the dev mailing list