root/lib/common/watchdog.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sysrq_trigger
  2. panic_local
  3. panic_sbd
  4. pcmk__panic
  5. pcmk__locate_sbd
  6. pcmk__get_sbd_timeout
  7. pcmk__get_sbd_sync_resource_startup
  8. pcmk__auto_watchdog_timeout
  9. pcmk__valid_sbd_timeout

   1 /*
   2  * Copyright 2013-2020 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU Lesser General Public License
   7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sched.h>
  13 #include <sys/ioctl.h>
  14 #include <sys/reboot.h>
  15 
  16 #include <sys/types.h>
  17 #include <sys/stat.h>
  18 #include <unistd.h>
  19 #include <ctype.h>
  20 #include <dirent.h>
  21 #include <signal.h>
  22 
  23 #ifdef _POSIX_MEMLOCK
  24 #  include <sys/mman.h>
  25 #endif
  26 
  27 static pid_t sbd_pid = 0; /* sbd PID cached by pcmk__locate_sbd(); 0 when sbd has not been found */
  28 
  29 static void
  30 sysrq_trigger(char t)
     /* [previous][next][first][last][top][bottom][index][help] */
  31 {
  32 #if HAVE_LINUX_PROCFS
  33     FILE *procf;
  34 
  35     // Root can always write here, regardless of kernel.sysrq value
  36     procf = fopen("/proc/sysrq-trigger", "a");
  37     if (!procf) {
  38         crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
  39         return;
  40     }
  41     crm_info("sysrq-trigger: %c", t);
  42     fprintf(procf, "%c\n", t);
  43     fclose(procf);
  44 #endif // HAVE_LINUX_PROCFS
  45     return;
  46 }
  47 
  48 
  49 /*!
  50  * \internal
  51  * \brief Panic the local host (if root) or tell pacemakerd to do so
  52  */
  53 static void
  54 panic_local(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  55 {
  56     int rc = pcmk_ok;
  57     uid_t uid = geteuid();
  58     pid_t ppid = getppid();
  59 
  60     if(uid != 0 && ppid > 1) {
  61         /* We're a non-root pacemaker daemon (pacemaker-based,
  62          * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
  63          * the original pacemakerd parent.
  64          *
  65          * Of these, only the controller is likely to be initiating resets.
  66          */
  67         crm_emerg("Signaling parent %lld to panic", (long long) ppid);
  68         crm_exit(CRM_EX_PANIC);
  69         return;
  70 
  71     } else if (uid != 0) {
  72 #if HAVE_LINUX_PROCFS
  73         /*
  74          * No permissions, and no pacemakerd parent to escalate to.
  75          * Track down the new pacemakerd process and send a signal instead.
  76          */
  77         union sigval signal_value;
  78 
  79         memset(&signal_value, 0, sizeof(signal_value));
  80         ppid = pcmk__procfs_pid_of("pacemakerd");
  81         crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
  82 
  83         if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
  84             crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
  85                        (long long) ppid);
  86         }
  87 #endif // HAVE_LINUX_PROCFS
  88 
  89         /* The best we can do now is die */
  90         crm_exit(CRM_EX_PANIC);
  91         return;
  92     }
  93 
  94     /* We're either pacemakerd, or a pacemaker daemon running as root */
  95 
  96     if (pcmk__str_eq("crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
  97         sysrq_trigger('c');
  98     } else if (pcmk__str_eq("sync-crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
  99         sync();
 100         sysrq_trigger('c');
 101     } else {
 102         if (pcmk__str_eq("sync-reboot", getenv("PCMK_panic_action"), pcmk__str_casei)) {
 103             sync();
 104         }
 105         sysrq_trigger('b');
 106     }
 107     /* reboot(RB_HALT_SYSTEM); rc = errno; */
 108     reboot(RB_AUTOBOOT);
 109     rc = errno;
 110 
 111     crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
 112               (long long) ppid, pcmk_rc_str(rc), rc);
 113 
 114     if(ppid > 1) {
 115         /* child daemon */
 116         exit(CRM_EX_PANIC);
 117     } else {
 118         /* pacemakerd or orphan child */
 119         exit(CRM_EX_FATAL);
 120     }
 121 }
 122 
 123 /*!
 124  * \internal
 125  * \brief Tell sbd to kill the local host, then exit
 126  */
 127 static void
 128 panic_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 129 {
 130     union sigval signal_value;
 131     pid_t ppid = getppid();
 132 
 133     crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
 134 
 135     memset(&signal_value, 0, sizeof(signal_value));
 136     /* TODO: Arrange for a slightly less brutal option? */
 137     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
 138         crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
 139                    (long long) sbd_pid);
 140         panic_local();
 141     }
 142 
 143     if(ppid > 1) {
 144         /* child daemon */
 145         exit(CRM_EX_PANIC);
 146     } else {
 147         /* pacemakerd or orphan child */
 148         exit(CRM_EX_FATAL);
 149     }
 150 }
 151 
 152 /*!
 153  * \internal
 154  * \brief Panic the local host
 155  *
 156  * Panic the local host either by sbd (if running), directly, or by asking
 157  * pacemakerd. If trace logging this function, exit instead.
 158  *
 159  * \param[in] origin   Function caller (for logging only)
 160  */
 161 void
 162 pcmk__panic(const char *origin)
     /* [previous][next][first][last][top][bottom][index][help] */
 163 {
 164     static struct qb_log_callsite *panic_cs = NULL;
 165 
 166     if (panic_cs == NULL) {
 167         panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay",
 168                                        LOG_TRACE, __LINE__, crm_trace_nonlog);
 169     }
 170 
 171     /* Ensure sbd_pid is set */
 172     (void) pcmk__locate_sbd();
 173 
 174     if (panic_cs && panic_cs->targets) {
 175         /* getppid() == 1 means our original parent no longer exists */
 176         crm_emerg("Shutting down instead of panicking the node "
 177                   CRM_XS " origin=%s sbd=%lld parent=%d",
 178                   origin, (long long) sbd_pid, getppid());
 179         crm_exit(CRM_EX_FATAL);
 180         return;
 181     }
 182 
 183     if(sbd_pid > 1) {
 184         crm_emerg("Signaling sbd[%lld] to panic the system: %s",
 185                   (long long) sbd_pid, origin);
 186         panic_sbd();
 187 
 188     } else {
 189         crm_emerg("Panicking the system directly: %s", origin);
 190         panic_local();
 191     }
 192 }
 193 
 194 /*!
 195  * \internal
 196  * \brief Return the process ID of sbd (or 0 if it is not running)
 197  */
 198 pid_t
 199 pcmk__locate_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 200 {
 201     char *pidfile = NULL;
 202     char *sbd_path = NULL;
 203     int rc;
 204 
 205     if(sbd_pid > 1) {
 206         return sbd_pid;
 207     }
 208 
 209     /* Look for the pid file */
 210     pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
 211     sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
 212 
 213     /* Read the pid file */
 214     rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
 215     if (rc == pcmk_rc_ok) {
 216         crm_trace("SBD detected at pid %lld (via PID file %s)",
 217                   (long long) sbd_pid, pidfile);
 218 
 219 #if HAVE_LINUX_PROCFS
 220     } else {
 221         /* Fall back to /proc for systems that support it */
 222         sbd_pid = pcmk__procfs_pid_of("sbd");
 223         crm_trace("SBD detected at pid %lld (via procfs)",
 224                   (long long) sbd_pid);
 225 #endif // HAVE_LINUX_PROCFS
 226     }
 227 
 228     if(sbd_pid < 0) {
 229         sbd_pid = 0;
 230         crm_trace("SBD not detected");
 231     }
 232 
 233     free(pidfile);
 234     free(sbd_path);
 235 
 236     return sbd_pid;
 237 }
 238 
 239 long
 240 pcmk__get_sbd_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 241 {
 242     static long sbd_timeout = -2;
 243 
 244     if (sbd_timeout == -2) {
 245         sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
 246     }
 247     return sbd_timeout;
 248 }
 249 
 250 bool
 251 pcmk__get_sbd_sync_resource_startup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 252 {
 253     static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
 254     static bool checked_sync_resource_startup = false;
 255 
 256     if (!checked_sync_resource_startup) {
 257         const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
 258 
 259         if (sync_env == NULL) {
 260             crm_trace("Defaulting to %sstart-up synchronization with sbd",
 261                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
 262 
 263         } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
 264             crm_warn("Defaulting to %sstart-up synchronization with sbd "
 265                      "because environment value '%s' is invalid",
 266                      (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
 267         }
 268         checked_sync_resource_startup = true;
 269     }
 270     return sync_resource_startup != 0;
 271 }
 272 
 273 long
 274 pcmk__auto_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 275 {
 276     long sbd_timeout = pcmk__get_sbd_timeout();
 277 
 278     return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
 279 }
 280 
 281 bool
 282 pcmk__valid_sbd_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 283 {
 284     long st_timeout = value? crm_get_msec(value) : 0;
 285 
 286     if (st_timeout < 0) {
 287         st_timeout = pcmk__auto_watchdog_timeout();
 288         crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
 289                   st_timeout, value);
 290     }
 291 
 292     if (st_timeout == 0) {
 293         crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
 294                   value? value : "default");
 295 
 296     } else if (pcmk__locate_sbd() == 0) {
 297         crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
 298                   "but SBD not active", (value? value : "auto"));
 299         crm_exit(CRM_EX_FATAL);
 300         return false;
 301 
 302     } else {
 303         long sbd_timeout = pcmk__get_sbd_timeout();
 304 
 305         if (st_timeout < sbd_timeout) {
 306             crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
 307                       "(must be >%ldms)", value, sbd_timeout);
 308             crm_exit(CRM_EX_FATAL);
 309             return false;
 310         }
 311         crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
 312                  value, sbd_timeout);
 313     }
 314     return true;
 315 }

/* [previous][next][first][last][top][bottom][index][help] */