root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. too_many_st_failures
  4. st_fail_count_reset
  5. st_fail_count_increment
  6. cib_fencing_updated
  7. send_stonith_update
  8. abort_for_stonith_failure
  9. add_stonith_cleanup
  10. remove_stonith_cleanup
  11. purge_stonith_cleanup
  12. execute_stonith_cleanup
  13. fail_incompletable_stonith
  14. tengine_stonith_connection_destroy
  15. handle_fence_notification
  16. te_connect_stonith
  17. controld_trigger_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. controld_execute_fence_action
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2022 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/msg_xml.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/stonith-ng.h>
  15 #include <crm/fencing/internal.h>
  16 
  17 #include <pacemaker-controld.h>
  18 
  19 static void
  20 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  21 
  22 /*
  23  * stonith failure counting
  24  *
  25  * We don't want to get stuck in a permanent fencing loop. Keep track of the
  26  * number of fencing failures for each target node, and the most we'll restart a
  27  * transition for.
  28  */
  29 
/* Per-target fencing failure record, stored as a value in stonith_failures */
struct st_fail_rec {
    int count;      // Number of fencing failures counted for the target
};

// Whether to panic (rather than exit) when notified of our own fencing
static bool fence_reaction_panic = FALSE;
// Failure count at which a target is treated as having failed too often
static unsigned long int stonith_max_attempts = 10;
// Target name -> struct st_fail_rec; created lazily on the first failure
static GHashTable *stonith_failures = NULL;
  37 
  38 void
  39 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  40 {
  41     stonith_max_attempts = char2score(value);
  42     if (stonith_max_attempts < 1UL) {
  43         stonith_max_attempts = 10UL;
  44     }
  45 }
  46 
  47 void
  48 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  49 {
  50     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  51         fence_reaction_panic = TRUE;
  52 
  53     } else {
  54         if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
  55             crm_warn("Invalid value '%s' for %s, using 'stop'",
  56                      reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
  57         }
  58         fence_reaction_panic = FALSE;
  59     }
  60 }
  61 
  62 static gboolean
  63 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
  64 {
  65     GHashTableIter iter;
  66     const char *key = NULL;
  67     struct st_fail_rec *value = NULL;
  68 
  69     if (stonith_failures == NULL) {
  70         return FALSE;
  71     }
  72 
  73     if (target == NULL) {
  74         g_hash_table_iter_init(&iter, stonith_failures);
  75         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
  76                (gpointer *) &value)) {
  77 
  78             if (value->count >= stonith_max_attempts) {
  79                 target = (const char*)key;
  80                 goto too_many;
  81             }
  82         }
  83     } else {
  84         value = g_hash_table_lookup(stonith_failures, target);
  85         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
  86             goto too_many;
  87         }
  88     }
  89     return FALSE;
  90 
  91 too_many:
  92     crm_warn("Too many failures (%d) to fence %s, giving up",
  93              value->count, target);
  94     return TRUE;
  95 }
  96 
  97 /*!
  98  * \internal
  99  * \brief Reset a stonith fail count
 100  *
 101  * \param[in] target  Name of node to reset, or NULL for all
 102  */
 103 void
 104 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 105 {
 106     if (stonith_failures == NULL) {
 107         return;
 108     }
 109 
 110     if (target) {
 111         struct st_fail_rec *rec = NULL;
 112 
 113         rec = g_hash_table_lookup(stonith_failures, target);
 114         if (rec) {
 115             rec->count = 0;
 116         }
 117     } else {
 118         GHashTableIter iter;
 119         const char *key = NULL;
 120         struct st_fail_rec *rec = NULL;
 121 
 122         g_hash_table_iter_init(&iter, stonith_failures);
 123         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 124                                       (gpointer *) &rec)) {
 125             rec->count = 0;
 126         }
 127     }
 128 }
 129 
 130 static void
 131 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 132 {
 133     struct st_fail_rec *rec = NULL;
 134 
 135     if (stonith_failures == NULL) {
 136         stonith_failures = pcmk__strkey_table(free, free);
 137     }
 138 
 139     rec = g_hash_table_lookup(stonith_failures, target);
 140     if (rec) {
 141         rec->count++;
 142     } else {
 143         rec = malloc(sizeof(struct st_fail_rec));
 144         if(rec == NULL) {
 145             return;
 146         }
 147 
 148         rec->count = 1;
 149         g_hash_table_insert(stonith_failures, strdup(target), rec);
 150     }
 151 }
 152 
 153 /* end stonith fail count functions */
 154 
 155 
 156 static void
 157 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 158                     void *user_data)
 159 {
 160     if (rc < pcmk_ok) {
 161         crm_err("Fencing update %d for %s: failed - %s (%d)",
 162                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 163         crm_log_xml_warn(msg, "Failed update");
 164         abort_transition(INFINITY, pcmk__graph_shutdown, "CIB update failed",
 165                          NULL);
 166 
 167     } else {
 168         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 169     }
 170 }
 171 
/*!
 * \internal
 * \brief Record a fenced node's state in the CIB
 *
 * Marks the peer as down in the local caches, sends a node state update for
 * it to the CIB status section, registers cib_fencing_updated() as the
 * callback for that update, and requests deletion of the node's state via
 * controld_delete_node_state().
 *
 * \param[in] action  Not used by this function
 * \param[in] target  Name of the node that was fenced (must not be NULL)
 * \param[in] uuid    Node ID to record for the target (must not be NULL)
 */
static void
send_stonith_update(pcmk__graph_action_t *action, const char *target,
                    const char *uuid)
{
    int rc = pcmk_ok;
    crm_node_t *peer = NULL;

    /* We (usually) rely on the membership layer to do node_update_cluster,
     * and the peer status callback to do node_update_peer, because the node
     * might have already rejoined before we get the stonith result here.
     */
    int flags = node_update_join | node_update_expected;

    /* zero out the node-status & remove all LRM status info */
    xmlNode *node_state = NULL;

    CRM_CHECK(target != NULL, return);
    CRM_CHECK(uuid != NULL, return);

    /* Make sure the membership and join caches are accurate */
    peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);

    CRM_CHECK(peer != NULL, return);

    if (peer->state == NULL) {
        /* Usually, we rely on the membership layer to update the cluster state
         * in the CIB. However, if the node has never been seen, do it here, so
         * the node is not considered unclean.
         */
        flags |= node_update_cluster;
    }

    if (peer->uuid == NULL) {
        crm_info("Recording uuid '%s' for node '%s'", uuid, target);
        peer->uuid = strdup(uuid);
    }

    crmd_peer_down(peer, TRUE);

    /* Generate a node state update for the CIB */
    node_state = create_node_state_update(peer, flags, NULL, __func__);

    /* we have to mark whether or not remote nodes have already been fenced */
    if (peer->flags & crm_remote_node) {
        char *now_s = pcmk__ttoa(time(NULL));

        crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
        free(now_s);
    }

    /* Force our known ID */
    crm_xml_add(node_state, XML_ATTR_UUID, uuid);

    rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
                                    cib_quorum_override | cib_scope_local | cib_can_create);

    /* Delay processing the trigger until the update completes */
    crm_debug("Sending fencing update %d for %s", rc, target);
    // The callback receives its own copy of the target name as user data
    fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);

    /* Make sure it sticks */
    /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */

    controld_delete_node_state(peer->uname, controld_section_all,
                               cib_scope_local);
    free_xml(node_state);
    return;
}
 240 
 241 /*!
 242  * \internal
 243  * \brief Abort transition due to stonith failure
 244  *
 245  * \param[in] abort_action  Whether to restart or stop transition
 246  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 247  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 248  */
 249 static void
 250 abort_for_stonith_failure(enum pcmk__graph_next abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 251                           const char *target, const xmlNode *reason)
 252 {
 253     /* If stonith repeatedly fails, we eventually give up on starting a new
 254      * transition for that reason.
 255      */
 256     if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
 257         abort_action = pcmk__graph_wait;
 258     }
 259     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
 260 }
 261 
 262 
 263 /*
 264  * stonith cleanup list
 265  *
 266  * If the DC is shot, proper notifications might not go out.
 267  * The stonith cleanup list allows the cluster to (re-)send
 268  * notifications once a new DC is elected.
 269  */
 270 
// Node names queued by add_stonith_cleanup(); each entry is a heap-allocated string
static GList *stonith_cleanup_list = NULL;
 272 
 273 /*!
 274  * \internal
 275  * \brief Add a node to the stonith cleanup list
 276  *
 277  * \param[in] target  Name of node to add
 278  */
 279 void
 280 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 281     stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
 282 }
 283 
 284 /*!
 285  * \internal
 286  * \brief Remove a node from the stonith cleanup list
 287  *
 288  * \param[in] Name of node to remove
 289  */
 290 void
 291 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 292 {
 293     GList *iter = stonith_cleanup_list;
 294 
 295     while (iter != NULL) {
 296         GList *tmp = iter;
 297         char *iter_name = tmp->data;
 298 
 299         iter = iter->next;
 300         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 301             crm_trace("Removing %s from the cleanup list", iter_name);
 302             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 303             free(iter_name);
 304         }
 305     }
 306 }
 307 
 308 /*!
 309  * \internal
 310  * \brief Purge all entries from the stonith cleanup list
 311  */
 312 void
 313 purge_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 314 {
 315     if (stonith_cleanup_list) {
 316         GList *iter = NULL;
 317 
 318         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 319             char *target = iter->data;
 320 
 321             crm_info("Purging %s from stonith cleanup list", target);
 322             free(target);
 323         }
 324         g_list_free(stonith_cleanup_list);
 325         stonith_cleanup_list = NULL;
 326     }
 327 }
 328 
 329 /*!
 330  * \internal
 331  * \brief Send stonith updates for all entries in cleanup list, then purge it
 332  */
 333 void
 334 execute_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 335 {
 336     GList *iter;
 337 
 338     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 339         char *target = iter->data;
 340         crm_node_t *target_node = crm_get_peer(0, target);
 341         const char *uuid = crm_peer_uuid(target_node);
 342 
 343         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 344         send_stonith_update(NULL, target, uuid);
 345         free(target);
 346     }
 347     g_list_free(stonith_cleanup_list);
 348     stonith_cleanup_list = NULL;
 349 }
 350 
 351 /* end stonith cleanup list functions */
 352 
 353 
 354 /* stonith API client
 355  *
 356  * Functions that need to interact directly with the fencer via its API
 357  */
 358 
// Fencer API connection object, created on the first connection attempt
static stonith_t *stonith_api = NULL;
// Main-loop trigger used to (re)try connecting to the fencer
static crm_trigger_t *stonith_reconnect = NULL;
// Client ID of the form "<system name>.<pid>", built on first fence notification
static char *te_client_id = NULL;
 362 
 363 static gboolean
 364 fail_incompletable_stonith(pcmk__graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 365 {
 366     GList *lpc = NULL;
 367     const char *task = NULL;
 368     xmlNode *last_action = NULL;
 369 
 370     if (graph == NULL) {
 371         return FALSE;
 372     }
 373 
 374     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 375         GList *lpc2 = NULL;
 376         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;
 377 
 378         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 379             continue;
 380         }
 381 
 382         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 383             pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;
 384 
 385             if ((action->type != pcmk__cluster_graph_action)
 386                 || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 387                 continue;
 388             }
 389 
 390             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 391             if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
 392                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 393                 last_action = action->xml;
 394                 pcmk__update_graph(graph, action);
 395                 crm_notice("Failing action %d (%s): fencer terminated",
 396                            action->id, ID(action->xml));
 397             }
 398         }
 399     }
 400 
 401     if (last_action != NULL) {
 402         crm_warn("Fencer failure resulted in unrunnable actions");
 403         abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
 404         return TRUE;
 405     }
 406 
 407     return FALSE;
 408 }
 409 
/*!
 * \internal
 * \brief Handle a disconnect notification from the fencer
 *
 * Cleans up history-sync state, schedules a reconnect if the fencer is still
 * required, drops registered notifications, and (on the DC) fails fencing
 * actions that can no longer complete.
 *
 * \param[in] st  Fencing API connection that was lost
 * \param[in] e   Event notification (not used by this function)
 */
static void
tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
{
    te_cleanup_stonith_history_sync(st, FALSE);

    if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
        crm_crit("Fencing daemon connection failed");
        // Retry the connection from the main loop
        mainloop_set_trigger(stonith_reconnect);

    } else {
        crm_info("Fencing daemon disconnected");
    }

    if (stonith_api) {
        /* the client API won't properly reconnect notifications
         * if they are still in the table - so remove them
         */
        if (stonith_api->state != stonith_disconnected) {
            stonith_api->cmds->disconnect(st);
        }
        stonith_api->cmds->remove_notification(stonith_api, NULL);
    }

    if (AM_I_DC) {
        // Pending fence actions cannot complete without the fencer
        fail_incompletable_stonith(transition_graph);
        trigger_graph();
    }
}
 438 
/*!
 * \internal
 * \brief Handle an event notification from the fencing API
 *
 * Logs the result, sends a fencing alert, keeps the local fail counts current
 * when not DC, reacts to notifications of our own fencing, and for successful
 * fencing of a known peer updates the CIB and peer caches as appropriate.
 *
 * \param[in] st     Fencing API connection (ignored)
 * \param[in] event  Fencing API event notification
 */
static void
handle_fence_notification(stonith_t *st, stonith_event_t *event)
{
    bool succeeded = true;
    const char *executioner = "the cluster";
    const char *client = "a client";
    const char *reason = NULL;
    int exec_status;

    // Build our client ID once; it is compared against the event's client below
    if (te_client_id == NULL) {
        te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
                                         (unsigned long) getpid());
    }

    if (event == NULL) {
        crm_err("Notify data not found");
        return;
    }

    // Prefer names from the event over the generic placeholders
    if (event->executioner != NULL) {
        executioner = event->executioner;
    }
    if (event->client_origin != NULL) {
        client = event->client_origin;
    }

    // Success requires both a "done" execution status and an OK exit status
    exec_status = stonith__event_execution_status(event);
    if ((stonith__event_exit_status(event) != CRM_EX_OK)
        || (exec_status != PCMK_EXEC_DONE)) {
        succeeded = false;
        if (exec_status == PCMK_EXEC_DONE) {
            // Completed but with a bad exit status: report it as an error
            exec_status = PCMK_EXEC_ERROR;
        }
    }
    reason = stonith__event_exit_reason(event);

    crmd_alert_fencing_op(event);

    if (pcmk__str_eq("on", event->action, pcmk__str_none)) {
        // Unfencing doesn't need special handling, just a log message
        if (succeeded) {
            crm_notice("%s was unfenced by %s at the request of %s@%s",
                       event->target, executioner, client, event->origin);
        } else {
            crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
                    event->target, executioner,
                    pcmk_exec_status_str(exec_status),
                    ((reason == NULL)? "" : ": "),
                    ((reason == NULL)? "" : reason),
                    stonith__event_exit_status(event));
        }
        return;
    }

    if (succeeded
        && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) {
        /* We were notified of our own fencing. Most likely, either fencing was
         * misconfigured, or fabric fencing that doesn't cut cluster
         * communication is in use.
         *
         * Either way, shutting down the local host is a good idea, to require
         * administrator intervention. Also, other nodes would otherwise likely
         * set our status to lost because of the fencing callback and discard
         * our subsequent election votes as "not part of our cluster".
         */
        crm_crit("We were allegedly just fenced by %s for %s!",
                 executioner, event->origin); // Dumps blackbox if enabled
        if (fence_reaction_panic) {
            pcmk__panic(__func__);
        } else {
            crm_exit(CRM_EX_FATAL);
        }
        return; // Should never get here
    }

    /* Update the count of fencing failures for this target, in case we become
     * DC later. The current DC has already updated its fail count in
     * tengine_stonith_callback().
     */
    if (!AM_I_DC) {
        if (succeeded) {
            st_fail_count_reset(event->target);
        } else {
            st_fail_count_increment(event->target);
        }
    }

    crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
               "%s%s%s%s " CRM_XS " event=%s",
               event->target, (succeeded? "" : " not"),
               event->action, executioner, client, event->origin,
               (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
               ((reason == NULL)? "" : " ("),
               ((reason == NULL)? "" : reason),
               ((reason == NULL)? "" : ")"),
               event->id);

    if (succeeded) {
        crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
                                                         CRM_GET_PEER_ANY);
        const char *uuid = NULL;

        // Nothing more to do for targets that are not in the known-node cache
        if (peer == NULL) {
            return;
        }

        uuid = crm_peer_uuid(peer);

        if (AM_I_DC) {
            /* The DC always sends updates */
            send_stonith_update(NULL, event->target, uuid);

            /* @TODO Ideally, at this point, we'd check whether the fenced node
             * hosted any guest nodes, and call remote_node_down() for them.
             * Unfortunately, the controller doesn't have a simple, reliable way
             * to map hosts to guests. It might be possible to track this in the
             * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
             * on the scheduler creating fence pseudo-events for the guests.
             */

            if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
                /* Abort the current transition if it wasn't the cluster that
                 * initiated fencing.
                 */
                crm_info("External fencing operation from %s fenced %s",
                         client, event->target);
                abort_transition(INFINITY, pcmk__graph_restart,
                                 "External Fencing Operation", NULL);
            }

            /* Assume it was our leader if we don't currently have one */
        } else if (pcmk__str_eq(fsa_our_dc, event->target,
                                pcmk__str_null_matches|pcmk__str_casei)
                   && !pcmk_is_set(peer->flags, crm_remote_node)) {

            crm_notice("Fencing target %s %s our leader",
                       event->target, (fsa_our_dc? "was" : "may have been"));

            /* Given the CIB resyncing that occurs around elections,
             * have one node update the CIB now and, if the new DC is different,
             * have them do so too after the election
             */
            if (pcmk__str_eq(event->executioner, fsa_our_uname,
                             pcmk__str_casei)) {
                send_stonith_update(NULL, event->target, uuid);
            }
            add_stonith_cleanup(event->target);
        }

        /* If the target is a remote node, and we host its connection,
         * immediately fail all monitors so it can be recovered quickly.
         * The connection won't necessarily drop when a remote node is fenced,
         * so the failure might not otherwise be detected until the next poke.
         */
        if (pcmk_is_set(peer->flags, crm_remote_node)) {
            remote_ra_fail(event->target);
        }

        crmd_peer_down(peer, TRUE);
     }
}
 607 
/*!
 * \brief Connect to fencer
 *
 * Creates the fencer API object if needed, connects if not already connected,
 * and on success registers the disconnect, fence, and history-synced
 * notification handlers and triggers a fencing history sync.
 *
 * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
 *
 * \return TRUE
 * \note If user_data is NULL, this will wait 2s between attempts, for up to
 *       30 attempts, meaning the controller could be blocked as long as 58s.
 */
static gboolean
te_connect_stonith(gpointer user_data)
{
    int rc = pcmk_ok;

    if (stonith_api == NULL) {
        stonith_api = stonith_api_new();
        if (stonith_api == NULL) {
            crm_err("Could not connect to fencer: API memory allocation failed");
            return TRUE;
        }
    }

    if (stonith_api->state != stonith_disconnected) {
        crm_trace("Already connected to fencer, no need to retry");
        return TRUE;
    }

    if (user_data == NULL) {
        // Blocking (retry failures now until successful)
        rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
        if (rc != pcmk_ok) {
            crm_err("Could not connect to fencer in 30 attempts: %s "
                    CRM_XS " rc=%d", pcmk_strerror(rc), rc);
        }
    } else {
        // Non-blocking (retry failures later in main loop)
        rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
        if (rc != pcmk_ok) {
            if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
                crm_notice("Fencer connection failed (will retry): %s "
                           CRM_XS " rc=%d", pcmk_strerror(rc), rc);
                mainloop_set_trigger(stonith_reconnect);
            } else {
                crm_info("Fencer connection failed (ignoring because no longer required): %s "
                         CRM_XS " rc=%d", pcmk_strerror(rc), rc);
            }
            return TRUE;
        }
    }

    // Only reached with a failure in the blocking case, which was logged above
    if (rc == pcmk_ok) {
        stonith_api->cmds->register_notification(stonith_api,
                                                 T_STONITH_NOTIFY_DISCONNECT,
                                                 tengine_stonith_connection_destroy);
        stonith_api->cmds->register_notification(stonith_api,
                                                 T_STONITH_NOTIFY_FENCE,
                                                 handle_fence_notification);
        stonith_api->cmds->register_notification(stonith_api,
                                                 T_STONITH_NOTIFY_HISTORY_SYNCED,
                                                 tengine_stonith_history_synced);
        te_trigger_stonith_history_sync(TRUE);
        crm_notice("Fencer successfully connected");
    }

    return TRUE;
}
 674 
 675 /*!
 676     \internal
 677     \brief Schedule fencer connection attempt in main loop
 678 */
 679 void
 680 controld_trigger_fencer_connect(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 681 {
 682     if (stonith_reconnect == NULL) {
 683         stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
 684                                                  te_connect_stonith,
 685                                                  GINT_TO_POINTER(TRUE));
 686     }
 687     controld_set_fsa_input_flags(R_ST_REQUIRED);
 688     mainloop_set_trigger(stonith_reconnect);
 689 }
 690 
 691 void
 692 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 693 {
 694     if (stonith_api) {
 695         // Prevent fencer connection from coming up again
 696         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 697 
 698         if (stonith_api->state != stonith_disconnected) {
 699             stonith_api->cmds->disconnect(stonith_api);
 700         }
 701         stonith_api->cmds->remove_notification(stonith_api, NULL);
 702     }
 703     if (destroy) {
 704         if (stonith_api) {
 705             stonith_api->cmds->free(stonith_api);
 706             stonith_api = NULL;
 707         }
 708         if (stonith_reconnect) {
 709             mainloop_destroy_trigger(stonith_reconnect);
 710             stonith_reconnect = NULL;
 711         }
 712         if (te_client_id) {
 713             free(te_client_id);
 714             te_client_id = NULL;
 715         }
 716     }
 717 }
 718 
 719 static gboolean
 720 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 721 {
 722     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 723         stonith_history_t *history = NULL;
 724 
 725         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 726         stonith_api->cmds->history(stonith_api,
 727                                    st_opt_sync_call | st_opt_broadcast,
 728                                    NULL, &history, 5);
 729         stonith_history_free(history);
 730         return TRUE;
 731     } else {
 732         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 733         return FALSE;
 734     }
 735 }
 736 
 737 static void
 738 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 739 {
 740     char *uuid = NULL;
 741     int stonith_id = -1;
 742     int transition_id = -1;
 743     pcmk__graph_action_t *action = NULL;
 744     const char *target = NULL;
 745 
 746     if ((data == NULL) || (data->userdata == NULL)) {
 747         crm_err("Ignoring fence operation %d result: "
 748                 "No transition key given (bug?)",
 749                 ((data == NULL)? -1 : data->call_id));
 750         return;
 751     }
 752 
 753     if (!AM_I_DC) {
 754         const char *reason = stonith__exit_reason(data);
 755 
 756         if (reason == NULL) {
 757            reason = pcmk_exec_status_str(stonith__execution_status(data));
 758         }
 759         crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
 760                    data->call_id, stonith__exit_status(data), reason,
 761                    (const char *) data->userdata);
 762         return;
 763     }
 764 
 765     CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
 766                                     &stonith_id, NULL),
 767               goto bail);
 768 
 769     if (transition_graph->complete || (stonith_id < 0)
 770         || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none)
 771         || (transition_graph->id != transition_id)) {
 772         crm_info("Ignoring fence operation %d result: "
 773                  "Not from current transition " CRM_XS
 774                  " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
 775                  data->call_id, pcmk__btoa(transition_graph->complete),
 776                  stonith_id, uuid, te_uuid, transition_id, transition_graph->id);
 777         goto bail;
 778     }
 779 
 780     action = controld_get_action(stonith_id);
 781     if (action == NULL) {
 782         crm_err("Ignoring fence operation %d result: "
 783                 "Action %d not found in transition graph (bug?) "
 784                 CRM_XS " uuid=%s transition=%d",
 785                 data->call_id, stonith_id, uuid, transition_id);
 786         goto bail;
 787     }
 788 
 789     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 790     if (target == NULL) {
 791         crm_err("Ignoring fence operation %d result: No target given (bug?)",
 792                 data->call_id);
 793         goto bail;
 794     }
 795 
 796     stop_te_timer(action);
 797     if (stonith__exit_status(data) == CRM_EX_OK) {
 798         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 799         const char *op = crm_meta_value(action->params, "stonith_action");
 800 
 801         crm_info("Fence operation %d for %s succeeded", data->call_id, target);
 802         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 803             te_action_confirmed(action, NULL);
 804             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
 805                 const char *value = NULL;
 806                 char *now = pcmk__ttoa(time(NULL));
 807                 gboolean is_remote_node = FALSE;
 808 
 809                 /* This check is not 100% reliable, since this node is not
 810                  * guaranteed to have the remote node cached. However, it
 811                  * doesn't have to be reliable, since the attribute manager can
 812                  * learn a node's "remoteness" by other means sooner or later.
 813                  * This allows it to learn more quickly if this node does have
 814                  * the information.
 815                  */
 816                 if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
 817                     is_remote_node = TRUE;
 818                 }
 819 
 820                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 821                              is_remote_node);
 822                 free(now);
 823 
 824                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
 825                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 826                              is_remote_node);
 827 
 828                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
 829                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 830                              is_remote_node);
 831 
 832             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 833                 send_stonith_update(action, target, uuid);
 834                 pcmk__set_graph_action_flags(action,
 835                                              pcmk__graph_action_sent_update);
 836             }
 837         }
 838         st_fail_count_reset(target);
 839 
 840     } else {
 841         enum pcmk__graph_next abort_action = pcmk__graph_restart;
 842         int status = stonith__execution_status(data);
 843         const char *reason = stonith__exit_reason(data);
 844 
 845         if (reason == NULL) {
 846             if (status == PCMK_EXEC_DONE) {
 847                 reason = "Agent returned error";
 848             } else {
 849                 reason = pcmk_exec_status_str(status);
 850             }
 851         }
 852         pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 853 
 854         /* If no fence devices were available, there's no use in immediately
 855          * checking again, so don't start a new transition in that case.
 856          */
 857         if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
 858             crm_warn("Fence operation %d for %s failed: %s "
 859                      "(aborting transition and giving up for now)",
 860                      data->call_id, target, reason);
 861             abort_action = pcmk__graph_wait;
 862         } else {
 863             crm_notice("Fence operation %d for %s failed: %s "
 864                        "(aborting transition)", data->call_id, target, reason);
 865         }
 866 
 867         /* Increment the fail count now, so abort_for_stonith_failure() can
 868          * check it. Non-DC nodes will increment it in
 869          * handle_fence_notification().
 870          */
 871         st_fail_count_increment(target);
 872         abort_for_stonith_failure(abort_action, target, NULL);
 873     }
 874 
 875     pcmk__update_graph(transition_graph, action);
 876     trigger_graph();
 877 
 878   bail:
 879     free(data->userdata);
 880     free(uuid);
 881     return;
 882 }
 883 
 884 static int
 885 fence_with_delay(const char *target, const char *type, const char *delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 886 {
 887     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 888     int timeout_sec = (int) (transition_graph->stonith_timeout / 1000);
 889     int delay_i;
 890 
 891     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 892         stonith__set_call_options(options, target, st_opt_allow_suicide);
 893     }
 894     pcmk__scan_min_int(delay, &delay_i, 0);
 895     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 896                                                type, timeout_sec, 0, delay_i);
 897 }
 898 
 899 /*!
 900  * \internal
 901  * \brief Execute a fencing action from a transition graph
 902  *
 903  * \param[in] graph   Transition graph being executed (ignored)
 904  * \param[in] action  Fencing action to execute
 905  *
 906  * \return Standard Pacemaker return code
 907  */
 908 int
 909 controld_execute_fence_action(pcmk__graph_t *graph,
     /* [previous][next][first][last][top][bottom][index][help] */
 910                               pcmk__graph_action_t *action)
 911 {
 912     int rc = 0;
 913     const char *id = NULL;
 914     const char *uuid = NULL;
 915     const char *target = NULL;
 916     const char *type = NULL;
 917     char *transition_key = NULL;
 918     const char *priority_delay = NULL;
 919     gboolean invalid_action = FALSE;
 920 
 921     id = ID(action->xml);
 922     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 923     uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 924     type = crm_meta_value(action->params, "stonith_action");
 925 
 926     CRM_CHECK(id != NULL, invalid_action = TRUE);
 927     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 928     CRM_CHECK(type != NULL, invalid_action = TRUE);
 929     CRM_CHECK(target != NULL, invalid_action = TRUE);
 930 
 931     if (invalid_action) {
 932         crm_log_xml_warn(action->xml, "BadAction");
 933         return EPROTO;
 934     }
 935 
 936     priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 937 
 938     crm_notice("Requesting fencing (%s) of node %s "
 939                CRM_XS " action=%s timeout=%u%s%s",
 940                type, target, id, transition_graph->stonith_timeout,
 941                priority_delay ? " priority_delay=" : "",
 942                priority_delay ? priority_delay : "");
 943 
 944     /* Passing NULL means block until we can connect... */
 945     te_connect_stonith(NULL);
 946 
 947     rc = fence_with_delay(target, type, priority_delay);
 948     transition_key = pcmk__transition_key(transition_graph->id, action->id, 0,
 949                                           te_uuid),
 950     stonith_api->cmds->register_callback(stonith_api, rc,
 951                                          (int) (transition_graph->stonith_timeout / 1000),
 952                                          st_opt_timeout_updates, transition_key,
 953                                          "tengine_stonith_callback", tengine_stonith_callback);
 954     return pcmk_rc_ok;
 955 }
 956 
 957 bool
 958 controld_verify_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 959 {
 960     gboolean rv = TRUE;
 961 
 962     if (stonith_api && (stonith_api->state != stonith_disconnected) &&
 963         stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
 964                                                        fsa_our_uname)) {
 965         rv = pcmk__valid_sbd_timeout(value);
 966     }
 967     return rv;
 968 }
 969 
 970 /* end stonith API client functions */
 971 
 972 
 973 /*
 974  * stonith history synchronization
 975  *
 976  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
 977  * joins or leaves, we need to synchronize the history across all nodes.
 978  */
 979 
 980 static crm_trigger_t *stonith_history_sync_trigger = NULL; // Created on first use in te_trigger_stonith_history_sync(); runs do_stonith_history_sync
 981 static mainloop_timer_t *stonith_history_sync_timer_short = NULL; // "history_sync_short" timer (5000), created on first use; sets the trigger above
 982 static mainloop_timer_t *stonith_history_sync_timer_long = NULL; // "history_sync_long" timer (30000), created on first use; sets the trigger above
 983 
 984 void
 985 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
 986 {
 987     if (free_timers) {
 988         mainloop_timer_del(stonith_history_sync_timer_short);
 989         stonith_history_sync_timer_short = NULL;
 990         mainloop_timer_del(stonith_history_sync_timer_long);
 991         stonith_history_sync_timer_long = NULL;
 992     } else {
 993         mainloop_timer_stop(stonith_history_sync_timer_short);
 994         mainloop_timer_stop(stonith_history_sync_timer_long);
 995     }
 996 
 997     if (st) {
 998         st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
 999     }
1000 }
1001 
1002 static void
1003 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
1004 {
1005     te_cleanup_stonith_history_sync(st, FALSE);
1006     crm_debug("Fence-history synced - cancel all timers");
1007 }
1008 
1009 static gboolean
1010 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1011 {
1012     mainloop_set_trigger(stonith_history_sync_trigger);
1013     return FALSE;
1014 }
1015 
1016 void
1017 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1018 {
1019     /* trigger a sync in 5s to give more nodes the
1020      * chance to show up so that we don't create
1021      * unnecessary stonith-history-sync traffic
1022      *
1023      * the long timeout of 30s is there as a fallback
1024      * so that after a successful connection to fenced
1025      * we will wait for 30s for the DC to trigger a
1026      * history-sync
1027      * if this doesn't happen we trigger a sync locally
1028      * (e.g. fenced segfaults and is restarted by pacemakerd)
1029      */
1030 
1031     /* as we are finally checking the stonith-connection
1032      * in do_stonith_history_sync we should be fine
1033      * leaving stonith_history_sync_time & stonith_history_sync_trigger
1034      * around
1035      */
1036     if (stonith_history_sync_trigger == NULL) {
1037         stonith_history_sync_trigger =
1038             mainloop_add_trigger(G_PRIORITY_LOW,
1039                                  do_stonith_history_sync, NULL);
1040     }
1041 
1042     if (long_timeout) {
1043         if(stonith_history_sync_timer_long == NULL) {
1044             stonith_history_sync_timer_long =
1045                 mainloop_timer_add("history_sync_long", 30000,
1046                                    FALSE, stonith_history_sync_set_trigger,
1047                                    NULL);
1048         }
1049         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1050         mainloop_timer_start(stonith_history_sync_timer_long);
1051     } else {
1052         if(stonith_history_sync_timer_short == NULL) {
1053             stonith_history_sync_timer_short =
1054                 mainloop_timer_add("history_sync_short", 5000,
1055                                    FALSE, stonith_history_sync_set_trigger,
1056                                    NULL);
1057         }
1058         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1059         mainloop_timer_start(stonith_history_sync_timer_short);
1060     }
1061 
1062 }
1063 
1064 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */