pacemaker  2.1.6-802a72226b
Scalable High-Availability cluster resource manager
membership.c
Go to the documentation of this file.
1 /*
2  * Copyright 2004-2023 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
#include <crm_internal.h>

#ifndef _GNU_SOURCE
#  define _GNU_SOURCE
#endif

#include <sys/param.h>
#include <sys/types.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <glib.h>
#include <crm/common/ipc.h>
#include <crm/common/xml_internal.h>
#include <crm/cluster/internal.h>
#include <crm/msg_xml.h>
#include <crm/stonith-ng.h>
#include "crmcluster_private.h"
28 
29 /* The peer cache remembers cluster nodes that have been seen.
30  * This is managed mostly automatically by libcluster, based on
31  * cluster membership events.
32  *
33  * Because cluster nodes can have conflicting names or UUIDs,
34  * the hash table key is a uniquely generated ID.
35  */
36 GHashTable *crm_peer_cache = NULL;
37 
38 /*
39  * The remote peer cache tracks pacemaker_remote nodes. While the
40  * value has the same type as the peer cache's, it is tracked separately for
41  * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs,
42  * so the name (which is also the UUID) is used as the hash table key; there
43  * is no equivalent of membership events, so management is not automatic; and
44  * most users of the peer cache need to exclude pacemaker_remote nodes.
45  *
46  * That said, using a single cache would be more logical and less error-prone,
47  * so it would be a good idea to merge them one day.
48  *
49  * libcluster provides two avenues for populating the cache:
50  * crm_remote_peer_get() and crm_remote_peer_cache_remove() directly manage it,
51  * while crm_remote_peer_cache_refresh() populates it via the CIB.
52  */
53 GHashTable *crm_remote_peer_cache = NULL;
54 
55 /*
56  * The known node cache tracks cluster and remote nodes that have been seen in
57  * the CIB. It is useful mainly when a caller needs to know about a node that
58  * may no longer be in the membership, but doesn't want to add the node to the
59  * main peer cache tables.
60  */
61 static GHashTable *known_node_cache = NULL;
62 
63 unsigned long long crm_peer_seq = 0;
64 gboolean crm_have_quorum = FALSE;
65 static gboolean crm_autoreap = TRUE;
66 
// Helpers for setting and clearing bits in crm_node_t:flags

/* Set flags_to_set in (peer)->flags via pcmk__set_flags_as(), which is also
 * handed the call site, LOG_TRACE, the peer's uname and the stringified
 * flag expression
 */
#define set_peer_flags(peer, flags_to_set) do {                               \
        (peer)->flags = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE,     \
                                           "Peer", (peer)->uname,             \
                                           (peer)->flags, (flags_to_set),     \
                                           #flags_to_set);                    \
    } while (0)
75 
/* Clear flags_to_clear in (peer)->flags via pcmk__clear_flags_as(), which is
 * also handed the call site, LOG_TRACE, the peer's uname and the stringified
 * flag expression
 */
#define clear_peer_flags(peer, flags_to_clear) do {                           \
        (peer)->flags = pcmk__clear_flags_as(__func__, __LINE__,              \
                                             LOG_TRACE,                       \
                                             "Peer", (peer)->uname,           \
                                             (peer)->flags, (flags_to_clear), \
                                             #flags_to_clear);                \
    } while (0)
83 
84 static void update_peer_uname(crm_node_t *node, const char *uname);
85 
86 int
88 {
89  if (crm_remote_peer_cache == NULL) {
90  return 0;
91  }
92  return g_hash_table_size(crm_remote_peer_cache);
93 }
94 
106 crm_node_t *
107 crm_remote_peer_get(const char *node_name)
108 {
109  crm_node_t *node;
110 
111  if (node_name == NULL) {
112  errno = -EINVAL;
113  return NULL;
114  }
115 
116  /* Return existing cache entry if one exists */
117  node = g_hash_table_lookup(crm_remote_peer_cache, node_name);
118  if (node) {
119  return node;
120  }
121 
122  /* Allocate a new entry */
123  node = calloc(1, sizeof(crm_node_t));
124  if (node == NULL) {
125  return NULL;
126  }
127 
128  /* Populate the essential information */
130  node->uuid = strdup(node_name);
131  if (node->uuid == NULL) {
132  free(node);
133  errno = -ENOMEM;
134  return NULL;
135  }
136 
137  /* Add the new entry to the cache */
138  g_hash_table_replace(crm_remote_peer_cache, node->uuid, node);
139  crm_trace("added %s to remote cache", node_name);
140 
141  /* Update the entry's uname, ensuring peer status callbacks are called */
142  update_peer_uname(node, node_name);
143  return node;
144 }
145 
146 void
147 crm_remote_peer_cache_remove(const char *node_name)
148 {
149  if (g_hash_table_remove(crm_remote_peer_cache, node_name)) {
150  crm_trace("removed %s from remote peer cache", node_name);
151  }
152 }
153 
165 static const char *
166 remote_state_from_cib(const xmlNode *node_state)
167 {
168  bool status = false;
169 
170  if (pcmk__xe_get_bool_attr(node_state, XML_NODE_IN_CLUSTER, &status) == pcmk_rc_ok && !status) {
171  return CRM_NODE_LOST;
172  } else {
173  return CRM_NODE_MEMBER;
174  }
175 }
176 
177 /* user data for looping through remote node xpath searches */
178 struct refresh_data {
179  const char *field; /* XML attribute to check for node name */
180  gboolean has_state; /* whether to update node state based on XML */
181 };
182 
190 static void
191 remote_cache_refresh_helper(xmlNode *result, void *user_data)
192 {
193  const struct refresh_data *data = user_data;
194  const char *remote = crm_element_value(result, data->field);
195  const char *state = NULL;
196  crm_node_t *node;
197 
198  CRM_CHECK(remote != NULL, return);
199 
200  /* Determine node's state, if the result has it */
201  if (data->has_state) {
202  state = remote_state_from_cib(result);
203  }
204 
205  /* Check whether cache already has entry for node */
206  node = g_hash_table_lookup(crm_remote_peer_cache, remote);
207 
208  if (node == NULL) {
209  /* Node is not in cache, so add a new entry for it */
210  node = crm_remote_peer_get(remote);
211  CRM_ASSERT(node);
212  if (state) {
213  pcmk__update_peer_state(__func__, node, state, 0);
214  }
215 
216  } else if (pcmk_is_set(node->flags, crm_node_dirty)) {
217  /* Node is in cache and hasn't been updated already, so mark it clean */
219  if (state) {
220  pcmk__update_peer_state(__func__, node, state, 0);
221  }
222  }
223 }
224 
225 static void
226 mark_dirty(gpointer key, gpointer value, gpointer user_data)
227 {
229 }
230 
231 static gboolean
232 is_dirty(gpointer key, gpointer value, gpointer user_data)
233 {
234  return pcmk_is_set(((crm_node_t*)value)->flags, crm_node_dirty);
235 }
236 
242 void
244 {
245  struct refresh_data data;
246 
247  crm_peer_init();
248 
249  /* First, we mark all existing cache entries as dirty,
250  * so that later we can remove any that weren't in the CIB.
251  * We don't empty the cache, because we need to detect changes in state.
252  */
253  g_hash_table_foreach(crm_remote_peer_cache, mark_dirty, NULL);
254 
255  /* Look for guest nodes and remote nodes in the status section */
256  data.field = "id";
257  data.has_state = TRUE;
259  remote_cache_refresh_helper, &data);
260 
261  /* Look for guest nodes and remote nodes in the configuration section,
262  * because they may have just been added and not have a status entry yet.
263  * In that case, the cached node state will be left NULL, so that the
264  * peer status callback isn't called until we're sure the node started
265  * successfully.
266  */
267  data.field = "value";
268  data.has_state = FALSE;
270  remote_cache_refresh_helper, &data);
271  data.field = "id";
272  data.has_state = FALSE;
274  remote_cache_refresh_helper, &data);
275 
276  /* Remove all old cache entries that weren't seen in the CIB */
277  g_hash_table_foreach_remove(crm_remote_peer_cache, is_dirty, NULL);
278 }
279 
280 gboolean
282 {
283  if(node == NULL) {
284  return FALSE;
285  }
286 
287  if (pcmk_is_set(node->flags, crm_remote_node)) {
288  /* remote nodes are never considered active members. This
289  * guarantees they will never be considered for DC membership.*/
290  return FALSE;
291  }
292 #if SUPPORT_COROSYNC
293  if (is_corosync_cluster()) {
294  return crm_is_corosync_peer_active(node);
295  }
296 #endif
297  crm_err("Unhandled cluster type: %s", name_for_cluster_type(get_cluster_type()));
298  return FALSE;
299 }
300 
301 static gboolean
302 crm_reap_dead_member(gpointer key, gpointer value, gpointer user_data)
303 {
304  crm_node_t *node = value;
305  crm_node_t *search = user_data;
306 
307  if (search == NULL) {
308  return FALSE;
309 
310  } else if (search->id && node->id != search->id) {
311  return FALSE;
312 
313  } else if (search->id == 0 && !pcmk__str_eq(node->uname, search->uname, pcmk__str_casei)) {
314  return FALSE;
315 
316  } else if (crm_is_peer_active(value) == FALSE) {
317  crm_info("Removing node with name %s and id %u from membership cache",
318  (node->uname? node->uname : "unknown"), node->id);
319  return TRUE;
320  }
321  return FALSE;
322 }
323 
332 guint
333 reap_crm_member(uint32_t id, const char *name)
334 {
335  int matches = 0;
336  crm_node_t search = { 0, };
337 
338  if (crm_peer_cache == NULL) {
339  crm_trace("Membership cache not initialized, ignoring purge request");
340  return 0;
341  }
342 
343  search.id = id;
344  pcmk__str_update(&search.uname, name);
345  matches = g_hash_table_foreach_remove(crm_peer_cache, crm_reap_dead_member, &search);
346  if(matches) {
347  crm_notice("Purged %d peer%s with id=%u%s%s from the membership cache",
348  matches, pcmk__plural_s(matches), search.id,
349  (search.uname? " and/or uname=" : ""),
350  (search.uname? search.uname : ""));
351 
352  } else {
353  crm_info("No peers with id=%u%s%s to purge from the membership cache",
354  search.id, (search.uname? " and/or uname=" : ""),
355  (search.uname? search.uname : ""));
356  }
357 
358  free(search.uname);
359  return matches;
360 }
361 
362 static void
363 count_peer(gpointer key, gpointer value, gpointer user_data)
364 {
365  guint *count = user_data;
366  crm_node_t *node = value;
367 
368  if (crm_is_peer_active(node)) {
369  *count = *count + 1;
370  }
371 }
372 
373 guint
375 {
376  guint count = 0;
377 
378  if (crm_peer_cache) {
379  g_hash_table_foreach(crm_peer_cache, count_peer, &count);
380  }
381  return count;
382 }
383 
384 static void
385 destroy_crm_node(gpointer data)
386 {
387  crm_node_t *node = data;
388 
389  crm_trace("Destroying entry for node %u: %s", node->id, node->uname);
390 
391  free(node->uname);
392  free(node->state);
393  free(node->uuid);
394  free(node->expected);
395  free(node->conn_host);
396  free(node);
397 }
398 
399 void
401 {
402  if (crm_peer_cache == NULL) {
403  crm_peer_cache = pcmk__strikey_table(free, destroy_crm_node);
404  }
405 
406  if (crm_remote_peer_cache == NULL) {
407  crm_remote_peer_cache = pcmk__strikey_table(NULL, destroy_crm_node);
408  }
409 
410  if (known_node_cache == NULL) {
411  known_node_cache = pcmk__strikey_table(free, destroy_crm_node);
412  }
413 }
414 
415 void
417 {
418  if (crm_peer_cache != NULL) {
419  crm_trace("Destroying peer cache with %d members", g_hash_table_size(crm_peer_cache));
420  g_hash_table_destroy(crm_peer_cache);
421  crm_peer_cache = NULL;
422  }
423 
424  if (crm_remote_peer_cache != NULL) {
425  crm_trace("Destroying remote peer cache with %d members", g_hash_table_size(crm_remote_peer_cache));
426  g_hash_table_destroy(crm_remote_peer_cache);
427  crm_remote_peer_cache = NULL;
428  }
429 
430  if (known_node_cache != NULL) {
431  crm_trace("Destroying known node cache with %d members",
432  g_hash_table_size(known_node_cache));
433  g_hash_table_destroy(known_node_cache);
434  known_node_cache = NULL;
435  }
436 
437 }
438 
439 static void (*peer_status_callback)(enum crm_status_type, crm_node_t *,
440  const void *) = NULL;
441 
452 void
453 crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *))
454 {
455  peer_status_callback = dispatch;
456 }
457 
469 void
470 crm_set_autoreap(gboolean autoreap)
471 {
472  crm_autoreap = autoreap;
473 }
474 
475 static void
476 dump_peer_hash(int level, const char *caller)
477 {
478  GHashTableIter iter;
479  const char *id = NULL;
480  crm_node_t *node = NULL;
481 
482  g_hash_table_iter_init(&iter, crm_peer_cache);
483  while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) {
484  do_crm_log(level, "%s: Node %u/%s = %p - %s", caller, node->id, node->uname, node, id);
485  }
486 }
487 
488 static gboolean
489 hash_find_by_data(gpointer key, gpointer value, gpointer user_data)
490 {
491  return value == user_data;
492 }
493 
504 crm_node_t *
505 pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags)
506 {
507  crm_node_t *node = NULL;
508 
509  CRM_ASSERT(id > 0 || uname != NULL);
510 
511  crm_peer_init();
512 
513  if ((uname != NULL) && pcmk_is_set(flags, CRM_GET_PEER_REMOTE)) {
514  node = g_hash_table_lookup(crm_remote_peer_cache, uname);
515  }
516 
517  if ((node == NULL) && pcmk_is_set(flags, CRM_GET_PEER_CLUSTER)) {
519  }
520  return node;
521 }
522 
532 crm_node_t *
533 crm_get_peer_full(unsigned int id, const char *uname, int flags)
534 {
535  crm_node_t *node = NULL;
536 
537  CRM_ASSERT(id > 0 || uname != NULL);
538 
539  crm_peer_init();
540 
542  node = g_hash_table_lookup(crm_remote_peer_cache, uname);
543  }
544 
545  if ((node == NULL) && pcmk_is_set(flags, CRM_GET_PEER_CLUSTER)) {
546  node = crm_get_peer(id, uname);
547  }
548  return node;
549 }
550 
560 crm_node_t *
561 pcmk__search_cluster_node_cache(unsigned int id, const char *uname)
562 {
563  GHashTableIter iter;
564  crm_node_t *node = NULL;
565  crm_node_t *by_id = NULL;
566  crm_node_t *by_name = NULL;
567 
568  CRM_ASSERT(id > 0 || uname != NULL);
569 
570  crm_peer_init();
571 
572  if (uname != NULL) {
573  g_hash_table_iter_init(&iter, crm_peer_cache);
574  while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
575  if(node->uname && strcasecmp(node->uname, uname) == 0) {
576  crm_trace("Name match: %s = %p", node->uname, node);
577  by_name = node;
578  break;
579  }
580  }
581  }
582 
583  if (id > 0) {
584  g_hash_table_iter_init(&iter, crm_peer_cache);
585  while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
586  if(node->id == id) {
587  crm_trace("ID match: %u = %p", node->id, node);
588  by_id = node;
589  break;
590  }
591  }
592  }
593 
594  node = by_id; /* Good default */
595  if(by_id == by_name) {
596  /* Nothing to do if they match (both NULL counts) */
597  crm_trace("Consistent: %p for %u/%s", by_id, id, uname);
598 
599  } else if(by_id == NULL && by_name) {
600  crm_trace("Only one: %p for %u/%s", by_name, id, uname);
601 
602  if(id && by_name->id) {
603  dump_peer_hash(LOG_WARNING, __func__);
604  crm_crit("Node %u and %u share the same name '%s'",
605  id, by_name->id, uname);
606  node = NULL; /* Create a new one */
607 
608  } else {
609  node = by_name;
610  }
611 
612  } else if(by_name == NULL && by_id) {
613  crm_trace("Only one: %p for %u/%s", by_id, id, uname);
614 
615  if(uname && by_id->uname) {
616  dump_peer_hash(LOG_WARNING, __func__);
617  crm_crit("Node '%s' and '%s' share the same cluster nodeid %u: assuming '%s' is correct",
618  uname, by_id->uname, id, uname);
619  }
620 
621  } else if(uname && by_id->uname) {
622  if(pcmk__str_eq(uname, by_id->uname, pcmk__str_casei)) {
623  crm_notice("Node '%s' has changed its ID from %u to %u", by_id->uname, by_name->id, by_id->id);
624  g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name);
625 
626  } else {
627  crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname);
628  dump_peer_hash(LOG_INFO, __func__);
629  crm_abort(__FILE__, __func__, __LINE__, "member weirdness", TRUE,
630  TRUE);
631  }
632 
633  } else if(id && by_name->id) {
634  crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname);
635 
636  } else {
637  /* Simple merge */
638 
639  /* Only corosync-based clusters use node IDs. The functions that call
640  * pcmk__update_peer_state() and crm_update_peer_proc() only know
641  * nodeid, so 'by_id' is authoritative when merging.
642  */
643  dump_peer_hash(LOG_DEBUG, __func__);
644 
645  crm_info("Merging %p into %p", by_name, by_id);
646  g_hash_table_foreach_remove(crm_peer_cache, hash_find_by_data, by_name);
647  }
648 
649  return node;
650 }
651 
#if SUPPORT_COROSYNC
/*!
 * \internal
 * \brief Remove inactive cached peers whose name conflicts with a node's
 *
 * A cached entry conflicts when it has a nonzero ID different from
 * \p node's and a name equal (ignoring case) to \p node's. Conflicting
 * entries that are still active are kept.
 *
 * \param[in] node  Node whose ID and name are authoritative
 *
 * \return Number of cache entries removed
 */
static guint
remove_conflicting_peer(crm_node_t *node)
{
    int matches = 0;
    GHashTableIter iter;
    crm_node_t *existing_node = NULL;

    if (node->id == 0 || node->uname == NULL) {
        return 0;
    }

    /* NOTE(review): the condition of this guard was missing from the listing
     * (only its "return 0" body survived). Reconstructed as a check of the
     * corosync "nodelist" configuration -- confirm against upstream.
     */
    if (corosync_cmap_has_config("nodelist") != 0) {
        return 0;
    }

    g_hash_table_iter_init(&iter, crm_peer_cache);
    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &existing_node)) {
        if (existing_node->id > 0
            && existing_node->id != node->id
            && existing_node->uname != NULL
            && strcasecmp(existing_node->uname, node->uname) == 0) {

            if (crm_is_peer_active(existing_node)) {
                continue;
            }

            crm_warn("Removing cached offline node %u/%s which has conflicting uname with %u",
                     existing_node->id, existing_node->uname, node->id);

            // Removing through the iterator keeps the iteration valid
            g_hash_table_iter_remove(&iter);
            matches++;
        }
    }

    return matches;
}
#endif
690 
699 /* coverity[-alloc] Memory is referenced in one or both hashtables */
700 crm_node_t *
701 crm_get_peer(unsigned int id, const char *uname)
702 {
703  crm_node_t *node = NULL;
704  char *uname_lookup = NULL;
705 
706  CRM_ASSERT(id > 0 || uname != NULL);
707 
708  crm_peer_init();
709 
711 
712  /* if uname wasn't provided, and find_peer did not turn up a uname based on id.
713  * we need to do a lookup of the node name using the id in the cluster membership. */
714  if ((node == NULL || node->uname == NULL) && (uname == NULL)) {
715  uname_lookup = get_node_name(id);
716  }
717 
718  if (uname_lookup) {
719  uname = uname_lookup;
720  crm_trace("Inferred a name of '%s' for node %u", uname, id);
721 
722  /* try to turn up the node one more time now that we know the uname. */
723  if (node == NULL) {
725  }
726  }
727 
728 
729  if (node == NULL) {
730  char *uniqueid = crm_generate_uuid();
731 
732  node = calloc(1, sizeof(crm_node_t));
733  CRM_ASSERT(node);
734 
735  crm_info("Created entry %s/%p for node %s/%u (%d total)",
736  uniqueid, node, uname, id, 1 + g_hash_table_size(crm_peer_cache));
737  g_hash_table_replace(crm_peer_cache, uniqueid, node);
738  }
739 
740  if(id > 0 && uname && (node->id == 0 || node->uname == NULL)) {
741  crm_info("Node %u is now known as %s", id, uname);
742  }
743 
744  if(id > 0 && node->id == 0) {
745  node->id = id;
746  }
747 
748  if (uname && (node->uname == NULL)) {
749  update_peer_uname(node, uname);
750  }
751 
752  if(node->uuid == NULL) {
753  const char *uuid = crm_peer_uuid(node);
754 
755  if (uuid) {
756  crm_info("Node %u has uuid %s", id, uuid);
757 
758  } else {
759  crm_info("Cannot obtain a UUID for node %u/%s", id, node->uname);
760  }
761  }
762 
763  free(uname_lookup);
764 
765  return node;
766 }
767 
779 static void
780 update_peer_uname(crm_node_t *node, const char *uname)
781 {
782  CRM_CHECK(uname != NULL,
783  crm_err("Bug: can't update node name without name"); return);
784  CRM_CHECK(node != NULL,
785  crm_err("Bug: can't update node name to %s without node", uname);
786  return);
787 
788  if (pcmk__str_eq(uname, node->uname, pcmk__str_casei)) {
789  crm_debug("Node uname '%s' did not change", uname);
790  return;
791  }
792 
793  for (const char *c = uname; *c; ++c) {
794  if ((*c >= 'A') && (*c <= 'Z')) {
795  crm_warn("Node names with capitals are discouraged, consider changing '%s'",
796  uname);
797  break;
798  }
799  }
800 
801  pcmk__str_update(&node->uname, uname);
802 
803  if (peer_status_callback != NULL) {
804  peer_status_callback(crm_status_uname, node, NULL);
805  }
806 
807 #if SUPPORT_COROSYNC
809  remove_conflicting_peer(node);
810  }
811 #endif
812 }
813 
822 static inline const char *
823 proc2text(enum crm_proc_flag proc)
824 {
825  const char *text = "unknown";
826 
827  switch (proc) {
828  case crm_proc_none:
829  text = "none";
830  break;
831  case crm_proc_based:
832  text = "pacemaker-based";
833  break;
834  case crm_proc_controld:
835  text = "pacemaker-controld";
836  break;
837  case crm_proc_schedulerd:
838  text = "pacemaker-schedulerd";
839  break;
840  case crm_proc_execd:
841  text = "pacemaker-execd";
842  break;
843  case crm_proc_attrd:
844  text = "pacemaker-attrd";
845  break;
846  case crm_proc_fenced:
847  text = "pacemaker-fenced";
848  break;
849  case crm_proc_cpg:
850  text = "corosync-cpg";
851  break;
852  }
853  return text;
854 }
855 
872 crm_node_t *
873 crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const char *status)
874 {
875  uint32_t last = 0;
876  gboolean changed = FALSE;
877 
878  CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL",
879  source, proc2text(flag), status);
880  return NULL);
881 
882  /* Pacemaker doesn't spawn processes on remote nodes */
883  if (pcmk_is_set(node->flags, crm_remote_node)) {
884  return node;
885  }
886 
887  last = node->processes;
888  if (status == NULL) {
889  node->processes = flag;
890  if (node->processes != last) {
891  changed = TRUE;
892  }
893 
894  } else if (pcmk__str_eq(status, ONLINESTATUS, pcmk__str_casei)) {
895  if ((node->processes & flag) != flag) {
896  node->processes = pcmk__set_flags_as(__func__, __LINE__,
897  LOG_TRACE, "Peer process",
898  node->uname, node->processes,
899  flag, "processes");
900  changed = TRUE;
901  }
902 
903  } else if (node->processes & flag) {
904  node->processes = pcmk__clear_flags_as(__func__, __LINE__,
905  LOG_TRACE, "Peer process",
906  node->uname, node->processes,
907  flag, "processes");
908  changed = TRUE;
909  }
910 
911  if (changed) {
912  if (status == NULL && flag <= crm_proc_none) {
913  crm_info("%s: Node %s[%u] - all processes are now offline", source, node->uname,
914  node->id);
915  } else {
916  crm_info("%s: Node %s[%u] - %s is now %s", source, node->uname, node->id,
917  proc2text(flag), status);
918  }
919 
920  /* Call the client callback first, then update the peer state,
921  * in case the node will be reaped
922  */
923  if (peer_status_callback != NULL) {
924  peer_status_callback(crm_status_processes, node, &last);
925  }
926 
927  /* The client callback shouldn't touch the peer caches,
928  * but as a safety net, bail if the peer cache was destroyed.
929  */
930  if (crm_peer_cache == NULL) {
931  return NULL;
932  }
933 
934  if (crm_autoreap) {
935  const char *peer_state = NULL;
936 
937  if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
938  peer_state = CRM_NODE_MEMBER;
939  } else {
940  peer_state = CRM_NODE_LOST;
941  }
942  node = pcmk__update_peer_state(__func__, node, peer_state, 0);
943  }
944  } else {
945  crm_trace("%s: Node %s[%u] - %s is unchanged (%s)", source, node->uname, node->id,
946  proc2text(flag), status);
947  }
948  return node;
949 }
950 
959 void
960 pcmk__update_peer_expected(const char *source, crm_node_t *node,
961  const char *expected)
962 {
963  char *last = NULL;
964  gboolean changed = FALSE;
965 
966  CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected);
967  return);
968 
969  /* Remote nodes don't participate in joins */
970  if (pcmk_is_set(node->flags, crm_remote_node)) {
971  return;
972  }
973 
974  last = node->expected;
975  if (expected != NULL && !pcmk__str_eq(node->expected, expected, pcmk__str_casei)) {
976  node->expected = strdup(expected);
977  changed = TRUE;
978  }
979 
980  if (changed) {
981  crm_info("%s: Node %s[%u] - expected state is now %s (was %s)", source, node->uname, node->id,
982  expected, last);
983  free(last);
984  } else {
985  crm_trace("%s: Node %s[%u] - expected state is unchanged (%s)", source, node->uname,
986  node->id, expected);
987  }
988 }
989 
1006 static crm_node_t *
1007 update_peer_state_iter(const char *source, crm_node_t *node, const char *state,
1008  uint64_t membership, GHashTableIter *iter)
1009 {
1010  gboolean is_member;
1011 
1012  CRM_CHECK(node != NULL,
1013  crm_err("Could not set state for unknown host to %s"
1014  CRM_XS " source=%s", state, source);
1015  return NULL);
1016 
1017  is_member = pcmk__str_eq(state, CRM_NODE_MEMBER, pcmk__str_casei);
1018  if (is_member) {
1019  node->when_lost = 0;
1020  if (membership) {
1021  node->last_seen = membership;
1022  }
1023  }
1024 
1025  if (state && !pcmk__str_eq(node->state, state, pcmk__str_casei)) {
1026  char *last = node->state;
1027 
1028  node->state = strdup(state);
1029  crm_notice("Node %s state is now %s " CRM_XS
1030  " nodeid=%u previous=%s source=%s", node->uname, state,
1031  node->id, (last? last : "unknown"), source);
1032  if (peer_status_callback != NULL) {
1033  peer_status_callback(crm_status_nstate, node, last);
1034  }
1035  free(last);
1036 
1037  if (crm_autoreap && !is_member
1038  && !pcmk_is_set(node->flags, crm_remote_node)) {
1039  /* We only autoreap from the peer cache, not the remote peer cache,
1040  * because the latter should be managed only by
1041  * crm_remote_peer_cache_refresh().
1042  */
1043  if(iter) {
1044  crm_notice("Purged 1 peer with id=%u and/or uname=%s from the membership cache", node->id, node->uname);
1045  g_hash_table_iter_remove(iter);
1046 
1047  } else {
1048  reap_crm_member(node->id, node->uname);
1049  }
1050  node = NULL;
1051  }
1052 
1053  } else {
1054  crm_trace("Node %s state is unchanged (%s) " CRM_XS
1055  " nodeid=%u source=%s", node->uname, state, node->id, source);
1056  }
1057  return node;
1058 }
1059 
1075 crm_node_t *
1076 pcmk__update_peer_state(const char *source, crm_node_t *node,
1077  const char *state, uint64_t membership)
1078 {
1079  return update_peer_state_iter(source, node, state, membership, NULL);
1080 }
1081 
1088 void
1089 pcmk__reap_unseen_nodes(uint64_t membership)
1090 {
1091  GHashTableIter iter;
1092  crm_node_t *node = NULL;
1093 
1094  crm_trace("Reaping unseen nodes...");
1095  g_hash_table_iter_init(&iter, crm_peer_cache);
1096  while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) {
1097  if (node->last_seen != membership) {
1098  if (node->state) {
1099  /*
1100  * Calling update_peer_state_iter() allows us to
1101  * remove the node from crm_peer_cache without
1102  * invalidating our iterator
1103  */
1104  update_peer_state_iter(__func__, node, CRM_NODE_LOST,
1105  membership, &iter);
1106 
1107  } else {
1108  crm_info("State of node %s[%u] is still unknown",
1109  node->uname, node->id);
1110  }
1111  }
1112  }
1113 }
1114 
1115 static crm_node_t *
1116 find_known_node(const char *id, const char *uname)
1117 {
1118  GHashTableIter iter;
1119  crm_node_t *node = NULL;
1120  crm_node_t *by_id = NULL;
1121  crm_node_t *by_name = NULL;
1122 
1123  if (uname) {
1124  g_hash_table_iter_init(&iter, known_node_cache);
1125  while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
1126  if (node->uname && strcasecmp(node->uname, uname) == 0) {
1127  crm_trace("Name match: %s = %p", node->uname, node);
1128  by_name = node;
1129  break;
1130  }
1131  }
1132  }
1133 
1134  if (id) {
1135  g_hash_table_iter_init(&iter, known_node_cache);
1136  while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
1137  if(strcasecmp(node->uuid, id) == 0) {
1138  crm_trace("ID match: %s= %p", id, node);
1139  by_id = node;
1140  break;
1141  }
1142  }
1143  }
1144 
1145  node = by_id; /* Good default */
1146  if (by_id == by_name) {
1147  /* Nothing to do if they match (both NULL counts) */
1148  crm_trace("Consistent: %p for %s/%s", by_id, id, uname);
1149 
1150  } else if (by_id == NULL && by_name) {
1151  crm_trace("Only one: %p for %s/%s", by_name, id, uname);
1152 
1153  if (id) {
1154  node = NULL;
1155 
1156  } else {
1157  node = by_name;
1158  }
1159 
1160  } else if (by_name == NULL && by_id) {
1161  crm_trace("Only one: %p for %s/%s", by_id, id, uname);
1162 
1163  if (uname) {
1164  node = NULL;
1165  }
1166 
1167  } else if (uname && by_id->uname
1168  && pcmk__str_eq(uname, by_id->uname, pcmk__str_casei)) {
1169  /* Multiple nodes have the same uname in the CIB.
1170  * Return by_id. */
1171 
1172  } else if (id && by_name->uuid
1173  && pcmk__str_eq(id, by_name->uuid, pcmk__str_casei)) {
1174  /* Multiple nodes have the same id in the CIB.
1175  * Return by_name. */
1176  node = by_name;
1177 
1178  } else {
1179  node = NULL;
1180  }
1181 
1182  if (node == NULL) {
1183  crm_debug("Couldn't find node%s%s%s%s",
1184  id? " " : "",
1185  id? id : "",
1186  uname? " with name " : "",
1187  uname? uname : "");
1188  }
1189 
1190  return node;
1191 }
1192 
1193 static void
1194 known_node_cache_refresh_helper(xmlNode *xml_node, void *user_data)
1195 {
1196  const char *id = crm_element_value(xml_node, XML_ATTR_ID);
1197  const char *uname = crm_element_value(xml_node, XML_ATTR_UNAME);
1198  crm_node_t * node = NULL;
1199 
1200  CRM_CHECK(id != NULL && uname !=NULL, return);
1201  node = find_known_node(id, uname);
1202 
1203  if (node == NULL) {
1204  char *uniqueid = crm_generate_uuid();
1205 
1206  node = calloc(1, sizeof(crm_node_t));
1207  CRM_ASSERT(node != NULL);
1208 
1209  node->uname = strdup(uname);
1210  CRM_ASSERT(node->uname != NULL);
1211 
1212  node->uuid = strdup(id);
1213  CRM_ASSERT(node->uuid != NULL);
1214 
1215  g_hash_table_replace(known_node_cache, uniqueid, node);
1216 
1217  } else if (pcmk_is_set(node->flags, crm_node_dirty)) {
1218  pcmk__str_update(&node->uname, uname);
1219 
1220  /* Node is in cache and hasn't been updated already, so mark it clean */
1222  }
1223 
1224 }
1225 
1226 static void
1227 refresh_known_node_cache(xmlNode *cib)
1228 {
1229  crm_peer_init();
1230 
1231  g_hash_table_foreach(known_node_cache, mark_dirty, NULL);
1232 
1234  known_node_cache_refresh_helper, NULL);
1235 
1236  /* Remove all old cache entries that weren't seen in the CIB */
1237  g_hash_table_foreach_remove(known_node_cache, is_dirty, NULL);
1238 }
1239 
1240 void
1242 {
1244  refresh_known_node_cache(cib);
1245 }
1246 
1257 crm_node_t *
1258 pcmk__search_known_node_cache(unsigned int id, const char *uname,
1259  uint32_t flags)
1260 {
1261  crm_node_t *node = NULL;
1262  char *id_str = NULL;
1263 
1264  CRM_ASSERT(id > 0 || uname != NULL);
1265 
1266  node = pcmk__search_node_caches(id, uname, flags);
1267 
1268  if (node || !(flags & CRM_GET_PEER_CLUSTER)) {
1269  return node;
1270  }
1271 
1272  if (id > 0) {
1273  id_str = crm_strdup_printf("%u", id);
1274  }
1275 
1276  node = find_known_node(id_str, uname);
1277 
1278  free(id_str);
1279  return node;
1280 }
1281 
1282 
1283 // Deprecated functions kept only for backward API compatibility
1284 // LCOV_EXCL_START
1285 
1286 #include <crm/cluster/compat.h>
1287 
1288 int
1289 crm_terminate_member(int nodeid, const char *uname, void *unused)
1290 {
1291  return stonith_api_kick(nodeid, uname, 120, TRUE);
1292 }
1293 
1294 int
1295 crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection)
1296 {
1297  return stonith_api_kick(nodeid, uname, 120, TRUE);
1298 }
1299 
1300 // LCOV_EXCL_STOP
1301 // End deprecated API
#define LOG_TRACE
Definition: logging.h:37
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:235
#define crm_notice(fmt, args...)
Definition: logging.h:379
#define CRM_NODE_LOST
Definition: cluster.h:32
GHashTable * crm_peer_cache
Definition: membership.c:36
#define crm_crit(fmt, args...)
Definition: logging.h:376
char data[0]
Definition: cpg.c:55
char * crm_generate_uuid(void)
Definition: utils.c:509
uint64_t flags
Definition: cluster.h:62
void crm_peer_destroy(void)
Definition: membership.c:416
const char * name
Definition: cib.c:24
uint32_t id
Definition: cluster.h:72
char * uuid
Definition: cluster.h:60
int stonith_api_kick(uint32_t nodeid, const char *uname, int timeout, bool off)
Definition: st_client.c:1960
void pcmk__update_peer_expected(const char *source, crm_node_t *node, const char *expected)
Definition: membership.c:960
gboolean crm_have_quorum
Definition: membership.c:64
crm_node_t * pcmk__update_peer_state(const char *source, crm_node_t *node, const char *state, uint64_t membership)
Update a node's state and membership information.
Definition: membership.c:1076
GHashTable * crm_remote_peer_cache
Definition: membership.c:53
unsigned long long crm_peer_seq
Definition: membership.c:63
char * get_node_name(uint32_t nodeid)
Get the node name corresponding to a cluster node ID.
Definition: cluster.c:204
void crm_set_autoreap(gboolean autoreap)
Tell the library whether to automatically reap lost nodes.
Definition: membership.c:470
void crm_peer_init(void)
Definition: membership.c:400
void crm_remote_peer_cache_remove(const char *node_name)
Definition: membership.c:147
crm_node_t * pcmk__search_cluster_node_cache(unsigned int id, const char *uname)
Definition: membership.c:561
gboolean crm_is_corosync_peer_active(const crm_node_t *node)
Check whether a Corosync cluster peer is active.
Definition: corosync.c:531
int crm_remote_peer_cache_size(void)
Definition: membership.c:87
#define crm_warn(fmt, args...)
Definition: logging.h:378
#define PCMK__XP_GUEST_NODE_CONFIG
Definition: xml_internal.h:179
uint32_t processes
Definition: cluster.h:64
crm_node_t * crm_get_peer_full(unsigned int id, const char *uname, int flags)
Get a node cache entry (cluster or Pacemaker Remote)
Definition: membership.c:533
#define clear_peer_flags(peer, flags_to_clear)
Definition: membership.c:76
guint reap_crm_member(uint32_t id, const char *name)
Remove all peer cache entries matching a node ID and/or uname.
Definition: membership.c:333
gboolean crm_is_peer_active(const crm_node_t *node)
Definition: membership.c:281
#define crm_debug(fmt, args...)
Definition: logging.h:382
void pcmk__reap_unseen_nodes(uint64_t membership)
Definition: membership.c:1089
#define XML_ATTR_ID
Definition: msg_xml.h:147
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition: nvpair.c:496
time_t when_lost
Definition: cluster.h:73
crm_status_type
Definition: cluster.h:179
#define crm_trace(fmt, args...)
Definition: logging.h:383
#define do_crm_log(level, fmt, args...)
Log a message.
Definition: logging.h:172
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition: util.h:121
void pcmk__refresh_node_caches_from_cib(xmlNode *cib)
Definition: membership.c:1241
int pcmk__xe_get_bool_attr(const xmlNode *node, const char *name, bool *value)
Definition: nvpair.c:927
#define PCMK__XP_REMOTE_NODE_CONFIG
Definition: xml_internal.h:185
void pcmk__str_update(char **str, const char *value)
Definition: strings.c:1193
#define XML_ATTR_UNAME
Definition: msg_xml.h:170
#define CRM_NODE_MEMBER
Definition: cluster.h:33
crm_node_t * pcmk__search_node_caches(unsigned int id, const char *uname, uint32_t flags)
Definition: membership.c:505
void crm_set_status_callback(void(*dispatch)(enum crm_status_type, crm_node_t *, const void *))
Set a client function that will be called after peer status changes.
Definition: membership.c:453
const char * name_for_cluster_type(enum cluster_type_e type)
Get a log-friendly string equivalent of a cluster type.
Definition: cluster.c:317
uint32_t id
Definition: cpg.c:45
crm_node_t * pcmk__search_known_node_cache(unsigned int id, const char *uname, uint32_t flags)
Definition: membership.c:1258
Deprecated Pacemaker cluster API.
int crm_terminate_member(int nodeid, const char *uname, void *unused)
Definition: membership.c:1289
char * expected
Definition: cluster.h:77
gboolean is_corosync_cluster(void)
Check whether the local cluster is a Corosync cluster.
Definition: cluster.c:402
#define CRM_XS
Definition: logging.h:55
void crm_remote_peer_cache_refresh(xmlNode *cib)
Repopulate the remote peer cache based on CIB XML.
Definition: membership.c:243
guint crm_active_peers(void)
Definition: membership.c:374
#define PCMK__XP_REMOTE_NODE_STATUS
Definition: xml_internal.h:190
crm_node_t * crm_remote_peer_get(const char *node_name)
Get a remote node peer cache entry, creating it if necessary.
Definition: membership.c:107
pcmk__action_result_t result
Definition: pcmk_fence.c:35
#define crm_err(fmt, args...)
Definition: logging.h:377
#define CRM_ASSERT(expr)
Definition: results.h:42
Fencing aka. STONITH.
char * conn_host
Definition: cluster.h:80
char uname[MAX_NAME]
Definition: cpg.c:50
int crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection)
Definition: membership.c:1295
char * state
Definition: cluster.h:61
#define pcmk__plural_s(i)
void crm_foreach_xpath_result(xmlNode *xml, const char *xpath, void(*helper)(xmlNode *, void *), void *user_data)
Run a supplied function for each result of an xpath search.
Definition: xpath.c:173
bool pcmk__corosync_has_nodelist(void)
Definition: corosync.c:730
IPC interface to Pacemaker daemons.
crm_node_t * crm_update_peer_proc(const char *source, crm_node_t *node, uint32_t flag, const char *status)
Definition: membership.c:873
#define set_peer_flags(peer, flags_to_set)
Definition: membership.c:69
char * uname
Definition: cluster.h:59
uint64_t last_seen
Definition: cluster.h:63
#define XML_NODE_IN_CLUSTER
Definition: msg_xml.h:295
#define ONLINESTATUS
Definition: util.h:39
void crm_abort(const char *file, const char *function, int line, const char *condition, gboolean do_core, gboolean do_fork)
Definition: utils.c:397
#define PCMK__XP_MEMBER_NODE_CONFIG
Definition: xml_internal.h:174
crm_node_t * crm_get_peer(unsigned int id, const char *uname)
Get a cluster node cache entry.
Definition: membership.c:701
#define crm_info(fmt, args...)
Definition: logging.h:380
const char * crm_peer_uuid(crm_node_t *node)
Get (and set if needed) a node's UUID.
Definition: cluster.c:38
crm_proc_flag
Definition: internal.h:17
uint64_t flags
Definition: remote.c:215
GHashTable * pcmk__strikey_table(GDestroyNotify key_destroy_func, GDestroyNotify value_destroy_func)
Definition: strings.c:649
enum cluster_type_e get_cluster_type(void)
Get (and validate) the local cluster type.
Definition: cluster.c:338