pacemaker 2.1.8-2.1.8
Scalable High-Availability cluster resource manager
Loading...
Searching...
No Matches
failcounts.c
Go to the documentation of this file.
1/*
2 * Copyright 2008-2024 the Pacemaker project contributors
3 *
4 * This source code is licensed under the GNU Lesser General Public License
5 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6 */
7
8#include <crm_internal.h>
9
10#include <sys/types.h>
11#include <regex.h>
12#include <glib.h>
13
14#include <crm/crm.h>
15#include <crm/common/xml.h>
16#include <crm/common/util.h>
18
19static gboolean
20is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
21 const xmlNode *lrm_op_xml)
22{
23 gboolean matched = FALSE;
24 const char *conf_op_name = NULL;
25 const char *lrm_op_task = NULL;
26 const char *conf_op_interval_spec = NULL;
27 guint conf_op_interval_ms = 0;
28 guint lrm_op_interval_ms = 0;
29 const char *lrm_op_id = NULL;
30 char *last_failure_key = NULL;
31
32 if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
33 return FALSE;
34 }
35
36 // Get name and interval from configured op
37 conf_op_name = crm_element_value(conf_op_xml, PCMK_XA_NAME);
38 conf_op_interval_spec = crm_element_value(conf_op_xml, PCMK_META_INTERVAL);
39 pcmk_parse_interval_spec(conf_op_interval_spec, &conf_op_interval_ms);
40
41 // Get name and interval from op history entry
42 lrm_op_task = crm_element_value(lrm_op_xml, PCMK_XA_OPERATION);
43 crm_element_value_ms(lrm_op_xml, PCMK_META_INTERVAL, &lrm_op_interval_ms);
44
45 if ((conf_op_interval_ms != lrm_op_interval_ms)
46 || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
47 return FALSE;
48 }
49
50 lrm_op_id = pcmk__xe_id(lrm_op_xml);
51 last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
52
53 if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
54 matched = TRUE;
55
56 } else {
57 char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
58 conf_op_interval_ms);
59
60 if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
61 int rc = 0;
62 int target_rc = pe__target_rc_from_xml(lrm_op_xml);
63
65 if (rc != target_rc) {
66 matched = TRUE;
67 }
68 }
69 free(expected_op_key);
70 }
71
72 free(last_failure_key);
73 return matched;
74}
75
76static gboolean
77block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
78 const xmlNode *xml_op)
79{
80 char *xml_name = clone_strip(rsc->id);
81
82 /* @TODO This xpath search occurs after template expansion, but it is unable
83 * to properly detect on-fail in id-ref, operation meta-attributes, or
84 * op_defaults, or evaluate rules.
85 *
86 * Also, PCMK_META_ON_FAIL defaults to PCMK_VALUE_BLOCK (in
87 * unpack_operation()) for stop actions when stonith is disabled.
88 *
89 * Ideally, we'd unpack the operation before this point, and pass in a
90 * meta-attributes table that takes all that into consideration.
91 */
92 char *xpath = crm_strdup_printf("//" PCMK_XE_PRIMITIVE
93 "[@" PCMK_XA_ID "='%s']"
94 "//" PCMK_XE_OP
96 "='" PCMK_VALUE_BLOCK "']",
97 xml_name);
98
99 xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
100 gboolean should_block = FALSE;
101
102 free(xpath);
103
104 if (xpathObj) {
105 int max = numXpathResults(xpathObj);
106 int lpc = 0;
107
108 for (lpc = 0; lpc < max; lpc++) {
109 xmlNode *pref = getXpathResult(xpathObj, lpc);
110
111 if (xml_op) {
112 should_block = is_matched_failure(xml_name, pref, xml_op);
113 if (should_block) {
114 break;
115 }
116
117 } else {
118 const char *conf_op_name = NULL;
119 const char *conf_op_interval_spec = NULL;
120 guint conf_op_interval_ms = 0;
121 char *lrm_op_xpath = NULL;
122 xmlXPathObject *lrm_op_xpathObj = NULL;
123
124 // Get name and interval from configured op
125 conf_op_name = crm_element_value(pref, PCMK_XA_NAME);
126 conf_op_interval_spec = crm_element_value(pref,
128 pcmk_parse_interval_spec(conf_op_interval_spec,
129 &conf_op_interval_ms);
130
131#define XPATH_FMT "//" PCMK__XE_NODE_STATE "[@" PCMK_XA_UNAME "='%s']" \
132 "//" PCMK__XE_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']" \
133 "/" PCMK__XE_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='%s']" \
134 "[@" PCMK_META_INTERVAL "='%u']"
135
136 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
137 node->details->uname, xml_name,
138 conf_op_name,
139 conf_op_interval_ms);
140 lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
141
142 free(lrm_op_xpath);
143
144 if (lrm_op_xpathObj) {
145 int max2 = numXpathResults(lrm_op_xpathObj);
146 int lpc2 = 0;
147
148 for (lpc2 = 0; lpc2 < max2; lpc2++) {
149 xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
150 lpc2);
151
152 should_block = is_matched_failure(xml_name, pref,
153 lrm_op_xml);
154 if (should_block) {
155 break;
156 }
157 }
158 }
159 freeXpathObject(lrm_op_xpathObj);
160
161 if (should_block) {
162 break;
163 }
164 }
165 }
166 }
167
168 free(xml_name);
169 freeXpathObject(xpathObj);
170
171 return should_block;
172}
173
183static inline char *
184rsc_fail_name(const pcmk_resource_t *rsc)
185{
186 const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
187
188 return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name);
189}
190
205static int
206generate_fail_regex(const char *prefix, const char *rsc_name,
207 gboolean is_legacy, gboolean is_unique, regex_t *re)
208{
209 char *pattern;
210
211 /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
212 * per-operation.
213 */
214 const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
215
216 /* Ignore instance numbers for anything other than globally unique clones.
217 * Anonymous clone fail counts could contain an instance number if the
218 * clone was initially unique, failed, then was converted to anonymous.
219 * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
220 * clone instance numbers.
221 */
222 const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
223
224 pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
225 instance_pattern, op_pattern);
226 if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
227 free(pattern);
228 return EINVAL;
229 }
230
231 free(pattern);
232 return pcmk_rc_ok;
233}
234
247static int
248generate_fail_regexes(const pcmk_resource_t *rsc,
249 regex_t *failcount_re, regex_t *lastfailure_re)
250{
251 int rc = pcmk_rc_ok;
252 char *rsc_name = rsc_fail_name(rsc);
253 const char *version = crm_element_value(rsc->cluster->input,
255
256 // @COMPAT Pacemaker <= 1.1.16 used a single fail count per resource
257 gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
258
259 if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
261 failcount_re) != pcmk_rc_ok) {
262 rc = EINVAL;
263
264 } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
265 is_legacy,
267 lastfailure_re) != pcmk_rc_ok) {
268 rc = EINVAL;
269 regfree(failcount_re);
270 }
271
272 free(rsc_name);
273 return rc;
274}
275
276// Data for fail-count-related iterators
277struct failcount_data {
278 const pcmk_node_t *node;// Node to check for fail count
279 pcmk_resource_t *rsc; // Resource to check for fail count
280 uint32_t flags; // Fail count flags
281 const xmlNode *xml_op; // History entry for expiration purposes (or NULL)
282 regex_t failcount_re; // Fail count regular expression to match
283 regex_t lastfailure_re; // Last failure regular expression to match
284 int failcount; // Fail count so far
285 time_t last_failure; // Time of most recent failure so far
286};
287
296static void
297update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
298{
299 struct failcount_data *fc_data = user_data;
300
301 // If this is a matching fail count attribute, update fail count
302 if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
303 fc_data->failcount = pcmk__add_scores(fc_data->failcount,
304 char2score(value));
305 pcmk__rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
306 (const char *) key, (const char *) value,
307 fc_data->rsc->id,
308 pcmk_readable_score(fc_data->failcount));
309 return;
310 }
311
312 // If this is a matching last failure attribute, update last failure
313 if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
314 0) == 0) {
315 long long last_ll;
316
317 if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
318 fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure,
319 last_ll);
320 }
321 }
322}
323
331static void
332update_failcount_for_filler(gpointer data, gpointer user_data)
333{
334 pcmk_resource_t *filler = data;
335 struct failcount_data *fc_data = user_data;
336 time_t filler_last_failure = 0;
337
338 fc_data->failcount += pe_get_failcount(fc_data->node, filler,
339 &filler_last_failure, fc_data->flags,
340 fc_data->xml_op);
341 fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure);
342}
343
360int
362 time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
363{
364 struct failcount_data fc_data = {
365 .node = node,
366 .rsc = rsc,
367 .flags = flags,
368 .xml_op = xml_op,
369 .failcount = 0,
370 .last_failure = (time_t) 0,
371 };
372
373 // Calculate resource failcount as sum of all matching operation failcounts
374 CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
375 &fc_data.lastfailure_re) == pcmk_rc_ok,
376 return 0);
377 g_hash_table_foreach(node->details->attrs, update_failcount_for_attr,
378 &fc_data);
379 regfree(&(fc_data.failcount_re));
380 regfree(&(fc_data.lastfailure_re));
381
382 // If failure blocks the resource, disregard any failure timeout
383 if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0)
384 && block_failure(node, rsc, xml_op)) {
385
386 pcmk__config_warn("Ignoring failure timeout %d for %s "
387 "because it conflicts with "
389 rsc->failure_timeout, rsc->id);
390 rsc->failure_timeout = 0;
391 }
392
393 // If all failures have expired, ignore fail count
394 if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
395 && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) {
396
397 time_t now = get_effective_time(rsc->cluster);
398
399 if (now > (fc_data.last_failure + rsc->failure_timeout)) {
400 pcmk__rsc_debug(rsc, "Failcount for %s on %s expired after %ds",
401 rsc->id, pcmk__node_name(node),
402 rsc->failure_timeout);
403 fc_data.failcount = 0;
404 }
405 }
406
407 /* Add the fail count of any filler resources, except that we never want the
408 * fail counts of a bundle container's fillers to count towards the
409 * container's fail count.
410 *
411 * Most importantly, a Pacemaker Remote connection to a bundle container
412 * is a filler of the container, but can reside on a different node than the
413 * container itself. Counting its fail count on its node towards the
414 * container's fail count on that node could lead to attempting to stop the
415 * container on the wrong node.
416 */
417 if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL)
418 && !pcmk__is_bundled(rsc)) {
419
420 g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data);
421 if (fc_data.failcount > 0) {
422 pcmk__rsc_info(rsc,
423 "Container %s and the resources within it "
424 "have failed %s time%s on %s",
425 rsc->id, pcmk_readable_score(fc_data.failcount),
426 pcmk__plural_s(fc_data.failcount),
427 pcmk__node_name(node));
428 }
429
430 } else if (fc_data.failcount > 0) {
431 pcmk__rsc_info(rsc, "%s has failed %s time%s on %s",
432 rsc->id, pcmk_readable_score(fc_data.failcount),
433 pcmk__plural_s(fc_data.failcount),
434 pcmk__node_name(node));
435 }
436
437 if (last_failure != NULL) {
438 if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
439 *last_failure = fc_data.last_failure;
440 } else {
441 *last_failure = 0;
442 }
443 }
444 return fc_data.failcount;
445}
446
459 const char *reason, pcmk_scheduler_t *scheduler)
460{
461 char *key = NULL;
462 pcmk_action_t *clear = NULL;
463
464 CRM_CHECK(rsc && node && reason && scheduler, return NULL);
465
467 clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
468 scheduler);
470 crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
471 rsc->id, pcmk__node_name(node), reason, clear->uuid);
472 return clear;
473}
#define PCMK_ACTION_CLEAR_FAILCOUNT
Definition actions.h:46
char * pcmk__op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key (RESOURCE_ACTION_INTERVAL)
Definition actions.c:196
const char * name
Definition cib.c:26
#define PCMK__LAST_FAILURE_PREFIX
Definition internal.h:351
#define PCMK__FAIL_COUNT_PREFIX
Definition internal.h:350
uint64_t flags
Definition remote.c:3
uint32_t version
Definition remote.c:1
Utility functions.
char int pcmk_parse_interval_spec(const char *input, guint *result_ms)
Parse milliseconds from a Pacemaker interval specification.
Definition strings.c:451
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
int compare_version(const char *version1, const char *version2)
Definition utils.c:188
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition util.h:98
char data[0]
Definition cpg.c:10
A dumping ground.
pcmk_action_t * pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node, const char *reason, pcmk_scheduler_t *scheduler)
Schedule a controller operation to clear a fail count.
Definition failcounts.c:458
int pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc, time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
Definition failcounts.c:361
#define XPATH_FMT
@ pcmk__fc_effective
@ pcmk__fc_fillers
#define CRM_XS
Definition logging.h:56
#define crm_notice(fmt, args...)
Definition logging.h:395
#define CRM_CHECK(expr, failure_action)
Definition logging.h:245
#define pcmk__config_warn(fmt...)
pcmk_scheduler_t * scheduler
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition nvpair.c:446
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
Definition nvpair.c:482
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
Definition nvpair.c:539
#define pcmk__insert_meta(obj, name, value)
#define PCMK_META_INTERVAL
Definition options.h:91
#define PCMK_META_ON_FAIL
Definition options.h:98
#define PCMK_VALUE_TRUE
Definition options.h:215
#define PCMK_VALUE_BLOCK
Definition options.h:135
#define PCMK__META_OP_NO_WAIT
time_t get_effective_time(pcmk_scheduler_t *scheduler)
Definition utils.c:395
pcmk_action_t * custom_action(pcmk_resource_t *rsc, char *key, const char *task, const pcmk_node_t *on_node, gboolean optional, pcmk_scheduler_t *scheduler)
Create or update an action object.
char * clone_strip(const char *last_rsc_id)
Definition unpack.c:1955
int pe__target_rc_from_xml(const xmlNode *xml_op)
Definition unpack.c:4385
@ pcmk_rsc_unique
Definition resources.h:100
@ pcmk_rc_ok
Definition results.h:162
#define pcmk__rsc_info(rsc, fmt, args...)
#define pcmk__rsc_trace(rsc, fmt, args...)
#define pcmk__rsc_debug(rsc, fmt, args...)
const char * pcmk_readable_score(int score)
Return a displayable static string for a score value.
Definition scores.c:86
int char2score(const char *score)
Get the integer value of a score string.
Definition scores.c:36
int pcmk__add_scores(int score1, int score2)
Definition scores.c:116
#define pcmk__plural_s(i)
int pcmk__scan_ll(const char *text, long long *result, long long default_value)
Definition strings.c:97
@ pcmk__str_casei
char * uuid
Definition actions.h:344
struct pe_node_shared_s * details
Definition nodes.h:167
GHashTable * attrs
Definition nodes.h:142
const char * uname
Definition nodes.h:73
int failure_timeout
Definition resources.h:421
pcmk_scheduler_t * cluster
Definition resources.h:408
char * clone_name
Definition resources.h:397
xmlNode * xml
Definition resources.h:400
unsigned long long flags
Definition resources.h:428
GList * fillers
Definition resources.h:477
xmlNode * input
Definition scheduler.h:196
Wrappers for and extensions to libxml2.
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition xpath.c:58
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition xpath.c:39
xmlXPathObjectPtr xpath_search(const xmlNode *xml_top, const char *path)
Definition xpath.c:139
#define PCMK_XA_OPERATION
Definition xml_names.h:344
#define PCMK_XA_ID
Definition xml_names.h:296
#define PCMK_XA_CRM_FEATURE_SET
Definition xml_names.h:249
#define PCMK_XE_PRIMITIVE
Definition xml_names.h:160
#define PCMK_XA_NAME
Definition xml_names.h:325
#define PCMK_XE_OP
Definition xml_names.h:143
#define PCMK__XA_RC_CODE