pacemaker 2.1.8-2.1.8
Scalable High-Availability cluster resource manager
Loading...
Searching...
No Matches
watchdog.c
Go to the documentation of this file.
1/*
2 * Copyright 2013-2024 the Pacemaker project contributors
3 *
4 * The version control history for this file may have further details.
5 *
6 * This source code is licensed under the GNU Lesser General Public License
7 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8 */
9
10#include <crm_internal.h>
11
12#include <sched.h>
13#include <sys/ioctl.h>
14#include <sys/reboot.h>
15
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <unistd.h>
19#include <ctype.h>
20#include <dirent.h>
21#include <signal.h>
22
/* Cached PID of the sbd daemon (set by pcmk__locate_sbd()); 0 if not located */
static pid_t sbd_pid = 0;
24
/*!
 * \internal
 * \brief Write a single command character to /proc/sysrq-trigger
 *
 * This is a no-op unless built with HAVE_LINUX_PROCFS. Failure to open the
 * trigger file is logged as a warning and otherwise ignored.
 *
 * \param[in] t  SysRq command character to write
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    // Root can always write here, regardless of kernel.sysrq value
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger != NULL) {
        crm_info("sysrq-trigger: %c", t);
        fprintf(trigger, "%c\n", t);
        fclose(trigger);

    } else {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
    }
#endif // HAVE_LINUX_PROCFS
}
43
44
49static void
50panic_local(void)
51{
52 int rc = pcmk_ok;
53 uid_t uid = geteuid();
54 pid_t ppid = getppid();
55 const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
56
57 // Default panic action is to reboot
58 char sysrq = 'b';
59 int reboot_cmd = RB_AUTOBOOT;
60
61 if(uid != 0 && ppid > 1) {
62 /* We're a non-root pacemaker daemon (pacemaker-based,
63 * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
64 * the original pacemakerd parent.
65 *
66 * Of these, only the controller is likely to be initiating resets.
67 */
68 crm_emerg("Signaling parent %lld to panic", (long long) ppid);
70 return;
71
72 } else if (uid != 0) {
73#if HAVE_LINUX_PROCFS
74 /*
75 * No permissions, and no pacemakerd parent to escalate to.
76 * Track down the new pacemakerd process and send a signal instead.
77 */
78 union sigval signal_value;
79
80 memset(&signal_value, 0, sizeof(signal_value));
81 ppid = pcmk__procfs_pid_of("pacemakerd");
82 crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
83
84 if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
85 crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
86 (long long) ppid);
87 }
88#endif // HAVE_LINUX_PROCFS
89
90 /* The best we can do now is die */
92 return;
93 }
94
95 /* We're either pacemakerd, or a pacemaker daemon running as root */
96
97 if (pcmk__starts_with(panic_action, "sync-")) {
98 sync();
99 panic_action += strlen("sync-");
100 };
101
102 if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
103 sysrq = 'c';
104
105 } else if (pcmk__str_eq(panic_action, "off", pcmk__str_casei)) {
106 sysrq = 'o';
107#ifdef RB_POWER_OFF
108 reboot_cmd = RB_POWER_OFF;
109#elif defined(RB_POWEROFF)
110 reboot_cmd = RB_POWEROFF;
111#endif
112 }
113
114 sysrq_trigger(sysrq);
115 reboot(reboot_cmd);
116 rc = errno;
117
118 crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
119 (long long) ppid, pcmk_rc_str(rc), rc);
120
121 if(ppid > 1) {
122 /* child daemon */
124 } else {
125 /* pacemakerd or orphan child */
127 }
128}
129
134static void
135panic_sbd(void)
136{
137 union sigval signal_value;
138 pid_t ppid = getppid();
139
140 crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
141
142 memset(&signal_value, 0, sizeof(signal_value));
143 /* TODO: Arrange for a slightly less brutal option? */
144 if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
145 crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
146 (long long) sbd_pid);
147 panic_local();
148 }
149
150 if(ppid > 1) {
151 /* child daemon */
153 } else {
154 /* pacemakerd or orphan child */
156 }
157}
158
168void
169pcmk__panic(const char *origin)
170{
171 /* Ensure sbd_pid is set */
172 (void) pcmk__locate_sbd();
173
175 {
176 // getppid() == 1 means our original parent no longer exists
177 crm_emerg("Shutting down instead of panicking the node "
178 CRM_XS " origin=%s sbd=%lld parent=%d",
179 origin, (long long) sbd_pid, getppid());
181 return;
182 },
183 {}
184 );
185
186 if(sbd_pid > 1) {
187 crm_emerg("Signaling sbd[%lld] to panic the system: %s",
188 (long long) sbd_pid, origin);
189 panic_sbd();
190
191 } else {
192 crm_emerg("Panicking the system directly: %s", origin);
193 panic_local();
194 }
195}
196
201pid_t
203{
204 char *pidfile = NULL;
205 char *sbd_path = NULL;
206 int rc;
207
208 if(sbd_pid > 1) {
209 return sbd_pid;
210 }
211
212 /* Look for the pid file */
213 pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
214 sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
215
216 /* Read the pid file */
217 rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
218 if (rc == pcmk_rc_ok) {
219 crm_trace("SBD detected at pid %lld (via PID file %s)",
220 (long long) sbd_pid, pidfile);
221
222#if HAVE_LINUX_PROCFS
223 } else {
224 /* Fall back to /proc for systems that support it */
225 sbd_pid = pcmk__procfs_pid_of("sbd");
226 crm_trace("SBD detected at pid %lld (via procfs)",
227 (long long) sbd_pid);
228#endif // HAVE_LINUX_PROCFS
229 }
230
231 if(sbd_pid < 0) {
232 sbd_pid = 0;
233 crm_trace("SBD not detected");
234 }
235
236 free(pidfile);
237 free(sbd_path);
238
239 return sbd_pid;
240}
241
/*!
 * \internal
 * \brief Get the watchdog timeout from the SBD_WATCHDOG_TIMEOUT environment
 *        variable
 *
 * The environment variable is parsed with crm_get_msec() on first use, and
 * the result is cached for subsequent calls.
 *
 * \return Result of parsing SBD_WATCHDOG_TIMEOUT, in milliseconds
 */
long
pcmk__get_sbd_watchdog_timeout(void)
{
    // -2 marks "not parsed yet"
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}
252
253bool
255{
256 static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
257 static bool checked_sync_resource_startup = false;
258
259 if (!checked_sync_resource_startup) {
260 const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
261
262 if (sync_env == NULL) {
263 crm_trace("Defaulting to %sstart-up synchronization with sbd",
264 (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
265
266 } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
267 crm_warn("Defaulting to %sstart-up synchronization with sbd "
268 "because environment value '%s' is invalid",
269 (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
270 }
271 checked_sync_resource_startup = true;
272 }
273 return sync_resource_startup != 0;
274}
275
/*!
 * \internal
 * \brief Calculate a watchdog fencing timeout from the SBD watchdog timeout
 *
 * \return Twice the SBD watchdog timeout (in milliseconds), or 0 if that
 *         timeout is not positive
 */
long
pcmk__auto_stonith_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_watchdog_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
283
284bool
286{
287 /* @COMPAT At a compatibility break, accept either negative values or a
288 * specific string like "auto" (but not both) to mean "auto-calculate the
289 * timeout." Reject other values that aren't parsable as timeouts.
290 */
291 long st_timeout = value? crm_get_msec(value) : 0;
292
293 if (st_timeout < 0) {
295 crm_debug("Using calculated value %ld for "
297 st_timeout, value);
298 }
299
300 if (st_timeout == 0) {
301 crm_debug("Watchdog may be enabled but "
302 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
303 value? value : "default");
304
305 } else if (pcmk__locate_sbd() == 0) {
307 " configured (%s) but SBD not active",
308 pcmk__s(value, "auto"));
310 return false;
311
312 } else {
313 long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
314
315 if (st_timeout < sbd_timeout) {
317 " (%s) too short (must be >%ldms)",
318 value, sbd_timeout);
320 return false;
321 }
322 crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
323 " %s and SBD timeout %ldms",
324 value, sbd_timeout);
325 }
326 return true;
327}
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition pid.c:172
pid_t pcmk__procfs_pid_of(const char *name)
Definition procfs.c:111
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition strings.c:356
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1, 2)
int crm_str_to_boolean(const char *s, int *ret)
Definition strings.c:496
#define SBIN_DIR
Definition config.h:580
#define PCMK__SBD_SYNC_DEFAULT
Definition config.h:571
#define PCMK_RUN_DIR
Definition config.h:541
#define crm_info(fmt, args...)
Definition logging.h:397
#define crm_warn(fmt, args...)
Definition logging.h:392
#define CRM_XS
Definition logging.h:56
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition logging.h:331
#define crm_debug(fmt, args...)
Definition logging.h:400
#define crm_trace(fmt, args...)
Definition logging.h:402
#define crm_emerg(fmt, args...)
Definition logging.h:385
#define pcmk__if_tracing(if_action, else_action)
#define PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
Definition options.h:68
#define PCMK__ENV_PANIC_ACTION
const char * pcmk__env_option(const char *option)
Definition options.c:1088
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition results.c:501
@ CRM_EX_PANIC
Panic the local host.
Definition results.h:287
@ CRM_EX_FATAL
Do not respawn.
Definition results.h:286
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition results.c:936
@ pcmk_rc_ok
Definition results.h:162
#define pcmk_ok
Definition results.h:69
bool pcmk__starts_with(const char *str, const char *prefix)
Check whether a string starts with a certain sequence.
Definition strings.c:556
@ pcmk__str_casei
pid_t pcmk__locate_sbd(void)
Definition watchdog.c:202
long pcmk__get_sbd_watchdog_timeout(void)
Definition watchdog.c:243
bool pcmk__valid_stonith_watchdog_timeout(const char *value)
Definition watchdog.c:285
long pcmk__auto_stonith_watchdog_timeout(void)
Definition watchdog.c:277
bool pcmk__get_sbd_sync_resource_startup(void)
Definition watchdog.c:254
void pcmk__panic(const char *origin)
Definition watchdog.c:169