summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPradeepKiruvale <pradeepkumar.kj@softwareag.com>2022-06-16 19:14:21 +0530
committerGitHub <noreply@github.com>2022-06-16 19:14:21 +0530
commitb865681acc2a086b817eb98b12f6b63ae7cd7c3e (patch)
tree6a4730d05d6d07d70a0b96e925a7138439b28f73
parent3d35d4915256b451dcc03678de55de3e3ac7e023 (diff)
Enable systemd watchdog monitoring for tedge-watchdog daemon (#1196)
* Closes #1174, add watchdog to tedge watchdog Signed-off-by: Pradeep Kumar K J <pradeepkumar.kj@softwareag.com>
-rw-r--r--configuration/init/systemd/tedge-watchdog.service3
-rw-r--r--crates/core/tedge_watchdog/src/error.rs2
-rw-r--r--crates/core/tedge_watchdog/src/systemd_watchdog.rs36
3 files changed, 38 insertions, 3 deletions
diff --git a/configuration/init/systemd/tedge-watchdog.service b/configuration/init/systemd/tedge-watchdog.service
index 43db63fc..bcea2e94 100644
--- a/configuration/init/systemd/tedge-watchdog.service
+++ b/configuration/init/systemd/tedge-watchdog.service
@@ -6,7 +6,8 @@ StartLimitIntervalSec=0
[Service]
Type=notify
ExecStart=/usr/bin/tedge_watchdog
-Restart=on-failure
+Restart=always
+WatchdogSec=30
[Install]
WantedBy=multi-user.target
diff --git a/crates/core/tedge_watchdog/src/error.rs b/crates/core/tedge_watchdog/src/error.rs
index ff5fb14b..f9763798 100644
--- a/crates/core/tedge_watchdog/src/error.rs
+++ b/crates/core/tedge_watchdog/src/error.rs
@@ -25,6 +25,6 @@ pub enum WatchdogError {
#[error(transparent)]
ParseSystemdFile(#[from] std::io::Error),
- #[error("Did not find the WatchdogSec{file}")]
+ #[error("Did not find the WatchdogSec in {file}")]
NoWatchdogSec { file: String },
}
diff --git a/crates/core/tedge_watchdog/src/systemd_watchdog.rs b/crates/core/tedge_watchdog/src/systemd_watchdog.rs
index 8682a9c1..43ed9af3 100644
--- a/crates/core/tedge_watchdog/src/systemd_watchdog.rs
+++ b/crates/core/tedge_watchdog/src/systemd_watchdog.rs
@@ -29,6 +29,40 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err
// Send ready notification to systemd.
notify_systemd(process::id(), "--ready")?;
+ // Send heart beat notifications to systemd, to notify about its own health status
+ start_watchdog_for_self().await?;
+
+ // Monitor health of tedge services
+ start_watchdog_for_tedge_services(tedge_config_dir).await;
+ Ok(())
+}
+
+/// Starts the self-monitoring heartbeat for the tedge-watchdog daemon.
+///
+/// Reads the watchdog interval (in seconds) via `get_watchdog_sec` from
+/// `/lib/systemd/system/tedge-watchdog.service` and, when one is configured, spawns a
+/// detached tokio task that sends `WATCHDOG=1` to systemd four times per interval.
+///
+/// A missing `WatchdogSec` entry is not an error: a warning is logged and no heartbeat
+/// task is started. The same applies to an interval of `0`, which systemd documents as
+/// "watchdog disabled".
+///
+/// # Errors
+///
+/// Returns any `WatchdogError` other than `NoWatchdogSec` reported by `get_watchdog_sec`.
+async fn start_watchdog_for_self() -> Result<(), WatchdogError> {
+    const SERVICE_FILE: &str = "/lib/systemd/system/tedge-watchdog.service";
+
+    let interval = match get_watchdog_sec(SERVICE_FILE) {
+        Ok(interval) => interval,
+        // Bind the error instead of rebuilding it; the logged text is unchanged.
+        Err(e @ WatchdogError::NoWatchdogSec { .. }) => {
+            warn!("Watchdog is not enabled for tedge-watchdog : {}", e);
+            return Ok(());
+        }
+        Err(e) => return Err(e),
+    };
+
+    // WatchdogSec=0 disables the systemd watchdog, so there is nothing to ping.
+    // Without this guard the heartbeat loop below would sleep for zero seconds and spin.
+    if interval == 0 {
+        warn!("Watchdog is not enabled for tedge-watchdog : WatchdogSec is 0");
+        return Ok(());
+    }
+
+    // Divide the `Duration`, not the integer number of seconds: `interval / 4` truncates
+    // to 0 for intervals of 1..=3 seconds, which turned the loop into a busy loop.
+    let heartbeat_period = tokio::time::Duration::from_secs(interval) / 4;
+
+    // The task is intentionally detached: it runs for the lifetime of the daemon.
+    let _handle = tokio::spawn(async move {
+        let pid = process::id();
+        loop {
+            // Best effort: a failed notification is reported but must not stop the heartbeat.
+            if let Err(e) = notify_systemd(pid, "WATCHDOG=1") {
+                eprintln!("Notifying systemd failed with {}", e);
+            }
+            tokio::time::sleep(heartbeat_period).await;
+        }
+    });
+
+    Ok(())
+}
+
+async fn start_watchdog_for_tedge_services(tedge_config_dir: PathBuf) {
let tedge_services = vec![
"tedge-mapper-c8y",
"tedge-mapper-az",
@@ -65,7 +99,6 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err
}
}
futures::future::join_all(watchdog_tasks).await;
- Ok(())
}
async fn monitor_tedge_service(
@@ -115,6 +148,7 @@ async fn monitor_tedge_service(
let elapsed = start.elapsed();
if elapsed < tokio::time::Duration::from_secs(interval) {
tokio::time::sleep(tokio::time::Duration::from_secs(interval) - elapsed).await;
+ warn!("tedge systemd watchdog not started because no services to monitor")
}
}
}