summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPradeepKiruvale <pradeepkumar.kj@softwareag.com>2022-04-07 14:03:27 +0530
committerGitHub <noreply@github.com>2022-04-07 14:03:27 +0530
commitba619f6bacca6d9f41aa21a0aa88f290d18798db (patch)
tree1b5a684a6d958b48e467437fa21b251aa0730e5e
parent7826f3d6aeb61e6b4ae0545da27659e438974cfd (diff)
Handle errors and update docs (#1060)
-rw-r--r--Cargo.lock5
-rw-r--r--crates/core/tedge_watchdog/Cargo.toml3
-rw-r--r--crates/core/tedge_watchdog/src/systemd_watchdog.rs46
-rw-r--r--docs/src/SUMMARY.md2
-rw-r--r--docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md36
-rw-r--r--docs/src/howto-guides/README.md2
6 files changed, 56 insertions, 38 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 836b5277..3e3f6390 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1392,9 +1392,9 @@ dependencies = [
[[package]]
name = "log"
-version = "0.4.14"
+version = "0.4.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
+checksum = "6389c490849ff5bc16be905ae24bc913a9c8892e19b2341dbc175e14c341c2b8"
dependencies = [
"cfg-if 1.0.0",
"value-bag",
@@ -2966,6 +2966,7 @@ dependencies = [
"tedge_utils",
"thiserror",
"tokio",
+ "tracing",
]
[[package]]
diff --git a/crates/core/tedge_watchdog/Cargo.toml b/crates/core/tedge_watchdog/Cargo.toml
index 65fd3d59..14088179 100644
--- a/crates/core/tedge_watchdog/Cargo.toml
+++ b/crates/core/tedge_watchdog/Cargo.toml
@@ -21,4 +21,5 @@ freedesktop_entry_parser = "1.3.0"
tedge_config = { path = "../../common/tedge_config" }
tedge_utils = { path = "../../common/tedge_utils", features = ["logging"] }
thiserror ="1.0.30"
-tokio = { version = "1.12", features = ["sync", "time"] } \ No newline at end of file
+tokio = { version = "1.12", features = ["sync", "time"] }
+tracing = { version = "0.1", features = ["attributes", "log"] }
diff --git a/crates/core/tedge_watchdog/src/systemd_watchdog.rs b/crates/core/tedge_watchdog/src/systemd_watchdog.rs
index 8a6ad4b6..77825103 100644
--- a/crates/core/tedge_watchdog/src/systemd_watchdog.rs
+++ b/crates/core/tedge_watchdog/src/systemd_watchdog.rs
@@ -1,10 +1,9 @@
use crate::error::WatchdogError;
+use freedesktop_entry_parser::parse_entry;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use mqtt_channel::{Config, Message, PubChannel, Topic};
use nanoid::nanoid;
-
-use freedesktop_entry_parser::parse_entry;
use serde::{Deserialize, Serialize};
use std::time::Instant;
use std::{
@@ -15,6 +14,7 @@ use tedge_config::{
ConfigRepository, ConfigSettingAccessor, MqttBindAddressSetting, MqttPortSetting,
TEdgeConfigLocation,
};
+use tracing::{error, info, warn};
#[derive(Serialize, Deserialize)]
pub struct HealthStatus {
@@ -33,7 +33,7 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err
"tedge-agent",
];
- let watchdog_threads = FuturesUnordered::new();
+ let watchdog_tasks = FuturesUnordered::new();
for service in tedge_services {
match get_watchdog_sec(&format!("/lib/systemd/system/{service}.service")) {
@@ -43,22 +43,25 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err
let tedge_config_location =
tedge_config::TEdgeConfigLocation::from_custom_root(tedge_config_dir.clone());
- watchdog_threads.push(tokio::spawn(async move {
+ watchdog_tasks.push(tokio::spawn(async move {
monitor_tedge_service(
tedge_config_location,
service,
&req_topic,
&res_topic,
- interval,
+ interval / 2,
)
.await
}));
}
- Err(_e) => continue, // Watchdog not enabled for this service
+ Err(_) => {
+ warn!("Watchdog is not enabled for {}", service);
+ continue;
+ }
}
}
- futures::future::join_all(watchdog_threads).await;
+ futures::future::join_all(watchdog_tasks).await;
Ok(())
}
@@ -76,14 +79,14 @@ async fn monitor_tedge_service(
let mut received = client.received;
let mut publisher = client.published;
- println!("Starting watchdog for {} service", name);
+ info!("Starting watchdog for {} service", name);
loop {
let message = Message::new(&Topic::new(req_topic)?, "");
let _ = publisher
.publish(message)
.await
- .map_err(|e| eprintln!("Publish failed with error: {}", e));
+ .map_err(|e| warn!("Publish failed with error: {}", e));
let start = Instant::now();
@@ -99,10 +102,14 @@ async fn monitor_tedge_service(
}
Ok(None) => {}
Err(elapsed) => {
- eprintln!("The {name} failed with {elapsed}");
+ warn!("The {name} failed with {elapsed}");
}
}
- tokio::time::sleep(tokio::time::Duration::from_secs(interval) - start.elapsed()).await;
+
+ let elapsed = start.elapsed();
+ if elapsed < tokio::time::Duration::from_secs(interval) {
+ tokio::time::sleep(tokio::time::Duration::from_secs(interval) - elapsed).await;
+ }
}
}
@@ -134,13 +141,16 @@ fn notify_systemd(pid: u32, status: &str) -> Result<ExitStatus, WatchdogError> {
fn get_watchdog_sec(service_file: &str) -> Result<u64, WatchdogError> {
let entry = parse_entry(service_file)?;
if let Some(interval) = entry.section("Service").attr("WatchdogSec") {
- interval.parse().map_err({
- eprintln!(
- "Failed to parse the to WatchdogSec to integer from {}",
- service_file
- );
- WatchdogError::ParseWatchdogSecToInt
- })
+ match interval.parse::<u64>() {
+ Ok(i) => Ok(i),
+ Err(e) => {
+ error!(
+ "Failed to parse the to WatchdogSec to integer from {}",
+ service_file
+ );
+ Err(WatchdogError::ParseWatchdogSecToInt(e))
+ }
+ }
} else {
Err(WatchdogError::NoWatchdogSec {
file: service_file.to_string(),
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
index 4f608c57..1bb4003f 100644
--- a/docs/src/SUMMARY.md
+++ b/docs/src/SUMMARY.md
@@ -34,7 +34,7 @@
- [How to use apama software management plugin](./howto-guides/017_apama_software_management_plugin.md)
- [How to change temp path](./howto-guides/018_change_temp_path.md)
- [How to use thin-edge.io with your preferred init system](./howto-guides/019_how_to_use_preferred_init_system.md)
- - [How to enable watchdog using systemd?](./howto-guides/021_enable_tedge_watchdog_using_systemd.md)
+ - [How to enable systemd watchdog monitoring for tedge services?](./howto-guides/021_enable_tedge_watchdog_using_systemd.md)
- [Developer Documentation](dev_doc.md)
- [Architecture](architecture/README.md)
diff --git a/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md b/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md
index aedaa335..49c60fee 100644
--- a/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md
+++ b/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md
@@ -1,19 +1,16 @@
-# Watchdog feature using `systemd` in `thin-edge.io`
+# Enabling systemd watchdog for thin-edge services
## Introduction
-`Watchdog` feature is used to check the health of a service/process by constantly exchanging
-the message between the watchdog process and the service/process that is being monitored.
-When the process does not update its health status within a specified `time` period, then the watchdog process will
-restart the service that is being monitored.
+The systemd watchdog feature enables systemd to detect when a service is unhealthy or unresponsive and attempt to fix it by restarting that service.
+To detect if a service is healthy or not, systemd relies on periodic health notifications from that service at regular intervals.
+If the service fails to send that notification within a time threshold, then systemd will assume that service to be unhealthy and restart it.
-This document shows how `thin-edge.io` services are managed using the systemd, then one can use `systemd`
-feature to check the health of these services as well. This document provides
-information about how to use `systemd` for checking the health of the services.
+This document describes how the systemd watchdog mechanism can be enabled for thin-edge services.
## Enabling the `watchdog` feature in `systemd`
-Enabling the `watchdog` feature in systemd for a `thin-edge.io` service (tedge_agent, tedge_mapper_c8y/az/collectd)
+Enabling systemd `watchdog` for a `thin-edge.io` service (tedge_agent, tedge_mapper_c8y/az/collectd)
using the `systemd` is a two-step process.
### Step 1: Enable the `watchdog` feature in the `systemd` service file
@@ -37,18 +34,27 @@ RestartPreventExitStatus=255
WatchdogSec=5
```
-> Note: The systemd service file usually present in `/lib/systemd/system/tedge-mapper-c8y.service`.
+> Note: The systemd service files for tedge services are usually present
+in the `/lib/systemd/system` directory, like `/lib/systemd/system/tedge-mapper-c8y.service`.
### Step 2: Start the `tedge-watchdog` service
-Start the `watchdog` service as below.
+The `tedge-watchdog` service is responsible for periodically checking the health of
+all tedge services for which the watchdog feature is enabled, and sending systemd
+watchdog notifications on their behalf to systemd.
+
+Start and enable the `tedge-watchdog` service as follows:
+
```shell
systemctl start tedge-watchdog.service
-```
+systemctl enable tedge-watchdog.service
+```
+
+Now, the `tedge-watchdog` service will keep sending health check messages to the monitored services periodically within their configured `WatchdogSec` interval.
+
+The health check request for a service is published to the `tedge/health-check/<service-name>` topic and the health status response from that service is expected on the `tedge/health/<service-name>` topic.
-Now the `tedge-watchdog` service will be keep sending health check messages for every `WatchdogSec/2` seconds.
-Once the response is received from the particular service, the `watchdog` service will send the notification
-to the systemd on behalf of the service.
+Once the health status response is received from a particular service, the `tedge-watchdog` service will send the watchdog notification on behalf of that service to systemd.
## Debugging
One can observe the message exchange between the `service` and the `watchdog` by subscribing to `tedge/health/#` and `tedge/health-check/#` topics.
diff --git a/docs/src/howto-guides/README.md b/docs/src/howto-guides/README.md
index e8c6b8b4..859cf1b7 100644
--- a/docs/src/howto-guides/README.md
+++ b/docs/src/howto-guides/README.md
@@ -15,4 +15,4 @@
13. [How to access the logs on the device?](./014_thin_edge_logs.md)
14. [How to install thin-edge.io on any Linux OS (no deb support)?](./015_installation_without_deb_support.md)
16. [How to manage apama software artefacts with apama plugin?](./017_apama_software_management_plugin.md)
-17. [How to enable watchdog using systemd?](./021_enable_tedge_watchdog_using_systemd.md) \ No newline at end of file
+17. [How to enable watchdog using systemd?](./021_enable_tedge_watchdog_using_systemd.md)