diff options
author | PradeepKiruvale <pradeepkumar.kj@softwareag.com> | 2022-04-07 14:03:27 +0530 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-04-07 14:03:27 +0530 |
commit | ba619f6bacca6d9f41aa21a0aa88f290d18798db (patch) | |
tree | 1b5a684a6d958b48e467437fa21b251aa0730e5e | |
parent | 7826f3d6aeb61e6b4ae0545da27659e438974cfd (diff) |
Handle errors and update docs (#1060)
-rw-r--r-- | Cargo.lock | 5 | ||||
-rw-r--r-- | crates/core/tedge_watchdog/Cargo.toml | 3 | ||||
-rw-r--r-- | crates/core/tedge_watchdog/src/systemd_watchdog.rs | 46 | ||||
-rw-r--r-- | docs/src/SUMMARY.md | 2 | ||||
-rw-r--r-- | docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md | 36 | ||||
-rw-r--r-- | docs/src/howto-guides/README.md | 2 |
6 files changed, 56 insertions, 38 deletions
@@ -1392,9 +1392,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.14" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +checksum = "6389c490849ff5bc16be905ae24bc913a9c8892e19b2341dbc175e14c341c2b8" dependencies = [ "cfg-if 1.0.0", "value-bag", @@ -2966,6 +2966,7 @@ dependencies = [ "tedge_utils", "thiserror", "tokio", + "tracing", ] [[package]] diff --git a/crates/core/tedge_watchdog/Cargo.toml b/crates/core/tedge_watchdog/Cargo.toml index 65fd3d59..14088179 100644 --- a/crates/core/tedge_watchdog/Cargo.toml +++ b/crates/core/tedge_watchdog/Cargo.toml @@ -21,4 +21,5 @@ freedesktop_entry_parser = "1.3.0" tedge_config = { path = "../../common/tedge_config" } tedge_utils = { path = "../../common/tedge_utils", features = ["logging"] } thiserror ="1.0.30" -tokio = { version = "1.12", features = ["sync", "time"] }
\ No newline at end of file +tokio = { version = "1.12", features = ["sync", "time"] } +tracing = { version = "0.1", features = ["attributes", "log"] } diff --git a/crates/core/tedge_watchdog/src/systemd_watchdog.rs b/crates/core/tedge_watchdog/src/systemd_watchdog.rs index 8a6ad4b6..77825103 100644 --- a/crates/core/tedge_watchdog/src/systemd_watchdog.rs +++ b/crates/core/tedge_watchdog/src/systemd_watchdog.rs @@ -1,10 +1,9 @@ use crate::error::WatchdogError; +use freedesktop_entry_parser::parse_entry; use futures::stream::FuturesUnordered; use futures::StreamExt; use mqtt_channel::{Config, Message, PubChannel, Topic}; use nanoid::nanoid; - -use freedesktop_entry_parser::parse_entry; use serde::{Deserialize, Serialize}; use std::time::Instant; use std::{ @@ -15,6 +14,7 @@ use tedge_config::{ ConfigRepository, ConfigSettingAccessor, MqttBindAddressSetting, MqttPortSetting, TEdgeConfigLocation, }; +use tracing::{error, info, warn}; #[derive(Serialize, Deserialize)] pub struct HealthStatus { @@ -33,7 +33,7 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err "tedge-agent", ]; - let watchdog_threads = FuturesUnordered::new(); + let watchdog_tasks = FuturesUnordered::new(); for service in tedge_services { match get_watchdog_sec(&format!("/lib/systemd/system/{service}.service")) { @@ -43,22 +43,25 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err let tedge_config_location = tedge_config::TEdgeConfigLocation::from_custom_root(tedge_config_dir.clone()); - watchdog_threads.push(tokio::spawn(async move { + watchdog_tasks.push(tokio::spawn(async move { monitor_tedge_service( tedge_config_location, service, &req_topic, &res_topic, - interval, + interval / 2, ) .await })); } - Err(_e) => continue, // Watchdog not enabled for this service + Err(_) => { + warn!("Watchdog is not enabled for {}", service); + continue; + } } } - futures::future::join_all(watchdog_threads).await; + 
futures::future::join_all(watchdog_tasks).await; Ok(()) } @@ -76,14 +79,14 @@ async fn monitor_tedge_service( let mut received = client.received; let mut publisher = client.published; - println!("Starting watchdog for {} service", name); + info!("Starting watchdog for {} service", name); loop { let message = Message::new(&Topic::new(req_topic)?, ""); let _ = publisher .publish(message) .await - .map_err(|e| eprintln!("Publish failed with error: {}", e)); + .map_err(|e| warn!("Publish failed with error: {}", e)); let start = Instant::now(); @@ -99,10 +102,14 @@ async fn monitor_tedge_service( } Ok(None) => {} Err(elapsed) => { - eprintln!("The {name} failed with {elapsed}"); + warn!("The {name} failed with {elapsed}"); } } - tokio::time::sleep(tokio::time::Duration::from_secs(interval) - start.elapsed()).await; + + let elapsed = start.elapsed(); + if elapsed < tokio::time::Duration::from_secs(interval) { + tokio::time::sleep(tokio::time::Duration::from_secs(interval) - elapsed).await; + } } } @@ -134,13 +141,16 @@ fn notify_systemd(pid: u32, status: &str) -> Result<ExitStatus, WatchdogError> { fn get_watchdog_sec(service_file: &str) -> Result<u64, WatchdogError> { let entry = parse_entry(service_file)?; if let Some(interval) = entry.section("Service").attr("WatchdogSec") { - interval.parse().map_err({ - eprintln!( - "Failed to parse the to WatchdogSec to integer from {}", - service_file - ); - WatchdogError::ParseWatchdogSecToInt - }) + match interval.parse::<u64>() { + Ok(i) => Ok(i), + Err(e) => { + error!( + "Failed to parse the to WatchdogSec to integer from {}", + service_file + ); + Err(WatchdogError::ParseWatchdogSecToInt(e)) + } + } } else { Err(WatchdogError::NoWatchdogSec { file: service_file.to_string(), diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 4f608c57..1bb4003f 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -34,7 +34,7 @@ - [How to use apama software management 
plugin](./howto-guides/017_apama_software_management_plugin.md) - [How to change temp path](./howto-guides/018_change_temp_path.md) - [How to use thin-edge.io with your preferred init system](./howto-guides/019_how_to_use_preferred_init_system.md) - - [How to enable watchdog using systemd?](./howto-guides/021_enable_tedge_watchdog_using_systemd.md) + - [How to enable systemd watchdog monitoring for tedge services?](./howto-guides/021_enable_tedge_watchdog_using_systemd.md) - [Developer Documentation](dev_doc.md) - [Architecture](architecture/README.md) diff --git a/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md b/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md index aedaa335..49c60fee 100644 --- a/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md +++ b/docs/src/howto-guides/021_enable_tedge_watchdog_using_systemd.md @@ -1,19 +1,16 @@ -# Watchdog feature using `systemd` in `thin-edge.io` +# Enabling systemd watchdog for thin-edge services ## Introduction -`Watchdog` feature is used to check the health of a service/process by constantly exchanging -the message between the watchdog process and the service/process that is being monitored. -When the process does not update its health status within a specified `time` period, then the watchdog process will -restart the service that is being monitored. +The systemd watchdog feature enables systemd to detect when a service is unhealthy or unresponsive and attempt to fix it by restarting that service. +To detect if a service is healthy or not, systemd relies on periodic health notifications from that service at regular intervals. +If the service fails to send that notification within a time threshold, then systemd will assume that service to be unhealthy and restart it. -This document shows how `thin-edge.io` services are managed using the systemd, then one can use `systemd` -feature to check the health of these services as well. 
This document provides -information about how to use `systemd` for checking the health of the services. +This document describes how the systemd watchdog mechanism can be enabled for thin-edge services. ## Enabling the `watchdog` feature in `systemd` -Enabling the `watchdog` feature in systemd for a `thin-edge.io` service (tedge_agent, tedge_mapper_c8y/az/collectd) +Enabling systemd `watchdog` for a `thin-edge.io` service (tedge_agent, tedge_mapper_c8y/az/collectd) using the `systemd` is a two-step process. ### Step 1: Enable the `watchdog` feature in the `systemd` service file @@ -37,18 +34,27 @@ RestartPreventExitStatus=255 WatchdogSec=5 ``` -> Note: The systemd service file usually present in `/lib/systemd/system/tedge-mapper-c8y.service`. +> Note: The systemd service files for tedge services are usually present +in the `/lib/systemd/system` directory, like `/lib/systemd/system/tedge-mapper-c8y.service`. ### Step 2: Start the `tedge-watchdog` service -Start the `watchdog` service as below. +The `tedge-watchdog` service is responsible for periodically checking the health of +all tedge services for which the watchdog feature is enabled, and sending systemd +watchdog notifications on their behalf to systemd. + +Start and enable the `tedge-watchdog` service as follows: + ```shell systemctl start tedge-watchdog.service -``` +systemctl enable tedge-watchdog.service +``` + +Now, the `tedge-watchdog` service will keep sending health check messages to the monitored services periodically within their configured `WatchdogSec` interval. + +The health check request for a service is published to the `tedge/health-check/<service-name>` topic and the health status response from that service is expected on the `tedge/health/<service-name>` topic. -Now the `tedge-watchdog` service will be keep sending health check messages for every `WatchdogSec/2` seconds. 
-Once the response is received from the particular service, the `watchdog` service will send the notification -to the systemd on behalf of the service. +Once the health status response is received from a particular service, the `tedge-watchdog` service will send the watchdog notification on behalf of that service to systemd. ## Debugging One can observe the message exchange between the `service` and the `watchdog` by subscribing to `tedge/health/#` and `tedge/health-check/#` topics. diff --git a/docs/src/howto-guides/README.md b/docs/src/howto-guides/README.md index e8c6b8b4..859cf1b7 100644 --- a/docs/src/howto-guides/README.md +++ b/docs/src/howto-guides/README.md @@ -15,4 +15,4 @@ 13. [How to access the logs on the device?](./014_thin_edge_logs.md) 14. [How to install thin-edge.io on any Linux OS (no deb support)?](./015_installation_without_deb_support.md) 16. [How to manage apama software artefatcs with apama plugin?](./017_apama_software_management_plugin.md) -17. [How to enable watchdog using systemd?](./021_enable_tedge_watchdog_using_systemd.md)
\ No newline at end of file +17. [How to enable watchdog using systemd?](./021_enable_tedge_watchdog_using_systemd.md) |