summaryrefslogtreecommitdiffstats
path: root/crates
diff options
context:
space:
mode:
authorAlbin Suresh <albin.suresh@softwareag.com>2022-05-31 12:38:28 +0530
committerGitHub <noreply@github.com>2022-05-31 12:38:28 +0530
commitaacb5ff692e4ae0c6f70673e6aa9798b7d13cb71 (patch)
tree3d21f1c1eb134439801e110cf1f0aab61164eaa1 /crates
parent505b038e76e5ccf7da9c0323962f52ac734b76f8 (diff)
parent34a3f2f29ee2e6eaeadf5a1f41f85f841ef3d4bb (diff)
Merge PR #1170 Fix tedge watchdog timeout misalignment with monitored services
Fix tedge watchdog timeout misalignment with monitored services
Diffstat (limited to 'crates')
-rw-r--r--crates/core/tedge_agent/src/agent.rs4
-rw-r--r--crates/core/tedge_mapper/src/collectd/monitor.rs4
-rw-r--r--crates/core/tedge_mapper/src/core/mapper.rs7
-rw-r--r--crates/core/tedge_watchdog/Cargo.toml3
-rw-r--r--crates/core/tedge_watchdog/src/systemd_watchdog.rs96
5 files changed, 95 insertions, 19 deletions
diff --git a/crates/core/tedge_agent/src/agent.rs b/crates/core/tedge_agent/src/agent.rs
index b7d946b0..755fc7af 100644
--- a/crates/core/tedge_agent/src/agent.rs
+++ b/crates/core/tedge_agent/src/agent.rs
@@ -27,6 +27,7 @@ use tedge_config::{
TEdgeConfigLocation, TmpPathSetting, DEFAULT_LOG_PATH, DEFAULT_RUN_PATH,
};
use tedge_utils::file::create_directory_with_user_group;
+use time::OffsetDateTime;
use tokio::sync::Mutex;
use tracing::{debug, error, info, instrument, warn};
@@ -296,7 +297,8 @@ impl SmAgent {
topic if self.config.request_topics_health.accept_topic(topic) => {
let health_status = json!({
"status": "up",
- "pid": process::id()
+ "pid": process::id(),
+ "time": OffsetDateTime::now_utc().unix_timestamp(),
})
.to_string();
let health_message =
diff --git a/crates/core/tedge_mapper/src/collectd/monitor.rs b/crates/core/tedge_mapper/src/collectd/monitor.rs
index f927f887..b21f6dcf 100644
--- a/crates/core/tedge_mapper/src/collectd/monitor.rs
+++ b/crates/core/tedge_mapper/src/collectd/monitor.rs
@@ -3,6 +3,7 @@ use std::process;
use batcher::{BatchConfigBuilder, BatchDriver, BatchDriverInput, BatchDriverOutput, Batcher};
use mqtt_channel::{Connection, Message, QoS, SinkExt, StreamExt, Topic, TopicFilter};
use serde_json::json;
+use time::OffsetDateTime;
use tracing::{error, info, instrument};
use super::{batcher::MessageBatch, collectd::CollectdMessage, error::DeviceMonitorError};
@@ -109,7 +110,8 @@ impl DeviceMonitor {
if health_check_topics.accept(&message) {
let health_status = json!({
"status": "up",
- "pid": process::id()
+ "pid": process::id(),
+ "time": OffsetDateTime::now_utc().unix_timestamp(),
})
.to_string();
let health_message = Message::new(&health_status_topic, health_status);
diff --git a/crates/core/tedge_mapper/src/core/mapper.rs b/crates/core/tedge_mapper/src/core/mapper.rs
index 9309a31e..fb5713f4 100644
--- a/crates/core/tedge_mapper/src/core/mapper.rs
+++ b/crates/core/tedge_mapper/src/core/mapper.rs
@@ -7,6 +7,7 @@ use mqtt_channel::{
UnboundedSender,
};
use serde_json::json;
+use time::OffsetDateTime;
use tracing::{error, info, instrument};
const SYNC_WINDOW: Duration = Duration::from_secs(3);
@@ -133,7 +134,8 @@ impl Mapper {
if self.health_check_topics.accept(&message) {
let health_status = json!({
"status": "up",
- "pid": process::id()
+ "pid": process::id(),
+ "time": OffsetDateTime::now_utc().unix_timestamp(),
})
.to_string();
let health_message = Message::new(&self.health_status_topic, health_status);
@@ -242,7 +244,7 @@ mod tests {
let common_health_check_topic = "tedge/health-check";
let health_status = broker
.wait_for_response_on_publish(
- &common_health_check_topic,
+ common_health_check_topic,
"",
&health_topic,
Duration::from_secs(1),
@@ -252,6 +254,7 @@ mod tests {
let health_status: Value = serde_json::from_str(health_status.as_str())?;
assert_json_include!(actual: &health_status, expected: json!({"status": "up"}));
assert!(health_status["pid"].is_number());
+ assert!(health_status["time"].is_number());
Ok(())
}
diff --git a/crates/core/tedge_watchdog/Cargo.toml b/crates/core/tedge_watchdog/Cargo.toml
index 7374eb5c..da3647c0 100644
--- a/crates/core/tedge_watchdog/Cargo.toml
+++ b/crates/core/tedge_watchdog/Cargo.toml
@@ -28,5 +28,6 @@ freedesktop_entry_parser = "1.3.0"
tedge_config = { path = "../../common/tedge_config" }
tedge_utils = { path = "../../common/tedge_utils", features = ["logging"] }
thiserror ="1.0.30"
-tokio = { version = "1.12", features = ["sync", "time"] }
+time = { version = "0.3", features = ["formatting", "serde-well-known"] }
+tokio = { version = "1.12", features = ["sync", "time", "rt-multi-thread"] }
tracing = { version = "0.1", features = ["attributes", "log"] }
diff --git a/crates/core/tedge_watchdog/src/systemd_watchdog.rs b/crates/core/tedge_watchdog/src/systemd_watchdog.rs
index 77825103..8682a9c1 100644
--- a/crates/core/tedge_watchdog/src/systemd_watchdog.rs
+++ b/crates/core/tedge_watchdog/src/systemd_watchdog.rs
@@ -1,5 +1,6 @@
use crate::error::WatchdogError;
use freedesktop_entry_parser::parse_entry;
+use futures::channel::mpsc;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use mqtt_channel::{Config, Message, PubChannel, Topic};
@@ -14,12 +15,14 @@ use tedge_config::{
ConfigRepository, ConfigSettingAccessor, MqttBindAddressSetting, MqttPortSetting,
TEdgeConfigLocation,
};
-use tracing::{error, info, warn};
+use time::OffsetDateTime;
+use tracing::{debug, error, info, warn};
-#[derive(Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize)]
pub struct HealthStatus {
status: String,
pid: u32,
+ time: i64,
}
pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Error> {
@@ -49,7 +52,7 @@ pub async fn start_watchdog(tedge_config_dir: PathBuf) -> Result<(), anyhow::Err
service,
&req_topic,
&res_topic,
- interval / 2,
+ interval / 4,
)
.await
}));
@@ -90,19 +93,22 @@ async fn monitor_tedge_service(
let start = Instant::now();
- match tokio::time::timeout(tokio::time::Duration::from_secs(interval), received.next())
- .await
+ let request_timestamp = OffsetDateTime::now_utc().unix_timestamp();
+ match tokio::time::timeout(
+ tokio::time::Duration::from_secs(interval),
+ get_latest_health_status_message(request_timestamp, &mut received),
+ )
+ .await
{
- Ok(Some(msg)) => {
- let message = msg.payload_str()?;
-
- let p: HealthStatus = serde_json::from_str(message)?;
-
- notify_systemd(p.pid, "WATCHDOG=1")?;
+ Ok(health_status) => {
+ debug!(
+ "Sending notification for {} with pid: {}",
+ name, health_status.pid
+ );
+ notify_systemd(health_status.pid, "WATCHDOG=1")?;
}
- Ok(None) => {}
- Err(elapsed) => {
- warn!("The {name} failed with {elapsed}");
+ Err(_) => {
+ warn!("No health check response received from {name} in time");
}
}
@@ -113,6 +119,31 @@ async fn monitor_tedge_service(
}
}
+async fn get_latest_health_status_message(
+ request_timestamp: i64,
+ messages: &mut mpsc::UnboundedReceiver<Message>,
+) -> HealthStatus {
+ loop {
+ if let Some(message) = messages.next().await {
+ if let Ok(message) = message.payload_str() {
+ debug!("Health response received: {}", message);
+ if let Ok(health_status) = serde_json::from_str::<HealthStatus>(message) {
+ if health_status.time >= request_timestamp {
+ return health_status;
+ } else {
+ debug!(
+ "Ignoring stale health response: {:?} older than request time: {}",
+ health_status, request_timestamp
+ );
+ }
+ } else {
+ error!("Invalid health response received: {}", message);
+ }
+ }
+ }
+ }
+}
+
fn get_mqtt_config(
tedge_config_location: TEdgeConfigLocation,
client_id: &str,
@@ -157,3 +188,40 @@ fn get_watchdog_sec(service_file: &str) -> Result<u64, WatchdogError> {
})
}
}
+
+#[cfg(test)]
+mod tests {
+ use anyhow::Result;
+ use serde_json::json;
+
+ use super::*;
+
+ #[tokio::test]
+ async fn test_get_latest_health_status_message() -> Result<()> {
+ let (mut sender, mut receiver) = mpsc::unbounded::<Message>();
+ let health_topic = Topic::new("tedge/health/test-service").expect("Valid topic");
+
+ for x in 1..5i64 {
+ let health_status = json!({
+ "status": "up",
+ "pid": 123u32,
+ "time": x,
+ })
+ .to_string();
+ let health_message = Message::new(&health_topic, health_status);
+ sender.publish(health_message).await?;
+ }
+
+ let health_status = get_latest_health_status_message(3, &mut receiver).await;
+ assert_eq!(health_status.time, 3);
+
+ let timeout_error = tokio::time::timeout(
+ tokio::time::Duration::from_secs(1),
+ get_latest_health_status_message(5, &mut receiver),
+ )
+ .await;
+ assert!(timeout_error.is_err());
+
+ Ok(())
+ }
+}