Remove relative speed setting, select endpoint by utilization instead

This patch removes the "speed" setting from the configuration. The setting was introduced to assign a relative speed to each endpoint, with the idea that the scheduler would then prefer faster nodes. Instead, the utilization of each endpoint is now calculated (the number of running jobs relative to the maximum number of jobs allowed on that endpoint), and the endpoint with the lowest utilization is selected.

Signed-off-by: Matthias Beyer <matthias.beyer@atos.net>
author: Matthias Beyer <matthias.beyer@atos.net> 2021-03-04 14:08:10 +0100
committer: Matthias Beyer <mail@beyermatthias.de> 2021-03-04 14:30:25 +0100
commit: 60a3fa633a33e315c1439a9f2436fcdb48da62ae (patch)
tree: aa10ed05e61781d0ef25d098ad5567f613bdeeab
parent: 248c28c0b882930908493af94f714ce4de3706ac (diff)
4 files changed, 13 insertions, 10 deletions
diff --git a/config.toml b/config.toml
index 9536234..15843de 100644
--- a/config.toml
+++ b/config.toml
@@ -174,12 +174,14 @@ verify_images_present = true
 name          = "testhostname"
 uri           = "http://0.0.0.0:8095" # the URI of the endpoint. Either http or socket path
 endpoint_type = "http" # either "http" or "socket"
-speed         = 1 # currently ignored, but required to be present
 
 # maximum number of jobs running on this endpoint.
 # Set this to a reasonable high number to be able to run a lot of small jobs.
 # For example, if you're compiling with `make -j 1`, this should at least be the
 # number of CPU cores, maybe a bit more (eg. (ncpu * 1.1))
+#
+# Also, if two nodes have the same number of running jobs, and a new job comes
+# in, the node with more "free slots" will be considered first.
 maxjobs       = 1
 
 
diff --git a/src/config/endpoint_config.rs b/src/config/endpoint_config.rs
index 12a588a..13db137 100644
--- a/src/config/endpoint_config.rs
+++ b/src/config/endpoint_config.rs
@@ -26,14 +26,6 @@ pub struct Endpoint {
     #[getset(get = "pub")]
     endpoint_type: EndpointType,
 
-    /// Relative speed to other endpoints
-    ///
-    /// So if you have two servers, one with 12 cores and one with 24, you want to set "1" for the
-    /// first and "2" for the second (or "12" for the first and "24" for the second - the ratio is
-    /// the thing here)!
-    #[getset(get_copy = "pub")]
-    speed: usize,
-
     /// Maximum number of jobs which are allowed on this endpoint
     #[getset(get_copy = "pub")]
     maxjobs: usize,
diff --git a/src/endpoint/configured.rs b/src/endpoint/configured.rs
index bbaaec6..ba1c54e 100644
--- a/src/endpoint/configured.rs
+++ b/src/endpoint/configured.rs
@@ -234,6 +234,13 @@ impl Endpoint {
     pub fn running_jobs(&self) -> usize {
         self.running_jobs.load(std::sync::atomic::Ordering::Relaxed)
     }
+
+    /// Super non-scientific utilization calculation for the endpoint
+    pub fn utilization(&self) -> f64 {
+        let max_jobs = self.num_max_jobs() as f64;
+        let run_jobs = self.running_jobs() as f64;
+        100.0 / max_jobs * run_jobs
+    }
 }
 
 pub struct EndpointHandle(Arc<Endpoint>);
diff --git a/src/endpoint/scheduler.rs b/src/endpoint/scheduler.rs
index 4f49034..34a79d7 100644
--- a/src/endpoint/scheduler.rs
+++ b/src/endpoint/scheduler.rs
@@ -110,7 +110,9 @@ impl EndpointScheduler {
                     trace!("Endpoint {} considered for scheduling job: {}", ep.name(), r);
                     r
                 })
-                .sorted_by(|ep1, ep2| ep1.running_jobs().cmp(&ep2.running_jobs()))
+                .sorted_by(|ep1, ep2| {
+                    ep1.utilization().partial_cmp(&ep2.utilization()).unwrap_or(std::cmp::Ordering::Equal)
+                })
                 .next();
 
             if let Some(endpoint) = ep {
author	Matthias Beyer <matthias.beyer@atos.net>	2021-03-04 14:08:10 +0100
committer	Matthias Beyer <mail@beyermatthias.de>	2021-03-04 14:30:25 +0100
commit	60a3fa633a33e315c1439a9f2436fcdb48da62ae (patch)
tree	aa10ed05e61781d0ef25d098ad5567f613bdeeab
parent	248c28c0b882930908493af94f714ce4de3706ac (diff)