1 files changed, 276 insertions, 0 deletions
diff --git a/src/app/data_harvester/processes/linux/process.rs b/src/app/data_harvester/processes/linux/process.rs
new file mode 100644
index 00000000..f2fc6327
--- /dev/null
+++ b/src/app/data_harvester/processes/linux/process.rs
@@ -0,0 +1,276 @@
+//! Linux process code for getting process data via `/proc/`.
+//! Based on the [procfs](https://github.com/eminence/procfs) crate.
+
+use std::{
+    fs::File,
+    io::{self, BufRead, BufReader, Read},
+    path::PathBuf,
+};
+
+use anyhow::anyhow;
+use libc::uid_t;
+use once_cell::sync::Lazy;
+use rustix::{
+    fd::OwnedFd,
+    fs::{Mode, OFlags},
+    path::Arg,
+};
+
+use crate::Pid;
+
+/// The memory page size in bytes, as reported by `rustix::param::page_size`.
+///
+/// The value is queried once, on first access, and then reused; it is the multiplier used to turn
+/// a page count into a byte count.
+static PAGESIZE: Lazy<u64> = Lazy::new(|| rustix::param::page_size() as u64);
+
+/// Pulls the next field out of `iter`.
+///
+/// # Errors
+///
+/// Returns an [`io::Error`] of kind [`io::ErrorKind::InvalidData`] if the iterator is exhausted.
+fn next_part<'a>(iter: &mut impl Iterator<Item = &'a str>) -> Result<&'a str, io::Error> {
+    match iter.next() {
+        Some(part) => Ok(part),
+        None => Err(io::ErrorKind::InvalidData.into()),
+    }
+}
+
+/// A wrapper around the data in `/proc/<PID>/stat`. For documentation, see [here](https://man7.org/linux/man-pages/man5/proc.5.html).
+///
+/// Note this does not necessarily get all fields, only the ones we use in bottom.
+/// Field positions quoted below are 1-based positions within the stat line.
+pub(crate) struct Stat {
+    /// The filename of the executable without parentheses (field 2).
+    pub comm: String,
+
+    /// The current process state, represented by a char (field 3).
+    /// Only the first character of the field is kept.
+    pub state: char,
+
+    /// The parent process PID (field 4).
+    pub ppid: Pid,
+
+    /// The amount of time this process has been scheduled in user mode in clock ticks (field 14).
+    pub utime: u64,
+
+    /// The amount of time this process has been scheduled in kernel mode in clock ticks (field 15).
+    pub stime: u64,
+
+    /// The resident set size, or the number of pages the process has in real memory (field 24).
+    /// This is a page count, not a byte count.
+    pub rss: u64,
+}
+
+impl Stat {
+    /// Parses the contents of an already-opened `/proc/<PID>/stat` file.
+    ///
+    /// Only the fields stored in [`Stat`] are extracted; every other field is skipped.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the file cannot be read, if the parenthesised `comm` field cannot be
+    /// located, or if any required field is missing or fails to parse. Malformed input never
+    /// panics.
+    fn from_file(mut f: File) -> anyhow::Result<Stat> {
+        // Since this is just one line, we can read it all at once. However, since it might have
+        // non-utf8 characters, we can't just use read_to_string.
+        let mut buffer = Vec::with_capacity(500);
+        f.read_to_end(&mut buffer)?;
+
+        let line = buffer.to_string_lossy();
+        let line = line.trim();
+
+        let (comm, rest) = {
+            let start_paren = line
+                .find('(')
+                .ok_or_else(|| anyhow!("start paren missing"))?;
+
+            // The process name may itself contain ')' characters, while every field after it is
+            // numeric or a single state char. The *last* ')' is therefore the real delimiter.
+            let end_paren = line
+                .rfind(')')
+                .ok_or_else(|| anyhow!("end paren missing"))?;
+
+            // Checked slicing: a truncated or otherwise malformed line becomes an error instead
+            // of an out-of-bounds (or non-char-boundary) panic.
+            let comm = line
+                .get(start_paren + 1..end_paren)
+                .ok_or_else(|| anyhow!("malformed comm field"))?;
+
+            // Skip the closing paren and the single separator that follows it.
+            let rest = line
+                .get(end_paren + 2..)
+                .ok_or_else(|| anyhow!("missing fields after comm"))?;
+
+            (comm.to_string(), rest)
+        };
+
+        let mut rest = rest.split(' ');
+        let state = next_part(&mut rest)?
+            .chars()
+            .next()
+            .ok_or_else(|| anyhow!("missing state"))?;
+
+        let ppid: Pid = next_part(&mut rest)?.parse()?;
+
+        // Skip 9 fields until utime (pgrp, session, tty_nr, tpgid, flags, minflt, cminflt, majflt, cmajflt).
+        let mut rest = rest.skip(9);
+
+        let utime: u64 = next_part(&mut rest)?.parse()?;
+        let stime: u64 = next_part(&mut rest)?.parse()?;
+
+        // Skip 8 fields until rss (cutime, cstime, priority, nice, num_threads, itrealvalue, starttime, vsize).
+        let mut rest = rest.skip(8);
+
+        let rss: u64 = next_part(&mut rest)?.parse()?;
+
+        Ok(Stat {
+            comm,
+            state,
+            ppid,
+            utime,
+            stime,
+            rss,
+        })
+    }
+
+    /// Returns the Resident Set Size in bytes, i.e. the page count multiplied by the page size.
+    ///
+    /// The multiplication saturates at `u64::MAX` rather than overflowing.
+    pub fn rss_bytes(&self) -> u64 {
+        self.rss.saturating_mul(*PAGESIZE)
+    }
+}
+
+/// A wrapper around the data in `/proc/<PID>/io`.
+///
+/// Note this does not necessarily get all fields, only the ones we use in bottom.
+pub(crate) struct Io {
+    /// The value of the `read_bytes:` entry; stays `0` if that entry was not found.
+    pub read_bytes: u64,
+    /// The value of the `write_bytes:` entry; stays `0` if that entry was not found.
+    pub write_bytes: u64,
+}
+
+impl Io {
+    /// Parses the `read_bytes:` and `write_bytes:` entries out of an already-opened
+    /// `/proc/<PID>/io` file.
+    ///
+    /// Reading is best-effort: a read error ends the scan, and whatever was parsed up to that
+    /// point is returned. Entries that are never seen are reported as `0`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the value of a recognised entry is not a valid `u64`.
+    fn from_file(f: File) -> anyhow::Result<Io> {
+        // Number of entries we look for. Make sure to update this if you want more fields!
+        const NUM_FIELDS: u16 = 2;
+
+        let mut read_fields: u16 = 0;
+        let mut line = String::new();
+        let mut reader = BufReader::new(f);
+
+        let mut read_bytes: u64 = 0;
+        let mut write_bytes: u64 = 0;
+
+        // Reusing one buffer saves us from doing a string allocation on each iteration compared
+        // to `lines()`.
+        while let Ok(bytes) = reader.read_line(&mut line) {
+            if bytes == 0 {
+                // End of file.
+                break;
+            }
+
+            let mut parts = line.split_whitespace();
+
+            // Pick the destination for this line's value, or move on if we don't track the entry.
+            let slot = match parts.next() {
+                Some("read_bytes:") => &mut read_bytes,
+                Some("write_bytes:") => &mut write_bytes,
+                _ => {
+                    line.clear();
+                    continue;
+                }
+            };
+
+            if let Some(value) = parts.next() {
+                *slot = value.parse::<u64>()?;
+                read_fields += 1;
+            }
+
+            // Quick short circuit if we read all required fields.
+            if read_fields == NUM_FIELDS {
+                break;
+            }
+
+            line.clear();
+        }
+
+        Ok(Io {
+            read_bytes,
+            write_bytes,
+        })
+    }
+}
+
+/// A wrapper around a Linux process operations in `/proc/<PID>`.
+///
+/// Core documentation based on [proc's manpages](https://man7.org/linux/man-pages/man5/proc.5.html).
+pub(crate) struct Process {
+    /// The process ID, parsed from the last component of the `/proc/<PID>` path or, failing
+    /// that, from the target of the path read as a symlink.
+    pub pid: Pid,
+    /// The owner (`st_uid`) of the `/proc/<PID>` directory, or `None` if it could not be stat'ed.
+    pub uid: Option<uid_t>,
+    /// Parsed contents of `/proc/<PID>/stat`. Construction fails if this cannot be obtained.
+    pub stat: Stat,
+    /// Parsed contents of `/proc/<PID>/io`, or the error hit while opening/parsing it.
+    pub io: anyhow::Result<Io>,
+    /// The NUL-separated arguments from `/proc/<PID>/cmdline` with empty entries removed, or the
+    /// error hit while obtaining them.
+    pub cmdline: anyhow::Result<Vec<String>>,
+}
+
+impl Process {
+    /// Builds a [`Process`] snapshot from a `/proc/<PID>` directory path.
+    ///
+    /// All fields are gathered eagerly, right here, so the result is a point-in-time view: keep
+    /// it only briefly, or rely on it only for data that is unlikely to change.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the directory cannot be opened (e.g. the process is gone or permissions are
+    /// insufficient), if no PID can be derived from the path, or if `stat` cannot be opened and
+    /// parsed. Failures for `io` and `cmdline` are stored in the respective fields instead.
+    pub(crate) fn from_path(pid_path: PathBuf) -> anyhow::Result<Process> {
+        // TODO: Pass in a buffer vec/string to share?
+
+        let dir_fd = rustix::fs::openat(
+            rustix::fs::cwd(),
+            &pid_path,
+            OFlags::PATH | OFlags::DIRECTORY | OFlags::CLOEXEC,
+            Mode::empty(),
+        )?;
+
+        // First choice: the final path component is the PID itself.
+        let pid_from_name = pid_path
+            .as_path()
+            .components()
+            .last()
+            .and_then(|name| name.to_string_lossy().parse::<Pid>().ok());
+
+        let pid = match pid_from_name {
+            Some(pid) => pid,
+            // Otherwise treat the path as a symlink and try to parse its target as the PID.
+            None => rustix::fs::readlinkat(rustix::fs::cwd(), &pid_path, vec![])
+                .ok()
+                .and_then(|target| target.to_string_lossy().parse::<Pid>().ok())
+                .ok_or_else(|| anyhow!("PID for {pid_path:?} was not found"))?,
+        };
+
+        // Owner of the directory; a failed fstat simply leaves the UID unknown.
+        let uid = rustix::fs::fstat(&dir_fd).ok().map(|md| md.st_uid);
+
+        // `open_at` pushes the child name onto the scratch path, so pop it between uses.
+        let mut scratch = pid_path.clone();
+        let cmdline = cmdline(&mut scratch, &dir_fd);
+        scratch.pop();
+        let stat = open_at(&mut scratch, "stat", &dir_fd).and_then(Stat::from_file)?;
+        scratch.pop();
+        let io = open_at(&mut scratch, "io", &dir_fd).and_then(Io::from_file);
+
+        Ok(Process {
+            pid,
+            uid,
+            stat,
+            io,
+            cmdline,
+        })
+    }
+}
+
+/// Reads `cmdline` under `root` and returns its arguments.
+///
+/// The file content is split on NUL bytes; empty pieces (such as the one after a trailing NUL)
+/// are dropped. Arguments are decoded lossily, so a non-UTF-8 byte is replaced rather than
+/// causing the whole command line to be discarded.
+///
+/// Like [`open_at`], this pushes the child name onto `root`; pop it afterwards if the buffer is
+/// going to be reused.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened or read.
+fn cmdline(root: &mut PathBuf, fd: &OwnedFd) -> anyhow::Result<Vec<String>> {
+    let mut file = open_at(root, "cmdline", fd)?;
+
+    // Read raw bytes: arguments are arbitrary byte strings and need not be valid UTF-8.
+    let mut buf = Vec::new();
+    file.read_to_end(&mut buf)?;
+
+    Ok(buf
+        .split(|&byte| byte == 0)
+        .filter(|arg| !arg.is_empty())
+        .map(|arg| String::from_utf8_lossy(arg).into_owned())
+        .collect())
+}
+
+/// Opens `child` under `root` read-only (with close-on-exec) through `openat` on `fd`.
+///
+/// To avoid allocating a new path, `child` is pushed directly onto `root`, and it is left there
+/// on return. Pop it yourself if you intend to reuse the buffer for another child.
+#[inline]
+fn open_at(root: &mut PathBuf, child: &str, fd: &OwnedFd) -> anyhow::Result<File> {
+    root.push(child);
+
+    let flags = OFlags::RDONLY | OFlags::CLOEXEC;
+    let opened = rustix::fs::openat(fd, root.as_path(), flags, Mode::empty())?;
+
+    Ok(opened.into())
+}