Skip to content

Commit 22cee7b

Browse files
authored
Merge pull request #141 from ytsssun/efa-support
2 parents 0e7cfc4 + d629a21 commit 22cee7b

File tree

9 files changed

+201
-3
lines changed

9 files changed

+201
-3
lines changed

sources/Cargo.lock

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sources/api/corndog/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ exclude = ["README.md"]
1111

1212
[dependencies]
1313
log.workspace = true
14+
num_cpus.workspace = true
15+
pciclient.workspace = true
1416
serde = { workspace = true, features = ["derive"] }
1517
serde_json.workspace = true
1618
simplelog.workspace = true
@@ -20,3 +22,6 @@ bottlerocket-modeled-types.workspace = true
2022

2123
[build-dependencies]
2224
generate-readme.workspace = true
25+
26+
[dev-dependencies]
27+
test-case.workspace = true

sources/api/corndog/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ It sets kernel-related settings, for example:
77
* sysctl values, based on key/value pairs in `settings.kernel.sysctl`
88
* lockdown mode, based on the value of `settings.kernel.lockdown`
99

10+
corndog also provides a settings generator for hugepages, subcommand "generate-hugepages-setting".
11+
1012
## Colophon
1113

1214
This text was generated from `README.tpl` using [cargo-readme](https://crates.io/crates/cargo-readme), and includes the rustdoc from `src/main.rs`.

sources/api/corndog/src/main.rs

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ corndog is a delicious way to get at the meat inside the kernels.
33
It sets kernel-related settings, for example:
44
* sysctl values, based on key/value pairs in `settings.kernel.sysctl`
55
* lockdown mode, based on the value of `settings.kernel.lockdown`
6+
7+
corndog also provides a settings generator for hugepages, subcommand "generate-hugepages-setting".
68
*/
79

810
use bottlerocket_modeled_types::{Lockdown, SysctlKey};
@@ -20,6 +22,10 @@ use std::{env, process};
2022
const SYSCTL_PATH_PREFIX: &str = "/proc/sys";
2123
const LOCKDOWN_PATH: &str = "/sys/kernel/security/lockdown";
2224
const DEFAULT_CONFIG_PATH: &str = "/etc/corndog.toml";
25+
const NR_HUGEPAGES_PATH_SYSCTL: &str = "/proc/sys/vm/nr_hugepages";
26+
/// Number of hugepages we will assign per core.
27+
/// See [`compute_hugepages_for_efa`] for more detail on the computation consideration.
28+
const HUGEPAGES_2MB_PER_CORE: u64 = 110;
2329

2430
/// Store the args we receive on the command line.
2531
struct Args {
@@ -45,20 +51,30 @@ fn run() -> Result<()> {
4551
SimpleLogger::init(args.log_level, LogConfig::default()).context(error::LoggerSnafu)?;
4652

4753
// If the user has kernel settings, apply them.
48-
let kernel = get_kernel_settings(args.config_path)?;
4954
match args.subcommand.as_ref() {
5055
"sysctl" => {
56+
let kernel = get_kernel_settings(args.config_path)?;
5157
if let Some(sysctls) = kernel.sysctl {
5258
debug!("Applying sysctls: {:#?}", sysctls);
5359
set_sysctls(sysctls);
5460
}
5561
}
5662
"lockdown" => {
63+
let kernel = get_kernel_settings(args.config_path)?;
5764
if let Some(lockdown) = kernel.lockdown {
5865
debug!("Setting lockdown: {:#?}", lockdown);
5966
set_lockdown(&lockdown)?;
6067
}
6168
}
69+
"generate-hugepages-setting" => {
70+
let hugepages_setting = generate_hugepages_setting()?;
71+
// We will only fail if we cannot serialize the output to JSON string.
72+
// sundog expects JSON-serialized output so that many types can be represented, allowing the
73+
// API model to use more accurate types.
74+
let output =
75+
serde_json::to_string(&hugepages_setting).context(error::SerializeJsonSnafu)?;
76+
println!("{}", output);
77+
}
6278
_ => usage_msg(format!("Unknown subcommand '{}'", args.subcommand)), // should be unreachable
6379
}
6480

@@ -107,6 +123,55 @@ where
107123
}
108124
}
109125

126+
/// Generate the hugepages setting for defaults.
127+
fn generate_hugepages_setting() -> Result<String> {
128+
// Check if customer has directly written to the nr_hugepage file.
129+
let mut hugepages = fs::read_to_string(NR_HUGEPAGES_PATH_SYSCTL)
130+
.map(check_for_existing_hugepages)
131+
.unwrap_or("0".to_string());
132+
133+
// Check for EFA and compute if necessary, only when hugepages is "0".
134+
if &hugepages == "0" && pciclient::is_efa_attached().unwrap_or(false) {
135+
// We will use [`num_cpus`] to get the number of cores for the compute.
136+
hugepages = compute_hugepages_for_efa(num_cpus::get());
137+
}
138+
Ok(hugepages)
139+
}
140+
141+
// Check if customer has directly written to the nr_hugepage file.
142+
//
143+
// This would be a rare case to hit, as customer would normally modify the hugepages value
144+
// via settings API. (It could happen with a custom variant if hugepages
145+
// are set via a sysctl.d drop-in, for example.)
146+
//
147+
// We expect the existing_hugepages_value to be valid numeric digits. Otherwise, we will
148+
// use "0" as default.
149+
fn check_for_existing_hugepages(existing_hugepages_value: String) -> String {
150+
match existing_hugepages_value.trim().parse::<u64>() {
151+
Ok(value) => {
152+
return value.to_string();
153+
}
154+
Err(err) => {
155+
warn!(
156+
"Failed to parse the existing hugepage value, using 0 as default. Error: {}",
157+
err
158+
);
159+
}
160+
}
161+
"0".to_string()
162+
}
163+
164+
/// Computation:
165+
/// - We need to allocate 110MB memory for each libfabric endpoint.
166+
/// - For optimal setup, Open MPI will open 2 libfabric endpoints each core.
167+
/// - The total number of hugepages will be set as (110MB * 2) * number_of_cores / hugepage_size
168+
/// - We will allocate default hugepage_size = 2MB.
169+
/// - The number of hugepage per core would be 110MB * 2 / 2MB = 110.
170+
fn compute_hugepages_for_efa(num_cores: usize) -> String {
171+
let number_of_hugepages = num_cores as u64 * HUGEPAGES_2MB_PER_CORE;
172+
number_of_hugepages.to_string()
173+
}
174+
110175
/// Sets the requested lockdown mode in the kernel.
111176
///
112177
/// The Linux kernel won't allow lowering the lockdown setting, but we want to allow users to
@@ -165,6 +230,7 @@ fn usage() -> ! {
165230
Subcommands:
166231
sysctl
167232
lockdown
233+
generate-hugepages-setting
168234
169235
Global arguments:
170236
--config-path PATH
@@ -207,7 +273,7 @@ fn parse_args(args: env::Args) -> Args {
207273
)
208274
}
209275

210-
"sysctl" | "lockdown" => subcommand = Some(arg),
276+
"sysctl" | "lockdown" | "generate-hugepages-setting" => subcommand = Some(arg),
211277

212278
_ => usage(),
213279
}
@@ -251,6 +317,9 @@ mod error {
251317
source: Box<toml::de::Error>,
252318
},
253319

320+
#[snafu(display("Error serializing to JSON: {}", source))]
321+
SerializeJson { source: serde_json::error::Error },
322+
254323
#[snafu(display(
255324
"Failed to change lockdown from '{}' to '{}': {}",
256325
current,
@@ -271,6 +340,8 @@ type Result<T> = std::result::Result<T, error::Error>;
271340

272341
#[cfg(test)]
273342
mod test {
343+
use test_case::test_case;
344+
274345
use super::*;
275346

276347
#[test]
@@ -305,4 +376,21 @@ mod test {
305376
parse_kernel_setting("none integrity confidentiality\n")
306377
);
307378
}
379+
380+
#[test]
fn test_compute_hugepages_for_efa() {
    // Two cores at 110 hugepages per core yields 220.
    assert_eq!(compute_hugepages_for_efa(2), "220")
}
386+
387+
#[test_case("".to_string(), "0".to_string())]
388+
#[test_case("0".to_string(), "0".to_string())]
389+
#[test_case("-1".to_string(), "0".to_string())]
390+
#[test_case("abc".to_string(), "0".to_string())]
391+
#[test_case("100".to_string(), "100".to_string())]
392+
fn test_check_for_existing_hugepages(existing_value: String, expected_hugepages: String) {
393+
let actual_hugepages = check_for_existing_hugepages(existing_value);
394+
assert_eq!(actual_hugepages, expected_hugepages);
395+
}
308396
}

sources/api/schnauzer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ log.workspace = true
2929
maplit.workspace = true
3030
models.workspace = true
3131
num_cpus.workspace = true
32+
pciclient.workspace = true
3233
percent-encoding.workspace = true
3334
pest.workspace = true
3435
pest_derive.workspace = true

sources/api/schnauzer/src/helpers/mod.rs

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ const IPV6_LOCALHOST: IpAddr = IpAddr::V6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))
125125

126126
const DEFAULT_ECS_METADATA_SERVICE_RPS: i32 = 40;
127127
const DEFAULT_ECS_METADATA_SERVICE_BURST: i32 = 60;
128+
/// We use -1 to indicate unlimited value for resource limits.
129+
const RLIMIT_UNLIMITED: i64 = -1;
128130

129131
/// Potential errors during helper execution
130132
mod error {
@@ -258,6 +260,9 @@ mod error {
258260
source: std::net::AddrParseError,
259261
},
260262

263+
#[snafu(display("Failed to check if EFA device is attached: {}", source))]
264+
CheckEfaFailure { source: pciclient::PciClientError },
265+
261266
#[snafu(display(
262267
"Expected an absolute URL, got '{}' in template '{}': '{}'",
263268
url_str,
@@ -1250,7 +1255,7 @@ pub fn oci_defaults(
12501255
runtime.get_capabilities(capabilities)
12511256
}
12521257
OciSpecSection::ResourceLimits => {
1253-
let rlimits = oci_spec_resource_limits(oci_defaults_values)?;
1258+
let rlimits = generate_oci_resource_limits(oci_defaults_values, EfaLspciDetector {})?;
12541259
rlimits
12551260
.iter()
12561261
.map(|(rlimit_type, values)| runtime.get_resource_limits(rlimit_type, values))
@@ -1308,12 +1313,42 @@ fn oci_spec_capabilities(value: &Value) -> Result<String, RenderError> {
13081313
/// This helper function generates the resource limits section of
13091314
/// the OCI runtime spec from the provided `value` parameter, which is
13101315
/// the settings data from the datastore (`settings.oci-defaults.resource-limits`).
1316+
fn generate_oci_resource_limits<T: EfaDetector>(
1317+
value: &Value,
1318+
efa_detector: T,
1319+
) -> Result<HashMap<OciDefaultsResourceLimitType, OciDefaultsResourceLimitV1>, RenderError> {
1320+
let mut rlimits = oci_spec_resource_limits(value)?;
1321+
if efa_detector.is_efa_attached()? {
1322+
// We need to increase the locked memory limits from the default 8096KB to unlimited
1323+
// to account for hugepages allocation.
1324+
rlimits
1325+
.entry(OciDefaultsResourceLimitType::MaxLockedMemory)
1326+
.or_insert(OciDefaultsResourceLimitV1 {
1327+
soft_limit: RLIMIT_UNLIMITED,
1328+
hard_limit: RLIMIT_UNLIMITED,
1329+
});
1330+
}
1331+
Ok(rlimits)
1332+
}
1333+
13111334
fn oci_spec_resource_limits(
13121335
value: &Value,
13131336
) -> Result<HashMap<OciDefaultsResourceLimitType, OciDefaultsResourceLimitV1>, RenderError> {
13141337
Ok(serde_json::from_value(value.clone())?)
13151338
}
13161339

1340+
/// Abstraction over EFA (Elastic Fabric Adapter) device detection, allowing
/// tests to substitute a fake detector.
trait EfaDetector {
    /// Returns whether an EFA device is attached to this host.
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError>;
}

/// Production detector backed by `pciclient`.
struct EfaLspciDetector;

impl EfaDetector for EfaLspciDetector {
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError> {
        // Any pciclient error is surfaced as a CheckEfaFailure helper error.
        pciclient::is_efa_attached().context(error::CheckEfaFailureSnafu)
    }
}
1351+
13171352
// =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^=
13181353
// helpers to the helpers
13191354

@@ -1992,6 +2027,22 @@ mod test_oci_spec {
19922027
use serde_json::json;
19932028
use OciDefaultsResourceLimitType::*;
19942029

2030+
// Test double: always reports that an EFA device is attached.
struct EfaPresentDetector;
impl EfaDetector for EfaPresentDetector {
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError> {
        Ok(true)
    }
}

// Test double: always reports that no EFA device is attached.
struct EfaNotPresentDetector;
impl EfaDetector for EfaNotPresentDetector {
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError> {
        Ok(false)
    }
}
2045+
19952046
#[test]
19962047
fn oci_spec_capabilities_test() {
19972048
let json = json!({
@@ -2058,6 +2109,28 @@ mod test_oci_spec {
20582109
}
20592110
}
20602111

2112+
#[test]
fn generate_oci_resource_limits_efa_detected() {
    let settings = json!({"max-open-files": {"hard-limit": 1, "soft-limit": 2}});
    let rlimits = generate_oci_resource_limits(&settings, EfaPresentDetector {}).unwrap();
    let memlock = rlimits
        .get(&MaxLockedMemory)
        .expect("EFA detection should inject a max-locked-memory rlimit");
    // -1 (unlimited) renders as u64::MAX for both hard and soft limits.
    assert_eq!(
        Containerd::get_resource_limits(&MaxLockedMemory, memlock),
        r#"{ "type": "RLIMIT_MEMLOCK", "hard": 18446744073709551615, "soft": 18446744073709551615 }"#
    );
}
2125+
2126+
#[test]
fn generate_oci_resource_limits_efa_not_detected() {
    let settings = json!({"max-open-files": {"hard-limit": 1, "soft-limit": 2}});
    let rlimits = generate_oci_resource_limits(&settings, EfaNotPresentDetector {}).unwrap();
    // If EFA is not detected, no max-locked-memory rlimit is injected.
    assert!(rlimits.get(&MaxLockedMemory).is_none())
}
2133+
20612134
#[test]
20622135
fn oci_spec_max_locked_memory_as_unlimited_resource_limit_test() {
20632136
let json = json!({"max-locked-memory": {"hard-limit": "unlimited", "soft-limit": 18}});

sources/ghostdog/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ argh.workspace = true
1313
gptman.workspace = true
1414
hex-literal.workspace = true
1515
lazy_static.workspace = true
16+
pciclient.workspace = true
1617
signpost.workspace = true
1718
snafu.workspace = true
1819

sources/ghostdog/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Current version: 0.1.0
44

55
ghostdog is a tool to manage ephemeral disks.
66
It can be called as a udev helper program to identify ephemeral disks.
7+
It can also be called for EFA device detection, which can be used as an ExecCondition in systemd units.
78

89
## Colophon
910

0 commit comments

Comments
 (0)