Skip to content

Commit 22cee7b

Browse files
authored
Merge pull request #141 from ytsssun/efa-support
2 parents 0e7cfc4 + d629a21 commit 22cee7b

File tree

9 files changed

+201
-3
lines changed

9 files changed

+201
-3
lines changed

sources/Cargo.lock

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sources/api/corndog/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ exclude = ["README.md"]
1111

1212
[dependencies]
1313
log.workspace = true
14+
num_cpus.workspace = true
15+
pciclient.workspace = true
1416
serde = { workspace = true, features = ["derive"] }
1517
serde_json.workspace = true
1618
simplelog.workspace = true
@@ -20,3 +22,6 @@ bottlerocket-modeled-types.workspace = true
2022

2123
[build-dependencies]
2224
generate-readme.workspace = true
25+
26+
[dev-dependencies]
27+
test-case.workspace = true

sources/api/corndog/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ It sets kernel-related settings, for example:
77
* sysctl values, based on key/value pairs in `settings.kernel.sysctl`
88
* lockdown mode, based on the value of `settings.kernel.lockdown`
99

10+
corndog also provides a settings generator for hugepages, subcommand "generate-hugepages-setting".
11+
1012
## Colophon
1113

1214
This text was generated from `README.tpl` using [cargo-readme](https://crates.io/crates/cargo-readme), and includes the rustdoc from `src/main.rs`.

sources/api/corndog/src/main.rs

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ corndog is a delicious way to get at the meat inside the kernels.
33
It sets kernel-related settings, for example:
44
* sysctl values, based on key/value pairs in `settings.kernel.sysctl`
55
* lockdown mode, based on the value of `settings.kernel.lockdown`
6+
7+
corndog also provides a settings generator for hugepages, subcommand "generate-hugepages-setting".
68
*/
79

810
use bottlerocket_modeled_types::{Lockdown, SysctlKey};
@@ -20,6 +22,10 @@ use std::{env, process};
2022
const SYSCTL_PATH_PREFIX: &str = "/proc/sys";
2123
const LOCKDOWN_PATH: &str = "/sys/kernel/security/lockdown";
2224
const DEFAULT_CONFIG_PATH: &str = "/etc/corndog.toml";
25+
const NR_HUGEPAGES_PATH_SYSCTL: &str = "/proc/sys/vm/nr_hugepages";
26+
/// Number of hugepages we will assign per core.
27+
/// See [`compute_hugepages_for_efa`] for more detail on the computation consideration.
28+
const HUGEPAGES_2MB_PER_CORE: u64 = 110;
2329

2430
/// Store the args we receive on the command line.
2531
struct Args {
@@ -45,20 +51,30 @@ fn run() -> Result<()> {
4551
SimpleLogger::init(args.log_level, LogConfig::default()).context(error::LoggerSnafu)?;
4652

4753
// If the user has kernel settings, apply them.
48-
let kernel = get_kernel_settings(args.config_path)?;
4954
match args.subcommand.as_ref() {
5055
"sysctl" => {
56+
let kernel = get_kernel_settings(args.config_path)?;
5157
if let Some(sysctls) = kernel.sysctl {
5258
debug!("Applying sysctls: {:#?}", sysctls);
5359
set_sysctls(sysctls);
5460
}
5561
}
5662
"lockdown" => {
63+
let kernel = get_kernel_settings(args.config_path)?;
5764
if let Some(lockdown) = kernel.lockdown {
5865
debug!("Setting lockdown: {:#?}", lockdown);
5966
set_lockdown(&lockdown)?;
6067
}
6168
}
69+
"generate-hugepages-setting" => {
70+
let hugepages_setting = generate_hugepages_setting()?;
71+
// We will only fail if we cannot serialize the output to JSON string.
72+
// sundog expects JSON-serialized output so that many types can be represented, allowing the
73+
// API model to use more accurate types.
74+
let output =
75+
serde_json::to_string(&hugepages_setting).context(error::SerializeJsonSnafu)?;
76+
println!("{}", output);
77+
}
6278
_ => usage_msg(format!("Unknown subcommand '{}'", args.subcommand)), // should be unreachable
6379
}
6480

@@ -107,6 +123,55 @@ where
107123
}
108124
}
109125

126+
/// Generate the hugepages setting for defaults.
127+
fn generate_hugepages_setting() -> Result<String> {
128+
// Check if customer has directly written to the nr_hugepage file.
129+
let mut hugepages = fs::read_to_string(NR_HUGEPAGES_PATH_SYSCTL)
130+
.map(check_for_existing_hugepages)
131+
.unwrap_or("0".to_string());
132+
133+
// Check for EFA and compute if necessary, only when hugepages is "0".
134+
if &hugepages == "0" && pciclient::is_efa_attached().unwrap_or(false) {
135+
// We will use [`num_cpus`] to get the number of cores for the compute.
136+
hugepages = compute_hugepages_for_efa(num_cpus::get());
137+
}
138+
Ok(hugepages)
139+
}
140+
141+
// Check if customer has directly written to the nr_hugepage file.
142+
//
143+
// This would be a rare case to hit, as customer would normally modify the hugepages value
144+
// via settings API. (It could happen with a custom variant if hugepages
145+
// are set via a sysctl.d drop-in, for example.)
146+
//
147+
// We expect the existing_hugepages_value to be valid numeric digits. Otherwise, we will
148+
// use "0" as default.
149+
fn check_for_existing_hugepages(existing_hugepages_value: String) -> String {
150+
match existing_hugepages_value.trim().parse::<u64>() {
151+
Ok(value) => {
152+
return value.to_string();
153+
}
154+
Err(err) => {
155+
warn!(
156+
"Failed to parse the existing hugepage value, using 0 as default. Error: {}",
157+
err
158+
);
159+
}
160+
}
161+
"0".to_string()
162+
}
163+
164+
/// Computation:
165+
/// - We need to allocate 110MB memory for each libfabric endpoint.
166+
/// - For optimal setup, Open MPI will open 2 libfabric endpoints each core.
167+
/// - The total number of hugepages will be set as (110MB * 2) * number_of_cores / hugepage_size
168+
/// - We will allocate default hugepage_size = 2MB.
169+
/// - The number of hugepage per core would be 110MB * 2 / 2MB = 110.
170+
fn compute_hugepages_for_efa(num_cores: usize) -> String {
171+
let number_of_hugepages = num_cores as u64 * HUGEPAGES_2MB_PER_CORE;
172+
number_of_hugepages.to_string()
173+
}
174+
110175
/// Sets the requested lockdown mode in the kernel.
111176
///
112177
/// The Linux kernel won't allow lowering the lockdown setting, but we want to allow users to
@@ -165,6 +230,7 @@ fn usage() -> ! {
165230
Subcommands:
166231
sysctl
167232
lockdown
233+
generate-hugepages-setting
168234
169235
Global arguments:
170236
--config-path PATH
@@ -207,7 +273,7 @@ fn parse_args(args: env::Args) -> Args {
207273
)
208274
}
209275

210-
"sysctl" | "lockdown" => subcommand = Some(arg),
276+
"sysctl" | "lockdown" | "generate-hugepages-setting" => subcommand = Some(arg),
211277

212278
_ => usage(),
213279
}
@@ -251,6 +317,9 @@ mod error {
251317
source: Box<toml::de::Error>,
252318
},
253319

320+
#[snafu(display("Error serializing to JSON: {}", source))]
321+
SerializeJson { source: serde_json::error::Error },
322+
254323
#[snafu(display(
255324
"Failed to change lockdown from '{}' to '{}': {}",
256325
current,
@@ -271,6 +340,8 @@ type Result<T> = std::result::Result<T, error::Error>;
271340

272341
#[cfg(test)]
273342
mod test {
343+
use test_case::test_case;
344+
274345
use super::*;
275346

276347
#[test]
@@ -305,4 +376,21 @@ mod test {
305376
parse_kernel_setting("none integrity confidentiality\n")
306377
);
307378
}
379+
380+
#[test]
fn test_compute_hugepages_for_efa() {
    // Two cores at 110 hugepages per core yields 220.
    assert_eq!(compute_hugepages_for_efa(2), "220")
}
386+
387+
#[test_case("".to_string(), "0".to_string())]
388+
#[test_case("0".to_string(), "0".to_string())]
389+
#[test_case("-1".to_string(), "0".to_string())]
390+
#[test_case("abc".to_string(), "0".to_string())]
391+
#[test_case("100".to_string(), "100".to_string())]
392+
fn test_check_for_existing_hugepages(existing_value: String, expected_hugepages: String) {
393+
let actual_hugepages = check_for_existing_hugepages(existing_value);
394+
assert_eq!(actual_hugepages, expected_hugepages);
395+
}
308396
}

sources/api/schnauzer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ log.workspace = true
2929
maplit.workspace = true
3030
models.workspace = true
3131
num_cpus.workspace = true
32+
pciclient.workspace = true
3233
percent-encoding.workspace = true
3334
pest.workspace = true
3435
pest_derive.workspace = true

sources/api/schnauzer/src/helpers/mod.rs

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,8 @@ const IPV6_LOCALHOST: IpAddr = IpAddr::V6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))
125125

126126
const DEFAULT_ECS_METADATA_SERVICE_RPS: i32 = 40;
127127
const DEFAULT_ECS_METADATA_SERVICE_BURST: i32 = 60;
128+
/// We use -1 to indicate unlimited value for resource limits.
129+
const RLIMIT_UNLIMITED: i64 = -1;
128130

129131
/// Potential errors during helper execution
130132
mod error {
@@ -258,6 +260,9 @@ mod error {
258260
source: std::net::AddrParseError,
259261
},
260262

263+
#[snafu(display("Failed to check if EFA device is attached: {}", source))]
264+
CheckEfaFailure { source: pciclient::PciClientError },
265+
261266
#[snafu(display(
262267
"Expected an absolute URL, got '{}' in template '{}': '{}'",
263268
url_str,
@@ -1250,7 +1255,7 @@ pub fn oci_defaults(
12501255
runtime.get_capabilities(capabilities)
12511256
}
12521257
OciSpecSection::ResourceLimits => {
1253-
let rlimits = oci_spec_resource_limits(oci_defaults_values)?;
1258+
let rlimits = generate_oci_resource_limits(oci_defaults_values, EfaLspciDetector {})?;
12541259
rlimits
12551260
.iter()
12561261
.map(|(rlimit_type, values)| runtime.get_resource_limits(rlimit_type, values))
@@ -1308,12 +1313,42 @@ fn oci_spec_capabilities(value: &Value) -> Result<String, RenderError> {
13081313
/// This helper function generates the resource limits section of
13091314
/// the OCI runtime spec from the provided `value` parameter, which is
13101315
/// the settings data from the datastore (`settings.oci-defaults.resource-limits`).
1316+
fn generate_oci_resource_limits<T: EfaDetector>(
1317+
value: &Value,
1318+
efa_detector: T,
1319+
) -> Result<HashMap<OciDefaultsResourceLimitType, OciDefaultsResourceLimitV1>, RenderError> {
1320+
let mut rlimits = oci_spec_resource_limits(value)?;
1321+
if efa_detector.is_efa_attached()? {
1322+
// We need to increase the locked memory limits from the default 8096KB to unlimited
1323+
// to account for hugepages allocation.
1324+
rlimits
1325+
.entry(OciDefaultsResourceLimitType::MaxLockedMemory)
1326+
.or_insert(OciDefaultsResourceLimitV1 {
1327+
soft_limit: RLIMIT_UNLIMITED,
1328+
hard_limit: RLIMIT_UNLIMITED,
1329+
});
1330+
}
1331+
Ok(rlimits)
1332+
}
1333+
13111334
fn oci_spec_resource_limits(
13121335
value: &Value,
13131336
) -> Result<HashMap<OciDefaultsResourceLimitType, OciDefaultsResourceLimitV1>, RenderError> {
13141337
Ok(serde_json::from_value(value.clone())?)
13151338
}
13161339

1340+
/// Abstraction over EFA (Elastic Fabric Adapter) device detection, allowing
/// tests to substitute a fake detector.
trait EfaDetector {
    /// Returns whether an EFA device is attached to this host.
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError>;
}

/// Production detector backed by `pciclient`.
struct EfaLspciDetector;

impl EfaDetector for EfaLspciDetector {
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError> {
        // Any pciclient error is surfaced as a CheckEfaFailure helper error.
        pciclient::is_efa_attached().context(error::CheckEfaFailureSnafu)
    }
}
1351+
13171352
// =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^=
13181353
// helpers to the helpers
13191354

@@ -1992,6 +2027,22 @@ mod test_oci_spec {
19922027
use serde_json::json;
19932028
use OciDefaultsResourceLimitType::*;
19942029

2030+
// Test double: always reports that an EFA device is attached.
struct EfaPresentDetector;
impl EfaDetector for EfaPresentDetector {
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError> {
        Ok(true)
    }
}

// Test double: always reports that no EFA device is attached.
struct EfaNotPresentDetector;
impl EfaDetector for EfaNotPresentDetector {
    fn is_efa_attached(&self) -> Result<bool, TemplateHelperError> {
        Ok(false)
    }
}
2045+
19952046
#[test]
19962047
fn oci_spec_capabilities_test() {
19972048
let json = json!({
@@ -2058,6 +2109,28 @@ mod test_oci_spec {
20582109
}
20592110
}
20602111

2112+
#[test]
fn generate_oci_resource_limits_efa_detected() {
    let settings = json!({"max-open-files": {"hard-limit": 1, "soft-limit": 2}});
    let rlimits = generate_oci_resource_limits(&settings, EfaPresentDetector {}).unwrap();
    let memlock = rlimits
        .get(&MaxLockedMemory)
        .expect("EFA detection should inject a max-locked-memory rlimit");
    // -1 (unlimited) renders as u64::MAX for both hard and soft limits.
    assert_eq!(
        Containerd::get_resource_limits(&MaxLockedMemory, memlock),
        r#"{ "type": "RLIMIT_MEMLOCK", "hard": 18446744073709551615, "soft": 18446744073709551615 }"#
    );
}
2125+
2126+
#[test]
fn generate_oci_resource_limits_efa_not_detected() {
    let settings = json!({"max-open-files": {"hard-limit": 1, "soft-limit": 2}});
    let rlimits = generate_oci_resource_limits(&settings, EfaNotPresentDetector {}).unwrap();
    // If EFA is not detected, no max-locked-memory rlimit is injected.
    assert!(rlimits.get(&MaxLockedMemory).is_none())
}
2133+
20612134
#[test]
20622135
fn oci_spec_max_locked_memory_as_unlimited_resource_limit_test() {
20632136
let json = json!({"max-locked-memory": {"hard-limit": "unlimited", "soft-limit": 18}});

sources/ghostdog/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ argh.workspace = true
1313
gptman.workspace = true
1414
hex-literal.workspace = true
1515
lazy_static.workspace = true
16+
pciclient.workspace = true
1617
signpost.workspace = true
1718
snafu.workspace = true
1819

sources/ghostdog/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Current version: 0.1.0
44

55
ghostdog is a tool to manage ephemeral disks.
66
It can be called as a udev helper program to identify ephemeral disks.
7+
It can also be called for EFA device detection, which can be used as an ExecCondition in systemd units.
78

89
## Colophon
910

0 commit comments

Comments
 (0)