From 86a28ac5728d4abf233dfff4ac50ec1f6f1d3c26 Mon Sep 17 00:00:00 2001 From: malucius-rh Date: Tue, 20 May 2025 23:39:23 -0400 Subject: [PATCH 1/5] PCP poc: add option --- general_setup | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/general_setup b/general_setup index ecf4f4e..2ddb568 100755 --- a/general_setup +++ b/general_setup @@ -66,6 +66,7 @@ gs_usage_info() echo " --sysname: name of the system running, used in determining config files. Defaults to hostname." echo " --test_verification : Runs the test verification. Information is in the test_verify file in the tests github" echo " --tuned_setting: used in naming the tar file, default for RHEL is the current active tuned. For non" echo " RHEL systems, default is none." echo " --usage: this usage message." + echo " --use_pcp: Enables use of Performance Co-Pilot in wrappers, defaults to 0." exit 1 @@ -91,6 +92,8 @@ to_sysname=`hostname` to_pstats="default" to_no_pkg_install=0 +to_use_pcp=0 + to_tuned_setting="" i=1 @@ -195,7 +198,12 @@ do --usage) gs_usage_info ;; + --use_pcp) + i=$((i + 1)) + to_use_pcp=1 + shift 1 + ;; --) break; ;; *) From 180137854372bf76063179ad9e6e2b6b7c2acba6 Mon Sep 17 00:00:00 2001 From: malucius-rh Date: Tue, 20 May 2025 23:46:43 -0400 Subject: [PATCH 2/5] Create README --- pcp/README | 1 + 1 file changed, 1 insertion(+) create mode 100644 pcp/README diff --git a/pcp/README b/pcp/README new file mode 100644 index 0000000..71219a9 --- /dev/null +++ b/pcp/README @@ -0,0 +1 @@ +This directory contains infrastructure for PCP support for the Zathras family of test wrappers. It's currently in "proof of concept" stage.
From 2ddc39b6c6939f5f2100cde5f34f4a494c35b183 Mon Sep 17 00:00:00 2001 From: malucius-rh Date: Tue, 20 May 2025 23:47:25 -0400 Subject: [PATCH 3/5] Add files via upload --- pcp/PCPrecord.service | 10 +++ pcp/PCPrecord_actions.sh | 166 +++++++++++++++++++++++++++++++++++++++ pcp/default.cfg | 76 ++++++++++++++++++ pcp/pcp_commands.inc | 40 ++++++++++ pcp/pcp_functions.inc | 114 +++++++++++++++++++++++++++ pcp/update_svc.sh | 23 ++++++ 6 files changed, 429 insertions(+) create mode 100644 pcp/PCPrecord.service create mode 100644 pcp/PCPrecord_actions.sh create mode 100644 pcp/default.cfg create mode 100644 pcp/pcp_commands.inc create mode 100644 pcp/pcp_functions.inc create mode 100644 pcp/update_svc.sh diff --git a/pcp/PCPrecord.service b/pcp/PCPrecord.service new file mode 100644 index 0000000..5b7559b --- /dev/null +++ b/pcp/PCPrecord.service @@ -0,0 +1,10 @@ +[Unit] +Description=PCP Recorder + +[Service] +Type=notify +WorkingDirectory=/usr/local/src/PCPrecord +ExecStart=/usr/local/src/PCPrecord/PCPrecord_actions.sh + +[Install] +WantedBy=multi-user.target diff --git a/pcp/PCPrecord_actions.sh b/pcp/PCPrecord_actions.sh new file mode 100644 index 0000000..69a28d3 --- /dev/null +++ b/pcp/PCPrecord_actions.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Executed by systemd service 'PCPrecord.service' +# See: /etc/systemd/system/PCPrecord.service +################################################################ + +# GLOBALS ################### +# Include the PCP Functions file +source $PWD/pcp_functions.inc + +FIFO="/tmp/pcpFIFO" # get from cmdline +sample_rate=5 # hardcode DEFAULT for now +pmlogger_running="false" # Initialize service as OFF +om_workload_file="/tmp/openmetrics_workload.txt" + +############################# +# Functions ################# +update_om_workload() { +# Removes existing and Writes a new file +# Called by 'reset_om_metrics()', below + + # Check for proper number of args + if [ "$#" -ne 6 ]; then + echo "ERROR on number of parameters in ${FUNCNAME}" + 
exit 2 + else + v_iter_cnt=$1 + v_running=$2 + v_numthreads=$3 + v_runtime=$4 + v_throughput=$5 + v_latency=$6 + fi + + # Prepare for an update to the $om_workload_file (GLOBAL) + rm -f $om_workload_file + touch $om_workload_file + # Update metrics in the openmetric.workload file + printf "iteration %d\n" "$v_iter_cnt">>$om_workload_file + printf "running %d\n" "$v_running">>$om_workload_file + printf "numthreads %d\n" "$v_numthreads">>$om_workload_file + echo "runtime ${v_runtime}">>$om_workload_file + echo "throughput ${v_throughput}">>$om_workload_file + echo "latency ${v_latency}">>$om_workload_file +} + +reset_om_metrics() { + # Initialize openmetric.workload metric values + r_iteration=0 ; r_running=0 + r_numthreads=0 ; r_runtime="NaN" ; r_throughput="NaN" ; r_latency="NaN" + + # Update the openmetrics.workload + update_om_workload "$r_iteration" "$r_running" \ + "$r_numthreads" "$r_runtime" "$r_throughput" "$r_latency" +} + +error_exit() { + if [ "$?" != "0" ]; then + systemd-notify --status="ERROR: $1" + # Additional error handling logic can be added here + rm -f "$FIFO" + # Reset openmetric.workload metric values prior to leaving + reset_om_metrics +## if pmlogger_running = True then attempt forcible STOP? + exit 1 + fi +} +# END Functions ################# + +# Main ################# +# Initialize openmetric.workload metric values +reset_om_metrics + +# Verify required files and Packages are available +#---------------------------------- +test -f "${om_workload_file}" +error_exit "Initialization: ${om_workload_file} not found!" + +# Remove and recreate FIFO on every service 'start' +rm -f "$FIFO" +mkfifo "$FIFO" +error_exit "Initialization: mkfifo $FIFO failed" + +## DEBUG - measure processing interval: $postaction-$preaction +action='NONE' +interval=0.0 + +# Infinite Loop ################# +# Read FIFO and perform requested ACTION (start, stop, ...)
+# Access each word in $action string for parsing 'actions' & 'metric' +# NOTE: 'Start, Stop, Reset' actions have no metrics +while : ; do + # Required or we get TIMEOUT on 'read action < "$FIFO" ' + # Signal readiness for next $action. SYNC point w/client Workload + # Report timing interval for most recent ACTION + systemd-notify --ready --status="READY: last-action - $action = ${interval}ms" + # Read the Request/'$action' and then process it + read action < "$FIFO" # Blocks until data is available + # Signal busy Processing this $action + systemd-notify --status="$action PMLOGGER Request" + action_arr=($action) # Array of 'words' in Request read from FIFO +## DEBUG - measure processing interval for ACTION: $postaction-$preaction + preaction=$(mark_ms) + case "${action_arr[0]}" in + Start) # 'Start $archive_dir $test_name $conf_file' + archive_dir="${action_arr[1]}" + archive_name="${action_arr[2]}" + conf_file="${action_arr[3]}" + # Start PMLOGGER to create ARCHIVE + if [ "$pmlogger_running" = "false" ]; then + # Signal Processing this $action + systemd-notify --status="DEBUG: $action PMLOGGER Request" + # These functions attempt to catch errors and verify success + pcp_verify $conf_file + error_exit "pcp_verify: Unable to start PMLOGGER" + pcp_start $conf_file $sample_rate $archive_dir $archive_name + error_exit "pcp_start: Unable to start PMLOGGER" + pmlogger_running="true" # Record this STATE info + fi + ;; + Stop) # artifacts_dir="${action_arr[1]}" + # Terminate PMLOGGER + if [ "$pmlogger_running" = "true" ]; then + # Will ZATHRAS Store PCP Archive related artifacts ? 
+ # - Currently Missing from PCPSTOP logic + ##pcp_stop "${artifacts_dir}" + pcp_stop + error_exit "pcp_stop: Unable to stop PMLOGGER" + pmlogger_running="false" + fi + ;; + Reset) # om_workload_file="${action_arr[1]}" + # RESET the Workload Metrics + # the only Request that doesn't require $pmlogger_running + reset_om_metrics + error_exit "reset_om_metrics: Unable to RESET Workload Metrics" + ;; + throughput|latency|numthreads|runtime) # Workload Metrics + # metric="${action_arr[1]}" om_workload_file=$2 + if [ "$pmlogger_running" = "true" ]; then + # Forward workload metric to openmetrics_workload.txt + # Change only one metric line at a time + # Replaces the entire line using sed + # Should I only print 'action_arr[0] & action_arr[1]' + sed -i "s/^.*${action_arr[0]}.*$/${action}/" "$om_workload_file" + fi + ;; + running|iteration) # Workload States + # state="${action_arr[1]}" om_workload_file=$2 + if [ "$pmlogger_running" = "true" ]; then + sed -i "s/^.*${action_arr[0]}.*$/${action}/" "$om_workload_file" + fi + ;; + *) + systemd-notify --status="Unrecognized action - IGNORED" + ;; + esac +## DEBUG - measure time interval for processing ACTION + postaction=$(mark_ms) + interval=$(( 10*(postaction - preaction) )) +done + +# Cleanup +echo "Cleaning up" + +# Reset openmetric.workload metric values prior to leaving +reset_om_metrics diff --git a/pcp/default.cfg b/pcp/default.cfg new file mode 100644 index 0000000..8cc4bc9 --- /dev/null +++ b/pcp/default.cfg @@ -0,0 +1,76 @@ +#pmlogconf 2.0 +# +## Workload Metrics - hardcoded sampling rate +log advisory on 1 second { + openmetrics.workload + openmetrics.control.fetch_time +} + +## Intel RAPL & RFchassis metrics +log advisory on default { +# denki.rapl + openmetrics.RFchassis +} + +## platform, filesystem and hardware configuration +log advisory on once { + hinv + kernel.uname + filesys.mountdir + filesys.uuid + filesys.type + filesys.blocksize + filesys.capacity +} + +#+ tools/htop:y:default: +## metrics used by the 
htop command +log advisory on default { +# disk.all.read_bytes +# disk.all.write_bytes +# disk.all.avactive +# hinv.cpu.clock + kernel.all.load + kernel.all.uptime + kernel.all.cpu.user + kernel.all.cpu.nice + kernel.all.cpu.sys + kernel.all.cpu.idle + kernel.all.cpu.wait.total + kernel.all.cpu.intr + kernel.all.cpu.irq.soft + kernel.all.cpu.steal + kernel.all.cpu.guest + kernel.all.cpu.guest_nice +# kernel.all.pressure.cpu.some.avg +# kernel.all.pressure.io.some.avg +# kernel.all.pressure.io.full.avg +# kernel.all.pressure.memory.some.avg +# kernel.all.pressure.memory.full.avg +# kernel.percpu.cpu.user +# kernel.percpu.cpu.nice +# kernel.percpu.cpu.sys +# kernel.percpu.cpu.idle +# kernel.percpu.cpu.wait.total +# kernel.percpu.cpu.intr +# kernel.percpu.cpu.irq.soft +# kernel.percpu.cpu.steal +# kernel.percpu.cpu.guest +# kernel.percpu.cpu.guest_nice + mem.util.available + mem.util.free + mem.util.bufmem + mem.util.cached + mem.util.shmem + mem.util.slabReclaimable + mem.util.swapCached + mem.util.swapTotal + mem.util.swapFree + network.all.in.bytes + network.all.out.bytes + network.all.in.packets + network.all.out.packets +# zram.capacity +# zram.mm_stat.data_size.original +# zram.mm_stat.data_size.compressed +} diff --git a/pcp/pcp_commands.inc b/pcp/pcp_commands.inc new file mode 100644 index 0000000..9fd9a5c --- /dev/null +++ b/pcp/pcp_commands.inc @@ -0,0 +1,40 @@ +FIFO="/tmp/pcpFIFO" + +#Sets up and starts the PCP service +setup_pcp() { + working_dir="/usr/local/src/PCPrecord" + + mkdir -p "${working_dir}" + chmod 755 ${TOOLS_BIN}/pcp/*.sh + cp ${TOOLS_BIN}/pcp/PCPrecord.service /etc/systemd/system/. + cp ${TOOLS_BIN}/pcp/PCPrecord_actions.sh "${working_dir}/." + cp ${TOOLS_BIN}/pcp/pcp_functions.inc "${working_dir}/." + + # Stop and then Restart the service + systemctl stop PCPrecord.service + systemctl stop pmcd + sleep 1 + systemctl daemon-reload + sleep 1 + # WHY is this issuing warning to run 'systemctl daemon-reload'? 
+ systemctl start PCPrecord.service + systemctl start pmcd + sleep 1 +} + +#Starts PCP +#Sends "Start" command to PCPrecord_actions +#Takes three args: +#1: Directory for PCP data, should be with the workload's own data +#2: Test name +#3: PMLogger config file to use +start_pcp() { + echo "Start ${1} ${2} ${3}\n" + printf "Start ${1} ${2} ${3}\n" > $FIFO +} + +#Stops PCP +#Sends "Stop" command to PCPrecord_actions +stop_pcp() { + printf "Stop\n" > $FIFO +} diff --git a/pcp/pcp_functions.inc b/pcp/pcp_functions.inc new file mode 100644 index 0000000..be01385 --- /dev/null +++ b/pcp/pcp_functions.inc @@ -0,0 +1,114 @@ +#!/bin/bash +# Collection of utility Functions for working with Perf Co-Pilot +# - pcp_verify($cfg_file) +# - pcp_start($cfg_file, $sample_rate, $archive_dir, $archive_name) +# - pcp_stop() +# +# NOTE: use of these Functions require that PCP is already installed on the system +################################################################################## + +# Global VARs +primary_pmlogger="false" # used to manage stop/restart +pmlogger_killed="success" # flags if private pmlogger was killed +#--------------------------------------------------------------- + +pcp_verify() +{ + local cfg_file="$1" # PMLOGGER Configuration File +# Verify user provided pmlogger.conf file exists. If not abort. + if [ ! -f "$cfg_file" ]; then + echo "File $cfg_file not found!"; echo + exit 20 + fi + +# TBD: use 'pmlogger -c $PWD/$cfg_file -C' to Verify syntax + +# Verify PMCD is running (pcp-zeroconf is installed) + systemctl is-active --quiet pmcd + if [ $? != 0 ]; then + echo "PCP pmcd is not running. Is PCP installed?" + echo "Suggested syntax: sudo dnf install pcp-zeroconf"; echo + exit 21 + fi + +# Manage primary pmlogger. STOP if it is running. +# Check if primary pmlogger is running + systemctl is-active --quiet pmlogger + if [ $? == 0 ]; then + echo "Primary PCP pmlogger is running. 
Being stopped to run script" + systemctl stop pmlogger + # Flag indicates primary pmlogger should be restarted by 'pcp_stop' + primary_pmlogger="true" + fi +} + +pcp_start() +{ + echo "PCP Starting private pmlogger" + + local cfg_file="$1" + local sample_rate="$2" + local archive_dir="$3" + local archive_basename="$4" + local archive_loc="${archive_dir}/${archive_basename}" + local pmlogger_log="${archive_loc}.log" + + mkdir -p "${archive_dir}" + +# Run PCP pmlogger +# JTH - VERIFY success, ensure pmlogger starts + pmlogger -c "${cfg_file}" -t "$sample_rate" -l "${pmlogger_log}"\ + "${archive_loc}" & + +# First check that the pmlogger process is running + timeout 5 bash -c \ + "until pgrep pmlogger>/dev/null; do sleep 0.5; done" + # Trap timeout condition + if [ $? -eq 124 ]; then + echo "Timed out waiting for PMLOGGER to Start1" + exit 30 + fi +# Now check that PMLOGGER has started logging +# timeout 5 bash -c \ +# "until grep -q "Starting logger " ${pmlogger_log}; do sleep 0.5; done" +# # Trap timeout condition +# if [ $? -eq 124 ]; then +# echo "Timed out waiting for PMLOGGER to Start2" +# exit 31 +# fi +} + +pcp_stop() +{ + echo "PCP Stop. Stopping private pmlogger, creating archive" + +# Stop PCP logger and pause for pmlogger to write archive + pkill -USR1 pmlogger +# Now check that PMLOGGER has stopped logging + timeout 5 bash -c \ + "until ! pgrep pmlogger>/dev/null; do sleep 0.5; done" + # Trap timeout condition + if [ $? 
-eq 124 ]; then + echo "Timed out waiting for PMLOGGER to Stop" + pmlogger_killed="failed" # Not used -yet + exit 40 + fi + pmlogger_killed="success" + +# Restore primary pmlogger, if it was previously running + if [ "$primary_pmlogger" == "true" ]; then + if [ "$pmlogger_killed" == "success" ]; then +#DEBUG echo "Primary PCP pmlogger being restored to original run state" + systemctl start pmlogger +#DEBUG else +#DEBUG echo "Primary PCP pmlogger NOT restored to original run state" + fi + fi +} + +mark_ms() { + read up rest Date: Thu, 29 May 2025 09:32:31 -0400 Subject: [PATCH 4/5] Add PCP shutdown sequence --- pcp/pcp_commands.inc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pcp/pcp_commands.inc b/pcp/pcp_commands.inc index 9fd9a5c..b51faed 100644 --- a/pcp/pcp_commands.inc +++ b/pcp/pcp_commands.inc @@ -38,3 +38,9 @@ start_pcp() { stop_pcp() { printf "Stop\n" > $FIFO } + +#Shut the services down +shutdown_pcp() { + systemctl stop PCPrecord.service + systemctl stop pmcd +} From 99d6edba80402f3a3da693bd0a0bb31072c25d33 Mon Sep 17 00:00:00 2001 From: malucius-rh Date: Wed, 4 Jun 2025 08:29:06 -0400 Subject: [PATCH 5/5] Adding openmetrics resets and "send load result to pcp archive" --- pcp/pcp_commands.inc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pcp/pcp_commands.inc b/pcp/pcp_commands.inc index b51faed..61c24f0 100644 --- a/pcp/pcp_commands.inc +++ b/pcp/pcp_commands.inc @@ -23,16 +23,27 @@ setup_pcp() { } #Starts PCP +#Resets openmetrics #Sends "Start" command to PCPrecord_actions #Takes three args: #1: Directory for PCP data, should be with the workload's own data #2: Test name #3: PMLogger config file to use start_pcp() { + printf "Reset\n" > $FIFO + echo "PCP metrics reset" + #Without the sleep the "Start" will be missed + sleep 2 echo "Start ${1} ${2} ${3}\n" printf "Start ${1} ${2} ${3}\n" > $FIFO } +#Sends value to PCP archive as "throughput" +#Uses openmetrics +result2pcp() { + printf "throughput ${1}\n" > $FIFO +} +
#Stops PCP #Sends "Stop" command to PCPrecord_actions stop_pcp() { @@ -44,3 +55,10 @@ shutdown_pcp() { systemctl stop PCPrecord.service systemctl stop pmcd } + +#Reset the openmetrics file +reset_pcp_om () { + printf "Reset\n" > $FIFO + echo "PCP metrics reset" + sleep 2 +}