Skip to content

Commit 0197f35

Browse files
authored
adding PSI metrics fixing vm check (#40)
* adding PSI metrics fixing vm check * ignore pytype
1 parent b0a5cf5 commit 0197f35

7 files changed

+189
-40
lines changed

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ BINARY_COLLECT := perf-collect
1010
BINARY_POSTPROCESS := perf-postprocess
1111
default: dist
1212

13-
.PHONY: test default dist format format_check style_error_check check dist/version_file dist/$(SOURCE_PACKAGE)
13+
.PHONY: test default dist format format_check style_error_check pytype check dist/version_file dist/$(SOURCE_PACKAGE)
1414

1515
clean_dir:
1616
rm -rf build/*
@@ -78,6 +78,9 @@ style_error_check:
7878
# ignore long lines and conflicts with black, i.e., black wins
7979
flake8 *.py src --ignore=E501,W503,E203
8080

81-
check: format_check style_error_check
81+
pytype: *.py src/*.py
82+
pytype ./*.py
83+
84+
check: format_check style_error_check pytype
8285

8386
dist: check dist/$(PACKAGE_EXTERNAL)

_version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.2.13
1+
1.2.14

perf-collect.py

Lines changed: 69 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# SPDX-License-Identifier: BSD-3-Clause
66
###########################################################################################################
77

8+
import json
89
import logging
910
import os
1011
import platform
@@ -39,19 +40,14 @@ def write_metadata(
3940
muxinterval,
4041
thread,
4142
socket,
42-
metadata_only=False,
43+
psi,
4344
):
4445
tsc_freq = str(perf_helpers.get_tsc_freq())
4546
data = ""
4647
time_stamp = ""
4748
validate_file(outcsv)
4849
with open(outcsv, "r") as original:
4950
time_stamp = original.readline()
50-
if metadata_only and time_stamp.startswith("### META DATA ###"):
51-
logging.warning(
52-
"Not prepending metadata, already present in %s " % (outcsv)
53-
)
54-
return
5551
data = original.read()
5652
with open(outcsv, "w") as modified:
5753
modified.write("### META DATA ###,\n")
@@ -120,6 +116,7 @@ def write_metadata(
120116
modified.write("cpusets" + cpusets + ",\n")
121117
modified.write("Percore mode," + threadmode + ",\n")
122118
modified.write("Persocket mode," + socketmode + ",\n")
119+
modified.write("PSI," + json.dumps(psi) + "\n")
123120
modified.write("PerfSpect version," + perf_helpers.get_tool_version() + ",\n")
124121
modified.write("### PERF EVENTS ###" + ",\n")
125122
for e in collection_events:
@@ -137,6 +134,30 @@ def write_metadata(
137134
modified.write(data)
138135

139136

137+
def get_psi(pressure_dir="/proc/pressure"):
    """Return the cumulative stall totals for cpu, memory and io.

    The first line of each pressure file is read; it is expected to look like
    ``some avg10=0.00 avg60=0.00 avg300=0.00 total=12345``. The ``total``
    field is located by name, so extra, missing or reordered ``avg*`` fields
    do not affect the result.

    Args:
        pressure_dir: directory containing the ``cpu``, ``memory`` and ``io``
            pressure files. Defaults to the kernel's ``/proc/pressure``.

    Returns:
        A list of three strings, the ``total`` value of the first line of the
        cpu, memory and io files, in that order. Values are left as strings,
        exactly as read from the file.

    Raises:
        OSError: a pressure file could not be opened.
        ValueError: the first line of a pressure file has no ``total=`` field.
    """
    psi = []
    for resource in ["cpu", "memory", "io"]:
        path = os.path.join(pressure_dir, resource)
        with open(path) as f:
            fields = f.readline().split()
        # Look the field up by key instead of relying on it being the fifth
        # whitespace-separated token.
        for field in fields:
            key, sep, value = field.partition("=")
            if sep and key == "total":
                psi.append(value)
                break
        else:
            raise ValueError("no total= field found in " + path)
    return psi
143+
144+
145+
def supports_psi(pressure_dir="/proc/pressure"):
    """Report whether all three PSI pressure files can be opened.

    Each of the ``cpu``, ``memory`` and ``io`` files under *pressure_dir* is
    opened for reading and closed again. The outcome is logged at INFO level.

    Args:
        pressure_dir: directory containing the pressure files. Defaults to
            the kernel's ``/proc/pressure``.

    Returns:
        True when every file opens successfully, False as soon as one does not.
    """
    for resource in ["cpu", "memory", "io"]:
        try:
            with open(os.path.join(pressure_dir, resource)):
                pass
        except OSError:
            # A missing or unreadable file means PSI cannot be collected;
            # only I/O failures are treated as "unsupported" so unrelated
            # programming errors are not silently hidden.
            logging.info("PSI metrics not supported")
            return False
    logging.info("PSI metrics supported")
    return True
159+
160+
140161
def resource_path(relative_path):
141162
"""Get absolute path to resource, works for dev and for PyInstaller"""
142163
base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
@@ -229,17 +250,21 @@ def validate_file(fname):
229250
nmi_watchdog = perf_helpers.disable_nmi_watchdog()
230251
initial_pmus = perf_helpers.pmu_contention_detect()
231252
interval = 5000
253+
collect_psi = False
232254

233255
if args.thread:
234256
logging.info("Run mode: thread")
257+
collect_psi = supports_psi()
235258
elif args.socket:
236259
logging.info("Run mode: socket")
260+
collect_psi = supports_psi()
237261
elif args.pid is not None:
238262
logging.info("Run mode: pid")
239263
elif args.cid is not None:
240264
logging.info("Run mode: cid")
241265
else:
242266
logging.info("Run mode: system")
267+
collect_psi = supports_psi()
243268

244269
if args.muxinterval > 1000:
245270
crash("Input argument muxinterval is too large, max is [1s or 1000ms]")
@@ -279,6 +304,25 @@ def validate_file(fname):
279304
else:
280305
crash("Unknown application type")
281306

307+
# get perf events to collect
308+
sys_devs = perf_helpers.get_sys_devices()
309+
if (
310+
"uncore_cha" not in sys_devs
311+
and "uncore_cbox" not in sys_devs
312+
and "uncore_upi" not in sys_devs
313+
and "uncore_qpi" not in sys_devs
314+
and "uncore_imc" not in sys_devs
315+
):
316+
logging.info("disabling uncore (possibly in a vm?)")
317+
have_uncore = False
318+
if arch == "icelake":
319+
logging.warning(
320+
"Due to lack of vPMU support, TMA L1 events will not be collected"
321+
)
322+
if arch == "sapphirerapids" or arch == "emeraldrapids":
323+
logging.warning(
324+
"Due to lack of vPMU support, TMA L1 & L2 events will not be collected"
325+
)
282326
events, collection_events = prep_events.prepare_perf_events(
283327
eventfile,
284328
(
@@ -305,26 +349,6 @@ def validate_file(fname):
305349
if args.cid is not None:
306350
cgroups = perf_helpers.get_cgroups(args.cid)
307351

308-
# get perf events to collect
309-
sys_devs = perf_helpers.get_sys_devices()
310-
if (
311-
"uncore_cha" not in sys_devs
312-
and "uncore_cbox" not in sys_devs
313-
and "uncore_upi" not in sys_devs
314-
and "uncore_qpi" not in sys_devs
315-
and "uncore_imc" not in sys_devs
316-
):
317-
logging.info("disabling uncore (possibly in a vm?)")
318-
have_uncore = False
319-
if arch == "icelake":
320-
logging.warning(
321-
"Due to lack of vPMU support, TMA L1 events will not be collected"
322-
)
323-
if arch == "sapphirerapids" or arch == "emeraldrapids":
324-
logging.warning(
325-
"Due to lack of vPMU support, TMA L1 & L2 events will not be collected"
326-
)
327-
328352
if args.thread or args.socket or args.pid is not None or args.cid is not None:
329353
logging.info("Not collecting uncore events in this run mode")
330354

@@ -367,20 +391,33 @@ def validate_file(fname):
367391
if args.verbose:
368392
logging.info(cmd)
369393
try:
394+
psi = []
370395
start = time.time()
371-
subprocess.call(perfargs) # nosec
396+
perf = subprocess.Popen(perfargs) # nosec
397+
while perf.poll() is None:
398+
if collect_psi:
399+
psi.append(get_psi())
400+
time.sleep(interval / 1000)
372401
end = time.time()
373402
if end - start < 7:
374403
logging.warning(
375404
"PerfSpect was run for a short duration, some events might be zero or blank because they never got scheduled"
376405
)
377-
logging.info("Collection complete! Calculating TSC frequency now")
406+
407+
except subprocess.SubprocessError as e:
408+
perf.kill() # type: ignore
409+
crash("Failed to start perf\n" + str(e))
378410
except KeyboardInterrupt:
379-
logging.info("Collection stopped! Caculating TSC frequency now")
380-
except Exception:
381-
crash("perf encountered errors")
411+
perf.kill() # type: ignore
412+
except Exception as e:
413+
perf.kill() # type: ignore
414+
crash(str(e) + "\nperf encountered errors")
415+
416+
logging.info("Collection complete!")
382417

383418
cpuid_info = perf_helpers.get_cpuid_info(procinfo)
419+
if collect_psi:
420+
psi.append(get_psi())
384421
write_metadata(
385422
args.outcsv,
386423
collection_events,
@@ -390,7 +427,7 @@ def validate_file(fname):
390427
args.muxinterval,
391428
args.thread,
392429
args.socket,
393-
False,
430+
list(map(list, zip(*psi))),
394431
)
395432

396433
os.chmod(args.outcsv, 0o666) # nosec

perf-postprocess.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,8 @@ def get_metadata_as_dict(meta_data_lines):
309309
meta_data["SOCKET_CORES"] = []
310310
cores = ((line.split("\n")[0]).split(",")[1]).split(";")[:-1]
311311
meta_data["SOCKET_CORES"].append(cores)
312+
elif line.startswith("PSI"):
313+
meta_data["PSI"] = json.loads(line.split("PSI,")[1])
312314
return meta_data
313315

314316

@@ -623,8 +625,14 @@ def write_html(time_series_df, perf_mode, out_file_path):
623625
["MEMORY", "metric_TMA_..Memory_Bound(%)"],
624626
["BADSPECULATION", "metric_TMA_Bad_Speculation(%)"],
625627
["RETIRING", "metric_TMA_Retiring(%)"],
628+
["PSI_CPU", "cpu stall (us)"],
629+
["PSI_MEM", "memory stall (us)"],
630+
["PSI_IO", "io stall (us)"],
626631
]:
627-
html = html.replace(number[0], str(avg.loc[number[1], 0]))
632+
try:
633+
html = html.replace(number[0], str(avg.loc[number[1], 0]))
634+
except Exception:
635+
html = html.replace(number[0], "0")
628636

629637
with open(
630638
os.path.splitext(out_file_path)[0] + ".html", "w", encoding="utf-8"
@@ -827,6 +835,23 @@ def generate_metrics(
827835
len(errors["MISSING EVENTS"]) > 0 or len(errors["ZERO DIVISION"]) > 0
828836
):
829837
crash("Failing due to postprocessing errors")
838+
839+
# add psi
840+
if len(meta_data["PSI"]) > 0 and perf_mode == Mode.System:
841+
psi_len = range(len(time_series_df.columns))
842+
time_series_df.loc["cpu stall (us)"] = [
843+
int(meta_data["PSI"][0][x + 1]) - int(meta_data["PSI"][0][x])
844+
for x in psi_len
845+
]
846+
time_series_df.loc["memory stall (us)"] = [
847+
int(meta_data["PSI"][1][x + 1]) - int(meta_data["PSI"][1][x])
848+
for x in psi_len
849+
]
850+
time_series_df.loc["io stall (us)"] = [
851+
int(meta_data["PSI"][2][x + 1]) - int(meta_data["PSI"][2][x])
852+
for x in psi_len
853+
]
854+
830855
generate_metrics_time_series(time_series_df, perf_mode, out_file_path)
831856
generate_metrics_averages(time_series_df, perf_mode, out_file_path)
832857
if perf_mode == Mode.System:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
black
22
flake8
3+
pytype
34
simpleeval
45
pandas
56
plotly

src/base.html

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@
276276
variant="scrollable"
277277
>
278278
<Tab label="TMA" />
279+
<Tab label="PSI" />
279280
<Tab label="CPU" />
280281
<Tab label="Memory" />
281282
<Tab label="Power" />
@@ -380,6 +381,84 @@
380381
<TabPanel
381382
value={systemTabs}
382383
index={1}
384+
>
385+
<Grid container>
386+
<Grid item xs={5}>
387+
<Typography variant="h2">
388+
Pressure Stall Information (PSI)
389+
</Typography>
390+
<Typography variant="body1">
391+
Your workload spent an average of {(PSI_MEM + PSI_CPU + PSI_IO) / 50000}% of time stalled waiting on a hardware resource.
392+
</Typography>
393+
<ul>
394+
<li>
395+
<Typography variant="body1">
396+
Stall is the amount of time (in microseconds) that any task was waiting on a given resource.
397+
</Typography>
398+
</li>
399+
<li>
400+
<Typography variant="body1">
401+
When CPU, memory or IO devices are contended, workloads experience latency spikes, throughput losses, and run the risk of OOM kills.
402+
</Typography>
403+
</li>
404+
<li>
405+
<Typography variant="body1">
406+
Low or zero PSI means you are paying for capacity you don't use. You might be able to downsize your server.
407+
</Typography>
408+
</li>
409+
<li>
410+
<Typography variant="body1">
411+
High PSI means you should optimize your code or upgrade your server in whichever category is the largest.
412+
</Typography>
413+
</li>
414+
<li>
415+
<a href="https://docs.kernel.org/accounting/psi.html">
416+
Learn more here
417+
</a>
418+
</li>
419+
</ul>
420+
</Grid>
421+
<Grid item xs={7}>
422+
<ReactECharts style={{ minHeight: "400px" }} option={{
423+
tooltip: {},
424+
series: {
425+
nodeClick: false,
426+
type: "sunburst",
427+
radius: [60, "90%"],
428+
itemStyle: {
429+
borderRadius: 7,
430+
borderWidth: 2,
431+
},
432+
data: [
433+
{
434+
name: "CPU stall",
435+
value: PSI_CPU,
436+
},
437+
{
438+
name: "Memory stall",
439+
value: PSI_MEM,
440+
},
441+
{
442+
name: "IO stall",
443+
value: PSI_IO,
444+
},
445+
{
446+
name: "Not stalled",
447+
value: 5000000 - (PSI_MEM + PSI_CPU + PSI_IO),
448+
},
449+
],
450+
radius: [20, "100%"],
451+
label: {
452+
rotate: "radial",
453+
},
454+
},
455+
}} />
456+
</Grid>
457+
</Grid>
458+
</TabPanel>
459+
<TabPanel
460+
value={systemTabs}
461+
index={2}
383462
>
384463
<Grid container>
385464
<Grid item xs={5}>
@@ -423,7 +502,7 @@
423502
</TabPanel>
424503
<TabPanel
425504
value={systemTabs}
426-
index={2}
505+
index={3}
427506
>
428507
<Grid container>
429508
<Grid item xs={5}>
@@ -467,7 +546,7 @@
467546
</TabPanel>
468547
<TabPanel
469548
value={systemTabs}
470-
index={3}
549+
index={4}
471550
>
472551
<Grid container>
473552
<Grid item xs={5}>

0 commit comments

Comments
 (0)