From 1bcfb49ed0aeac5fedad080e0c62f627002e8f78 Mon Sep 17 00:00:00 2001 From: Michael Tremeer Date: Thu, 21 Dec 2023 18:26:26 +1000 Subject: [PATCH 1/9] Add tpr stats output, and periodic warning when gen_tpr < 90% of max_tokens --- README.md | 3 +++ benchmark/loadcmd.py | 28 ++++++++++++++-------------- benchmark/statsaggregator.py | 31 +++++++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index a03d5669..90447348 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,9 @@ The tool supports four different shape profiles via command line option `--shape |`ttft_95th`|95th percentile of time in seconds from the beginning of the request until the first token was received.|yes|`0.130`| |`tbt_avg`|Average time in seconds between two consequitive generated tokens.|yes|`0.018`| |`tbt_95th`|95th percentail of time in seconds between two consequitive generated tokens.|yes|`0.021`| +|`gen_tpr_10th`|90th percentile of tokens per response.|yes|`389`| +|`gen_tpr_avg`|Average tokens per response.|yes|`509`| +|`gen_tpr_90th`|90th percentile of tokens per response.|yes|`626`| |`e2e_avg`|Average end to end request time.|yes|`1.2`| |`e2e_95th`|95th percentile of end to end request time.|yes|`1.5`| |`util_avg`|Average deployment utilization percentage as reported by the service.|yes|`89.3%`| diff --git a/benchmark/loadcmd.py b/benchmark/loadcmd.py index 8decdfce..bc96118a 100644 --- a/benchmark/loadcmd.py +++ b/benchmark/loadcmd.py @@ -23,11 +23,11 @@ class _RequestBuilder: Wrapper iterator class to build request payloads. """ def __init__(self, model:str, context_tokens:int, - max_tokens:None, - completions:None, - frequency_penalty:None, - presence_penalty:None, - temperature:None, + max_tokens:None, + completions:None, + frequency_penalty:None, + presence_penalty:None, + temperature:None, top_p:None): self.model = model self.context_tokens = context_tokens @@ -101,7 +101,7 @@ def load(args): logging.info("starting load...") _run_load(request_builder, - max_concurrency=args.clients, + max_concurrency=args.clients, api_key=api_key, url=url, rate_limiter=rate_limiter, @@ -112,19 +112,20 @@ def load(args): json_output=args.output_format=="jsonl") def _run_load(request_builder: Iterable[dict], - max_concurrency: int, + max_concurrency: int, api_key: str, url: str, - rate_limiter=None, + rate_limiter=None, backoff=False, - duration=None, + duration=None, aggregation_duration=60, request_count=None, json_output=False): aggregator = _StatsAggregator( window_duration=aggregation_duration, - dump_duration=1, + dump_duration=1, clients=max_concurrency, + expected_gen_tokens=request_builder.max_tokens, json_output=json_output) requester = OAIRequester(api_key, url, backoff=backoff) @@ -141,13 +142,13 @@ async def request_func(session:aiohttp.ClientSession): print(e) executer = AsyncHTTPExecuter( - request_func, - rate_limiter=rate_limiter, + request_func, + rate_limiter=rate_limiter, max_concurrency=max_concurrency) aggregator.start() executer.run( - call_count=request_count, + call_count=request_count, duration=duration) aggregator.stop() @@ -219,4 +220,3 @@ def _validate(args): raise ValueError("presence-penalty must be between -2.0 and 2.0") if args.temperature is not None and (args.temperature < 0 or args.temperature > 2): raise ValueError("temperature must be between 0 and 2.0") - \ No newline at end of file diff --git a/benchmark/statsaggregator.py b/benchmark/statsaggregator.py index a5804333..d6fb8388 100644 --- a/benchmark/statsaggregator.py +++ 
b/benchmark/statsaggregator.py @@ -6,6 +6,7 @@ import logging import threading import time +from typing import Optional import numpy as np @@ -29,7 +30,7 @@ def _values(self) -> [float]: for entry in self.samples: values.append(entry[1]) return values - + def _len(self) -> int: return len(self.samples) @@ -56,17 +57,19 @@ class _StatsAggregator(threading.Thread): generated_tokens = _Samples() utilizations = _Samples() - def __init__(self, clients:int, dump_duration:float=5, window_duration:float=60, json_output=False, *args,**kwargs): + def __init__(self, clients:int, dump_duration:float=5, window_duration:float=60, expected_gen_tokens: Optional[int] = None, json_output=False, *args,**kwargs): """ :param clients: number of clients used in testing :param dump_duration: duration in seconds to dump current aggregates. :param window_duration: duration of sliding window in second to consider for aggregation. + :param expected_gen_tokens: number of tokens expected in each response. :param json_output: whether to dump periodic stats as json or human readable. """ self.clients = clients self.dump_duration = dump_duration self.json_output = json_output self.window_duration = window_duration + self.expected_gen_tokens = expected_gen_tokens super(_StatsAggregator, self).__init__(*args, **kwargs) @@ -113,7 +116,7 @@ def aggregate_request(self, stats: RequestStats): f"request completed in {round(request_latency, 2)} seconds, while aggregation-window is {round(self.window_duration, 2)} " "seconds, consider increasing aggregation-window to at least 2x your typical request latency." ) - ) + ) self.request_timestamps._append(stats.request_start_time, stats.request_start_time) self.response_latencies._append(stats.request_start_time, stats.response_time - stats.request_start_time) self.first_token_latencies._append(stats.request_start_time, stats.first_token_time - stats.request_start_time) @@ -138,6 +141,8 @@ def _dump(self): tokens_per_minute += context_per_minute if gen_per_minute != "n/a": tokens_per_minute += gen_per_minute + gen_tpr_avg = int(np.sum(self.generated_tokens._values()) / self.generated_tokens._len()) if self.generated_tokens._len() > 0 else "n/a" + gen_tpr_95th = int(np.percentile(self.generated_tokens._values(), 95)) if self.generated_tokens._len() > 1 else "n/a" ttft_avg = round(np.average(self.first_token_latencies._values()), 3) if self.first_token_latencies._len() > 0 else "n/a" ttft_95th = round(np.percentile(self.first_token_latencies._values(), 95), 3) if self.first_token_latencies._len() > 1 else "n/a" tbt_avg = round(np.average(self.token_latencies._values()), 3) if self.token_latencies._len() > 0 else "n/a" @@ -145,6 +150,20 @@ def _dump(self): util_avg = f"{round(np.average(self.utilizations._values()), 1)}%" if self.utilizations._len() > 0 else "n/a" util_95th = f"{round(np.percentile(self.utilizations._values(), 95), 1)}%" if self.utilizations._len() > 1 else "n/a" rpm = round(60.0 * self.request_timestamps._len() / dynamic_window, 1) if self.request_timestamps._len() > 0 else "n/a" + # Periodically warn if generated TPR is consistently lower than requested, which can result in higher scores for RPM compared to reality + warning_period_secs = 10 + if all(( + run_seconds % warning_period_secs == 0, + self.expected_gen_tokens is not None, + isinstance(gen_tpr_avg, int) + )) and gen_tpr_avg < 0.9 * self.expected_gen_tokens: + logging.warning( + ( + f"Average tokens per response is {gen_tpr_avg}, compared to requested max_tokens of {self.expected_gen_tokens}." 
+ " This may mean measured RPM and e2e request latency are higher here than in real-world workloads" + " (tpm, ttft & tbt stats will still be accurate)." + ) + ) # Handle the 1x extra processing_request due to next request being queued processing_requests_count = min(self.clients, self.processing_requests_count) if self.json_output: @@ -174,6 +193,10 @@ def _dump(self): "avg": tbt_avg, "95th": tbt_95th, }, + "gen_tpr": { + "avg": gen_tpr_avg, + "90th": gen_tpr_95th, + }, "util": { "avg": util_avg, "95th": util_95th, @@ -181,7 +204,7 @@ def _dump(self): } print(json.dumps(j), flush=True) else: - print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) + print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} gen_tpr_avg {gen_tpr_avg:<4} gen_tpr_95th {gen_tpr_95th:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) def _slide_window(self): with self.lock: From 0dea9476eb85eec78117408b9fde2034bdcb8282 Mon Sep 17 00:00:00 2001 From: Michael Tremeer Date: Thu, 21 Dec 2023 18:48:21 +1000 Subject: [PATCH 2/9] Add note to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 90447348..7db541dd 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,8 @@ The tool supports four different shape profiles via command line option `--shape |`generation`|Represents workloads with larger generation and smaller contexts. For example, question answering.|500|1000| |`custom`|Allows specifying custom values for context size (`--context-tokens`) and max generation tokens (`--max-tokens`).||| +Note: With the default prompting strategy, OpenAI models will typically return completions of a max of 700-1200 tokens. If setting `max_tokens` above 750, be aware that the results for `rpm` may be higher, and `e2e` latency lower, than if the model was returning completions of size `max_tokens` in every response. Refer to the `gen_tpr` stats at the end of each run to see how many tokens were generated across responses. 
+ ### Output fields |field|description|sliding window|example| From 793b393ef01caefe1e23a98b1c488a71d4420cbf Mon Sep 17 00:00:00 2001 From: Michael Tremeer Date: Thu, 21 Dec 2023 18:50:06 +1000 Subject: [PATCH 3/9] README clarity --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7db541dd..a6f5f634 100644 --- a/README.md +++ b/README.md @@ -141,9 +141,9 @@ Note: With the default prompting strategy, OpenAI models will typically return c |`ttft_95th`|95th percentile of time in seconds from the beginning of the request until the first token was received.|yes|`0.130`| |`tbt_avg`|Average time in seconds between two consequitive generated tokens.|yes|`0.018`| |`tbt_95th`|95th percentail of time in seconds between two consequitive generated tokens.|yes|`0.021`| -|`gen_tpr_10th`|90th percentile of tokens per response.|yes|`389`| -|`gen_tpr_avg`|Average tokens per response.|yes|`509`| -|`gen_tpr_90th`|90th percentile of tokens per response.|yes|`626`| +|`gen_tpr_10th`|10th percentile of number of generated tokens per model response.|yes|`389`| +|`gen_tpr_avg`|Average number of generated tokens per model response.|yes|`509`| +|`gen_tpr_90th`|90th percentile of number of generated tokens per model response.|yes|`626`| |`e2e_avg`|Average end to end request time.|yes|`1.2`| |`e2e_95th`|95th percentile of end to end request time.|yes|`1.5`| |`util_avg`|Average deployment utilization percentage as reported by the service.|yes|`89.3%`| From efe63777b95d526af47938cc9aaaa9be2bca7665 Mon Sep 17 00:00:00 2001 From: Michael Tremeer Date: Thu, 21 Dec 2023 19:30:37 +1000 Subject: [PATCH 4/9] Switch warning to lower case --- benchmark/statsaggregator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/statsaggregator.py b/benchmark/statsaggregator.py index d6fb8388..cab66e49 100644 --- a/benchmark/statsaggregator.py +++ b/benchmark/statsaggregator.py @@ -159,8 +159,8 @@ def _dump(self): )) and gen_tpr_avg < 0.9 * self.expected_gen_tokens: logging.warning( ( - f"Average tokens per response is {gen_tpr_avg}, compared to requested max_tokens of {self.expected_gen_tokens}." - " This may mean measured RPM and e2e request latency are higher here than in real-world workloads" + f"average tokens per response is {gen_tpr_avg}, compared to requested max_tokens of {self.expected_gen_tokens}." + " this may mean measured rpm and e2e request latency are higher here than in real-world workloads" " (tpm, ttft & tbt stats will still be accurate)." 
) ) From 9c9fa1f06e2324d901cfb558f990609db3e7e91a Mon Sep 17 00:00:00 2001 From: Michael Tremeer Date: Tue, 9 Jan 2024 10:14:54 +1000 Subject: [PATCH 5/9] Add context_tpr_avg, clarify warning text --- benchmark/statsaggregator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/benchmark/statsaggregator.py b/benchmark/statsaggregator.py index cab66e49..e8a4326b 100644 --- a/benchmark/statsaggregator.py +++ b/benchmark/statsaggregator.py @@ -141,6 +141,7 @@ def _dump(self): tokens_per_minute += context_per_minute if gen_per_minute != "n/a": tokens_per_minute += gen_per_minute + context_tpr_avg = int(np.sum(self.context_tokens._values()) / self.context_tokens._len()) if self.context_tokens._len() > 0 else "n/a" gen_tpr_avg = int(np.sum(self.generated_tokens._values()) / self.generated_tokens._len()) if self.generated_tokens._len() > 0 else "n/a" gen_tpr_95th = int(np.percentile(self.generated_tokens._values(), 95)) if self.generated_tokens._len() > 1 else "n/a" ttft_avg = round(np.average(self.first_token_latencies._values()), 3) if self.first_token_latencies._len() > 0 else "n/a" @@ -160,7 +161,7 @@ def _dump(self): logging.warning( ( f"average tokens per response is {gen_tpr_avg}, compared to requested max_tokens of {self.expected_gen_tokens}." - " this may mean measured rpm and e2e request latency are higher here than in real-world workloads" + " this may mean measured rpm is higher and e2e request latency is faster than in real-world workloads" " (tpm, ttft & tbt stats will still be accurate)." ) ) @@ -193,6 +194,9 @@ def _dump(self): "avg": tbt_avg, "95th": tbt_95th, }, + "context_tpr": { + "avg": context_tpr_avg, + }, "gen_tpr": { "avg": gen_tpr_avg, "90th": gen_tpr_95th, @@ -204,7 +208,7 @@ def _dump(self): } print(json.dumps(j), flush=True) else: - print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} gen_tpr_avg {gen_tpr_avg:<4} gen_tpr_95th {gen_tpr_95th:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) + print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} context_tpr_avg {context_tpr_avg:<4} gen_tpr_avg {gen_tpr_avg:<4} gen_tpr_95th {gen_tpr_95th:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) def _slide_window(self): with self.lock: From 25aaf2687e20d6abc80029b273d1f2a5eab89e76 Mon Sep 17 00:00:00 2001 From: Youssef Shahin Date: Wed, 21 Feb 2024 15:26:24 -0800 Subject: [PATCH 6/9] output context size used per request --- README.md | 23 +++++++++++------------ benchmark/statsaggregator.py | 4 +--- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a6f5f634..d04991c3 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ > :warning: **Code in this repo is written for testing purposes and should not be used in production** -The Azure OpenAI Benchmarking 
tool is designed to aid customers in benchmarking their provisioned-throughput deployments. Provisioned throughput deployments provide a set amount of model compute. But determining the exact performance for you application is dependent on several variables such as: prompt size, generation size and call rate. +The Azure OpenAI Benchmarking tool is designed to aid customers in benchmarking their provisioned-throughput deployments. Provisioned throughput deployments provide a set amount of model compute. But determining the exact performance for you application is dependent on several variables such as: prompt size, generation size and call rate. -The benchmarking tool provides a simple way to run test traffic on your deploymnet and validate the throughput for your traffic workloads. The script will output key performance statistics including the average and 95th percentile latencies and utilization of the deployment. +The benchmarking tool provides a simple way to run test traffic on your deploymnet and validate the throughput for your traffic workloads. The script will output key performance statistics including the average and 95th percentile latencies and utilization of the deployment. You can use this tool to experiment with total throughput at 100% utilization across different traffic patterns for a ```Provisioned-Managed``` deployment type. These tests allow you to better optimize your solution design by adjusting the prompt size, generation size and PTUs deployed @@ -57,9 +57,9 @@ The table below provides an example prompt & generation size we have seen with s Or see the [pre-configured shape-profiles below](#shape-profiles). -### Run samples +### Run samples -During a run, statistics are output every second to `stdout` while logs are output to `stderr`. Some metrics may not show up immediately due to lack of data. +During a run, statistics are output every second to `stdout` while logs are output to `stderr`. Some metrics may not show up immediately due to lack of data. **Run load test at 60 RPM with exponential retry back-off** @@ -73,9 +73,9 @@ $ python -m benchmark.bench load \ 2023-10-19 18:21:06 INFO using shape profile balanced: context tokens: 500, max tokens: 500 2023-10-19 18:21:06 INFO warming up prompt cache 2023-10-19 18:21:06 INFO starting load... 
-2023-10-19 18:21:06 rpm: 1.0 requests: 1 failures: 0 throttled: 0 ctx tpm: 501.0 gen tpm: 103.0 ttft avg: 0.736 ttft 95th: n/a tbt avg: 0.088 tbt 95th: n/a e2e avg: 1.845 e2e 95th: n/a util avg: 0.0% util 95th: n/a -2023-10-19 18:21:07 rpm: 5.0 requests: 5 failures: 0 throttled: 0 ctx tpm: 2505.0 gen tpm: 515.0 ttft avg: 0.937 ttft 95th: 1.321 tbt avg: 0.042 tbt 95th: 0.043 e2e avg: 1.223 e2e 95th: 1.658 util avg: 0.8% util 95th: 1.6% -2023-10-19 18:21:08 rpm: 8.0 requests: 8 failures: 0 throttled: 0 ctx tpm: 4008.0 gen tpm: 824.0 ttft avg: 0.913 ttft 95th: 1.304 tbt avg: 0.042 tbt 95th: 0.043 e2e avg: 1.241 e2e 95th: 1.663 util avg: 1.3% util 95th: 2.6% +2023-10-19 18:21:06 rpm: 1.0 requests: 1 failures: 0 throttled: 0 ctx tpm: 501.0 gen tpm: 103.0 ttft avg: 0.736 ttft 95th: n/a tbt avg: 0.088 tbt 95th: n/a e2e avg: 1.845 e2e 95th: n/a util avg: 0.0% util 95th: n/a +2023-10-19 18:21:07 rpm: 5.0 requests: 5 failures: 0 throttled: 0 ctx tpm: 2505.0 gen tpm: 515.0 ttft avg: 0.937 ttft 95th: 1.321 tbt avg: 0.042 tbt 95th: 0.043 e2e avg: 1.223 e2e 95th: 1.658 util avg: 0.8% util 95th: 1.6% +2023-10-19 18:21:08 rpm: 8.0 requests: 8 failures: 0 throttled: 0 ctx tpm: 4008.0 gen tpm: 824.0 ttft avg: 0.913 ttft 95th: 1.304 tbt avg: 0.042 tbt 95th: 0.043 e2e avg: 1.241 e2e 95th: 1.663 util avg: 1.3% util 95th: 2.6% ``` **Load test with custom request shape** @@ -120,7 +120,7 @@ The tool supports four different shape profiles via command line option `--shape |`balanced`|[default] Balanced count of context and generation tokens. Should be representative of typical workloads.|500|500| |`context`|Represents workloads with larger context sizes compared to generation. For example, chat assistants.|2000|200| |`generation`|Represents workloads with larger generation and smaller contexts. For example, question answering.|500|1000| -|`custom`|Allows specifying custom values for context size (`--context-tokens`) and max generation tokens (`--max-tokens`).||| +|`custom`|Allows specifying custom values for context size (`--context-tokens`) and max generation tokens (`--max-tokens`).||| Note: With the default prompting strategy, OpenAI models will typically return completions of a max of 700-1200 tokens. If setting `max_tokens` above 750, be aware that the results for `rpm` may be higher, and `e2e` latency lower, than if the model was returning completions of size `max_tokens` in every response. Refer to the `gen_tpr` stats at the end of each run to see how many tokens were generated across responses. 
@@ -141,9 +141,8 @@ Note: With the default prompting strategy, OpenAI models will typically return c |`ttft_95th`|95th percentile of time in seconds from the beginning of the request until the first token was received.|yes|`0.130`| |`tbt_avg`|Average time in seconds between two consequitive generated tokens.|yes|`0.018`| |`tbt_95th`|95th percentail of time in seconds between two consequitive generated tokens.|yes|`0.021`| -|`gen_tpr_10th`|10th percentile of number of generated tokens per model response.|yes|`389`| +|`context_tpr_avg`|Average number of context tokens used in a model request.|yes|`509`| |`gen_tpr_avg`|Average number of generated tokens per model response.|yes|`509`| -|`gen_tpr_90th`|90th percentile of number of generated tokens per model response.|yes|`626`| |`e2e_avg`|Average end to end request time.|yes|`1.2`| |`e2e_95th`|95th percentile of end to end request time.|yes|`1.5`| |`util_avg`|Average deployment utilization percentage as reported by the service.|yes|`89.3%`| @@ -167,8 +166,8 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio ## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. 
diff --git a/benchmark/statsaggregator.py b/benchmark/statsaggregator.py index e8a4326b..b61c93f0 100644 --- a/benchmark/statsaggregator.py +++ b/benchmark/statsaggregator.py @@ -143,7 +143,6 @@ def _dump(self): tokens_per_minute += gen_per_minute context_tpr_avg = int(np.sum(self.context_tokens._values()) / self.context_tokens._len()) if self.context_tokens._len() > 0 else "n/a" gen_tpr_avg = int(np.sum(self.generated_tokens._values()) / self.generated_tokens._len()) if self.generated_tokens._len() > 0 else "n/a" - gen_tpr_95th = int(np.percentile(self.generated_tokens._values(), 95)) if self.generated_tokens._len() > 1 else "n/a" ttft_avg = round(np.average(self.first_token_latencies._values()), 3) if self.first_token_latencies._len() > 0 else "n/a" ttft_95th = round(np.percentile(self.first_token_latencies._values(), 95), 3) if self.first_token_latencies._len() > 1 else "n/a" tbt_avg = round(np.average(self.token_latencies._values()), 3) if self.token_latencies._len() > 0 else "n/a" @@ -199,7 +198,6 @@ def _dump(self): }, "gen_tpr": { "avg": gen_tpr_avg, - "90th": gen_tpr_95th, }, "util": { "avg": util_avg, @@ -208,7 +206,7 @@ def _dump(self): } print(json.dumps(j), flush=True) else: - print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} context_tpr_avg {context_tpr_avg:<4} gen_tpr_avg {gen_tpr_avg:<4} gen_tpr_95th {gen_tpr_95th:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) + print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} context_tpr_avg {context_tpr_avg:<4} gen_tpr_avg {gen_tpr_avg:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) def _slide_window(self): with self.lock: From ecce0ec2922b32312c24534abd5fc45b8cafc932 Mon Sep 17 00:00:00 2001 From: Youssef Shahin Date: Wed, 21 Feb 2024 16:40:50 -0800 Subject: [PATCH 7/9] include new requirement pip --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cc918417..e205c8d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ numpy backoff wonderwords asyncio -aiohttp \ No newline at end of file +aiohttp +typing From 1891c05d7731b2e59c17f79b3aaf3bad0704a7f0 Mon Sep 17 00:00:00 2001 From: Youssef Shahin Date: Thu, 29 Feb 2024 15:48:51 -0800 Subject: [PATCH 8/9] remove context token count since its specified as an input Rename gen_tpr_avg to avg_gen_tokens. 
It is implicit that its per request/response --- README.md | 5 ++--- benchmark/statsaggregator.py | 18 +++++++----------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index d04991c3..8d4cbc9b 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ The tool supports four different shape profiles via command line option `--shape |`generation`|Represents workloads with larger generation and smaller contexts. For example, question answering.|500|1000| |`custom`|Allows specifying custom values for context size (`--context-tokens`) and max generation tokens (`--max-tokens`).||| -Note: With the default prompting strategy, OpenAI models will typically return completions of a max of 700-1200 tokens. If setting `max_tokens` above 750, be aware that the results for `rpm` may be higher, and `e2e` latency lower, than if the model was returning completions of size `max_tokens` in every response. Refer to the `gen_tpr` stats at the end of each run to see how many tokens were generated across responses. +Note: With the default prompting strategy, OpenAI models will typically return completions of a max of 700-1200 tokens. If setting `max_tokens` above 750, be aware that the results for `rpm` may be higher, and `e2e` latency lower, than if the model was returning completions of size `max_tokens` in every response. Refer to the `gen_tokens` stats at the end of each run to see how many tokens were generated across responses. ### Output fields @@ -141,8 +141,7 @@ Note: With the default prompting strategy, OpenAI models will typically return c |`ttft_95th`|95th percentile of time in seconds from the beginning of the request until the first token was received.|yes|`0.130`| |`tbt_avg`|Average time in seconds between two consequitive generated tokens.|yes|`0.018`| |`tbt_95th`|95th percentail of time in seconds between two consequitive generated tokens.|yes|`0.021`| -|`context_tpr_avg`|Average number of context tokens used in a model request.|yes|`509`| -|`gen_tpr_avg`|Average number of generated tokens per model response.|yes|`509`| +|`avg_gen_tokens`|Average number of generated tokens per model response.|yes|`509`| |`e2e_avg`|Average end to end request time.|yes|`1.2`| |`e2e_95th`|95th percentile of end to end request time.|yes|`1.5`| |`util_avg`|Average deployment utilization percentage as reported by the service.|yes|`89.3%`| diff --git a/benchmark/statsaggregator.py b/benchmark/statsaggregator.py index b61c93f0..33952729 100644 --- a/benchmark/statsaggregator.py +++ b/benchmark/statsaggregator.py @@ -141,8 +141,7 @@ def _dump(self): tokens_per_minute += context_per_minute if gen_per_minute != "n/a": tokens_per_minute += gen_per_minute - context_tpr_avg = int(np.sum(self.context_tokens._values()) / self.context_tokens._len()) if self.context_tokens._len() > 0 else "n/a" - gen_tpr_avg = int(np.sum(self.generated_tokens._values()) / self.generated_tokens._len()) if self.generated_tokens._len() > 0 else "n/a" + avg_gen_tokens = int(np.sum(self.generated_tokens._values()) / self.generated_tokens._len()) if self.generated_tokens._len() > 0 else "n/a" ttft_avg = round(np.average(self.first_token_latencies._values()), 3) if self.first_token_latencies._len() > 0 else "n/a" ttft_95th = round(np.percentile(self.first_token_latencies._values(), 95), 3) if self.first_token_latencies._len() > 1 else "n/a" tbt_avg = round(np.average(self.token_latencies._values()), 3) if self.token_latencies._len() > 0 else "n/a" @@ -155,11 +154,11 @@ def _dump(self): if all(( run_seconds % 
warning_period_secs == 0, self.expected_gen_tokens is not None, - isinstance(gen_tpr_avg, int) - )) and gen_tpr_avg < 0.9 * self.expected_gen_tokens: + isinstance(avg_gen_tokens, int) + )) and avg_gen_tokens < 0.9 * self.expected_gen_tokens: logging.warning( ( - f"average tokens per response is {gen_tpr_avg}, compared to requested max_tokens of {self.expected_gen_tokens}." + f"average tokens per response is {avg_gen_tokens}, compared to requested max_tokens of {self.expected_gen_tokens}." " this may mean measured rpm is higher and e2e request latency is faster than in real-world workloads" " (tpm, ttft & tbt stats will still be accurate)." ) @@ -193,11 +192,8 @@ def _dump(self): "avg": tbt_avg, "95th": tbt_95th, }, - "context_tpr": { - "avg": context_tpr_avg, - }, - "gen_tpr": { - "avg": gen_tpr_avg, + "gen_tokens": { + "avg": avg_gen_tokens, }, "util": { "avg": util_avg, @@ -206,7 +202,7 @@ def _dump(self): } print(json.dumps(j), flush=True) else: - print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} context_tpr_avg {context_tpr_avg:<4} gen_tpr_avg {gen_tpr_avg:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) + print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} avg_gen_tokens {avg_gen_tokens:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) def _slide_window(self): with self.lock: From 5c6f3c1b61e9d174b8f26eb9953a64f427b4a581 Mon Sep 17 00:00:00 2001 From: Youssef Shahin Date: Thu, 29 Feb 2024 16:04:07 -0800 Subject: [PATCH 9/9] change location in output and add missing : --- benchmark/statsaggregator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/statsaggregator.py b/benchmark/statsaggregator.py index 33952729..61b5d57c 100644 --- a/benchmark/statsaggregator.py +++ b/benchmark/statsaggregator.py @@ -180,6 +180,9 @@ def _dump(self): "gen": gen_per_minute, "total": tokens_per_minute, }, + "gen_tokens": { + "avg": avg_gen_tokens, + }, "e2e": { "avg": e2e_latency_avg, "95th": e2e_latency_95th, @@ -192,9 +195,6 @@ def _dump(self): "avg": tbt_avg, "95th": tbt_95th, }, - "gen_tokens": { - "avg": avg_gen_tokens, - }, "util": { "avg": util_avg, "95th": util_95th, @@ -202,7 +202,7 @@ def _dump(self): } print(json.dumps(j), flush=True) else: - print(f"{timestamp} rpm: {rpm:<5} processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} avg_gen_tokens {avg_gen_tokens:<4} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) + print(f"{timestamp} rpm: {rpm:<5} 
processing: {processing_requests_count:<4} completed: {self.total_requests_count:<5} failures: {self.total_failed_count:<4} throttled: {self.throttled_count:<4} requests: {self.total_requests_count:<5} tpm: {tokens_per_minute:<6} avg_gen_tokens: {avg_gen_tokens:<4} ttft_avg: {ttft_avg:<6} ttft_95th: {ttft_95th:<6} tbt_avg: {tbt_avg:<6} tbt_95th: {tbt_95th:<6} e2e_avg: {e2e_latency_avg:<6} e2e_95th: {e2e_latency_95th:<6} util_avg: {util_avg:<6} util_95th: {util_95th:<6}", flush=True) def _slide_window(self): with self.lock:
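
A minimal, self-contained sketch of the behaviour this series converges on after [PATCH 8/9]: `_StatsAggregator._dump()` periodically compares the sliding-window `avg_gen_tokens` against the `max_tokens` value forwarded from loadcmd as `expected_gen_tokens`, and logs a warning when responses under-generate. The helper name `maybe_warn_low_gen_tokens` below is hypothetical and the function is not part of the repository (in the patches the check is inline in `_dump()`); the 90% threshold, 10-second period and message text mirror the diffs above.

```python
# Hypothetical standalone restatement of the periodic under-generation check;
# in the patches this logic lives inline in _StatsAggregator._dump().
import logging
from typing import Optional, Union

WARNING_PERIOD_SECS = 10  # matches warning_period_secs in _dump()

def maybe_warn_low_gen_tokens(
    run_seconds: int,
    avg_gen_tokens: Union[int, str],     # "n/a" until at least one response is aggregated
    expected_gen_tokens: Optional[int],  # max_tokens, passed in as expected_gen_tokens
) -> None:
    """Warn when responses average fewer than 90% of the requested max_tokens."""
    if (
        run_seconds % WARNING_PERIOD_SECS == 0
        and expected_gen_tokens is not None
        and isinstance(avg_gen_tokens, int)
        and avg_gen_tokens < 0.9 * expected_gen_tokens
    ):
        logging.warning(
            f"average tokens per response is {avg_gen_tokens}, compared to requested "
            f"max_tokens of {expected_gen_tokens}. this may mean measured rpm is higher "
            "and e2e request latency is faster than in real-world workloads "
            "(tpm, ttft & tbt stats will still be accurate)."
        )
```

When the warning fires, the reported `rpm` and `e2e` figures flatter the deployment relative to a workload that really generates `max_tokens` tokens per response, which is why the README note added in these patches points users at the `gen_tokens` stats at the end of a run.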