diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index 4044b61209..753cb05abe 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -2,6 +2,7 @@ import pytest import shutil import logging +from pathlib import Path from utils.constants import get_ov_cache_models_dir # Configure logging @@ -13,15 +14,15 @@ def setup_and_teardown(): """Fixture to set up and tear down the temporary directories.""" - ov_cache_models_dir = get_ov_cache_models_dir() + ov_cache_models_dir = Path(get_ov_cache_models_dir()) logger.info(f"Creating directory: {ov_cache_models_dir}") - os.makedirs(ov_cache_models_dir, exist_ok=True) + ov_cache_models_dir.mkdir(exist_ok=True, parents=True) yield if os.environ.get("CLEANUP_CACHE", "false").lower() != "false": - if os.path.exists(ov_cache_models_dir): + if ov_cache_models_dir.exists(): logger.info(f"Removing temporary directory: {ov_cache_models_dir}") shutil.rmtree(ov_cache_models_dir) else: diff --git a/tests/python_tests/data/test_dataset.py b/tests/python_tests/data/test_dataset.py index 59a50a3b2c..283a61fd4f 100644 --- a/tests/python_tests/data/test_dataset.py +++ b/tests/python_tests/data/test_dataset.py @@ -2,19 +2,21 @@ # SPDX-License-Identifier: Apache-2.0 from openvino_genai import GenerationConfig -from utils.generation_config import get_greedy, get_beam_search, get_multinomial_temperature +from utils.generation_config import get_greedy, get_beam_search + +PROMPTS = [ + "What is OpenVINO?", + "How are you?", + "What is your name?", + "Tell me something about Canada" +] + +GENERATION_CONFIGS = [ + get_greedy(), + get_beam_search(), + get_greedy(), + get_beam_search(), +] def get_test_dataset() -> tuple[list[str], list[GenerationConfig]]: - prompts = [ - "What is OpenVINO?", - "How are you?", - "What is your name?", - "Tell me something about Canada" - ] - generation_configs = [ - get_greedy(), - get_beam_search(), - get_greedy(), - get_beam_search(), - ] - return (prompts, generation_configs) + return PROMPTS, GENERATION_CONFIGS diff --git a/tests/python_tests/samples/conftest.py b/tests/python_tests/samples/conftest.py index 454d7a1cb7..8d9ee02b01 100644 --- a/tests/python_tests/samples/conftest.py +++ b/tests/python_tests/samples/conftest.py @@ -162,10 +162,20 @@ "cmu_us_awb_arctic-wav-arctic_a0001.bin": "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_a0001.bin" } -SAMPLES_PY_DIR = Path(os.environ.get("SAMPLES_PY_DIR", os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../samples/python")))) -SAMPLES_CPP_DIR = Path(os.environ.get("SAMPLES_CPP_DIR", os.getcwd())) -SAMPLES_C_DIR = os.environ.get("SAMPLES_C_DIR", os.getcwd()) -SAMPLES_JS_DIR = os.environ.get("SAMPLES_JS_DIR", os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../samples/js"))) +SAMPLES_PY_DIR = Path( + os.environ.get( + "SAMPLES_PY_DIR", + Path(__file__).parent.joinpath("../../../samples/python").resolve(), + ) +) +SAMPLES_CPP_DIR = Path(os.environ.get("SAMPLES_CPP_DIR", Path.cwd())) +SAMPLES_C_DIR = Path(os.environ.get("SAMPLES_C_DIR", Path.cwd())) +SAMPLES_JS_DIR = Path( + os.environ.get( + "SAMPLES_JS_DIR", + Path(__file__).parent.joinpath("../../../samples/js").resolve(), + ) +) @pytest.fixture(scope="session", autouse=True) def setup_and_teardown(request, tmp_path_factory): diff --git a/tests/python_tests/samples/test_beam_search_causal_lm.py b/tests/python_tests/samples/test_beam_search_causal_lm.py index a6294d3072..564b848289 100644 --- 
a/tests/python_tests/samples/test_beam_search_causal_lm.py +++ b/tests/python_tests/samples/test_beam_search_causal_lm.py @@ -24,17 +24,17 @@ def test_sample_beam_search_causal_lm(self, convert_model, sample_args): if sys.platform == 'darwin': pytest.xfail("Ticket 173586") # C++ test - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'beam_search_causal_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'beam_search_causal_lm').as_posix() cpp_command = [cpp_sample, convert_model, f'"{sample_args}"'] cpp_result = run_sample(cpp_command) # Python test - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/beam_search_causal_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/beam_search_causal_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, f'"{sample_args}"'] py_result = run_sample(py_command) # Test JS sample - js_sample = os.path.join(SAMPLES_JS_DIR, "text_generation/beam_search_causal_lm.js") + js_sample = (SAMPLES_JS_DIR / "text_generation/beam_search_causal_lm.js").as_posix() js_command =['node', js_sample, convert_model, f'"{sample_args}"'] js_result = run_sample(js_command) @@ -64,19 +64,19 @@ def test_sample_beam_search_causal_lm_refs(self, request, convert_model, sample_ if sys.platform == 'darwin': pytest.xfail("Ticket 173586") # C++ test - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'beam_search_causal_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'beam_search_causal_lm').as_posix() cpp_command = [cpp_sample, convert_model] + [f'"{arg}"' for arg in sample_args] cpp_result = run_sample(cpp_command) cpp_predictions = cpp_result.stdout # Python test - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/beam_search_causal_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/beam_search_causal_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model] + [f'"{arg}"' for arg in sample_args] py_result = run_sample(py_command) py_predictions = py_result.stdout # Test JS sample - js_sample = os.path.join(SAMPLES_JS_DIR, "text_generation/beam_search_causal_lm.js") + js_sample = (SAMPLES_JS_DIR / "text_generation/beam_search_causal_lm.js").as_posix() js_command =['node', js_sample, convert_model] + [f'"{arg}"' for arg in sample_args] js_result = run_sample(js_command) js_predictions = js_result.stdout diff --git a/tests/python_tests/samples/test_benchmark_genai.py b/tests/python_tests/samples/test_benchmark_genai.py index 7d40ccbc45..efb82ac407 100644 --- a/tests/python_tests/samples/test_benchmark_genai.py +++ b/tests/python_tests/samples/test_benchmark_genai.py @@ -8,6 +8,7 @@ from conftest import SAMPLES_PY_DIR, SAMPLES_CPP_DIR, SAMPLES_C_DIR from test_utils import run_sample + class TestBenchmarkGenAI: @pytest.mark.llm @pytest.mark.samples @@ -22,7 +23,7 @@ def test_py_sample_benchmark_genai(self, convert_model, prompt, sample_args): if sys.platform == 'darwin': pytest.xfail("Ticket 173586") # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/benchmark_genai.py") + py_script = (SAMPLES_PY_DIR / "text_generation/benchmark_genai.py").resolve() py_command = [sys.executable, py_script, '-m', convert_model, '-p', f'"{prompt}"'] + sample_args run_sample(py_command) @@ -36,8 +37,10 @@ def test_py_sample_benchmark_genai(self, convert_model, prompt, sample_args): indirect=["convert_model"], ) def test_cpp_sample_benchmark_genai(self, convert_model, prompt, sample_args): + if sys.platform == "darwin": + pytest.xfail("CPP sample exits with code 1 on macs.") # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 
'benchmark_genai') + cpp_sample = (SAMPLES_CPP_DIR / 'benchmark_genai').resolve() cpp_command =[cpp_sample, '-m', convert_model, '-p', f'"{prompt}"'] + sample_args run_sample(cpp_command) @@ -50,8 +53,8 @@ def test_cpp_sample_benchmark_genai(self, convert_model, prompt, sample_args): ], indirect=["convert_model"], ) - def test_cpp_sample_benchmark_genai(self, convert_model, prompt, sample_args): + def test_c_sample_benchmark_genai(self, convert_model, prompt, sample_args): # Test C sample - c_sample = os.path.join(SAMPLES_C_DIR, 'benchmark_genai_c') + c_sample = (SAMPLES_C_DIR / 'benchmark_genai_c').resolve() c_command =[c_sample, '-m', convert_model, '-p', f'"{prompt}"'] + sample_args run_sample(c_command) diff --git a/tests/python_tests/samples/test_benchmark_vlm.py b/tests/python_tests/samples/test_benchmark_vlm.py index 5f9b772301..ea32608dd5 100644 --- a/tests/python_tests/samples/test_benchmark_vlm.py +++ b/tests/python_tests/samples/test_benchmark_vlm.py @@ -21,11 +21,11 @@ class TestBenchmarkVLM: def test_sample_benchmark_vlm(self, convert_model, download_test_content): num_iter = "3" # Run C++ benchmark sample - benchmark_sample = os.path.join(SAMPLES_CPP_DIR, 'benchmark_vlm') + benchmark_sample = (SAMPLES_CPP_DIR / 'benchmark_vlm').as_posix() benchmark_cpp_command = [benchmark_sample, "-m" , convert_model, "-i", download_test_content, "-n", num_iter] run_sample(benchmark_cpp_command) # Run Python benchmark sample - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'visual_language_chat/benchmark_vlm.py') + benchmark_script = (SAMPLES_PY_DIR / 'visual_language_chat/benchmark_vlm.py').as_posix() benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model, "-i", download_test_content, "-n", num_iter] run_sample(benchmark_py_command) \ No newline at end of file diff --git a/tests/python_tests/samples/test_chat_sample.py b/tests/python_tests/samples/test_chat_sample.py index 6b9c8a9151..baf9f0f102 100644 --- a/tests/python_tests/samples/test_chat_sample.py +++ b/tests/python_tests/samples/test_chat_sample.py @@ -21,19 +21,19 @@ def test_chat_sample_refs(self, request, convert_model, prompts): if sys.platform == 'darwin': pytest.xfail("Ticket 173586") # C++ test - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'chat_sample') + cpp_sample = (SAMPLES_CPP_DIR / 'chat_sample').as_posix() cpp_command = [cpp_sample, convert_model] cpp_result = run_sample(cpp_command, '\n'.join(prompts)) cpp_predictions = cpp_result.stdout # Python test - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/chat_sample.py") + py_script = (SAMPLES_PY_DIR / "text_generation/chat_sample.py").as_posix() py_command = [sys.executable, py_script, convert_model] py_result = run_sample(py_command, '\n'.join(prompts)) py_predictions = py_result.stdout # C test - c_sample = os.path.join(SAMPLES_C_DIR, 'chat_sample_c') + c_sample = (SAMPLES_C_DIR / 'chat_sample_c').as_posix() c_command = [c_sample, convert_model] c_result = run_sample(c_command, '\n'.join(prompts)) c_predictions = c_result.stdout diff --git a/tests/python_tests/samples/test_continuous_batching_tools.py b/tests/python_tests/samples/test_continuous_batching_tools.py index 2c77021ffb..158feec1d4 100644 --- a/tests/python_tests/samples/test_continuous_batching_tools.py +++ b/tests/python_tests/samples/test_continuous_batching_tools.py @@ -18,7 +18,7 @@ class TestContinuousBatching: ) def test_cpp_tool_accuracy(self, convert_model, sample_args): # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'continuous_batching_accuracy') + 
cpp_sample = (SAMPLES_CPP_DIR / 'continuous_batching_accuracy').as_posix() cpp_command =[cpp_sample, '-m', convert_model] + sample_args run_sample(cpp_command) @@ -28,7 +28,7 @@ def test_cpp_tool_accuracy(self, convert_model, sample_args): @pytest.mark.parametrize("sample_args", [["-n", "10", "--cache_size", "1"], ["-n", "10", "--dynamic_split_fuse", "--max_batch_size", "256", "--max_input_len", "256", "--cache_size", "1"]]) def test_cpp_tool_benchmark(self, convert_model, download_test_content, sample_args): # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'continuous_batching_benchmark') + cpp_sample = (SAMPLES_CPP_DIR / 'continuous_batching_benchmark').as_posix() cpp_command =[cpp_sample, '-m', convert_model, '--dataset', download_test_content] + sample_args run_sample(cpp_command) diff --git a/tests/python_tests/samples/test_encrypted_model_causal_lm.py b/tests/python_tests/samples/test_encrypted_model_causal_lm.py index 4f35ef9649..35580bca12 100644 --- a/tests/python_tests/samples/test_encrypted_model_causal_lm.py +++ b/tests/python_tests/samples/test_encrypted_model_causal_lm.py @@ -16,12 +16,12 @@ class TestEncryptedLM: def test_sample_encrypted_lm(self, convert_model, prompt): # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'encrypted_model_causal_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'encrypted_model_causal_lm').as_posix() cpp_command =[cpp_sample, convert_model, prompt] cpp_result = run_sample(cpp_command) # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/encrypted_model_causal_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/encrypted_model_causal_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, prompt] py_result = run_sample(py_command) diff --git a/tests/python_tests/samples/test_encrypted_model_vlm.py b/tests/python_tests/samples/test_encrypted_model_vlm.py index 98112b0b27..9cfe538180 100644 --- a/tests/python_tests/samples/test_encrypted_model_vlm.py +++ b/tests/python_tests/samples/test_encrypted_model_vlm.py @@ -19,17 +19,17 @@ def test_sample_encrypted_lm(self, convert_model, download_test_content, generat env = os.environ.copy() env["OPENVINO_LOG_LEVEL"] = "0" # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'encrypted_model_vlm') + cpp_sample = (SAMPLES_CPP_DIR / 'encrypted_model_vlm').as_posix() cpp_command =[cpp_sample, convert_model, os.path.dirname(generate_test_content), sample_args] cpp_result = run_sample(cpp_command, env=env) # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/encrypted_model_vlm.py") + py_script = (SAMPLES_PY_DIR / "visual_language_chat/encrypted_model_vlm.py").as_posix() py_command = [sys.executable, py_script, convert_model, os.path.dirname(generate_test_content), sample_args] py_result = run_sample(py_command, env=env) # Test common sample - py_common_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/visual_language_chat.py") + py_common_script = (SAMPLES_PY_DIR / "visual_language_chat/visual_language_chat.py").as_posix() py_common_command = [sys.executable, py_common_script, convert_model, os.path.dirname(generate_test_content)] py_common_result = run_sample(py_common_command, sample_args, env) diff --git a/tests/python_tests/samples/test_greedy_causal_lm.py b/tests/python_tests/samples/test_greedy_causal_lm.py index a195efaac8..ec2ea4ab13 100644 --- a/tests/python_tests/samples/test_greedy_causal_lm.py +++ b/tests/python_tests/samples/test_greedy_causal_lm.py @@ -29,24 +29,24 @@ def 
test_sample_greedy_causal_lm(self, request, convert_model, sample_args): prompt = sample_args # C++ test - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'greedy_causal_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'greedy_causal_lm').as_posix() cpp_command = [cpp_sample, convert_model, prompt] cpp_result = run_sample(cpp_command) cpp_predictions = cpp_result.stdout # Python test - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/greedy_causal_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/greedy_causal_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, prompt] py_result = run_sample(py_command) py_predictions = py_result.stdout # Test C sample - c_sample = os.path.join(SAMPLES_C_DIR, "greedy_causal_lm_c") + c_sample = (SAMPLES_C_DIR / "greedy_causal_lm_c").as_posix() c_command =[c_sample, convert_model, sample_args] c_result = run_sample(c_command) # Test JS sample - js_sample = os.path.join(SAMPLES_JS_DIR, "text_generation/greedy_causal_lm.js") + js_sample = (SAMPLES_JS_DIR / "text_generation/greedy_causal_lm.js").as_posix() js_command =['node', js_sample, convert_model, sample_args] js_result = run_sample(js_command) diff --git a/tests/python_tests/samples/test_inpainting.py b/tests/python_tests/samples/test_inpainting.py index c4e0846d64..0007fc258c 100644 --- a/tests/python_tests/samples/test_inpainting.py +++ b/tests/python_tests/samples/test_inpainting.py @@ -29,11 +29,11 @@ class TestInpainting: ) def test_sample_inpainting(self, download_model, prompt, download_test_content, download_mask_image): # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "image_generation/inpainting.py") + py_script = (SAMPLES_PY_DIR / "image_generation/inpainting.py").as_posix() py_command = [sys.executable, py_script, download_model, "'" + prompt + "'", download_test_content, download_mask_image] run_sample(py_command) # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'inpainting') + cpp_sample = (SAMPLES_CPP_DIR / 'inpainting').as_posix() cpp_command = [cpp_sample, download_model, "'" + prompt + "'", download_test_content, download_mask_image] run_sample(cpp_command) \ No newline at end of file diff --git a/tests/python_tests/samples/test_lora.py b/tests/python_tests/samples/test_lora.py index f608b0dd34..b65e873e67 100644 --- a/tests/python_tests/samples/test_lora.py +++ b/tests/python_tests/samples/test_lora.py @@ -15,6 +15,6 @@ class TestLora: @pytest.mark.parametrize("sample_args", ["How to create a table with two columns, one of them has type float, another one has type int?"]) @pytest.mark.parametrize("download_test_content", ["adapter_model.safetensors"], indirect=True) def test_python_sample_lora(self, convert_model, download_test_content, sample_args): - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/lora_greedy_causal_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/lora_greedy_causal_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, download_test_content, sample_args] run_sample(py_command) \ No newline at end of file diff --git a/tests/python_tests/samples/test_multinomial_causal_lm.py b/tests/python_tests/samples/test_multinomial_causal_lm.py index ab89580992..49a0dd0c51 100644 --- a/tests/python_tests/samples/test_multinomial_causal_lm.py +++ b/tests/python_tests/samples/test_multinomial_causal_lm.py @@ -25,18 +25,18 @@ class TestMultinomialCausalLM: ) def test_sample_multinomial_causal_lm(self, convert_model, sample_args): # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 
'multinomial_causal_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'multinomial_causal_lm').as_posix() cpp_command = [cpp_sample, convert_model, sample_args] cpp_result = run_sample(cpp_command) # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/multinomial_causal_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/multinomial_causal_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, sample_args] py_result = run_sample(py_command) # Test JS sample - js_sample = os.path.join(SAMPLES_JS_DIR, "text_generation/multinomial_causal_lm.js") + js_sample = (SAMPLES_JS_DIR / "text_generation/multinomial_causal_lm.js").as_posix() js_command =['node', js_sample, convert_model, sample_args] js_result = run_sample(js_command) diff --git a/tests/python_tests/samples/test_prompt_lookup_decoding_lm.py b/tests/python_tests/samples/test_prompt_lookup_decoding_lm.py index 40b938c648..c66ec06224 100644 --- a/tests/python_tests/samples/test_prompt_lookup_decoding_lm.py +++ b/tests/python_tests/samples/test_prompt_lookup_decoding_lm.py @@ -38,18 +38,18 @@ def test_prompt_lookup_decoding_lm(self, convert_model, sample_args): env = os.environ.copy() env["OPENVINO_LOG_LEVEL"] = "0" # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'prompt_lookup_decoding_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'prompt_lookup_decoding_lm').as_posix() cpp_command =[cpp_sample, convert_model, sample_args] cpp_result = run_sample(cpp_command, env=env) # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/prompt_lookup_decoding_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/prompt_lookup_decoding_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, sample_args] py_result = run_sample(py_command, env=env) # Greedy decoding - cpp_sample_ref = os.path.join(SAMPLES_CPP_DIR, 'greedy_causal_lm') + cpp_sample_ref = (SAMPLES_CPP_DIR / 'greedy_causal_lm').as_posix() cpp_command_ref = [cpp_sample_ref, convert_model, sample_args] cpp_result_ref = run_sample(cpp_command_ref, env=env) diff --git a/tests/python_tests/samples/test_rag_sample.py b/tests/python_tests/samples/test_rag_sample.py index 272c2fc270..ca88793378 100644 --- a/tests/python_tests/samples/test_rag_sample.py +++ b/tests/python_tests/samples/test_rag_sample.py @@ -15,17 +15,17 @@ class TestTextEmbeddingPipeline: @pytest.mark.parametrize("convert_model", ["bge-small-en-v1.5"], indirect=True) def test_sample_text_embedding_pipeline(self, convert_model): # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, "text_embeddings") + cpp_sample = (SAMPLES_CPP_DIR / "text_embeddings").as_posix() cpp_command = [cpp_sample, convert_model, "Document 1", "Document 2"] cpp_result = run_sample(cpp_command) # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "rag/text_embeddings.py") + py_script = (SAMPLES_PY_DIR / "rag/text_embeddings.py").as_posix() py_command = [sys.executable, py_script, convert_model, "Document 1", "Document 2"] py_result = run_sample(py_command) # Run JS sample - js_sample = os.path.join(SAMPLES_JS_DIR, "rag/text_embeddings.js") + js_sample = (SAMPLES_JS_DIR / "rag/text_embeddings.js").as_posix() js_command = ["node", js_sample, convert_model, "Document 1", "Document 2"] js_result = run_sample(js_command) @@ -40,7 +40,7 @@ class TestTextRerankPipeline: @pytest.mark.parametrize("convert_model", ["ms-marco-TinyBERT-L2-v2"], indirect=True) def test_sample_text_rerank_pipeline(self, convert_model): # Run Python sample - py_script = 
os.path.join(SAMPLES_PY_DIR, "rag/text_rerank.py") + py_script = (SAMPLES_PY_DIR / "rag/text_rerank.py").as_posix() document_1 = "Intel Core Ultra processors incorporate an AI-optimized\ architecture that supports new user experiences and the\ next wave of commercial applications." @@ -51,7 +51,7 @@ def test_sample_text_rerank_pipeline(self, convert_model): py_result = run_sample(py_command) # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, "text_rerank") + cpp_sample = (SAMPLES_CPP_DIR / "text_rerank").as_posix() cpp_command = [cpp_sample, convert_model, "What are the main features of Intel Core Ultra processors?", document_1, document_2] cpp_result = run_sample(cpp_command) diff --git a/tests/python_tests/samples/test_react_sample.py b/tests/python_tests/samples/test_react_sample.py index f7a312ac56..5a5fb81f54 100644 --- a/tests/python_tests/samples/test_react_sample.py +++ b/tests/python_tests/samples/test_react_sample.py @@ -17,12 +17,12 @@ def test_react_sample_refs(self, request, convert_model): if sys.platform == 'darwin': pytest.xfail("Ticket 173586") # Python test - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/react_sample.py") + py_script = (SAMPLES_PY_DIR / "text_generation/react_sample.py").as_posix() py_command = [sys.executable, py_script, convert_model] py_result = run_sample(py_command) # Test JS sample - js_sample = os.path.join(SAMPLES_JS_DIR, "text_generation/react_sample.js") + js_sample = (SAMPLES_JS_DIR / "text_generation/react_sample.js").as_posix() js_command =['node', js_sample, convert_model] js_result = run_sample(js_command) diff --git a/tests/python_tests/samples/test_speculative_decoding_lm.py b/tests/python_tests/samples/test_speculative_decoding_lm.py index 35a1bb285a..adff957686 100644 --- a/tests/python_tests/samples/test_speculative_decoding_lm.py +++ b/tests/python_tests/samples/test_speculative_decoding_lm.py @@ -27,17 +27,17 @@ def test_sample_speculative_decoding_lm(self, convert_model, convert_draft_model env = os.environ.copy() env["OPENVINO_LOG_LEVEL"] = "0" # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'speculative_decoding_lm') + cpp_sample = (SAMPLES_CPP_DIR / 'speculative_decoding_lm').as_posix() cpp_command =[cpp_sample, convert_model, convert_draft_model, sample_args] cpp_result = run_sample(cpp_command, env=env) # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/speculative_decoding_lm.py") + py_script = (SAMPLES_PY_DIR / "text_generation/speculative_decoding_lm.py").as_posix() py_command = [sys.executable, py_script, convert_model, convert_draft_model, sample_args] py_result = run_sample(py_command, env=env) # Greedy decoding - cpp_sample_ref = os.path.join(SAMPLES_CPP_DIR, 'greedy_causal_lm') + cpp_sample_ref = (SAMPLES_CPP_DIR / 'greedy_causal_lm').as_posix() cpp_command_ref = [cpp_sample_ref, convert_model, sample_args] cpp_result_ref = run_sample(cpp_command_ref, env=env) diff --git a/tests/python_tests/samples/test_structured_output_sample.py b/tests/python_tests/samples/test_structured_output_sample.py index 88e8f3a004..7c1c4ea7d5 100644 --- a/tests/python_tests/samples/test_structured_output_sample.py +++ b/tests/python_tests/samples/test_structured_output_sample.py @@ -40,7 +40,7 @@ class Transaction(BaseModel): ("Generate 10000 horses.", {"person": 0, "car": 0, "transaction": 0}), ]) def test_python_structured_output_sample(convert_model, prompt, expected_quantities): - py_script = os.path.join(SAMPLES_PY_DIR, "text_generation/structured_output_generation.py") 
+ py_script = (SAMPLES_PY_DIR / "text_generation/structured_output_generation.py").as_posix() py_command = [sys.executable, py_script, convert_model] user_input = prompt + "\n" @@ -97,7 +97,7 @@ def test_python_structured_output_sample(convert_model, prompt, expected_quantit def test_cpp_structured_output_sample(convert_model, prompt, final_answer): if sys.platform == 'darwin': pytest.xfail("Ticket 173586") - cpp_sample = os.path.join(SAMPLES_CPP_DIR, "structured_output_generation") + cpp_sample = (SAMPLES_CPP_DIR / "structured_output_generation").as_posix() cpp_command = [cpp_sample, convert_model] user_input = prompt + "\n" diff --git a/tests/python_tests/samples/test_text2image.py b/tests/python_tests/samples/test_text2image.py index 407b89ca69..3390805d64 100644 --- a/tests/python_tests/samples/test_text2image.py +++ b/tests/python_tests/samples/test_text2image.py @@ -20,12 +20,12 @@ class TestText2Image: ) def test_sample_text2image(self, convert_model, sample_args): # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "image_generation/text2image.py") + py_script = (SAMPLES_PY_DIR / "image_generation/text2image.py").as_posix() py_command = [sys.executable, py_script, convert_model, sample_args] run_sample(py_command) # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'text2image') + cpp_sample = (SAMPLES_CPP_DIR / 'text2image').as_posix() cpp_command = [cpp_sample, convert_model, sample_args] run_sample(cpp_command) @@ -44,6 +44,6 @@ def test_sample_text2image(self, convert_model, sample_args): ) def test_sample_text2image_concurrency(self, convert_model, sample_args): # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'text2image_concurrency') + cpp_sample = (SAMPLES_CPP_DIR / 'text2image_concurrency').as_posix() cpp_command = [cpp_sample, convert_model, *sample_args] run_sample(cpp_command) diff --git a/tests/python_tests/samples/test_text2speech.py b/tests/python_tests/samples/test_text2speech.py index 2b7ba202ca..8f87858c92 100644 --- a/tests/python_tests/samples/test_text2speech.py +++ b/tests/python_tests/samples/test_text2speech.py @@ -37,12 +37,12 @@ def teardown_class(self): def test_sample_text_to_speech(self, convert_model, input_prompt): # Example: text2speech spt5_model_dir "Hello everyone" --speaker_embedding_file_path xvector.bin # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'text2speech') + cpp_sample = (SAMPLES_CPP_DIR / 'text2speech').as_posix() cpp_command = [cpp_sample, convert_model, input_prompt, self.temp_speaker_embedding_file.name] cpp_result = run_sample(cpp_command) # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "speech_generation/text2speech.py") + py_script = (SAMPLES_PY_DIR / "speech_generation/text2speech.py").as_posix() py_command = [sys.executable, py_script, convert_model, input_prompt, "--speaker_embedding_file_path", self.temp_speaker_embedding_file.name] py_result = run_sample(py_command) @@ -59,12 +59,12 @@ def test_sample_text_to_speech(self, convert_model, input_prompt): def test_sample_text_to_speech_no_speaker_embedding_file(self, convert_model, input_prompt): # Run C++ sample # Example: text2speech spt5_model_dir "Hello everyone" --speaker_embedding_file_path xvector.bin - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'text2speech') + cpp_sample = (SAMPLES_CPP_DIR / 'text2speech').as_posix() cpp_command = [cpp_sample, convert_model, input_prompt] cpp_result = run_sample(cpp_command) # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "speech_generation/text2speech.py") + 
py_script = (SAMPLES_PY_DIR / "speech_generation/text2speech.py").as_posix() py_command = [sys.executable, py_script, convert_model, input_prompt] py_result = run_sample(py_command) diff --git a/tests/python_tests/samples/test_tools_llm_benchmark.py b/tests/python_tests/samples/test_tools_llm_benchmark.py index 8ab52588cf..a448420e37 100644 --- a/tests/python_tests/samples/test_tools_llm_benchmark.py +++ b/tests/python_tests/samples/test_tools_llm_benchmark.py @@ -9,7 +9,8 @@ from data.models import get_gguf_model_list from utils.hugging_face import download_gguf_model from conftest import SAMPLES_PY_DIR, convert_model, download_test_content -from utils.hugging_face import download_and_convert_embeddings_models, download_and_convert_model +from utils.hugging_face import download_and_convert_model_class, download_and_convert_model +from optimum.intel import OVModelForFeatureExtraction convert_draft_model = convert_model download_mask_image = download_test_content @@ -38,7 +39,7 @@ class TestBenchmarkLLM: ) def test_python_tool_llm_benchmark_download_model(self, download_model, sample_args): # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [sys.executable, benchmark_script, "-m" , download_model] + sample_args run_sample(benchmark_py_command) @@ -50,8 +51,8 @@ def test_python_tool_llm_benchmark_download_model(self, download_model, sample_a pytest.param("tiny-random-qwen2", ["-d", "cpu", "-n", "1", "-ic", "10", "--optimum"]), pytest.param("tiny-random-qwen2", ["-d", "cpu", "-n", "1", "-ic", "10", "--optimum", "--num_beams", "2"]), pytest.param("tiny-random-qwen2", ["-d", "cpu", "-n", "1", "-ic", "20", "--max_ngram_size", "3", "--num_assistant_tokens", "5", "-p", "'Why is the Sun yellow?'"]), - pytest.param("tiny-random-llava", [ "-ic", "4", "-pf", os.path.join(SAMPLES_PY_DIR, "llm_bench/prompts/llava-1.5-7b.jsonl")]), - pytest.param("tiny-random-llava", [ "-ic", "4", "--optimum", "-pf", os.path.join(SAMPLES_PY_DIR, "llm_bench/prompts/llava-1.5-7b.jsonl")]), + pytest.param("tiny-random-llava", [ "-ic", "4", "-pf", (SAMPLES_PY_DIR / "llm_bench/prompts/llava-1.5-7b.jsonl").as_posix()]), + pytest.param("tiny-random-llava", [ "-ic", "4", "--optimum", "-pf", (SAMPLES_PY_DIR / "llm_bench/prompts/llava-1.5-7b.jsonl").as_posix()]), pytest.param("tiny-random-latent-consistency", [ "-d", "cpu", "-n", "1", "--num_steps", "4", "--static_reshape", "-p", "'an astronaut riding a horse on mars'"]), pytest.param("tiny-random-latent-consistency", [ "-d", "cpu", "-n", "1", "--num_steps", "4", "--static_reshape", "-p", "'an astronaut riding a horse on mars'", "--optimum"]), ], @@ -59,7 +60,7 @@ def test_python_tool_llm_benchmark_download_model(self, download_model, sample_a ) def test_python_tool_llm_benchmark_convert_model(self, convert_model, sample_args): # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model] + sample_args run_sample(benchmark_py_command) @@ -76,7 +77,7 @@ def test_python_tool_llm_benchmark_convert_model(self, convert_model, sample_arg @pytest.mark.parametrize("download_test_content", ["cat"], indirect=True) def test_python_tool_llm_benchmark_convert_model_media(self, convert_model, download_test_content, sample_args): # Run Python benchmark - 
benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model, "--media", download_test_content] + sample_args run_sample(benchmark_py_command) @@ -96,7 +97,7 @@ def test_python_tool_llm_benchmark_speculative(self, convert_model, convert_draf Test Speculative Decoding via GenAI """ # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [sys.executable, benchmark_script, "-m" , convert_model, "--draft_model", convert_draft_model, "-p", prompt] + sample_args run_sample(benchmark_py_command) @@ -115,7 +116,7 @@ def test_python_tool_llm_benchmark_jsonl(self, convert_model, generate_image_gen Test Speculative Decoding via GenAI with JSONL input """ # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -134,7 +135,7 @@ def test_python_tool_llm_benchmark_jsonl_lora(self, request, convert_model, down model_name = request.node.callspec.params['download_model'] # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -157,7 +158,7 @@ def test_python_tool_llm_benchmark_inpainting(self, convert_model, download_test os.chdir(os.path.dirname(download_test_content)) # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -178,7 +179,7 @@ def test_python_tool_llm_benchmark_i2i(self, convert_model, download_test_conten os.chdir(os.path.dirname(download_test_content)) # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -194,7 +195,7 @@ def test_python_tool_llm_benchmark_i2i(self, convert_model, download_test_conten @pytest.mark.parametrize("download_test_content", ["cmu_us_awb_arctic-wav-arctic_a0001.bin"], indirect=True) def test_python_tool_llm_benchmark_tts(self, convert_model, download_test_content, sample_args): # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -210,9 +211,10 @@ def test_python_tool_llm_benchmark_tts(self, convert_model, download_test_conten @pytest.mark.parametrize("convert_model", ["WhisperTiny"], indirect=True) @pytest.mark.parametrize("download_test_content", ["3283_1447_000.tar.gz"], indirect=True) def test_python_tool_llm_benchmark_optimum(self, convert_model, download_test_content, media_file, sample_args): - media_path = os.path.join(download_test_content, media_file) + from pathlib import Path + media_path = (Path(download_test_content) / media_file).as_posix() # Run Python benchmark - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + 
benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -230,7 +232,7 @@ def test_python_tool_llm_benchmark_optimum(self, convert_model, download_test_co ["-d", "cpu", "-n", "1", "--embedding_max_length", "128", "--embedding_normalize", "--embedding_pooling", "mean", "--optimum", "--task", "text_embed"], ]) def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_args): - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -240,14 +242,14 @@ def test_python_tool_llm_benchmark_text_embeddings(self, convert_model, sample_a @pytest.mark.samples - @pytest.mark.parametrize("download_and_convert_embeddings_models", ["Qwen/Qwen3-Embedding-0.6B"], indirect=True) + @pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Embedding-0.6B"], indirect=True) @pytest.mark.parametrize("sample_args", [ ["-d", "cpu", "-n", "2", "--task", "text_embed", "--embedding_padding_side", "left", "--embedding_pooling", "last_token"], ["-d", "cpu", "-n", "2", "--task", "text_embed", "--embedding_padding_side", "left", "--embedding_pooling", "last_token", "--optimum"], ]) - def test_python_tool_llm_benchmark_text_embeddings_qwen3(self, download_and_convert_embeddings_models, sample_args): - convert_model, hf_tokenizer, models_path = download_and_convert_embeddings_models - benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') + def test_python_tool_llm_benchmark_text_reranking(self, model_id, sample_args): + _, _, models_path = download_and_convert_model_class(model_id, OVModelForFeatureExtraction) + benchmark_script = (SAMPLES_PY_DIR / 'llm_bench/benchmark.py').as_posix() benchmark_py_command = [ sys.executable, benchmark_script, @@ -280,12 +282,13 @@ def test_python_tool_llm_benchmark_text_reranking(self, convert_model, sample_ar ["-d", "cpu", "-n", "1", "--task", "text_rerank", "--optimum"], ]) def test_python_tool_llm_benchmark_text_reranking_qwen3(self, model_id, sample_args): - model, hf_tokenizer, models_path = download_and_convert_model(model_id) + model_schema = download_and_convert_model(model_id) benchmark_script = os.path.join(SAMPLES_PY_DIR, 'llm_bench/benchmark.py') benchmark_py_command = [ sys.executable, benchmark_script, - "-m", models_path, + "-m", + model_schema.models_path, ] + sample_args run_sample(benchmark_py_command) diff --git a/tests/python_tests/samples/test_utils.py b/tests/python_tests/samples/test_utils.py index eb14509175..24349fe630 100644 --- a/tests/python_tests/samples/test_utils.py +++ b/tests/python_tests/samples/test_utils.py @@ -4,7 +4,11 @@ import os import subprocess # nosec B404 -def run_sample(command, input_data=None, env=os.environ): +def run_sample( + command: list[str], + input_data: str | None = None, + env: dict[str, str] = os.environ, +): logger.info(f"Running sample command: {' '.join(map(str, command))}") if input_data: logger.info(f"Input data: {input_data}") diff --git a/tests/python_tests/samples/test_visual_language_chat.py b/tests/python_tests/samples/test_visual_language_chat.py index b0c25a80b7..ac9c55f787 100644 --- a/tests/python_tests/samples/test_visual_language_chat.py +++ b/tests/python_tests/samples/test_visual_language_chat.py @@ -25,12 +25,12 @@ class TestVisualLanguageChat: ) def test_sample_visual_language_chat(self, convert_model, download_test_content, questions): # Test CPP 
sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'visual_language_chat') + cpp_sample = (SAMPLES_CPP_DIR / 'visual_language_chat').as_posix() cpp_command =[cpp_sample, convert_model, download_test_content] cpp_result = run_sample(cpp_command, questions) # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/visual_language_chat.py") + py_script = (SAMPLES_PY_DIR / "visual_language_chat/visual_language_chat.py").as_posix() py_command = [sys.executable, py_script, convert_model, download_test_content] py_result = run_sample(py_command, questions) @@ -50,12 +50,12 @@ def test_sample_visual_language_chat(self, convert_model, download_test_content, @pytest.mark.parametrize("generate_test_content", ["images/lines.png"], indirect=True) def test_sample_visual_language_chat_images(self, convert_model, download_test_content, generate_test_content, questions): # Test Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "visual_language_chat/visual_language_chat.py") + py_script = (SAMPLES_PY_DIR / "visual_language_chat/visual_language_chat.py").as_posix() py_command = [sys.executable, py_script, convert_model, os.path.dirname(generate_test_content)] py_result = run_sample(py_command, questions) # Test CPP sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'visual_language_chat') + cpp_sample = (SAMPLES_CPP_DIR / 'visual_language_chat').as_posix() cpp_command =[cpp_sample, convert_model, os.path.dirname(generate_test_content)] cpp_result = run_sample(cpp_command, questions) diff --git a/tests/python_tests/samples/test_whisper_speech_recognition.py b/tests/python_tests/samples/test_whisper_speech_recognition.py index 9989b6e868..ecca6cbccb 100644 --- a/tests/python_tests/samples/test_whisper_speech_recognition.py +++ b/tests/python_tests/samples/test_whisper_speech_recognition.py @@ -15,17 +15,17 @@ class TestWhisperSpeechRecognition: @pytest.mark.parametrize("download_test_content", ["how_are_you_doing_today.wav"], indirect=True) def test_sample_whisper_speech_recognition(self, convert_model, download_test_content): # Run C++ sample - cpp_sample = os.path.join(SAMPLES_CPP_DIR, 'whisper_speech_recognition') + cpp_sample = (SAMPLES_CPP_DIR / 'whisper_speech_recognition').as_posix() cpp_command = [cpp_sample, convert_model, download_test_content] cpp_result = run_sample(cpp_command) # Run Python sample - py_script = os.path.join(SAMPLES_PY_DIR, "whisper_speech_recognition/whisper_speech_recognition.py") + py_script = (SAMPLES_PY_DIR / "whisper_speech_recognition/whisper_speech_recognition.py").as_posix() py_command = [sys.executable, py_script, convert_model, download_test_content] py_result = run_sample(py_command) # Run C sample - c_sample = os.path.join(SAMPLES_C_DIR, 'whisper_speech_recognition_c') + c_sample = (SAMPLES_C_DIR / 'whisper_speech_recognition_c').as_posix() c_command = [c_sample, convert_model, download_test_content] c_result = run_sample(c_command) diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index a2a9efc1a2..de437c11af 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -16,7 +16,7 @@ from utils.generation_config import get_greedy, get_beam_search, \ get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p -from utils.hugging_face import download_and_convert_model +from 
utils.hugging_face import OVConvertedModelSchema, download_and_convert_model from utils.ov_genai_pipelines import create_ov_pipeline, create_ov_cb_pipeline, PipelineType, dict_to_scheduler_config, generate_and_compare, prepare_generation_config_by_pipe_type from data.models import get_chat_models_list from data.test_dataset import get_test_dataset @@ -25,7 +25,23 @@ # e2e tests on random and real models # -def read_models_list(file_name: str): +CURRENT_DIR_NAME = Path(__file__).parent + +COMMON_QUESTIONS = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] + + +COMMON_QUESTIONS_SHORT = [ + '1+1=', + 'Why is the Sun yellow?', +] + + +def read_models_list(file_name: str) -> list[str]: models = [] with open(file_name, encoding="utf-8") as f: for model_name in f: @@ -36,48 +52,65 @@ def read_models_list(file_name: str): models.append(model_name) return models + +@pytest.fixture(scope="module") +def llm_model(request: pytest.FixtureRequest) -> OVConvertedModelSchema: + return download_and_convert_model(request.param) + + +@pytest.fixture(scope="module") +def model_facebook_opt_125m() -> OVConvertedModelSchema: + model_id : str = "facebook/opt-125m" + return download_and_convert_model(model_id) + + @pytest.mark.precommit -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) -def test_e2e_precommit(model_id): +@pytest.mark.parametrize("llm_model", read_models_list(CURRENT_DIR_NAME / "models" / "precommit"), indirect=True) +def test_e2e_precommit(llm_model: OVConvertedModelSchema): prompts, generation_configs = get_test_dataset() - generate_and_compare(prompts=prompts, - generation_config=generation_configs, - model=model_id, - pipeline_type=PipelineType.CONTINUOUS_BATCHING) + generate_and_compare( + model_schema=llm_model, + prompts=prompts, + generation_config=generation_configs, + pipeline_type=PipelineType.CONTINUOUS_BATCHING, + ) @pytest.mark.real_models -@pytest.mark.parametrize("model_id", read_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) -def test_e2e_real_models(model_id): +@pytest.mark.parametrize("llm_model", read_models_list(CURRENT_DIR_NAME / "models" / "real_models"), indirect=True) +def test_e2e_real_models(llm_model: OVConvertedModelSchema): prompts, generation_config = get_test_dataset() - generate_and_compare(prompts=prompts, - generation_config=generation_config, - model=model_id, - pipeline_type=PipelineType.CONTINUOUS_BATCHING) + generate_and_compare( + model_schema=llm_model, + prompts=prompts, + generation_config=generation_config, + pipeline_type=PipelineType.CONTINUOUS_BATCHING, + ) # # Comparison with stateful -# TODO: remove these tests once test_llm_pipeline.py are generalized and parametrized to test both Stateful and PA paths -# - -test_configs = [ - dict(max_new_tokens=20), - dict(max_new_tokens=200, ignore_eos=True), - dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) -] -batched_prompts = [ - ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest nowel ever: '], - ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], - ['table is made', 'table is made [force left pad tokens]'] -] -@pytest.mark.parametrize("generation_config", test_configs) -@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. 
+# TODO: remove these tests once test_llm_pipeline.py are generalized +# and parametrized to test both Stateful and PA paths +@pytest.mark.parametrize( + "generation_config", + [ + {"max_new_tokens": 20}, + {"max_new_tokens": 200, "ignore_eos": True}, + {"max_new_tokens": 20, "num_beam_groups": 3, "num_beams": 15, "diversity_penalty": 1.0}, + ] +) +@pytest.mark.parametrize( + "prompt", + [ + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'], + ] +) # num_beams=15 diverges on the first prompt. @pytest.mark.precommit @pytest.mark.skip(reason="CVS-162891: Fix test_continuous_batching_vs_stateful tests after we started to compare cb vs sdpa") -def test_continuous_batching_vs_stateful(prompt, generation_config): - model_id = "facebook/opt-125m" - _, _, models_path = download_and_convert_model(model_id, padding_side="left") +def test_continuous_batching_vs_stateful(model_facebook_opt_125m: OVConvertedModelSchema, prompt, generation_config): + models_path = model_facebook_opt_125m.models_path cb_pipe = create_ov_pipeline(models_path, pipeline_type=PipelineType.PAGED_ATTENTION) ov_pipe = create_ov_pipeline(models_path, pipeline_type=PipelineType.STATEFUL) @@ -91,12 +124,17 @@ def test_continuous_batching_vs_stateful(prompt, generation_config): assert math.isclose(gen, ref, abs_tol=0.0003) -prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] -@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize( + "prompt", + [ + 'The Sun is yellow because', + 'Difference between Jupiter and Mars is that', + 'table is made of' + ] +) @pytest.mark.precommit -def test_cb_streamer_vs_return_vs_stateful(prompt): - model_id = "facebook/opt-125m" - _, _, models_path = download_and_convert_model(model_id) +def test_cb_streamer_vs_return_vs_stateful(model_facebook_opt_125m: OVConvertedModelSchema, prompt: str): + models_path = model_facebook_opt_125m.models_path ov_pipe = create_ov_pipeline(models_path, pipeline_type=PipelineType.STATEFUL) cb_pipe = create_ov_pipeline(models_path, pipeline_type=PipelineType.PAGED_ATTENTION) @@ -108,22 +146,36 @@ def test_cb_streamer_vs_return_vs_stateful(prompt): assert "".join(streamed) == reference -generation_configs = [ - dict(do_sample=False, max_new_tokens=20), - dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0, repetition_penalty=1.0) -] -questions = [ - '1+1=', - 'What is the previous answer?', - 'Why is the Sun yellow?', - 'What was my first question?' 
-] -@pytest.mark.parametrize("generation_config_kwargs", generation_configs[1:]) -@pytest.mark.parametrize("model_id", get_chat_models_list()) -@pytest.mark.parametrize("pipeline_type", [PipelineType.PAGED_ATTENTION, PipelineType.PROMPT_LOOKUP_DECODING, PipelineType.SPECULATIVE_DECODING] ) @pytest.mark.precommit -def test_chat_scenario_vs_stateful(model_id, generation_config_kwargs: dict, pipeline_type): - _, _, models_path = download_and_convert_model(model_id) +@pytest.mark.parametrize( + "generation_config_kwargs", + [ + { + "do_sample": False, + "num_beam_groups": 3, + "num_beams": 15, + "num_return_sequences": 1, + "max_new_tokens": 10, + "diversity_penalty": 1.0, + "repetition_penalty": 1.0, + } + ] +) +@pytest.mark.parametrize("llm_model", get_chat_models_list(), indirect=True) +@pytest.mark.parametrize( + "pipeline_type", + [ + PipelineType.PAGED_ATTENTION, + PipelineType.PROMPT_LOOKUP_DECODING, + PipelineType.SPECULATIVE_DECODING, + ] +) +def test_chat_scenario_vs_stateful( + llm_model: OVConvertedModelSchema, + generation_config_kwargs: dict, + pipeline_type: PipelineType +): + models_path = llm_model.models_path ov_pipe = create_ov_pipeline(models_path, pipeline_type=PipelineType.STATEFUL) cb_pipe = create_ov_pipeline(models_path, pipeline_type=pipeline_type) @@ -136,11 +188,14 @@ def test_chat_scenario_vs_stateful(model_id, generation_config_kwargs: dict, pip if generation_config.is_beam_search() and pipeline_type != PipelineType.PAGED_ATTENTION: return - generation_config = prepare_generation_config_by_pipe_type(generation_config=generation_config, pipeline_type=pipeline_type) + generation_config = prepare_generation_config_by_pipe_type( + generation_config=generation_config, + pipeline_type=pipeline_type, + ) ov_pipe.set_generation_config(generation_config) - for question in questions: + for question in COMMON_QUESTIONS: generated = cb_pipe.generate(question, generation_config=generation_config) reference = ov_pipe.generate(question) assert generated == reference @@ -149,21 +204,30 @@ def test_chat_scenario_vs_stateful(model_id, generation_config_kwargs: dict, pip cb_pipe.finish_chat() -generation_configs = [ - dict(do_sample=False, max_new_tokens=20), - dict(do_sample=True, max_new_tokens=20, temperature=0.7), - dict(do_sample=False, num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0, repetition_penalty=1.0), -] -questions = [ - '1+1=', - 'Why is the Sun yellow?', -] -@pytest.mark.parametrize("generation_config_kwargs", generation_configs) -@pytest.mark.parametrize("model_id", get_chat_models_list()) -@pytest.mark.parametrize("pipeline_type", [PipelineType.CONTINUOUS_BATCHING, PipelineType.SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING,]) @pytest.mark.precommit -def test_continuous_batching_add_request_health_check(model_id, generation_config_kwargs: dict, pipeline_type): - _, _, models_path = download_and_convert_model(model_id) +@pytest.mark.parametrize("llm_model", get_chat_models_list(), indirect=True) +@pytest.mark.parametrize( + "generation_config_kwargs", + [ + {"do_sample": False, "max_new_tokens": 20}, + {"do_sample": True, "max_new_tokens": 20, "temperature": 0.7}, + {"do_sample": False, "num_beam_groups": 3, "num_beams": 15, "num_return_sequences": 1, "max_new_tokens": 10, "diversity_penalty": 1.0, "repetition_penalty": 1.0}, + ] +) +@pytest.mark.parametrize( + "pipeline_type", + [ + PipelineType.CONTINUOUS_BATCHING, + PipelineType.SPECULATIVE_DECODING, + PipelineType.PROMPT_LOOKUP_DECODING, + ] +) +def 
test_continuous_batching_add_request_health_check( + llm_model: OVConvertedModelSchema, + generation_config_kwargs: dict, + pipeline_type: PipelineType +): + models_path = llm_model.models_path cb_pipe = create_ov_cb_pipeline(models_path, pipeline_type=pipeline_type) @@ -174,7 +238,7 @@ def test_continuous_batching_add_request_health_check(model_id, generation_confi generation_config = prepare_generation_config_by_pipe_type(generation_config=generation_config, pipeline_type=pipeline_type) handles = [] - for idx, question in enumerate(questions): + for idx, question in enumerate(COMMON_QUESTIONS_SHORT): handle = cb_pipe.add_request(idx, question, generation_config=generation_config) handles.append(handle) @@ -186,15 +250,21 @@ def test_continuous_batching_add_request_health_check(model_id, generation_confi for output in outputs: assert output.finish_reason == GenerationFinishReason.STOP or output.finish_reason == GenerationFinishReason.LENGTH -invalid_generation_configs = [ - dict(max_length=1, ignore_eos=True) # max_length smaller than number of prompt tokens, generation should stop right away -] -@pytest.mark.parametrize("generation_config_kwargs", invalid_generation_configs) -@pytest.mark.parametrize("model_id", get_chat_models_list()) +@pytest.mark.parametrize( + "generation_config_kwargs", + [ + {"max_length": 1, "ignore_eos": True}, # max_length smaller than number of prompt tokens, generation should stop right away + ] +) +@pytest.mark.parametrize("llm_model", get_chat_models_list(), indirect=True) @pytest.mark.parametrize("pipeline_type", [PipelineType.CONTINUOUS_BATCHING, PipelineType.SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING,]) @pytest.mark.precommit -def test_continuous_batching_add_request_fails(model_id, generation_config_kwargs: dict, pipeline_type): - _, _, models_path = download_and_convert_model(model_id) +def test_continuous_batching_add_request_fails( + llm_model: OVConvertedModelSchema, + generation_config_kwargs: dict, + pipeline_type: PipelineType, +): + models_path = llm_model.models_path cb_pipe = create_ov_cb_pipeline(models_path, pipeline_type=pipeline_type) @@ -203,11 +273,12 @@ def test_continuous_batching_add_request_fails(model_id, generation_config_kwarg if generation_config.is_beam_search() and pipeline_type != PipelineType.CONTINUOUS_BATCHING: pytest.skip("Assisted generation does not support beam search") - generation_config = prepare_generation_config_by_pipe_type(generation_config=generation_config, pipeline_type=pipeline_type) - handles = [] - for idx, question in enumerate(questions): + generation_config = prepare_generation_config_by_pipe_type( + generation_config=generation_config, pipeline_type=pipeline_type + ) + for idx, question in enumerate(COMMON_QUESTIONS_SHORT): with pytest.raises(RuntimeError): - handle = cb_pipe.add_request(idx, question, generation_config=generation_config) + cb_pipe.add_request(idx, question, generation_config=generation_config) # # Stress tests to check OOM case @@ -215,9 +286,12 @@ def test_continuous_batching_add_request_fails(model_id, generation_config_kwarg # todo: iefode: bug reproducer!!! 
@pytest.mark.precommit -@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()], - ids=["greedy", "beam_search", "multinomial_all_parameters"]) -def test_post_oom_health(sampling_config): +@pytest.mark.parametrize( + "sampling_config", + [get_greedy(), get_beam_search(), get_multinomial_all_parameters()], + ids=["greedy", "beam_search", "multinomial_all_parameters"] +) +def test_post_oom_health(model_facebook_opt_125m: OVConvertedModelSchema, sampling_config): generation_config = sampling_config generation_config.ignore_eos = True generation_config.max_new_tokens = 1000000 @@ -225,13 +299,14 @@ def test_post_oom_health(sampling_config): scheduler_config = dict_to_scheduler_config() scheduler_config.num_kv_blocks = 10 # Low cache size to trigger OOM quickly - model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) + models_path = model_facebook_opt_125m.models_path - cb_pipe = create_ov_pipeline(models_path, - pipeline_type=PipelineType.CONTINUOUS_BATCHING, - device="CPU", - scheduler_config=scheduler_config) + cb_pipe = create_ov_pipeline( + models_path, + pipeline_type=PipelineType.CONTINUOUS_BATCHING, + device="CPU", + scheduler_config=scheduler_config, + ) # First run should return incomplete response output = cb_pipe.generate(["What is OpenVINO?"], [generation_config]) @@ -266,27 +341,33 @@ def get_beam_search_seq_len_300() -> GenerationConfig: generation_config.num_return_sequences = generation_config.num_beams return generation_config -scheduler_params_list = [({"num_kv_blocks": 2, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 2, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_parallel_sampling_seq_len_300()), - ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_parallel_sampling_seq_len_300()), - ({"num_kv_blocks": 34, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), - ({"num_kv_blocks": 34, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), - ({"num_kv_blocks": 100, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), - ({"num_kv_blocks": 100, "dynamic_split_fuse": False}, get_beam_search_seq_len_300())] -@pytest.mark.parametrize("params", scheduler_params_list) + +@pytest.mark.parametrize( + "params", + [ + ({"num_kv_blocks": 2, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 2, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 10, "dynamic_split_fuse": True}, get_parallel_sampling_seq_len_300()), + ({"num_kv_blocks": 10, "dynamic_split_fuse": False}, get_parallel_sampling_seq_len_300()), + ({"num_kv_blocks": 34, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 34, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 100, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 100, "dynamic_split_fuse": False}, get_beam_search_seq_len_300()), + ] +) @pytest.mark.precommit -def test_preemption(params): - model_id = "facebook/opt-125m" +def test_preemption(model_facebook_opt_125m: 
OVConvertedModelSchema, params): scheduler_params = params[0] generation_config = params[1] prompts, _ = get_test_dataset() - generate_and_compare(prompts=prompts, - pipeline_type=PipelineType.CONTINUOUS_BATCHING, - model=model_id, - scheduler_config=scheduler_params, - generation_config=generation_config) + generate_and_compare( + model_schema=model_facebook_opt_125m, + prompts=prompts, + pipeline_type=PipelineType.CONTINUOUS_BATCHING, + scheduler_config=scheduler_params, + generation_config=generation_config + ) multinomial_params = RandomSamplingTestStruct( generation_config=[ @@ -327,23 +408,23 @@ def test_preemption(params): # todo: Anastasiia Pnevskaya: fix the test because it is hanging according max_new_tokens = std::numeric_limits::max() -@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.precommit +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.") -def test_preemption_with_multinomial(dynamic_split_fuse): +def test_preemption_with_multinomial(model_facebook_opt_125m: OVConvertedModelSchema, dynamic_split_fuse): generation_configs = multinomial_params.generation_config for config in generation_configs: config.max_new_tokens = 30 - model_id : str = "facebook/opt-125m" - model, hf_tokenizer, models_path = download_and_convert_model(model_id) scheduler_config = dict_to_scheduler_config({"num_kv_blocks": 3, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare(model=models_path, - pipeline_type=PipelineType.CONTINUOUS_BATCHING, - prompts=multinomial_params.prompts, - ref=multinomial_params.ref_texts, - generation_config=generation_configs, - scheduler_config=scheduler_config) + generate_and_compare( + model_schema=model_facebook_opt_125m, + pipeline_type=PipelineType.CONTINUOUS_BATCHING, + prompts=multinomial_params.prompts, + ref=multinomial_params.ref_texts, + generation_config=generation_configs, + scheduler_config=scheduler_config, + ) multinomial_params_n_seq = RandomSamplingTestStruct( @@ -413,9 +494,8 @@ def test_preemption_with_multinomial(dynamic_split_fuse): @pytest.mark.parametrize("dynamic_split_fuse", [True, False]) @pytest.mark.precommit @pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. 
Test passes on CI but fails locally.") -def test_preemption_with_multinomial_n_seq(dynamic_split_fuse): - model_id : str = "facebook/opt-125m" - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) +def test_preemption_with_multinomial_n_seq(model_facebook_opt_125m: OVConvertedModelSchema, dynamic_split_fuse): + models_path = model_facebook_opt_125m.models_path # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) scheduler_config = dict_to_scheduler_config({"num_kv_blocks": 8, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) @@ -427,11 +507,11 @@ def test_preemption_with_multinomial_n_seq(dynamic_split_fuse): scheduler_config=scheduler_config) -@pytest.mark.parametrize("pipeline_type", [PipelineType.PROMPT_LOOKUP_DECODING]) @pytest.mark.precommit -def test_dynamic_split_fuse_doesnt_affect_generated_text(pipeline_type): +def test_dynamic_split_fuse_doesnt_affect_generated_text(): model_id : str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - _, _, models_path = download_and_convert_model(model_id) + pipeline_type = PipelineType.PROMPT_LOOKUP_DECODING + models_path = download_and_convert_model(model_id).models_path scheduler_config_ref = dict_to_scheduler_config({"dynamic_split_fuse": False, "max_num_batched_tokens": sys.maxsize}) cb_pipe_ref = create_ov_pipeline(models_path, scheduler_config=scheduler_config_ref, pipeline_type=pipeline_type) @@ -451,44 +531,37 @@ def test_dynamic_split_fuse_doesnt_affect_generated_text(pipeline_type): assert generated == reference -def get_data_by_pipeline_type(model_path: Path, pipeline_type: str, generation_config: GenerationConfig): - device = "CPU" - prompt = "Prompt example is" - generation_config.max_new_tokens = 10 - pipe = None - if pipeline_type == "continuous_batching": - scheduler_config = SchedulerConfig() - pipe = ContinuousBatchingPipeline(model_path, scheduler_config, device) - prompt = [prompt] - generation_config = [generation_config] - elif pipeline_type == "speculative_decoding": - generation_config.assistant_confidence_threshold = 0.4 - pipe = LLMPipeline(model_path, device, draft_model=draft_model(model_path)) - elif pipeline_type == "prompt_lookup_decoding": - generation_config.num_assistant_tokens = 5 - generation_config.max_ngram_size = 3 - pipe = LLMPipeline(model_path, device, prompt_lookup=True) - elif "llm_pipeline": - pipe = LLMPipeline(model_path, device) - else: - raise RuntimeError(f"{pipeline_type} is unknown pipeline type!") - return pipe, prompt, generation_config - - -def run_extended_perf_metrics_collection(model_id, generation_config: GenerationConfig, prompt: str, pipeline_type: PipelineType): - _, _, model_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(model_path, pipeline_type=pipeline_type) - return ov_pipe.generate([prompt], generation_config).extended_perf_metrics - - -@pytest.mark.parametrize("pipeline_type", [PipelineType.PAGED_ATTENTION, PipelineType.SPECULATIVE_DECODING]) @pytest.mark.precommit -def test_speculative_decoding_extended_perf_metrics(pipeline_type): +@pytest.mark.parametrize( + "pipeline_type", + [ + PipelineType.PAGED_ATTENTION, + PipelineType.SPECULATIVE_DECODING, + ] +) +def test_speculative_decoding_extended_perf_metrics(pipeline_type: PipelineType): + def run_extended_perf_metrics_collection( + model_id: str, + generation_config: GenerationConfig, + prompt: str, + pipeline_type: PipelineType + ): + model_path = 
download_and_convert_model(model_id).models_path + ov_pipe = create_ov_pipeline(model_path, pipeline_type=pipeline_type) + return ov_pipe.generate([prompt], generation_config).extended_perf_metrics + import time start_time = time.perf_counter() model_id : str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - generation_config = GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5) - extended_perf_metrics = run_extended_perf_metrics_collection(model_id, generation_config, "Why is the Sun yellow?", pipeline_type) + generation_config = GenerationConfig( + do_sample=False, + max_new_tokens=20, + ignore_eos=True, + num_assistant_tokens=5, + ) + extended_perf_metrics = run_extended_perf_metrics_collection( + model_id, generation_config, "Why is the Sun yellow?", pipeline_type + ) total_time = (time.perf_counter() - start_time) * 1000 if (pipeline_type == PipelineType.SPECULATIVE_DECODING): @@ -512,7 +585,10 @@ def test_speculative_decoding_extended_perf_metrics(pipeline_type): total_iteration_number_draft = len(extended_perf_metrics.draft_model_metrics.raw_metrics.m_durations) assert total_iteration_number_draft > 0 and total_iteration_number_draft < ((generation_config.max_new_tokens - 1) * generation_config.num_assistant_tokens + 1) - for model_metrics in [extended_perf_metrics.main_model_metrics, extended_perf_metrics.draft_model_metrics]: + for model_metrics in [ + extended_perf_metrics.main_model_metrics, + extended_perf_metrics.draft_model_metrics, + ]: mean_ttst, std_ttst = model_metrics.get_ttst() assert (mean_ttst, std_ttst) == (model_metrics.get_ttst().mean, model_metrics.get_ttst().std) assert mean_ttst > 0 and mean_ttst < model_metrics.get_ttft().mean diff --git a/tests/python_tests/test_gguf_reader.py b/tests/python_tests/test_gguf_reader.py index 63a69694d0..81e05d9547 100644 --- a/tests/python_tests/test_gguf_reader.py +++ b/tests/python_tests/test_gguf_reader.py @@ -7,30 +7,75 @@ import gc import sys from pathlib import Path +from dataclasses import dataclass +from typing import Any import openvino as ov import openvino_genai as ov_genai -from utils.hugging_face import generation_config_to_hf, download_gguf_model, load_hf_model_from_gguf, load_hf_tokenizer_from_gguf -from utils.ov_genai_pipelines import create_ov_pipeline, get_gguf_pipeline_types +from utils.hugging_face import ( + generation_config_to_hf, + download_gguf_model, + load_hf_model_from_gguf, + load_hf_tokenizer_from_gguf, +) +from utils.ov_genai_pipelines import ( + create_ov_pipeline, + get_gguf_pipeline_types, + PipelineType, +) from data.models import get_gguf_model_list -@pytest.mark.parametrize("pipeline_type", get_gguf_pipeline_types()) -@pytest.mark.parametrize("model_ids", get_gguf_model_list()) +GGUF_PIPELINE_TYPES = get_gguf_pipeline_types() +GGUF_MODEL_LIST = get_gguf_model_list() + + +@dataclass(frozen=True) +class ModelInfo: + gguf_model_id: str + gguf_filename: str + gguf_full_path: str + dynamic_quantization_group_size: str | None + opt_model: Any | None + hf_tokenizer: Any | None + + +@pytest.fixture(scope="module") +def model_gguf(request: pytest.FixtureRequest) -> ModelInfo: + meta_info = request.param + gguf_model_id = meta_info["gguf_model_id"] + gguf_filename = meta_info["gguf_filename"] + opt_model = load_hf_model_from_gguf(gguf_model_id, gguf_filename) + hf_tokenizer = load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename) + gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename) + return ModelInfo( + gguf_model_id=gguf_model_id, + 
gguf_filename=gguf_filename, + gguf_full_path=gguf_full_path, + dynamic_quantization_group_size=meta_info["dynamic_quantization_group_size"], + opt_model=opt_model, + hf_tokenizer=hf_tokenizer, + ) + + @pytest.mark.precommit +@pytest.mark.parametrize("pipeline_type", GGUF_PIPELINE_TYPES) +@pytest.mark.parametrize("model_gguf", GGUF_MODEL_LIST, indirect=True) @pytest.mark.skipif(sys.platform == "win32", reason="CVS-174065") -def test_pipelines_with_gguf_generate(pipeline_type, model_ids): +def test_pipelines_with_gguf_generate( + model_gguf: ModelInfo, + pipeline_type: PipelineType, +): if sys.platform == 'darwin': pytest.skip(reason="168882: Sporadic segmentation fault failure on MacOS.") - gguf_model_id = model_ids["gguf_model_id"] - gguf_filename = model_ids["gguf_filename"] - dynamic_quantization_group_size = model_ids["dynamic_quantization_group_size"] - prompt = 'Why is the Sun yellow?' - opt_model = load_hf_model_from_gguf(gguf_model_id, gguf_filename) - hf_tokenizer = load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename) - gc.collect() + opt_model = model_gguf.opt_model + hf_tokenizer = model_gguf.hf_tokenizer + gguf_full_path = model_gguf.gguf_full_path + dynamic_quantization_group_size = model_gguf.dynamic_quantization_group_size + + prompt = 'Why is the Sun yellow?' ov_generation_config = ov_genai.GenerationConfig() ov_generation_config.max_new_tokens = 30 @@ -42,15 +87,25 @@ def test_pipelines_with_gguf_generate(pipeline_type, model_ids): hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) generate_outputs = None with torch.no_grad(): - generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) - del opt_model - gc.collect() + generate_outputs = opt_model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + generation_config=hf_generation_config, + tokenizer=hf_tokenizer, + ) + prompt_len = 0 if ov_generation_config.echo else input_ids.numel() - all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) + all_text_batch = hf_tokenizer.batch_decode( + [generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], + skip_special_tokens=True + ) res_string_input_1 = all_text_batch[0] - gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename) - ov_pipe_gguf = create_ov_pipeline(gguf_full_path, pipeline_type=pipeline_type, dynamic_quantization_group_size=dynamic_quantization_group_size) + ov_pipe_gguf = create_ov_pipeline( + gguf_full_path, + pipeline_type=pipeline_type, + dynamic_quantization_group_size=dynamic_quantization_group_size, + ) encoded_result = ov_pipe_gguf.generate(ov.Tensor(input_ids.numpy()), generation_config=ov_generation_config) del ov_pipe_gguf gc.collect() @@ -59,31 +114,44 @@ def test_pipelines_with_gguf_generate(pipeline_type, model_ids): assert res_string_input_1 == res_string_input_2 -@pytest.mark.parametrize("pipeline_type", get_gguf_pipeline_types()) -@pytest.mark.parametrize("model_ids", get_gguf_model_list()) -@pytest.mark.parametrize("enable_save_ov_model", [False, True]) -@pytest.mark.parametrize("prompt", [ - 'Why is the Sun yellow?', - # To check that special tokens are handled correctly. - '<|endoftext|> <|im_end|>', - '<|endoftext|><|endoftext|><|im_end|>', - '<|endoftext|> Why the Sky is Blue? 
<|im_end|>', -]) @pytest.mark.precommit +@pytest.mark.parametrize("pipeline_type", GGUF_PIPELINE_TYPES) +@pytest.mark.parametrize("enable_save_ov_model", [False, True]) +@pytest.mark.parametrize( + "prompt", + [ + 'Why is the Sun yellow?', + # To check that special tokens are handled correctly. + '<|endoftext|> <|im_end|>', + '<|endoftext|><|endoftext|><|im_end|>', + '<|endoftext|> Why the Sky is Blue? <|im_end|>', + ], + ids=[ + "regular_prompt", + "only_special_tokens", + "multiple_special_tokens", + "special_tokens_with_text" + ], +) +@pytest.mark.parametrize("model_gguf", GGUF_MODEL_LIST, indirect=True) @pytest.mark.skipif(sys.platform == "win32", reason="CVS-174065") -def test_full_gguf_pipeline(pipeline_type, model_ids, enable_save_ov_model, prompt): +def test_full_gguf_pipeline( + model_gguf: ModelInfo, + pipeline_type: PipelineType, + enable_save_ov_model: bool, + prompt: str, +): if sys.platform == 'darwin': pytest.skip(reason="168882: Sporadic segmentation fault failure on MacOS.") - gguf_model_id = model_ids["gguf_model_id"] - gguf_filename = model_ids["gguf_filename"] - dynamic_quantization_group_size = model_ids["dynamic_quantization_group_size"] + gguf_model_id = model_gguf.gguf_model_id + gguf_filename = model_gguf.gguf_filename + gguf_full_path = model_gguf.gguf_full_path + opt_model = model_gguf.opt_model + hf_tokenizer = model_gguf.hf_tokenizer + dynamic_quantization_group_size = model_gguf.dynamic_quantization_group_size if gguf_model_id == "sammysun0711/tiny-random-deepseek-distill-qwen-gguf" and "<|endoftext|>" in prompt: pytest.skip(reason="Prompts to test special tokens for this model fail on HF side") - - opt_model = load_hf_model_from_gguf(gguf_model_id, gguf_filename) - hf_tokenizer = load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename) - gc.collect() # TODO: remove explicit switch-off of bos token hf_tokenizer.add_bos_token = False @@ -98,14 +166,18 @@ def test_full_gguf_pipeline(pipeline_type, model_ids, enable_save_ov_model, prom hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) generate_outputs = None with torch.no_grad(): - generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) - del opt_model + generate_outputs = opt_model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + generation_config=hf_generation_config, + tokenizer=hf_tokenizer, + ) + gc.collect() prompt_len = 0 if ov_generation_config.echo else input_ids.numel() all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) res_string_input_1 = all_text_batch[0] - gguf_full_path = download_gguf_model(gguf_model_id, gguf_filename) ov_pipe_gguf = create_ov_pipeline(gguf_full_path, pipeline_type=pipeline_type, enable_save_ov_model=enable_save_ov_model, dynamic_quantization_group_size=dynamic_quantization_group_size) res_string_input_2 = ov_pipe_gguf.generate(prompt, generation_config=ov_generation_config) @@ -114,22 +186,30 @@ def test_full_gguf_pipeline(pipeline_type, model_ids, enable_save_ov_model, prom assert ov_pipe_gguf.get_tokenizer().get_bos_token() == hf_tokenizer.decode([ov_pipe_gguf.get_tokenizer().get_bos_token_id()]) del ov_pipe_gguf - gc.collect() if enable_save_ov_model: gguf_full_path = Path(gguf_full_path) ov_pipe_native = create_ov_pipeline(gguf_full_path.parent, pipeline_type=pipeline_type, 
dynamic_quantization_group_size=dynamic_quantization_group_size) res_string_input_3 = ov_pipe_native.generate(prompt, generation_config=ov_generation_config) - del ov_pipe_native - gc.collect() assert res_string_input_2 == res_string_input_3 assert res_string_input_1 == res_string_input_2 + gc.collect() + -@pytest.mark.parametrize("pipeline_type", get_gguf_pipeline_types()) -@pytest.mark.parametrize("model_ids", [{"gguf_model_id": "Qwen/Qwen3-0.6B-GGUF", "gguf_filename": "Qwen3-0.6B-Q8_0.gguf"}]) -@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 172335") @pytest.mark.precommit +@pytest.mark.parametrize("pipeline_type", GGUF_PIPELINE_TYPES) +@pytest.mark.parametrize( + "model_ids", + [ + { + "gguf_model_id": "Qwen/Qwen3-0.6B-GGUF", + "gguf_filename": "Qwen3-0.6B-Q8_0.gguf", + "dynamic_quantization_group_size": None, + } + ] +) +@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 172335") @pytest.mark.skipif(sys.platform == "win32", reason="CVS-174065") def test_full_gguf_qwen3_pipeline(pipeline_type, model_ids): # Temporal testing solution until transformers starts to support qwen3 in GGUF format diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 260adae397..c7e1786af2 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -123,7 +123,9 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(test_struct, apply scheduler_config_opt.sparse_attention_config.num_last_dense_tokens_in_prefill = 10 model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - _, tokenizer, models_path = download_and_convert_model(model_id) + model_schema = download_and_convert_model(model_id) + tokenizer = model_schema.hf_tokenizer + models_path = model_schema.models_path model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_llm_properties()) model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_llm_properties()) @@ -182,23 +184,26 @@ def get_beam_search_seq_len_300() -> GenerationConfig: scheduler_params_list = [ - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, get_beam_search_seq_len_300()), - ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300())] + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": True}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": True}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": True, "enable_prefix_caching": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "enable_prefix_caching": False}, 
get_beam_search_seq_len_300()), + ({"num_kv_blocks": 0, "cache_size": 0, "dynamic_split_fuse": False, "max_num_batched_tokens": 600, "use_cache_eviction": True, "cache_eviction_config": SHORT_CACHE_EVICTION_CONFIG}, get_greedy_seq_len_300()), +] @pytest.mark.parametrize("params", scheduler_params_list) @pytest.mark.precommit def test_dynamic_memory_allocation(params): prompts, _ = get_test_dataset() - generate_and_compare(prompts=prompts, - model="facebook/opt-125m", - scheduler_config=params[0], - generation_config=params[1], - pipeline_type=PipelineType.CONTINUOUS_BATCHING) + generate_and_compare( + model_schema=download_and_convert_model("facebook/opt-125m"), + prompts=prompts, + scheduler_config=params[0], + generation_config=params[1], + pipeline_type=PipelineType.CONTINUOUS_BATCHING + ) -@dataclass +@dataclass(frozen=True) class LongBenchTestData: subset: str threshold: float @@ -216,7 +221,7 @@ def test_optimized_generation_longbench(test_struct): device = "CPU" num_kv_blocks = 1000 if device == "CPU" else 500 model_id = "Qwen/Qwen2-0.5B-Instruct" - _, _, models_path = download_and_convert_model(model_id) + models_path = download_and_convert_model(model_id).models_path scheduler_config = get_scheduler_config(num_kv_blocks) scheduler_config_opt = get_scheduler_config(num_kv_blocks) @@ -292,7 +297,7 @@ def test_kvcrush_vs_snapkv_baseline(subset): seqs_per_request = 32 num_kv_blocks = 1000 if device == "CPU" else 500 model_id = "Qwen/Qwen2-0.5B-Instruct" - _, _, models_path = download_and_convert_model(model_id) + models_path = download_and_convert_model(model_id).models_path # Setup baseline and KVCrush configurations scheduler_config_baseline = get_scheduler_config(num_kv_blocks) diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 4bf7ff545c..d69dd95c9e 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -8,16 +8,18 @@ import sys import os import json +import logging import numpy as np from pathlib import Path -from typing import Literal +from typing import Literal, Callable, Optional from pydantic import BaseModel, Field +from unittest.mock import MagicMock import openvino as ov import openvino_genai as ov_genai from utils.constants import get_default_llm_properties, extra_generate_kwargs -from utils.hugging_face import generation_config_to_hf, download_and_convert_model +from utils.hugging_face import generation_config_to_hf, download_and_convert_model, OVConvertedModelSchema # model_tmp_path fixture import required from utils.tokenizers import delete_rt_info, model_tmp_path from utils.ov_genai_pipelines import create_ov_pipeline, generate_and_compare, get_main_pipeline_types, PipelineType @@ -27,44 +29,151 @@ # e2e work # -test_cases = [ - (dict(max_new_tokens=20), '你好! 你好嗎?'), - (dict(max_new_tokens=30, num_beams=15, num_beam_groups=3, num_return_sequences=15, diversity_penalty=1.0), 'Why is the Sun yellow?'), +INPUTS_TEST_CASES = [ + ( + {'max_new_tokens': 20}, + '你好! 你好嗎?', + ), + ( + { + 'max_new_tokens': 30, + 'num_beams': 15, + 'num_beam_groups': 3, + 'num_return_sequences': 15, + 'diversity_penalty': 1.0, + }, + 'Why is the Sun yellow?' 
+ ), +] + +PERF_TEST_CASES = [ + ({'max_new_tokens': 20}, 'table is made of'), ] -@pytest.mark.parametrize("generation_config_dict,prompt", test_cases) -@pytest.mark.parametrize("model_id", get_models_list()) -@pytest.mark.parametrize("pipeline_type", get_main_pipeline_types()) -@pytest.mark.precommit -def test_string_inputs(model_id, generation_config_dict, prompt, pipeline_type): - generate_and_compare(model=model_id, prompts=[prompt], generation_config=generation_config_dict, pipeline_type=pipeline_type) +PERF_METRICS_TEST_CASES = [ + ({'max_new_tokens': 20}, 'Generate json of a person'), +] -input_tensors_list = [ +INPUT_TENSORS_LIST = [ # input_ids, attention_mask (np.array([[1, 4, 42]], dtype=np.int64), None), (np.array([[1, 4, 42]], dtype=np.int64), np.array([[1, 1, 1]], dtype=np.int64)), ] -@pytest.mark.parametrize("inputs", input_tensors_list) -@pytest.mark.parametrize("model_id", get_models_list()) + +TEST_CONFIGS = [ + {'max_new_tokens': 20}, + {'max_new_tokens': 20, 'num_beam_groups': 2, 'num_beams': 6, 'diversity_penalty': 1.0} +] + +BATCHED_PROMPTS = [ + ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'] +] + +CHAT_INPUTS = [ + ({'max_new_tokens': 20}, ""), + ({'max_new_tokens': 20}, "Pretend that 1+1=1"), + ( + { + 'max_new_tokens': 10, + 'num_beam_groups': 3, + 'num_beams': 15, + 'num_return_sequences': 1, + 'diversity_penalty': 1.0, + }, + "" + ) +] + +MODELS_LIST = get_models_list() +CHAT_MODELS_LIST = get_chat_models_list() + +PIPELINE_MAIN_TYPES = get_main_pipeline_types() + +QUESTIONS = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] + +CALLBACK_QUESTIONS = [ + '1+1=', + 'Why is the Sun yellow?', + 'What is the previous answer?', + 'What was my first question?' 
+] + + +def user_defined_callback(subword): + logging.info(subword) + + +def user_defined_status_callback(subword): + logging.info(subword) + return ov_genai.StreamingStatus.RUNNING + + +CALLBACK_FUNCTIONS = [ + logging.info, + user_defined_callback, + user_defined_status_callback, + lambda subword: logging.info(subword), +] + + +@pytest.fixture(scope="module") +def llm_model(request: pytest.FixtureRequest) -> OVConvertedModelSchema: + return download_and_convert_model(request.param) + + +@pytest.fixture(scope="module") +def ov_pipe(llm_model: OVConvertedModelSchema) -> ov_genai.LLMPipeline: + return create_ov_pipeline(llm_model.models_path) + + @pytest.mark.precommit -def test_encoded_inputs(model_id, inputs): - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("generation_config_dict,prompt", INPUTS_TEST_CASES) +@pytest.mark.parametrize("pipeline_type", PIPELINE_MAIN_TYPES) +def test_string_inputs( + llm_model: OVConvertedModelSchema, + generation_config_dict: dict, + prompt: str, + pipeline_type: PipelineType, +) -> None: + generate_and_compare( + model_schema=llm_model, + prompts=[prompt], + generation_config=generation_config_dict, + pipeline_type=pipeline_type, + ) + +@pytest.mark.precommit +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("inputs", INPUT_TENSORS_LIST) +def test_encoded_inputs( + llm_model: OVConvertedModelSchema, + ov_pipe: ov_genai.LLMPipeline, + inputs: tuple[np.ndarray, Optional[np.ndarray]], +) -> None: ov_generation_config = ov_genai.GenerationConfig(max_new_tokens=20) - hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(llm_model.opt_model.generation_config, ov_generation_config) input_ids, attention_mask = inputs prompt_len = input_ids.shape[1] if attention_mask is not None: inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) - inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) + inputs_hf = {'inputs': torch.tensor(input_ids), 'attention_mask': torch.tensor(attention_mask)} else: - inputs_hf = dict(inputs=torch.tensor(input_ids)) + inputs_hf = {'inputs': torch.tensor(input_ids)} inputs_ov = ov.Tensor(input_ids) - hf_output = opt_model.generate(**inputs_hf, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] + hf_output = llm_model.opt_model.generate(**inputs_hf, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] ov_output = ov_pipe.generate(inputs_ov, ov_generation_config) hf_res = hf_output[prompt_len:].numpy() @@ -72,121 +181,105 @@ def test_encoded_inputs(model_id, inputs): assert np.all(ov_res == hf_res) -test_configs = [ - dict(max_new_tokens=20), - dict(max_new_tokens=20, num_beam_groups=2, num_beams=6, diversity_penalty=1.0) -] -batched_prompts = [ - ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], - ['hello', 'Here is the longest nowel ever: '], - ['Alan Turing was a', 'return 0', '你好! 
你好嗎?'], - ['table is made', 'table is made [force left pad tokens]'] -] -@pytest.mark.parametrize("generation_config_dict", test_configs) -@pytest.mark.parametrize("prompts", batched_prompts) -@pytest.mark.parametrize("model_id", get_models_list()) -@pytest.mark.parametrize("pipeline_type", get_main_pipeline_types()) @pytest.mark.precommit -def test_batch_string_inputs(model_id, generation_config_dict, prompts, pipeline_type): - generate_and_compare(model=model_id, prompts=prompts, generation_config=generation_config_dict, pipeline_type=pipeline_type) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("generation_config_dict", TEST_CONFIGS) +@pytest.mark.parametrize("prompts", BATCHED_PROMPTS) +@pytest.mark.parametrize("pipeline_type", PIPELINE_MAIN_TYPES) +def test_batch_string_inputs( + llm_model: OVConvertedModelSchema, + generation_config_dict: dict, + prompts: list[str], + pipeline_type: PipelineType, +) -> None: + generate_and_compare( + model_schema=llm_model, + prompts=prompts, + generation_config=generation_config_dict, + pipeline_type=pipeline_type, + ) @pytest.mark.precommit -def test_batch_size_switch(): - model_id = 'katuni4ka/tiny-random-phi3' - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) - +@pytest.mark.parametrize("llm_model", ['katuni4ka/tiny-random-phi3'], indirect=True) +def test_batch_size_switch(ov_pipe: ov_genai.LLMPipeline) -> None: ov_pipe.generate(["a"], max_new_tokens=2) ov_pipe.generate(["1", "2"], max_new_tokens=2) ov_pipe.generate(["a"], max_new_tokens=2) @pytest.mark.precommit -def test_empty_encoded_inputs_throw(): - model_id = 'katuni4ka/tiny-random-phi3' - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", ['katuni4ka/tiny-random-phi3'], indirect=True) +def test_empty_encoded_inputs_throw(ov_pipe: ov_genai.LLMPipeline) -> None: with pytest.raises(RuntimeError): ov_pipe.generate(ov.Tensor(np.array([[]], dtype=np.int64)), max_new_tokens=2) @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_chat_models_list()) -def test_different_input_types_works_same_and_change_nothing(model_id): - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", CHAT_MODELS_LIST, indirect=True) +def test_different_input_types_works_same_and_change_nothing( + llm_model: OVConvertedModelSchema, + ov_pipe: ov_genai.LLMPipeline, +) -> None: ov_generation_config = ov_genai.GenerationConfig() ov_generation_config.max_new_tokens = 30 ov_generation_config.apply_chat_template = False - res_string_input_1 = ov_pipe.generate(questions[0], generation_config=ov_generation_config) + res_string_input_1 = ov_pipe.generate(QUESTIONS[0], generation_config=ov_generation_config) tokenizer = ov_pipe.get_tokenizer() - ov_tokens = tokenizer.encode(questions[0], add_special_tokens=True) + ov_tokens = tokenizer.encode(QUESTIONS[0], add_special_tokens=True) res_encoded_input = ov_pipe.generate(ov_tokens, generation_config=ov_generation_config) - res_encoded_input_str = hf_tokenizer.decode(res_encoded_input.tokens[0], skip_special_tokens=True) + res_encoded_input_str = llm_model.hf_tokenizer.decode(res_encoded_input.tokens[0], skip_special_tokens=True) assert res_string_input_1 == res_encoded_input_str - res_string_input_2 = ov_pipe.generate(questions[0], generation_config=ov_generation_config) + 
res_string_input_2 = ov_pipe.generate(QUESTIONS[0], generation_config=ov_generation_config) assert res_string_input_1 == res_string_input_2 # # Chat scenario # - -chat_inputs = [ - (dict(max_new_tokens=20), ""), - (dict(max_new_tokens=20), "Pretend that 1+1=1"), - (dict(max_new_tokens=10, num_beam_groups=3, num_beams=15, num_return_sequences=1, diversity_penalty=1.0), "") -] - -questions = [ - '1+1=', - 'What is the previous answer?', - 'Why is the Sun yellow?', - 'What was my first question?' -] - -@pytest.mark.parametrize("inputs", chat_inputs) -@pytest.mark.parametrize("model_id", get_chat_models_list()) -@pytest.mark.parametrize("string_inputs", [True, False]) @pytest.mark.precommit -def test_chat_scenario(model_id, inputs, string_inputs): +@pytest.mark.parametrize("llm_model", CHAT_MODELS_LIST, indirect=True) +@pytest.mark.parametrize("inputs", CHAT_INPUTS) +@pytest.mark.parametrize("string_inputs", [True, False]) +def test_chat_scenario( + llm_model: OVConvertedModelSchema, + inputs: tuple[dict, str], + string_inputs: bool, +) -> None: chat_history_hf = [] chat_history_ov = [] - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) if string_inputs: - ov_pipe = create_ov_pipeline(models_path) + ov_pipe = create_ov_pipeline(llm_model.models_path) else: - # chat is not supported for PA backend with encoded_inputs format - ov_pipe = create_ov_pipeline(models_path, pipeline_type=PipelineType.STATEFUL) + ov_pipe = create_ov_pipeline(llm_model.models_path, pipeline_type=PipelineType.STATEFUL) generation_config_kwargs, system_message = inputs ov_generation_config = ov_genai.GenerationConfig(**generation_config_kwargs) - hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(llm_model.opt_model.generation_config, ov_generation_config) prev_chat_len = 0 ov_pipe.start_chat(system_message) chat_history_hf.append({"role": "system", "content": system_message}) chat_history_ov.append({"role": "system", "content": system_message}) - for prompt in questions: + for prompt in QUESTIONS: chat_history_hf.append({'role': 'user', 'content': prompt}) chat_history_ov.append({'role': 'user', 'content': prompt}) - chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + chat_prompt = llm_model.hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = llm_model.hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) prompt_len = tokenized['input_ids'].numel() - answer = opt_model.generate(**tokenized, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] - answer_str = hf_tokenizer.decode(answer[prompt_len:], skip_special_tokens=True) + answer = llm_model.opt_model.generate(**tokenized, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] + answer_str = llm_model.hf_tokenizer.decode(answer[prompt_len:], skip_special_tokens=True) chat_history_hf.append({'role': 'assistant', 'content': answer_str}) if string_inputs: @@ -198,7 +291,7 @@ def test_chat_scenario(model_id, inputs, string_inputs): result_ov = ov_pipe.generate(inputs_ov, generation_config=ov_generation_config).tokens[0] - answer_ov = hf_tokenizer.decode(result_ov, skip_special_tokens=True) + answer_ov = llm_model.hf_tokenizer.decode(result_ov, skip_special_tokens=True) prev_chat_len = 
len(tokenized['input_ids'][0]) + len(result_ov) chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) @@ -206,35 +299,37 @@ def test_chat_scenario(model_id, inputs, string_inputs): ov_pipe.finish_chat() if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') + logging.info(f'hf_output: {chat_history_hf}') + logging.info(f'ov_output: {chat_history_ov}') assert chat_history_ov == chat_history_hf @pytest.mark.precommit -def test_chat_scenario_several_chats_in_series(): - opt_model, hf_tokenizer, models_path = download_and_convert_model(get_chat_models_list()[0]) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", [CHAT_MODELS_LIST[0]], indirect=True) +def test_chat_scenario_several_chats_in_series( + llm_model: OVConvertedModelSchema, + ov_pipe: ov_genai.LLMPipeline, +) -> None: - generation_config_kwargs, _ = chat_inputs[0] + generation_config_kwargs, _ = CHAT_INPUTS[0] ov_generation_config = ov_genai.GenerationConfig(**generation_config_kwargs) - hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(llm_model.opt_model.generation_config, ov_generation_config) for i in range(2): chat_history_hf = [] chat_history_ov = [] ov_pipe.start_chat() - for prompt in questions[:2]: + for prompt in QUESTIONS[:2]: chat_history_hf.append({'role': 'user', 'content': prompt}) chat_history_ov.append({'role': 'user', 'content': prompt}) - chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + chat_prompt = llm_model.hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = llm_model.hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) prompt_len = tokenized['input_ids'].numel() - answer = opt_model.generate(**tokenized, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] - answer_str = hf_tokenizer.decode(answer[prompt_len:], skip_special_tokens=True) + answer = llm_model.opt_model.generate(**tokenized, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] + answer_str = llm_model.hf_tokenizer.decode(answer[prompt_len:], skip_special_tokens=True) chat_history_hf.append({'role': 'assistant', 'content': answer_str}) answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config) @@ -243,44 +338,40 @@ def test_chat_scenario_several_chats_in_series(): ov_pipe.finish_chat() if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') + logging.info(f'hf_output: {chat_history_hf}') + logging.info(f'ov_output: {chat_history_ov}') assert chat_history_ov == chat_history_hf @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_chat_models_list()) -def test_chat_scenario_several_start(model_id): - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", CHAT_MODELS_LIST, indirect=True) +def test_chat_scenario_several_start(ov_pipe: ov_genai.LLMPipeline) -> None: - generation_config_kwargs, _ = chat_inputs[0] + generation_config_kwargs, _ = CHAT_INPUTS[0] ov_generation_config = ov_genai.GenerationConfig(**generation_config_kwargs) ov_pipe.start_chat() ov_pipe.start_chat() - 
ov_pipe.generate(questions[0], generation_config=ov_generation_config) + ov_pipe.generate(QUESTIONS[0], generation_config=ov_generation_config) ov_pipe.finish_chat() @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_chat_models_list()) -def test_generate_works_same_before_and_after_chat(model_id): - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", CHAT_MODELS_LIST, indirect=True) +def test_generate_works_same_before_and_after_chat(ov_pipe: ov_genai.LLMPipeline) -> None: - generation_config_kwargs, _ = chat_inputs[0] + generation_config_kwargs, _ = CHAT_INPUTS[0] ov_generation_config = ov_genai.GenerationConfig(**generation_config_kwargs) ov_generation_config.apply_chat_template = False - res_before_chat = ov_pipe.generate(questions[0], generation_config=ov_generation_config) + res_before_chat = ov_pipe.generate(QUESTIONS[0], generation_config=ov_generation_config) ov_pipe.start_chat() - ov_pipe.generate(questions[0], generation_config=ov_generation_config) + ov_pipe.generate(QUESTIONS[0], generation_config=ov_generation_config) ov_pipe.finish_chat() - res_after_chat = ov_pipe.generate(questions[0], generation_config=ov_generation_config) + res_after_chat = ov_pipe.generate(QUESTIONS[0], generation_config=ov_generation_config) assert res_after_chat == res_before_chat @@ -288,74 +379,68 @@ def test_generate_works_same_before_and_after_chat(model_id): # Streaming with callback # -def user_defined_callback(subword): - print(subword) - - -def user_defined_status_callback(subword): - print(subword) - return ov_genai.StreamingStatus.RUNNING - -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) @pytest.mark.precommit -def test_callback_one_string(callback, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +def test_callback_one_string( + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: generation_config = ov_pipe.get_generation_config() generation_config.max_new_tokens = 10 ov_pipe.generate('table is made of', generation_config, callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) @pytest.mark.precommit -def test_callback_batch_throws(callback, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +def test_callback_batch_throws( + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: with pytest.raises(RuntimeError): ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) @pytest.mark.precommit -def test_callback_kwargs_one_string(callback, model_id): - _, _, models_path = 
download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +def test_callback_kwargs_one_string( + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: ov_pipe.generate('table is made of', max_new_tokens=10, streamer=callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) @pytest.mark.precommit -def test_callback_decoding_metallama(model_id, callback): - # On metallama this prompt generates output which can shorten after adding new tokens. - # Test that streamer correctly handles such cases. +def test_callback_decoding_metallama( + llm_model: OVConvertedModelSchema, + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: + # On metallama this prompt generates output which can shorten after adding new tokens. + # Test that streamer correctly handles such cases. prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' - if model_id != 'meta-llama/Meta-Llama-3-8B-Instruct': + if llm_model.model_id != 'meta-llama/Meta-Llama-3-8B-Instruct': pytest.skip() - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) ov_pipe.generate(prompt, max_new_tokens=300, streamer=callback) -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) @pytest.mark.precommit -def test_callback_kwargs_batch_throws(callback, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +def test_callback_kwargs_batch_throws( + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: with pytest.raises(RuntimeError): ov_pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_callback_terminate_by_bool(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_callback_terminate_by_bool(ov_pipe: ov_genai.LLMPipeline) -> None: current_iter = 0 num_iters = 10 @@ -368,7 +453,7 @@ def callback(subword): ov_generation_config = ov_genai.GenerationConfig(max_new_tokens=max_new_tokens, ignore_eos=True) # without attention mask - input_ids, _ = input_tensors_list[0] + input_ids, _ = INPUT_TENSORS_LIST[0] inputs_ov = ov.Tensor(input_ids) ov_output = ov_pipe.generate(inputs_ov, ov_generation_config, streamer=callback) @@ -376,49 +461,44 @@ @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_callback_terminate_by_status(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) - +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_callback_terminate_by_status(ov_pipe: ov_genai.LLMPipeline) -> None: current_iter = 0 num_iters = 10 + def callback(subword): nonlocal current_iter current_iter += 1 - return ov_genai.StreamingStatus.STOP if current_iter == num_iters else ov_genai.StreamingStatus.RUNNING + return ( 
ov_genai.StreamingStatus.STOP + if current_iter == num_iters + else ov_genai.StreamingStatus.RUNNING + ) max_new_tokens = 100 ov_generation_config = ov_genai.GenerationConfig(max_new_tokens=max_new_tokens, ignore_eos=True) # without attention mask - input_ids, _ = input_tensors_list[0] + input_ids, _ = INPUT_TENSORS_LIST[0] inputs_ov = ov.Tensor(input_ids) ov_output = ov_pipe.generate(inputs_ov, ov_generation_config, streamer=callback) assert len(ov_output.tokens[0]) < max_new_tokens -@pytest.mark.parametrize("model_id", get_chat_models_list()) +@pytest.mark.parametrize("llm_model", CHAT_MODELS_LIST, indirect=True) @pytest.mark.precommit -def test_chat_scenario_callback_cancel(model_id): - callback_questions = [ - '1+1=', - 'Why is the Sun yellow?', - 'What is the previous answer?', - 'What was my first question?' - ] - - generation_config_kwargs = dict(max_new_tokens=20) +def test_chat_scenario_callback_cancel( + llm_model: OVConvertedModelSchema, + ov_pipe: ov_genai.LLMPipeline, +) -> None: + generation_config_kwargs = {'max_new_tokens': 20} chat_history_hf = [] chat_history_ov = [] - opt_model, hf_tokenizer, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) - ov_generation_config = ov_genai.GenerationConfig(**generation_config_kwargs) - hf_generation_config = generation_config_to_hf(opt_model.generation_config, ov_generation_config) + hf_generation_config = generation_config_to_hf(llm_model.opt_model.generation_config, ov_generation_config) current_iter = 0 num_iters = 3 @@ -428,17 +508,17 @@ def callback(subword): return ov_genai.StreamingStatus.CANCEL if current_iter == num_iters else ov_genai.StreamingStatus.RUNNING ov_pipe.start_chat() - for prompt in callback_questions: - if (prompt != callback_questions[1]): + for prompt in CALLBACK_QUESTIONS: + if (prompt != CALLBACK_QUESTIONS[1]): chat_history_hf.append({'role': 'user', 'content': prompt}) chat_history_ov.append({'role': 'user', 'content': prompt}) - chat_prompt = hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + chat_prompt = llm_model.hf_tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = llm_model.hf_tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) prompt_len = tokenized['input_ids'].numel() - answer = opt_model.generate(**tokenized, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] - answer_str = hf_tokenizer.decode(answer[prompt_len:], skip_special_tokens=True) + answer = llm_model.opt_model.generate(**tokenized, generation_config=hf_generation_config, **extra_generate_kwargs()).sequences[0] + answer_str = llm_model.hf_tokenizer.decode(answer[prompt_len:], skip_special_tokens=True) chat_history_hf.append({'role': 'assistant', 'content': answer_str}) answer_ov = ov_pipe.generate(prompt, generation_config=ov_generation_config) @@ -449,8 +529,8 @@ def callback(subword): ov_pipe.finish_chat() if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') + logging.info(f'hf_output: {chat_history_hf}') + logging.info(f'ov_output: {chat_history_ov}') assert chat_history_ov == chat_history_hf @@ -499,12 +579,13 @@ def end(self): print('end') +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) @pytest.mark.parametrize("streamer_base", [PrinterNone, PrinterBool, PrinterStatus]) 
-@pytest.mark.parametrize("model_id", get_models_list()) @pytest.mark.precommit -def test_streamer_one_string(streamer_base, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +def test_streamer_one_string( + ov_pipe: ov_genai.LLMPipeline, + streamer_base: type, +) -> None: generation_config = ov_pipe.get_generation_config() generation_config.max_new_tokens = 10 printer = streamer_base(ov_pipe.get_tokenizer()) @@ -512,70 +593,65 @@ def test_streamer_one_string(streamer_base, model_id): @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_streamer_batch_throws(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_streamer_batch_throws(ov_pipe: ov_genai.LLMPipeline) -> None: printer = PrinterNone(ov_pipe.get_tokenizer()) with pytest.raises(RuntimeError): ov_pipe.generate(['1', '2'], ov_pipe.get_generation_config(), printer) @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_streamer_kwargs_one_string(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_streamer_kwargs_one_string(ov_pipe: ov_genai.LLMPipeline) -> None: printer = PrinterNone(ov_pipe.get_tokenizer()) ov_pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_streamer_kwargs_batch_throws(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_streamer_kwargs_batch_throws(ov_pipe: ov_genai.LLMPipeline) -> None: printer = PrinterNone(ov_pipe.get_tokenizer()) with pytest.raises(RuntimeError): ov_pipe.generate('', num_beams=2, streamer=printer) @pytest.mark.precommit -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_operator_with_callback_one_string(callback, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) +def test_operator_with_callback_one_string( + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: ten_tokens = ov_pipe.get_generation_config() ten_tokens.max_new_tokens = 10 ov_pipe('talbe is made of', ten_tokens, callback) @pytest.mark.precommit -@pytest.mark.parametrize("callback", [print, user_defined_callback, user_defined_status_callback, lambda subword: print(subword)]) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_operator_with_callback_batch_throws(callback, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("callback", CALLBACK_FUNCTIONS) +def test_operator_with_callback_batch_throws( + ov_pipe: ov_genai.LLMPipeline, + callback: Callable, +) -> None: with pytest.raises(RuntimeError): ov_pipe(['1', '2'], ov_pipe.get_generation_config(), 
callback) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) @pytest.mark.parametrize("streamer_base", [PrinterNone, PrinterBool, PrinterStatus]) -@pytest.mark.parametrize("model_id", get_models_list()) @pytest.mark.precommit -def test_operator_with_streamer_kwargs_one_string(streamer_base, model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +def test_operator_with_streamer_kwargs_one_string( + ov_pipe: ov_genai.LLMPipeline, + streamer_base: type, +) -> None: printer = streamer_base(ov_pipe.get_tokenizer()) ov_pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_operator_with_streamer_kwargs_batch_throws(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_operator_with_streamer_kwargs_batch_throws(ov_pipe: ov_genai.LLMPipeline) -> None: printer = PrinterNone(ov_pipe.get_tokenizer()) with pytest.raises(RuntimeError): ov_pipe('', num_beams=2, streamer=printer) @@ -584,7 +660,6 @@ def test_operator_with_streamer_kwargs_batch_throws(model_id): # Tests on generation configs handling # - def load_genai_pipe_with_configs(configs: list[tuple], temp_path): # Load LLMPipeline where all configs are cleared. # remove existing jsons from previous tests @@ -596,7 +671,7 @@ def load_genai_pipe_with_configs(configs: list[tuple], temp_path): with (temp_path / config_name).open('w', encoding="utf-8") as f: json.dump(config_json, f) - ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_llm_properties()) + ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU') for _, config_name in configs: os.remove(temp_path / config_name) @@ -617,11 +692,9 @@ def test_eos_token_is_inherited_from_default_generation_config(model_tmp_path): @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_pipeline_validates_generation_config(model_id): - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) - invalid_generation_config = dict(num_beam_groups=3, num_beams=15, do_sample=True) # beam sample is not supported +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_pipeline_validates_generation_config(ov_pipe: ov_genai.LLMPipeline) -> None: + invalid_generation_config = {'num_beam_groups': 3, 'num_beams': 15, 'do_sample': True} # beam sample is not supported with pytest.raises(RuntimeError): ov_pipe.generate("dummy prompt", **invalid_generation_config) @@ -630,34 +703,22 @@ def test_pipeline_validates_generation_config(model_id): # @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_unicode_pybind_decoding_one_string(model_id): - # On this model this prompt generates unfinished utf string. - # Test that pybind will not fail. - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_unicode_pybind_decoding_one_string(ov_pipe: ov_genai.LLMPipeline) -> None: + # On this model this prompt generates unfinished utf string. + # Test that pybind will not fail. res_str = ov_pipe.generate(',', max_new_tokens=4, apply_chat_template=False) assert '�' == res_str[-1] @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_unicode_pybind_decoding_batched(model_id): - # On this model this prompt generates unfinished utf string. 
- # Test that pybind will not fail. - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_unicode_pybind_decoding_batched(ov_pipe: ov_genai.LLMPipeline) -> None: res_str = ov_pipe.generate([","], max_new_tokens=4, apply_chat_template=False) assert '�' == res_str.texts[0][-1] @pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list()) -def test_unicode_pybind_decoding_one_string_streamer(model_id): - # On this model this prompt generates unfinished utf-8 string - # and streams it. Test that pybind will not fail while we pass string to python. - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +def test_unicode_pybind_decoding_one_string_streamer(ov_pipe: ov_genai.LLMPipeline) -> None: res_str = [] ov_pipe.generate(",", max_new_tokens=4, apply_chat_template=False, streamer=lambda x: res_str.append(x)) assert '�' == ''.join(res_str)[-1] @@ -666,22 +727,18 @@ def test_unicode_pybind_decoding_one_string_streamer(model_id): # Perf metrics # -def run_perf_metrics_collection(model_id, generation_config_dict: dict, prompt: str) -> ov_genai.PerfMetrics: - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) - return ov_pipe.generate([prompt], **generation_config_dict).perf_metrics - - -test_cases = [ - (dict(max_new_tokens=20), 'table is made of'), -] -@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("llm_model", ['katuni4ka/tiny-random-gemma2'], indirect=True) +@pytest.mark.parametrize("generation_config,prompt", PERF_TEST_CASES) @pytest.mark.precommit -def test_perf_metrics(generation_config, prompt): +def test_perf_metrics( + llm_model: OVConvertedModelSchema, + generation_config: dict, + prompt: str, +) -> None: import time start_time = time.perf_counter() - model_id = 'katuni4ka/tiny-random-gemma2' - perf_metrics = run_perf_metrics_collection(model_id, generation_config, prompt) + ov_pipe = create_ov_pipeline(llm_model.models_path) + perf_metrics = ov_pipe.generate([prompt], **generation_config).perf_metrics total_time = (time.perf_counter() - start_time) * 1000 # Check that load time is adequate. 
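The hunks above migrate test_llm_pipeline.py from per-test download_and_convert_model() calls to indirect parametrization over MODELS_LIST, with the converted model exposed as an OVConvertedModelSchema and the pipeline supplied by an ov_pipe fixture. The sketch below shows the wiring this pattern relies on; the actual fixture definitions sit outside these hunks, so the fixture scopes and the test body here are assumptions modelled on the analogous fixtures added in test_llm_pipeline_static.py and test_rag.py later in this patch.

# Sketch only: assumed fixture wiring behind the indirect "llm_model" parametrization.
import pytest
import openvino_genai as ov_genai
from utils.hugging_face import download_and_convert_model, OVConvertedModelSchema
from utils.ov_genai_pipelines import create_ov_pipeline


@pytest.fixture(scope="module")
def llm_model(request: pytest.FixtureRequest) -> OVConvertedModelSchema:
    # request.param is the model id supplied via @pytest.mark.parametrize(..., indirect=True)
    return download_and_convert_model(request.param)


@pytest.fixture
def ov_pipe(llm_model: OVConvertedModelSchema) -> ov_genai.LLMPipeline:
    return create_ov_pipeline(llm_model.models_path)


@pytest.mark.precommit
@pytest.mark.parametrize("llm_model", ["katuni4ka/tiny-random-gemma2"], indirect=True)
def test_perf_metrics_smoke(ov_pipe: ov_genai.LLMPipeline) -> None:
    # perf_metrics hangs off the generation result, as in test_perf_metrics above
    perf_metrics = ov_pipe.generate(["table is made of"], max_new_tokens=20).perf_metrics
    assert len(perf_metrics.raw_metrics.m_durations) > 0

Because the model fixture is module-scoped, pytest runs the download and conversion once per parametrized model id rather than once per test.
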
@@ -749,27 +806,28 @@ def test_perf_metrics(generation_config, prompt): assert len(raw_metrics.m_durations) > 0 -test_cases = [ - (dict(max_new_tokens=20), 'Generate json of a person'), -] -@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("llm_model", ['katuni4ka/tiny-random-gemma2'], indirect=True) +@pytest.mark.parametrize("generation_config,prompt", PERF_METRICS_TEST_CASES) @pytest.mark.precommit -def test_perf_metrics_with_structured_output(generation_config, prompt): +def test_perf_metrics_with_structured_output( + ov_pipe: ov_genai.LLMPipeline, + generation_config: dict, + prompt: str, +) -> None: class Person(BaseModel): name: str = Field(pattern=r"^[A-Z][a-z]{1,20}$") surname: str = Field(pattern=r"^[A-Z][a-z]{1,20}$") age: int city: Literal["Dublin", "Dubai", "Munich"] - generation_config.update(dict(structured_output_config=ov_genai.StructuredOutputConfig(json_schema=json.dumps(Person.model_json_schema())))) - - model_id = 'katuni4ka/tiny-random-gemma2' - _, _, models_path = download_and_convert_model(model_id) - ov_pipe = create_ov_pipeline(models_path) + generation_config.update({'structured_output_config': ov_genai.StructuredOutputConfig(json_schema=json.dumps(Person.model_json_schema()))}) perf_metrics = ov_pipe.generate([prompt], **generation_config).perf_metrics raw_metrics = perf_metrics.raw_metrics assert len(perf_metrics.get_grammar_compiler_init_times()) > 0 - assert 'xgrammar' in perf_metrics.get_grammar_compiler_init_times() and perf_metrics.get_grammar_compiler_init_times()['xgrammar'] > 0.0 + assert ( + 'xgrammar' in perf_metrics.get_grammar_compiler_init_times() + and perf_metrics.get_grammar_compiler_init_times()['xgrammar'] > 0.0 + ) assert len(raw_metrics.grammar_compile_times) > 0 @@ -786,19 +844,18 @@ class Person(BaseModel): assert accumulated_metrics.raw_metrics.grammar_compile_times == raw_metrics.grammar_compile_times + raw_metrics_2.grammar_compile_times -@pytest.mark.parametrize("pipeline_type", get_main_pipeline_types()) +@pytest.mark.parametrize("llm_model", ["facebook/opt-125m"], indirect=True) +@pytest.mark.parametrize("pipeline_type", PIPELINE_MAIN_TYPES) @pytest.mark.parametrize("stop_str", {True, False}) @pytest.mark.precommit -def test_pipelines_generate_with_streaming(pipeline_type, stop_str): - # streamer - it_cnt = 0 - def py_streamer(py_str: str): - nonlocal it_cnt - it_cnt += 1 - return False +def test_pipelines_generate_with_streaming( + llm_model: OVConvertedModelSchema, + pipeline_type: PipelineType, + stop_str: bool, +) -> None: + mock_streamer = MagicMock(return_value=False) prompt = "Prompt example is" - model_id : str = "facebook/opt-125m" generation_config = ov_genai.GenerationConfig() generation_config.max_new_tokens = 10 @@ -806,13 +863,15 @@ def py_streamer(py_str: str): generation_config.stop_strings = {" the", "Prom"} generation_config.include_stop_str_in_output = False - _ = generate_and_compare(model=model_id, - prompts=prompt, - generation_config=generation_config, - pipeline_type=pipeline_type, - streamer=py_streamer) + generate_and_compare( + model_schema=llm_model, + prompts=prompt, + generation_config=generation_config, + pipeline_type=pipeline_type, + streamer=mock_streamer, + ) if stop_str: - assert it_cnt == 0 + mock_streamer.assert_not_called() else: - assert it_cnt > 0 + mock_streamer.assert_called() diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index a1109eb0ee..6c35c96ebe 100644 --- 
a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -2,88 +2,100 @@ # SPDX-License-Identifier: Apache-2.0 from openvino_genai import GenerationConfig, Tokenizer, LLMPipeline, StreamerBase -import os +from pathlib import Path import pytest import platform import sys +import logging from utils.constants import get_default_llm_properties from utils.tokenizers import model_tmp_path -from utils.hugging_face import download_and_convert_model -from utils.ov_genai_pipelines import create_ov_pipeline +from utils.hugging_face import download_and_convert_model, OVConvertedModelSchema from utils.generation_config import \ get_greedy, \ get_greedy_with_penalties, \ get_multinomial_all_parameters, \ get_multinomial_temperature_and_presence_penalty, \ get_beam_search -from data.models import get_models_list, get_chat_models_list +from data.models import get_models_list if sys.platform == 'darwin' or platform.machine() in ["aarch64", "arm64", "ARM64"]: pytest.skip("NPU plugin is available only on Linux and Windows x86_64", allow_module_level=True) -default_config = { - 'NPUW_DEVICES': 'CPU', - 'NPUW_ONLINE_PIPELINE': 'NONE' - } | get_default_llm_properties() +DEFAULT_CONFIG: dict = { + 'NPUW_DEVICES': 'CPU', + 'NPUW_ONLINE_PIPELINE': 'NONE' +} | get_default_llm_properties() -static_config = { **default_config, 'STATIC_PIPELINE': 'STATEFUL' } +STATIC_CONFIG: dict = { **DEFAULT_CONFIG, 'STATIC_PIPELINE': 'STATEFUL' } # Test both, static and generic pipelines -pipeline_configs = [default_config, static_config] +PIPELINE_CONFIGS: list[dict] = [ + pytest.param(DEFAULT_CONFIG, id="generic_pipeline"), + pytest.param(STATIC_CONFIG, id="static_pipeline") +] -blob_with_weights = [True, False] +BLOB_WITH_WEIGHTS: list[bool] = [True, False] -def generate_chat_history(model_path, device, pipeline_config, questions): - pipe = LLMPipeline(model_path, device, **pipeline_config) - pipe.start_chat() - chat_history = [ pipe.generate(question, max_new_tokens=50, do_sample=False) for question in questions ] - pipe.finish_chat() - return chat_history +MODELS_LIST = get_models_list() -generation_configs = [ - get_greedy(), - get_greedy_with_penalties() -] -@pytest.mark.precommit -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -@pytest.mark.xfail(reason="Generation result mismatch. Ticket 171117", raises=AssertionError) -def test_generation_compare_with_stateful(generation_config, config, model_id): - prompt = 'What is OpenVINO?' 
- _, _, model_path = download_and_convert_model(model_id) +@pytest.fixture(scope="module") +def llm_model(request: pytest.FixtureRequest) -> OVConvertedModelSchema: + return download_and_convert_model(request.param) - stateful_pipe = LLMPipeline(model_path, "CPU", **get_default_llm_properties()) - ref_out = stateful_pipe.generate(prompt, generation_config) - static_pipe = LLMPipeline(model_path, "NPU", **config) - actual_out = static_pipe.generate(prompt, generation_config) +@pytest.fixture(scope="module") +def ov_model(llm_model: OVConvertedModelSchema) -> LLMPipeline: + return LLMPipeline( + llm_model.models_path, + "CPU", + **get_default_llm_properties(), + ) - assert ref_out == actual_out + +@pytest.fixture(scope="module") +def npu_config(request: pytest.FixtureRequest) -> LLMPipeline: + return request.param + + +@pytest.fixture(scope="module") +def npu_model(llm_model: OVConvertedModelSchema, npu_config: dict) -> LLMPipeline: + return LLMPipeline( + llm_model.models_path, + "NPU", + **npu_config, + ) @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("with_weights", blob_with_weights) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_pipeline_from_blob(model_tmp_path, config, with_weights, model_id): +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +@pytest.mark.parametrize("with_weights", BLOB_WITH_WEIGHTS) +def test_pipeline_from_blob( + llm_model: OVConvertedModelSchema, + ov_model: LLMPipeline, + npu_config: dict, + model_tmp_path: tuple[str, Path], + with_weights: bool +): prompt = 'What is OpenVINO?' - _, _, model_path = download_and_convert_model(model_id) + model_path = llm_model.models_path _, temp_path = model_tmp_path - blob_path = os.path.join(temp_path, "compiled_model.blob") + blob_path = temp_path / "compiled_model.blob" - cpu_pipe = LLMPipeline(model_path, "CPU", **get_default_llm_properties()) - ref_out = cpu_pipe.generate(prompt, max_new_tokens=30) + ref_out = ov_model.generate(prompt, max_new_tokens=30) + + blob_path = blob_path.as_posix() + model_path_bin = (model_path / "openvino_model.bin").as_posix() # NB: Generate the blob - cfg = { "EXPORT_BLOB": "YES", "BLOB_PATH": blob_path } - cfg |= config + cfg = { "EXPORT_BLOB": "YES", "BLOB_PATH": blob_path} + cfg |= npu_config if with_weights: cfg |= {"CACHE_MODE": "OPTIMIZE_SPEED"} npu_pipe = LLMPipeline(model_path, "NPU", **cfg) @@ -92,8 +104,8 @@ def test_pipeline_from_blob(model_tmp_path, config, with_weights, model_id): del npu_pipe # Import blob and check accuracy - import_cfg = {"BLOB_PATH": blob_path, "WEIGHTS_PATH": os.path.join(model_path, "openvino_model.bin") } - import_cfg |= config + import_cfg = {"BLOB_PATH": blob_path, "WEIGHTS_PATH": model_path_bin } + import_cfg |= npu_config if with_weights: import_cfg.pop("WEIGHTS_PATH") npu_pipe = LLMPipeline(model_path, "NPU", **import_cfg) @@ -103,20 +115,50 @@ def test_pipeline_from_blob(model_tmp_path, config, with_weights, model_id): @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("with_weights", blob_with_weights) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_pipeline_cache_dir(model_tmp_path, config, with_weights, model_id): +@pytest.mark.parametrize( + "generation_config", + [ + pytest.param(get_greedy(), id="greedy"), + pytest.param(get_greedy_with_penalties(), id="greedy_with_penalties"), + ] +) 
+@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +@pytest.mark.xfail(reason="Generation result mismatch. Ticket 171117", raises=AssertionError) +def test_generation_compare_with_stateful( + ov_model: LLMPipeline, + npu_model: LLMPipeline, + generation_config: GenerationConfig, +): prompt = 'What is OpenVINO?' - _, _, model_path = download_and_convert_model(model_id) + + ref_out = ov_model.generate(prompt, generation_config) + actual_out = npu_model.generate(prompt, generation_config) + + assert ref_out == actual_out + + +@pytest.mark.precommit +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +@pytest.mark.parametrize("with_weights", BLOB_WITH_WEIGHTS) +def test_pipeline_cache_dir( + llm_model: OVConvertedModelSchema, + ov_model: LLMPipeline, + model_tmp_path: tuple[str, Path], + npu_config: dict, + with_weights: bool, +): + prompt = 'What is OpenVINO?' + model_path = llm_model.models_path _, temp_path = model_tmp_path + temp_path = Path(temp_path) - cpu_pipe = LLMPipeline(model_path, "CPU", **get_default_llm_properties()) - ref_out = cpu_pipe.generate(prompt, max_new_tokens=30) + ref_out = ov_model.generate(prompt, max_new_tokens=30) # NB: Generate the blob cfg = { "NPUW_DEVICES": "CPU", "CACHE_DIR": str(temp_path) } - cfg |= config + cfg |= npu_config if with_weights: cfg |= {"CACHE_MODE": "OPTIMIZE_SPEED"} npu_pipe = LLMPipeline(model_path, "NPU", **cfg) @@ -125,32 +167,37 @@ def test_pipeline_cache_dir(model_tmp_path, config, with_weights, model_id): del npu_pipe # Check that blob was cached - blobs = [file for file in os.listdir(temp_path) if file.endswith(".blob")] + blobs = [file for file in temp_path.iterdir() if file.suffix == ".blob"] if len(blobs) == 0: - print(f"Couldn't cache the blob") + logging.info(f"Couldn't cache the blob") assert len(blobs) > 0 # Import blob and check accuracy - npu_pipe = LLMPipeline(model_path, "NPU", **(config | { "CACHE_DIR": str(temp_path) })) + npu_pipe = LLMPipeline(model_path, "NPU", **(npu_config | { "CACHE_DIR": str(temp_path) })) actual_out = npu_pipe.generate(prompt, max_new_tokens=30) # Check that blob was used from cache - blobs = [file for file in os.listdir(temp_path) if file.endswith(".blob")] + blobs = [file for file in temp_path.iterdir() if file.suffix == ".blob"] if len(blobs) == 0: - print(f"Couldn't cache the blob") + logging.info(f"Couldn't cache the blob") assert len(blobs) > 0 assert ref_out == actual_out -generation_configs = [ - get_multinomial_temperature_and_presence_penalty() -] @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_multinomial_sampling(generation_config, config, model_id): +@pytest.mark.parametrize( + "generation_config", + [ + pytest.param(get_multinomial_temperature_and_presence_penalty(), id="temp+presence"), + ] +) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_multinomial_sampling( + npu_model: LLMPipeline, + generation_config: GenerationConfig, +): # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) # would be necessary. 
However, the CPU in StatefulPipeline and StaticLLMPipeline may apply @@ -158,105 +205,117 @@ def test_multinomial_sampling(generation_config, config, model_id): # variations in raw logits. Therefore, there is no reliable reference for validation, # so only ensure that no exceptions are raised. prompt = 'What is OpenVINO?' - _, _, model_path = download_and_convert_model(model_id) - static_pipe = LLMPipeline(model_path, "NPU", **config) - actual_out = static_pipe.generate(prompt, generation_config) + npu_model.generate(prompt, generation_config) @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_length_properties_set_no_exception(config, model_id): - _, _, model_path = download_and_convert_model(model_id) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_length_properties_set_no_exception( + llm_model: OVConvertedModelSchema, + npu_config: dict +): + model_path = llm_model.models_path # NB: Check it doesn't throw any exception pipeline_config = { "MAX_PROMPT_LEN": 256, "MIN_RESPONSE_LEN": 64 } - pipeline_config |= config - pipe = LLMPipeline(model_path, "NPU", **pipeline_config) + pipeline_config |= npu_config + LLMPipeline(model_path, "NPU", **pipeline_config) -length_configs = [ - { "MAX_PROMPT_LEN": -1 }, - { "MAX_PROMPT_LEN": "1" }, - { "MIN_RESPONSE_LEN": -1 }, - { "MIN_RESPONSE_LEN": "1" } -] -@pytest.mark.parametrize("length_config", length_configs) -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) @pytest.mark.precommit -def test_invalid_length_properties_raise_error(length_config, config, model_id): - _, _, model_path = download_and_convert_model(model_id) - length_config |= config +@pytest.mark.parametrize( + "length_config", + [ + { "MAX_PROMPT_LEN": -1 }, + { "MAX_PROMPT_LEN": "1" }, + { "MIN_RESPONSE_LEN": -1 }, + { "MIN_RESPONSE_LEN": "1" }, + ] +) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_invalid_length_properties_raise_error( + llm_model: OVConvertedModelSchema, + npu_config: dict, + length_config: dict, +): + model_path = llm_model.models_path + length_config |= npu_config with pytest.raises(RuntimeError): - pipe = LLMPipeline(model_path, "NPU", **length_config) + LLMPipeline(model_path, "NPU", **length_config) @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_batch_one_no_exception(config, model_id): - _, _, model_path = download_and_convert_model(model_id) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +@pytest.mark.precommit +def test_batch_one_no_exception(npu_model: LLMPipeline): prompt = 'The Sun is yellow because' - static_pipe = LLMPipeline(model_path, "NPU", **config) # Check it doesn't throw any exception when batch of size 1 is provided - actual_out = static_pipe.generate([prompt], max_new_tokens=20) + npu_model.generate([prompt], max_new_tokens=20) # TODO: For the further batch support @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_batch_raise_error(config, model_id): - _, _, model_path = download_and_convert_model(model_id) 
+@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_batch_raise_error(npu_model: LLMPipeline): prompt = 'The Sun is yellow because' - pipe = LLMPipeline(model_path, "NPU", **config) with pytest.raises(RuntimeError): - pipe.generate([prompt] * 3, max_new_tokens=100) + npu_model.generate([prompt] * 3, max_new_tokens=100) # TODO: For the further sampling support -generation_configs = [ - get_beam_search(), - # NB: Only num_return_sequences=1 is supported! - get_multinomial_all_parameters() -] -@pytest.mark.parametrize("generation_config", generation_configs) -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) @pytest.mark.precommit -def test_unsupported_sampling_raise_error(generation_config, config, model_id): - _, _, model_path = download_and_convert_model(model_id) +@pytest.mark.parametrize( + "generation_config", + [ + pytest.param(get_beam_search(), id="beam_search"), + # NB: Only num_return_sequences=1 is supported! + pytest.param(get_multinomial_all_parameters(), id="multinomial") + ] +) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_unsupported_sampling_raise_error( + npu_model: LLMPipeline, + generation_config: GenerationConfig, +): prompt = 'What is OpenVINO?' - pipe = LLMPipeline(model_path, "NPU", **config) with pytest.raises(RuntimeError): - pipe.generate(prompt, generation_config) + npu_model.generate(prompt, generation_config) @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_terminate_by_max_number_of_tokens(config, model_id): - _, _, model_path = download_and_convert_model(model_id) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_terminate_by_max_number_of_tokens( + llm_model: OVConvertedModelSchema, + npu_model: LLMPipeline, +): + model_path = llm_model.models_path prompt = 'The Sun is yellow because' num_tokens = 128 - pipe = LLMPipeline(model_path, "NPU", **config) tokenizer = Tokenizer(model_path) tokenized_input = tokenizer.encode(prompt) # ignore_eos=True to ensure model will generate exactly num_tokens - encoded_results = pipe.generate(tokenized_input, max_new_tokens=num_tokens, ignore_eos=True) + encoded_results = npu_model.generate(tokenized_input, max_new_tokens=num_tokens, ignore_eos=True) assert len(encoded_results.tokens[0]) == num_tokens @pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_terminate_by_out_of_memory(config, model_id): - _, _, model_path = download_and_convert_model(model_id) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_terminate_by_out_of_memory( + llm_model: OVConvertedModelSchema, + npu_config: dict +): + model_path = llm_model.models_path prompt = 'The Sun is yellow because' pipeline_config = { "MAX_PROMPT_LEN": 256, "MIN_RESPONSE_LEN": 64 } - pipeline_config |= config + pipeline_config |= npu_config kv_cache_size = pipeline_config['MAX_PROMPT_LEN'] + pipeline_config['MIN_RESPONSE_LEN'] tokenizer = Tokenizer(model_path) @@ -270,10 +329,13 @@ def test_terminate_by_out_of_memory(config, model_id): 
@pytest.mark.precommit -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) -def test_terminate_by_sampler(config, model_id): - _, _, model_path = download_and_convert_model(model_id) +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_terminate_by_sampler( + llm_model: OVConvertedModelSchema, + npu_model: LLMPipeline, +): + model_path = llm_model.models_path prompt = 'The Sun is yellow because' current_iter = 0 @@ -292,18 +354,30 @@ def end(self): tokenizer = Tokenizer(model_path) tokenized_input = tokenizer.encode(prompt) - pipe = LLMPipeline(model_path, "NPU", **config) - encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True, streamer=TestStreamer()) + encoded_results = npu_model.generate( + tokenized_input, + max_new_tokens=1000, + ignore_eos=True, + streamer=TestStreamer(), + ) assert len(encoded_results.tokens[0]) == num_iters # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! -@pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") -@pytest.mark.parametrize("config", pipeline_configs) -@pytest.mark.parametrize("model_id", get_models_list()) @pytest.mark.precommit -def test_chat_generation(config, model_id): +@pytest.mark.parametrize("llm_model", MODELS_LIST, indirect=True) +@pytest.mark.parametrize("npu_config", PIPELINE_CONFIGS, indirect=True) +def test_chat_generation( + ov_model: LLMPipeline, + npu_model: LLMPipeline, +): + def generate_chat_history(pipe: LLMPipeline, questions): + pipe.start_chat() + chat_history = [ pipe.generate(question, max_new_tokens=50, do_sample=False) for question in questions ] + pipe.finish_chat() + return chat_history + questions = [ '1+1=', 'What is the previous answer?', @@ -311,10 +385,8 @@ def test_chat_generation(config, model_id): 'What was my first question?' 
] - _, _, model_path = download_and_convert_model(model_id) - - chat_history_stateful = generate_chat_history(model_path, "CPU", get_default_llm_properties(), questions) - chat_history_static = generate_chat_history(model_path, "NPU", config, questions) + chat_history_stateful = generate_chat_history(ov_model, questions) + chat_history_static = generate_chat_history(npu_model, questions) print('npu chat: \n{chat_history_static}\n') print('cpu chat: \n{chat_history_stateful}') diff --git a/tests/python_tests/test_rag.py b/tests/python_tests/test_rag.py index 3facb41112..7c6b4ab109 100644 --- a/tests/python_tests/test_rag.py +++ b/tests/python_tests/test_rag.py @@ -6,11 +6,11 @@ import gc from pathlib import Path from openvino_genai import TextEmbeddingPipeline, TextRerankPipeline -from utils.hugging_face import download_and_convert_embeddings_models, download_and_convert_rerank_model, download_and_convert_model_fixture +from utils.hugging_face import download_and_convert_model, download_and_convert_model_class, OVConvertedModelSchema from langchain_core.documents.base import Document from langchain_community.embeddings import OpenVINOBgeEmbeddings from langchain_community.document_compressors.openvino_rerank import OpenVINOReranker -from typing import Literal, Union +from typing import Literal, Union, Optional import sys import platform from optimum.intel import OVModelForFeatureExtraction, OVModelForSequenceClassification @@ -65,11 +65,33 @@ " -@pytest.fixture(scope="class", autouse=True) +@pytest.fixture(scope="module") +def rerank_model(request) -> OVConvertedModelSchema: + model_id = request.param + return download_and_convert_model_class(model_id, OVModelForSequenceClassification) + + + +@pytest.fixture(scope="module") +def emb_model(request) -> OVConvertedModelSchema: + model_id = request.param + return download_and_convert_model_class(model_id, OVModelForFeatureExtraction) + + +@pytest.fixture(scope="module") +def llm_model(request: pytest.FixtureRequest) -> OVConvertedModelSchema: + tokenizer_kwargs = { + "padding_side": "left" + } + return download_and_convert_model(request.param, **tokenizer_kwargs) + + +@pytest.fixture(autouse=True) def run_gc_after_test(): """ - Fixture to run garbage collection after each test class. - This is a workaround to minimize memory consumption during tests and allow the use of less powerful CI runners. + Fixture to run garbage collection after each test. + This is a workaround to minimize memory consumption + during tests and allow the use of less powerful CI runners. 
""" yield gc.collect() @@ -83,7 +105,7 @@ def dataset_documents(chunk_size=200): def run_text_embedding_genai( models_path: Path, documents: list[str], - config: TextEmbeddingPipeline.Config | None = None, + config: Optional[TextEmbeddingPipeline.Config] = None, task: Literal["embed_documents", "embed_query"] = "embed_documents", ): if not config: @@ -103,7 +125,7 @@ def run_text_embedding_genai( def run_text_embedding_langchain( models_path: Path, documents: list[str], - config: TextEmbeddingPipeline.Config | None = None, + config: Optional[TextEmbeddingPipeline.Config] = None, task: Literal["embed_documents", "embed_query"] = "embed_documents", ): if not config: @@ -171,19 +193,22 @@ def run_qwen3_embedding_optimum( def validate_embedding_results(result_1: EmbeddingResult, result_2: EmbeddingResult): + __tracebackhide__ = True np_result_1 = np.array(result_1) np_result_2 = np.array(result_2) max_error = np.abs(np_result_1 - np_result_2).max() assert max_error < MAX_EMBEDDING_ERROR, f"Max error: {max_error} is greater than allowed {MAX_EMBEDDING_ERROR}" + def run_text_embedding_pipeline_with_ref( models_path: Path, documents: list[str], - config: TextEmbeddingPipeline.Config | None = None, + config: Optional[TextEmbeddingPipeline.Config] = None, task: Literal["embed_documents", "embed_query"] = "embed_documents", ): + __tracebackhide__ = True genai_result = run_text_embedding_genai(models_path, documents, config, task) langchain_result = run_text_embedding_langchain(models_path, documents, config, task) @@ -191,7 +216,8 @@ def run_text_embedding_pipeline_with_ref( def assert_rerank_results(result_1: list[tuple[int, float]], result_2: list[tuple[int, float]]): - score_diff_max = 1e-6 if sys.platform != "darwin" else 2e-4 # ARM64 macs have different results + __tracebackhide__ = True + score_diff_max = 1e-6 if sys.platform != 'darwin' else 2e-4 # ARM64 macs have different results assert len(result_1) == len(result_2), f"Results length mismatch: {len(result_1)} != {len(result_2)}" for pair_1, pair_2 in zip(result_1, result_2): assert pair_1[0] == pair_2[0], f"Document IDs do not match: {pair_1[0]} != {pair_2[0]}" @@ -202,7 +228,7 @@ def run_text_rerank_langchain( models_path: Path, query: str, documents: list[str], - config: TextRerankPipeline.Config | None = None, + config: Optional[TextRerankPipeline.Config] = None, ): if not config: config = TextRerankPipeline.Config() @@ -258,7 +284,7 @@ def run_text_rerank_genai( models_path: Path, query: str, documents: list[str], - config: TextRerankPipeline.Config | None = None, + config: Optional[TextRerankPipeline.Config] = None, ): if not config: config = TextRerankPipeline.Config() @@ -281,7 +307,7 @@ def run_text_rerank_pipeline_with_ref( models_path: Path, query: str, documents: list[str], - config: TextRerankPipeline.Config | None = None, + config: Optional[TextRerankPipeline.Config] = None, ): genai_result = run_text_rerank_genai(models_path, query, documents, config) langchain_result = run_text_rerank_langchain(models_path, query, documents, config) @@ -289,10 +315,50 @@ def run_text_rerank_pipeline_with_ref( assert_rerank_results(genai_result, langchain_result) -@pytest.mark.parametrize("download_and_convert_embeddings_models", ["BAAI/bge-small-en-v1.5"], indirect=True) @pytest.mark.precommit -def test_embedding_constructors(download_and_convert_embeddings_models): - _, _, models_path = download_and_convert_embeddings_models +@pytest.mark.parametrize( + "emb_model", + ["Qwen/Qwen3-Embedding-0.6B"], + indirect=True, +) 
+@pytest.mark.parametrize( + "config", + [ + TextEmbeddingPipeline.Config( + normalize=False, + pooling_type=TextEmbeddingPipeline.PoolingType.LAST_TOKEN, + padding_side="left" + ), + TextEmbeddingPipeline.Config( + normalize=False, + pooling_type=TextEmbeddingPipeline.PoolingType.LAST_TOKEN + ), + ], +) +@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 174635") +def test_qwen3_embedding(emb_model, dataset_documents, config): + if sys.platform == "darwin": + pytest.xfail("Qwen3-Embedding-0.6B model produces different results on ARM64 macs.") + + embeddings_opt = run_qwen3_embedding_optimum( + emb_model.opt_model, + emb_model.hf_tokenizer, + dataset_documents, + config.padding_side, + ) + embeddings_genai = run_text_embedding_genai( + emb_model.models_path, + dataset_documents, + config, + "embed_documents", + ) + validate_embedding_results(embeddings_genai, embeddings_opt.tolist()) + + +@pytest.mark.parametrize("emb_model", ["BAAI/bge-small-en-v1.5"], indirect=True) +@pytest.mark.precommit +def test_embedding_constructors(emb_model): + models_path = emb_model.models_path TextEmbeddingPipeline(models_path, "CPU") TextEmbeddingPipeline(models_path, "CPU", TextEmbeddingPipeline.Config()) @@ -317,24 +383,7 @@ def test_embedding_constructors(download_and_convert_embeddings_models): ) -@pytest.mark.parametrize("download_and_convert_embeddings_models", ["Qwen/Qwen3-Embedding-0.6B"], indirect=True) -@pytest.mark.parametrize( - "config", - [ - TextEmbeddingPipeline.Config(normalize=False, pooling_type=TextEmbeddingPipeline.PoolingType.LAST_TOKEN, padding_side="left"), - TextEmbeddingPipeline.Config(normalize=False, pooling_type=TextEmbeddingPipeline.PoolingType.LAST_TOKEN), - ], -) -@pytest.mark.precommit -@pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 174635") -def test_qwen3_embedding(download_and_convert_embeddings_models, dataset_documents, config): - opt_model, hf_tokenizer, models_path = download_and_convert_embeddings_models - embeddings_opt = run_qwen3_embedding_optimum(opt_model, hf_tokenizer, dataset_documents, config.padding_side) - embeddings_genai = run_text_embedding_genai(models_path, dataset_documents, config, "embed_documents") - validate_embedding_results(embeddings_genai, embeddings_opt.tolist()) - - -@pytest.mark.parametrize("download_and_convert_embeddings_models", EMBEDDINGS_TEST_MODELS, indirect=True) +@pytest.mark.parametrize("emb_model", EMBEDDINGS_TEST_MODELS, indirect=True) @pytest.mark.parametrize( "config", [ @@ -356,19 +405,19 @@ def test_qwen3_embedding(download_and_convert_embeddings_models, dataset_documen ], ) @pytest.mark.precommit -def test_embed_documents(download_and_convert_embeddings_models, dataset_documents, config): +def test_embed_documents(emb_model, dataset_documents, config): if ( sys.platform == "linux" - and "bge-small-en-v1.5" in str(download_and_convert_embeddings_models) + and "bge-small-en-v1.5" in str(emb_model) and config.normalize and config.pooling_type == TextEmbeddingPipeline.PoolingType.CLS ): pytest.xfail("Random segmentation fault. 
Ticket 172306") - _, _, models_path = download_and_convert_embeddings_models + models_path = emb_model.models_path run_text_embedding_pipeline_with_ref(models_path, dataset_documents, config, "embed_documents") -@pytest.mark.parametrize("download_and_convert_embeddings_models", EMBEDDINGS_TEST_MODELS, indirect=True) +@pytest.mark.parametrize("emb_model", EMBEDDINGS_TEST_MODELS, indirect=True) @pytest.mark.parametrize( "config", [ @@ -390,18 +439,18 @@ def test_embed_documents(download_and_convert_embeddings_models, dataset_documen ], ) @pytest.mark.precommit -def test_embed_query(download_and_convert_embeddings_models, dataset_documents, config): - _, _, models_path = download_and_convert_embeddings_models +def test_embed_query(emb_model, dataset_documents, config): + models_path = emb_model.models_path run_text_embedding_pipeline_with_ref(models_path, dataset_documents[:1], config, "embed_query") @pytest.fixture(scope="module") -def dataset_embeddings_genai_default_config_refs(download_and_convert_embeddings_models, dataset_documents): - _, _, models_path = download_and_convert_embeddings_models +def dataset_embeddings_genai_default_config_refs(emb_model, dataset_documents): + models_path = emb_model.models_path return run_text_embedding_genai(models_path, dataset_documents, None, "embed_documents") -@pytest.mark.parametrize("download_and_convert_embeddings_models", ["mixedbread-ai/mxbai-embed-xsmall-v1"], indirect=True) +@pytest.mark.parametrize("emb_model", ["mixedbread-ai/mxbai-embed-xsmall-v1"], indirect=True) @pytest.mark.parametrize( "config", [ @@ -415,8 +464,8 @@ def dataset_embeddings_genai_default_config_refs(download_and_convert_embeddings ], ) @pytest.mark.precommit -def test_fixed_shapes_configs(download_and_convert_embeddings_models, dataset_documents, config, dataset_embeddings_genai_default_config_refs): - _, _, models_path = download_and_convert_embeddings_models +def test_fixed_shapes_configs(emb_model, dataset_documents, config, dataset_embeddings_genai_default_config_refs): + models_path = emb_model.models_path docs_to_embed = dataset_documents[: config.batch_size] if config.batch_size else dataset_documents result = run_text_embedding_genai(models_path, docs_to_embed, config, "embed_documents") @@ -425,7 +474,7 @@ def test_fixed_shapes_configs(download_and_convert_embeddings_models, dataset_do validate_embedding_results(refs_to_validate, result) -@pytest.mark.parametrize("download_and_convert_embeddings_models", ["mixedbread-ai/mxbai-embed-xsmall-v1"], indirect=True) +@pytest.mark.parametrize("emb_model", ["mixedbread-ai/mxbai-embed-xsmall-v1"], indirect=True) @pytest.mark.parametrize( "config", [ @@ -439,8 +488,8 @@ def test_fixed_shapes_configs(download_and_convert_embeddings_models, dataset_do ) @pytest.mark.xfail() @pytest.mark.precommit -def test_fixed_shapes_configs_xfail(download_and_convert_embeddings_models, dataset_documents, config, dataset_embeddings_genai_default_config_refs): - _, _, models_path = download_and_convert_embeddings_models +def test_fixed_shapes_configs_xfail(emb_model, dataset_documents, config, dataset_embeddings_genai_default_config_refs): + models_path = emb_model.models_path docs_to_embed = dataset_documents[: config.batch_size] if config.batch_size else dataset_documents result = run_text_embedding_genai(models_path, docs_to_embed, config, "embed_documents") @@ -449,7 +498,7 @@ def test_fixed_shapes_configs_xfail(download_and_convert_embeddings_models, data validate_embedding_results(refs_to_validate, result) 
-@pytest.mark.parametrize("download_and_convert_embeddings_models", ["mixedbread-ai/mxbai-embed-xsmall-v1"], indirect=True) +@pytest.mark.parametrize("emb_model", ["mixedbread-ai/mxbai-embed-xsmall-v1"], indirect=True) @pytest.mark.parametrize( "config", [ @@ -462,8 +511,8 @@ def test_fixed_shapes_configs_xfail(download_and_convert_embeddings_models, data sys.platform == "darwin" or platform.machine() in ["aarch64", "arm64", "ARM64"], reason="NPU plugin is available only on Linux and Windows x86_64", ) -def test_npu_fallback(download_and_convert_embeddings_models, dataset_documents, config, dataset_embeddings_genai_default_config_refs): - _, _, models_path = download_and_convert_embeddings_models +def test_npu_fallback(emb_model, dataset_documents, config, dataset_embeddings_genai_default_config_refs): + models_path = emb_model.models_path NPU_FALLBACK_PROPERTIES = {"NPU_USE_NPUW": "YES", "NPUW_DEVICES": "CPU", "NPUW_ONLINE_PIPELINE": "NONE"} @@ -475,10 +524,10 @@ def test_npu_fallback(download_and_convert_embeddings_models, dataset_documents, validate_embedding_results(refs_to_validate, result) -@pytest.mark.parametrize("download_and_convert_rerank_model", [RERANK_TEST_MODELS[0]], indirect=True) +@pytest.mark.parametrize("rerank_model", [RERANK_TEST_MODELS[0]], indirect=True) @pytest.mark.precommit -def test_rerank_constructors(download_and_convert_rerank_model): - _, _, models_path = download_and_convert_rerank_model +def test_rerank_constructors(rerank_model): + models_path = rerank_model.models_path TextRerankPipeline(models_path, "CPU") TextRerankPipeline(models_path, "CPU", TextRerankPipeline.Config()) @@ -501,7 +550,7 @@ def test_rerank_constructors(download_and_convert_rerank_model): ) -@pytest.mark.parametrize("download_and_convert_rerank_model", RERANK_TEST_MODELS, indirect=True) +@pytest.mark.parametrize("rerank_model", RERANK_TEST_MODELS, indirect=True) @pytest.mark.parametrize("query", ["What are the main features of Intel Core Ultra processors?"]) @pytest.mark.parametrize( "config", @@ -515,13 +564,13 @@ def test_rerank_constructors(download_and_convert_rerank_model): ], ) @pytest.mark.precommit -def test_rerank_documents(download_and_convert_rerank_model, dataset_documents, query, config): - _, _, models_path = download_and_convert_rerank_model +def test_rerank_documents(rerank_model, dataset_documents, query, config): + models_path = rerank_model.models_path run_text_rerank_pipeline_with_ref(models_path, query, dataset_documents, config) # aligned with https://huggingface.co/tomaarsen/Qwen3-Reranker-0.6B-seq-cls#updated-transformers-usage -@pytest.mark.parametrize("download_and_convert_rerank_model", [QWEN3_RERANK_SEQ_CLS], indirect=True) +@pytest.mark.parametrize("rerank_model", [QWEN3_RERANK_SEQ_CLS], indirect=True) @pytest.mark.parametrize("query", ["Which planet is known as the Red Planet?"]) @pytest.mark.parametrize("task", ["Given a web search query, retrieve relevant passages that answer the query"]) @pytest.mark.parametrize( @@ -546,19 +595,28 @@ def test_rerank_documents(download_and_convert_rerank_model, dataset_documents, ) @pytest.mark.precommit @pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 174635") -def test_qwen3_seq_cls_rerank_documents(download_and_convert_rerank_model, query, task, documents, config): - opt_model, hf_tokenizer, models_path = download_and_convert_rerank_model - +def test_qwen3_seq_cls_rerank_documents(rerank_model: OVConvertedModelSchema, query, task, documents, config): formatted_query = 
qwen3_reranker_format_queries(query, task) formatted_documents = [qwen3_reranker_format_document(doc) for doc in documents] - opt_result = run_qwen3_rerank_optimum(opt_model, hf_tokenizer, formatted_query, formatted_documents, config) - genai_result = run_text_rerank_genai(models_path, formatted_query, formatted_documents, config) + opt_result = run_qwen3_rerank_optimum( + rerank_model.opt_model, + rerank_model.hf_tokenizer, + formatted_query, + formatted_documents, + config, + ) + genai_result = run_text_rerank_genai( + rerank_model.models_path, + formatted_query, + formatted_documents, + config, + ) assert_rerank_results(opt_result, genai_result) -@pytest.mark.parametrize("download_and_convert_model_fixture", [QWEN3_RERANK], indirect=True) +@pytest.mark.parametrize("llm_model", [QWEN3_RERANK], indirect=True) @pytest.mark.parametrize("query", ["Which planet is known as the Red Planet?"]) @pytest.mark.parametrize("task", ["Given a web search query, retrieve relevant passages that answer the query"]) @pytest.mark.parametrize( @@ -583,13 +641,22 @@ def test_qwen3_seq_cls_rerank_documents(download_and_convert_rerank_model, query ) @pytest.mark.precommit @pytest.mark.xfail(condition=(sys.platform == "darwin"), reason="Ticket - 174635") -def test_qwen3_rerank_documents(download_and_convert_model_fixture, query, task, documents, config): - opt_model, hf_tokenizer, models_path = download_and_convert_model_fixture - +def test_qwen3_rerank_documents(llm_model: OVConvertedModelSchema, query, task, documents, config): formatted_query = qwen3_reranker_format_queries(query, task) formatted_documents = [qwen3_reranker_format_document(doc) for doc in documents] - opt_result = run_qwen3_rerank_optimum(opt_model, hf_tokenizer, formatted_query, formatted_documents, config) - genai_result = run_text_rerank_genai(models_path, formatted_query, formatted_documents, config) + opt_result = run_qwen3_rerank_optimum( + llm_model.opt_model, + llm_model.hf_tokenizer, + formatted_query, + formatted_documents, + config, + ) + genai_result = run_text_rerank_genai( + llm_model.models_path, + formatted_query, + formatted_documents, + config, + ) assert_rerank_results(opt_result, genai_result) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index fbd296bfcb..acecbe54dc 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -10,59 +10,152 @@ from openvino_genai import GenerationConfig, StopCriteria from utils.ov_genai_pipelines import generate_and_compare, run_ov_pipeline, get_main_pipeline_types -from utils.hugging_face import download_and_convert_model +from utils.hugging_face import OVConvertedModelSchema, download_and_convert_model -@pytest.mark.precommit -@pytest.mark.parametrize("generation_config,prompt", - [(dict(max_new_tokens=30), 'table is made of'), - (dict(max_new_tokens=30, min_new_tokens=30), '你好! 
你好嗎?'), - (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'), - (dict(max_length=30, ignore_eos=True), 'table is made of'), - (dict(stop_token_ids={28998}, apply_chat_template=False), 'The Sun is yellow because'), # since a test does not hang, it means stop token is met, skip chat template to generate long answer - # (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?') - ], - ids=["max_new_tokens", - "min_and_max_new_tokens", - "max_new_tokens_and_ignore_eos_true", - "max_length", - "stop_token_ids", - # "echo_with_generation", - ]) -def test_basic_stop_criteria(generation_config, prompt): +PIPELINE_MAIN_TYPES = get_main_pipeline_types() + + +@pytest.fixture(scope="module") +def model_facebook_opt_125m() -> OVConvertedModelSchema: + model_id : str = "facebook/opt-125m" + return download_and_convert_model(model_id) + + +@pytest.fixture(scope="module") +def model_katuni4ka_tiny_random_phi3() -> OVConvertedModelSchema: model_id : str = "katuni4ka/tiny-random-phi3" - generate_and_compare(model_id, [prompt], generation_config) + return download_and_convert_model(model_id) @pytest.mark.precommit -@pytest.mark.parametrize("generation_config,model_id", - [(dict(max_new_tokens=50, min_new_tokens=15, stop_strings={"anag"}, include_stop_str_in_output=True), 'facebook/opt-125m'), # expected match on "manage" - (dict(max_new_tokens=50, min_new_tokens=1, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True), 'facebook/opt-125m'), - (dict(max_new_tokens=50, min_new_tokens=1, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True), 'facebook/opt-125m'), # expected no match - (dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=False),'facebook/opt-125m'), - (dict(max_new_tokens=30, stop_strings={ "machines" }, include_stop_str_in_output=True), 'facebook/opt-125m'), - (dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=False), 'facebook/opt-125m'), - (dict(max_new_tokens=30, stop_strings={ "machines", "manage" }, include_stop_str_in_output=True), 'facebook/opt-125m'), - (dict(max_new_tokens=30, stop_strings={ "software toolkit developed 1 by", "Intel" }, include_stop_str_in_output=False), 'TinyLlama/TinyLlama-1.1B-Chat-v1.0')], - ids=["single_stop_string", - "multiple_stop_strings_match", - "multiple_stop_strings_no_match", - "single_stop_string_exclude_from_output", - "single_stop_string_include_to_output", - "multiple_stop_strings_exclude_from_output", - "multiple_stop_strings_include_to_output", - "multiple_stop_strings_one_no_match_and_long_exclude_from_output"]) -@pytest.mark.parametrize("pipeline_type", get_main_pipeline_types()) -def test_stop_strings(generation_config, model_id, pipeline_type): +@pytest.mark.parametrize( + "generation_config,prompt", + [ + ({"max_new_tokens": 30}, 'table is made of'), + ({"max_new_tokens": 30, "min_new_tokens": 30}, '你好! 
你好嗎?'), + ({"max_new_tokens": 30, "ignore_eos": True}, 'Alan Turing was a'), + ({"max_length": 30, "ignore_eos": True}, 'table is made of'), + ({"stop_token_ids": {28998}, "apply_chat_template": False}, 'The Sun is yellow because'), + ], + ids=[ + "max_new_tokens", + "min_and_max_new_tokens", + "max_new_tokens_and_ignore_eos_true", + "max_length", + "stop_token_ids", + ] +) +def test_basic_stop_criteria( + model_katuni4ka_tiny_random_phi3: OVConvertedModelSchema, + generation_config: GenerationConfig, + prompt +): + generate_and_compare(model_katuni4ka_tiny_random_phi3, [prompt], generation_config) + + +@pytest.fixture(scope="module") +def model_tinyllama_1_1b_chat() -> OVConvertedModelSchema: + model_id : str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + return download_and_convert_model(model_id) + + +@pytest.mark.precommit +@pytest.mark.parametrize( + "generation_config", + [ + { + "max_new_tokens": 50, + "min_new_tokens": 15, + "stop_strings": {"anag"}, + "include_stop_str_in_output": True + }, + { + "max_new_tokens": 50, + "min_new_tokens": 1, + "stop_strings": {".", "software", "Intel"}, + "include_stop_str_in_output": True + }, + { + "max_new_tokens": 50, + "min_new_tokens": 1, + "stop_strings": {"Einstein", "sunny", "geothermal"}, + "include_stop_str_in_output": True + }, + { + "max_new_tokens": 30, + "stop_strings": {"machines"}, + "include_stop_str_in_output": False + }, + { + "max_new_tokens": 30, + "stop_strings": {"machines"}, + "include_stop_str_in_output": True + }, + { + "max_new_tokens": 30, + "stop_strings": {"machines", "manage"}, + "include_stop_str_in_output": False + }, + { + "max_new_tokens": 30, + "stop_strings": {"machines", "manage"}, + "include_stop_str_in_output": True + }, + ], + ids=[ + "single_stop_string", + "multiple_stop_strings_match", + "multiple_stop_strings_no_match", + "single_stop_string_exclude_from_output", + "single_stop_string_include_to_output", + "multiple_stop_strings_exclude_from_output", + "multiple_stop_strings_include_to_output", + ] +) +@pytest.mark.parametrize("pipeline_type", PIPELINE_MAIN_TYPES) +def test_stop_strings_facebook_opt( + model_facebook_opt_125m: OVConvertedModelSchema, + generation_config: GenerationConfig, + pipeline_type +): prompts = [ "What is OpenVINO?" ] - generate_and_compare(model_id, prompts, generation_config, pipeline_type=pipeline_type) + generate_and_compare(model_facebook_opt_125m, prompts, generation_config, pipeline_type=pipeline_type) @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", - [dict(max_new_tokens=30), - dict(max_new_tokens=30, repetition_penalty=2.0), - dict(max_new_tokens=300, apply_chat_template=False)], - ids=["basic", "repetition_penalty", "long_max_new_tokens"]) +@pytest.mark.parametrize( + "generation_config", + [ + { + "max_new_tokens": 30, + "stop_strings": {"software toolkit developed 1 by", "Intel"}, + "include_stop_str_in_output": False + }, + ], + ids=[ + "multiple_stop_strings_one_no_match_and_long_exclude_from_output", + ] +) +@pytest.mark.parametrize("pipeline_type", PIPELINE_MAIN_TYPES) +def test_stop_strings_tinyllama( + model_tinyllama_1_1b_chat: OVConvertedModelSchema, + generation_config: GenerationConfig, + pipeline_type +): + prompts = [ "What is OpenVINO?" 
] + generate_and_compare(model_tinyllama_1_1b_chat, prompts, generation_config, pipeline_type=pipeline_type) + + +@pytest.mark.precommit +@pytest.mark.parametrize( + "generation_config", + [ + {"max_new_tokens": 30}, + {"max_new_tokens": 30, "repetition_penalty": 2.0}, + {"max_new_tokens": 300, "apply_chat_template": False}, + ], + ids=["basic", "repetition_penalty", "long_max_new_tokens"] +) @pytest.mark.parametrize("prompt", [ 'What is OpenVINO?', 'table is made of', @@ -70,39 +163,62 @@ def test_stop_strings(generation_config, model_id, pipeline_type): '你好! 你好嗎?'.encode('unicode_escape'), # to escape Win limitation on Unicode tmp path 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' ]) -def test_greedy(generation_config, prompt): - model_id : str = "katuni4ka/tiny-random-phi3" +def test_greedy( + model_katuni4ka_tiny_random_phi3: OVConvertedModelSchema, + generation_config, + prompt +): prompt = prompt.decode('unicode_escape') if isinstance(prompt, bytes) else prompt - generate_and_compare(model=model_id, - prompts=prompt, - generation_config=generation_config) + generate_and_compare( + model_katuni4ka_tiny_random_phi3, + prompt, + generation_config + ) @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", - [dict(max_new_tokens=30, num_beams=2), - dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.NEVER), - dict(max_new_tokens=30, num_beams=2, stop_criteria=StopCriteria.EARLY), - # dict(max_new_tokens=30, num_beams=2, echo=True), - dict(max_new_tokens=30, num_beams=2, length_penalty=1.0), - dict(max_new_tokens=30, num_beams=2, no_repeat_ngram_size=2), - dict(max_new_tokens=30, num_beams=6, num_beam_groups=3, diversity_penalty=1.2, num_return_sequences=3), - dict(max_new_tokens=30, min_new_tokens=15, num_beams=2, num_return_sequences=1), - dict(max_new_tokens=30, num_beams=2, stop_strings={"Einstein", "sunny", "geothermal"}, include_stop_str_in_output=True),], - ids=["single_group_stop_criteria_heuristic", - "single_group_stop_criteria_never", - "single_group_stop_criteria_early", - # "single_group_with_echo", - "single_group_lenght_penalty", - "single_group_no_repeat_ngram_size", - "multiple_groups", - "single_group_min_new_tokens", - "single_group_with_multiple_stop_strings_no_match",]) -def test_beam_search(generation_config): +@pytest.mark.parametrize( + "generation_config", + [ + {"max_new_tokens": 30, "num_beams": 2}, + {"max_new_tokens": 30, "num_beams": 2, "stop_criteria": StopCriteria.NEVER}, + {"max_new_tokens": 30, "num_beams": 2, "stop_criteria": StopCriteria.EARLY}, + {"max_new_tokens": 30, "num_beams": 2, "length_penalty": 1.0}, + {"max_new_tokens": 30, "num_beams": 2, "no_repeat_ngram_size": 2}, + { + "max_new_tokens": 30, + "num_beams": 6, + "num_beam_groups": 3, + "diversity_penalty": 1.2, + "num_return_sequences": 3 + }, + {"max_new_tokens": 30, "min_new_tokens": 15, "num_beams": 2, "num_return_sequences": 1}, + { + "max_new_tokens": 30, + "num_beams": 2, + "stop_strings": {"Einstein", "sunny", "geothermal"}, + "include_stop_str_in_output": True + }, + ], + ids=[ + "single_group_stop_criteria_heuristic", + "single_group_stop_criteria_never", + "single_group_stop_criteria_early", + "single_group_lenght_penalty", + "single_group_no_repeat_ngram_size", + "multiple_groups", + "single_group_min_new_tokens", + "single_group_with_multiple_stop_strings_no_match", + ] +) +def test_beam_search( + model_facebook_opt_125m: 
OVConvertedModelSchema, + generation_config: GenerationConfig +): prompts = [ "What is OpenVINO?" ] - model_id : str = "facebook/opt-125m" - generate_and_compare(model_id, prompts, generation_config) + generate_and_compare(model_facebook_opt_125m, prompts, generation_config) + @pytest.mark.precommit @@ -111,28 +227,61 @@ def test_beam_search(generation_config): reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. If it changes, these cases shall be merged to the test above.", strict=True, ) -@pytest.mark.parametrize("generation_config", - [dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={"open sour"}, include_stop_str_in_output=True), - dict(max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, num_return_sequences=6, stop_strings={".", "software", "Intel"}, include_stop_str_in_output=True),], - ids=["single_stop_string_match", "multiple_stop_strings_match"]) -def test_beam_search_with_stop_string(generation_config): +@pytest.mark.parametrize( + "generation_config", + [ + { + "max_new_tokens": 50, + "num_beams": 6, + "num_beam_groups": 3, + "diversity_penalty": 1.0, + "num_return_sequences": 6, + "stop_strings": {"open sour"}, + "include_stop_str_in_output": True + }, + { + "max_new_tokens": 50, + "num_beams": 6, + "num_beam_groups": 3, + "diversity_penalty": 1.0, + "num_return_sequences": 6, + "stop_strings": {".", "software", "Intel"}, + "include_stop_str_in_output": True + }, + ], + ids=[ + "single_stop_string_match", + "multiple_stop_strings_match", + ] +) +def test_beam_search_with_stop_string( + model_facebook_opt_125m: OVConvertedModelSchema, + generation_config: GenerationConfig +): prompts = [ "What is OpenVINO?" ] - model_id : str = "facebook/opt-125m" - generate_and_compare(model_id, prompts, generation_config) + generate_and_compare(model_facebook_opt_125m, prompts, generation_config) @pytest.mark.precommit -@pytest.mark.parametrize("generation_config", - [dict(max_new_tokens=1, min_new_tokens=0, echo=True), - dict(max_new_tokens=30, num_beams=2, echo=True),], - ids=["echo_with_generation", - "single_group_with_echo",]) -def test_echo(generation_config): +@pytest.mark.parametrize( + "generation_config", + [ + {"max_new_tokens": 1, "min_new_tokens": 0, "echo": True}, + {"max_new_tokens": 30, "num_beams": 2, "echo": True}, + ], + ids=[ + "echo_with_generation", + "single_group_with_echo", + ] +) +def test_echo( + model_facebook_opt_125m: OVConvertedModelSchema, + generation_config: GenerationConfig +): prompts = [ "What is OpenVINO?" 
] - model_id : str = "facebook/opt-125m" # TODO: support in stateful mode and remove 'use_cb=True' and this test at all # as we can enable new parameters set in other tests - generate_and_compare(model_id, prompts, generation_config) + generate_and_compare(model_facebook_opt_125m, prompts, generation_config) # TODO: remove platform specific reference texts once CVS-159912 is done and use comparison with HF @@ -313,31 +462,38 @@ class RandomSamplingTestStruct: @pytest.mark.precommit -@pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, - ids=["multinomial_temperature", - "multinomial_temperature_and_top_p", - "multinomial_temperature_and_top_k", - "multinomial_temperature_top_p_and_top_k", - "multinomial_temperature_and_repetition_penalty", - "multinomial_temperature_and_num_return_sequence", - "multinomial_all_parameters", - "multinomial_temperature_and_presence_penalty", - "multinomial_temperature_and_frequence_penalty", - "greedy_with_penalties", - "multinomial_max_and_min_token"]) -def test_multinomial_sampling_against_reference(test_struct: RandomSamplingTestStruct): +@pytest.mark.parametrize( + "test_struct", + RANDOM_SAMPLING_TEST_CASES, + ids=[ + "multinomial_temperature", + "multinomial_temperature_and_top_p", + "multinomial_temperature_and_top_k", + "multinomial_temperature_top_p_and_top_k", + "multinomial_temperature_and_repetition_penalty", + "multinomial_temperature_and_num_return_sequence", + "multinomial_all_parameters", + "multinomial_temperature_and_presence_penalty", + "multinomial_temperature_and_frequence_penalty", + "greedy_with_penalties", + "multinomial_max_and_min_token", + ] +) +def test_multinomial_sampling_against_reference( + model_facebook_opt_125m: OVConvertedModelSchema, + test_struct: RandomSamplingTestStruct +): generation_config = test_struct.generation_config prompts = test_struct.prompts generation_config.rng_seed = 0 - model_id : str = "facebook/opt-125m" - _, _, models_path = download_and_convert_model(model_id) - # Run multinomial without comparison with HF reference. - _ = run_ov_pipeline(models_path=models_path, - prompt=prompts, - generation_config=generation_config) + run_ov_pipeline( + models_path=model_facebook_opt_125m.models_path, + prompt=prompts, + generation_config=generation_config + ) # Reference comparison is not performed as sampling results are non-deterministic. # Discrete_distribution impl depends on platform, model inference results may depend on CPU. 
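With this change test_sampling.py keeps one module-scoped fixture per model (model_facebook_opt_125m, model_katuni4ka_tiny_random_phi3, model_tinyllama_1_1b_chat), each returning an OVConvertedModelSchema, and generate_and_compare now takes the schema directly instead of a model id string. A new case would plug into the same fixtures as sketched below; the test name and the generation-config values are illustrative assumptions, not part of the patch.

# Sketch only: a hypothetical extra stop-string case reusing the module-scoped
# model fixture defined above in test_sampling.py.
import pytest
from utils.hugging_face import OVConvertedModelSchema
from utils.ov_genai_pipelines import generate_and_compare


@pytest.mark.precommit
@pytest.mark.parametrize(
    "generation_config",
    [
        {"max_new_tokens": 20, "stop_strings": {"OpenVINO"}, "include_stop_str_in_output": True},
    ],
    ids=["stop_on_openvino"],
)
def test_stop_string_extra_case(
    model_facebook_opt_125m: OVConvertedModelSchema,
    generation_config: dict,
) -> None:
    # the converted-model schema is passed straight through, as in the tests above
    generate_and_compare(model_facebook_opt_125m, ["What is OpenVINO?"], generation_config)
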
diff --git a/tests/python_tests/test_structured_output.py b/tests/python_tests/test_structured_output.py index 7926f3ebff..16c6ddca56 100644 --- a/tests/python_tests/test_structured_output.py +++ b/tests/python_tests/test_structured_output.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="module") def ov_pipe(request): - _, _, models_path = download_and_convert_model(request.param) + models_path = download_and_convert_model(request.param).models_path return create_ov_pipeline(models_path) class Person(BaseModel): diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index 866139f8cf..6e7d8ad8df 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -1,7 +1,10 @@ # Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + import dataclasses import json +import sys +from pathlib import Path from typing import Optional import numpy as np @@ -59,7 +62,9 @@ def get_chat_templates(): @pytest.fixture(scope="module") def ov_hf_tokenizers(request): - _, hf_tokenizer, models_path = download_and_convert_model(request.param) + model_schema = download_and_convert_model(request.param) + hf_tokenizer = model_schema.hf_tokenizer + models_path = model_schema.models_path ov_tokenizer = Tokenizer(models_path) return ov_tokenizer, hf_tokenizer @@ -80,24 +85,31 @@ def test_encode(ov_hf_tokenizers, prompt): assert np.all(encoded_hf == encoded_ov[0]) -encoded_prompts = [ - [1, 1591, 338, 1754, 310], - [1, 17102, 323, 3864, 471, 263], - # chineze characters - [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], - # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token - [3113, 264, 364, 267], - # batched tokens +@pytest.mark.parametrize("ov_hf_tokenizers", get_models_list(), indirect=True) +@pytest.mark.parametrize( + "encoded_prompt", [ - [1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263], + # chineze characters + [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], + # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token + [3113, 264, 364, 267], + # batched tokens + [ + [1, 1591, 338, 1754, 310], + [1, 1591, 338, 1754, 310], + [1, 17102, 323, 3864, 471, 263], + ], ], -] - - -@pytest.mark.parametrize("ov_hf_tokenizers", get_models_list(), indirect=True) -@pytest.mark.parametrize("encoded_prompt", encoded_prompts) + ids=[ + "encoded_prompt", + "encoded_prompt_2", + "encoded_prompt_chineze", + "encoded_prompt_meta_llama", + "encoded_prompt_batched", + ], +) @pytest.mark.precommit def test_decode(ov_hf_tokenizers, encoded_prompt): ov_tokenizer, hf_tokenizer = ov_hf_tokenizers @@ -112,7 +124,7 @@ def test_decode(ov_hf_tokenizers, encoded_prompt): assert decoded_hf == decoded_ov -conversation = [ +CONVERSATION_EXAMPLE = [ {"role": "user", "content": "1+1="}, {"role": "assistant", "content": "1 + 1 = 2"}, {"role": "user", "content": "What is the previous answer?"}, @@ -138,12 +150,12 @@ def test_apply_chat_template(model_tmp_path, chat_config: tuple[str, dict], ov_h tokenizer_config["chat_template"] = tokenizer_config["chat_template"]["default"] hf_full_history_str = hf_tokenizer.apply_chat_template( - conversation, add_generation_prompt=False, tokenize=False, **tokenizer_config + CONVERSATION_EXAMPLE, add_generation_prompt=False, tokenize=False, **tokenizer_config ) ov_tokenizer = load_genai_tokenizer_with_configs([(tokenizer_config, "tokenizer_config.json")], 
model_tmp_path[1]) ov_tokenizer.set_chat_template(tokenizer_config["chat_template"]) - ov_full_history_str = ov_tokenizer.apply_chat_template(conversation, add_generation_prompt=False) + ov_full_history_str = ov_tokenizer.apply_chat_template(CONVERSATION_EXAMPLE, add_generation_prompt=False) assert ov_full_history_str == hf_full_history_str, f"HF reference:\n{hf_full_history_str}\nGenAI output:\n{ov_full_history_str}" @@ -206,13 +218,13 @@ def test_apply_chat_template_with_tools_and_extra_context(model_tmp_path, ov_hf_ extra_context = { "enable_thinking": False } hf_full_history_str = hf_tokenizer.apply_chat_template( - conversation, add_generation_prompt=add_generation_prompt, tokenize=False, tools=tools, **extra_context, **tokenizer_config + CONVERSATION_EXAMPLE, add_generation_prompt=add_generation_prompt, tokenize=False, tools=tools, **extra_context, **tokenizer_config ) genai_tokenizer = load_genai_tokenizer_with_configs([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) ov_full_history_str = genai_tokenizer.apply_chat_template( - conversation, add_generation_prompt=add_generation_prompt, tools=tools, extra_context=extra_context + CONVERSATION_EXAMPLE, add_generation_prompt=add_generation_prompt, tools=tools, extra_context=extra_context ) assert ov_full_history_str == hf_full_history_str, f"HF reference:\n{hf_full_history_str}\nGenAI output:\n{ov_full_history_str}" @@ -228,10 +240,10 @@ def test_non_string_chat_template(hf_ov_genai_models): hf_tokenizer, genai_tokenzier = hf_ov_genai_models hf_full_history_str = hf_tokenizer.apply_chat_template( - conversation, add_generation_prompt=False, tokenize=False, + CONVERSATION_EXAMPLE, add_generation_prompt=False, tokenize=False, ) - ov_full_history_str = genai_tokenzier.apply_chat_template(conversation, add_generation_prompt=False) + ov_full_history_str = genai_tokenzier.apply_chat_template(CONVERSATION_EXAMPLE, add_generation_prompt=False) assert ov_full_history_str == hf_full_history_str, f"HF reference:\n{hf_full_history_str}\nGenAI output:\n{ov_full_history_str}" @@ -260,22 +272,7 @@ def test_set_chat_template(ov_hf_tokenizers): assert prompt == templated_prompt -eng_prompts = [ - "1+1=", - "What is the previous answer?", - "Why is the Sun yellow?", - "What was my first question?", - ["Why is the Sun yellow?"], - "Multiline\nstring\nWow!", -] -unicode_prompts = [ - *(str.encode(x, "unicode_escape") for x in [ - "如果您有任何疑问,请联系我们,我们将予以解答。", - "מחרוזת בדיקה", - ]) -] - - +@pytest.mark.precommit @pytest.mark.parametrize( "ov_hf_tokenizers", [ @@ -285,8 +282,29 @@ def test_set_chat_template(ov_hf_tokenizers): ], indirect=True, ) -@pytest.mark.precommit -@pytest.mark.parametrize("prompt", [*eng_prompts, *unicode_prompts]) +@pytest.mark.parametrize( + "prompt", + [ + "1+1=", + "What is the previous answer?", + "Why is the Sun yellow?", + "What was my first question?", + ["Why is the Sun yellow?"], + "Multiline\nstring\nWow!", + str.encode("如果您有任何疑问,请联系我们,我们将予以解答。", "unicode_escape"), + str.encode("מחרוזת בדיקה", "unicode_escape"), + ], + ids=[ + "Sum", + "Question 1", + "Question 2", + "Question 3", + "Question 4", + "Multiline string", + "Unicode escape Chinese", + "Unicode escape Hebrew", + ] +) def test_special_tokens(prompt, ov_hf_tokenizers): prompt = prompt.decode("unicode_escape") if isinstance(prompt, bytes) else prompt @@ -363,21 +381,6 @@ def hf_ov_genai_models(request, tmp_path_factory): prompts = [ - ["1+1=", "What is the previous answer?"], - # long sentence exceeding max_length, check that is truncated - "What is 
the previous answers? " * 1000, - # check that short sentence is padded to long - "what", - # check that large batch with multilangual data is correctly padded - [ - "1+1=", - "What is the previous answer?", - "Why is the Sun yellow?", - "What was my first question?", - "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?", - "מחרוזת בדיקה", - "Multiline\nstring!\nWow!", - ], ] @@ -387,7 +390,32 @@ def hf_ov_genai_models(request, tmp_path_factory): @pytest.mark.parametrize("pad_to_max_length", [None, True, False]) # regardless of what side was set during conversion we should be able to set it at runtime @pytest.mark.parametrize("padding_side", [None, "right", "left"]) -@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize( + "prompt", + [ + ["1+1=", "What is the previous answer?"], + # long sentence exceeding max_length, check that is truncated + "What is the previous answers? " * 1000, + # check that short sentence is padded to long + "what", + # check that large batch with multilangual data is correctly padded + [ + "1+1=", + "What is the previous answer?", + "Why is the Sun yellow?", + "What was my first question?", + "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?", + "מחרוזת בדיקה", + "Multiline\nstring!\nWow!", + ], + ], + ids=[ + "Sum and Question 1", + "Long sentence", + "Short sentence", + "Multilingual data", + ], +) @pytest.mark.parametrize( "hf_ov_genai_models", [ @@ -401,6 +429,14 @@ def hf_ov_genai_models(request, tmp_path_factory): ), # model with 2 RaggedToDense ops # ("black-forest-labs/FLUX.1-dev", dict(subfolder="tokenizer")), # FLUX.1-dev has tokenizer in subfolder ], + ids=[ + "phi3", + "TinyLlama-1.1B-Chat-v1.0", + "llava-next-right", + "llava-next-left", + "bge-small-en-v1.5", + # "FLUX.1-dev", + ], indirect=True, ) def test_padding( @@ -462,7 +498,6 @@ def test_padding( def make_model_params(): # Parametrize over add_second_input and number_of_inputs - params = [] for model_id_and_params in base_models_for_paired_input_test: model_id, params_dict = model_id_and_params @@ -474,17 +509,23 @@ def make_model_params(): params.append((model_id, {**params_dict, "add_second_input": True, "number_of_inputs": 2})) return params -models_with_pair_input = make_model_params() +MODELS_WITH_PAIR_INPUT = make_model_params() -@pytest.mark.parametrize("hf_ov_genai_models", models_with_pair_input, indirect=True) +@pytest.mark.parametrize("hf_ov_genai_models", MODELS_WITH_PAIR_INPUT, indirect=True) @pytest.mark.precommit -@pytest.mark.parametrize("input_pair", [[ - ["hi", "sun in yellow"], - ["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!"], - ["Eng... test, string?!", "Multiline\nstring!\nWow!" * 100], - ["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!" * 100], - ["hi" * 20, "buy" * 90], -]]) +@pytest.mark.parametrize( + "input_pair", + [ + [ + ["hi", "sun in yellow"], + ["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!"], + ["Eng... test, string?!", "Multiline\nstring!\nWow!" * 100], + ["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!" * 100], + ["hi" * 20, "buy" * 90], + ] + ], + ids=["batched_pairs_mixed_length"] +) def test_two_inputs_string_list_of_lists_batched(hf_ov_genai_models, input_pair): # Check with batched inputs: list of [str, str] pairs, consistent with HF format. 
hf_tokenizer, genai_tokenizer = hf_ov_genai_models @@ -492,15 +533,25 @@ def test_two_inputs_string_list_of_lists_batched(hf_ov_genai_models, input_pair) hf_encoded = hf_tokenizer(input_pair, return_tensors="np", padding=True)["input_ids"] assert np.all(ov_encoded == hf_encoded) -@pytest.mark.parametrize("hf_ov_genai_models", models_with_pair_input, indirect=True) +@pytest.mark.parametrize("hf_ov_genai_models", MODELS_WITH_PAIR_INPUT, indirect=True) @pytest.mark.precommit -@pytest.mark.parametrize("input_pair", [ - [["hi", "sun in yellow"]], - [["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!"]], - [["Eng... test, string?!", "Multiline\nstring!\nWow!" * 100]], - [["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!" * 100]], - [["hi" * 20, "buy" * 90]], -]) +@pytest.mark.parametrize( + "input_pair", + [ + [["hi", "sun in yellow"]], + [["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!"]], + [["Eng... test, string?!", "Multiline\nstring!\nWow!" * 100]], + [["Eng... test, string?!" * 100, "Multiline\nstring!\nWow!" * 100]], + [["hi" * 20, "buy" * 90]], + ], + ids=[ + "short_pair", + "long_first_short_second", + "short_first_long_second", + "both_long", + "repeated_tokens", + ] +) def test_two_inputs_string_list_of_lists(hf_ov_genai_models, input_pair): # Check with inputs consisted of lists of lists consistent with HF format. hf_tokenizer, genai_tokenzier = hf_ov_genai_models @@ -509,14 +560,23 @@ def test_two_inputs_string_list_of_lists(hf_ov_genai_models, input_pair): assert np.all(ov_encoded == hf_encoded) -@pytest.mark.parametrize("hf_ov_genai_models", models_with_pair_input, indirect=True) +@pytest.mark.parametrize("hf_ov_genai_models", MODELS_WITH_PAIR_INPUT, indirect=True) @pytest.mark.precommit -@pytest.mark.parametrize("input_pair", [ - [["Eng... test, string?!" * 100], ["Multiline\nstring!\nWow!"]], - [["hi" * 20], ["buy" * 90]], - [["What is the capital of Great Britain"] * 4, ["London is capital of Great Britain"]], - [["What is the capital of Great Britain"], ["London is capital of Great Britain"] * 4], -]) +@pytest.mark.parametrize( + "input_pair", + [ + [["Eng... test, string?!" * 100], ["Multiline\nstring!\nWow!"]], + [["hi" * 20], ["buy" * 90]], + [["What is the capital of Great Britain"] * 4, ["London is capital of Great Britain"]], + [["What is the capital of Great Britain"], ["London is capital of Great Britain"] * 4], + ], + ids=[ + "broadcast_long_first_short_second", + "broadcast_repeated_tokens", + "broadcast_first_batch_4", + "broadcast_second_batch_4", + ] +) def test_two_inputs_string(hf_ov_genai_models, input_pair): # Test when inputs are separate and they are broadcasted to the same length. # For HF we broadcast manually, but in GenAI this happens automatically. 
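The parametrization changes in this file follow one pytest pattern throughout: the model id (or model/config pair) is routed into a module-scoped fixture via `indirect=True` and `request.param`, while explicit `ids=[...]` keep the generated test names readable. A minimal, self-contained illustration of that mechanism, using toy names that are not taken from this suite:

import pytest

@pytest.fixture(scope="module")
def toy_tokenizer(request):
    # With indirect=True, each value from the parametrize list below
    # arrives here as request.param before the test body runs.
    return {"model_id": request.param}

@pytest.mark.parametrize(
    "toy_tokenizer",
    ["facebook/opt-125m", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"],
    ids=["opt-125m", "tiny-llama"],
    indirect=True,
)
def test_fixture_receives_model_id(toy_tokenizer):
    assert "/" in toy_tokenizer["model_id"]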
@@ -662,7 +722,7 @@ class ChatTemplates: tokenizer_config_json: Optional[str] -def generate_tokenizer(tmp_path, chat_templates): +def generate_tokenizer(tmp_path: Path, chat_templates: ChatTemplates) -> Tokenizer: input_ids = openvino.op.Constant(openvino.Type.i64, openvino.Shape([0, 0]), []).output(0) input_ids.get_tensor().set_names({"input_ids"}) attention_mask = openvino.op.Constant(openvino.Type.i64, openvino.Shape([0, 0]), []).output(0) @@ -674,19 +734,26 @@ def generate_tokenizer(tmp_path, chat_templates): if chat_templates.rt_template is not None: model.set_rt_info(chat_templates.rt_template, "chat_template") if chat_templates.chat_template_json is not None: - with open(tmp_path / "chat_template.json", "w", encoding="utf-8") as file: + with (tmp_path / "chat_template.json").open("w", encoding="utf-8") as file: json.dump({"chat_template": chat_templates.chat_template_json}, file) if chat_templates.processor_config_json is not None: - with open(tmp_path / "processor_config.json", "w", encoding="utf-8") as file: + with (tmp_path / "processor_config.json").open("w", encoding="utf-8") as file: json.dump({"chat_template": chat_templates.processor_config_json}, file) if chat_templates.tokenizer_config_json is not None: - with open(tmp_path / "tokenizer_config.json", "w", encoding="utf-8") as file: + with (tmp_path / "tokenizer_config.json").open("w", encoding="utf-8") as file: json.dump({"chat_template": chat_templates.tokenizer_config_json}, file) - openvino.save_model(model, tmp_path / "openvino_tokenizer.xml") + openvino.save_model(model, str(tmp_path / "openvino_tokenizer.xml")) + del model return Tokenizer(tmp_path) -QWEN2_VL_2B = "{% if messages is string %}{{ messages }}{% else %}{% for content in messages %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}" +QWEN2_VL_2B = ( + "{% if messages is string %}{{ messages }}{% else %}{% for content in messages %}" + "{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}" + "<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}" + "<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}" + "{{ content['text'] }}{% endif %}{% endfor %}{% endif %}" +) SIMPLIFIED_QWEN2_VL_2B = "{% for message in messages %}{{ message['content'] }}{% endfor %}" @@ -695,8 +762,11 @@ def generate_tokenizer(tmp_path, chat_templates): @pytest.mark.precommit def test_set_special_runtime_template(tmp_path): tokenizer = generate_tokenizer(tmp_path, ChatTemplates(None, None, None, None, None)) - tokenizer.chat_template = QWEN2_VL_2B - assert tokenizer.chat_template == SIMPLIFIED_QWEN2_VL_2B + try: + tokenizer.chat_template = QWEN2_VL_2B + assert tokenizer.chat_template == SIMPLIFIED_QWEN2_VL_2B + finally: + del tokenizer @pytest.mark.precommit @@ -718,9 +788,14 @@ def test_template_priorities(tmp_path, chat_templates): @pytest.mark.precommit def test_chat_template_with_empty_output(tmp_path): - tokenizer = generate_tokenizer(tmp_path, ChatTemplates(None, None, None, None, None)) - # Test throwing exception for empty rendered chat template (e.g. Qwen2-VL) - # Original Qwen2-VL chat template is modified with \n to avoid remapping with simplified template. 
+ tokenizer = generate_tokenizer( + tmp_path, + ChatTemplates(None, None, None, None, None) + ) chat_template_with_empty_output = QWEN2_VL_2B + "\n" with pytest.raises(Exception): - tokenizer.apply_chat_template(conversation, add_generation_prompt=False, chat_template=chat_template_with_empty_output) + tokenizer.apply_chat_template( + CONVERSATION_EXAMPLE, + add_generation_prompt=False, + chat_template=chat_template_with_empty_output + ) diff --git a/tests/python_tests/utils/hugging_face.py b/tests/python_tests/utils/hugging_face.py index 6d96f0e1c4..a0870561a0 100644 --- a/tests/python_tests/utils/hugging_face.py +++ b/tests/python_tests/utils/hugging_face.py @@ -1,15 +1,17 @@ # Copyright (C) 2018-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from os.path import sep +from dataclasses import dataclass from pathlib import Path from typing import Type from functools import lru_cache +import re +from optimum.modeling_base import OptimizedModel from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import GenerationConfig as HFGenerationConfig -from optimum.intel import OVModelForCausalLM, OVModelForFeatureExtraction, OVModelForSequenceClassification +from optimum.intel import OVModelForCausalLM, OVModelForSequenceClassification from optimum.intel.openvino.modeling import OVModel from huggingface_hub import hf_hub_download @@ -22,9 +24,20 @@ from utils.network import retry_request import pytest +from utils.constants import OV_MODEL_FILENAME + + +@dataclass(frozen=True) +class OVConvertedModelSchema: + model_id: str + opt_model: OptimizedModel + hf_tokenizer: AutoTokenizer + models_path: Path + + def generation_config_to_hf( default_generation_config : HFGenerationConfig, - generation_config : GenerationConfig + generation_config : GenerationConfig | None, ) -> HFGenerationConfig: if generation_config is None: return @@ -95,12 +108,12 @@ def generation_config_to_hf( return hf_generation_config def run_hugging_face( - opt_model, - hf_tokenizer, + opt_model: OptimizedModel, + hf_tokenizer: AutoTokenizer, prompts: list[str], generation_configs: list[GenerationConfig] | GenerationConfig, ) -> list[GenerationResult]: - generation_results = [] + generation_results: list[GenerationResult] = [] if type(generation_configs) is list: # process prompt by promp as we have multiple generation configs @@ -165,26 +178,51 @@ def run_hugging_face( # download HF model or read converted model -def get_huggingface_models(model_id: str | Path, model_class: Type[OVModel], local_files_only=False): - hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, local_files_only=local_files_only)) - opt_model = retry_request(lambda: model_class.from_pretrained(model_id, export=isinstance(model_id, str), compile=False, load_in_8bit=False, trust_remote_code=isinstance(model_id, str), ov_config=get_default_llm_properties(), local_files_only=local_files_only)) - return opt_model, hf_tokenizer - - -def convert_and_save_tokenizer(hf_tokenizer : AutoTokenizer, - models_path: Path, - **tokenizer_kwargs): - tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, **tokenizer_kwargs) +def get_huggingface_models( + model_id: str | Path, + model_class: Type[OVModel], + local_files_only=False, +) -> tuple[OptimizedModel, AutoTokenizer]: + def auto_tokenizer_from_pretrained() -> AutoTokenizer: + return AutoTokenizer.from_pretrained( + model_id, + trust_remote_code=True, + local_files_only=local_files_only, + ) + + def 
auto_model_from_pretrained() -> OptimizedModel: + return model_class.from_pretrained( + model_id, + export=isinstance(model_id, str), + compile=False, + load_in_8bit=False, + trust_remote_code=isinstance(model_id, str), + ov_config=get_default_llm_properties(), + local_files_only=local_files_only, + ) + + return retry_request(auto_model_from_pretrained), retry_request(auto_tokenizer_from_pretrained) + + +def convert_and_save_tokenizer( + hf_tokenizer : AutoTokenizer, + models_path: Path, + **convert_args, +): + tokenizer, detokenizer = convert_tokenizer( + hf_tokenizer, with_detokenizer=True, **convert_args + ) from utils.constants import OV_DETOKENIZER_FILENAME, OV_TOKENIZER_FILENAME save_model(tokenizer, models_path / OV_TOKENIZER_FILENAME) save_model(detokenizer, models_path / OV_DETOKENIZER_FILENAME) -def convert_models(opt_model : OVModelForCausalLM, - hf_tokenizer : AutoTokenizer, - models_path: Path, - **tokenizer_kwargs): +def convert_models( + opt_model : OVModelForCausalLM, + hf_tokenizer : AutoTokenizer, + models_path: Path, +) -> None: opt_model.save_pretrained(models_path) # save generation config if opt_model.generation_config: @@ -194,57 +232,53 @@ def convert_models(opt_model : OVModelForCausalLM, # to store tokenizer config jsons with special tokens hf_tokenizer.save_pretrained(models_path) # convert tokenizers as well - convert_and_save_tokenizer(hf_tokenizer, models_path, **tokenizer_kwargs) + convert_and_save_tokenizer(hf_tokenizer, models_path) -def download_and_convert_model(model_id: str, **tokenizer_kwargs): - return _download_and_convert_model(model_id, OVModelForCausalLM, **tokenizer_kwargs) +def download_and_convert_model(model_id: str, **tokenizer_kwargs) -> OVConvertedModelSchema: + return download_and_convert_model_class(model_id, OVModelForCausalLM, **tokenizer_kwargs) -@pytest.fixture(scope="module") -def download_and_convert_embeddings_models(request): - model_id = request.param - return _download_and_convert_model(model_id, OVModelForFeatureExtraction) +def sanitize_model_id(model_id: str) -> str: + return model_id.replace("/", "_") -@pytest.fixture() -def download_and_convert_rerank_model(request): - model_id = request.param - return _download_and_convert_model(model_id, OVModelForSequenceClassification) - -@pytest.fixture() -def download_and_convert_model_fixture(request): - model_id = request.param - tokenizer_kwargs = { - "padding_side": "left" - } - return _download_and_convert_model(model_id, OVModelForCausalLM, **tokenizer_kwargs) - - -def _download_and_convert_model(model_id: str, model_class: Type[OVModel], **tokenizer_kwargs): - dir_name = str(model_id).replace(sep, "_") +def download_and_convert_model_class( + model_id: str, + model_class: Type[OVModel], + **tokenizer_kwargs, +) -> OVConvertedModelSchema: + dir_name = sanitize_model_id(model_id) + if model_class.__name__ not in ["OVModelForCausalLM"]: + dir_name = f"{dir_name}_{model_class.__name__}" ov_cache_models_dir = get_ov_cache_models_dir() models_path = ov_cache_models_dir / dir_name - from utils.constants import OV_MODEL_FILENAME if (models_path / OV_MODEL_FILENAME).exists(): opt_model, hf_tokenizer = get_huggingface_models(models_path, model_class, local_files_only=True) else: opt_model, hf_tokenizer = get_huggingface_models(model_id, model_class, local_files_only=False) if "padding_side" in tokenizer_kwargs: - hf_tokenizer.padding_side = tokenizer_kwargs.pop("padding_side") + hf_tokenizer.padding_side = tokenizer_kwargs["padding_side"] # ov tokenizer padding side alignes with hf 
tokenizer during conversion convert_models(opt_model, hf_tokenizer, models_path) if "padding_side" in tokenizer_kwargs: - hf_tokenizer.padding_side = tokenizer_kwargs.pop("padding_side") + hf_tokenizer.padding_side = tokenizer_kwargs["padding_side"] - return opt_model, hf_tokenizer, models_path + return OVConvertedModelSchema( + model_id, + opt_model, + hf_tokenizer, + models_path, + ) -def download_gguf_model(gguf_model_id: str, - gguf_filename: str): - gguf_dir_name = str(gguf_model_id).replace(sep, "_") +def download_gguf_model( + gguf_model_id: str, + gguf_filename: str, +): + gguf_dir_name = sanitize_model_id(gguf_model_id) ov_cache_models_dir = get_ov_cache_models_dir() models_path_gguf = ov_cache_models_dir / gguf_dir_name @@ -256,10 +290,10 @@ def download_gguf_model(gguf_model_id: str, return gguf_path -@lru_cache(maxsize=None) + def load_hf_model_from_gguf(gguf_model_id, gguf_filename): return retry_request(lambda: AutoModelForCausalLM.from_pretrained(gguf_model_id, gguf_file=gguf_filename)) -@lru_cache(maxsize=None) + def load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename): return retry_request(lambda: AutoTokenizer.from_pretrained(gguf_model_id, gguf_file=gguf_filename)) diff --git a/tests/python_tests/utils/ov_genai_pipelines.py b/tests/python_tests/utils/ov_genai_pipelines.py index 167ff94bac..e5efcabcca 100644 --- a/tests/python_tests/utils/ov_genai_pipelines.py +++ b/tests/python_tests/utils/ov_genai_pipelines.py @@ -3,16 +3,22 @@ from enum import Enum from pathlib import Path -from typing import Callable -from shutil import rmtree - -from optimum.intel.openvino.utils import TemporaryDirectory -from openvino_genai import SchedulerConfig, draft_model, ContinuousBatchingPipeline, \ - LLMPipeline, GenerationConfig, GenerationResult, StreamerBase, DecodedResults +from typing import Callable, Optional + +from openvino_genai import ( + SchedulerConfig, + draft_model, + ContinuousBatchingPipeline, + LLMPipeline, + GenerationConfig, + GenerationResult, + StreamerBase, + DecodedResults, +) from utils.constants import get_default_llm_properties from utils.comparation import compare_generation_results, compare_generation_results_vs_ref -from utils.hugging_face import download_and_convert_model, run_hugging_face +from utils.hugging_face import OVConvertedModelSchema, download_and_convert_model, run_hugging_face def dict_to_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig: scheduler_config = SchedulerConfig() @@ -45,20 +51,36 @@ class PipelineType(Enum): AUTO = 6 -def get_all_pipeline_types(): - return [PipelineType.STATEFUL, PipelineType.PAGED_ATTENTION, PipelineType.CONTINUOUS_BATCHING, PipelineType.SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING, PipelineType.AUTO] +def get_gguf_pipeline_types() -> list[PipelineType]: + return [ + PipelineType.STATEFUL, + PipelineType.PAGED_ATTENTION, + ] + + +def get_main_pipeline_types() -> list[PipelineType]: + return [ + *get_gguf_pipeline_types(), + PipelineType.SPECULATIVE_DECODING, + PipelineType.PROMPT_LOOKUP_DECODING, + ] -def get_main_pipeline_types(): - return [PipelineType.STATEFUL, PipelineType.PAGED_ATTENTION, PipelineType.SPECULATIVE_DECODING, PipelineType.PROMPT_LOOKUP_DECODING] -def get_gguf_pipeline_types(): - return [PipelineType.STATEFUL, PipelineType.PAGED_ATTENTION] +def get_all_pipeline_types() -> list[PipelineType]: + return [ + *get_main_pipeline_types(), + PipelineType.CONTINUOUS_BATCHING, + PipelineType.AUTO, + ] + class StreamerWithResults: - # Return a streamer which accumulates results 
in order to compare with results returned from generate. - results: list[str] = [] - def __init__(self): - self.results = [] + """ + Return a streamer which accumulates results in order to compare with results returned from generate. + """ + + def __init__(self) -> None: + self.results: list[str] = [] def accumulate(self, subword) -> bool: self.results.append(subword) @@ -69,45 +91,63 @@ def get_results(self) -> list[GenerationResult]: streaming_result.m_generation_ids = [''.join(self.results)] return [streaming_result] - def reset(self): + def reset(self) -> None: self.results = [] -def create_ov_pipeline(models_path: Path, - pipeline_type: PipelineType = PipelineType.AUTO, - device: str = "CPU", - ov_config: dict = get_default_llm_properties(), - scheduler_config: SchedulerConfig = SchedulerConfig(), - draft_model_path: Path = None, - enable_save_ov_model: bool = None, - dynamic_quantization_group_size: str = None): - local_ov_config = ov_config.copy() +def create_ov_pipeline( + models_path: Path, + pipeline_type: PipelineType = PipelineType.AUTO, + device: str = "CPU", + ov_config: Optional[dict] = None, + scheduler_config: Optional[SchedulerConfig] = None, + draft_model_path: Optional[Path] = None, + enable_save_ov_model: Optional[bool] = None, + dynamic_quantization_group_size: Optional[str] = None, +) -> LLMPipeline: + if ov_config is None: + ov_config = get_default_llm_properties() + + if scheduler_config is None: + scheduler_config = SchedulerConfig() + if pipeline_type == PipelineType.AUTO: return LLMPipeline(models_path, device, ov_config) elif pipeline_type == PipelineType.STATEFUL: - if enable_save_ov_model is not None: local_ov_config["enable_save_ov_model"] = enable_save_ov_model - if dynamic_quantization_group_size is not None: local_ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = dynamic_quantization_group_size - return LLMPipeline(models_path, device, local_ov_config, ATTENTION_BACKEND="SDPA") + if enable_save_ov_model is not None: + ov_config["enable_save_ov_model"] = enable_save_ov_model + if dynamic_quantization_group_size is not None: + ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = dynamic_quantization_group_size + return LLMPipeline(models_path, device, ov_config, ATTENTION_BACKEND="SDPA") elif pipeline_type == PipelineType.PAGED_ATTENTION: - if enable_save_ov_model is not None: local_ov_config["enable_save_ov_model"] = enable_save_ov_model - if dynamic_quantization_group_size is not None: local_ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = dynamic_quantization_group_size - return LLMPipeline(models_path, device, local_ov_config, scheduler_config=scheduler_config, ATTENTION_BACKEND="PA") + if enable_save_ov_model is not None: + ov_config["enable_save_ov_model"] = enable_save_ov_model + if dynamic_quantization_group_size is not None: + ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"] = dynamic_quantization_group_size + return LLMPipeline(models_path, device, ov_config, scheduler_config=scheduler_config, ATTENTION_BACKEND="PA") elif pipeline_type == PipelineType.CONTINUOUS_BATCHING: return ContinuousBatchingPipeline(models_path, scheduler_config, device, ov_config) elif pipeline_type == PipelineType.SPECULATIVE_DECODING: - ov_draft_model = draft_model(models_path) if draft_model_path is None else draft_model(draft_model_path) + ov_draft_model = ( + draft_model(models_path) + if draft_model_path is None + else draft_model(draft_model_path) + ) return LLMPipeline(models_path, device, ov_config, scheduler_config=scheduler_config, draft_model=ov_draft_model) elif pipeline_type 
== PipelineType.PROMPT_LOOKUP_DECODING: return LLMPipeline(models_path, device, ov_config, scheduler_config=scheduler_config, prompt_lookup=True) else: raise Exception(f"Unsupported pipeline type: {pipeline_type}") -def create_ov_cb_pipeline(models_path: Path, - pipeline_type: PipelineType = PipelineType.AUTO, - device: str = "CPU", - ov_config: dict = get_default_llm_properties(), - scheduler_config: SchedulerConfig = SchedulerConfig(), - draft_model_path: Path = None): + +def create_ov_cb_pipeline( + models_path: Path, + pipeline_type: PipelineType = PipelineType.AUTO, + device: str = "CPU", + ov_config: dict = get_default_llm_properties(), + scheduler_config: SchedulerConfig = SchedulerConfig(), + draft_model_path: Optional[Path] = None, +) -> ContinuousBatchingPipeline: local_ov_config = ov_config.copy() if pipeline_type == PipelineType.CONTINUOUS_BATCHING: return ContinuousBatchingPipeline(models_path, scheduler_config, device, local_ov_config) @@ -122,8 +162,10 @@ def create_ov_cb_pipeline(models_path: Path, raise Exception(f"Unsupported pipeline type: {pipeline_type}") -def prepare_generation_config_by_pipe_type(generation_config : GenerationConfig, - pipeline_type: PipelineType = PipelineType.AUTO): +def prepare_generation_config_by_pipe_type( + generation_config : GenerationConfig, + pipeline_type: PipelineType = PipelineType.AUTO, +) -> GenerationConfig: if pipeline_type == PipelineType.SPECULATIVE_DECODING: assert not generation_config.is_beam_search() generation_config.assistant_confidence_threshold = 0.9 @@ -134,17 +176,20 @@ def prepare_generation_config_by_pipe_type(generation_config : GenerationConfig, return generation_config -def prepare_generation_configs_by_pipe_type(generation_configs : list[GenerationConfig], - pipeline_type: PipelineType = PipelineType.AUTO): +def prepare_generation_configs_by_pipe_type( + generation_configs : list[GenerationConfig], + pipeline_type: PipelineType = PipelineType.AUTO, +) -> list[GenerationConfig]: return [ prepare_generation_config_by_pipe_type(generation_config, pipeline_type) for generation_config in generation_configs ] def convert_decoded_results_to_generation_result(generate_outputs: DecodedResults, - num_prompts: int, - num_return_sequences: int, - is_beam_search: bool) -> list[GenerationResult]: + num_prompts: int, + num_return_sequences: int, + is_beam_search: bool, +) -> list[GenerationResult]: index = 0 - generation_results = [] + generation_results: list[GenerationResult] = [] for _ in range(num_prompts): generation_result = GenerationResult() @@ -159,21 +204,25 @@ def convert_decoded_results_to_generation_result(generate_outputs: DecodedResult return generation_results -def run_ov_pipeline(models_path: Path, - prompt : str | list[str], - generation_config : GenerationConfig | list[GenerationConfig], - pipeline_type : PipelineType = PipelineType.AUTO, - streamer: StreamerWithResults | Callable | StreamerBase = None, - scheduler_config: SchedulerConfig = SchedulerConfig(), - draft_model_path: Path = None, - ov_config: dict = {}, - device: str = "CPU" - ) -> list[GenerationResult]: +def run_ov_pipeline( + models_path: Path, + prompt : str | list[str], + generation_config : GenerationConfig | list[GenerationConfig], + pipeline_type : PipelineType = PipelineType.AUTO, + streamer: Optional[StreamerWithResults | Callable | StreamerBase] = None, + scheduler_config: SchedulerConfig = SchedulerConfig(), + draft_model_path: Optional[Path] = None, + ov_config: dict = {}, + device: str = "CPU", +) -> list[GenerationResult]: # update 
the generation config according pipeline_type updated_generation_config = None if isinstance(generation_config, list): if pipeline_type != PipelineType.CONTINUOUS_BATCHING: - raise Exception(f"\'generation_config\' is \'list[GenerationConfig]\'. This type is supported only for \'PipelineType.CONTINIOUS_BATCHING\'! Please change pipeline_type or generation_config type!") + raise Exception( + "\'generation_config\' is \'list[GenerationConfig]\'. This type is supported only for " + "\'PipelineType.CONTINUOUS_BATCHING\'! Please change pipeline_type or generation_config type!" + ) assert isinstance(prompt, list) assert len(generation_config) == len(prompt) updated_generation_config = prepare_generation_configs_by_pipe_type(generation_config, pipeline_type) @@ -190,19 +239,26 @@ def run_ov_pipeline(models_path: Path, streamer.reset() # create pipeline and generate results - ov_pipe = create_ov_pipeline(models_path=models_path, - pipeline_type=pipeline_type, - device=device, - ov_config=ov_config, - scheduler_config=scheduler_config, - draft_model_path=draft_model_path) + ov_pipe = create_ov_pipeline( + models_path=models_path, + pipeline_type=pipeline_type, + device=device, + ov_config=ov_config, + scheduler_config=scheduler_config, + draft_model_path=draft_model_path, + ) generation_results = ov_pipe.generate(prompt, updated_generation_config, streamer) # convert results to `list[GenerationResult]` if isinstance(generation_results, DecodedResults): assert isinstance(generation_config, GenerationConfig) num_prompts = 1 if isinstance(prompt, str) else len(prompt) - generation_results = convert_decoded_results_to_generation_result(generation_results, num_prompts, generation_config.num_return_sequences, generation_config.is_beam_search()) + generation_results = convert_decoded_results_to_generation_result( + generation_results, + num_prompts, + generation_config.num_return_sequences, + generation_config.is_beam_search(), + ) # cleanup test artifacts del ov_pipe @@ -210,7 +266,12 @@ def run_ov_pipeline(models_path: Path, # compare streaming results with generated results if isinstance(streamer, StreamerWithResults): prompts = [ prompt ] if isinstance(prompt, str) else prompt - compare_generation_results(prompts, generation_results, streamer.get_results(), generation_config) + compare_generation_results( + prompts, + generation_results, + streamer.get_results(), + generation_config, + ) return generation_results @@ -230,17 +291,18 @@ def is_generation_available(generation_config: GenerationConfig | list[Generatio # TODO: remove `ref` after Generator property is supported by LLMPipeline / VLMPipeline -def generate_and_compare(model: str, - prompts : str | list[str], - generation_config: list[GenerationConfig] | GenerationConfig | dict, - pipeline_type: PipelineType = PipelineType.AUTO, - scheduler_config: SchedulerConfig | dict = SchedulerConfig(), - ref : list[list[str]] = None, - streamer: StreamerWithResults | Callable | StreamerBase = None): +def generate_and_compare( + model_schema: OVConvertedModelSchema, + prompts : str | list[str], + generation_config: list[GenerationConfig] | GenerationConfig | dict, + pipeline_type: PipelineType = PipelineType.AUTO, + scheduler_config: SchedulerConfig | dict = SchedulerConfig(), + ref : Optional[list[list[str]]] = None, + streamer: Optional[StreamerWithResults | Callable | StreamerBase] = None +) -> None: ov_prompts = prompts if type(prompts) is list else [prompts] - ov_gen_config = GenerationConfig(**generation_config) if type(generation_config) is dict 
else generation_config - hf_gen_config = ov_gen_config + ov_gen_config = GenerationConfig(**generation_config) if isinstance(generation_config, dict) else generation_config if not is_generation_available(ov_gen_config, pipeline_type): return @@ -251,7 +313,6 @@ def generate_and_compare(model: str, ov_gen_config = [ov_gen_config] * len(ov_prompts) ov_scheduler_config = scheduler_config if isinstance(scheduler_config, SchedulerConfig) else dict_to_scheduler_config(scheduler_config) - opt_model, hf_tokenizer, models_path = download_and_convert_model(model) # w/a to align different API between CB and LLM run_cnt = len(ov_gen_config) if pipeline_type != PipelineType.CONTINUOUS_BATCHING and type(ov_gen_config) is list else 1 @@ -260,17 +321,29 @@ def generate_and_compare(model: str, current_it_prompts = [ov_prompts[i]] if run_cnt > 1 else ov_prompts current_it_gen_config = ov_gen_config[i] if run_cnt > 1 else ov_gen_config - ov_results = run_ov_pipeline(models_path=models_path, - prompt=current_it_prompts, - generation_config=current_it_gen_config, - pipeline_type=pipeline_type, - streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer, - scheduler_config=ov_scheduler_config, - ov_config=get_default_llm_properties()) + ov_results = run_ov_pipeline( + models_path=model_schema.models_path, + prompt=current_it_prompts, + generation_config=current_it_gen_config, + pipeline_type=pipeline_type, + streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer, + scheduler_config=ov_scheduler_config, + ov_config=get_default_llm_properties(), + ) if ref is None: - current_it_hf_config = [hf_gen_config[i]] if run_cnt > 1 else hf_gen_config - ref_results = run_hugging_face(opt_model, hf_tokenizer, current_it_prompts, current_it_hf_config) - compare_generation_results(current_it_prompts, ref_results, ov_results, current_it_gen_config) + current_it_hf_config = [ov_gen_config[i]] if run_cnt > 1 else ov_gen_config + ref_results = run_hugging_face( + model_schema.opt_model, + model_schema.hf_tokenizer, + current_it_prompts, + current_it_hf_config, + ) + compare_generation_results( + current_it_prompts, + ref_results, + ov_results, + current_it_gen_config, + ) else: compare_generation_results_vs_ref(ov_prompts[i], ref[i], ov_results) diff --git a/tests/python_tests/utils/tokenizers.py b/tests/python_tests/utils/tokenizers.py index de13038f06..6d1987dd1e 100644 --- a/tests/python_tests/utils/tokenizers.py +++ b/tests/python_tests/utils/tokenizers.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import shutil +import tempfile from pathlib import Path import openvino @@ -12,19 +13,20 @@ @pytest.fixture(scope="module") -def model_tmp_path(tmpdir_factory): +def model_tmp_path(): model_id = get_models_list()[0] - _, _, models_path = download_and_convert_model(model_id) - - temp_path = tmpdir_factory.mktemp(model_id.replace("/", "_")) - - # copy openvino converted model and tokenizers - for pattern in ["*.xml", "*.bin"]: - for src_file in models_path.glob(pattern): - if src_file.is_file(): - shutil.copy(src_file, temp_path / src_file.name) - - yield model_id, Path(temp_path) + model_schema = download_and_convert_model(model_id) + models_path = model_schema.models_path + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + # copy openvino converted model and tokenizers + for pattern in ["*.xml", "*.bin"]: + for src_file in models_path.glob(pattern): + if src_file.is_file(): + shutil.copy(src_file, temp_path / src_file.name) + + 
yield model_id, temp_path def delete_rt_info(configs: list[tuple], temp_path):
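A recurring fix in the `ov_genai_pipelines.py` hunks above is replacing signature defaults such as `ov_config: dict = get_default_llm_properties()` and `scheduler_config: SchedulerConfig = SchedulerConfig()` with `None` defaults resolved inside the function body. Python evaluates default values once, at definition time, so the old signatures shared a single default object across calls (which is why the old body copied `ov_config` before mutating it). A toy illustration of the difference, illustrative code only and not part of the patch:

def shared_default(config: dict = {}) -> dict:
    # The same dict object is reused for every call, so state leaks
    # between callers that do not pass their own config.
    config["calls"] = config.get("calls", 0) + 1
    return config

def fresh_default(config: dict | None = None) -> dict:
    # A new dict is created per call unless the caller supplies one.
    if config is None:
        config = {}
    config["calls"] = config.get("calls", 0) + 1
    return config

shared_default()
assert shared_default()["calls"] == 2  # leaked state from the first call
fresh_default()
assert fresh_default()["calls"] == 1   # each call starts clean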