From b9c1921caf93d8a4c6c5ac375df2e5cd7866407b Mon Sep 17 00:00:00 2001 From: Amine Date: Fri, 11 Jul 2025 20:19:51 +0100 Subject: [PATCH 1/5] =?UTF-8?q?feat:=20add=20AI=20module=20for=20LLM=20int?= =?UTF-8?q?eraction=20and=20a=20=20heuristic=20for=20checking=20code?= =?UTF-8?q?=E2=80=93docstring=20consistency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Amine --- src/macaron/ai.py | 175 ++++++++++++++++++ src/macaron/config/defaults.ini | 14 ++ .../pypi_heuristics/heuristics.py | 3 + .../sourcecode/matching_docstrings.py | 101 ++++++++++ .../slsa_analyzer/build_tool/gradle.py | 89 +++++++++ src/macaron/slsa_analyzer/build_tool/maven.py | 68 +++++++ src/macaron/slsa_analyzer/build_tool/pip.py | 5 + .../slsa_analyzer/build_tool/poetry.py | 5 + .../checks/detect_malicious_metadata_check.py | 12 +- .../pypi/test_matching_docstrings.py | 103 +++++++++++ 10 files changed, 572 insertions(+), 3 deletions(-) create mode 100644 src/macaron/ai.py create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py create mode 100644 tests/malware_analyzer/pypi/test_matching_docstrings.py diff --git a/src/macaron/ai.py b/src/macaron/ai.py new file mode 100644 index 000000000..eb48ba08b --- /dev/null +++ b/src/macaron/ai.py @@ -0,0 +1,175 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module provides a client for interacting with a Large Language Model (LLM).""" + +import json +import logging +import re +from typing import Any, TypeVar + +from pydantic import BaseModel, ValidationError + +from macaron.config.defaults import defaults +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError +from macaron.util import send_post_http_raw + +logger: logging.Logger = logging.getLogger(__name__) + +T = TypeVar("T", bound=BaseModel) + + +class AIClient: + """A client for interacting with a Large Language Model.""" + + def __init__(self, system_prompt: str): + """ + Initialize the AI client. + + The LLM configuration (enabled, API key, endpoint, model) is read from defaults. + """ + self.enabled, self.api_endpoint, self.api_key, self.model, self.context_window = self._load_defaults() + self.system_prompt = system_prompt.strip() or "You are a helpful AI assistant." 
+ logger.info("AI client is %s.", "enabled" if self.enabled else "disabled") + + def _load_defaults(self) -> tuple[bool, str, str, str, int]: + """Load the LLM configuration from the defaults.""" + section_name = "llm" + enabled, api_key, api_endpoint, model, context_window = False, "", "", "", 10000 + + if defaults.has_section(section_name): + section = defaults[section_name] + enabled = section.get("enabled", "False").strip().lower() == "true" + api_key = section.get("api_key", "").strip() + api_endpoint = section.get("api_endpoint", "").strip() + model = section.get("model", "").strip() + context_window = section.getint("context_window", 10000) + + if enabled: + if not api_key: + raise ConfigurationError("API key for the AI client is not configured.") + if not api_endpoint: + raise ConfigurationError("API endpoint for the AI client is not configured.") + if not model: + raise ConfigurationError("Model for the AI client is not configured.") + + return enabled, api_endpoint, api_key, model, context_window + + def _validate_response(self, response_text: str, response_model: type[T]) -> T: + """ + Validate and parse the response from the LLM. + + If raw JSON parsing fails, attempts to extract a JSON object from text. + + Parameters + ---------- + response_text: str + The response text from the LLM. + response_model: Type[T] + The Pydantic model to validate the response against. + + Returns + ------- + bool + The validated Pydantic model instance. + + Raises + ------ + HeuristicAnalyzerValueError + If there is an error in parsing or validating the response. + """ + try: + data = json.loads(response_text) + except json.JSONDecodeError: + logger.debug("Full JSON parse failed; trying to extract JSON from text.") + # If the response is not a valid JSON, try to extract a JSON object from the text. + match = re.search(r"\{.*\}", response_text, re.DOTALL) + if not match: + raise HeuristicAnalyzerValueError("No JSON object found in the LLM response.") from match + try: + data = json.loads(match.group(0)) + except json.JSONDecodeError as e: + logger.error("Failed to parse extracted JSON: %s", e) + raise HeuristicAnalyzerValueError("Invalid JSON extracted from response.") from e + + try: + return response_model.model_validate(data) + except ValidationError as e: + logger.error("Validation failed against response model: %s", e) + raise HeuristicAnalyzerValueError("Response JSON validation failed.") from e + + def invoke( + self, + user_prompt: str, + temperature: float = 0.2, + max_tokens: int = 4000, + structured_output: type[T] | None = None, + timeout: int = 30, + ) -> Any: + """ + Invoke the LLM and optionally validate its response. + + Parameters + ---------- + user_prompt: str + The user prompt to send to the LLM. + temperature: float + The temperature for the LLM response. + max_tokens: int + The maximum number of tokens for the LLM response. + structured_output: Optional[Type[T]] + The Pydantic model to validate the response against. If provided, the response will be parsed and validated. + timeout: int + The timeout for the HTTP request in seconds. + + Returns + ------- + Optional[T | str] + The validated Pydantic model instance if `structured_output` is provided, + or the raw string response if not. + + Raises + ------ + HeuristicAnalyzerValueError + If there is an error in parsing or validating the response. + """ + if not self.enabled: + raise ConfigurationError("AI client is not enabled. 
Please check your configuration.") + + if len(user_prompt.split()) > self.context_window: + logger.warning( + "User prompt exceeds context window (%s words). " + "Truncating the prompt to fit within the context window.", + self.context_window, + ) + user_prompt = " ".join(user_prompt.split()[: self.context_window]) + + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} + payload = { + "model": self.model, + "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}], + "temperature": temperature, + "max_tokens": max_tokens, + } + + try: + response = send_post_http_raw(url=self.api_endpoint, json_data=payload, headers=headers, timeout=timeout) + if not response: + raise HeuristicAnalyzerValueError("No response received from the LLM.") + response_json = response.json() + usage = response_json.get("usage", {}) + + if usage: + usage_str = ", ".join(f"{key} = {value}" for key, value in usage.items()) + logger.info("LLM call token usage: %s", usage_str) + + message_content = response_json["choices"][0]["message"]["content"] + + if not structured_output: + logger.debug("Returning raw message content (no structured output requested).") + return message_content + return self._validate_response(message_content, structured_output) + + except Exception as e: + logger.error("Error during LLM invocation: %s", e) + raise HeuristicAnalyzerValueError(f"Failed to get or validate LLM response: {e}") from e diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 8aa5e7a11..0d43d46bc 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -632,3 +632,17 @@ custom_semgrep_rules_path = # .yaml prefix. Note, this will be ignored if a path to custom semgrep rules is not provided. This list may not contain # duplicated elements, meaning that ruleset names must be unique. disabled_custom_rulesets = + +[llm] +# The LLM configuration for Macaron. +# If enabled, the LLM will be used to analyze the results and provide insights. +enabled = +# The API key for the LLM service. +api_key = +# The API endpoint for the LLM service. +api_endpoint = +# The model to use for the LLM service. +model = +# The context window size for the LLM service. +# This is the maximum number of tokens that the LLM can process in a single request. +context_window = 10000 diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index 1f1fdbf2e..0286cda2c 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -49,6 +49,9 @@ class Heuristics(str, Enum): #: Indicates that the package has a similar structure to other packages maintained by the same user. SIMILAR_PROJECTS = "similar_projects" + #: Indicates that the package contains some code that doesn't match the docstrings. + MATCHING_DOCSTRINGS = "matching_docstrings" + class HeuristicResult(str, Enum): """Result type indicating the outcome of a heuristic.""" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py new file mode 100644 index 000000000..ca9cafbe3 --- /dev/null +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py @@ -0,0 +1,101 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This analyzer checks the consistency of code with its docstrings."""
+
+import logging
+import time
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from macaron.ai import AIClient
+from macaron.json_tools import JsonType
+from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
+from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
+from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class Result(BaseModel):
+    """The result after analysing the code with its docstrings."""
+
+    decision: Literal["consistent", "inconsistent"] = Field(
+        description="""The final decision after analysing the code with its docstrings.
+        It can be either 'consistent' or 'inconsistent'."""
+    )
+    reason: str = Field(
+        description="The reason for the decision made. It should be a short sentence explaining the decision."
+    )
+    inconsistent_code_part: str | None = Field(
+        default=None,
+        description="""The specific part of the code that is inconsistent with the docstring.
+        Empty if the decision is 'consistent'.""",
+    )
+
+
+class MatchingDocstringsAnalyzer(BaseHeuristicAnalyzer):
+    """Check whether the docstrings and the code components are consistent."""
+
+    SYSTEM_PROMPT = """
+    You are a code master who can detect inconsistencies between code and the docstrings that describe its components.
+    You will be given a python code file. Your task is to determine whether the code is consistent with the docstrings.
+    Wrap the output in `json` tags.
+    Your response must be a JSON object matching this schema:
+    {
+        "decision": "'consistent' or 'inconsistent'",
+        "reason": "A short explanation.",
+        "inconsistent_code_part": "The inconsistent code, or null."
+    }
+
+    /no_think
+    """
+
+    REQUEST_INTERVAL = 0.5
+
+    def __init__(self) -> None:
+        super().__init__(
+            name="matching_docstrings_analyzer",
+            heuristic=Heuristics.MATCHING_DOCSTRINGS,
+            depends_on=None,
+        )
+        self.client = AIClient(system_prompt=self.SYSTEM_PROMPT.strip())
+
+    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
+        """Analyze the package.
+
+        Parameters
+        ----------
+        pypi_package_json: PyPIPackageJsonAsset
+            The PyPI package JSON asset object.
+
+        Returns
+        -------
+        tuple[HeuristicResult, dict[str, JsonType]]:
+            The result and related information collected during the analysis.
+        """
+        if not self.client.enabled:
+            logger.warning("AI client is not enabled, skipping the matching docstrings analysis.")
+            return HeuristicResult.SKIP, {}
+
+        download_result = pypi_package_json.download_sourcecode()
+        if not download_result:
+            logger.warning("No source code found for the package, skipping the matching docstrings analysis.")
+            return HeuristicResult.SKIP, {}
+
+        for file, content in pypi_package_json.iter_sourcecode():
+            if file.endswith(".py"):
+                time.sleep(self.REQUEST_INTERVAL)  # Respect the request interval to avoid rate limiting.
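+                # Decode leniently: files inside a published sdist are not guaranteed to be
+                # valid UTF-8, and a strict decode would abort the scan on a single bad file.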
+                code_str = content.decode("utf-8", "ignore")
+                analysis_result = self.client.invoke(
+                    user_prompt=code_str,
+                    structured_output=Result,
+                )
+                if analysis_result and analysis_result.decision == "inconsistent":
+                    return HeuristicResult.FAIL, {
+                        "file": file,
+                        "reason": analysis_result.reason,
+                        "inconsistent part": analysis_result.inconsistent_code_part or "",
+                    }
+        return HeuristicResult.PASS, {}
diff --git a/src/macaron/slsa_analyzer/build_tool/gradle.py b/src/macaron/slsa_analyzer/build_tool/gradle.py
index bd316dd30..c67f428cd 100644
--- a/src/macaron/slsa_analyzer/build_tool/gradle.py
+++ b/src/macaron/slsa_analyzer/build_tool/gradle.py
@@ -69,6 +69,96 @@ def is_detected(self, repo_path: str) -> bool:
         gradle_config_files = self.build_configs + self.entry_conf
         return any(file_exists(repo_path, file) for file in gradle_config_files)
 
+    def prepare_config_files(self, wrapper_path: str, build_dir: str) -> bool:
+        """Prepare the necessary wrapper files for running the build.
+
+        This method will return False if any errors happen during the operation.
+
+        Parameters
+        ----------
+        wrapper_path : str
+            The path where all necessary wrapper files are located.
+        build_dir : str
+            The path of the build dir. This is where all files are copied to.
+
+        Returns
+        -------
+        bool
+            True if the copy succeeds; False otherwise.
+        """
+        # The path of the needed wrapper files
+        wrapper_files = self.wrapper_files
+
+        if copy_file_bulk(wrapper_files, wrapper_path, build_dir):
+            # Ensure that gradlew is executable.
+            file_path = os.path.join(build_dir, "gradlew")
+            status = os.stat(file_path)
+            if oct(status.st_mode)[-3:] != "744":
+                logger.debug("%s does not have 744 permission. Changing it to 744.", file_path)
+                os.chmod(file_path, 0o744)
+            return True
+
+        return False
+
+    def get_dep_analyzer(self) -> CycloneDxGradle:
+        """Create a DependencyAnalyzer for the Gradle build tool.
+
+        Returns
+        -------
+        CycloneDxGradle
+            The CycloneDxGradle object.
+
+        Raises
+        ------
+        DependencyAnalyzerError
+        """
+        if "dependency.resolver" not in defaults or "dep_tool_gradle" not in defaults["dependency.resolver"]:
+            raise DependencyAnalyzerError("No default dependency analyzer is found.")
+        if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_gradle")):
+            raise DependencyAnalyzerError(
+                f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.",
+            )
+
+        tool_name, tool_version = tuple(
+            defaults.get(
+                "dependency.resolver",
+                "dep_tool_gradle",
+                fallback="cyclonedx-gradle:1.7.3",
+            ).split(":")
+        )
+        if tool_name == DependencyTools.CYCLONEDX_GRADLE:
+            return CycloneDxGradle(
+                resources_path=global_config.resources_path,
+                file_name="bom.json",
+                tool_name=tool_name,
+                tool_version=tool_version,
+            )
+
+        raise DependencyAnalyzerError(f"Unsupported SBOM generator for Gradle: {tool_name}.")
+
+    def get_gradle_exec(self, repo_path: str) -> str:
+        """Get the Gradle executable for the repo.
+
+        Parameters
+        ----------
+        repo_path: str
+            The absolute path to a repository containing Gradle projects.
+
+        Returns
+        -------
+        str
+            The absolute path to the Gradle executable.
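+            Prefers the repository's own gradlew wrapper when present and executable;
+            otherwise falls back to the gradlew bundled with Macaron's resources.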
+ """ + # We try to use the gradlew that comes with the repository first. + repo_gradlew = os.path.join(repo_path, "gradlew") + if os.path.isfile(repo_gradlew) and os.access(repo_gradlew, os.X_OK): + return repo_gradlew + + # We use Macaron's built-in gradlew as a fallback option. + return os.path.join(os.path.join(macaron.MACARON_PATH, "resources"), "gradlew") + def get_group_id(self, gradle_exec: str, project_path: str) -> str | None: """Get the group id of a Gradle project. diff --git a/src/macaron/slsa_analyzer/build_tool/maven.py b/src/macaron/slsa_analyzer/build_tool/maven.py index 0e89849af..922fb7b71 100644 --- a/src/macaron/slsa_analyzer/build_tool/maven.py +++ b/src/macaron/slsa_analyzer/build_tool/maven.py @@ -64,3 +64,71 @@ def is_detected(self, repo_path: str) -> bool: return False maven_config_files = self.build_configs return any(file_exists(repo_path, file) for file in maven_config_files) + + def prepare_config_files(self, wrapper_path: str, build_dir: str) -> bool: + """Prepare the necessary wrapper files for running the build. + + This method will return False if there is any errors happened during operation. + + Parameters + ---------- + wrapper_path : str + The path where all necessary wrapper files are located. + build_dir : str + The path of the build dir. This is where all files are copied to. + + Returns + ------- + bool + True if succeed else False. + """ + # The path of the needed wrapper files + wrapper_files = self.wrapper_files + + if copy_file_bulk(wrapper_files, wrapper_path, build_dir): + # Ensure that mvnw is executable. + file_path = os.path.join(build_dir, "mvnw") + status = os.stat(file_path) + if oct(status.st_mode)[-3:] != "744": + logger.debug("%s does not have 744 permission. Changing it to 744.") + os.chmod(file_path, 0o744) + return True + + return False + + def get_dep_analyzer(self) -> CycloneDxMaven: + """ + Create a DependencyAnalyzer for the Maven build tool. + + Returns + ------- + CycloneDxMaven + The CycloneDxMaven object. + + Raises + ------ + DependencyAnalyzerError + """ + if "dependency.resolver" not in defaults or "dep_tool_maven" not in defaults["dependency.resolver"]: + raise DependencyAnalyzerError("No default dependency analyzer is found.") + if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_maven")): + raise DependencyAnalyzerError( + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_maven')} is not valid.", + ) + + tool_name, tool_version = tuple( + defaults.get( + "dependency.resolver", + "dep_tool_maven", + fallback="cyclonedx-maven:2.6.2", + ).split(":") + ) + if tool_name == DependencyTools.CYCLONEDX_MAVEN: + return CycloneDxMaven( + resources_path=global_config.resources_path, + file_name="bom.json", + tool_name=tool_name, + tool_version=tool_version, + ) + + raise DependencyAnalyzerError(f"Unsupported SBOM generator for Maven: {tool_name}.") diff --git a/src/macaron/slsa_analyzer/build_tool/pip.py b/src/macaron/slsa_analyzer/build_tool/pip.py index 1926ca33b..073380ec2 100644 --- a/src/macaron/slsa_analyzer/build_tool/pip.py +++ b/src/macaron/slsa_analyzer/build_tool/pip.py @@ -64,6 +64,11 @@ def get_dep_analyzer(self) -> DependencyAnalyzer: DependencyAnalyzer The DependencyAnalyzer object. 
""" + tool_name = "cyclonedx_py" + if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"): + raise DependencyAnalyzerError( + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", + ) return CycloneDxPython( resources_path=global_config.resources_path, file_name="python_sbom.json", diff --git a/src/macaron/slsa_analyzer/build_tool/poetry.py b/src/macaron/slsa_analyzer/build_tool/poetry.py index a1d5a4a0d..3e928dfca 100644 --- a/src/macaron/slsa_analyzer/build_tool/poetry.py +++ b/src/macaron/slsa_analyzer/build_tool/poetry.py @@ -102,6 +102,11 @@ def get_dep_analyzer(self) -> DependencyAnalyzer: DependencyAnalyzer The DependencyAnalyzer object. """ + tool_name = "cyclonedx_py" + if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"): + raise DependencyAnalyzerError( + f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", + ) return CycloneDxPython( resources_path=global_config.resources_path, file_name="python_sbom.json", diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 9f09362a4..a09289713 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -28,6 +28,7 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer +from macaron.malware_analyzer.pypi_heuristics.sourcecode.matching_docstrings import MatchingDocstringsAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext @@ -366,6 +367,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: TyposquattingPresenceAnalyzer, FakeEmailAnalyzer, SimilarProjectAnalyzer, + MatchingDocstringsAnalyzer, ] # name used to query the result of all problog rules, so it can be accessed outside the model. @@ -445,6 +447,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: failed({Heuristics.SIMILAR_PROJECTS.value}), failed({Heuristics.HIGH_RELEASE_FREQUENCY.value}), failed({Heuristics.FAKE_EMAIL.value}). + % Package released with a name similar to a popular package. + {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_3) :- + quickUndetailed, forceSetup, failed({Heuristics.MATCHING_DOCSTRINGS.value}). + % ----- Evaluation ----- % Aggregate result @@ -452,10 +458,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: {problog_result_access} :- trigger(malware_high_confidence_2). {problog_result_access} :- trigger(malware_high_confidence_3). {problog_result_access} :- trigger(malware_high_confidence_4). - {problog_result_access} :- trigger(malware_medium_confidence_1). - {problog_result_access} :- trigger(malware_medium_confidence_2). - {problog_result_access} :- trigger(malware_medium_confidence_3). {problog_result_access} :- trigger(malware_medium_confidence_4). + {problog_result_access} :- trigger(malware_medium_confidence_3). + {problog_result_access} :- trigger(malware_medium_confidence_2). 
+ {problog_result_access} :- trigger(malware_medium_confidence_1). query({problog_result_access}). % Explainability diff --git a/tests/malware_analyzer/pypi/test_matching_docstrings.py b/tests/malware_analyzer/pypi/test_matching_docstrings.py new file mode 100644 index 000000000..c427fa6f9 --- /dev/null +++ b/tests/malware_analyzer/pypi/test_matching_docstrings.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Tests for the MatchingDocstringsAnalyzer heuristic.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.sourcecode.matching_docstrings import MatchingDocstringsAnalyzer, Result + + +@pytest.fixture(name="analyzer") +def analyzer_() -> MatchingDocstringsAnalyzer: + """Pytest fixture to create a MatchingDocstringsAnalyzer instance.""" + return MatchingDocstringsAnalyzer() + + +@pytest.fixture(autouse=True) +def skip_if_client_disabled(analyzer: MatchingDocstringsAnalyzer) -> None: + """ + Automatically skip tests in this file if the AI client is disabled. + """ + if not analyzer.client.enabled: + pytest.skip("AI client disabled - skipping test") + + +def test_analyze_consistent_docstrings_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes when docstrings are consistent with the code.""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("test.py", b"def func():\n '''docstring'''\n pass")] + + mock_result = Result(decision="consistent", reason="The code is consistent with the docstring.") + + with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info + mock_invoke.assert_called_once() + + +def test_analyze_inconsistent_docstrings_fail( + analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer fails when docstrings are inconsistent with the code.""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [ + ("test.py", b"def func():\n '''docstring'''\n print('hello')") + ] + + mock_result = Result( + decision="inconsistent", + reason="The docstring does not mention the print statement.", + inconsistent_code_part="print('hello')", + ) + + with patch.object(analyzer.client, "invoke", return_value=mock_result): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["file"] == "test.py" + assert info["reason"] == "The docstring does not mention the print statement." 
+ assert info["inconsistent part"] == "print('hello')" + + +def test_analyze_ai_client_disabled_skip(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer skips when the AI client is disabled.""" + with patch.object(analyzer.client, "enabled", False): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert not info + + +def test_analyze_no_source_code_skip(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer skips if the source code cannot be downloaded.""" + pypi_package_json.download_sourcecode.return_value = False + with patch.object(analyzer.client, "invoke") as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert not info + mock_invoke.assert_not_called() + + +def test_analyze_no_python_files_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes if there are no Python files in the source code.""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("README.md", b"This is a test package.")] + with patch.object(analyzer.client, "invoke") as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info + mock_invoke.assert_not_called() + + +def test_analyze_llm_invocation_error_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer passes if the LLM invocation returns None (e.g., on API error).""" + pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("test.py", b"def func():\n pass")] + + with patch.object(analyzer.client, "invoke", return_value=None): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info From 65e54a16b2cd4d78c0ccc0ea9e21de06f7e82c61 Mon Sep 17 00:00:00 2001 From: Amine Date: Thu, 24 Jul 2025 10:52:16 +0100 Subject: [PATCH 2/5] feat(ai): improve robustness of AI client Signed-off-by: Amine --- pyproject.toml | 1 + src/macaron/ai.py | 175 ------------------ src/macaron/ai/README.md | 50 +++++ src/macaron/ai/__init__.py | 2 + src/macaron/ai/ai_client.py | 53 ++++++ src/macaron/ai/ai_factory.py | 70 +++++++ src/macaron/ai/ai_tools.py | 53 ++++++ src/macaron/ai/openai_client.py | 100 ++++++++++ src/macaron/config/defaults.ini | 6 +- .../sourcecode/matching_docstrings.py | 13 +- .../pypi/test_matching_docstrings.py | 10 +- 11 files changed, 344 insertions(+), 189 deletions(-) delete mode 100644 src/macaron/ai.py create mode 100644 src/macaron/ai/README.md create mode 100644 src/macaron/ai/__init__.py create mode 100644 src/macaron/ai/ai_client.py create mode 100644 src/macaron/ai/ai_factory.py create mode 100644 src/macaron/ai/ai_tools.py create mode 100644 src/macaron/ai/openai_client.py diff --git a/pyproject.toml b/pyproject.toml index 40d99dcec..b87768cb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ "cryptography >=44.0.0,<45.0.0", "semgrep == 1.113.0", "email-validator >=2.2.0,<3.0.0", + "pydantic >= 2.11.5,<2.12.0", ] keywords = [] # https://pypi.org/classifiers/ diff --git a/src/macaron/ai.py b/src/macaron/ai.py deleted file mode 100644 index eb48ba08b..000000000 --- a/src/macaron/ai.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. 
All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - -"""This module provides a client for interacting with a Large Language Model (LLM).""" - -import json -import logging -import re -from typing import Any, TypeVar - -from pydantic import BaseModel, ValidationError - -from macaron.config.defaults import defaults -from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError -from macaron.util import send_post_http_raw - -logger: logging.Logger = logging.getLogger(__name__) - -T = TypeVar("T", bound=BaseModel) - - -class AIClient: - """A client for interacting with a Large Language Model.""" - - def __init__(self, system_prompt: str): - """ - Initialize the AI client. - - The LLM configuration (enabled, API key, endpoint, model) is read from defaults. - """ - self.enabled, self.api_endpoint, self.api_key, self.model, self.context_window = self._load_defaults() - self.system_prompt = system_prompt.strip() or "You are a helpful AI assistant." - logger.info("AI client is %s.", "enabled" if self.enabled else "disabled") - - def _load_defaults(self) -> tuple[bool, str, str, str, int]: - """Load the LLM configuration from the defaults.""" - section_name = "llm" - enabled, api_key, api_endpoint, model, context_window = False, "", "", "", 10000 - - if defaults.has_section(section_name): - section = defaults[section_name] - enabled = section.get("enabled", "False").strip().lower() == "true" - api_key = section.get("api_key", "").strip() - api_endpoint = section.get("api_endpoint", "").strip() - model = section.get("model", "").strip() - context_window = section.getint("context_window", 10000) - - if enabled: - if not api_key: - raise ConfigurationError("API key for the AI client is not configured.") - if not api_endpoint: - raise ConfigurationError("API endpoint for the AI client is not configured.") - if not model: - raise ConfigurationError("Model for the AI client is not configured.") - - return enabled, api_endpoint, api_key, model, context_window - - def _validate_response(self, response_text: str, response_model: type[T]) -> T: - """ - Validate and parse the response from the LLM. - - If raw JSON parsing fails, attempts to extract a JSON object from text. - - Parameters - ---------- - response_text: str - The response text from the LLM. - response_model: Type[T] - The Pydantic model to validate the response against. - - Returns - ------- - bool - The validated Pydantic model instance. - - Raises - ------ - HeuristicAnalyzerValueError - If there is an error in parsing or validating the response. - """ - try: - data = json.loads(response_text) - except json.JSONDecodeError: - logger.debug("Full JSON parse failed; trying to extract JSON from text.") - # If the response is not a valid JSON, try to extract a JSON object from the text. 
- match = re.search(r"\{.*\}", response_text, re.DOTALL) - if not match: - raise HeuristicAnalyzerValueError("No JSON object found in the LLM response.") from match - try: - data = json.loads(match.group(0)) - except json.JSONDecodeError as e: - logger.error("Failed to parse extracted JSON: %s", e) - raise HeuristicAnalyzerValueError("Invalid JSON extracted from response.") from e - - try: - return response_model.model_validate(data) - except ValidationError as e: - logger.error("Validation failed against response model: %s", e) - raise HeuristicAnalyzerValueError("Response JSON validation failed.") from e - - def invoke( - self, - user_prompt: str, - temperature: float = 0.2, - max_tokens: int = 4000, - structured_output: type[T] | None = None, - timeout: int = 30, - ) -> Any: - """ - Invoke the LLM and optionally validate its response. - - Parameters - ---------- - user_prompt: str - The user prompt to send to the LLM. - temperature: float - The temperature for the LLM response. - max_tokens: int - The maximum number of tokens for the LLM response. - structured_output: Optional[Type[T]] - The Pydantic model to validate the response against. If provided, the response will be parsed and validated. - timeout: int - The timeout for the HTTP request in seconds. - - Returns - ------- - Optional[T | str] - The validated Pydantic model instance if `structured_output` is provided, - or the raw string response if not. - - Raises - ------ - HeuristicAnalyzerValueError - If there is an error in parsing or validating the response. - """ - if not self.enabled: - raise ConfigurationError("AI client is not enabled. Please check your configuration.") - - if len(user_prompt.split()) > self.context_window: - logger.warning( - "User prompt exceeds context window (%s words). " - "Truncating the prompt to fit within the context window.", - self.context_window, - ) - user_prompt = " ".join(user_prompt.split()[: self.context_window]) - - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"} - payload = { - "model": self.model, - "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}], - "temperature": temperature, - "max_tokens": max_tokens, - } - - try: - response = send_post_http_raw(url=self.api_endpoint, json_data=payload, headers=headers, timeout=timeout) - if not response: - raise HeuristicAnalyzerValueError("No response received from the LLM.") - response_json = response.json() - usage = response_json.get("usage", {}) - - if usage: - usage_str = ", ".join(f"{key} = {value}" for key, value in usage.items()) - logger.info("LLM call token usage: %s", usage_str) - - message_content = response_json["choices"][0]["message"]["content"] - - if not structured_output: - logger.debug("Returning raw message content (no structured output requested).") - return message_content - return self._validate_response(message_content, structured_output) - - except Exception as e: - logger.error("Error during LLM invocation: %s", e) - raise HeuristicAnalyzerValueError(f"Failed to get or validate LLM response: {e}") from e diff --git a/src/macaron/ai/README.md b/src/macaron/ai/README.md new file mode 100644 index 000000000..28ddf4757 --- /dev/null +++ b/src/macaron/ai/README.md @@ -0,0 +1,50 @@ +# Macaron AI Module + +This module provides the foundation for interacting with Large Language Models (LLMs) in a provider-agnostic way. 
It includes an abstract client definition, provider-specific client implementations, a client factory, and utility functions for processing responses.

## Module Components

- **ai_client.py**
  Defines the abstract [`AIClient`](./ai_client.py) class. This class handles the initialization of LLM configuration from the defaults and serves as the base for all specific AI client implementations.

- **openai_client.py**
  Implements the [`OpenAiClient`](./openai_client.py) class, a concrete subclass of [`AIClient`](./ai_client.py). This client interacts with OpenAI-like APIs by sending requests over HTTP and processing the responses. It also validates and structures responses using the tools provided.

- **ai_factory.py**
  Contains the [`AIClientFactory`](./ai_factory.py) class, which is responsible for reading provider configuration from the defaults and creating the correct AI client instance.

- **ai_tools.py**
  Offers utility functions such as `structure_response` to assist with parsing and validating the JSON response returned by an LLM. These functions ensure that responses conform to a given Pydantic model for easier downstream processing.

## Usage

1. **Configuration:**
   The module reads the LLM configuration from the application defaults (using the `defaults` module). Make sure that the `llm` section in your configuration includes valid settings such as `enabled`, `provider`, `api_key`, `api_endpoint`, `model`, and `context_window`.

2. **Creating a Client:**
   Use the [`AIClientFactory`](./ai_factory.py) to create an AI client instance. The factory checks the configured provider and returns a client (e.g., an instance of [`OpenAiClient`](./openai_client.py)) that can be used to invoke the LLM. Note that `create_client` returns `None` if the configured provider is not supported.

   Example:
   ```py
   from macaron.ai.ai_factory import AIClientFactory

   factory = AIClientFactory()
   client = factory.create_client(system_prompt="You are a helpful assistant.")
   if client:
       response = client.invoke("Hello, how can you assist me?")
       print(response)
   ```

3. **Response Processing:**
   When a structured response is required, pass a Pydantic model class to the `invoke` method. The [`ai_tools.py`](./ai_tools.py) module takes care of parsing and validating the response to ensure it meets the expected structure.

## Logging and Error Handling

- The module uses Python's logging framework to report important events, such as token usage and warnings when prompts exceed the allowed context window.
- Configuration errors (e.g., missing API key or endpoint) are handled by raising descriptive exceptions such as [`ConfigurationError`](../errors.py).

## Extensibility

The design of the AI module is provider-agnostic. To add support for additional LLM providers:
- Implement a new client by subclassing [`AIClient`](./ai_client.py).
- Add the new client to the [`PROVIDER_MAPPING`](./ai_factory.py).
- Update the configuration defaults accordingly.
diff --git a/src/macaron/ai/__init__.py b/src/macaron/ai/__init__.py
new file mode 100644
index 000000000..8e17a3508
--- /dev/null
+++ b/src/macaron/ai/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
diff --git a/src/macaron/ai/ai_client.py b/src/macaron/ai/ai_client.py
new file mode 100644
index 000000000..35733e5d8
--- /dev/null
+++ b/src/macaron/ai/ai_client.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. 
All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module defines the abstract AIClient class for implementing AI clients."""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, TypeVar
+
+from pydantic import BaseModel
+
+T = TypeVar("T", bound=BaseModel)
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class AIClient(ABC):
+    """This abstract class is used to implement AI clients."""
+
+    def __init__(self, system_prompt: str, defaults: dict) -> None:
+        """
+        Initialize the AI client.
+
+        The LLM configuration is read from defaults.
+        """
+        self.system_prompt = system_prompt
+        self.defaults = defaults
+
+    @abstractmethod
+    def invoke(
+        self,
+        user_prompt: str,
+        temperature: float = 0.2,
+        structured_output: type[T] | None = None,
+    ) -> Any:
+        """
+        Invoke the LLM and optionally validate its response.
+
+        Parameters
+        ----------
+        user_prompt: str
+            The user prompt to send to the LLM.
+        temperature: float
+            The temperature for the LLM response.
+        structured_output: Optional[Type[T]]
+            The Pydantic model to validate the response against. If provided, the response will be parsed and validated.
+
+        Returns
+        -------
+        Optional[T | str]
+            The validated Pydantic model instance if `structured_output` is provided,
+            or the raw string response if not.
+        """
diff --git a/src/macaron/ai/ai_factory.py b/src/macaron/ai/ai_factory.py
new file mode 100644
index 000000000..9462ebf86
--- /dev/null
+++ b/src/macaron/ai/ai_factory.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module defines the AIClientFactory class for creating AI clients based on provider configuration."""
+
+import logging
+
+from macaron.ai.ai_client import AIClient
+from macaron.ai.openai_client import OpenAiClient
+from macaron.config.defaults import defaults
+from macaron.errors import ConfigurationError
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class AIClientFactory:
+    """Factory to create AI clients based on provider configuration."""
+
+    PROVIDER_MAPPING: dict[str, type[AIClient]] = {"openai": OpenAiClient}
+
+    def __init__(self) -> None:
+        """
+        Initialize the AI client factory.
+
+        The LLM configuration is read from defaults.
+ """ + self.defaults = self._load_defaults() + + def _load_defaults(self) -> dict: + section_name = "llm" + default_values = { + "enabled": False, + "provider": "", + "api_key": "", + "api_endpoint": "", + "model": "", + "context_window": 10000, + } + + if defaults.has_section(section_name): + section = defaults[section_name] + default_values["enabled"] = section.getboolean("enabled", default_values["enabled"]) + default_values["api_key"] = str(section.get("api_key", default_values["api_key"])).strip().lower() + default_values["api_endpoint"] = ( + str(section.get("api_endpoint", default_values["api_endpoint"])).strip().lower() + ) + default_values["model"] = str(section.get("model", default_values["model"])).strip().lower() + default_values["provider"] = str(section.get("provider", default_values["provider"])).strip().lower() + default_values["context_window"] = section.getint("context_window", 10000) + + if default_values["enabled"]: + for key, value in default_values.items(): + if not value: + raise ConfigurationError( + f"AI client configuration '{key}' is required but not set in the defaults." + ) + + return default_values + + def create_client(self, system_prompt: str) -> AIClient | None: + """Create an AI client based on the configured provider.""" + client_class = self.PROVIDER_MAPPING.get(self.defaults["provider"]) + if client_class is None: + logger.error("Provider '%s' is not supported.", self.defaults["provider"]) + return None + return client_class(system_prompt, self.defaults) + + def list_available_providers(self) -> list[str]: + """List all registered providers.""" + return list(self.PROVIDER_MAPPING.keys()) diff --git a/src/macaron/ai/ai_tools.py b/src/macaron/ai/ai_tools.py new file mode 100644 index 000000000..e476376f9 --- /dev/null +++ b/src/macaron/ai/ai_tools.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module provides utility functions for Large Language Model (LLM).""" +import json +import logging +import re +from typing import TypeVar + +from pydantic import BaseModel, ValidationError + +T = TypeVar("T", bound=BaseModel) + +logger: logging.Logger = logging.getLogger(__name__) + + +def structure_response(response_text: str, response_model: type[T]) -> T | None: + """ + Structure and parse the response from the LLM. + + If raw JSON parsing fails, attempts to extract a JSON object from text. + + Parameters + ---------- + response_text: str + The response text from the LLM. + response_model: Type[T] + The Pydantic model to structure the response against. + + Returns + ------- + T | None + The structured Pydantic model instance. + """ + try: + data = json.loads(response_text) + except json.JSONDecodeError: + logger.debug("Full JSON parse failed; trying to extract JSON from text.") + # If the response is not a valid JSON, try to extract a JSON object from the text. 
+    match = re.search(r"\{.*\}", response_text, re.DOTALL)
+    if not match:
+        return None
+    try:
+        data = json.loads(match.group(0))
+    except json.JSONDecodeError as e:
+        logger.debug("Failed to parse extracted JSON: %s", e)
+        return None
+
+    try:
+        return response_model.model_validate(data)
+    except ValidationError as e:
+        logger.debug("Validation failed against response model: %s", e)
+        return None
diff --git a/src/macaron/ai/openai_client.py b/src/macaron/ai/openai_client.py
new file mode 100644
index 000000000..cd856745c
--- /dev/null
+++ b/src/macaron/ai/openai_client.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module provides a client for interacting with an OpenAI-compatible Large Language Model (LLM) API."""
+
+import logging
+from typing import Any, TypeVar
+
+from pydantic import BaseModel
+
+from macaron.ai.ai_client import AIClient
+from macaron.ai.ai_tools import structure_response
+from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError
+from macaron.util import send_post_http_raw
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class OpenAiClient(AIClient):
+    """A client for interacting with a Large Language Model served through an OpenAI-compatible API."""
+
+    def invoke(
+        self,
+        user_prompt: str,
+        temperature: float = 0.2,
+        structured_output: type[T] | None = None,
+        max_tokens: int = 4000,
+        timeout: int = 30,
+    ) -> Any:
+        """
+        Invoke the LLM and optionally validate its response.
+
+        Parameters
+        ----------
+        user_prompt: str
+            The user prompt to send to the LLM.
+        temperature: float
+            The temperature for the LLM response.
+        structured_output: Optional[Type[T]]
+            The Pydantic model to validate the response against. If provided, the response will be parsed and validated.
+        max_tokens: int
+            The maximum number of tokens for the LLM response.
+        timeout: int
+            The timeout for the HTTP request in seconds.
+
+        Returns
+        -------
+        Optional[T | str]
+            The validated Pydantic model instance if `structured_output` is provided,
+            or the raw string response if not.
+
+        Raises
+        ------
+        HeuristicAnalyzerValueError
+            If there is an error in parsing or validating the response.
+        """
+        if not self.defaults["enabled"]:
+            raise ConfigurationError("AI client is not enabled. Please check your configuration.")
+
+        if len(user_prompt.split()) > self.defaults["context_window"]:
+            logger.warning(
+                "User prompt exceeds context window (%s words). 
" + "Truncating the prompt to fit within the context window.", + self.defaults["context_window"], + ) + user_prompt = " ".join(user_prompt.split()[: self.defaults["context_window"]]) + + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.defaults["api_key"]}"} + payload = { + "model": self.defaults["model"], + "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}], + "temperature": temperature, + "max_tokens": max_tokens, + } + + try: + response = send_post_http_raw( + url=self.defaults["api_endpoint"], json_data=payload, headers=headers, timeout=timeout + ) + if not response: + raise HeuristicAnalyzerValueError("No response received from the LLM.") + response_json = response.json() + usage = response_json.get("usage", {}) + + if usage: + usage_str = ", ".join(f"{key} = {value}" for key, value in usage.items()) + logger.info("LLM call token usage: %s", usage_str) + + message_content = response_json["choices"][0]["message"]["content"] + + if not structured_output: + logger.debug("Returning raw message content (no structured output requested).") + return message_content + return structure_response(message_content, structured_output) + + except Exception as e: + logger.error("Error during LLM invocation: %s", e) + raise HeuristicAnalyzerValueError(f"Failed to get or validate LLM response: {e}") from e diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 0d43d46bc..5f40e8ac5 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -636,7 +636,11 @@ disabled_custom_rulesets = [llm] # The LLM configuration for Macaron. # If enabled, the LLM will be used to analyze the results and provide insights. -enabled = +enabled = False +# The provider for the LLM service. +# Supported providers : +# - openai: OpenAI's GPT models. +provider = # The API key for the LLM service. api_key = # The API endpoint for the LLM service. diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py index ca9cafbe3..bd5a864da 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field -from macaron.ai import AIClient +from macaron.ai.ai_factory import AIClientFactory from macaron.json_tools import JsonType from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -60,7 +60,13 @@ def __init__(self) -> None: heuristic=Heuristics.MATCHING_DOCSTRINGS, depends_on=None, ) - self.client = AIClient(system_prompt=self.SYSTEM_PROMPT.strip()) + factory = AIClientFactory() + client = None + + if factory.defaults["enabled"]: + client = factory.create_client(self.SYSTEM_PROMPT.strip()) + + self.client = client def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the package. @@ -75,8 +81,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes tuple[HeuristicResult, dict[str, JsonType]]: The result and related information collected during the analysis. 
""" - if not self.client.enabled: - logger.warning("AI client is not enabled, skipping the matching docstrings analysis.") + if not self.client: return HeuristicResult.SKIP, {} download_result = pypi_package_json.download_sourcecode() diff --git a/tests/malware_analyzer/pypi/test_matching_docstrings.py b/tests/malware_analyzer/pypi/test_matching_docstrings.py index c427fa6f9..f051bf76c 100644 --- a/tests/malware_analyzer/pypi/test_matching_docstrings.py +++ b/tests/malware_analyzer/pypi/test_matching_docstrings.py @@ -22,7 +22,7 @@ def skip_if_client_disabled(analyzer: MatchingDocstringsAnalyzer) -> None: """ Automatically skip tests in this file if the AI client is disabled. """ - if not analyzer.client.enabled: + if not analyzer.client: pytest.skip("AI client disabled - skipping test") @@ -63,14 +63,6 @@ def test_analyze_inconsistent_docstrings_fail( assert info["inconsistent part"] == "print('hello')" -def test_analyze_ai_client_disabled_skip(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: - """Test the analyzer skips when the AI client is disabled.""" - with patch.object(analyzer.client, "enabled", False): - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.SKIP - assert not info - - def test_analyze_no_source_code_skip(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: """Test the analyzer skips if the source code cannot be downloaded.""" pypi_package_json.download_sourcecode.return_value = False From 6da5458de8e973b0de2eec0a70049b92dafbd388 Mon Sep 17 00:00:00 2001 From: Amine Date: Fri, 15 Aug 2025 02:19:11 +0100 Subject: [PATCH 3/5] feat: add Inconsistent Description heuristic Signed-off-by: Amine --- src/macaron/ai/README.md | 14 +-- src/macaron/ai/ai_tools.py | 22 +--- src/macaron/ai/clients/__init__.py | 9 ++ src/macaron/ai/{ => clients}/ai_factory.py | 34 +++--- .../ai/{ai_client.py => clients/base.py} | 24 ++-- src/macaron/ai/{ => clients}/openai_client.py | 33 ++---- src/macaron/ai/prompts/__init__.py | 2 + src/macaron/ai/schemas/__init__.py | 2 + src/macaron/config/defaults.ini | 3 - .../pypi_heuristics/heuristics.py | 3 + .../metadata/inconsistent_description.py | 107 ++++++++++++++++++ .../sourcecode/matching_docstrings.py | 66 ++++++----- .../checks/detect_malicious_metadata_check.py | 3 + .../pypi/test_inconsistent_description.py | 75 ++++++++++++ .../pypi/test_matching_docstrings.py | 18 +-- 15 files changed, 292 insertions(+), 123 deletions(-) create mode 100644 src/macaron/ai/clients/__init__.py rename src/macaron/ai/{ => clients}/ai_factory.py (58%) rename src/macaron/ai/{ai_client.py => clients/base.py} (62%) rename src/macaron/ai/{ => clients}/openai_client.py (69%) create mode 100644 src/macaron/ai/prompts/__init__.py create mode 100644 src/macaron/ai/schemas/__init__.py create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py create mode 100644 tests/malware_analyzer/pypi/test_inconsistent_description.py diff --git a/src/macaron/ai/README.md b/src/macaron/ai/README.md index 28ddf4757..9fc5b0b30 100644 --- a/src/macaron/ai/README.md +++ b/src/macaron/ai/README.md @@ -5,13 +5,13 @@ This module provides the foundation for interacting with Large Language Models ( ## Module Components - **ai_client.py** - Defines the abstract [`AIClient`](./ai_client.py) class. This class handles the initialization of LLM configuration from the defaults and serves as the base for all specific AI client implementations. 
Defines the abstract [`AIClient`](./clients/base.py) class. This class handles the initialization of LLM configuration from the defaults and serves as the base for all specific AI client implementations.

- **openai_client.py**
  Implements the [`OpenAiClient`](./clients/openai_client.py) class, a concrete subclass of [`AIClient`](./clients/base.py). This client interacts with OpenAI-like APIs by sending requests over HTTP and processing the responses. It also validates and structures responses using the tools provided.

- **ai_factory.py**
  Contains the [`AIClientFactory`](./clients/ai_factory.py) class, which is responsible for reading provider configuration from the defaults and creating the correct AI client instance.

- **ai_tools.py**
  Offers utility functions such as `extract_json` to assist with parsing the JSON response returned by an LLM. These helpers recover a JSON object even when the model wraps it in surrounding text.
@@ -22,11 +22,11 @@ This module provides the foundation for interacting with Large Language Models (
   The module reads the LLM configuration from the application defaults (using the `defaults` module). Make sure that the `llm` section in your configuration includes valid settings such as `enabled`, `provider`, `api_key`, `api_endpoint`, `model`, and `context_window`.

2. **Creating a Client:**
   Use the [`AIClientFactory`](./clients/ai_factory.py) to create an AI client instance. The factory checks the configured provider and returns a client (e.g., an instance of [`OpenAiClient`](./clients/openai_client.py)) that can be used to invoke the LLM. Note that `create_client` returns `None` if the LLM integration is disabled or the provider is not supported.

   Example:
   ```py
   from macaron.ai.clients.ai_factory import AIClientFactory

   factory = AIClientFactory()
   client = factory.create_client(system_prompt="You are a helpful assistant.")
   if client:
       response = client.invoke("Hello, how can you assist me?")
       print(response)
   ```

3. **Response Processing:**
   When a structured response is required, pass a Pydantic model class to the `invoke` method. The [`ai_tools.py`](./ai_tools.py) module takes care of parsing and validating the response to ensure it meets the expected structure.

## Logging and Error Handling

- The module uses Python's logging framework to report important events, such as token usage and warnings when prompts exceed the allowed context window.
- Configuration errors (e.g., missing API key or endpoint) are handled by raising descriptive exceptions such as [`ConfigurationError`](../errors.py).

## Extensibility

The design of the AI module is provider-agnostic. To add support for additional LLM providers:
- Implement a new client by subclassing [`AIClient`](./clients/base.py).
- Add the new client to the [`PROVIDER_MAPPING`](./clients/__init__.py).
- Update the configuration defaults accordingly. A minimal sketch of a new provider client is shown below.
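
For example, a minimal sketch of a new provider client could look like the following (the provider name and class are illustrative, not an existing Macaron API):

```py
from macaron.ai.clients.base import AIClient


class ExampleProviderClient(AIClient):
    """Sketch of a client for a hypothetical provider."""

    def invoke(
        self,
        user_prompt: str,
        temperature: float = 0.2,
        response_format: dict | None = None,
    ) -> dict:
        # Build the provider-specific HTTP request from self.params (endpoint, key, model),
        # send the prompt, and return the parsed JSON response.
        raise NotImplementedError
```

The client would then be registered with an entry such as `PROVIDER_MAPPING["example"] = ExampleProviderClient` and selected by setting `provider = example` in the `llm` section of the configuration.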
diff --git a/src/macaron/ai/ai_tools.py b/src/macaron/ai/ai_tools.py index e476376f9..d5704a80c 100644 --- a/src/macaron/ai/ai_tools.py +++ b/src/macaron/ai/ai_tools.py @@ -5,18 +5,14 @@ import json import logging import re -from typing import TypeVar - -from pydantic import BaseModel, ValidationError - -T = TypeVar("T", bound=BaseModel) +from typing import Any logger: logging.Logger = logging.getLogger(__name__) -def structure_response(response_text: str, response_model: type[T]) -> T | None: +def extract_json(response_text: str) -> Any: """ - Structure and parse the response from the LLM. + Parse the response from the LLM. If raw JSON parsing fails, attempts to extract a JSON object from text. @@ -24,13 +20,11 @@ def structure_response(response_text: str, response_model: type[T]) -> T | None: ---------- response_text: str The response text from the LLM. - response_model: Type[T] - The Pydantic model to structure the response against. Returns ------- - T | None - The structured Pydantic model instance. + dict[str, Any] | None + The structured JSON object. """ try: data = json.loads(response_text) @@ -46,8 +40,4 @@ def structure_response(response_text: str, response_model: type[T]) -> T | None: logger.debug("Failed to parse extracted JSON: %s", e) return None - try: - return response_model.model_validate(data) - except ValidationError as e: - logger.debug("Validation failed against response model: %s", e) - return None + return data diff --git a/src/macaron/ai/clients/__init__.py b/src/macaron/ai/clients/__init__.py new file mode 100644 index 000000000..7450cef22 --- /dev/null +++ b/src/macaron/ai/clients/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module provides a mapping of AI client providers to their respective client classes.""" + +from macaron.ai.clients.base import AIClient +from macaron.ai.clients.openai_client import OpenAiClient + +PROVIDER_MAPPING: dict[str, type[AIClient]] = {"openai": OpenAiClient} diff --git a/src/macaron/ai/ai_factory.py b/src/macaron/ai/clients/ai_factory.py similarity index 58% rename from src/macaron/ai/ai_factory.py rename to src/macaron/ai/clients/ai_factory.py index 9462ebf86..5df841efe 100644 --- a/src/macaron/ai/ai_factory.py +++ b/src/macaron/ai/clients/ai_factory.py @@ -5,8 +5,8 @@ import logging -from macaron.ai.ai_client import AIClient -from macaron.ai.openai_client import OpenAiClient +from macaron.ai.clients import PROVIDER_MAPPING +from macaron.ai.clients.base import AIClient from macaron.config.defaults import defaults from macaron.errors import ConfigurationError @@ -16,17 +16,15 @@ class AIClientFactory: """Factory to create AI clients based on provider configuration.""" - PROVIDER_MAPPING: dict[str, type[AIClient]] = {"openai": OpenAiClient} - def __init__(self) -> None: """ Initialize the AI client. The LLM configuration is read from defaults. 
""" - self.defaults = self._load_defaults() + self.params = self._load_defaults() - def _load_defaults(self) -> dict: + def _load_defaults(self) -> dict | None: section_name = "llm" default_values = { "enabled": False, @@ -34,19 +32,14 @@ def _load_defaults(self) -> dict: "api_key": "", "api_endpoint": "", "model": "", - "context_window": 10000, } if defaults.has_section(section_name): section = defaults[section_name] default_values["enabled"] = section.getboolean("enabled", default_values["enabled"]) - default_values["api_key"] = str(section.get("api_key", default_values["api_key"])).strip().lower() - default_values["api_endpoint"] = ( - str(section.get("api_endpoint", default_values["api_endpoint"])).strip().lower() - ) - default_values["model"] = str(section.get("model", default_values["model"])).strip().lower() - default_values["provider"] = str(section.get("provider", default_values["provider"])).strip().lower() - default_values["context_window"] = section.getint("context_window", 10000) + for key, default_value in default_values.items(): + if isinstance(default_value, str): + default_values[key] = str(section.get(key, default_value)).strip().lower() if default_values["enabled"]: for key, value in default_values.items(): @@ -59,12 +52,11 @@ def _load_defaults(self) -> dict: def create_client(self, system_prompt: str) -> AIClient | None: """Create an AI client based on the configured provider.""" - client_class = self.PROVIDER_MAPPING.get(self.defaults["provider"]) - if client_class is None: - logger.error("Provider '%s' is not supported.", self.defaults["provider"]) + if not self.params or not self.params["enabled"]: return None - return client_class(system_prompt, self.defaults) - def list_available_providers(self) -> list[str]: - """List all registered providers.""" - return list(self.PROVIDER_MAPPING.keys()) + client_class = PROVIDER_MAPPING.get(self.params["provider"]) + if client_class is None: + logger.error("Provider '%s' is not supported.", self.params["provider"]) + return None + return client_class(system_prompt, self.params) diff --git a/src/macaron/ai/ai_client.py b/src/macaron/ai/clients/base.py similarity index 62% rename from src/macaron/ai/ai_client.py rename to src/macaron/ai/clients/base.py index 35733e5d8..5177ae8aa 100644 --- a/src/macaron/ai/ai_client.py +++ b/src/macaron/ai/clients/base.py @@ -3,36 +3,28 @@ """This module defines the abstract AIClient class for implementing AI clients.""" -import logging from abc import ABC, abstractmethod -from typing import Any, TypeVar - -from pydantic import BaseModel - -T = TypeVar("T", bound=BaseModel) - -logger: logging.Logger = logging.getLogger(__name__) class AIClient(ABC): """This abstract class is used to implement ai clients.""" - def __init__(self, system_prompt: str, defaults: dict) -> None: + def __init__(self, system_prompt: str, params: dict) -> None: """ Initialize the AI client. The LLM configuration is read from defaults. """ self.system_prompt = system_prompt - self.defaults = defaults + self.params = params @abstractmethod def invoke( self, user_prompt: str, temperature: float = 0.2, - structured_output: type[T] | None = None, - ) -> Any: + response_format: dict | None = None, + ) -> dict: """ Invoke the LLM and optionally validate its response. @@ -42,12 +34,12 @@ def invoke( The user prompt to send to the LLM. temperature: float The temperature for the LLM response. - structured_output: Optional[Type[T]] - The Pydantic model to validate the response against. 
If provided, the response will be parsed and validated. + response_format: dict | None + The json schema to validate the response against. Returns ------- - Optional[T | str] - The validated Pydantic model instance if `structured_output` is provided, + dict + The validated schema if `response_format` is provided, or the raw string response if not. """ diff --git a/src/macaron/ai/openai_client.py b/src/macaron/ai/clients/openai_client.py similarity index 69% rename from src/macaron/ai/openai_client.py rename to src/macaron/ai/clients/openai_client.py index cd856745c..c788cab45 100644 --- a/src/macaron/ai/openai_client.py +++ b/src/macaron/ai/clients/openai_client.py @@ -8,8 +8,8 @@ from pydantic import BaseModel -from macaron.ai.ai_client import AIClient -from macaron.ai.ai_tools import structure_response +from macaron.ai.ai_tools import extract_json +from macaron.ai.clients.base import AIClient from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError from macaron.util import send_post_http_raw @@ -25,7 +25,7 @@ def invoke( self, user_prompt: str, temperature: float = 0.2, - structured_output: type[T] | None = None, + response_format: dict | None = None, max_tokens: int = 4000, timeout: int = 30, ) -> Any: @@ -38,8 +38,8 @@ def invoke( The user prompt to send to the LLM. temperature: float The temperature for the LLM response. - structured_output: Optional[Type[T]] - The Pydantic model to validate the response against. If provided, the response will be parsed and validated. + response_format: dict + The json schema to validate the response against. If provided, the response will be parsed and validated. max_tokens: int The maximum number of tokens for the LLM response. timeout: int @@ -56,28 +56,21 @@ def invoke( HeuristicAnalyzerValueError If there is an error in parsing or validating the response. """ - if not self.defaults["enabled"]: + if not self.params["enabled"]: raise ConfigurationError("AI client is not enabled. Please check your configuration.") - if len(user_prompt.split()) > self.defaults["context_window"]: - logger.warning( - "User prompt exceeds context window (%s words). 
" - "Truncating the prompt to fit within the context window.", - self.defaults["context_window"], - ) - user_prompt = " ".join(user_prompt.split()[: self.defaults["context_window"]]) - - headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.defaults["api_key"]}"} + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.params['api_key']}"} payload = { - "model": self.defaults["model"], + "model": self.params["model"], "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}], + "response_format": response_format, "temperature": temperature, "max_tokens": max_tokens, } try: response = send_post_http_raw( - url=self.defaults["api_endpoint"], json_data=payload, headers=headers, timeout=timeout + url=self.params["api_endpoint"], json_data=payload, headers=headers, timeout=timeout ) if not response: raise HeuristicAnalyzerValueError("No response received from the LLM.") @@ -89,11 +82,7 @@ def invoke( logger.info("LLM call token usage: %s", usage_str) message_content = response_json["choices"][0]["message"]["content"] - - if not structured_output: - logger.debug("Returning raw message content (no structured output requested).") - return message_content - return structure_response(message_content, structured_output) + return extract_json(message_content) except Exception as e: logger.error("Error during LLM invocation: %s", e) diff --git a/src/macaron/ai/prompts/__init__.py b/src/macaron/ai/prompts/__init__.py new file mode 100644 index 000000000..8e17a3508 --- /dev/null +++ b/src/macaron/ai/prompts/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. diff --git a/src/macaron/ai/schemas/__init__.py b/src/macaron/ai/schemas/__init__.py new file mode 100644 index 000000000..8e17a3508 --- /dev/null +++ b/src/macaron/ai/schemas/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 5f40e8ac5..980e18730 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -647,6 +647,3 @@ api_key = api_endpoint = # The model to use for the LLM service. model = -# The context window size for the LLM service. -# This is the maximum number of tokens that the LLM can process in a single request. -context_window = 10000 diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index 0286cda2c..fcd88f175 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -52,6 +52,9 @@ class Heuristics(str, Enum): #: Indicates that the package contains some code that doesn't match the docstrings. MATCHING_DOCSTRINGS = "matching_docstrings" + #: Indicates that the package description is inconsistent. 
+    INCONSISTENT_DESCRIPTION = "inconsistent_description"
+
 
 class HeuristicResult(str, Enum):
     """Result type indicating the outcome of a heuristic."""
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py
new file mode 100644
index 000000000..31fd7ac38
--- /dev/null
+++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This analyzer checks whether a PyPI package has an inconsistent description."""
+
+import logging
+
+from macaron.ai.clients.ai_factory import AIClientFactory
+from macaron.errors import HeuristicAnalyzerValueError
+from macaron.json_tools import JsonType, json_extract
+from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
+from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
+from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class InconsistentDescriptionAnalyzer(BaseHeuristicAnalyzer):
+    """Check whether the package's description is inconsistent."""
+
+    SYSTEM_PROMPT = """
+    You are a security expert analyzing a PyPI package. Determine whether the package description is consistent.
+    You must score it between 0 and 100 based on the following criteria:
+    - High-level description summary
+    - Benefit
+    - How to install
+    - How to use
+    - Are the "How to use", "High-level description summary", and "Benefit" sections consistent?
+
+    Wrap the output in `json` tags.
+    Your response must be a JSON object matching this schema:
+    {
+        "score": 0-100,
+        "reason": "A short explanation."
+    }
+    """
+
+    THRESHOLD = 60
+
+    RESPONSE_FORMAT = {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "result_schema",
+            "strict": True,
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "score": {
+                        "type": "integer",
+                        "minimum": 0,
+                        "maximum": 100,
+                        "description": "The final score from 0 to 100 after analysing the package's description.",
+                    },
+                    "reason": {
+                        "type": "string",
+                        "description": "The reason for the overall score. It should be a short sentence explaining the decision.",
+                    },
+                },
+                "required": ["score", "reason"],
+            },
+        },
+    }
+
+    def __init__(self) -> None:
+        super().__init__(
+            name="inconsistent_description_analyzer", heuristic=Heuristics.INCONSISTENT_DESCRIPTION, depends_on=None
+        )
+        factory = AIClientFactory()
+        self.client = factory.create_client(self.SYSTEM_PROMPT.strip())
+
+    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
+        """Analyze the package.
+
+        Parameters
+        ----------
+        pypi_package_json: PyPIPackageJsonAsset
+            The PyPI package JSON asset object.
+
+        Returns
+        -------
+        tuple[HeuristicResult, dict[str, JsonType]]:
+            The result and related information collected during the analysis.
+ """ + if not self.client: + return HeuristicResult.SKIP, {} + + package_json = pypi_package_json.package_json + info = package_json.get("info", {}) + if not info: + error_msg = "No package info found in metadata" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + description = json_extract(package_json, ["info", "description"], str) + if not description or not description.strip(): + return HeuristicResult.FAIL, {"message": "No description found."} + + analysis_result = self.client.invoke( + user_prompt=description, + response_format=self.RESPONSE_FORMAT, + ) + + if analysis_result["score"] < self.THRESHOLD: + return HeuristicResult.FAIL, { + "message": f"inconsistent description with score {analysis_result['score']}. because {analysis_result['reason']}" + } + return HeuristicResult.PASS, {"message": f"consistent description with a {analysis_result['score']} score."} diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py index bd5a864da..a1e7fd1c0 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py @@ -5,11 +5,8 @@ import logging import time -from typing import Literal -from pydantic import BaseModel, Field - -from macaron.ai.ai_factory import AIClientFactory +from macaron.ai.clients.ai_factory import AIClientFactory from macaron.json_tools import JsonType from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -18,23 +15,6 @@ logger: logging.Logger = logging.getLogger(__name__) -class Result(BaseModel): - """The result after analysing the code with its docstrings.""" - - decision: Literal["consistent", "inconsistent"] = Field( - description=""" The final decision after analysing the code with its docstrings. - It can be either 'consistent' or 'inconsistent'.""" - ) - reason: str = Field( - description=" The reason for the decision made. It should be a short sentence explaining the decision." - ) - inconsistent_code_part: str | None = Field( - default=None, - description=""" The specific part of the code that is inconsistent with the docstring. - Empty if the decision is 'consistent'.""", - ) - - class MatchingDocstringsAnalyzer(BaseHeuristicAnalyzer): """Check whether the docstrings and the code components are consistent.""" @@ -54,6 +34,35 @@ class MatchingDocstringsAnalyzer(BaseHeuristicAnalyzer): REQUEST_INTERVAL = 0.5 + RESPONSE_FORMAT = { + "type": "json_schema", + "json_schema": { + "name": "result_schema", + "strict": True, + "schema": { + "type": "object", + "properties": { + "decision": { + "type": "string", + "enum": ["consistent", "inconsistent"], + "description": """The final decision after analysing the code with its docstrings. + It can be either 'consistent' or 'inconsistent'.""", + }, + "reason": { + "type": "string", + "description": "The reason for the decision made.", + }, + "inconsistent_code_part": { + "type": ["string", "null"], + "description": """The specific part of the code that is inconsistent with the docstring. 
+                            Empty if the decision is 'consistent'.""",
+                    },
+                },
+                "required": ["decision", "reason", "inconsistent_code_part"],
+            },
+        },
+    }
+
     def __init__(self) -> None:
         super().__init__(
             name="matching_docstrings_analyzer",
@@ -61,12 +70,7 @@ def __init__(self) -> None:
             depends_on=None,
         )
         factory = AIClientFactory()
-        client = None
-
-        if factory.defaults["enabled"]:
-            client = factory.create_client(self.SYSTEM_PROMPT.strip())
-
-        self.client = client
+        self.client = factory.create_client(self.SYSTEM_PROMPT.strip())
 
     def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
         """Analyze the package.
@@ -95,12 +99,12 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
                 code_str = content.decode("utf-8", "ignore")
                 analysis_result = self.client.invoke(
                     user_prompt=code_str,
-                    structured_output=Result,
+                    response_format=self.RESPONSE_FORMAT,
                 )
-                if analysis_result and analysis_result.decision == "inconsistent":
+                if analysis_result["decision"] == "inconsistent":
                     return HeuristicResult.FAIL, {
                         "file": file,
-                        "reason": analysis_result.reason,
-                        "inconsistent part": analysis_result.inconsistent_code_part or "",
+                        "reason": analysis_result["reason"],
+                        "inconsistent part": analysis_result["inconsistent_code_part"] or "",
                     }
         return HeuristicResult.PASS, {}
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
index a09289713..304f3c9e7 100644
--- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
+++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -22,6 +22,7 @@
 from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
+from macaron.malware_analyzer.pypi_heuristics.metadata.inconsistent_description import InconsistentDescriptionAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects import SimilarProjectAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer
@@ -368,6 +369,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
             FakeEmailAnalyzer,
             SimilarProjectAnalyzer,
             MatchingDocstringsAnalyzer,
+            # This heuristic is not yet used in any rule combination below; more testing is needed before doing that.
+            InconsistentDescriptionAnalyzer,
         ]
 
         # name used to query the result of all problog rules, so it can be accessed outside the model.
diff --git a/tests/malware_analyzer/pypi/test_inconsistent_description.py b/tests/malware_analyzer/pypi/test_inconsistent_description.py
new file mode 100644
index 000000000..69a557c41
--- /dev/null
+++ b/tests/malware_analyzer/pypi/test_inconsistent_description.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+ +"""Tests for the InconsistentDescriptionAnalyzer heuristic.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from macaron.errors import HeuristicAnalyzerValueError +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.metadata.inconsistent_description import InconsistentDescriptionAnalyzer + + +@pytest.fixture(name="analyzer") +def analyzer_() -> InconsistentDescriptionAnalyzer: + """Pytest fixture to create an InconsistentDescriptionAnalyzer instance.""" + return InconsistentDescriptionAnalyzer() + + +@pytest.fixture(autouse=True) +def skip_if_client_disabled(analyzer: InconsistentDescriptionAnalyzer) -> None: + """ + Automatically skip tests in this file if the AI client is disabled. + """ + if not analyzer.client: + pytest.skip("AI client disabled - skipping test") + + +def test_analyze_consistent_description_pass( + analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer passes when the description is consistent.""" + pypi_package_json.package_json = {"info": {"description": "This is a test package."}} + mock_result = {"score": 80, "reason": "The description is consistent."} + + with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert isinstance(info["message"], str) + assert "consistent description with a 80 score" in info["message"] + mock_invoke.assert_called_once() + + +def test_analyze_inconsistent_description_fail( + analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer fails when the description is inconsistent.""" + pypi_package_json.package_json = {"info": {"description": "This is a misleading package."}} + mock_result = {"score": 30, "reason": "The description is misleading."} + + with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert isinstance(info["message"], str) + assert "inconsistent description with score 30" in info["message"] + assert "because The description is misleading" in info["message"] + mock_invoke.assert_called_once() + + +def test_analyze_no_description_fail(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock) -> None: + """Test the analyzer fails if there is no description.""" + pypi_package_json.package_json = {"info": {"description": " "}} + with patch.object(analyzer.client, "invoke") as mock_invoke: + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["message"] == "No description found." 
+        mock_invoke.assert_not_called()
+
+
+def test_analyze_no_info_raises_error(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock) -> None:
+    """Test the analyzer raises an error if the package JSON has no 'info' field."""
+    pypi_package_json.package_json = {}
+    with pytest.raises(HeuristicAnalyzerValueError):
+        analyzer.analyze(pypi_package_json)

From d0a0d65939cc34351c70c780202815b2b44c2efc Mon Sep 17 00:00:00 2001
From: Amine
Date: Mon, 25 Aug 2025 18:13:23 +0100
Subject: [PATCH 4/5] refactor: move threshold configuration to defaults.ini

Signed-off-by: Amine
---
 src/macaron/config/defaults.ini                |  3 +++
 .../metadata/inconsistent_description.py       | 14 +++++++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini
index 980e18730..f4ae2cd2a 100644
--- a/src/macaron/config/defaults.ini
+++ b/src/macaron/config/defaults.ini
@@ -609,6 +609,9 @@ popular_packages_path =
 # A boolean value that determines whether to check the deliverability of the email address.
 check_deliverability = True
 
+# The minimum score for a package description to be considered consistent.
+score_threshold = 70
+
 # ==== The following sections are for source code analysis using Semgrep ====
 # rulesets: a reference to a 'ruleset' in this section refers to a Semgrep .yaml file containing one or more rules.
# rules: a reference to a 'rule' in this section refers to an individual rule ID, specified by the '- id:' field in diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py index 31fd7ac38..8aa5a61e9 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py @@ -6,6 +6,7 @@ import logging from macaron.ai.clients.ai_factory import AIClientFactory +from macaron.config.defaults import defaults from macaron.errors import HeuristicAnalyzerValueError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer @@ -35,8 +36,6 @@ class InconsistentDescriptionAnalyzer(BaseHeuristicAnalyzer): } """ - THRESHOLD = 60 - RESPONSE_FORMAT = { "type": "json_schema", "json_schema": { @@ -65,9 +64,18 @@ def __init__(self) -> None: super().__init__( name="inconsistent_description_analyzer", heuristic=Heuristics.INCONSISTENT_DESCRIPTION, depends_on=None ) + self.threshold = self._load_defaults() factory = AIClientFactory() self.client = factory.create_client(self.SYSTEM_PROMPT.strip()) + def _load_defaults(self) -> int: + """Load the default values from defaults.ini.""" + section_name = "heuristic.pypi" + if defaults.has_section(section_name): + section = defaults[section_name] + return section.getint("score_threshold", 70) + return 70 + def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the package. @@ -100,7 +108,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes response_format=self.RESPONSE_FORMAT, ) - if analysis_result["score"] < self.THRESHOLD: + if analysis_result["score"] < self.threshold: return HeuristicResult.FAIL, { "message": f"inconsistent description with score {analysis_result['score']}. 
because {analysis_result['reason']}" } From 2df3401eda56ea241f91acc222d363b7116a217f Mon Sep 17 00:00:00 2001 From: Amine Date: Sat, 6 Sep 2025 02:19:19 +0100 Subject: [PATCH 5/5] chore(tests): improve test coverage and apply minor heuristic changes Signed-off-by: Amine --- src/macaron/ai/clients/openai_client.py | 2 + .../metadata/inconsistent_description.py | 3 + .../sourcecode/matching_docstrings.py | 8 ++ .../slsa_analyzer/build_tool/gradle.py | 89 ------------ src/macaron/slsa_analyzer/build_tool/maven.py | 68 --------- src/macaron/slsa_analyzer/build_tool/pip.py | 5 - .../slsa_analyzer/build_tool/poetry.py | 5 - .../checks/detect_malicious_metadata_check.py | 4 +- .../pypi/test_inconsistent_description.py | 134 +++++++++++++++--- .../pypi/test_matching_docstrings.py | 70 ++++----- 10 files changed, 163 insertions(+), 225 deletions(-) diff --git a/src/macaron/ai/clients/openai_client.py b/src/macaron/ai/clients/openai_client.py index c788cab45..772cc61d8 100644 --- a/src/macaron/ai/clients/openai_client.py +++ b/src/macaron/ai/clients/openai_client.py @@ -27,6 +27,7 @@ def invoke( temperature: float = 0.2, response_format: dict | None = None, max_tokens: int = 4000, + seed: int = 42, timeout: int = 30, ) -> Any: """ @@ -65,6 +66,7 @@ def invoke( "messages": [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": user_prompt}], "response_format": response_format, "temperature": temperature, + "seed": seed, "max_tokens": max_tokens, } diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py index 8aa5a61e9..5ca7d8ce1 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/inconsistent_description.py @@ -107,6 +107,9 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes user_prompt=description, response_format=self.RESPONSE_FORMAT, ) + if not analysis_result: + logger.error("LLM returned invalid response, skipping the analysis.") + return HeuristicResult.SKIP, {} if analysis_result["score"] < self.threshold: return HeuristicResult.FAIL, { diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py index a1e7fd1c0..51cf6eb8d 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/matching_docstrings.py @@ -93,6 +93,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes logger.warning("No source code found for the package, skipping the matching docstrings analysis.") return HeuristicResult.SKIP, {} + none_attempts = 5 for file, content in pypi_package_json.iter_sourcecode(): if file.endswith(".py"): time.sleep(self.REQUEST_INTERVAL) # Respect the request interval to avoid rate limiting. 
@@ -101,6 +102,13 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes user_prompt=code_str, response_format=self.RESPONSE_FORMAT, ) + if not analysis_result: + none_attempts -= 1 + if none_attempts == 0: + logger.error("LLM returned None multiple times, skipping the analysis.") + return HeuristicResult.SKIP, {} + continue + if analysis_result["decision"] == "inconsistent": return HeuristicResult.FAIL, { "file": file, diff --git a/src/macaron/slsa_analyzer/build_tool/gradle.py b/src/macaron/slsa_analyzer/build_tool/gradle.py index c67f428cd..bd316dd30 100644 --- a/src/macaron/slsa_analyzer/build_tool/gradle.py +++ b/src/macaron/slsa_analyzer/build_tool/gradle.py @@ -1,5 +1,4 @@ # Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the Gradle class which inherits BaseBuildTool. @@ -70,94 +69,6 @@ def is_detected(self, repo_path: str) -> bool: gradle_config_files = self.build_configs + self.entry_conf return any(file_exists(repo_path, file) for file in gradle_config_files) - def prepare_config_files(self, wrapper_path: str, build_dir: str) -> bool: - """Prepare the necessary wrapper files for running the build. - - This method will return False if there is any errors happened during operation. - - Parameters - ---------- - wrapper_path : str - The path where all necessary wrapper files are located. - build_dir : str - The path of the build dir. This is where all files are copied to. - - Returns - ------- - bool - True if succeed else False. - """ - # The path of the needed wrapper files - wrapper_files = self.wrapper_files - - if copy_file_bulk(wrapper_files, wrapper_path, build_dir): - # Ensure that gradlew is executable. - file_path = os.path.join(build_dir, "gradlew") - status = os.stat(file_path) - if oct(status.st_mode)[-3:] != "744": - logger.debug("%s does not have 744 permission. Changing it to 744.") - os.chmod(file_path, 0o744) - return True - - return False - - def get_dep_analyzer(self) -> CycloneDxGradle: - """Create a DependencyAnalyzer for the Gradle build tool. - - Returns - ------- - CycloneDxGradle - The CycloneDxGradle object. - - Raises - ------ - DependencyAnalyzerError - """ - if "dependency.resolver" not in defaults or "dep_tool_gradle" not in defaults["dependency.resolver"]: - raise DependencyAnalyzerError("No default dependency analyzer is found.") - if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_gradle")): - raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", - ) - - tool_name, tool_version = tuple( - defaults.get( - "dependency.resolver", - "dep_tool_gradle", - fallback="cyclonedx-gradle:1.7.3", - ).split(":") - ) - if tool_name == DependencyTools.CYCLONEDX_GRADLE: - return CycloneDxGradle( - resources_path=global_config.resources_path, - file_name="bom.json", - tool_name=tool_name, - tool_version=tool_version, - ) - - raise DependencyAnalyzerError(f"Unsupported SBOM generator for Gradle: {tool_name}.") - - def get_gradle_exec(self, repo_path: str) -> str: - """Get the Gradle executable for the repo. - - Parameters - ---------- - repo_path: str - The absolute path to a repository containing Gradle projects. - - Returns - ------- - str - The absolute path to the Gradle executable. 
- """ - # We try to use the gradlew that comes with the repository first. - repo_gradlew = os.path.join(repo_path, "gradlew") - if os.path.isfile(repo_gradlew) and os.access(repo_gradlew, os.X_OK): - return repo_gradlew - - # We use Macaron's built-in gradlew as a fallback option. - return os.path.join(os.path.join(macaron.MACARON_PATH, "resources"), "gradlew") - def get_group_id(self, gradle_exec: str, project_path: str) -> str | None: """Get the group id of a Gradle project. diff --git a/src/macaron/slsa_analyzer/build_tool/maven.py b/src/macaron/slsa_analyzer/build_tool/maven.py index 922fb7b71..0e89849af 100644 --- a/src/macaron/slsa_analyzer/build_tool/maven.py +++ b/src/macaron/slsa_analyzer/build_tool/maven.py @@ -64,71 +64,3 @@ def is_detected(self, repo_path: str) -> bool: return False maven_config_files = self.build_configs return any(file_exists(repo_path, file) for file in maven_config_files) - - def prepare_config_files(self, wrapper_path: str, build_dir: str) -> bool: - """Prepare the necessary wrapper files for running the build. - - This method will return False if there is any errors happened during operation. - - Parameters - ---------- - wrapper_path : str - The path where all necessary wrapper files are located. - build_dir : str - The path of the build dir. This is where all files are copied to. - - Returns - ------- - bool - True if succeed else False. - """ - # The path of the needed wrapper files - wrapper_files = self.wrapper_files - - if copy_file_bulk(wrapper_files, wrapper_path, build_dir): - # Ensure that mvnw is executable. - file_path = os.path.join(build_dir, "mvnw") - status = os.stat(file_path) - if oct(status.st_mode)[-3:] != "744": - logger.debug("%s does not have 744 permission. Changing it to 744.") - os.chmod(file_path, 0o744) - return True - - return False - - def get_dep_analyzer(self) -> CycloneDxMaven: - """ - Create a DependencyAnalyzer for the Maven build tool. - - Returns - ------- - CycloneDxMaven - The CycloneDxMaven object. - - Raises - ------ - DependencyAnalyzerError - """ - if "dependency.resolver" not in defaults or "dep_tool_maven" not in defaults["dependency.resolver"]: - raise DependencyAnalyzerError("No default dependency analyzer is found.") - if not DependencyAnalyzer.tool_valid(defaults.get("dependency.resolver", "dep_tool_maven")): - raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_maven')} is not valid.", - ) - - tool_name, tool_version = tuple( - defaults.get( - "dependency.resolver", - "dep_tool_maven", - fallback="cyclonedx-maven:2.6.2", - ).split(":") - ) - if tool_name == DependencyTools.CYCLONEDX_MAVEN: - return CycloneDxMaven( - resources_path=global_config.resources_path, - file_name="bom.json", - tool_name=tool_name, - tool_version=tool_version, - ) - - raise DependencyAnalyzerError(f"Unsupported SBOM generator for Maven: {tool_name}.") diff --git a/src/macaron/slsa_analyzer/build_tool/pip.py b/src/macaron/slsa_analyzer/build_tool/pip.py index 073380ec2..1926ca33b 100644 --- a/src/macaron/slsa_analyzer/build_tool/pip.py +++ b/src/macaron/slsa_analyzer/build_tool/pip.py @@ -64,11 +64,6 @@ def get_dep_analyzer(self) -> DependencyAnalyzer: DependencyAnalyzer The DependencyAnalyzer object. 
""" - tool_name = "cyclonedx_py" - if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"): - raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", - ) return CycloneDxPython( resources_path=global_config.resources_path, file_name="python_sbom.json", diff --git a/src/macaron/slsa_analyzer/build_tool/poetry.py b/src/macaron/slsa_analyzer/build_tool/poetry.py index 3e928dfca..a1d5a4a0d 100644 --- a/src/macaron/slsa_analyzer/build_tool/poetry.py +++ b/src/macaron/slsa_analyzer/build_tool/poetry.py @@ -102,11 +102,6 @@ def get_dep_analyzer(self) -> DependencyAnalyzer: DependencyAnalyzer The DependencyAnalyzer object. """ - tool_name = "cyclonedx_py" - if not DependencyAnalyzer.tool_valid(f"{tool_name}:{cyclonedx_version}"): - raise DependencyAnalyzerError( - f"Dependency analyzer {defaults.get('dependency.resolver', 'dep_tool_gradle')} is not valid.", - ) return CycloneDxPython( resources_path=global_config.resources_path, file_name="python_sbom.json", diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 304f3c9e7..c3233bce1 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -450,8 +450,9 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: failed({Heuristics.SIMILAR_PROJECTS.value}), failed({Heuristics.HIGH_RELEASE_FREQUENCY.value}), failed({Heuristics.FAKE_EMAIL.value}). + % Package released with a name similar to a popular package. - {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_3) :- + {Confidence.MEDIUM.value}::trigger(malware_medium_confidence_5) :- quickUndetailed, forceSetup, failed({Heuristics.MATCHING_DOCSTRINGS.value}). % ----- Evaluation ----- @@ -461,6 +462,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: {problog_result_access} :- trigger(malware_high_confidence_2). {problog_result_access} :- trigger(malware_high_confidence_3). {problog_result_access} :- trigger(malware_high_confidence_4). + {problog_result_access} :- trigger(malware_medium_confidence_5). {problog_result_access} :- trigger(malware_medium_confidence_4). {problog_result_access} :- trigger(malware_medium_confidence_3). {problog_result_access} :- trigger(malware_medium_confidence_2). 
diff --git a/tests/malware_analyzer/pypi/test_inconsistent_description.py b/tests/malware_analyzer/pypi/test_inconsistent_description.py index 69a557c41..51b96a166 100644 --- a/tests/malware_analyzer/pypi/test_inconsistent_description.py +++ b/tests/malware_analyzer/pypi/test_inconsistent_description.py @@ -27,35 +27,16 @@ def skip_if_client_disabled(analyzer: InconsistentDescriptionAnalyzer) -> None: pytest.skip("AI client disabled - skipping test") -def test_analyze_consistent_description_pass( - analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock -) -> None: - """Test the analyzer passes when the description is consistent.""" - pypi_package_json.package_json = {"info": {"description": "This is a test package."}} - mock_result = {"score": 80, "reason": "The description is consistent."} - - with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.PASS - assert isinstance(info["message"], str) - assert "consistent description with a 80 score" in info["message"] - mock_invoke.assert_called_once() - - def test_analyze_inconsistent_description_fail( analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock ) -> None: """Test the analyzer fails when the description is inconsistent.""" pypi_package_json.package_json = {"info": {"description": "This is a misleading package."}} - mock_result = {"score": 30, "reason": "The description is misleading."} - with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.FAIL - assert isinstance(info["message"], str) - assert "inconsistent description with score 30" in info["message"] - assert "because The description is misleading" in info["message"] - mock_invoke.assert_called_once() + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert isinstance(info["message"], str) + assert info["message"].startswith("inconsistent description with score") def test_analyze_no_description_fail(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock) -> None: @@ -73,3 +54,110 @@ def test_analyze_no_info_raises_error(analyzer: InconsistentDescriptionAnalyzer, pypi_package_json.package_json = {} with pytest.raises(HeuristicAnalyzerValueError): analyzer.analyze(pypi_package_json) + + +CONSISTENT_DESCRIPTION = """ +# Requests + +**Requests** is a simple, yet elegant, HTTP library. + +Requests allows you to send HTTP/1.1 requests extremely easily. +There’s no need to manually add query strings to your URLs, +or to form-encode your `PUT` & `POST` data — but nowadays, just use the `json` method! + +Requests is one of the most downloaded Python packages today, +pulling in around `30M downloads / week`— according to GitHub, +Requests is currently +[depended upon](https://github.com/psf/requests/network/dependents?package_id=UGFja2FnZS01NzA4OTExNg%3D%3D) +by `1,000,000+` repositories. +You may certainly put your trust in this code. 
+ +[![Downloads](https://static.pepy.tech/badge/requests/month)](https://pepy.tech/project/requests) +[![Supported Versions](https://img.shields.io/pypi/pyversions/requests.svg)](https://pypi.org/project/requests) +[![Contributors](https://img.shields.io/github/contributors/psf/requests.svg)](https://github.com/psf/requests/graphs/contributors) + +## Installing Requests and Supported Versions + +Requests is available on PyPI: + +```console +$ python -m pip install requests +``` + +Requests officially supports Python 3.9+. + +## Supported Features & Best–Practices + +Requests is ready for the demands of building robust and reliable HTTP–speaking applications, for the needs of today. + +- Keep-Alive & Connection Pooling +- International Domains and URLs +- Sessions with Cookie Persistence +- Browser-style TLS/SSL Verification +- Basic & Digest Authentication +- Familiar `dict`–like Cookies +- Automatic Content Decompression and Decoding +- Multi-part File Uploads +- SOCKS Proxy Support +- Connection Timeouts +- Streaming Downloads +- Automatic honoring of `.netrc` +- Chunked HTTP Requests + +## API Reference and User Guide available on [Read the Docs](https://requests.readthedocs.io) + +[![Read the Docs](https://raw.githubusercontent.com/psf/requests/main/ext/ss.png)](https://requests.readthedocs.io) + +## Cloning the repository + +When cloning the Requests repository, you may need to add the `-c +fetch.fsck.badTimezone=ignore` flag to avoid an error about a bad commit timestamp (see +[this issue](https://github.com/psf/requests/issues/2690) for more background): + +```shell +git clone -c fetch.fsck.badTimezone=ignore https://github.com/psf/requests.git +``` + +You can also apply this setting to your global Git config: + +```shell +git config --global fetch.fsck.badTimezone ignore +``` + +--- + +[![Kenneth Reitz](https://raw.githubusercontent.com/psf/requests/main/ext/kr.png)](https://kennethreitz.org) +[![Python Software Foundation](https://raw.githubusercontent.com/psf/requests/main/ext/psf.png)](https://www.python.org/psf) +""" + + +def test_analyze_consistent_description_pass( + analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer passes when the description is consistent.""" + pypi_package_json.package_json = { + "info": { + "description": CONSISTENT_DESCRIPTION, + } + } + + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert isinstance(info["message"], str) + assert info["message"].startswith("consistent description with a ") + + +def test_analyze_excessive_llm_invocation_error_skip( + analyzer: InconsistentDescriptionAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer skips if the LLM invocation returns None multiple times.""" + pypi_package_json.package_json = { + "info": { + "description": "description", + } + } + + with patch.object(analyzer.client, "invoke", return_value=None): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert not info diff --git a/tests/malware_analyzer/pypi/test_matching_docstrings.py b/tests/malware_analyzer/pypi/test_matching_docstrings.py index dbdcddebe..dcad994de 100644 --- a/tests/malware_analyzer/pypi/test_matching_docstrings.py +++ b/tests/malware_analyzer/pypi/test_matching_docstrings.py @@ -31,17 +31,9 @@ def test_analyze_consistent_docstrings_pass(analyzer: MatchingDocstringsAnalyzer pypi_package_json.download_sourcecode.return_value = True 
pypi_package_json.iter_sourcecode.return_value = [("test.py", b"def func():\n '''docstring'''\n pass")] - mock_result = { - "decision": "consistent", - "reason": "The code is consistent with the docstring.", - "inconsistent_code_part": None, - } - - with patch.object(analyzer.client, "invoke", return_value=mock_result) as mock_invoke: - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.PASS - assert not info - mock_invoke.assert_called_once() + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info def test_analyze_inconsistent_docstrings_fail( @@ -50,42 +42,39 @@ def test_analyze_inconsistent_docstrings_fail( """Test the analyzer fails when docstrings are inconsistent with the code.""" pypi_package_json.download_sourcecode.return_value = True pypi_package_json.iter_sourcecode.return_value = [ - ("test.py", b"def func():\n '''docstring'''\n print('hello')") + ( + "test.py", + b""" + def factorial(number: int): + '''A function that returns the factorial of a number''' + return number * 2 + print('hello') + """, + ), ] - mock_result = { - "decision": "inconsistent", - "reason": "The docstring does not mention the print statement.", - "inconsistent_code_part": "print('hello')", - } - - with patch.object(analyzer.client, "invoke", return_value=mock_result): - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.FAIL - assert info["file"] == "test.py" - assert info["reason"] == "The docstring does not mention the print statement." - assert info["inconsistent part"] == "print('hello')" + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.FAIL + assert info["file"] == "test.py" def test_analyze_no_source_code_skip(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: """Test the analyzer skips if the source code cannot be downloaded.""" pypi_package_json.download_sourcecode.return_value = False - with patch.object(analyzer.client, "invoke") as mock_invoke: - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.SKIP - assert not info - mock_invoke.assert_not_called() + + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert not info def test_analyze_no_python_files_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: """Test the analyzer passes if there are no Python files in the source code.""" pypi_package_json.download_sourcecode.return_value = True pypi_package_json.iter_sourcecode.return_value = [("README.md", b"This is a test package.")] - with patch.object(analyzer.client, "invoke") as mock_invoke: - result, info = analyzer.analyze(pypi_package_json) - assert result == HeuristicResult.PASS - assert not info - mock_invoke.assert_not_called() + + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.PASS + assert not info def test_analyze_llm_invocation_error_pass(analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock) -> None: @@ -97,3 +86,16 @@ def test_analyze_llm_invocation_error_pass(analyzer: MatchingDocstringsAnalyzer, result, info = analyzer.analyze(pypi_package_json) assert result == HeuristicResult.PASS assert not info + + +def test_analyze_excessive_llm_invocation_error_skip( + analyzer: MatchingDocstringsAnalyzer, pypi_package_json: MagicMock +) -> None: + """Test the analyzer skips if the LLM invocation returns None multiple times.""" + 
pypi_package_json.download_sourcecode.return_value = True + pypi_package_json.iter_sourcecode.return_value = [("test.py", b"def func():\n pass") for _ in range(5)] + + with patch.object(analyzer.client, "invoke", return_value=None): + result, info = analyzer.analyze(pypi_package_json) + assert result == HeuristicResult.SKIP + assert not info