@article{NGUYEN2025112455,
title = {Human-understandable explanation for software vulnerability prediction},
journal = {Journal of Systems and Software},
pages = {112455},
year = {2025},
issn = {0164-1212},
doi = {https://doi.org/10.1016/j.jss.2025.112455},
url = {https://www.sciencedirect.com/science/article/pii/S0164121225001232},
author = {Hong Quy Nguyen and Thong Hoang and Hoa Khanh Dam and Guoxin Su and Zhenchang Xing and Qinghua Lu and Jiamou Sun},
keywords = {Vulnerability prediction, Explainable AI, Text generation, Key aspects},
abstract = {Recent advances in deep learning have significantly improved the performance of software vulnerability prediction (SVP). To enhance trustworthiness, the SVP highlights predicted lines of code (LoC) that may be vulnerable. However, providing LoC alone is often insufficient for software practitioners, as it lacks detailed information about the nature of the vulnerability. This paper introduces a novel framework that is built on SVP by offering additional explanatory information based on the suggested LoC. Similar to security reports, our framework comprehensively explains the vulnerability aspects, such as Root Cause, Impact, Attack Vector, and Vulnerability Type. The proposed framework is powered by transformer architectures. Specifically, we leverage pre-trained language models for code to fine-tune on two practical datasets: BigVul and Vulnerability Key Aspect, ensuring our framework’s applicability to real-world scenarios. Experiments using the ROUGE and BLEU scores as evaluation metrics show that our framework achieves better performance with CodeT5+, statistically outperforming a baseline study in generating key vulnerability aspects. Additionally, we conducted a small-scale user study with experienced software practitioners to assess the effectiveness of the framework. The results show that 72% of the participants found our framework helpful in accepting the SVP results, and 68% rated the additional explanations as moderately to extremely useful. Editor’s note: Open Science material was validated by the Journal of Systems and Software Open Science Board.}
}
- Run via Docker:
./run_docker.sh
- Or create and activate the conda environment:
conda env create -f binder/environment.yml
conda activate vul-intext-reason
Install other dependencies in the OS:
- clang-format (Ubuntu clang-format version 14.0.0-1ubuntu1)
- Graphviz
sudo apt install clang-format
sudo apt install graphviz
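A quick way to confirm both dependencies are on the PATH (just a sanity check, not part of the original instructions):
# Verify the OS-level dependencies installed above
clang-format --version   # should report the clang-format version installed above
dot -V                   # Graphviz prints its version to stderr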
- To update the environment from the file, or to export the current environment back into it:
conda env update --file binder/environment.yml --prune
conda env export --from-history -f binder/environment.yml
- First, locate the library: find / -name "libstdc++.so*"
then create a symbolic link to it in the appropriate location (a sketch follows).
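A minimal sketch of creating that link; both paths below are placeholders, so substitute the location reported by find and the location the error message (or the tool) expects:
# Hypothetical paths: point the expected location at the libstdc++ copy found above
sudo ln -sf /path/to/found/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libstdc++.so.6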
- Merge BigVul and VKA:
explore_data61.ipynb
- Apply LineVul:
apply_linevul_parse_data.ipynb
Final data can be downloaded at https://drive.google.com/file/d/1ZxGaSg4L3lGq94SYgngjR_CnZtNpEvtc/view?usp=sharing ; unzip it and rename the extracted folder to .aspect_bigvul_new (a command-line sketch follows).
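One possible command-line route, assuming gdown is available for downloading from Google Drive; the output file name and the extracted folder name below are placeholders:
pip install gdown
# File ID taken from the sharing link above
gdown "https://drive.google.com/uc?id=1ZxGaSg4L3lGq94SYgngjR_CnZtNpEvtc" -O aspect_bigvul_new.zip
unzip aspect_bigvul_new.zip
mv <extracted_folder> .aspect_bigvul_new   # rename whatever the archive unpacks to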
- Run the main experiment scripts:
./run_t5p_new.sh
./run_bert_seq2seq_new.sh
./run_t5p_percentage.sh
- Run Elasticsearch:
docker run -d --name elasticsearch \
-p 0.0.0.0:9200:9200 -p 9300:9300 \
-e "discovery.type=single-node" \
-e "xpack.security.enabled=false" \
-v /data/elasticsearchData/:/usr/share/elasticsearch/data \
docker.elastic.co/elasticsearch/elasticsearch:8.15.0
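Before running the RAG notebooks, you can check that the container is answering (a simple sanity check, not part of the original instructions):
# Elasticsearch returns a small JSON blob with cluster info once it is ready
curl http://localhost:9200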
- Run RAG(CodeT5+) or RAG(BM25): use the notebooks
rag_baseline.ipynb and rag_baseline_BM25.ipynb
- For few-shot learning:
./run_3shot.sh