Skip to content

Commit 18c938b

Browse files
committed
MXFP4 quants
Signed-off-by: Prabhu Subramanian <prabhu@appthreat.com>
1 parent d7a6d66 commit 18c938b

File tree

3 files changed: +34 additions, −4 deletions

contrib/fine-tuning/convert-gguf.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ set -e
1616
# cmake -B build
1717
# cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
1818

19+
# Run 'ollama serve' in a separate terminal
20+
1921
export TOKENIZERS_PARALLELISM=false
2022
LLAMA_CPP_PATH=/Users/appthreat/work/llama.cpp
2123
cd $LLAMA_CPP_PATH
@@ -52,12 +54,14 @@ GGUF_MODEL_Q8_0_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q8_0-${FORMAT}
5254
GGUF_MODEL_Q8_0_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q8_0-${FORMAT}
5355
FUSED_MODEL=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
5456

57+
# Direct conversion to 8-bit from the fused BF16 version
5558
rm -rf ${GGUF_MODEL_Q8_0_PATH}
5659
mkdir -p ${GGUF_MODEL_Q8_0_PATH}
5760
python convert_hf_to_gguf.py --outtype q8_0 --outfile ${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q8_0-${FORMAT}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-q8_0.gguf --model-name ${GGUF_MODEL_Q8_0_NAME} ${FUSED_MODEL}
5861
cp ${MODEL_FILE_PATH} ${GGUF_MODEL_Q8_0_PATH}/Modelfile
5962
cp ${FUSED_MODEL}/*.json ${FUSED_MODEL}/merges.txt ${GGUF_MODEL_Q8_0_PATH}/
6063

64+
# BF16
6165
GGUF_MODEL_BF16_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-BF16-${FORMAT}
6266
GGUF_MODEL_BF16_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-BF16-${FORMAT}
6367
rm -rf ${GGUF_MODEL_BF16_PATH}
@@ -67,6 +71,16 @@ cp ${MODEL_FILE_PATH} ${GGUF_MODEL_BF16_PATH}/Modelfile
6771
sed -i '' 's|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-q8_0.gguf|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-bf16.gguf|g' ${GGUF_MODEL_BF16_PATH}/Modelfile
6872
cp ${FUSED_MODEL}/*.json ${FUSED_MODEL}/merges.txt ${GGUF_MODEL_BF16_PATH}/
6973

74+
# MXFP4 - MOE only
75+
GGUF_MODEL_MXFP4_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4-${FORMAT}
76+
GGUF_MODEL_MXFP4_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4-${FORMAT}
77+
rm -rf ${GGUF_MODEL_MXFP4_PATH}
78+
mkdir -p ${GGUF_MODEL_MXFP4_PATH}
79+
llama-quantize ${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-BF16-${FORMAT}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-bf16.gguf ${GGUF_MODEL_MXFP4_PATH}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4.gguf MXFP4_MOE
80+
cp ${MODEL_FILE_PATH} ${GGUF_MODEL_MXFP4_PATH}/Modelfile
81+
sed -i '' 's|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-q8_0.gguf|./${TOOL_BASE_MODEL}-${PARAM_SIZE}-MXFP4.gguf|g' ${GGUF_MODEL_MXFP4_PATH}/Modelfile
82+
cp ${FUSED_MODEL}/*.json ${FUSED_MODEL}/merges.txt ${GGUF_MODEL_MXFP4_PATH}/
83+
7084
if [ "$TOOL_BASE_MODEL" == "cdx1-mini" ] || [ "$TOOL_BASE_MODEL" == "cdx1-nano" ]; then
7185
GGUF_MODEL_Q6_K_NAME=${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q6_K-${FORMAT}
7286
GGUF_MODEL_Q6_K_PATH=${CDXGEN_FT_PATH}/${HF_ORG}/${TOOL_BASE_MODEL}-${PARAM_SIZE}-Q6_K-${FORMAT}
@@ -114,6 +128,7 @@ fi
114128
export HF_HUB_ENABLE_HF_TRANSFER=0
115129
hf auth whoami
116130
hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_Q8_0_NAME} ${GGUF_MODEL_Q8_0_PATH} .
131+
hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_MXFP4_NAME} ${GGUF_MODEL_MXFP4_PATH} .
117132
if [ "$TOOL_BASE_MODEL" == "cdx1-mini" ] || [ "$TOOL_BASE_MODEL" == "cdx1-nano" ]; then
118133
hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_Q6_K_NAME} ${GGUF_MODEL_Q6_K_PATH} .
119134
else
@@ -123,11 +138,18 @@ else
123138
fi
124139
hf upload --quiet --exclude "**/README.md" --repo-type model ${GGUF_MODEL_BF16_NAME} ${GGUF_MODEL_BF16_PATH} .
125140

141+
### upload to ollama registry. Move this to a separate script in the future.
142+
126143
ollama pull hf.co/${GGUF_MODEL_Q8_0_NAME}
127144
ollama cp hf.co/${GGUF_MODEL_Q8_0_NAME} ${GGUF_MODEL_Q8_0_NAME}
128145
ollama push ${GGUF_MODEL_Q8_0_NAME}
129146
ollama rm hf.co/${GGUF_MODEL_Q8_0_NAME}
130147

148+
ollama pull hf.co/${GGUF_MODEL_MXFP4_NAME}
149+
ollama cp hf.co/${GGUF_MODEL_MXFP4_NAME} ${GGUF_MODEL_MXFP4_NAME}
150+
ollama push ${GGUF_MODEL_MXFP4_NAME}
151+
ollama rm hf.co/${GGUF_MODEL_MXFP4_NAME}
152+
131153
if [ "$TOOL_BASE_MODEL" == "cdx1-mini" ] || [ "$TOOL_BASE_MODEL" == "cdx1-nano" ]; then
132154
ollama pull hf.co/${GGUF_MODEL_Q6_K_NAME}
133155
ollama cp hf.co/${GGUF_MODEL_Q6_K_NAME} ${GGUF_MODEL_Q6_K_NAME}

contrib/fine-tuning/fine-tune-mlx.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
4545
QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
4646
QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
4747
QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit
48+
QUANT_MODEL_MXFP4=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-MXFP4
4849
DWQ_QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit-DWQ
4950

5051
### mlx-lm needs train.jsonl and valid.jsonl
@@ -109,6 +110,11 @@ mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_6BIT} -q --q-bi
109110
echo "Test ${QUANT_MODEL_6BIT} with the prompt 'Tell me about cdxgen'. Must yield a better response."
110111
mlx_lm.generate --model ./${QUANT_MODEL_6BIT} --prompt "Tell me about cdxgen" --temp ${TEMP} --max-tokens ${MAX_TOKENS}
111112

113+
rm -rf ${QUANT_MODEL_MXFP4}
114+
mlx_lm.convert --hf-path ${FUSED_MODEL} --mlx-path ${QUANT_MODEL_MXFP4} -q --q-bits 4 --q-group-size 32 --q-mode mxfp4 --dtype bfloat16
115+
echo "Test ${QUANT_MODEL_MXFP4} with the prompt 'Tell me about cdxgen'. Must yield a better response."
116+
mlx_lm.generate --model ./${QUANT_MODEL_MXFP4} --prompt "Tell me about cdxgen" --temp ${TEMP} --max-tokens ${MAX_TOKENS}
117+
112118
# 4-bit for a small model has very poor performance
113119
if [ "$TOOL_BASE_MODEL" != "cdx1-mini" ] && [ "$TOOL_BASE_MODEL" != "cdx1-nano" ]; then
114120
rm -rf ${QUANT_MODEL_4BIT}

contrib/fine-tuning/upload-hf.sh

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ FUSED_MODEL=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}
1010
QUANT_MODEL_8BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-8bit
1111
QUANT_MODEL_6BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-6bit
1212
QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit
13+
QUANT_MODEL_MXFP4=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-MXFP4
1314
DWQ_QUANT_MODEL_4BIT=${HF_ORG}/${TOOL_BASE_MODEL}-${TUNING_TOOL}-4bit-DWQ
1415

1516
hf auth whoami
@@ -20,12 +21,13 @@ hf upload --quiet --repo-type dataset CycloneDX/cdx-docs ./guides guides
2021
hf upload --quiet --repo-type dataset CycloneDX/cdx-docs ./semantics semantics
2122

2223
echo "Uploading models. Please wait ..."
23-
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_8BIT} ./${QUANT_MODEL_8BIT} .
24-
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_6BIT} ./${QUANT_MODEL_6BIT} .
24+
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_8BIT} ./${QUANT_MODEL_8BIT} --delete "*.safetensors" .
25+
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_MXFP4} ./${QUANT_MODEL_MXFP4} --delete "*.safetensors" .
26+
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_6BIT} ./${QUANT_MODEL_6BIT} --delete "*.safetensors" .
2527
if [ "$TOOL_BASE_MODEL" != "cdx1-mini" ] && [ "$TOOL_BASE_MODEL" != "cdx1-nano" ]; then
26-
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_4BIT} ./${QUANT_MODEL_4BIT} .
28+
hf upload --quiet --exclude "**/README.md" --repo-type model ${QUANT_MODEL_4BIT} ./${QUANT_MODEL_4BIT} --delete "*.safetensors" .
2729
fi
2830
#if [ "$TOOL_BASE_MODEL" != "cdx1-mini" ]; then
2931
# hf upload --quiet --exclude "**/README.md" --repo-type model ${DWQ_QUANT_MODEL_4BIT} ./${DWQ_QUANT_MODEL_4BIT} .
3032
#fi
31-
hf upload --quiet --exclude "**/README.md" --repo-type model ${FUSED_MODEL} ./${FUSED_MODEL} .
33+
hf upload --quiet --exclude "**/README.md" --repo-type model ${FUSED_MODEL} ./${FUSED_MODEL} --delete "*.safetensors" .

0 commit comments

Comments (0)