Replies: 1 comment
-
If by backend you mean a hardware accelerator or GPU, I unfortunately did not find any documentation on it, but I did manage to get a backend registered with its operations. Backends are contained within the `ggml/src` directory, one subdirectory per backend (e.g. `ggml-cuda`, `ggml-metal`).
Inform GGML that you have a backend, and provide the necessary interface for GGML to interact with it.

// Registry entry point: GGML calls this once to discover the backend.
// The registration record has static storage duration because GGML keeps
// the returned pointer for the lifetime of the process.
ggml_backend_reg_t ggml_backend_YOUR_BACKEND_NAME_reg(void) {
    // Named `reg` (not the function's own name) to avoid shadowing the function.
    static struct ggml_backend_reg reg = {
        /* .api_version = */ GGML_YOUR_BACKEND_NAME_BACKEND_VERSION,
        /* .interface   = */ ggml_backend_YOUR_BACKEND_NAME_reg_i,
        /* .context     = */ NULL,
    };
    // Fixed: previously returned &ggml_backend_zdnn_reg — a copy-paste
    // leftover from the zDNN backend that would not even compile here.
    return &reg;
}
// Fixed: the macro argument must be this backend's entry point, not zdnn's.
GGML_BACKEND_DL_IMPL(ggml_backend_YOUR_BACKEND_NAME_reg)
// Registry-level vtable: GGML uses these callbacks to query the backend's
// name, enumerate its devices, and resolve optional extension entry points.
// All four members are required; each must point at a function you define.
static const struct ggml_backend_reg_i ggml_backend_YOUR_BACKEND_NAME_reg_i = {
/* .get_name = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_name,
/* .get_device_count = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count,
/* .get_device = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device,
/* .get_proc_address = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address,
};
// Return the device at `index`. This backend exposes exactly one device,
// so any index other than 0 is a programming error.
static ggml_backend_dev_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    // A single process-lifetime device record, shared by every caller.
    static ggml_backend_device device = {
        /* .interface = */ ggml_backend_YOUR_BACKEND_NAME_device_i,
        /* .register  = */ reg,
        /* .context   = */ nullptr,
    };

    return &device;
}
// Optional extension lookup. This backend exposes no extra entry points,
// so every query resolves to nullptr.
static void * ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);
    GGML_UNUSED(name);

    return nullptr;
}
// Device-level vtable: GGML queries the device through these callbacks.
// Members set to NULL are optional features this backend does not provide
// (host buffers, op offloading hints, and cross-backend events).
static const struct ggml_backend_device_i ggml_backend_YOUR_BACKEND_NAME_device_i = {
/* .get_name = */ ggml_backend_YOUR_BACKEND_NAME_device_get_name,
/* .get_description = */ ggml_backend_YOUR_BACKEND_NAME_device_get_desc,
/* .get_memory = */ ggml_backend_YOUR_BACKEND_NAME_device_get_memory,
/* .get_type = */ ggml_backend_YOUR_BACKEND_NAME_device_get_type,
/* .get_props = */ ggml_backend_YOUR_BACKEND_NAME_device_get_props,
/* .init_backend = */ ggml_backend_YOUR_BACKEND_NAME_device_init_backend,
/* .get_buffer_type = */ ggml_backend_YOUR_BACKEND_NAME_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_YOUR_BACKEND_NAME_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_op,
/* .supports_buft = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// Report whether this device can execute `op`. GGML calls this while
// partitioning the graph; returning false routes the op to another backend.
static bool ggml_backend_YOUR_BACKEND_NAME_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    // NOTE(review): the original declared src0/src1 here but never used them;
    // removed to avoid unused-variable warnings. Reintroduce them when you
    // start checking operand types/shapes.
    switch (op->op) {
        // Ops GGML requires every backend to accept (no-op / view reshuffles).
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            break;
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false; // TODO: disable all support first to showcase device reg
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                    break;
                default:
                    return false;
            }
            // BUG FIX: this break was missing, so control fell through into
            // `default: return false;` and the accepted unary ops above could
            // never reach `return true`.
            break;
        default:
            return false;
    }
    return true;

    GGML_UNUSED(dev);
}
// Dispatch one graph node to this backend's kernels. Returns true when the
// op was handled here, false to tell GGML the op is unsupported.
// NOTE(review): `ctx` is unused until real kernels are wired in.
inline bool ggml_YOUR_BACKEND_NAME_compute_forward(ggml_backend_YOUR_BACKEND_NAME_context & ctx,
                                                   ggml_tensor * dst) {
    GGML_UNUSED(ctx);

    switch (dst->op) {
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                    return false;
                case GGML_UNARY_OP_EXP:
                    break;
                default:
                    return false;
            }
            // BUG FIX: this break was missing — after GGML_UNARY_OP_EXP broke
            // out of the inner switch, control fell into `default: return
            // false;`, so EXP (the one op this structure singles out as
            // handled) was still rejected.
            break;
        default:
            return false;
    }
    return true;
}

That should be the main bulk of registering a device and getting the compute operation forwarded to your backend. Please take note that GGML's matrix multiplication is computed with the first operand transposed — `ggml_mul_mat(ctx, A, B)` effectively yields `B · Aᵀ` (verify against the ggml docs for your version). You may choose to refer to my zDNN implementation here: https://github.com/taronaeo/llama.cpp-s390x/blob/zdnn-accelerator-backend/ggml/src/ggml-zdnn/ggml-zdnn.cpp
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
-
How do I add a new backend for llama.cpp? Is there any documentation?
Beta Was this translation helpful? Give feedback.
All reactions