@@ -1329,30 +1329,74 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
    GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
}

-
+/**
+ * @brief Generate a range of values and apply a scalar base exponentiation.
+ *
+ * This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
+ * with step size `step`, stores it in a temporary buffer, and then computes:
+ *
+ * @f[
+ * slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
+ * @f]
+ *
+ * The results are written to the provided @p slope_buffer.
+ *
+ * @param ctx          CANN backend context for memory allocation and operator execution.
+ * @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
+ * @param m            Scalar base for the exponentiation.
+ * @param size         Number of elements in the generated sequence.
+ * @param start        Starting exponent offset.
+ * @param stop         Stopping exponent offset (exclusive).
+ * @param step         Step size for the exponent increment.
+ */
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void * slope_buffer,
                                  float m, int64_t size, float start, float stop, float step) {
    int64_t ne[] = {size};
    size_t  nb[] = {sizeof(float)};

    ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
-    void * arange_buffer = arange_allocator.get();
+    void * arange_buffer = arange_allocator.get();

-    aclTensor * arange_tensor = ggml_cann_create_tensor(
+    aclTensor* arange_tensor = ggml_cann_create_tensor(
        arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
    aclnn_arange(ctx, arange_tensor, start, stop, step, size);

-    aclTensor * slope_tensor = ggml_cann_create_tensor(
+    aclTensor* slope_tensor = ggml_cann_create_tensor(
        slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);

-    aclScalar * sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
+    aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
    ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
}

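For reference, the sequence this helper offloads to the NPU (aclnn_arange followed by PowScalarTensor) reduces to a plain host-side loop. A minimal sketch under the doc comment's definition; `get_slope_inner_ref` is a hypothetical name used only for illustration:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// slope[i] = m^(start + i * step) for 0 <= i < size, as in the doc comment.
static std::vector<float> get_slope_inner_ref(float m, int64_t size,
                                              float start, float step) {
    std::vector<float> slope((size_t) size);
    for (int64_t i = 0; i < size; ++i) {
        slope[(size_t) i] = std::pow(m, start + (float) i * step);
    }
    return slope;
}
```

Note that `stop` is implied by `start + size * step`, which is why the reference loop only needs `start` and `step`.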
+/**
+ * @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
+ *
+ * This function generates slope values for each attention head according to the ALiBi
+ * (Attention with Linear Biases) method. It splits the computation into two ranges depending
+ * on whether the head index is less than @p n_head_log2 or not, and uses different base values
+ * (`m0` and `m1`) for the exponentiation.
+ *
+ * @f[
+ * slope[h] =
+ * \begin{cases}
+ * m_0^{(h + 1)}, & h < n\_head\_log2 \\
+ * m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
+ * \end{cases}
+ * \quad \text{if } max\_bias > 0
+ * @f]
+ *
+ * If @p max_bias <= 0, all slope values are set to 1.0.
+ *
+ * @param ctx          CANN backend context for memory allocation and operator execution.
+ * @param n_head       Total number of attention heads.
+ * @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
+ * @param max_bias     Maximum bias value for slope computation.
+ */
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
-    void * slope_buffer, float max_bias) {
+    void * slope_buffer, float max_bias) {
    const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));

    float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -1382,24 +1426,43 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
    }
}

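The two-range slope computation above can be checked against a host-side sketch. `m0` matches the visible diff line; the value of `m1` is assumed here to follow ggml's usual CPU definition (`max_bias` halved), since its line is not shown in this hunk:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

static std::vector<float> alibi_slopes_ref(int64_t n_head, float max_bias) {
    std::vector<float> slope((size_t) n_head, 1.0f);  // all 1.0 when max_bias <= 0
    if (max_bias <= 0.0f) {
        return slope;
    }
    const int   n_head_log2 = 1 << (int) std::floor(std::log2((double) n_head));
    const float m0 = (float) std::pow(2.0, -(double) max_bias / n_head_log2);
    const float m1 = (float) std::pow(2.0, -((double) max_bias / 2.0) / n_head_log2);  // assumed
    for (int64_t h = 0; h < n_head; ++h) {
        slope[(size_t) h] = (h < n_head_log2)
            ? (float) std::pow(m0, (double) (h + 1))
            : (float) std::pow(m1, (double) (2 * (h - n_head_log2) + 1));
    }
    return slope;
}
```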
+/**
+ * @brief Add ALiBi (Attention with Linear Biases) positional biases to the destination tensor.
+ *
+ * This function computes the ALiBi slopes for each attention head (if max_bias > 0),
+ * multiplies them with the attention mask to produce bias tensors, and adds these biases
+ * to the destination tensor (@p dst).
+ *
+ * The function performs the necessary broadcasting of the mask and slope tensors to match
+ * the shape of the destination tensor, then applies element-wise multiplication and addition
+ * using CANN operators.
+ *
+ * @param ctx      CANN backend context for memory management and operator execution.
+ * @param mask     Input attention mask tensor, assumed to be contiguous.
+ * @param dst      Destination tensor to which ALiBi biases will be added.
+ * @param dst_ptr  Pointer to the memory of the destination tensor.
+ * @param max_bias Maximum bias value controlling the slope scaling.
+ *
+ * @note
+ * - Data is written into dst_ptr using only the shape information of the dst tensor.
+ * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
+ */
static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
                            ggml_tensor* dst, void * dst_ptr, float max_bias) {
    void * slope_buffer = nullptr;
    void * bias_buffer  = nullptr;

-    int64_t n_heads = dst->ne[2];
-    ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
-    slope_buffer = slope_allocator.get();
-    ggml_cann_pool_alloc bias_allocator(
-        ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
-    bias_buffer = bias_allocator.get();
-
    if (max_bias > 0.0f) {
+        int64_t n_heads = dst->ne[2];
+        ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
+        slope_buffer = slope_allocator.get();
+        ggml_cann_pool_alloc bias_allocator(
+            ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
+        bias_buffer = bias_allocator.get();
        aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
    }

    // broadcast for mask, slope and dst;
-    GGML_ASSERT(ggml_is_contiguous(mask));
    int64_t nr2 = dst->ne[2] / mask->ne[2];
    int64_t nr3 = dst->ne[3] / mask->ne[3];

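Before the shape plumbing in the next hunk, the element-wise effect of aclnn_add_alibi is easy to state. A host-side sketch for contiguous float tensors; `add_alibi_ref` is a hypothetical name, and the head broadcast mirrors the `nr2 = dst->ne[2] / mask->ne[2]` repeat above:

```cpp
#include <cstdint>

// dst[h][i] += slope[h] * mask[h % mask_heads][i]: bias = slope * mask,
// tiled across heads, then added into dst.
static void add_alibi_ref(float * dst, const float * mask, const float * slope,
                          int64_t n_head, int64_t mask_heads, int64_t rows_x_cols) {
    for (int64_t h = 0; h < n_head; ++h) {
        const float * m = mask + (h % mask_heads) * rows_x_cols;  // broadcast mask
        float *       d = dst + h * rows_x_cols;
        for (int64_t i = 0; i < rows_x_cols; ++i) {
            d[i] += slope[h] * m[i];
        }
    }
}
```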
@@ -1424,12 +1487,14 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
        slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
    }

-    aclTensor * acl_slope = ggml_cann_create_tensor(
+    aclTensor* acl_slope = ggml_cann_create_tensor(
        slope_buffer, ACL_FLOAT, sizeof(float),
        slope_ne, slope_nb, GGML_MAX_DIMS + 2);
-    aclTensor * acl_mask = ggml_cann_create_tensor(
+    aclTensor* acl_mask = ggml_cann_create_tensor(
        mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
-    aclTensor * acl_dst = ggml_cann_create_tensor(
+
+    // write data into dst_ptr using only the shape information of the dst tensor.
+    aclTensor* acl_dst = ggml_cann_create_tensor(
        dst_ptr, ggml_cann_type_mapping(dst->type),
        ggml_type_size(dst->type), dst_ne, dst_nb,
        GGML_MAX_DIMS + 2);
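The stride loops in this hunk and the next both build contiguous strides for the extended (GGML_MAX_DIMS + 2)-dimensional views: nb[0] is the element size and each higher stride is the previous stride times the previous extent. A generic sketch of that pattern; `make_contiguous_strides` is a hypothetical helper, not part of this PR:

```cpp
#include <cstddef>
#include <cstdint>

static void make_contiguous_strides(const int64_t * ne, size_t * nb,
                                    int n_dims, size_t elem_size) {
    nb[0] = elem_size;                           // innermost stride = element size
    for (int i = 1; i < n_dims; i++) {
        nb[i] = nb[i - 1] * (size_t) ne[i - 1];  // row-major accumulation
    }
}
```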
@@ -1441,7 +1506,7 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
    for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
        bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
    }
-    aclTensor * bias_tensor = ggml_cann_create_tensor(
+    aclTensor* bias_tensor = ggml_cann_create_tensor(
        bias_buffer, ACL_FLOAT, sizeof(float),
        bias_ne, bias_nb, GGML_MAX_DIMS + 2);

@@ -1473,16 +1538,16 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 * stored.
 */
static void aclnn_softmax(ggml_backend_cann_context & ctx,
-    aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) {
+    aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
    GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
}

void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * src0 = dst->src[0];
-    ggml_tensor * src1 = dst->src[1];  // mask
+    ggml_tensor* src0 = dst->src[0];
+    ggml_tensor* src1 = dst->src[1];  // mask

-    aclTensor * acl_src0 = ggml_cann_create_tensor(src0);
-    aclTensor * acl_dst  = ggml_cann_create_tensor(dst);
+    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst  = ggml_cann_create_tensor(dst);

    float scale    = 1.0f;
    float max_bias = 0.0f;
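`scale` and `max_bias` come from `dst->op_params`; together with the ALiBi bias they define the row-wise semantics the rest of this function assembles out of CANN operators. A host-side reference for a single row (a sketch only; the row-max subtraction is the conventional numerical stabilisation, not something this diff shows):

```cpp
#include <cmath>
#include <cstdint>

// y = softmax(x * scale + slope * mask_row) over one row of length n.
static void softmax_row_ref(float * y, const float * x, const float * mask_row,
                            float scale, float slope, int64_t n) {
    float max_v = -INFINITY;
    for (int64_t i = 0; i < n; ++i) {
        y[i]  = x[i] * scale + (mask_row ? slope * mask_row[i] : 0.0f);
        max_v = std::fmax(max_v, y[i]);
    }
    float sum = 0.0f;
    for (int64_t i = 0; i < n; ++i) {
        y[i] = std::exp(y[i] - max_v);  // subtract row max for stability
        sum += y[i];
    }
    for (int64_t i = 0; i < n; ++i) {
        y[i] /= sum;
    }
}
```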
@@ -1491,7 +1556,7 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    // input mul scale
-    aclScalar * acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
    ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
    void * src_tensor_buffer = src_tensor_allocator.get();
    aclTensor* softmax_tensor = ggml_cann_create_tensor(