Skip to content

Commit cf23ee2

Browse files
hexagon: add neg, exp, sigmoid, softplus ops, cont, repeat ops (ggml-org#20701)
Add element-wise unary ops needed by Qwen 3.5's DeltaNet linear attention layers. These ops follow the existing unary-ops pattern with VTCM DMA double-buffering.
- neg: negate via scale by -1.0
- exp: uses existing hvx_exp_f32 HVX intrinsics
- sigmoid: uses existing hvx_sigmoid_f32_aa HVX intrinsics
- softplus: log(1 + exp(x)) scalar fallback
- CONT reuses the existing CPY infrastructure since making a tensor contiguous is equivalent to a same-type copy.
- REPEAT implements tiled memory copy with multi-threaded execution via the worker pool, supporting f32 and f16 types. The kernel parallelizes across output rows and uses memcpy for each tile.
Co-authored-by: Max Krasnyansky <maxk@qti.qualcomm.com>
1 parent 892e3c3 commit cf23ee2

11 files changed

Lines changed: 441 additions & 28 deletions

File tree

ggml/src/ggml-hexagon/ggml-hexagon.cpp

Lines changed: 133 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2362,6 +2362,27 @@ static inline size_t init_cpy_req(htp_general_req * req, dspqueue_buffer * bufs,
23622362
return n_bufs;
23632363
}
23642364

2365+
static inline size_t init_cont_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2366+
// CONT is just a contiguous copy — reuse CPY op
2367+
req->op = HTP_OP_CPY;
2368+
2369+
size_t n_bufs = 0;
2370+
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2371+
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2372+
2373+
return n_bufs;
2374+
}
2375+
2376+
static inline size_t init_repeat_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
2377+
req->op = HTP_OP_REPEAT;
2378+
2379+
size_t n_bufs = 0;
2380+
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
2381+
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
2382+
2383+
return n_bufs;
2384+
}
2385+
23652386
static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
23662387
req->op = HTP_OP_GET_ROWS;
23672388

@@ -2449,12 +2470,33 @@ static inline size_t init_unary_req(htp_general_req * req, dspqueue_buffer * buf
24492470
break;
24502471

24512472
case GGML_OP_UNARY:
2452-
if (ggml_get_unary_op(t) == GGML_UNARY_OP_SILU) {
2473+
switch (ggml_get_unary_op(t)) {
2474+
case GGML_UNARY_OP_SILU:
24532475
req->op = HTP_OP_UNARY_SILU;
24542476
supported = true;
2455-
} else if (ggml_get_unary_op(t) == GGML_UNARY_OP_GELU) {
2477+
break;
2478+
case GGML_UNARY_OP_GELU:
24562479
req->op = HTP_OP_UNARY_GELU;
24572480
supported = true;
2481+
break;
2482+
case GGML_UNARY_OP_SIGMOID:
2483+
req->op = HTP_OP_UNARY_SIGMOID;
2484+
supported = true;
2485+
break;
2486+
case GGML_UNARY_OP_NEG:
2487+
req->op = HTP_OP_UNARY_NEG;
2488+
supported = true;
2489+
break;
2490+
case GGML_UNARY_OP_EXP:
2491+
req->op = HTP_OP_UNARY_EXP;
2492+
supported = true;
2493+
break;
2494+
case GGML_UNARY_OP_SOFTPLUS:
2495+
req->op = HTP_OP_UNARY_SOFTPLUS;
2496+
supported = true;
2497+
break;
2498+
default:
2499+
break;
24582500
}
24592501
break;
24602502

@@ -2640,16 +2682,28 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
26402682
ggml_hexagon_dispatch_op<init_sum_rows_req>(sess, node, flags);
26412683
break;
26422684
case GGML_OP_UNARY:
2643-
if ((ggml_get_unary_op(node) == GGML_UNARY_OP_SILU) ||
2644-
(ggml_get_unary_op(node) == GGML_UNARY_OP_GELU)) {
2645-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2685+
switch (ggml_get_unary_op(node)) {
2686+
case GGML_UNARY_OP_NEG:
2687+
case GGML_UNARY_OP_EXP:
2688+
case GGML_UNARY_OP_SIGMOID:
2689+
case GGML_UNARY_OP_SOFTPLUS:
2690+
case GGML_UNARY_OP_SILU:
2691+
case GGML_UNARY_OP_GELU:
2692+
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2693+
break;
2694+
default:
2695+
break;
26462696
}
26472697
break;
26482698
case GGML_OP_GLU:
2649-
if ((ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU) ||
2650-
(ggml_get_glu_op(node) == GGML_GLU_OP_SWIGLU_OAI) ||
2651-
(ggml_get_glu_op(node) == GGML_GLU_OP_GEGLU)) {
2652-
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2699+
switch (ggml_get_glu_op(node)) {
2700+
case GGML_GLU_OP_SWIGLU:
2701+
case GGML_GLU_OP_SWIGLU_OAI:
2702+
case GGML_GLU_OP_GEGLU:
2703+
ggml_hexagon_dispatch_op<init_unary_req>(sess, node, flags);
2704+
break;
2705+
default:
2706+
break;
26532707
}
26542708
break;
26552709
case GGML_OP_SOFT_MAX:
@@ -2676,6 +2730,14 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
26762730
ggml_hexagon_dispatch_op<init_cpy_req>(sess, node, flags);
26772731
break;
26782732

2733+
case GGML_OP_CONT:
2734+
ggml_hexagon_dispatch_op<init_cont_req>(sess, node, flags);
2735+
break;
2736+
2737+
case GGML_OP_REPEAT:
2738+
ggml_hexagon_dispatch_op<init_repeat_req>(sess, node, flags);
2739+
break;
2740+
26792741
case GGML_OP_ARGSORT:
26802742
ggml_hexagon_dispatch_op<init_argsort_req>(sess, node, flags);
26812743
break;
@@ -3006,6 +3068,39 @@ static bool ggml_hexagon_supported_cpy(const struct ggml_hexagon_session * sess,
30063068
return true;
30073069
}
30083070

3071+
static bool ggml_hexagon_supported_cont(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3072+
GGML_UNUSED(sess);
3073+
const struct ggml_tensor * src0 = op->src[0];
3074+
3075+
// CONT is same-type only, supports f32 and f16
3076+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3077+
3078+
return true;
3079+
}
3080+
3081+
static bool ggml_hexagon_supported_repeat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
3082+
GGML_UNUSED(sess);
3083+
const struct ggml_tensor * src0 = op->src[0];
3084+
const struct ggml_tensor * dst = op;
3085+
3086+
// Support f32 and f16
3087+
if (src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) return false;
3088+
3089+
// src and dst must be the same type
3090+
if (src0->type != dst->type) return false;
3091+
3092+
// dst dims must be multiples of src dims
3093+
if (dst->ne[0] % src0->ne[0] != 0) return false;
3094+
if (dst->ne[1] % src0->ne[1] != 0) return false;
3095+
if (dst->ne[2] % src0->ne[2] != 0) return false;
3096+
if (dst->ne[3] % src0->ne[3] != 0) return false;
3097+
3098+
// reject transposed tensors (note: this checks transposition only, not full contiguity)
3099+
if (ggml_is_transposed(src0) || ggml_is_transposed(dst)) return false;
3100+
3101+
return true;
3102+
}
3103+
30093104
static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
30103105
auto sess = static_cast<ggml_hexagon_session *>(dev->context);
30113106

@@ -3063,21 +3158,32 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
30633158
break;
30643159

30653160
case GGML_OP_UNARY:
3066-
{
3067-
const auto unary_op = ggml_get_unary_op(op);
3068-
if (unary_op == GGML_UNARY_OP_SILU || unary_op == GGML_UNARY_OP_GELU) {
3161+
switch (ggml_get_unary_op(op)) {
3162+
case GGML_UNARY_OP_NEG:
3163+
case GGML_UNARY_OP_EXP:
3164+
case GGML_UNARY_OP_SIGMOID:
3165+
case GGML_UNARY_OP_SOFTPLUS:
3166+
supp = ggml_hexagon_supported_unary(sess, op);
3167+
break;
3168+
case GGML_UNARY_OP_SILU:
3169+
case GGML_UNARY_OP_GELU:
30693170
supp = ggml_hexagon_supported_activations(sess, op);
3070-
}
3071-
break;
3171+
break;
3172+
default:
3173+
break;
30723174
}
3175+
break;
30733176
case GGML_OP_GLU:
3074-
{
3075-
const auto glu_op = ggml_get_glu_op(op);
3076-
if ((glu_op == GGML_GLU_OP_SWIGLU) || (glu_op == GGML_GLU_OP_SWIGLU_OAI) || (glu_op == GGML_GLU_OP_GEGLU)) {
3177+
switch (ggml_get_glu_op(op)) {
3178+
case GGML_GLU_OP_SWIGLU:
3179+
case GGML_GLU_OP_SWIGLU_OAI:
3180+
case GGML_GLU_OP_GEGLU:
30773181
supp = ggml_hexagon_supported_activations(sess, op);
3078-
}
3079-
break;
3182+
break;
3183+
default:
3184+
break;
30803185
}
3186+
break;
30813187
case GGML_OP_ROPE:
30823188
supp = ggml_hexagon_supported_rope(sess, op);
30833189
break;
@@ -3098,6 +3204,14 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
30983204
supp = ggml_hexagon_supported_cpy(sess, op);
30993205
break;
31003206

3207+
case GGML_OP_CONT:
3208+
supp = ggml_hexagon_supported_cont(sess, op);
3209+
break;
3210+
3211+
case GGML_OP_REPEAT:
3212+
supp = ggml_hexagon_supported_repeat(sess, op);
3213+
break;
3214+
31013215
case GGML_OP_ARGSORT:
31023216
supp = ggml_hexagon_supported_argsort(sess, op);
31033217
break;

ggml/src/ggml-hexagon/htp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ add_library(${HTP_LIB} SHARED
3030
set-rows-ops.c
3131
get-rows-ops.c
3232
cpy-ops.c
33+
repeat-ops.c
3334
argsort-ops.c
3435
ssm-conv.c
3536
)

ggml/src/ggml-hexagon/htp/htp-msg.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ enum htp_op {
5353
HTP_OP_RMS_NORM,
5454
HTP_OP_UNARY_SILU,
5555
HTP_OP_UNARY_GELU,
56+
HTP_OP_UNARY_SIGMOID,
57+
HTP_OP_UNARY_EXP,
58+
HTP_OP_UNARY_NEG,
59+
HTP_OP_UNARY_SOFTPLUS,
5660
HTP_OP_GLU_SWIGLU,
5761
HTP_OP_GLU_SWIGLU_OAI,
5862
HTP_OP_GLU_GEGLU,
@@ -69,6 +73,7 @@ enum htp_op {
6973
HTP_OP_SQRT,
7074
HTP_OP_SUM_ROWS,
7175
HTP_OP_SSM_CONV,
76+
HTP_OP_REPEAT,
7277
INVALID
7378
};
7479

ggml/src/ggml-hexagon/htp/htp-ops.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx);
5757
int op_set_rows(struct htp_ops_context * octx);
5858
int op_get_rows(struct htp_ops_context * octx);
5959
int op_cpy(struct htp_ops_context * octx);
60+
int op_repeat(struct htp_ops_context * octx);
6061
int op_argsort(struct htp_ops_context * octx);
6162
int op_ssm_conv(struct htp_ops_context * octx);
6263

ggml/src/ggml-hexagon/htp/hvx-base.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
#include <stdbool.h>
55
#include <stdint.h>
6+
#include <math.h>
7+
#include <assert.h>
68

79
#include "hex-utils.h"
810
#include "hvx-types.h"

ggml/src/ggml-hexagon/htp/hvx-exp.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <stdbool.h>
55
#include <stdint.h>
6+
#include <math.h>
67

78
#include "hvx-base.h"
89
#include "hvx-floor.h"
@@ -16,8 +17,8 @@
1617
#define EXP_LOGN2 (0x3F317218) // ln(2) = 0.6931471805
1718
#define EXP_LOG2E (0x3FB8AA3B) // log2(e) = 1/ln(2) = 1.4426950408
1819
#define EXP_ONE (0x3f800000) // 1.0
19-
#define EXP_RANGE_R (0x41a00000) // 20.0
20-
#define EXP_RANGE_L (0xc1a00000) // -20.0
20+
#define EXP_RANGE_R (0x42B16666) // 88.7
21+
#define EXP_RANGE_L (0xC2B00000) // -88.0 (approx log(FLT_MIN))
2122

2223
static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
2324
HVX_Vector z_qf32_v;
@@ -47,12 +48,12 @@ static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
4748

4849
HVX_Vector temp_v = in_vec;
4950

50-
// Clamp inputs to (-20.0, 20.0)
51+
// Clamp inputs to [-88.0, 88.7] (EXP_RANGE_L, EXP_RANGE_R) to avoid overflow/underflow
5152
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
5253
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
5354

5455
in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
55-
in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), temp_v);
56+
in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
5657

5758
epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
5859
epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);
@@ -69,12 +70,12 @@ static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
6970
// normalize before every QFloat's vmpy
7071
x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);
7172

73+
x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
74+
7275
// z = x * x;
7376
z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
7477
z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);
7578

76-
x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
77-
7879
// y = E4 + E5 * x;
7980
E_const = Q6_V_vsplat_R(EXP_COEFF_5);
8081
y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
@@ -145,7 +146,7 @@ static inline HVX_Vector hvx_vec_exp_f32_guard(HVX_Vector in_vec, HVX_Vector max
145146
return Q6_V_vmux_QVV(pred0, inf, out);
146147
}
147148

148-
static inline void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
149+
static inline void hvx_exp_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int num_elems, bool negate) {
149150
int left_over = num_elems & (VLEN_FP32 - 1);
150151
int num_elems_whole = num_elems - left_over;
151152

@@ -162,7 +163,7 @@ static inline void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict
162163
HVX_Vector vec_out = Q6_V_vzero();
163164

164165
static const float kInf = INFINITY;
165-
static const float kMaxExp = 88.02f; // log(INF)
166+
static const float kMaxExp = 88.7f;
166167

167168
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
168169
const HVX_Vector inf = hvx_vec_splat_f32(kInf);

ggml/src/ggml-hexagon/htp/hvx-sigmoid.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define HVX_SIGMOID_H
33

44
#include "hvx-base.h"
5+
#include "hvx-inverse.h"
56

67
#define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022
78
#define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777

ggml/src/ggml-hexagon/htp/main.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,39 @@ static void proc_cpy_req(struct htp_context * ctx, struct htp_general_req * req,
516516
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
517517
}
518518

519+
static void proc_repeat_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
520+
struct dspqueue_buffer rsp_bufs[1];
521+
522+
// The op writes to the output buffer, so that buffer must also be flushed
523+
rsp_bufs[0].fd = bufs[1].fd;
524+
rsp_bufs[0].ptr = bufs[1].ptr;
525+
rsp_bufs[0].offset = bufs[1].offset;
526+
rsp_bufs[0].size = bufs[1].size;
527+
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
528+
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
529+
530+
// Setup Op context
531+
struct htp_ops_context octx = { 0 };
532+
octx.ctx = ctx;
533+
octx.src0 = req->src0;
534+
octx.dst = req->dst;
535+
octx.flags = req->flags;
536+
octx.op = req->op;
537+
538+
// Update data pointers
539+
octx.src0.data = (uint32_t) bufs[0].ptr;
540+
octx.dst.data = (uint32_t) bufs[1].ptr;
541+
octx.n_threads = ctx->n_threads;
542+
543+
struct profile_data prof;
544+
profile_start(&prof);
545+
546+
uint32_t rsp_status = op_repeat(&octx);
547+
548+
profile_stop(&prof);
549+
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
550+
}
551+
519552
static void proc_get_rows_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
520553
struct dspqueue_buffer rsp_bufs[1];
521554

@@ -1090,6 +1123,10 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
10901123

10911124
case HTP_OP_SQR:
10921125
case HTP_OP_SQRT:
1126+
case HTP_OP_UNARY_NEG:
1127+
case HTP_OP_UNARY_EXP:
1128+
case HTP_OP_UNARY_SIGMOID:
1129+
case HTP_OP_UNARY_SOFTPLUS:
10931130
if (n_bufs != 2) {
10941131
FARF(ERROR, "Bad unary-req buffer list");
10951132
continue;
@@ -1175,6 +1212,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
11751212
proc_cpy_req(ctx, &req, bufs);
11761213
break;
11771214

1215+
case HTP_OP_REPEAT:
1216+
if (n_bufs != 2) {
1217+
FARF(ERROR, "Bad repeat-req buffer list");
1218+
continue;
1219+
}
1220+
proc_repeat_req(ctx, &req, bufs);
1221+
break;
1222+
11781223
case HTP_OP_ARGSORT:
11791224
if (n_bufs != 2) {
11801225
FARF(ERROR, "Bad argsort-req buffer list");

0 commit comments

Comments
 (0)