
Commit ad803e9

ggml-cpu: add rvv 512b,1024b impls for i-quants
1 parent b48e80f commit ad803e9

1 file changed: ggml/src/ggml-cpu/arch/riscv/quants.c

Lines changed: 364 additions & 0 deletions
@@ -1954,3 +1954,367 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc)
#endif
}
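For orientation, the operand layouts the new kernels consume (with QK_K = 256) look roughly like this. This is a paraphrase of the definitions in ggml-common.h, not a verbatim copy; check the real header for the authoritative fields.

    // Reference sketch of the block layouts (paraphrased from ggml-common.h).
    typedef struct {
        ggml_half d;                  // super-block scale (fp16)
        uint16_t  scales_h;           // upper 2 bits of the 8 sub-block scales
        uint8_t   scales_l[QK_K/64];  // lower 4 bits, two sub-block scales per byte
        uint8_t   qs[QK_K/2];         // 256 4-bit indices into kvalues_iq4nl, two per byte
    } block_iq4_xs;

    typedef struct {
        float   d;                    // activation scale
        int8_t  qs[QK_K];             // 256 int8 activations
        int16_t bsums[QK_K/16];       // sums of groups of 16 (unused by these kernels)
    } block_q8_K;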

static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
    float sumf = 0;

    for (int ibl = 0; ibl < nb; ++ibl) {
        const int8_t * q8 = y[ibl].qs;
        const uint8_t * iq4 = x[ibl].qs;
        uint16_t h = x[ibl].scales_h;

        // We process 2 sub-blocks together.
        int sumi1 = 0, sumi2 = 0;
#pragma GCC unroll 1
        for (int ib = 0; ib < QK_K / 64; ++ib) {
            // Load the packed weights.
            const vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 32);
            iq4 += 32;

            // Unpack the weight blocks.
            const vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 32);
            const vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 32);
            const vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
            const vuint8m4_t iq4bits_reorder = __riscv_vcreate_v_u8m1_u8m4(
                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 0), 16),
                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 2), 16),
                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 1), 16),
                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 3), 16)
            );
            const vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 64);

            // Multiply with activations.
            const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 64);
            q8 += 64;
            const vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 64);

            // Reduce separately.
            const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));

            const int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
            const int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
            h >>= 4;

            sumi1 += acc0 * ls1;
            sumi2 += acc1 * ls2;

            __asm__ __volatile__("" ::: "memory");
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
    }

    *s = sumf;
}
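For comparison, the scalar computation these kernels vectorize looks roughly like the sketch below, modeled on ggml_vec_dot_iq4_xs_q8_K_generic (the fallback used by the dispatcher at the end of this diff). The function name is hypothetical; kvalues_iq4nl is ggml's 16-entry non-linear codebook.

    // Scalar sketch of the IQ4_XS x Q8_K dot product: per 32-element
    // sub-block, look each 4-bit index up in kvalues_iq4nl, multiply with
    // the int8 activations, weight by the 6-bit sub-block scale, then apply
    // the float super-block scales once.
    static void iq4_xs_q8_K_dot_scalar(int n, float * s, const block_iq4_xs * x, const block_q8_K * y) {
        const int nb = n / QK_K;
        float sumf = 0;
        for (int ibl = 0; ibl < nb; ++ibl) {
            const uint8_t * qs = x[ibl].qs;
            const int8_t  * q8 = y[ibl].qs;
            uint16_t h = x[ibl].scales_h;
            int sumi = 0;
            for (int ib = 0; ib < QK_K/32; ib += 2) {   // two sub-blocks per step
                const int ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30)) - 32;
                const int ls2 = ((x[ibl].scales_l[ib/2] >>  4) | ((h << 2) & 0x30)) - 32;
                h >>= 4;
                int sumi1 = 0, sumi2 = 0;
                for (int j = 0; j < 16; ++j) {
                    // low nibbles are elements 0..15, high nibbles elements 16..31
                    sumi1 += q8[j +  0] * kvalues_iq4nl[qs[j] & 0xf];
                    sumi2 += q8[j + 16] * kvalues_iq4nl[qs[j] >>  4];
                }
                sumi += ls1 * sumi1 + ls2 * sumi2;
                qs += 16;
                q8 += 32;
            }
            sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * sumi;
        }
        *s = sumf;
    }

The nibble split is why every vector variant needs a reorder step: after vand/vsrl, the low-nibble elements and high-nibble elements of each sub-block land in different register halves and must be interleaved back into q8 order.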
static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
    float sumf = 0;

    // Indices for re-ordering IQ4 data.
    uint16_t index[16] = {
        0, 1,  8,  9,
        2, 3, 10, 11,
        4, 5, 12, 13,
        6, 7, 14, 15,
    };
    vuint16m1_t i_vec = __riscv_vle16_v_u16m1(index, 16);

    for (int ibl = 0; ibl < nb; ++ibl) {
        const int8_t * q8 = y[ibl].qs;
        const uint8_t * iq4 = x[ibl].qs;
        uint16_t h = x[ibl].scales_h;

        int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;

#pragma GCC unroll 1
        for (int ib = 0; ib < QK_K / 128; ++ib) {
            // Weights and activations.
            vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 64);
            iq4 += 64;

            // Unpack the weight blocks.
            vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 64);
            vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 64);
            vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
            vuint8m4_t iq4bits_reorder = __riscv_vreinterpret_v_u64m4_u8m4(__riscv_vrgatherei16_vv_u64m4(__riscv_vreinterpret_v_u8m4_u64m4(iq4bits), i_vec, 16));
            vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 128);

            __asm__ __volatile__("" ::: "memory");

            // Multiply with activations.
            vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
            vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 128);
            q8 += 128;

            __asm__ __volatile__("" ::: "memory");

            // Reduce separately.
            int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
            int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
            int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
            int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));

            int ls1 = ((x[ibl].scales_l[ib * 2 + 0] & 0xf) | ((h << 4) & 0x30)) - 32;
            int ls2 = ((x[ibl].scales_l[ib * 2 + 0] >> 4) | ((h << 2) & 0x30)) - 32;
            int ls3 = ((x[ibl].scales_l[ib * 2 + 1] & 0xf) | ((h << 0) & 0x30)) - 32;
            int ls4 = ((x[ibl].scales_l[ib * 2 + 1] >> 4) | ((h >> 2) & 0x30)) - 32;
            h >>= 8;

            sumi1 += acc0 * ls1;
            sumi2 += acc1 * ls2;
            sumi3 += acc2 * ls3;
            sumi4 += acc3 * ls4;

            __asm__ __volatile__("" ::: "memory");
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2 + sumi3 + sumi4);
    }

    *s = sumf;
}
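The vrgatherei16 over reinterpreted u64 lanes is just a byte permutation performed 8 bytes at a time. A plain-C host-side simulation of the VLEN=256 case (hypothetical helper name, derived from the index table above) shows why the gather restores element order:

    #include <stdint.h>
    #include <string.h>

    // Simulate the u64-lane gather used in the vl256 kernel: `src` holds the
    // 128 unpacked nibble values (64 low nibbles, then 64 high nibbles, for
    // four 32-element sub-blocks). Moving whole 8-byte lanes with the index
    // table interleaves the halves 16 bytes at a time, so each sub-block
    // becomes contiguous and lines up with the q8 activations.
    static void simulate_reorder_vl256(const uint8_t src[128], uint8_t dst[128]) {
        static const uint16_t index[16] = {
            0, 1,  8,  9,
            2, 3, 10, 11,
            4, 5, 12, 13,
            6, 7, 14, 15,
        };
        for (int lane = 0; lane < 16; ++lane) {
            memcpy(dst + 8 * lane, src + 8 * index[lane], 8);   // one u64 lane
        }
    }

Gathering on u64 lanes instead of bytes cuts the number of gathered elements by 8, which is why this variant can use a single vrgatherei16 where the vl128 variant shuffles register parts by hand.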
static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl512(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
    float sumf = 0;

    // Indices for re-ordering IQ4 data.
    const uint16_t index[32] = {
         0,  1, 16, 17,
         2,  3, 18, 19,
         4,  5, 20, 21,
         6,  7, 22, 23,
         8,  9, 24, 25,
        10, 11, 26, 27,
        12, 13, 28, 29,
        14, 15, 30, 31,
    };
    const vuint16m1_t i_vec = __riscv_vle16_v_u16m1(index, 32);

    for (int ibl = 0; ibl < nb; ++ibl) {
        const int8_t * q8 = y[ibl].qs;
        const uint8_t * iq4 = x[ibl].qs;
        uint16_t h = x[ibl].scales_h;

        int sumi = 0;

#pragma GCC unroll 1
        // Process the entire super-block together.
        for (int ib = 0; ib < QK_K / 256; ++ib) {
            // Weights and activations.
            const vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 128);
            iq4 += 128;

            // Unpack the weight blocks.
            const vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 128);
            const vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 128);
            const vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
            const vuint8m4_t iq4bits_reorder = __riscv_vreinterpret_v_u64m4_u8m4(__riscv_vrgatherei16_vv_u64m4(__riscv_vreinterpret_v_u8m4_u64m4(iq4bits), i_vec, 32));
            const vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 256);

            __asm__ __volatile__("" ::: "memory");

            // Multiply with activations.
            const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 256);
            const vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 256);
            q8 += 256;

            // Reduce separately.
            const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc4 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 4), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc5 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 5), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc6 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 6), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc7 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 7), __riscv_vmv_v_x_i32m1(0, 1), 32));

            const int ls0 = ((x[ibl].scales_l[0] & 0xf) | ((h << 4) & 0x30)) - 32;
            const int ls1 = ((x[ibl].scales_l[0] >> 4) | ((h << 2) & 0x30)) - 32;
            const int ls2 = ((x[ibl].scales_l[1] & 0xf) | ((h << 0) & 0x30)) - 32;
            const int ls3 = ((x[ibl].scales_l[1] >> 4) | ((h >> 2) & 0x30)) - 32;
            h >>= 8;
            const int ls4 = ((x[ibl].scales_l[2] & 0xf) | ((h << 4) & 0x30)) - 32;
            const int ls5 = ((x[ibl].scales_l[2] >> 4) | ((h << 2) & 0x30)) - 32;
            const int ls6 = ((x[ibl].scales_l[3] & 0xf) | ((h << 0) & 0x30)) - 32;
            const int ls7 = ((x[ibl].scales_l[3] >> 4) | ((h >> 2) & 0x30)) - 32;

            sumi += acc0 * ls0;
            sumi += acc1 * ls1;
            sumi += acc2 * ls2;
            sumi += acc3 * ls3;
            sumi += acc4 * ls4;
            sumi += acc5 * ls5;
            sumi += acc6 * ls6;
            sumi += acc7 * ls7;

            __asm__ __volatile__("" ::: "memory");
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi);
    }

    *s = sumf;
}
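The ls0..ls7 expressions decode the eight 6-bit signed sub-block scales from their split 4+2-bit encoding; the same arithmetic restated as a standalone loop (hypothetical helper name, not part of the commit):

    // Decode the 8 six-bit signed scales of one IQ4_XS super-block: the low
    // 4 bits come from scales_l (two scales per byte), the high 2 bits from
    // scales_h (two bits per sub-block).
    static inline void iq4_xs_decode_scales(const uint8_t scales_l[4], uint16_t scales_h, int ls[8]) {
        for (int ib = 0; ib < 8; ++ib) {
            const int lo = (scales_l[ib / 2] >> (4 * (ib % 2))) & 0xf;
            const int hi = (scales_h >> (2 * ib)) & 0x3;
            ls[ib] = ((hi << 4) | lo) - 32;   // signed range [-32, 31]
        }
    }

The kernel inlines this per pair of sub-blocks (shifting h by 8 halfway through) so the bit extraction stays in scalar registers alongside the vector reductions.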
static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl1024(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    const vint8m2_t values = __riscv_vle8_v_i8m2(kvalues_iq4nl, 16);
    float sumf = 0;

    // Indices for re-ordering IQ4 data.
    const uint16_t index[32] = {
         0,  1, 16, 17,
         2,  3, 18, 19,
         4,  5, 20, 21,
         6,  7, 22, 23,
         8,  9, 24, 25,
        10, 11, 26, 27,
        12, 13, 28, 29,
        14, 15, 30, 31,
    };
    const vuint16mf2_t i_vec = __riscv_vle16_v_u16mf2(index, 32);

    for (int ibl = 0; ibl < nb; ++ibl) {
        const int8_t * q8 = y[ibl].qs;
        const uint8_t * iq4 = x[ibl].qs;
        uint16_t h = x[ibl].scales_h;

        int sumi = 0;

#pragma GCC unroll 1
        // Process the entire super-block together.
        for (int ib = 0; ib < QK_K / 256; ++ib) {
            // Weights and activations.
            const vuint8m1_t iq4_packed = __riscv_vle8_v_u8m1(iq4, 128);
            iq4 += 128;

            // Unpack the weight blocks.
            const vuint8m1_t iq4bits_lo = __riscv_vand_vx_u8m1(iq4_packed, 0xf, 128);
            const vuint8m1_t iq4bits_hi = __riscv_vsrl_vx_u8m1(iq4_packed, 4, 128);
            const vuint8m2_t iq4bits = __riscv_vcreate_v_u8m1_u8m2(iq4bits_lo, iq4bits_hi);
            const vuint8m2_t iq4bits_reorder = __riscv_vreinterpret_v_u64m2_u8m2(__riscv_vrgatherei16_vv_u64m2(__riscv_vreinterpret_v_u8m2_u64m2(iq4bits), i_vec, 32));
            const vint8m2_t iq4b = __riscv_vrgather_vv_i8m2(values, iq4bits_reorder, 256);

            __asm__ __volatile__("" ::: "memory");

            // Multiply with activations.
            const vint8m2_t q8b = __riscv_vle8_v_i8m2(q8, 256);
            const vint16m4_t prod = __riscv_vwmul_vv_i16m4(iq4b, q8b, 256);
            q8 += 256;

            // Mask for processing 32 elements per prod register.
            const vuint16m1_t p_index = __riscv_vid_v_u16m1(64);
            const vbool16_t p_mask = __riscv_vmsgtu_vx_u16m1_b16(p_index, 31, 64);

            // Reduce separately.
            const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(          __riscv_vget_v_i16m4_i16m1(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 64));
            const int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(          __riscv_vget_v_i16m4_i16m1(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 64));
            const int acc4 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(          __riscv_vget_v_i16m4_i16m1(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc5 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 64));
            const int acc6 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(          __riscv_vget_v_i16m4_i16m1(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));
            const int acc7 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 64));

            const int ls0 = ((x[ibl].scales_l[0] & 0xf) | ((h << 4) & 0x30)) - 32;
            const int ls1 = ((x[ibl].scales_l[0] >> 4) | ((h << 2) & 0x30)) - 32;
            const int ls2 = ((x[ibl].scales_l[1] & 0xf) | ((h << 0) & 0x30)) - 32;
            const int ls3 = ((x[ibl].scales_l[1] >> 4) | ((h >> 2) & 0x30)) - 32;
            h >>= 8;
            const int ls4 = ((x[ibl].scales_l[2] & 0xf) | ((h << 4) & 0x30)) - 32;
            const int ls5 = ((x[ibl].scales_l[2] >> 4) | ((h << 2) & 0x30)) - 32;
            const int ls6 = ((x[ibl].scales_l[3] & 0xf) | ((h << 0) & 0x30)) - 32;
            const int ls7 = ((x[ibl].scales_l[3] >> 4) | ((h >> 2) & 0x30)) - 32;

            sumi += acc0 * ls0;
            sumi += acc1 * ls1;
            sumi += acc2 * ls2;
            sumi += acc3 * ls3;
            sumi += acc4 * ls4;
            sumi += acc5 * ls5;
            sumi += acc6 * ls6;
            sumi += acc7 * ls7;

            __asm__ __volatile__("" ::: "memory");
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi);
    }

    *s = sumf;
}
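At VLEN=1024 a single i16m1 register already holds 64 products, i.e. two 32-element sub-blocks, so this variant splits each register with a mask rather than with vget. In isolation the pattern looks like the following minimal sketch (hypothetical helper name; assumes VLEN >= 1024 and uses only intrinsics that appear in the kernel above):

    #include <riscv_vector.h>

    // Sum elements [0,32) and [32,64) of one i16m1 vector separately: the
    // low half with a plain widening reduction at vl = 32, the high half
    // with a masked reduction over indices > 31 at vl = 64.
    static inline void sum_halves_i16m1(vint16m1_t v, int * lo_sum, int * hi_sum) {
        const vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, 1);
        *lo_sum = __riscv_vmv_x_s_i32m1_i32(
            __riscv_vwredsum_vs_i16m1_i32m1(v, zero, 32));
        const vbool16_t upper = __riscv_vmsgtu_vx_u16m1_b16(__riscv_vid_v_u16m1(64), 31, 64);
        *hi_sum = __riscv_vmv_x_s_i32m1_i32(
            __riscv_vwredsum_vs_i16m1_i32m1_m(upper, v, zero, 64));
    }

The masked form keeps both sub-block sums in the same architectural register, avoiding the extra register-group extraction the vl512 variant performs eight times per super-block.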
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined __riscv_v_intrinsic
    switch (__riscv_vlenb() * 8) {
        case 128:
            ggml_vec_dot_iq4_xs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
            break;
        case 256:
            ggml_vec_dot_iq4_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
            break;
        case 512:
            ggml_vec_dot_iq4_xs_q8_K_vl512(n, s, bs, vx, bx, vy, by, nrc);
            break;
        case 1024:
            ggml_vec_dot_iq4_xs_q8_K_vl1024(n, s, bs, vx, bx, vy, by, nrc);
            break;
        default:
            ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
            break;
    }
#else
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
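__riscv_vlenb() reads the vlenb CSR (VLEN in bytes), which is fixed for a given machine, so the switch could equally be resolved once at startup instead of per call. A sketch of that alternative (hypothetical helper and typedef, not part of the commit):

    // Resolve the VLEN-specific kernel once; the restrict qualifiers on the
    // real signatures do not affect function-pointer compatibility.
    typedef void (*ggml_vec_dot_iq4_fn)(int, float *, size_t, const void *, size_t, const void *, size_t, int);

    static ggml_vec_dot_iq4_fn resolve_iq4_xs_kernel(void) {
    #if defined __riscv_v_intrinsic
        switch (__riscv_vlenb() * 8) {
            case 128:  return ggml_vec_dot_iq4_xs_q8_K_vl128;
            case 256:  return ggml_vec_dot_iq4_xs_q8_K_vl256;
            case 512:  return ggml_vec_dot_iq4_xs_q8_K_vl512;
            case 1024: return ggml_vec_dot_iq4_xs_q8_K_vl1024;
        }
    #endif
        return ggml_vec_dot_iq4_xs_q8_K_generic;
    }

The per-call switch the commit uses keeps the public entry point self-contained, at the cost of one predictable branch per dot product.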
