@@ -1954,3 +1954,367 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 #endif
 }
 
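+// RISC-V Vector (RVV) kernels for the IQ4_XS x Q8_K dot product.
+// One specialization per supported VLEN (128/256/512/1024 bits);
+// ggml_vec_dot_iq4_xs_q8_K dispatches on __riscv_vlenb() at runtime and
+// falls back to the generic implementation for any other vector length.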
+static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl128(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K   * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
+    float sumf = 0;
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const int8_t  * q8  = y[ibl].qs;
+        const uint8_t * iq4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        // We process 2 sub-blocks together.
+        int sumi1 = 0, sumi2 = 0;
+        #pragma GCC unroll 1
+        for (int ib = 0; ib < QK_K / 64; ++ib) {
+            // Load the packed weights.
+            const vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 32);
+            iq4 += 32;
+
+            // Unpack the weight blocks.
+            const vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 32);
+            const vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 32);
+            const vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
+            const vuint8m4_t iq4bits_reorder = __riscv_vcreate_v_u8m1_u8m4(
+                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 0), 16),
+                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 2), 16),
+                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 1), 16),
+                __riscv_vmv_v_v_u8m1(__riscv_vget_v_u8m4_u8m1(iq4bits, 3), 16)
+            );
+            const vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 64);
+
+            // Multiply with activations.
+            const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 64);
+            q8 += 64;
+            const vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 64);
+
+            // Reduce separately.
+            const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m4_i32m1(__riscv_vget_v_i16m8_i16m4(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
+
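+            // Decode the two 6-bit sub-block scales: the low 4 bits come from the
+            // packed scales_l nibble, the top 2 bits from scales_h, with a bias of -32.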
+            const int ls1 = ((x[ibl].scales_l[ib] & 0xf) | ((h << 4) & 0x30)) - 32;
+            const int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32;
+            h >>= 4;
+
+            sumi1 += acc0 * ls1;
+            sumi2 += acc1 * ls2;
+
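+            // Compiler barrier (memory clobber): keeps the compiler from reordering
+            // memory accesses across iterations, likely to limit register pressure.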
+            __asm__ __volatile__("" ::: "memory");
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+    }
+
+    *s = sumf;
+}
+
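+// At VLEN >= 256 the nibble reorder is done with a single vrgatherei16 over 64-bit
+// lanes; this variant processes 128 weights (four sub-blocks) per loop iteration.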
+static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl256(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K   * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
+    float sumf = 0;
+
+    // Indices for re-ordering IQ4 data.
+    uint16_t index[16] = {
+        0, 1,  8,  9,
+        2, 3, 10, 11,
+        4, 5, 12, 13,
+        6, 7, 14, 15,
+    };
+    vuint16m1_t i_vec = __riscv_vle16_v_u16m1(index, 16);
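+    // Each index selects an 8-byte chunk of the unpacked nibbles; the pattern
+    // interleaves 16-byte runs of low and high nibbles so the gathered weights
+    // end up in the same order as the q8 values.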
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const int8_t  * q8  = y[ibl].qs;
+        const uint8_t * iq4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
+
+        #pragma GCC unroll 1
+        for (int ib = 0; ib < QK_K / 128; ++ib) {
+            // Weights and activations.
+            vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 64);
+            iq4 += 64;
+
+            // Unpack the weight blocks.
+            vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 64);
+            vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 64);
+            vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
+            vuint8m4_t iq4bits_reorder = __riscv_vreinterpret_v_u64m4_u8m4(__riscv_vrgatherei16_vv_u64m4(__riscv_vreinterpret_v_u8m4_u64m4(iq4bits), i_vec, 16));
+            vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 128);
+
+            __asm__ __volatile__("" ::: "memory");
+
+            // Multiply with activations.
+            vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 128);
+            vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 128);
+            q8 += 128;
+
+            __asm__ __volatile__("" ::: "memory");
+
+            // Reduce separately.
+            int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m2_i32m1(__riscv_vget_v_i16m8_i16m2(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));
+
+            int ls1 = ((x[ibl].scales_l[ib * 2 + 0] & 0xf) | ((h << 4) & 0x30)) - 32;
+            int ls2 = ((x[ibl].scales_l[ib * 2 + 0] >> 4) | ((h << 2) & 0x30)) - 32;
+            int ls3 = ((x[ibl].scales_l[ib * 2 + 1] & 0xf) | ((h << 0) & 0x30)) - 32;
+            int ls4 = ((x[ibl].scales_l[ib * 2 + 1] >> 4) | ((h >> 2) & 0x30)) - 32;
+            h >>= 8;
+
+            sumi1 += acc0 * ls1;
+            sumi2 += acc1 * ls2;
+            sumi3 += acc2 * ls3;
+            sumi4 += acc3 * ls4;
+
+            __asm__ __volatile__("" ::: "memory");
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2 + sumi3 + sumi4);
+    }
+
+    *s = sumf;
+}
+
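+// At VLEN 512 an entire 256-weight super-block is handled per iteration; each of
+// the eight i16m1 parts of the widened product holds one 32-weight sub-block.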
+static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl512(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K   * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    const vint8m4_t values = __riscv_vle8_v_i8m4(kvalues_iq4nl, 16);
+    float sumf = 0;
+
+    // Indices for re-ordering IQ4 data.
+    const uint16_t index[32] = {
+         0,  1, 16, 17,
+         2,  3, 18, 19,
+         4,  5, 20, 21,
+         6,  7, 22, 23,
+         8,  9, 24, 25,
+        10, 11, 26, 27,
+        12, 13, 28, 29,
+        14, 15, 30, 31,
+    };
+    const vuint16m1_t i_vec = __riscv_vle16_v_u16m1(index, 32);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const int8_t  * q8  = y[ibl].qs;
+        const uint8_t * iq4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi = 0;
+
+        // Process the entire super-block together.
+        #pragma GCC unroll 1
+        for (int ib = 0; ib < QK_K / 256; ++ib) {
+            // Weights and activations.
+            const vuint8m2_t iq4_packed = __riscv_vle8_v_u8m2(iq4, 128);
+            iq4 += 128;
+
+            // Unpack the weight blocks.
+            const vuint8m2_t iq4bits_lo = __riscv_vand_vx_u8m2(iq4_packed, 0xf, 128);
+            const vuint8m2_t iq4bits_hi = __riscv_vsrl_vx_u8m2(iq4_packed, 4, 128);
+            const vuint8m4_t iq4bits = __riscv_vcreate_v_u8m2_u8m4(iq4bits_lo, iq4bits_hi);
+            const vuint8m4_t iq4bits_reorder = __riscv_vreinterpret_v_u64m4_u8m4(__riscv_vrgatherei16_vv_u64m4(__riscv_vreinterpret_v_u8m4_u64m4(iq4bits), i_vec, 32));
+            const vint8m4_t iq4b = __riscv_vrgather_vv_i8m4(values, iq4bits_reorder, 256);
+
+            __asm__ __volatile__("" ::: "memory");
+
+            // Multiply with activations.
+            const vint8m4_t q8b = __riscv_vle8_v_i8m4(q8, 256);
+            const vint16m8_t prod = __riscv_vwmul_vv_i16m8(iq4b, q8b, 256);
+            q8 += 256;
+
+            // Reduce separately.
+            const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc4 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 4), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc5 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 5), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc6 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 6), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc7 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1(__riscv_vget_v_i16m8_i16m1(prod, 7), __riscv_vmv_v_x_i32m1(0, 1), 32));
+
+            const int ls0 = ((x[ibl].scales_l[0] & 0xf) | ((h << 4) & 0x30)) - 32;
+            const int ls1 = ((x[ibl].scales_l[0] >> 4) | ((h << 2) & 0x30)) - 32;
+            const int ls2 = ((x[ibl].scales_l[1] & 0xf) | ((h << 0) & 0x30)) - 32;
+            const int ls3 = ((x[ibl].scales_l[1] >> 4) | ((h >> 2) & 0x30)) - 32;
+            h >>= 8;
+            const int ls4 = ((x[ibl].scales_l[2] & 0xf) | ((h << 4) & 0x30)) - 32;
+            const int ls5 = ((x[ibl].scales_l[2] >> 4) | ((h << 2) & 0x30)) - 32;
+            const int ls6 = ((x[ibl].scales_l[3] & 0xf) | ((h << 0) & 0x30)) - 32;
+            const int ls7 = ((x[ibl].scales_l[3] >> 4) | ((h >> 2) & 0x30)) - 32;
+
+            sumi += acc0 * ls0;
+            sumi += acc1 * ls1;
+            sumi += acc2 * ls2;
+            sumi += acc3 * ls3;
+            sumi += acc4 * ls4;
+            sumi += acc5 * ls5;
+            sumi += acc6 * ls6;
+            sumi += acc7 * ls7;
+
+            __asm__ __volatile__("" ::: "memory");
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * sumi;
+    }
+
+    *s = sumf;
+}
+
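+// At VLEN 1024 the same super-block needs only half the register group width:
+// LMUL=2 for the 8-bit data and LMUL=4 for the widened 16-bit products.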
+static NOINLINE void ggml_vec_dot_iq4_xs_q8_K_vl1024(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_K == 0);
+
+    const block_iq4_xs * GGML_RESTRICT x = vx;
+    const block_q8_K   * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+    const vint8m2_t values = __riscv_vle8_v_i8m2(kvalues_iq4nl, 16);
+    float sumf = 0;
+
+    // Indices for re-ordering IQ4 data.
+    const uint16_t index[32] = {
+         0,  1, 16, 17,
+         2,  3, 18, 19,
+         4,  5, 20, 21,
+         6,  7, 22, 23,
+         8,  9, 24, 25,
+        10, 11, 26, 27,
+        12, 13, 28, 29,
+        14, 15, 30, 31,
+    };
+    const vuint16mf2_t i_vec = __riscv_vle16_v_u16mf2(index, 32);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const int8_t  * q8  = y[ibl].qs;
+        const uint8_t * iq4 = x[ibl].qs;
+        uint16_t h = x[ibl].scales_h;
+
+        int sumi = 0;
+
+        // Process the entire super-block together.
+        #pragma GCC unroll 1
+        for (int ib = 0; ib < QK_K / 256; ++ib) {
+            // Weights and activations.
+            const vuint8m1_t iq4_packed = __riscv_vle8_v_u8m1(iq4, 128);
+            iq4 += 128;
+
+            // Unpack the weight blocks.
+            const vuint8m1_t iq4bits_lo = __riscv_vand_vx_u8m1(iq4_packed, 0xf, 128);
+            const vuint8m1_t iq4bits_hi = __riscv_vsrl_vx_u8m1(iq4_packed, 4, 128);
+            const vuint8m2_t iq4bits = __riscv_vcreate_v_u8m1_u8m2(iq4bits_lo, iq4bits_hi);
+            const vuint8m2_t iq4bits_reorder = __riscv_vreinterpret_v_u64m2_u8m2(__riscv_vrgatherei16_vv_u64m2(__riscv_vreinterpret_v_u8m2_u64m2(iq4bits), i_vec, 32));
+            const vint8m2_t iq4b = __riscv_vrgather_vv_i8m2(values, iq4bits_reorder, 256);
+
+            __asm__ __volatile__("" ::: "memory");
+
+            // Multiply with activations.
+            const vint8m2_t q8b = __riscv_vle8_v_i8m2(q8, 256);
+            const vint16m4_t prod = __riscv_vwmul_vv_i16m4(iq4b, q8b, 256);
+            q8 += 256;
+
+            // Mask for processing 32 elements per prod register.
+            const vuint16m1_t p_index = __riscv_vid_v_u16m1(64);
+            const vbool16_t p_mask = __riscv_vmsgtu_vx_u16m1_b16(p_index, 31, 64);
+
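+            // Each i16m1 part of prod holds 64 products (two sub-blocks): the unmasked
+            // reduction sums elements 0..31, the masked one sums elements 32..63.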
+            // Reduce separately.
+            const int acc0 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1  (        __riscv_vget_v_i16m4_i16m1(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc1 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 0), __riscv_vmv_v_x_i32m1(0, 1), 64));
+            const int acc2 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1  (        __riscv_vget_v_i16m4_i16m1(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc3 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 1), __riscv_vmv_v_x_i32m1(0, 1), 64));
+            const int acc4 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1  (        __riscv_vget_v_i16m4_i16m1(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc5 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 2), __riscv_vmv_v_x_i32m1(0, 1), 64));
+            const int acc6 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1  (        __riscv_vget_v_i16m4_i16m1(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 32));
+            const int acc7 = __riscv_vmv_x_s_i32m1_i32(__riscv_vwredsum_vs_i16m1_i32m1_m(p_mask, __riscv_vget_v_i16m4_i16m1(prod, 3), __riscv_vmv_v_x_i32m1(0, 1), 64));
+
+            const int ls0 = ((x[ibl].scales_l[0] & 0xf) | ((h << 4) & 0x30)) - 32;
+            const int ls1 = ((x[ibl].scales_l[0] >> 4) | ((h << 2) & 0x30)) - 32;
+            const int ls2 = ((x[ibl].scales_l[1] & 0xf) | ((h << 0) & 0x30)) - 32;
+            const int ls3 = ((x[ibl].scales_l[1] >> 4) | ((h >> 2) & 0x30)) - 32;
+            h >>= 8;
+            const int ls4 = ((x[ibl].scales_l[2] & 0xf) | ((h << 4) & 0x30)) - 32;
+            const int ls5 = ((x[ibl].scales_l[2] >> 4) | ((h << 2) & 0x30)) - 32;
+            const int ls6 = ((x[ibl].scales_l[3] & 0xf) | ((h << 0) & 0x30)) - 32;
+            const int ls7 = ((x[ibl].scales_l[3] >> 4) | ((h >> 2) & 0x30)) - 32;
+
+            sumi += acc0 * ls0;
+            sumi += acc1 * ls1;
+            sumi += acc2 * ls2;
+            sumi += acc3 * ls3;
+            sumi += acc4 * ls4;
+            sumi += acc5 * ls5;
+            sumi += acc6 * ls6;
+            sumi += acc7 * ls7;
+
+            __asm__ __volatile__("" ::: "memory");
+        }
+
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * sumi;
+    }
+
+    *s = sumf;
+}
+
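+// Select the kernel matching the hardware vector length at runtime; other vector
+// lengths fall back to the generic implementation.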
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+#if defined __riscv_v_intrinsic
+    switch (__riscv_vlenb() * 8) {
+        case 128:
+            ggml_vec_dot_iq4_xs_q8_K_vl128(n, s, bs, vx, bx, vy, by, nrc);
+            break;
+        case 256:
+            ggml_vec_dot_iq4_xs_q8_K_vl256(n, s, bs, vx, bx, vy, by, nrc);
+            break;
+        case 512:
+            ggml_vec_dot_iq4_xs_q8_K_vl512(n, s, bs, vx, bx, vy, by, nrc);
+            break;
+        case 1024:
+            ggml_vec_dot_iq4_xs_q8_K_vl1024(n, s, bs, vx, bx, vy, by, nrc);
+            break;
+        default:
+            ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+            break;
+    }
+#else
+    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}