@@ -665,6 +665,97 @@ void ggml_gemv_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
665665 __riscv_vse32_v_f32m2 (s + col_tile, v_sumf, vl);
666666 }
667667}
668+
669+
670+ template <int ncols_interleaved>
671+ static inline void ggml_gemv_f16_1xM_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
672+ GGML_UNUSED (bs);
673+
674+ const int nb = n / 1 ;
675+
676+ assert (nr == 1 );
677+ assert (n % 1 == 0 );
678+ assert (nc % ncols_interleaved == 0 );
679+
680+ const _Float16 * a_ptr = (const _Float16 *) vy;
681+ for (int x = 0 ; x < nc / ncols_interleaved; x++) {
682+ const block_f16<ncols_interleaved, 1 > * b_ptr = (const block_f16<ncols_interleaved, 1 > *) vx + (x * nb);
683+
684+ // Accumulators
685+ vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
686+
687+ for (int l = 0 ; l < nb; l++) {
688+ vfloat16m2_t b_0 = __riscv_vle16_v_f16m2 ((const _Float16 *)&b_ptr[l].d [0 ], ncols_interleaved);
689+
690+ sumf_0 = __riscv_vfwmacc_vf_f32m4 (sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved);
691+ }
692+
693+ __riscv_vse32_v_f32m4 (&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
694+ }
695+
696+ return ;
697+ }
698+
699+ void ggml_gemv_f16_1x16_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
700+ ggml_gemv_f16_1xM_f16<16 >(n, s, bs, vx, vy, nr, nc);
701+ }
702+
703+ void ggml_gemv_f16_1x32_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
704+ ggml_gemv_f16_1xM_f16<32 >(n, s, bs, vx, vy, nr, nc);
705+ }
706+
707+ void ggml_gemv_f16_1x64_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
708+ ggml_gemv_f16_1xM_f16<64 >(n, s, bs, vx, vy, nr, nc);
709+ }
710+
711+ void ggml_gemv_f16_1x128_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
712+ ggml_gemv_f16_1xM_f16<128 >(n, s, bs, vx, vy, nr, nc);
713+ }
714+
715+ template <int ncols_interleaved>
716+ static inline void ggml_gemv_f32_1xM_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
717+ GGML_UNUSED (bs);
718+
719+ const int nb = n / 1 ;
720+
721+ assert (nr == 1 );
722+ assert (n % 1 == 0 );
723+ assert (nc % ncols_interleaved == 0 );
724+
725+ const float * a_ptr = (const float *) vy;
726+ for (int x = 0 ; x < nc / ncols_interleaved; x++) {
727+ const block_f32<ncols_interleaved, 1 > * b_ptr = (const block_f32<ncols_interleaved, 1 > *) vx + (x * nb);
728+
729+ // Accumulators
730+ vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
731+
732+ for (int l = 0 ; l < nb; l++) {
733+ vfloat32m4_t b_0 = __riscv_vle32_v_f32m4 ((const float *)&b_ptr[l].d [0 ], ncols_interleaved);
734+
735+ sumf_0 = __riscv_vfmacc_vf_f32m4 (sumf_0, *(const float *)(&a_ptr[l]), b_0, ncols_interleaved);
736+ }
737+
738+ __riscv_vse32_v_f32m4 (&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
739+ }
740+
741+ return ;
742+ }
743+
744+ void ggml_gemv_f32_1x16_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
745+ ggml_gemv_f32_1xM_f32<16 >(n, s, bs, vx, vy, nr, nc);
746+ }
747+
748+ void ggml_gemv_f32_1x32_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
749+ ggml_gemv_f32_1xM_f32<32 >(n, s, bs, vx, vy, nr, nc);
750+ }
751+
752+ void ggml_gemv_f32_1x64_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
753+ ggml_gemv_f32_1xM_f32<64 >(n, s, bs, vx, vy, nr, nc);
754+ }
755+
756+ void ggml_gemv_f32_1x128_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
757+ ggml_gemv_f32_1xM_f32<128 >(n, s, bs, vx, vy, nr, nc);
758+ }
668759#endif
669760
670761void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1700,125 +1791,7 @@ void ggml_gemm_q2_K_16x1_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
17001791 }
17011792 }
17021793}
1703- #endif
1704-
1705- template <int ncols_interleaved>
1706- static inline void ggml_gemv_f16_1xM_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1707- const int nb = n / 1 ;
1708-
1709- assert (nr == 1 );
1710- assert (n % 1 == 0 );
1711- assert (nc % ncols_interleaved == 0 );
1712-
1713- const _Float16 * a_ptr = (const _Float16 *) vy;
1714- for (int x = 0 ; x < nc / ncols_interleaved; x++) {
1715- const block_f16<ncols_interleaved, 1 > * b_ptr = (const block_f16<ncols_interleaved, 1 > *) vx + (x * nb);
1716-
1717- // Accumulators
1718- vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1719-
1720- for (int l = 0 ; l < nb; l++) {
1721- vfloat16m2_t b_0 = __riscv_vle16_v_f16m2 ((const _Float16 *)&b_ptr[l].d [0 ], ncols_interleaved);
1722-
1723- sumf_0 = __riscv_vfwmacc_vf_f32m4 (sumf_0, *(const _Float16*)(&a_ptr[l]), b_0, ncols_interleaved);
1724- }
1725-
1726- __riscv_vse32_v_f32m4 (&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
1727- }
1728-
1729- return ;
1730- }
1731-
1732- void ggml_gemv_f16_1x16_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1733- #if defined __riscv_v_intrinsic
1734- ggml_gemv_f16_1xM_f16<16 >(n, s, bs, vx, vy, nr, nc);
1735- return ;
1736- #endif
1737- ggml_gemv_f16_1x16_f16_generic (n, s, bs, vx, vy, nr, nc);
1738- }
1739-
1740- void ggml_gemv_f16_1x32_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1741- #if defined __riscv_v_intrinsic
1742- ggml_gemv_f16_1xM_f16<32 >(n, s, bs, vx, vy, nr, nc);
1743- return ;
1744- #endif
1745- ggml_gemv_f16_1x32_f16_generic (n, s, bs, vx, vy, nr, nc);
1746- }
1747-
1748- void ggml_gemv_f16_1x64_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1749- #if defined __riscv_v_intrinsic
1750- ggml_gemv_f16_1xM_f16<64 >(n, s, bs, vx, vy, nr, nc);
1751- return ;
1752- #endif
1753- ggml_gemv_f16_1x64_f16_generic (n, s, bs, vx, vy, nr, nc);
1754- }
1755-
1756- void ggml_gemv_f16_1x128_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1757- #if defined __riscv_v_intrinsic
1758- ggml_gemv_f16_1xM_f16<128 >(n, s, bs, vx, vy, nr, nc);
1759- return ;
1760- #endif
1761- ggml_gemv_f16_1x128_f16_generic (n, s, bs, vx, vy, nr, nc);
1762- }
1763-
1764- template <int ncols_interleaved>
1765- static inline void ggml_gemv_f32_1xM_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1766- const int nb = n / 1 ;
1767-
1768- assert (nr == 1 );
1769- assert (n % 1 == 0 );
1770- assert (nc % ncols_interleaved == 0 );
1771-
1772- const float * a_ptr = (const float *) vy;
1773- for (int x = 0 ; x < nc / ncols_interleaved; x++) {
1774- const block_f32<ncols_interleaved, 1 > * b_ptr = (const block_f32<ncols_interleaved, 1 > *) vx + (x * nb);
1775-
1776- // Accumulators
1777- vfloat32m4_t sumf_0 = __riscv_vfmv_v_f_f32m4 (0 .0f , ncols_interleaved);
1778-
1779- for (int l = 0 ; l < nb; l++) {
1780- vfloat32m4_t b_0 = __riscv_vle32_v_f32m4 ((const float *)&b_ptr[l].d [0 ], ncols_interleaved);
1781-
1782- sumf_0 = __riscv_vfmacc_vf_f32m4 (sumf_0, *(const float *)(&a_ptr[l]), b_0, ncols_interleaved);
1783- }
1784-
1785- __riscv_vse32_v_f32m4 (&s[x * ncols_interleaved], sumf_0, ncols_interleaved);
1786- }
1787-
1788- return ;
1789- }
1790-
1791- void ggml_gemv_f32_1x16_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1792- #if defined __riscv_v_intrinsic
1793- ggml_gemv_f32_1xM_f32<16 >(n, s, bs, vx, vy, nr, nc);
1794- return ;
1795- #endif
1796- ggml_gemv_f32_1x16_f32_generic (n, s, bs, vx, vy, nr, nc);
1797- }
1798-
1799- void ggml_gemv_f32_1x32_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1800- #if defined __riscv_v_intrinsic
1801- ggml_gemv_f32_1xM_f32<32 >(n, s, bs, vx, vy, nr, nc);
1802- return ;
1803- #endif
1804- ggml_gemv_f32_1x32_f32_generic (n, s, bs, vx, vy, nr, nc);
1805- }
1806-
1807- void ggml_gemv_f32_1x64_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1808- #if defined __riscv_v_intrinsic
1809- ggml_gemv_f32_1xM_f32<64 >(n, s, bs, vx, vy, nr, nc);
1810- return ;
1811- #endif
1812- ggml_gemv_f32_1x64_f32_generic (n, s, bs, vx, vy, nr, nc);
1813- }
18141794
1815- void ggml_gemv_f32_1x128_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1816- #if defined __riscv_v_intrinsic
1817- ggml_gemv_f32_1xM_f32<128 >(n, s, bs, vx, vy, nr, nc);
1818- return ;
1819- #endif
1820- ggml_gemv_f32_1x128_f32_generic (n, s, bs, vx, vy, nr, nc);
1821- }
18221795
18231796template <int ncols_interleaved>
18241797static inline void ggml_gemm_f16_7x1xM_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
@@ -1867,35 +1840,19 @@ static inline void ggml_gemm_f16_7x1xM_f16(int n, float * GGML_RESTRICT s, size_
18671840}
18681841
18691842void ggml_gemm_f16_7x1x16_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1870- #if defined __riscv_v_intrinsic
18711843 ggml_gemm_f16_7x1xM_f16<16 >(n, s, bs, vx, vy, nr, nc);
1872- return ;
1873- #endif
1874- ggml_gemm_f16_7x1x16_f16_generic (n, s, bs, vx, vy, nr, nc);
18751844}
18761845
18771846void ggml_gemm_f16_7x1x32_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1878- #if defined __riscv_v_intrinsic
18791847 ggml_gemm_f16_7x1xM_f16<32 >(n, s, bs, vx, vy, nr, nc);
1880- return ;
1881- #endif
1882- ggml_gemm_f16_7x1x32_f16_generic (n, s, bs, vx, vy, nr, nc);
18831848}
18841849
18851850void ggml_gemm_f16_7x1x64_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1886- #if defined __riscv_v_intrinsic
18871851 ggml_gemm_f16_7x1xM_f16<64 >(n, s, bs, vx, vy, nr, nc);
1888- return ;
1889- #endif
1890- ggml_gemm_f16_7x1x64_f16_generic (n, s, bs, vx, vy, nr, nc);
18911852}
18921853
18931854void ggml_gemm_f16_7x1x128_f16 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1894- #if defined __riscv_v_intrinsic
18951855 ggml_gemm_f16_7x1xM_f16<128 >(n, s, bs, vx, vy, nr, nc);
1896- return ;
1897- #endif
1898- ggml_gemm_f16_7x1x128_f16_generic (n, s, bs, vx, vy, nr, nc);
18991856}
19001857
19011858template <int ncols_interleaved>
@@ -1945,33 +1902,18 @@ static inline void ggml_gemm_f32_7x1xM_f32(int n, float * GGML_RESTRICT s, size_
19451902}
19461903
19471904void ggml_gemm_f32_7x1x16_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1948- #if defined __riscv_v_intrinsic
19491905 ggml_gemm_f32_7x1xM_f32<16 >(n, s, bs, vx, vy, nr, nc);
1950- return ;
1951- #endif
1952- ggml_gemm_f32_7x1x16_f32_generic (n, s, bs, vx, vy, nr, nc);
19531906}
19541907
19551908void ggml_gemm_f32_7x1x32_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1956- #if defined __riscv_v_intrinsic
19571909 ggml_gemm_f32_7x1xM_f32<32 >(n, s, bs, vx, vy, nr, nc);
1958- return ;
1959- #endif
1960- ggml_gemm_f32_7x1x32_f32_generic (n, s, bs, vx, vy, nr, nc);
19611910}
19621911
19631912void ggml_gemm_f32_7x1x64_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1964- #if defined __riscv_v_intrinsic
19651913 ggml_gemm_f32_7x1xM_f32<64 >(n, s, bs, vx, vy, nr, nc);
1966- return ;
1967- #endif
1968- ggml_gemm_f32_7x1x64_f32_generic (n, s, bs, vx, vy, nr, nc);
19691914}
19701915
19711916void ggml_gemm_f32_7x1x128_f32 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
1972- #if defined __riscv_v_intrinsic
19731917 ggml_gemm_f32_7x1xM_f32<128 >(n, s, bs, vx, vy, nr, nc);
1974- return ;
1975- #endif
1976- ggml_gemm_f32_7x1x128_f32_generic (n, s, bs, vx, vy, nr, nc);
19771918}
1919+ #endif
0 commit comments