add some code

2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s16_aes3.S
@@ -0,0 +1,398 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_61, 458755
+
+    # Program Unit: dspi_dotprod_off_s16_aes3
+    .type	dspi_dotprod_off_s16_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_off_s16_aes3
+dspi_dotprod_off_s16_aes3:	# 0x4  args: a2=in_image a3=filter a4=out_value a5=count_x a6=count_y a7=shift, offset read from a1+128
+.LBB1_dspi_dotprod_off_s16_aes3:	# 0x4
+    entry	a1,128                  	#  windowed-ABI prologue, 128-byte frame
+    l32i.n	a10,a2,4               	# [0]  id:760  a10 = in_image->step_x
+    l32i.n	a12,a2,12              	# [1]  id:759  a12 = in_image->stride_x
+    mull	a8,a10,a5                	# [2]  a8 = step_x * count_x
+    blt	a12,a8,.LBB83_dspi_dotprod_off_s16_aes3 	# [4]  stride_x < step_x*count_x -> error return

+    l32i.n	a13,a2,8               	# [0]  id:761  a13 = in_image->step_y
+    l32i.n	a9,a2,16               	# [1]  id:762  a9 = in_image field at offset 16 (presumably stride_y -- confirm against image2d_t)
+    mull	a11,a13,a6               	# [2]  a11 = step_y * count_y
+    blt	a9,a11,.LBB83_dspi_dotprod_off_s16_aes3 	# [4]  too small for step_y*count_y -> error return

+    l32i.n	a15,a3,4               	# [0]  id:764  a15 = filter->step_x
+    l32i.n	a14,a3,12              	# [1]  id:763  a14 = filter->stride_x
+    mull	a11,a15,a5               	# [2]  a11 = filter step_x * count_x (reused by the bne below)
+    blt	a14,a11,.LBB83_dspi_dotprod_off_s16_aes3 	# [4]  filter row too short -> error return

+    l32i.n	a8,a3,16               	# [0]  id:766  a8 = filter field at offset 16 (presumably stride_y)
+    l32i.n	a9,a3,8                	# [1]  id:765  a9 = filter->step_y
+    s32i	a9,a1,88                 	# [2]  gra_spill_temp_2  keep filter step_y for the check below
+    mull	a9,a9,a6                 	# [3]  a9 = filter step_y * count_y
+    blt	a8,a9,.LBB83_dspi_dotprod_off_s16_aes3 	# [5]  too small -> error return

+    l32i.n	a8,a3,0                	# [0]  id:767  a8 = filter->data
+    s32i	a8,a1,84                 	# [1]  gra_spill_temp_1  keep filter data pointer
+    bbsi	a8,0,.Lt_0_36354         	# [2]  bit 0 of the filter data pointer set -> ANSI fallback

+    bne	a14,a11,.Lt_0_36354       	# [0]  filter stride_x != step_x*count_x -> ANSI fallback

+    bnei	a15,1,.Lt_0_36354        	# [0]  filter step_x != 1 -> ANSI fallback

+    l32i	a9,a1,88                 	# [0]  gra_spill_temp_2  reload filter step_y
+    beqi	a9,1,.Lt_0_19458         	# [2]  filter step_y == 1 -> continue with the fast-path checks

+.Lt_0_36354:	# 0x46  ANSI fallback: forward all seven arguments unchanged
+.Lt_0_19714:	# 0x46
+    mov.n	a10,a2                  	# [0]  in_image
+    mov.n	a11,a3                  	# [1]  filter
+    mov.n	a12,a4                  	# [2]  out_value
+    mov.n	a13,a5                  	# [3]  count_x
+    mov.n	a14,a6                  	# [4]  count_y
+    mov.n	a15,a7                  	# [5]  shift
+    l16si	a8,a1,128               	# [6]  id:768 offset+0x0  a8 = offset, sign-extended 16-bit load from above this frame
+    s32i.n	a8,a1,0                	# [7]  id:875  store offset at the bottom of this frame for the callee
+    .type	dspi_dotprod_off_s16_ansi, @function
+    call8	dspi_dotprod_off_s16_ansi 	# [8]  dspi_dotprod_off_s16_ansi

+    mov.n	a2,a10                  	# [0]  propagate the callee's return value
+    retw.n                        	# [1]  

+.LBB83_dspi_dotprod_off_s16_aes3:	# 0x5e  range-check failure
+    l32r	a2,.LC0_1_61             	# [0]  return literal 458755 (0x70003); presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, which the ANSI version returns for the same checks
+    retw.n                        	# [1]  
+
+.Lt_0_19458:	# 0x63  remaining fast-path conditions (a10/a13 still hold in_image step_x/step_y)
+    addi.n	a9,a10,-1              	# [0]  
+    bnez	a9,.Lt_0_37122           	# [1]  in_image step_x != 1 -> ANSI fallback

+    addi.n	a10,a13,-1             	# [0]  
+    bnez	a10,.Lt_0_37122          	# [1]  in_image step_y != 1 -> ANSI fallback

+    extui	a11,a5,0,3              	# [0]  a11 = count_x & 7
+    bnez.n	a11,.Lt_0_37122        	# [1]  count_x not a multiple of 8 -> ANSI fallback

+    blti	a6,4,.Lt_0_37122         	# [0]  count_y < 4 -> ANSI fallback

+    movi.n	a14,32                 	# [0]  
+    blt	a14,a5,.LBB27_dspi_dotprod_off_s16_aes3 	# [1]  count_x > 32 -> extra check at .LBB27
+
+.Lt_0_37634:	# 0x7a  vector path setup
+.Lt_0_21506:	# 0x7a
+    l32i	a15,a1,84                	# [0]  gra_spill_temp_1  a15 = filter->data
+    l32i.n	a2,a2,0                	# [1]  id:769  a2 = in_image->data (in_image pointer no longer needed)
+    l16si	a9,a1,128               	# [2]  id:768 offset+0x0  a9 = offset
+    mull	a10,a12,a13              	# [3]  a10 = in_image stride_x * step_y
+    addi	a8,a1,16                 	# [4]  temp_offset  a8 = start of the on-stack offset buffer
+    slli	a10,a10,1                	# [5]  convert to bytes (2 bytes per int16)
+    s32i	a10,a1,80                	# [6]  gra_spill_temp_0  input row pitch in bytes, used by every kernel
+    movi.n	a10,2                  	# [7]  
+    # loop-count fixed at 2
+    loop	a10,.LBB137_dspi_dotprod_off_s16_aes3 	# [8]  2 iterations x 8 stores = 16 copies of offset at a1+16..a1+47

+.LBB132_dspi_dotprod_off_s16_aes3:	# 0x93
+    s16i	a9,a8,0                  	# [0*II+0]  id:770 temp_offset+0x0
+    s16i	a9,a8,2                  	# [0*II+1]  id:770 temp_offset+0x0
+    s16i	a9,a8,4                  	# [0*II+2]  id:770 temp_offset+0x0
+    s16i	a9,a8,6                  	# [0*II+3]  id:770 temp_offset+0x0
+    s16i	a9,a8,8                  	# [0*II+4]  id:770 temp_offset+0x0
+    s16i	a9,a8,10                 	# [0*II+5]  id:770 temp_offset+0x0
+    s16i	a9,a8,12                 	# [0*II+6]  id:770 temp_offset+0x0
+    s16i	a9,a8,14                 	# [0*II+7]  id:770 temp_offset+0x0
+    addi	a8,a8,16                 	# [0*II+8]  

+.LBB137_dspi_dotprod_off_s16_aes3:	# 0xae
+    mov.n	a3,a6                   	# [0]  a3 = count_y (filter pointer no longer needed)
+    addi	a11,a5,-24               	# [1]  a11 = count_x - 24
+    addi	a12,a1,24                	# [3]  temp_offset+8
+    movi.n	a13,0                  	# [4]  
+    wur.sar_byte	a13              	# [5]  clear SAR_BYTE
+    wur.accx_0	a13                	# [6]  clear accumulator, low part
+    wur.accx_1	a13                	# [7]  clear accumulator, high part
+    ee.vld.128.ip	q6,a12,0        	# [8]  id:771  q6 = offset vector; NOTE(review): a12 is a1+24, presumably the load ignores the low address bits and reads a1+16 -- all 16 slots hold offset either way
+    s32i.n	a12,a1,48              	# [9]  offset_data_ptr
+    beqz	a11,.LBB34_dspi_dotprod_off_s16_aes3 	# [10]  count_x == 24 -> dedicated kernel
+
+.Lt_0_25602:	# 0xc8  width dispatch chain; each kernel jumps back into this chain when it finishes
+.Lt_0_25090:	# 0xc8
+    ee.vld.128.ip	q0,a15,16       	# [0]  id:786  q0 = first 128 bits of filter data, a15 += 16
+    addi	a14,a5,-16               	# [1]  
+    beqz	a14,.LBB40_dspi_dotprod_off_s16_aes3 	# [2]  count_x == 16 -> kernel

+.Lt_0_27138:	# 0xd1
+.Lt_0_26626:	# 0xd1
+    addi	a8,a5,-8                 	# [0]  
+    beqz	a8,.LBB46_dspi_dotprod_off_s16_aes3 	# [1]  count_x == 8 -> kernel

+.Lt_0_28674:	# 0xd7
+.Lt_0_28162:	# 0xd7
+    addi	a9,a5,-32                	# [0]  
+    beqz	a9,.LBB52_dspi_dotprod_off_s16_aes3 	# [1]  count_x == 32 -> kernel

+.Lt_0_30210:	# 0xdd
+.Lt_0_29698:	# 0xdd
+    addi	a10,a5,-64               	# [0]  
+    beqz	a10,.LBB58_dspi_dotprod_off_s16_aes3 	# [1]  count_x == 64 -> kernel

+    movi.n	a11,64                 	# [0]  
+    bge	a11,a5,.Lt_0_33026        	# [1]  count_x <= 64: nothing left to do here -> finish

+    movi.n	a12,0                  	# [0]  a12 = row counter for the generic (count_x > 64) loop
+    ee.ld.128.usar.ip	q1,a2,16    	# [1]  id:848  prime q1/q2/q3 with the first input vectors
+    ee.ld.128.usar.ip	q2,a2,16    	# [2]  id:849
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [4]  id:850
+    beqz.n	a3,.Lt_0_33026         	# [5]  count_y == 0 -> finish

+    slli	a8,a5,1                  	# [0]  a8 = count_x * 2 (row payload in bytes)
+    l32i	a14,a1,80                	# [1]  gra_spill_temp_0  a14 = input row pitch in bytes
+    addi	a13,a5,31                	# [2]  
+    movgez	a13,a5,a5              	# [3]  a13 = count_x when count_x >= 0, else count_x + 31
+    srai	a13,a13,5                	# [4]  a13 = count_x / 32 (signed division)
+    sub	a14,a14,a8                	# [5]  
+    addi	a14,a14,16               	# [6]  a14 = pitch - 2*count_x + 16: input pointer step used at the end of a row
+    addi.n	a13,a13,-1             	# [7]  inner trip count; the last group of 32 is handled by the tail below

+.Lt_0_33794:	# 0x10c  per-row loop
+    beqz.n	a13,.Lt_0_34050        	# [0]  

+    loopnez	a13,.LBB273_dspi_dotprod_off_s16_aes3 	# [0]  each iteration issues 4 MAC pairs (8 lanes each, i.e. 32 elements)

+.LBB271_dspi_dotprod_off_s16_aes3:	# 0x111  pairs: a .qup MAC loading the next input from a2, then a MAC against q6 (offset) loading the next filter data from a15
+    ee.vmulas.s16.accx.ld.ip.qup	q0,a2,16,q0,q1,q2,q3 	# [0*II+0]  id:851
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q1,q6 	# [0*II+1]  id:852
+    ee.vmulas.s16.accx.ld.ip.qup	q1,a2,16,q1,q2,q3,q0 	# [0*II+3]  id:853
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q2,q6 	# [0*II+4]  id:854
+    ee.vmulas.s16.accx.ld.ip.qup	q2,a2,16,q4,q3,q0,q1 	# [0*II+6]  id:855
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q3,q6 	# [0*II+7]  id:856
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q4,q0,q1,q2 	# [0*II+9]  id:857
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q0,q6 	# [0*II+10]  id:858

+.LBB273_dspi_dotprod_off_s16_aes3:	# 0x131

+.Lt_0_34050:	# 0x131  row tail: last group of 32, then move a2 to the next row
+    ee.vmulas.s16.accx.ld.ip.qup	q0,a2,16,q0,q1,q2,q3 	# [0]  id:859
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q1,q6 	# [1]  id:860
+    movi.n	a9,32                  	# [2]  
+    ee.vmulas.s16.accx.ld.xp.qup	q7,a2,a14,q1,q2,q3,q0 	# [3]  id:861  a2 += a14 (row step)
+    ee.vmulas.s16.accx.ld.ip	q5,a15,16,q2,q6 	# [4]  id:862
+    movi.n	a10,-16                	# [5]  
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a10,q5,q3,q0,q7 	# [6]  id:863  a2 -= 16
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q3,q6 	# [7]  id:865
+    ee.ld.128.usar.xp	q1,a2,a9    	# [8]  id:864  a2 += 32
+    addi.n	a12,a12,1              	# [9]  next row
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q4,q0,q1,q2 	# [10]  id:866
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q0,q6 	# [11]  id:867
+    bne	a12,a3,.Lt_0_33794        	# [12]  repeat until a12 == count_y
+
+.Lt_0_33026:	# 0x15d  finish: round, shift and store the accumulator
+.Lt_0_32770:	# 0x15d
+    rur.accx_0	a9                 	# [0]  a9 = accumulator, low 32 bits
+    rur.accx_1	a10                	# [1]  a10 = accumulator, upper part
+    blti	a7,1,.Lt_0_35586         	# [2]  shift < 1 -> store without rounding or shifting

+    movi.n	a2,0                   	# [0]  return value 0
+    addi	a13,a7,-33               	# [1]  a13 >= 0 exactly when shift >= 33
+    addi.n	a14,a7,-1              	# [2]  
+    ssr	a14                       	# [3]  shift amount register set from shift - 1
+    sra	a12,a10                   	# [4]  a12 = upper part shifted arithmetically
+    src	a11,a10,a9                	# [5]  a11 = funnel shift of a10:a9
+    movgez	a11,a12,a13            	# [6]  for shift >= 33 take the value derived from the upper part
+    addi.n	a11,a11,1              	# [7]  +1 then >>1 below: round to nearest
+    srai	a11,a11,1                	# [8]  
+    s16i	a11,a4,0                 	# [9]  id:873  *out_value = low 16 bits of the result
+    retw.n                        	# [10]  
+
+.Lt_0_37122:	# 0x183  ANSI fallback for inputs that fail the fast-path checks
+.Lt_0_20738:	# 0x183
+    mov.n	a10,a2                  	# [0]  in_image
+    mov.n	a11,a3                  	# [1]  filter
+    mov.n	a12,a4                  	# [2]  out_value
+    mov.n	a13,a5                  	# [3]  count_x
+    mov.n	a14,a6                  	# [4]  count_y
+    mov.n	a15,a7                  	# [5]  shift
+    l16si	a8,a1,128               	# [6]  id:768 offset+0x0  a8 = offset
+    s32i.n	a8,a1,0                	# [7]  id:876  store offset at the bottom of this frame for the callee
+    call8	dspi_dotprod_off_s16_ansi 	# [8]  dspi_dotprod_off_s16_ansi

+    mov.n	a2,a10                  	# [0]  propagate the callee's return value
+    retw.n                        	# [1]  
+
+.LBB27_dspi_dotprod_off_s16_aes3:	# 0x19b  count_x > 32: the vector path only has code for multiples of 32 (the 64 kernel and the generic 32-per-group loop)
+    extui	a9,a5,0,5               	# [0]  a9 = count_x & 31 (was a bit-0 test, always 0 here since count_x % 8 == 0, so 40/48/56/72... reached the vector path with no matching kernel)
+    beqz	a9,.Lt_0_37634           	# [1]  multiple of 32 -> vector path

+    mov.n	a10,a2                  	# [0]  otherwise fall back to the ANSI routine: in_image
+    mov.n	a11,a3                  	# [1]  filter
+    mov.n	a12,a4                  	# [2]  out_value
+    mov.n	a13,a5                  	# [3]  count_x
+    mov.n	a14,a6                  	# [4]  count_y
+    mov.n	a15,a7                  	# [5]  shift
+    l16si	a8,a1,128               	# [6]  id:768 offset+0x0  a8 = offset
+    s32i.n	a8,a1,0                	# [7]  id:877  store offset at the bottom of this frame for the callee
+    call8	dspi_dotprod_off_s16_ansi 	# [8]  dspi_dotprod_off_s16_ansi

+    mov.n	a2,a10                  	# [0]  propagate the callee's return value
+    retw.n                        	# [1]  
+
+.LBB34_dspi_dotprod_off_s16_aes3:	# 0x1b9  kernel for count_x == 24
+    movi.n	a10,32                 	# [0]  input pointer steps used inside the loop: +32 ...
+    movi.n	a11,-16                	# [1]  ... -16 ...
+    l32i	a12,a1,80                	# [2]  gra_spill_temp_0  ... and (row pitch in bytes - 32), see addi below
+    ee.ld.128.usar.ip	q0,a2,16    	# [3]  id:776  prime q0/q2/q3 with the first input vectors
+    ee.ld.128.usar.ip	q2,a2,16    	# [4]  id:777
+    addi	a12,a12,-32              	# [5]  
+    ee.src.q.ld.ip	q3,a2,16,q0,q2 	# [6]  id:778
+    loopgtz	a6,.LBB159_dspi_dotprod_off_s16_aes3 	# [7]  count_y iterations; each issues 3 MACs against q6 (offset) and 3 .qup MACs

+.LBB157_dspi_dotprod_off_s16_aes3:	# 0x1cf
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q0,q6 	# [0*II+0]  id:779
+    ee.vmulas.s16.accx.ld.xp.qup	q1,a2,a12,q1,q0,q2,q3 	# [0*II+2]  id:780
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q2,q6 	# [0*II+3]  id:781
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q0,q2,q3,q1 	# [0*II+5]  id:782
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q3,q6 	# [0*II+6]  id:784
+    ee.ld.128.usar.xp	q0,a2,a10   	# [0*II+7]  id:783
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q1,q3,q0,q2 	# [0*II+9]  id:785

+.LBB159_dspi_dotprod_off_s16_aes3:	# 0x1ea
+    j	.Lt_0_25602                 	# [0]  back to the dispatch chain; the later width tests do not match 24, so control reaches the finish code
+
+.LBB40_dspi_dotprod_off_s16_aes3:	# 0x1ed  kernel for count_x == 16
+    movi.n	a10,32                 	# [0]  input pointer steps: +32 ...
+    movi.n	a11,-16                	# [1]  ... -16 ...
+    srli	a3,a6,1                  	# [2]  a3 = count_y / 2 loop trips; NOTE(review): with an odd count_y the last row appears to be skipped -- confirm
+    l32i	a12,a1,80                	# [3]  gra_spill_temp_0  ... and (row pitch in bytes - 16), see addi below
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:787  prime q1/q2/q3 with the first input vectors
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:788
+    addi	a12,a12,-16              	# [7]  
+    ee.src.q.ld.xp	q3,a2,a12,q1,q2 	# [8]  id:789
+    loopnez	a3,.LBB182_dspi_dotprod_off_s16_aes3 	# [9]  each iteration issues 4 MAC pairs (32 elements, i.e. two rows of 16)

+.LBB180_dspi_dotprod_off_s16_aes3:	# 0x206
+    ee.vmulas.s16.accx.ld.xp.qup	q0,a2,a11,q0,q1,q2,q3 	# [0*II+0]  id:790
+    ee.vmulas.s16.accx.ld.ip	q3,a15,16,q1,q6 	# [0*II+1]  id:791
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+2]  id:792
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a12,q3,q2,q1,q0 	# [0*II+4]  id:793
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q2,q6 	# [0*II+5]  id:794
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q4,q1,q0,q3 	# [0*II+7]  id:795
+    ee.vmulas.s16.accx.ld.ip	q3,a15,16,q1,q6 	# [0*II+8]  id:796
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+9]  id:797
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a12,q3,q0,q1,q2 	# [0*II+11]  id:798
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q0,q6 	# [0*II+12]  id:799

+.LBB182_dspi_dotprod_off_s16_aes3:	# 0x22c
+    j	.Lt_0_27138                 	# [0]  back to the dispatch chain; no later width test matches 16
+
+.LBB46_dspi_dotprod_off_s16_aes3:	# 0x22f  kernel for count_x == 8
+    movi.n	a10,-16                	# [0]  input pointer steps: -16 ...
+    l32i	a11,a1,80                	# [1]  gra_spill_temp_0  ... and (row pitch in bytes + 16), see addi below
+    addi	a8,a2,16                 	# [2]  a8 = input data pointer + 16
+    addi	a11,a11,16               	# [3]  
+    ee.ld.128.usar.xp	q2,a8,a10   	# [4]  id:800  prime q1/q2/q3 with the first input vectors
+    ee.ld.128.usar.xp	q1,a8,a11   	# [5]  id:801
+    ee.src.q.ld.xp	q3,a8,a10,q1,q2 	# [7]  id:802
+    ee.ld.128.usar.xp	q2,a8,a11   	# [8]  id:803
+    srli	a3,a3,2                  	# [9]  a3 = count_y / 4 loop trips; NOTE(review): rows beyond a multiple of 4 appear to be skipped -- confirm
+    mov.n	a2,a8                   	# [10]  continue with a2 as the input pointer
+    loopnez	a3,.LBB205_dspi_dotprod_off_s16_aes3 	# [11]  each iteration issues 4 MAC pairs (32 elements, i.e. four rows of 8)

+.LBB203_dspi_dotprod_off_s16_aes3:	# 0x24e
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a10,q0,q1,q2,q3 	# [0*II+0]  id:804
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q1,q6 	# [0*II+1]  id:805
+    ee.ld.128.usar.xp	q1,a2,a11   	# [0*II+2]  id:806
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a10,q0,q2,q1,q3 	# [0*II+4]  id:807
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q2,q6 	# [0*II+5]  id:808
+    ee.ld.128.usar.xp	q4,a2,a11   	# [0*II+6]  id:809
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a10,q0,q1,q4,q3 	# [0*II+8]  id:810
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q1,q6 	# [0*II+9]  id:811
+    ee.ld.128.usar.xp	q1,a2,a11   	# [0*II+10]  id:812
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a10,q0,q4,q1,q3 	# [0*II+12]  id:813
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q4,q6 	# [0*II+13]  id:814
+    ee.ld.128.usar.xp	q2,a2,a11   	# [0*II+14]  id:815

+.LBB205_dspi_dotprod_off_s16_aes3:	# 0x27a
+    j	.Lt_0_28674                 	# [0]  back to the dispatch chain; no later width test matches 8
+
+.LBB52_dspi_dotprod_off_s16_aes3:	# 0x27d  kernel for count_x == 32
+    movi.n	a10,32                 	# [0]  input pointer steps: +32 ...
+    movi.n	a11,-16                	# [1]  ... -16 ...
+    slli	a13,a5,1                 	# [2]  a13 = count_x * 2 (row payload in bytes)
+    l32i	a12,a1,80                	# [3]  gra_spill_temp_0  a12 = input row pitch in bytes
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:816  prime q1/q2/q3 with the first input vectors
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:817
+    sub	a12,a12,a13               	# [6]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [8]  id:818
+    addi	a12,a12,16               	# [9]  a12 = pitch - 2*count_x + 16: step to the next row
+    loopnez	a3,.LBB228_dspi_dotprod_off_s16_aes3 	# [10]  a3 = count_y trips; each iteration issues 4 MAC pairs (one row of 32)

+.LBB226_dspi_dotprod_off_s16_aes3:	# 0x299
+    ee.vmulas.s16.accx.ld.ip.qup	q0,a2,16,q0,q1,q2,q3 	# [0*II+0]  id:819
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q1,q6 	# [0*II+1]  id:820
+    ee.vmulas.s16.accx.ld.xp.qup	q4,a2,a12,q4,q2,q3,q0 	# [0*II+3]  id:821
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q2,q6 	# [0*II+4]  id:822
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q1,q3,q0,q4 	# [0*II+6]  id:823
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q3,q6 	# [0*II+7]  id:825
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+8]  id:824
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q4,q0,q1,q2 	# [0*II+10]  id:826
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q0,q6 	# [0*II+11]  id:827

+.LBB228_dspi_dotprod_off_s16_aes3:	# 0x2bc
+    j	.Lt_0_30210                 	# [0]  back to the dispatch chain; the 64 test does not match, so control reaches the finish code
+
+.LBB58_dspi_dotprod_off_s16_aes3:	# 0x2bf  kernel for count_x == 64
+    movi.n	a10,32                 	# [0]  input pointer steps: +32 ...
+    movi.n	a11,-16                	# [1]  ... -16 ...
+    slli	a13,a5,1                 	# [2]  a13 = count_x * 2 (row payload in bytes)
+    l32i	a12,a1,80                	# [3]  gra_spill_temp_0  a12 = input row pitch in bytes
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:828  prime q1/q2/q3 with the first input vectors
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:829
+    sub	a12,a12,a13               	# [7]  
+    addi	a12,a12,16               	# [8]  a12 = pitch - 2*count_x + 16: step to the next row
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [9]  id:830
+    mov.n	a8,a2                   	# [10]  a8 = input pointer used inside the loop
+    loopnez	a3,.LBB250_dspi_dotprod_off_s16_aes3 	# [11]  a3 = count_y trips; each iteration issues 8 MAC pairs (one row of 64)

+.LBB248_dspi_dotprod_off_s16_aes3:	# 0x2dd
+    ee.vmulas.s16.accx.ld.ip.qup	q0,a8,16,q0,q1,q2,q3 	# [0*II+0]  id:831
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q1,q6 	# [0*II+1]  id:832
+    ee.vmulas.s16.accx.ld.ip.qup	q4,a8,16,q4,q2,q3,q0 	# [0*II+3]  id:833
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q2,q6 	# [0*II+4]  id:834
+    ee.vmulas.s16.accx.ld.ip.qup	q1,a8,16,q1,q3,q0,q4 	# [0*II+6]  id:835
+    ee.vmulas.s16.accx.ld.ip	q5,a15,16,q3,q6 	# [0*II+7]  id:836
+    ee.vmulas.s16.accx.ld.ip.qup	q5,a8,16,q5,q0,q4,q1 	# [0*II+9]  id:837
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q0,q6 	# [0*II+10]  id:838
+    ee.vmulas.s16.accx.ld.ip.qup	q0,a8,16,q0,q4,q1,q5 	# [0*II+12]  id:839
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q4,q6 	# [0*II+13]  id:840
+    ee.vmulas.s16.accx.ld.xp.qup	q4,a8,a12,q4,q1,q5,q0 	# [0*II+15]  id:841
+    ee.vmulas.s16.accx.ld.ip	q1,a15,16,q1,q6 	# [0*II+16]  id:842
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a8,a11,q1,q5,q0,q4 	# [0*II+18]  id:843
+    ee.vmulas.s16.accx.ld.ip	q4,a15,16,q5,q6 	# [0*II+19]  id:845
+    ee.ld.128.usar.xp	q1,a8,a10   	# [0*II+20]  id:844
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a8,16,q4,q0,q1,q2 	# [0*II+22]  id:846
+    ee.vmulas.s16.accx.ld.ip	q0,a15,16,q0,q6 	# [0*II+23]  id:847

+.LBB250_dspi_dotprod_off_s16_aes3:	# 0x320
+    j	.Lt_0_33026                 	# [0]  straight to the finish code
+
+.Lt_0_35586:	# 0x323  shift < 1: store the accumulator without rounding or shifting
+    movi.n	a2,0                   	# [0]  return value 0
+    sext	a14,a9,15                	# [1]  sign-extend the low 16 bits of the accumulator (a9 = accx low word)
+    s16i	a14,a4,0                 	# [2]  id:874  *out_value = low 16 bits
+    retw.n                        	# [3]  
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s16_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+/**
+ * Reference (plain C) 2D dot product with offset for int16 data.
+ *
+ * Computes sum over y < count_y, x < count_x of
+ *     in_image[y][x] * (filter[y][x] + offset)
+ * in a 64-bit accumulator, rounds, shifts right by `shift` and stores the
+ * result (narrowed to int16_t) in *out_value.
+ *
+ * @param in_image   input image descriptor (data, step_x/y, stride_x/y)
+ * @param filter     filter image descriptor
+ * @param out_value  receives the result
+ * @param count_x    number of elements per row
+ * @param count_y    number of rows
+ * @param shift      right shift applied to the accumulator; when shift <= 0
+ *                   the accumulator is stored without rounding or shifting
+ * @param offset     constant added to every filter element
+ *
+ * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when
+ *         step * count exceeds the matching stride of either image.
+ */
+esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset)
+{
+    if (in_image->step_x * count_x > in_image->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (in_image->step_y * count_y > in_image->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_x * count_x > filter->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_y * count_y > filter->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    int16_t *i_data =  (int16_t *)in_image->data;
+    int16_t *f_data =  (int16_t *)filter->data;
+    // Distance between consecutive processed rows, in int16 elements.
+    int i_step = in_image->stride_x * in_image->step_y;
+    int f_step = filter->stride_x * filter->step_y;
+
+    int64_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int32_t)i_data[in_image->step_x * x] * ((int32_t)f_data[filter->step_x * x] + (int32_t)offset);
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // Round and shift only for a positive shift: `1 << (shift - 1)` has a
+    // negative shift count when shift == 0, which is undefined behaviour.
+    // The rounding term is built in 64 bits so that shift >= 32 does not
+    // overflow a 32-bit int.
+    if (shift > 0) {
+        acc += (int64_t)1 << (shift - 1);    // round operation
+        acc >>= shift;
+    }
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s16_arp4.S
@@ -0,0 +1,104 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_off_s16_arp4
+    .global dspi_dotprod_off_s16_ansi
+    .type   dspi_dotprod_off_s16_arp4,@function
+
+// esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
+dspi_dotprod_off_s16_arp4: // vector path for unit steps and count_x % 8 == 0, otherwise tail-jumps to the ANSI version
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// offset       - a6
+
+// i_data       - t0 (start of the current input row)
+// f_data       - t1 (start of the current filter row)
+// i_step       - t2 (bytes between input rows)
+// f_step       - t3 (bytes between filter rows)
+// current i_data - t4
+// current f_data - t5
+// t6 - count_x / 8 (hardware-loop trip count); q2 - offset vector
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2 // combine both steps
+    addi t1, t1, -1 // zero only when (in_image->step_x | filter->step_x) == 1
+    andi t2, a3, 7 // count_x % 8
+    or   t1, t1, t2 // zero only when both conditions hold
+    
+    beqz    t1, .dspi_dotprod_off_s16_arp4_body // eligible -> vector path
+    j 	dspi_dotprod_off_s16_ansi // otherwise tail-jump to the C version with all arguments untouched
+
+.dspi_dotprod_off_s16_arp4_body:
+    add	sp, sp, -16 // reserve 16 bytes of stack for the offset value
+
+    sw  a6, 0(sp) // spill offset to memory so it can be loaded into a vector register
+    mv  t6, sp
+    esp.vldbc.16.ip	  q2, t6, 0 // q2 = offset in every 16-bit lane (presumably a broadcast load -- confirm)
+
+    lw	t0, 0(a0)       // i_data
+    lw	t1, 0(a1)       // f_data
+
+
+    lw 	t2, 8(a0)       // step_y
+    lw	t4, 12(a0)      // stride_x
+    mul	t2, t4, t2      // i_step = stride_x * step_y (elements)
+    slli t2, t2, 1      // i_step in bytes (2 bytes per int16)
+
+    lw 	t3, 8(a1)       // step_y
+    lw	t5, 12(a1)      // stride_x
+    mul	t3, t5, t3      // f_step = stride_x * step_y (elements)
+    slli t3, t3, 1      // f_step in bytes (2 bytes per int16)
+
+    srli t6, a3, 3      // t6 = count_x / 8 (vectors per row)
+    
+// NOTE(review): for shift == 0, a7 becomes -1 and the rounding constant is 1 << 31 rather than 0 -- confirm callers always pass shift >= 1
+    addi    a7, a5, -1 // a7 = shift - 1
+    li      t4, 1
+    sll     t4, t4, a7 // t4 = 1 << (shift - 1), the rounding constant
+    esp.zero.xacc // clear the accumulator
+    esp.movx.w.xacc.l   t4 // preload the low word of the accumulator with the rounding constant
+
+.loop_count_y:
+        mv      t4, t0 // t4 = input row pointer
+        mv      t5, t1 // t5 = filter row pointer
+        esp.vld.128.ip      q1, t5, 16          // q1 = first 8 filter values of the row (t5 walks f_data)
+
+        esp.lp.setup    0, t6, .loop_count_x // hardware loop, t6 trips, ending at .loop_count_x
+            esp.vld.128.ip          q0, t4, 16  // q0 = next 8 input values (t4 walks i_data)
+            esp.vadd.s16            q3, q2, q1 // q3 = filter + offset per lane (16-bit add; NOTE(review): the C version adds in 32 bits -- confirm overflow behaviour)
+.loop_count_x: 	esp.vmulas.s16.xacc.ld.ip   q1, t5, 16, q0, q3  // xacc += q0 * q3 (presumably lane-wise MAC); q1 = next 8 filter values, so one vector past the row end is also read
+
+        add     t0, t0, t2 // next input row
+        add     t1, t1, t3 // next filter row
+        add     a4,a4, -1 // one row done
+    bgtz    a4, .loop_count_y // loop while rows remain; the body always runs at least once
+
+    esp.srs.s.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sh  t5, 0(a2)               // store result to output buffer 
+
+    li  a0,0 // return 0
+    add sp,sp,16 // release the stack slot
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s8_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s8_aes3.S
@@ -0,0 +1,408 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_57, 458755
+
+    # Program Unit: dspi_dotprod_off_s8_aes3
+    .type	dspi_dotprod_off_s8_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_off_s8_aes3
+dspi_dotprod_off_s8_aes3:	# 0x4
+.LBB1_dspi_dotprod_off_s8_aes3:	# 0x4
+    entry	a1,112                  	#  
+    l32i.n	a10,a2,4               	# [0]  id:745
+    l32i.n	a12,a2,12              	# [1]  id:744
+    mull	a8,a10,a5                	# [2]  
+    blt	a12,a8,.LBB86_dspi_dotprod_off_s8_aes3 	# [4]  
+
+    l32i.n	a13,a2,8               	# [0]  id:746
+    l32i.n	a9,a2,16               	# [1]  id:747
+    mull	a11,a13,a6               	# [2]  
+    blt	a9,a11,.LBB86_dspi_dotprod_off_s8_aes3 	# [4]  
+
+    l32i.n	a15,a3,4               	# [0]  id:749
+    l32i.n	a14,a3,12              	# [1]  id:748
+    mull	a11,a15,a5               	# [2]  
+    blt	a14,a11,.LBB86_dspi_dotprod_off_s8_aes3 	# [4]  
+
+    l32i.n	a8,a3,16               	# [0]  id:751
+    l32i.n	a9,a3,8                	# [1]  id:750
+    s32i	a9,a1,72                 	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  
+    blt	a8,a9,.LBB86_dspi_dotprod_off_s8_aes3 	# [5]  
+
+    l32i.n	a8,a3,0                	# [0]  id:752
+    s32i	a8,a1,68                 	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_35330         	# [2]  
+
+    bne	a14,a11,.Lt_0_35330       	# [0]  
+
+    bnei	a15,1,.Lt_0_35330        	# [0]  
+
+    l32i	a11,a1,72                	# [0]  gra_spill_temp_2
+    beqi	a11,1,.Lt_0_18946        	# [2]  
+
+.Lt_0_35330:	# 0x46
+.Lt_0_19202:	# 0x46
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    .type	dspi_dotprod_s8_ansi, @function
+    call8	dspi_dotprod_s8_ansi    	# [6]  dspi_dotprod_s8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+.LBB86_dspi_dotprod_off_s8_aes3:	# 0x59
+    l32r	a2,.LC0_1_57             	# [0]  
+    retw.n                        	# [1]  
+
+.Lt_0_18946:	# 0x5e
+    addi.n	a14,a10,-1             	# [0]  
+    bnez	a14,.Lt_0_36098          	# [1]  
+
+    addi.n	a15,a13,-1             	# [0]  
+    bnez	a15,.Lt_0_36098          	# [1]  
+
+    extui	a8,a5,0,4               	# [0]  
+    bnez.n	a8,.Lt_0_36098         	# [1]  
+
+    blti	a6,4,.Lt_0_36098         	# [0]  
+
+    movi.n	a9,64                  	# [0]  
+    blt	a9,a5,.LBB27_dspi_dotprod_off_s8_aes3 	# [1]  
+
+.Lt_0_36610:	# 0x75
+.Lt_0_20994:	# 0x75
+    mov.n	a8,a1                   	# [0]  
+    l8ui	a9,a1,112                	# [1]  id:754 offset+0x0
+    l32i.n	a15,a2,0               	# [2]  id:753
+    mull	a10,a12,a13              	# [3]  
+    l32i	a2,a1,68                 	# [4]  gra_spill_temp_1
+    s32i	a10,a1,64                	# [5]  gra_spill_temp_0
+    sext	a9,a9,7                  	# [6]  
+    movi.n	a10,4                  	# [7]  
+    # loop-count fixed at 4
+    loop	a10,.LBB140_dspi_dotprod_off_s8_aes3 	# [8]  
+
+.LBB135_dspi_dotprod_off_s8_aes3:	# 0x8d
+    s8i	a9,a8,0                   	# [0*II+0]  id:755 temp_offset+0x0
+    s8i	a9,a8,1                   	# [0*II+1]  id:755 temp_offset+0x0
+    s8i	a9,a8,2                   	# [0*II+2]  id:755 temp_offset+0x0
+    s8i	a9,a8,3                   	# [0*II+3]  id:755 temp_offset+0x0
+    s8i	a9,a8,4                   	# [0*II+4]  id:755 temp_offset+0x0
+    s8i	a9,a8,5                   	# [0*II+5]  id:755 temp_offset+0x0
+    s8i	a9,a8,6                   	# [0*II+6]  id:755 temp_offset+0x0
+    s8i	a9,a8,7                   	# [0*II+7]  id:755 temp_offset+0x0
+    addi.n	a8,a8,8                	# [0*II+8]  
+
+.LBB140_dspi_dotprod_off_s8_aes3:	# 0xa7
+    mov.n	a3,a6                   	# [0]  
+    addi	a11,a5,-48               	# [1]  
+
+    addi.n	a12,a1,8               	# [3]  temp_offset+8
+    movi.n	a13,0                  	# [4]  
+    wur.accx_0	a13                	# [5]  
+    wur.accx_1	a13                	# [6]  
+    ee.vld.128.ip	q6,a12,0        	# [7]  id:756
+    s32i.n	a12,a1,32              	# [8]  offset_data_ptr
+    beqz	a11,.LBB34_dspi_dotprod_off_s8_aes3 	# [9]  
+
+    l32i	a2,a1,68                 	# [0]  gra_spill_temp_1
+    ee.vld.128.ip	q0,a2,16        	# [2]  id:771
+    st.qr	q0,a1,48                	# [3]  q0
+
+.Lt_0_24578:	# 0xc6
+    addi	a14,a5,-32               	# [0]  
+    beqz	a14,.LBB43_dspi_dotprod_off_s8_aes3 	# [1]  
+
+.Lt_0_26626:	# 0xcc
+.Lt_0_26114:	# 0xcc
+    addi	a8,a5,-16                	# [0]  
+    beqz	a8,.LBB50_dspi_dotprod_off_s8_aes3 	# [1]  
+
+.Lt_0_28162:	# 0xd2
+.Lt_0_27650:	# 0xd2
+    addi	a9,a5,-64                	# [0]  
+    beqz	a9,.LBB57_dspi_dotprod_off_s8_aes3 	# [1]  
+
+.Lt_0_29698:	# 0xd8
+.Lt_0_29186:	# 0xd8
+    addi	a10,a5,-128              	# [0]  
+    beqz	a10,.LBB64_dspi_dotprod_off_s8_aes3 	# [1]  
+
+    movi	a11,128                  	# [0]  
+    bge	a11,a5,.Lt_0_32514        	# [1]  
+
+    movi.n	a12,0                  	# [0]  
+    ee.ld.128.usar.ip	q1,a15,16   	# [1]  id:833
+    ee.ld.128.usar.ip	q2,a15,16   	# [2]  id:834
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [4]  id:835
+    beqz.n	a3,.Lt_0_32514         	# [5]  
+
+    ld.qr	q0,a1,48                	# [0]  q0
+    l32i	a14,a1,64                	# [1]  gra_spill_temp_0
+    addi	a13,a5,31                	# [2]  
+    movgez	a13,a5,a5              	# [3]  
+    srai	a13,a13,5                	# [4]  
+    sub	a14,a14,a5                	# [5]  
+    addi	a14,a14,16               	# [6]  
+    addi.n	a13,a13,-1             	# [7]  
+
+.Lt_0_33282:	# 0x108
+    beqz.n	a13,.Lt_0_33538        	# [0]  
+
+    loopnez	a13,.LBB277_dspi_dotprod_off_s8_aes3 	# [0]  
+
+.LBB275_dspi_dotprod_off_s8_aes3:	# 0x10d
+    ee.vmulas.s8.accx.ld.ip.qup	q0,a15,16,q0,q1,q2,q3 	# [0*II+0]  id:836
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q1,q6 	# [0*II+1]  id:837
+    ee.vmulas.s8.accx.ld.ip.qup	q1,a15,16,q1,q2,q3,q0 	# [0*II+3]  id:838
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q2,q6 	# [0*II+4]  id:839
+    ee.vmulas.s8.accx.ld.ip.qup	q2,a15,16,q4,q3,q0,q1 	# [0*II+6]  id:840
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q3,q6 	# [0*II+7]  id:841
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a15,16,q4,q0,q1,q2 	# [0*II+9]  id:842
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+10]  id:843
+
+.LBB277_dspi_dotprod_off_s8_aes3:	# 0x12d
+
+.Lt_0_33538:	# 0x12d
+    ee.vmulas.s8.accx.ld.ip.qup	q4,a15,16,q0,q1,q2,q3 	# [0]  id:844
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q1,q6 	# [1]  id:845
+    movi.n	a8,32                  	# [2]  
+    ee.vmulas.s8.accx.ld.xp.qup	q0,a15,a14,q1,q2,q3,q4 	# [3]  id:846
+    ee.vmulas.s8.accx.ld.ip	q7,a2,16,q2,q6 	# [4]  id:847
+    movi.n	a9,-16                 	# [5]  
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a15,a9,q7,q3,q4,q0 	# [6]  id:848
+    ee.vmulas.s8.accx.ld.ip	q5,a2,16,q3,q6 	# [7]  id:850
+    ee.ld.128.usar.xp	q1,a15,a8   	# [8]  id:849
+    addi.n	a12,a12,1              	# [9]  
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a15,16,q5,q4,q1,q2 	# [10]  id:851
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q4,q6 	# [11]  id:852
+    bne	a12,a3,.Lt_0_33282        	# [12]  
+
+.Lt_0_32514:	# 0x159
+.Lt_0_32258:	# 0x159
+    movi.n	a2,0                   	# [0]  
+    rur.accx_0	a10                	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    movi.n	a11,1                  	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    sra	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:854
+    retw.n                        	# [10]  
+
+.Lt_0_36098:	# 0x175
+.Lt_0_20226:	# 0x175
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_s8_ansi    	# [6]  dspi_dotprod_s8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+.LBB27_dspi_dotprod_off_s8_aes3:	# 0x188
+    extui	a14,a5,0,1              	# [0]  
+    beqz	a14,.Lt_0_36610          	# [1]  
+
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_s8_ansi    	# [6]  dspi_dotprod_s8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+.LBB34_dspi_dotprod_off_s8_aes3:	# 0x1a1
+    ee.ld.128.usar.ip	q0,a15,16   	# [0]  id:760
+    ee.ld.128.usar.ip	q2,a15,16   	# [1]  id:761
+    ee.src.q.ld.ip	q3,a15,16,q0,q2 	# [3]  id:762
+    beqz.n	a6,.Lt_0_24578         	# [4]  
+
+    movi.n	a10,32                 	# [0]  
+    l32i	a12,a1,64                	# [1]  gra_spill_temp_0
+    movi.n	a11,-16                	# [2]  
+    addi	a12,a12,-32              	# [3]  
+    loopgtz	a6,.LBB163_dspi_dotprod_off_s8_aes3 	# [4]  
+
+.LBB161_dspi_dotprod_off_s8_aes3:	# 0x1b9
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q0,q6 	# [0*II+0]  id:763
+    ee.vmulas.s8.accx.ld.xp.qup	q1,a15,a12,q1,q0,q2,q3 	# [0*II+2]  id:764
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q2,q6 	# [0*II+3]  id:765
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a15,a11,q0,q2,q3,q1 	# [0*II+5]  id:766
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q3,q6 	# [0*II+6]  id:768
+    ee.ld.128.usar.xp	q0,a15,a10  	# [0*II+7]  id:767
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a15,16,q1,q3,q0,q2 	# [0*II+9]  id:769
+
+.LBB163_dspi_dotprod_off_s8_aes3:	# 0x1d4
+    st.qr	q1,a1,48                	# [0]  q0
+    j	.Lt_0_24578                 	# [1]  
+
+.LBB43_dspi_dotprod_off_s8_aes3:	# 0x1da
+    srli	a3,a6,1                  	# [0]  
+    l32i	a12,a1,64                	# [1]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a15,16   	# [2]  id:772
+    ee.ld.128.usar.ip	q2,a15,16   	# [3]  id:773
+    addi	a12,a12,-16              	# [5]  
+    ee.src.q.ld.xp	q3,a15,a12,q1,q2 	# [6]  id:774
+    beqz.n	a3,.Lt_0_26626         	# [7]  
+
+    ld.qr	q0,a1,48                	# [0]  q0
+    movi.n	a10,32                 	# [1]  
+    movi.n	a11,-16                	# [2]  
+    loopnez	a3,.LBB186_dspi_dotprod_off_s8_aes3 	# [3]  
+
+.LBB184_dspi_dotprod_off_s8_aes3:	# 0x1f8
+    ee.vmulas.s8.accx.ld.xp.qup	q0,a15,a11,q0,q1,q2,q3 	# [0*II+0]  id:775
+    ee.vmulas.s8.accx.ld.ip	q3,a2,16,q1,q6 	# [0*II+1]  id:776
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+2]  id:777
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a15,a12,q3,q2,q1,q0 	# [0*II+4]  id:778
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q2,q6 	# [0*II+5]  id:779
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a15,a11,q4,q1,q0,q3 	# [0*II+7]  id:780
+    ee.vmulas.s8.accx.ld.ip	q3,a2,16,q1,q6 	# [0*II+8]  id:781
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+9]  id:782
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a15,a12,q3,q0,q1,q2 	# [0*II+11]  id:783
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+12]  id:784
+
+.LBB186_dspi_dotprod_off_s8_aes3:	# 0x21e
+    st.qr	q0,a1,48                	# [0]  q0
+    j	.Lt_0_26626                 	# [1]  
+
+.LBB50_dspi_dotprod_off_s8_aes3:	# 0x224
+    srli	a3,a3,2                  	# [0]  
+    movi.n	a13,-16                	# [1]  
+    l32i	a11,a1,64                	# [2]  gra_spill_temp_0
+    addi	a15,a15,16               	# [3]  
+    addi	a11,a11,16               	# [4]  
+    ee.ld.128.usar.xp	q2,a15,a13  	# [5]  id:785
+    ee.ld.128.usar.xp	q1,a15,a11  	# [6]  id:786
+    ee.src.q.ld.xp	q3,a15,a13,q1,q2 	# [8]  id:787
+    ee.ld.128.usar.xp	q2,a15,a11  	# [9]  id:788
+    beqz.n	a3,.Lt_0_28162         	# [10]  
+
+    ld.qr	q0,a1,48                	# [0]  q0
+    movi.n	a10,-16                	# [1]  
+    loopnez	a3,.LBB209_dspi_dotprod_off_s8_aes3 	# [2]  
+
+.LBB207_dspi_dotprod_off_s8_aes3:	# 0x248
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a15,a10,q0,q1,q2,q3 	# [0*II+0]  id:789
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q1,q6 	# [0*II+1]  id:790
+    ee.ld.128.usar.xp	q1,a15,a11  	# [0*II+2]  id:791
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a15,a10,q0,q2,q1,q3 	# [0*II+4]  id:792
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q2,q6 	# [0*II+5]  id:793
+    ee.ld.128.usar.xp	q4,a15,a11  	# [0*II+6]  id:794
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a15,a10,q0,q1,q4,q3 	# [0*II+8]  id:795
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q1,q6 	# [0*II+9]  id:796
+    ee.ld.128.usar.xp	q1,a15,a11  	# [0*II+10]  id:797
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a15,a10,q0,q4,q1,q3 	# [0*II+12]  id:798
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q4,q6 	# [0*II+13]  id:799
+    ee.ld.128.usar.xp	q2,a15,a11  	# [0*II+14]  id:800
+
+.LBB209_dspi_dotprod_off_s8_aes3:	# 0x274
+    st.qr	q0,a1,48                	# [0]  q0
+    j	.Lt_0_28162                 	# [1]  
+
+.LBB57_dspi_dotprod_off_s8_aes3:	# 0x27a
+    ee.ld.128.usar.ip	q1,a15,16   	# [0]  id:801
+    ee.ld.128.usar.ip	q2,a15,16   	# [1]  id:802
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [3]  id:803
+    beqz.n	a3,.Lt_0_29698         	# [4]  
+
+    ld.qr	q0,a1,48                	# [0]  q0
+    movi.n	a10,32                 	# [1]  
+    l32i	a12,a1,64                	# [2]  gra_spill_temp_0
+    movi.n	a11,-16                	# [3]  
+    sub	a12,a12,a5                	# [4]  
+    addi	a12,a12,16               	# [5]  
+    loopnez	a3,.LBB232_dspi_dotprod_off_s8_aes3 	# [6]  
+
+.LBB230_dspi_dotprod_off_s8_aes3:	# 0x298
+    ee.vmulas.s8.accx.ld.ip.qup	q0,a15,16,q0,q1,q2,q3 	# [0*II+0]  id:804
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q1,q6 	# [0*II+1]  id:805
+    ee.vmulas.s8.accx.ld.xp.qup	q4,a15,a12,q4,q2,q3,q0 	# [0*II+3]  id:806
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q2,q6 	# [0*II+4]  id:807
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a15,a11,q1,q3,q0,q4 	# [0*II+6]  id:808
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q3,q6 	# [0*II+7]  id:809
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+8]  id:810
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a15,16,q4,q0,q1,q2 	# [0*II+10]  id:811
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+11]  id:812
+
+.LBB232_dspi_dotprod_off_s8_aes3:	# 0x2bb
+    st.qr	q0,a1,48                	# [0]  q0
+    j	.Lt_0_29698                 	# [1]  
+
+.LBB64_dspi_dotprod_off_s8_aes3:	# 0x2c1
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    l32i	a12,a1,64                	# [2]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a15,16   	# [3]  id:813
+    ee.ld.128.usar.ip	q2,a15,16   	# [4]  id:814
+    sub	a12,a12,a5                	# [6]  
+    addi	a12,a12,16               	# [7]  
+    ld.qr	q0,a1,48                	# [8]  q0
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [9]  id:815
+    mov.n	a8,a15                  	# [10]  
+    loopnez	a3,.LBB254_dspi_dotprod_off_s8_aes3 	# [11]  
+
+.LBB252_dspi_dotprod_off_s8_aes3:	# 0x2df
+    ee.vmulas.s8.accx.ld.ip.qup	q0,a8,16,q0,q1,q2,q3 	# [0*II+0]  id:816
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q1,q6 	# [0*II+1]  id:817
+    ee.vmulas.s8.accx.ld.ip.qup	q4,a8,16,q4,q2,q3,q0 	# [0*II+3]  id:818
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q2,q6 	# [0*II+4]  id:819
+    ee.vmulas.s8.accx.ld.ip.qup	q1,a8,16,q1,q3,q0,q4 	# [0*II+6]  id:820
+    ee.vmulas.s8.accx.ld.ip	q5,a2,16,q3,q6 	# [0*II+7]  id:821
+    ee.vmulas.s8.accx.ld.ip.qup	q5,a8,16,q5,q0,q4,q1 	# [0*II+9]  id:822
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+10]  id:823
+    ee.vmulas.s8.accx.ld.ip.qup	q0,a8,16,q0,q4,q1,q5 	# [0*II+12]  id:824
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q4,q6 	# [0*II+13]  id:825
+    ee.vmulas.s8.accx.ld.xp.qup	q4,a8,a12,q4,q1,q5,q0 	# [0*II+15]  id:826
+    ee.vmulas.s8.accx.ld.ip	q1,a2,16,q1,q6 	# [0*II+16]  id:827
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a8,a11,q1,q5,q0,q4 	# [0*II+18]  id:828
+    ee.vmulas.s8.accx.ld.ip	q4,a2,16,q5,q6 	# [0*II+19]  id:829
+    ee.ld.128.usar.xp	q1,a8,a10   	# [0*II+20]  id:830
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a8,16,q4,q0,q1,q2 	# [0*II+22]  id:831
+    ee.vmulas.s8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+23]  id:832
+
+.LBB254_dspi_dotprod_off_s8_aes3:	# 0x322
+    movi.n	a2,0                   	# [0]  
+    movi.n	a11,1                  	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    rur.accx_0	a10                	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    sra	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:854
+    retw.n                        	# [10]  
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s8_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+/**
+ * Reference (plain C) 2D dot product with offset for signed 8-bit data.
+ *
+ * Computes sum over y in [0, count_y), x in [0, count_x) of
+ *     in_image[y][x] * (filter[y][x] + offset)
+ * then rounds to nearest, scales down by 2^shift and stores the result,
+ * truncated to 8 bits, in *out_value.
+ *
+ * @param in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y are read)
+ * @param filter     filter image descriptor (same fields are read)
+ * @param out_value  destination for the single 8-bit result
+ * @param count_x    number of elements processed per row
+ * @param count_y    number of rows processed
+ * @param shift      right shift applied to the accumulator; values <= 0 store the
+ *                   accumulator without rounding or scaling
+ * @param offset     constant added to every filter element before multiplication
+ *
+ * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
+ *         exceeds the matching stride of either image.
+ */
+esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset)
+{
+    if (in_image->step_x * count_x > in_image->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (in_image->step_y * count_y > in_image->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_x * count_x > filter->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_y * count_y > filter->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    int8_t *i_data =  (int8_t *)in_image->data;
+    int8_t *f_data =  (int8_t *)filter->data;
+    // Distance between two processed rows, in elements (one element is one byte here).
+    int i_step = in_image->stride_x * in_image->step_y;
+    int f_step = filter->stride_x * filter->step_y;
+
+    int32_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // Round to nearest and scale. The rounding term is only defined for shift > 0
+    // (1 << -1 is undefined behaviour), and it is formed in 64 bits so that neither
+    // the constant nor the addition can overflow a 32-bit int.
+    if (shift > 0) {
+        int64_t rounded = (int64_t)acc + ((int64_t)1 << (shift - 1));
+        acc = (int32_t)(rounded >> shift);
+    }
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s8_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_s8_arp4.S
@@ -0,0 +1,102 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_off_s8_arp4
+    .global dspi_dotprod_off_s8_ansi
+    .type   dspi_dotprod_off_s8_arp4,@function
+
+// esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
+//
+// Vectorised 2D dot product with offset for signed 8-bit data:
+//     *out_value = (sum(in_image[y][x] * (filter[y][x] + offset)) + (1 << (shift - 1))) >> shift
+// The vector path is taken only when both step_x values are 1 and count_x is a
+// multiple of 16; every other shape is tail-forwarded, arguments untouched, to
+// dspi_dotprod_off_s8_ansi.
+// NOTE(review): unlike the ANSI version, the vector path performs no
+// step * count <= stride range checks - confirm callers validate their shapes.
+dspi_dotprod_off_s8_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// offset       - a6
+
+// i_data       - t0
+// f_data       - t1
+// i_step       - t2
+// f_step       - t3
+// t4           - current i_data
+// t5           - current f_data
+// t6           - count_x / 16 (vectors per row)
+
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // should be 0 now
+    andi t2, a3, 15 // count_x % 16, must be 0 for the vector path
+    or   t1, t1, t2
+    
+    beqz    t1, .dspi_dotprod_off_s8_arp4_body
+    j   dspi_dotprod_off_s8_ansi    // tail call: a0..a6 still hold the original arguments
+
+.dspi_dotprod_off_s8_arp4_body:
+    add sp, sp, -16
+
+    // Spill offset to the stack so its low byte can be broadcast into q2.
+    sw  a6, 0(sp)
+    mv  t6, sp
+    esp.vldbc.8.ip    q2, t6, 0 
+
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2  // i_step = stride_x * step_y
+
+    lw  t3, 8(a1)       // step_y
+    lw  t5, 12(a1)      // stride_x
+    mul t3, t5, t3      // f_step = stride_x * step_y
+
+    srli t6, a3, 4      // t6 = len/16
+    
+
+    // Preload the accumulator with the rounding term 1 << (shift - 1).
+    addi    a7, a5, -1
+    li      t4, 1
+    sll     t4, t4, a7
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4
+
+.loop_count_y:
+        mv      t4, t0
+        mv      t5, t1
+        esp.vld.128.ip                  q1, t5, 16  // q1 - f_data (first vector of the row)
+
+        esp.lp.setup    0, t6, .loop_count_x
+            esp.vld.128.ip          q0, t4, 16      // q0 - i_data
+            esp.vadd.s8             q3, q2, q1      // q3 = offset + f_data
+            // Accumulate q0 * q3 and fetch the next f_data vector. The fetch in the
+            // last pass reads the 16 bytes after the row; that value is never used.
+.loop_count_x:          esp.vmulas.s8.xacc.ld.ip    q1, t5, 16, q0, q3  // q1 - next f_data
+
+        add     t0, t0, t2
+        add     t1, t1, t3
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.s.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    // out_value is int8_t *: store exactly one byte. A halfword store here would
+    // overwrite the byte that follows the destination.
+    sb  t5, 0(a2)               // store result to output buffer 
+
+    li  a0,0
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u16_aes3.S
@@ -0,0 +1,417 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_61, 458755
+
+    # Program Unit: dspi_dotprod_off_u16_aes3
+    //
+    // esp_err_t dspi_dotprod_off_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value,
+    //                                     int count_x, int count_y, int shift, uint16_t offset);
+    //
+    // Vectorised 2D dot product with offset for unsigned 16-bit data. Takes the same
+    // argument list as dspi_dotprod_off_u16_ansi and forwards it unchanged whenever
+    // the shape is not covered by one of the kernels below.
+    //
+    // Registers after entry (windowed call):
+    //   a2 in_image, a3 filter, a4 out_value, a5 count_x, a6 count_y, a7 shift,
+    //   offset is the 7th argument and is read from the caller frame at a1+144.
+    // Frame slots: a1+16..47 offset replicated 16 times, a1+64 spilled q0,
+    //   a1+96 input row pitch in bytes, a1+100 filter->data, a1+104 filter->step_y.
+    //
+    // Fast path requirements (anything else goes to the ANSI version):
+    //   filter->data even, filter rows contiguous (stride_x == step_x * count_x),
+    //   all four steps equal to 1, count_y >= 4, and count_x one of 8 (count_y % 4 == 0),
+    //   16 (count_y even), 24, 32, or a multiple of 32 above 32.
+    .type	dspi_dotprod_off_u16_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_off_u16_aes3
+dspi_dotprod_off_u16_aes3:	# 0x4
+.LBB1_dspi_dotprod_off_u16_aes3:	# 0x4
+    entry	a1,144                  	#  
+    // Range checks, the same four comparisons the ANSI version performs:
+    // step * count must not exceed the matching stride.
+    l32i.n	a10,a2,4               	# [0]  id:760
+    l32i.n	a12,a2,12              	# [1]  id:759
+    mull	a8,a10,a5                	# [2]  
+    blt	a12,a8,.LBB89_dspi_dotprod_off_u16_aes3 	# [4]  
+
+    l32i.n	a13,a2,8               	# [0]  id:761
+    l32i.n	a9,a2,16               	# [1]  id:762
+    mull	a11,a13,a6               	# [2]  
+    blt	a9,a11,.LBB89_dspi_dotprod_off_u16_aes3 	# [4]  
+
+    l32i.n	a15,a3,4               	# [0]  id:764
+    l32i.n	a14,a3,12              	# [1]  id:763
+    mull	a11,a15,a5               	# [2]  
+    blt	a14,a11,.LBB89_dspi_dotprod_off_u16_aes3 	# [4]  
+
+    l32i.n	a8,a3,16               	# [0]  id:766
+    l32i.n	a9,a3,8                	# [1]  id:765
+    s32i	a9,a1,104                	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  
+    blt	a8,a9,.LBB89_dspi_dotprod_off_u16_aes3 	# [5]  
+
+    // Filter-side fast path conditions: even data address, contiguous rows,
+    // step_x == 1 and step_y == 1.
+    l32i.n	a8,a3,0                	# [0]  id:767
+    s32i	a8,a1,100                	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_36354         	# [2]  
+
+    bne	a14,a11,.Lt_0_36354       	# [0]  
+
+    bnei	a15,1,.Lt_0_36354        	# [0]  
+
+    l32i	a9,a1,104                	# [0]  gra_spill_temp_2
+    beqi	a9,1,.Lt_0_19458         	# [2]  
+
+    // ANSI fallback: a10..a15 carry arguments 1..6, the offset goes to the
+    // outgoing stack slot at a1+0.
+.Lt_0_36354:	# 0x46
+.Lt_0_19714:	# 0x46
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    l16ui	a8,a1,144               	# [6]  id:768 offset+0x0
+    s32i.n	a8,a1,0                	# [7]  id:876
+    .type	dspi_dotprod_off_u16_ansi, @function
+    call8	dspi_dotprod_off_u16_ansi 	# [8]  dspi_dotprod_off_u16_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // Range check failed: return the literal 458755 (0x70003); the ANSI version
+    // returns ESP_ERR_DSP_PARAM_OUTOFRANGE for the same conditions.
+.LBB89_dspi_dotprod_off_u16_aes3:	# 0x5e
+    l32r	a2,.LC0_1_61             	# [0]  
+    retw.n                        	# [1]  
+
+    // Image-side fast path conditions: step_x == 1, step_y == 1,
+    // count_x % 8 == 0 (whole 128-bit vectors) and count_y >= 4.
+.Lt_0_19458:	# 0x63
+    addi.n	a9,a10,-1              	# [0]  
+    bnez	a9,.Lt_0_37122           	# [1]  
+
+    addi.n	a10,a13,-1             	# [0]  
+    bnez	a10,.Lt_0_37122          	# [1]  
+
+    extui	a11,a5,0,3              	# [0]  
+    bnez.n	a11,.Lt_0_37122        	# [1]  
+
+    blti	a6,4,.Lt_0_37122         	# [0]  
+
+    // The count_x == 16 kernel runs count_y >> 1 passes of two rows and the
+    // count_x == 8 kernel runs count_y >> 2 passes of four rows. Row counts that
+    // do not divide evenly would lose their trailing rows, so use ANSI for them.
+    bnei	a5,16,.Lrows_chk8_dspi_dotprod_off_u16_aes3
+    extui	a9,a6,0,1
+    bnez	a9,.Lt_0_37122
+.Lrows_chk8_dspi_dotprod_off_u16_aes3:
+    bnei	a5,8,.Lrows_ok_dspi_dotprod_off_u16_aes3
+    extui	a9,a6,0,2
+    bnez	a9,.Lt_0_37122
+.Lrows_ok_dspi_dotprod_off_u16_aes3:
+
+    movi.n	a14,32                 	# [0]  
+    blt	a14,a5,.LBB27_dspi_dotprod_off_u16_aes3 	# [1]  
+
+    // Fast path setup.
+.Lt_0_37634:	# 0x7a
+.Lt_0_21506:	# 0x7a
+    l16ui	a9,a1,144               	# [0]  id:768 offset+0x0
+    addi	a8,a1,16                 	# [1]  temp_offset
+    l32i.n	a15,a2,0               	# [2]  id:769
+    mull	a10,a12,a13              	# [3]  
+    l32i	a2,a1,100                	# [4]  gra_spill_temp_1
+    slli	a10,a10,1                	# [5]  
+    s32i	a10,a1,96                	# [6]  gra_spill_temp_0
+    movi.n	a10,2                  	# [7]  
+    // Fill temp_offset (a1+16 .. a1+47) with 16 copies of the 16-bit offset.
+    # loop-count fixed at 2
+    loop	a10,.LBB143_dspi_dotprod_off_u16_aes3 	# [8]  
+
+.LBB138_dspi_dotprod_off_u16_aes3:	# 0x93
+    s16i	a9,a8,0                  	# [0*II+0]  id:770 temp_offset+0x0
+    s16i	a9,a8,2                  	# [0*II+1]  id:770 temp_offset+0x0
+    s16i	a9,a8,4                  	# [0*II+2]  id:770 temp_offset+0x0
+    s16i	a9,a8,6                  	# [0*II+3]  id:770 temp_offset+0x0
+    s16i	a9,a8,8                  	# [0*II+4]  id:770 temp_offset+0x0
+    s16i	a9,a8,10                 	# [0*II+5]  id:770 temp_offset+0x0
+    s16i	a9,a8,12                 	# [0*II+6]  id:770 temp_offset+0x0
+    s16i	a9,a8,14                 	# [0*II+7]  id:770 temp_offset+0x0
+    addi	a8,a8,16                 	# [0*II+8]  
+
+    // a3 = count_y, accumulator cleared, q6 = offset in every lane.
+    // From here on a15 walks the input image and a2 walks the filter.
+    // In the kernels, each multiply-accumulate that names q6 adds input * offset;
+    // its partner (the .qup form) adds filter * input and realigns the input stream.
+.LBB143_dspi_dotprod_off_u16_aes3:	# 0xae
+    mov.n	a3,a6                   	# [0]  
+    addi	a11,a5,-24               	# [1]  
+    addi	a12,a1,24                	# [3]  temp_offset+8
+    movi.n	a13,0                  	# [4]  
+    wur.sar_byte	a13              	# [5]  
+    wur.accx_0	a13                	# [6]  
+    wur.accx_1	a13                	# [7]  
+    ee.vld.128.ip	q6,a12,0        	# [8]  id:771
+    s32i.n	a12,a1,48              	# [9]  offset_data_ptr
+    beqz	a11,.LBB34_dspi_dotprod_off_u16_aes3 	# [10]  
+
+    // Every kernel except count_x == 24 starts with the first filter vector in q0.
+    l32i	a2,a1,100                	# [0]  gra_spill_temp_1
+    ee.vld.128.ip	q0,a2,16        	# [2]  id:787
+    st.qr	q0,a1,64                	# [3]  q0
+
+    // Dispatch on count_x. Each kernel jumps back to the label that follows its
+    // own test, falls through the remaining tests and ends in the epilogue.
+.Lt_0_25090:	# 0xd1
+    addi	a14,a5,-16               	# [0]  
+    beqz	a14,.LBB43_dspi_dotprod_off_u16_aes3 	# [1]  
+
+.Lt_0_27138:	# 0xd7
+.Lt_0_26626:	# 0xd7
+    addi	a8,a5,-8                 	# [0]  
+    beqz	a8,.LBB50_dspi_dotprod_off_u16_aes3 	# [1]  
+
+.Lt_0_28674:	# 0xdd
+.Lt_0_28162:	# 0xdd
+    addi	a9,a5,-32                	# [0]  
+    beqz	a9,.LBB57_dspi_dotprod_off_u16_aes3 	# [1]  
+
+.Lt_0_30210:	# 0xe3
+.Lt_0_29698:	# 0xe3
+    addi	a10,a5,-64               	# [0]  
+    beqz	a10,.LBB64_dspi_dotprod_off_u16_aes3 	# [1]  
+
+    movi.n	a11,64                 	# [0]  
+    bge	a11,a5,.Lt_0_33026        	# [1]  
+
+    // Generic kernel, count_x > 64: a12 counts rows, each row is count_x/32 - 1
+    // passes of the inner loop plus one tail pass that also steps to the next row.
+    movi.n	a12,0                  	# [0]  
+    ee.ld.128.usar.ip	q1,a15,16   	# [1]  id:849
+    ee.ld.128.usar.ip	q2,a15,16   	# [2]  id:850
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [4]  id:851
+    beqz.n	a3,.Lt_0_33026         	# [5]  
+
+    // a13 = count_x / 32 - 1, a14 = row pitch - 2 * count_x + 16 (bytes).
+    ld.qr	q0,a1,64                	# [0]  q0
+    slli	a8,a5,1                  	# [1]  
+    l32i	a14,a1,96                	# [2]  gra_spill_temp_0
+    addi	a13,a5,31                	# [3]  
+    movgez	a13,a5,a5              	# [4]  
+    srai	a13,a13,5                	# [5]  
+    sub	a14,a14,a8                	# [6]  
+    addi	a14,a14,16               	# [7]  
+    addi.n	a13,a13,-1             	# [8]  
+
+.Lt_0_33794:	# 0x115
+    beqz.n	a13,.Lt_0_34050        	# [0]  
+
+    loopnez	a13,.LBB280_dspi_dotprod_off_u16_aes3 	# [0]  
+
+    // Inner loop body: 4 vector pairs = 32 elements.
+.LBB278_dspi_dotprod_off_u16_aes3:	# 0x11a
+    ee.vmulas.u16.accx.ld.ip.qup	q0,a15,16,q0,q1,q2,q3 	# [0*II+0]  id:852
+    ee.vmulas.u16.accx.ld.ip	q1,a2,16,q1,q6 	# [0*II+1]  id:853
+    ee.vmulas.u16.accx.ld.ip.qup	q1,a15,16,q1,q2,q3,q0 	# [0*II+3]  id:854
+    ee.vmulas.u16.accx.ld.ip	q4,a2,16,q2,q6 	# [0*II+4]  id:855
+    ee.vmulas.u16.accx.ld.ip.qup	q2,a15,16,q4,q3,q0,q1 	# [0*II+6]  id:856
+    ee.vmulas.u16.accx.ld.ip	q4,a2,16,q3,q6 	# [0*II+7]  id:857
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a15,16,q4,q0,q1,q2 	# [0*II+9]  id:858
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+10]  id:859
+
+.LBB280_dspi_dotprod_off_u16_aes3:	# 0x13a
+
+    // Row tail: last 32 elements of the row; the a14 / -16 / +32 pointer steps
+    // move the input stream to the start of the next row.
+.Lt_0_34050:	# 0x13a
+    ee.vmulas.u16.accx.ld.ip.qup	q4,a15,16,q0,q1,q2,q3 	# [0]  id:860
+    ee.vmulas.u16.accx.ld.ip	q1,a2,16,q1,q6 	# [1]  id:861
+    movi.n	a9,32                  	# [2]  
+    ee.vmulas.u16.accx.ld.xp.qup	q0,a15,a14,q1,q2,q3,q4 	# [3]  id:862
+    ee.vmulas.u16.accx.ld.ip	q7,a2,16,q2,q6 	# [4]  id:863
+    movi.n	a10,-16                	# [5]  
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a15,a10,q7,q3,q4,q0 	# [6]  id:864
+    ee.vmulas.u16.accx.ld.ip	q5,a2,16,q3,q6 	# [7]  id:866
+    ee.ld.128.usar.xp	q1,a15,a9   	# [8]  id:865
+    addi.n	a12,a12,1              	# [9]  
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a15,16,q5,q4,q1,q2 	# [10]  id:867
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q4,q6 	# [11]  id:868
+    bne	a12,a3,.Lt_0_33794        	# [12]  
+
+    // Epilogue: a9 = accumulator bits 0..31, a10 = the bits above.
+    // shift < 1 stores the unscaled low 16 bits (see .Lt_0_35586).
+    // Otherwise: take acc >> (shift - 1), add 1, shift right once more,
+    // i.e. round to nearest, and store the low 16 bits. Returns 0.
+.Lt_0_33026:	# 0x166
+.Lt_0_32770:	# 0x166
+    rur.accx_0	a9                 	# [0]  
+    rur.accx_1	a10                	# [1]  
+    blti	a7,1,.Lt_0_35586         	# [2]  
+
+    movi.n	a2,0                   	# [0]  
+    addi	a13,a7,-33               	# [1]  
+    addi.n	a14,a7,-1              	# [2]  
+    ssr	a14                       	# [3]  
+    sra	a12,a10                   	# [4]  
+    src	a11,a10,a9                	# [5]  
+    // shift >= 33: the result comes from the upper word alone.
+    movgez	a11,a12,a13            	# [6]  
+    addi.n	a11,a11,1              	# [7]  
+    srli	a11,a11,1                	# [8]  
+    s16i	a11,a4,0                 	# [9]  id:874
+    retw.n                        	# [10]  
+
+    // ANSI fallback for shapes rejected by the image-side checks.
+.Lt_0_37122:	# 0x18c
+.Lt_0_20738:	# 0x18c
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    l16ui	a8,a1,144               	# [6]  id:768 offset+0x0
+    s32i.n	a8,a1,0                	# [7]  id:877
+    call8	dspi_dotprod_off_u16_ansi 	# [8]  dspi_dotprod_off_u16_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // count_x > 32: only count_x == 64 and the generic kernel (32 elements per
+    // pass) exist beyond this point, so count_x must be a multiple of 32.
+    // Other widths (40, 48, 56, 72, ...) would either match no kernel and return
+    // a zero sum or be processed with the wrong row length; they go to ANSI.
+.LBB27_dspi_dotprod_off_u16_aes3:	# 0x1a4
+    extui	a9,a5,0,5               	# [0]  
+    beqz	a9,.Lt_0_37634           	# [1]  
+
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    l16ui	a8,a1,144               	# [6]  id:768 offset+0x0
+    s32i.n	a8,a1,0                	# [7]  id:878
+    call8	dspi_dotprod_off_u16_ansi 	# [8]  dspi_dotprod_off_u16_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // Kernel count_x == 24: one row (3 vector pairs) per pass, count_y passes.
+    // a12 = row pitch - 32 bytes.
+.LBB34_dspi_dotprod_off_u16_aes3:	# 0x1c2
+    ee.ld.128.usar.ip	q0,a15,16   	# [0]  id:776
+    ee.ld.128.usar.ip	q2,a15,16   	# [1]  id:777
+    ee.src.q.ld.ip	q3,a15,16,q0,q2 	# [3]  id:778
+    beqz.n	a6,.Lt_0_25090         	# [4]  
+
+    movi.n	a10,32                 	# [0]  
+    l32i	a12,a1,96                	# [1]  gra_spill_temp_0
+    movi.n	a11,-16                	# [2]  
+    addi	a12,a12,-32              	# [3]  
+    loopgtz	a6,.LBB166_dspi_dotprod_off_u16_aes3 	# [4]  
+
+.LBB164_dspi_dotprod_off_u16_aes3:	# 0x1da
+    ee.vmulas.u16.accx.ld.ip	q1,a2,16,q0,q6 	# [0*II+0]  id:779
+    ee.vmulas.u16.accx.ld.xp.qup	q1,a15,a12,q1,q0,q2,q3 	# [0*II+2]  id:780
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q2,q6 	# [0*II+3]  id:781
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a15,a11,q0,q2,q3,q1 	# [0*II+5]  id:782
+    ee.vmulas.u16.accx.ld.ip	q1,a2,16,q3,q6 	# [0*II+6]  id:784
+    ee.ld.128.usar.xp	q0,a15,a10  	# [0*II+7]  id:783
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a15,16,q1,q3,q0,q2 	# [0*II+9]  id:785
+
+.LBB166_dspi_dotprod_off_u16_aes3:	# 0x1f5
+    st.qr	q1,a1,64                	# [0]  q0
+    j	.Lt_0_25090                 	# [1]  
+
+    // Kernel count_x == 16: two rows (4 vector pairs) per pass, count_y / 2 passes.
+    // a12 = row pitch - 16 bytes.
+.LBB43_dspi_dotprod_off_u16_aes3:	# 0x1fb
+    srli	a3,a6,1                  	# [0]  
+    l32i	a12,a1,96                	# [1]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a15,16   	# [2]  id:788
+    ee.ld.128.usar.ip	q2,a15,16   	# [3]  id:789
+    addi	a12,a12,-16              	# [5]  
+    ee.src.q.ld.xp	q3,a15,a12,q1,q2 	# [6]  id:790
+    beqz.n	a3,.Lt_0_27138         	# [7]  
+
+    ld.qr	q0,a1,64                	# [0]  q0
+    movi.n	a10,32                 	# [1]  
+    movi.n	a11,-16                	# [2]  
+    loopnez	a3,.LBB189_dspi_dotprod_off_u16_aes3 	# [3]  
+
+.LBB187_dspi_dotprod_off_u16_aes3:	# 0x219
+    ee.vmulas.u16.accx.ld.xp.qup	q0,a15,a11,q0,q1,q2,q3 	# [0*II+0]  id:791
+    ee.vmulas.u16.accx.ld.ip	q3,a2,16,q1,q6 	# [0*II+1]  id:792
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+2]  id:793
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a15,a12,q3,q2,q1,q0 	# [0*II+4]  id:794
+    ee.vmulas.u16.accx.ld.ip	q4,a2,16,q2,q6 	# [0*II+5]  id:795
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a15,a11,q4,q1,q0,q3 	# [0*II+7]  id:796
+    ee.vmulas.u16.accx.ld.ip	q3,a2,16,q1,q6 	# [0*II+8]  id:797
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+9]  id:798
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a15,a12,q3,q0,q1,q2 	# [0*II+11]  id:799
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+12]  id:800
+
+.LBB189_dspi_dotprod_off_u16_aes3:	# 0x23f
+    st.qr	q0,a1,64                	# [0]  q0
+    j	.Lt_0_27138                 	# [1]  
+
+    // Kernel count_x == 8: four rows (4 vector pairs) per pass, count_y / 4 passes.
+    // a11 = row pitch + 16 bytes, paired with -16 steps to hop row to row.
+.LBB50_dspi_dotprod_off_u16_aes3:	# 0x245
+    srli	a3,a3,2                  	# [0]  
+    movi.n	a13,-16                	# [1]  
+    l32i	a11,a1,96                	# [2]  gra_spill_temp_0
+    addi	a15,a15,16               	# [3]  
+    addi	a11,a11,16               	# [4]  
+    ee.ld.128.usar.xp	q2,a15,a13  	# [5]  id:801
+    ee.ld.128.usar.xp	q1,a15,a11  	# [6]  id:802
+    ee.src.q.ld.xp	q3,a15,a13,q1,q2 	# [8]  id:803
+    ee.ld.128.usar.xp	q2,a15,a11  	# [9]  id:804
+    beqz.n	a3,.Lt_0_28674         	# [10]  
+
+    ld.qr	q0,a1,64                	# [0]  q0
+    movi.n	a10,-16                	# [1]  
+    loopnez	a3,.LBB212_dspi_dotprod_off_u16_aes3 	# [2]  
+
+.LBB210_dspi_dotprod_off_u16_aes3:	# 0x269
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a15,a10,q0,q1,q2,q3 	# [0*II+0]  id:805
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q1,q6 	# [0*II+1]  id:806
+    ee.ld.128.usar.xp	q1,a15,a11  	# [0*II+2]  id:807
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a15,a10,q0,q2,q1,q3 	# [0*II+4]  id:808
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q2,q6 	# [0*II+5]  id:809
+    ee.ld.128.usar.xp	q4,a15,a11  	# [0*II+6]  id:810
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a15,a10,q0,q1,q4,q3 	# [0*II+8]  id:811
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q1,q6 	# [0*II+9]  id:812
+    ee.ld.128.usar.xp	q1,a15,a11  	# [0*II+10]  id:813
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a15,a10,q0,q4,q1,q3 	# [0*II+12]  id:814
+    ee.vmulas.u16.accx.ld.ip	q0,a2,16,q4,q6 	# [0*II+13]  id:815
+    ee.ld.128.usar.xp	q2,a15,a11  	# [0*II+14]  id:816
+
+.LBB212_dspi_dotprod_off_u16_aes3:	# 0x295
+    st.qr	q0,a1,64                	# [0]  q0
+    j	.Lt_0_28674                 	# [1]  
+
+    // Kernel count_x == 32: one row (4 vector pairs) per pass, count_y passes.
+    // a12 = row pitch - 2 * count_x + 16 bytes.
+.LBB57_dspi_dotprod_off_u16_aes3:	# 0x29b
+    ee.ld.128.usar.ip	q1,a15,16   	# [0]  id:817
+    ee.ld.128.usar.ip	q2,a15,16   	# [1]  id:818
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [3]  id:819
+    beqz.n	a3,.Lt_0_30210         	# [4]  
+
+    ld.qr	q0,a1,64                	# [0]  q0
+    movi.n	a10,32                 	# [1]  
+    movi.n	a11,-16                	# [2]  
+    l32i	a12,a1,96                	# [3]  gra_spill_temp_0
+    slli	a13,a5,1                 	# [4]  
+    sub	a12,a12,a13               	# [5]  
+    addi	a12,a12,16               	# [6]  
+    loopnez	a3,.LBB235_dspi_dotprod_off_u16_aes3 	# [7]  
+
+.LBB233_dspi_dotprod_off_u16_aes3:	# 0x2bc
+    ee.vmulas.u16.accx.ld.ip.qup	q0,a15,16,q0,q1,q2,q3 	# [0*II+0]  id:820
+    ee.vmulas.u16.accx.ld.ip		q4,a2,16,q1,q6 	# [0*II+1]  id:821
+    ee.vmulas.u16.accx.ld.xp.qup	q4,a15,a12,q4,q2,q3,q0 	# [0*II+3]  id:822
+    ee.vmulas.u16.accx.ld.ip		q1,a2,16,q2,q6 	# [0*II+4]  id:823
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a15,a11,q1,q3,q0,q4 	# [0*II+6]  id:824
+    ee.vmulas.u16.accx.ld.ip		q4,a2,16,q3,q6 	# [0*II+7]  id:826
+    ee.ld.128.usar.xp				q1,a15,a10  	# [0*II+8]  id:825
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a15,16,q4,q0,q1,q2 	# [0*II+10]  id:827
+    ee.vmulas.u16.accx.ld.ip		q0,a2,16,q0,q6 	# [0*II+11]  id:828
+
+.LBB235_dspi_dotprod_off_u16_aes3:	# 0x2df
+    st.qr	q0,a1,64                	# [0]  q0
+    j	.Lt_0_30210                 	# [1]  
+
+    // Kernel count_x == 64: one row (8 vector pairs) per pass, count_y passes.
+    // The input pointer is copied to a8 for this kernel.
+.LBB64_dspi_dotprod_off_u16_aes3:	# 0x2e5
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    slli	a13,a5,1                 	# [2]  
+    l32i	a12,a1,96                	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a15,16   	# [4]  id:829
+    ee.ld.128.usar.ip	q2,a15,16   	# [5]  id:830
+    sub	a12,a12,a13               	# [7]  
+    addi	a12,a12,16               	# [8]  
+    ld.qr	q0,a1,64                	# [9]  q0
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [10]  id:831
+    mov.n	a8,a15                  	# [11]  
+    loopnez	a3,.LBB257_dspi_dotprod_off_u16_aes3 	# [12]  
+
+.LBB255_dspi_dotprod_off_u16_aes3:	# 0x306
+    ee.vmulas.u16.accx.ld.ip.qup	q0,a8,16,q0,q1,q2,q3 	# [0*II+0]  id:832
+    ee.vmulas.u16.accx.ld.ip		q4,a2,16,q1,q6 	# [0*II+1]  id:833
+    ee.vmulas.u16.accx.ld.ip.qup	q4,a8,16,q4,q2,q3,q0 	# [0*II+3]  id:834
+    ee.vmulas.u16.accx.ld.ip		q1,a2,16,q2,q6 	# [0*II+4]  id:835
+    ee.vmulas.u16.accx.ld.ip.qup	q1,a8,16,q1,q3,q0,q4 	# [0*II+6]  id:836
+    ee.vmulas.u16.accx.ld.ip		q5,a2,16,q3,q6 	# [0*II+7]  id:837
+    ee.vmulas.u16.accx.ld.ip.qup	q5,a8,16,q5,q0,q4,q1 	# [0*II+9]  id:838
+    ee.vmulas.u16.accx.ld.ip		q0,a2,16,q0,q6 	# [0*II+10]  id:839
+    ee.vmulas.u16.accx.ld.ip.qup	q0,a8,16,q0,q4,q1,q5 	# [0*II+12]  id:840
+    ee.vmulas.u16.accx.ld.ip		q4,a2,16,q4,q6 	# [0*II+13]  id:841
+    ee.vmulas.u16.accx.ld.xp.qup	q4,a8,a12,q4,q1,q5,q0 	# [0*II+15]  id:842
+    ee.vmulas.u16.accx.ld.ip		q1,a2,16,q1,q6 	# [0*II+16]  id:843
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a8,a11,q1,q5,q0,q4 	# [0*II+18]  id:844
+    ee.vmulas.u16.accx.ld.ip		q4,a2,16,q5,q6 	# [0*II+19]  id:846
+    ee.ld.128.usar.xp				q1,a8,a10   	# [0*II+20]  id:845
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a8,16,q4,q0,q1,q2 	# [0*II+22]  id:847
+    ee.vmulas.u16.accx.ld.ip		q0,a2,16,q0,q6 	# [0*II+23]  id:848
+
+.LBB257_dspi_dotprod_off_u16_aes3:	# 0x349
+    j	.Lt_0_33026                 	# [0]  
+
+    // shift < 1: no rounding or scaling, store the low 16 accumulator bits, return 0.
+.Lt_0_35586:	# 0x34c
+    movi.n	a2,0                   	# [0]  
+    sext	a14,a9,15                	# [1]  
+    s16i	a14,a4,0                 	# [2]  id:875
+    retw.n                        	# [3]  
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u16_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Portable reference implementation of the 2D dot product with offset for
+// unsigned 16-bit data:
+//   out = (sum over y, x of in[y][x] * (filter[y][x] + offset) + rounding) >> shift
+//
+// in_image, filter - image descriptors; data, step_x, step_y, stride_x and
+//                    stride_y are read from each of them
+// out_value        - receives the result, truncated to uint16_t
+// count_x, count_y - number of samples to process in each direction
+// shift            - right shift applied to the accumulated sum; when shift > 0
+//                    half of the divisor is added first (round to nearest),
+//                    when shift <= 0 the sum is stored unshifted
+// offset           - constant added to every filter sample before multiplying
+//
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the matching
+// stride for either image, ESP_OK otherwise.
+esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset)
+{
+    if (in_image->step_x * count_x > in_image->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (in_image->step_y * count_y > in_image->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_x * count_x > filter->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_y * count_y > filter->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    uint16_t *i_data =  (uint16_t *)in_image->data;
+    uint16_t *f_data =  (uint16_t *)filter->data;
+    // Distance between two processed rows, in samples.
+    int i_step = in_image->stride_x * in_image->step_y;
+    int f_step = filter->stride_x * filter->step_y;
+
+    int64_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            // Widen before multiplying: 65535 * (65535 + 65535) does not fit in
+            // int32_t, so a 32-bit product would overflow (undefined behaviour).
+            int64_t in_sample = (int64_t)i_data[in_image->step_x * x];
+            int64_t weight = (int64_t)f_data[filter->step_x * x] + (int64_t)offset;
+            acc += in_sample * weight;
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // A shift by (shift - 1) is only defined for shift >= 1, and the rounding
+    // term must be built in 64 bits so that shift values above 31 stay defined.
+    if (shift > 0) {
+        acc += (int64_t)1 << (shift - 1);    // round operation
+        acc >>= shift;
+    }
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u16_arp4.S
@@ -0,0 +1,104 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_off_u16_arp4
+    .global dspi_dotprod_off_u16_ansi
+    .type   dspi_dotprod_off_u16_arp4,@function
+
+// esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
+//
+// Vectorised path is used only when (in_image->step_x | filter->step_x) == 1
+// and count_x is a multiple of 8; every other case tail-jumps to
+// dspi_dotprod_off_u16_ansi with the argument registers untouched.
+// Note: the vectorised path below does not repeat the step/stride range checks
+// that the C reference performs.
+dspi_dotprod_off_u16_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// offset       - a6
+
+// i_data       - t0
+// f_data       - t1
+// i_step       - t2
+// f_step       - t3
+// t4           - current i_data
+// t5           - current f_data
+// t6           - inner loop count (count_x / 8)
+
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // 0 only when (in_image->step_x | filter->step_x) == 1
+    andi t2, a3, 7  // count_x % 8
+    or   t1, t1, t2 // 0 only when, in addition, count_x is a multiple of 8
+    
+    beqz    t1, .dspi_dotprod_off_u16_arp4_body
+    j   dspi_dotprod_off_u16_ansi // unsupported layout: let the C reference handle the call
+
+.dspi_dotprod_off_u16_arp4_body:
+    add sp, sp, -16     // 16 bytes of scratch stack, released before ret
+
+    sw  a6, 0(sp)       // spill offset so it can be loaded into a vector register
+    mv  t6, sp
+    esp.vldbc.16.ip   q2, t6, 0 // q2 = 16-bit offset read from the stack (presumably broadcast to every lane - confirm against the ISA manual)
+
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2
+    slli t2, t2, 1      // i_step = i_step<<1 (samples -> bytes)
+
+    lw  t3, 8(a1)       // step_y
+    lw  t5, 12(a1)      // stride_x
+    mul t3, t5, t3
+    slli t3, t3, 1      // f_step = f_step<<1 (samples -> bytes)
+
+    srli t6, a3, 3      // t6 = count_x/8 (128-bit vectors per row)
+    
+
+    // Rounding term: t4 = 1 << (shift - 1), used to seed the accumulator.
+    // NOTE(review): for shift == 0 the shift amount is -1; RV32 sll uses only the
+    // low 5 bits, so this would seed 1 << 31 - confirm shift >= 1 is required.
+    addi    a7, a5, -1
+    li      t4, 1
+    sll     t4, t4, a7
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4 // presumably writes t4 into the low word of XACC
+
+.loop_count_y:
+        mv      t4, t0      // t4 = start of current input row
+        mv      t5, t1      // t5 = start of current filter row
+        esp.vld.128.ip      q1, t5, 16          // q1 - f_data (first vector of the row)
+
+        esp.lp.setup    0, t6, .loop_count_x    // loop through .loop_count_x, t6 times (presumably hardware loop 0)
+            esp.vld.128.ip          q0, t4, 16  // q0 - i_data
+            esp.vadd.u16            q3, q2, q1  // q3 = offset + f_data
+.loop_count_x:  esp.vmulas.u16.xacc.ld.ip       q1, t5, 16, q0, q3  // multiply-accumulate q0 and q3 into XACC; q1 - next f_data
+
+        add     t0, t0, t2  // next input row
+        add     t1, t1, t3  // next filter row
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.u.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sh  t5, 0(a2)               // store the low 16 bits of the result to *out_value
+
+    li  a0,0                    // return 0 (the C reference returns ESP_OK on success)
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u8_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u8_aes3.S
@@ -0,0 +1,407 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_57, 458755
+
+    # Program Unit: dspi_dotprod_off_u8_aes3
+    .type	dspi_dotprod_off_u8_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_off_u8_aes3
+// dspi_dotprod_off_u8_aes3: entry, argument validation and first fallback.
+// Registers after `entry`, as used by the code below:
+//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x,
+//   a6 = count_y,  a7 = shift,  byte at [a1 + 112] = offset (stack argument).
+// Descriptor fields are read at +0 (data), +4 (step_x), +8 (step_y),
+// +12 (stride_x) and +16 (stride_y); the names follow the equivalent checks
+// in the C reference implementation.
+dspi_dotprod_off_u8_aes3:	# 0x4
+
+.LBB1_dspi_dotprod_off_u8_aes3:	# 0x4
+    entry	a1,112                  	#  
+    // in_image: error if step_x * count_x > stride_x
+    l32i.n	a10,a2,4               	# [0]  id:745
+    l32i.n	a12,a2,12              	# [1]  id:744
+    mull	a8,a10,a5                	# [2]  
+    blt	a12,a8,.LBB86_dspi_dotprod_off_u8_aes3 	# [4]  
+
+    // in_image: error if step_y * count_y > stride_y
+    l32i.n	a13,a2,8               	# [0]  id:746
+    l32i.n	a9,a2,16               	# [1]  id:747
+    mull	a11,a13,a6               	# [2]  
+    blt	a9,a11,.LBB86_dspi_dotprod_off_u8_aes3 	# [4]  
+
+    // filter: error if step_x * count_x > stride_x
+    l32i.n	a15,a3,4               	# [0]  id:749
+    l32i.n	a14,a3,12              	# [1]  id:748
+    mull	a11,a15,a5               	# [2]  
+    blt	a14,a11,.LBB86_dspi_dotprod_off_u8_aes3 	# [4]  
+
+    // filter: error if step_y * count_y > stride_y (filter step_y is kept at a1+72)
+    l32i.n	a8,a3,16               	# [0]  id:751
+    l32i.n	a9,a3,8                	# [1]  id:750
+    s32i	a9,a1,72                 	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  
+    blt	a8,a9,.LBB86_dspi_dotprod_off_u8_aes3 	# [5]  
+
+    // Filter data pointer is kept at a1+68; fall back if its bit 0 is set.
+    l32i.n	a8,a3,0                	# [0]  id:752
+    s32i	a8,a1,68                 	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_35330         	# [2]  
+
+    // Fall back unless filter stride_x == step_x * count_x ...
+    bne	a14,a11,.Lt_0_35330       	# [0]  
+
+    // ... and filter step_x == 1 ...
+    bnei	a15,1,.Lt_0_35330        	# [0]  
+
+    // ... and filter step_y == 1 (then continue with the in_image checks).
+    l32i	a11,a1,72                	# [0]  gra_spill_temp_2
+    beqi	a11,1,.Lt_0_18946        	# [2]  
+
+    // Fallback: forward the six register arguments to the C routine and
+    // return its result.
+    // NOTE(review): the callee is dspi_dotprod_u8_ansi and only a10..a15 are
+    // set up, so the offset argument of this function is not forwarded.
+    // Confirm whether dspi_dotprod_off_u8_ansi (with offset) was intended.
+.Lt_0_35330:	# 0x46
+.Lt_0_19202:	# 0x46
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    .type	dspi_dotprod_u8_ansi, @function
+    call8	dspi_dotprod_u8_ansi    	# [6]  dspi_dotprod_u8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // Range-check failure: return the literal 458755 (0x70003). The C reference
+    // returns ESP_ERR_DSP_PARAM_OUTOFRANGE for the same conditions.
+.LBB86_dspi_dotprod_off_u8_aes3:	# 0x59
+    l32r	a2,.LC0_1_57             	# [0]  
+    retw.n                        	# [1]  
+
+// Fast-path eligibility for the input image, then set-up, dispatch on count_x,
+// the generic kernel for count_x > 128 and the final rounding/store.
+// Here a10 = in_image step_x, a13 = in_image step_y, a12 = in_image stride_x.
+.Lt_0_18946:	# 0x5e
+    // Fall back to C unless in_image step_x == 1 ...
+    addi.n	a14,a10,-1             	# [0]  
+    bnez	a14,.Lt_0_36098          	# [1]  
+
+    // ... and in_image step_y == 1 ...
+    addi.n	a15,a13,-1             	# [0]  
+    bnez	a15,.Lt_0_36098          	# [1]  
+
+    // ... and count_x is a multiple of 16 ...
+    extui	a8,a5,0,4               	# [0]  
+    bnez.n	a8,.Lt_0_36098         	# [1]  
+
+    // ... and count_y >= 4.
+    blti	a6,4,.Lt_0_36098         	# [0]  
+
+    // count_x > 64 takes an extra check at .LBB27 before coming back here.
+    movi.n	a9,64                  	# [0]  
+    blt	a9,a5,.LBB27_dspi_dotprod_off_u8_aes3 	# [1]  
+
+.Lt_0_36610:	# 0x75
+.Lt_0_20994:	# 0x75
+    // a9 = offset byte (stack argument), a15 = in_image data pointer,
+    // a2 = filter data pointer, [a1+64] = in_image stride_x * step_y.
+    l8ui	a9,a1,112                	# [0]  id:754 offset+0x0
+    mov.n	a8,a1                   	# [1]  
+    l32i.n	a15,a2,0               	# [2]  id:753
+    mull	a10,a12,a13              	# [3]  
+    l32i	a2,a1,68                 	# [4]  gra_spill_temp_1
+    s32i	a10,a1,64                	# [5]  gra_spill_temp_0
+    movi.n	a10,4                  	# [6]  
+    # loop-count fixed at 4
+    loop	a10,.LBB140_dspi_dotprod_off_u8_aes3 	# [7]  
+
+    // Fill the 32 bytes at a1+0 .. a1+31 with the offset byte (4 x 8 stores).
+.LBB135_dspi_dotprod_off_u8_aes3:	# 0x8a
+    s8i	a9,a8,0                   	# [0*II+0]  id:755 temp_offset+0x0
+    s8i	a9,a8,1                   	# [0*II+1]  id:755 temp_offset+0x0
+    s8i	a9,a8,2                   	# [0*II+2]  id:755 temp_offset+0x0
+    s8i	a9,a8,3                   	# [0*II+3]  id:755 temp_offset+0x0
+    s8i	a9,a8,4                   	# [0*II+4]  id:755 temp_offset+0x0
+    s8i	a9,a8,5                   	# [0*II+5]  id:755 temp_offset+0x0
+    s8i	a9,a8,6                   	# [0*II+6]  id:755 temp_offset+0x0
+    s8i	a9,a8,7                   	# [0*II+7]  id:755 temp_offset+0x0
+    addi.n	a8,a8,8                	# [0*II+8]  
+
+.LBB140_dspi_dotprod_off_u8_aes3:	# 0xa4
+    // a3 = count_y; clear ACCX; q6 = 16 offset bytes from the buffer above.
+    mov.n	a3,a6                   	# [0]  
+    addi	a11,a5,-48               	# [1]  
+    addi.n	a12,a1,8               	# [3]  temp_offset+8
+    movi.n	a13,0                  	# [4]  
+    wur.accx_0	a13                	# [5]  
+    wur.accx_1	a13                	# [6]  
+    ee.vld.128.ip	q6,a12,0        	# [7]  id:756
+    s32i.n	a12,a1,32              	# [8]  offset_data_ptr
+    // count_x == 48 has its own kernel.
+    beqz	a11,.LBB34_dspi_dotprod_off_u8_aes3 	# [9]  
+
+    // Otherwise preload the first 16 filter bytes and keep them at a1+48.
+    l32i	a2,a1,68                 	# [0]  gra_spill_temp_1
+    ee.vld.128.ip	q0,a2,16        	# [2]  id:771
+    st.qr	q0,a1,48                	# [3]  q0
+
+    // Dispatch chain: each kernel jumps back to the label after its test.
+.Lt_0_24578:	# 0xc3
+    // count_x == 32
+    addi	a14,a5,-32               	# [0]  
+    beqz	a14,.LBB43_dspi_dotprod_off_u8_aes3 	# [1]  
+
+.Lt_0_26626:	# 0xc9
+.Lt_0_26114:	# 0xc9
+    // count_x == 16
+    addi	a8,a5,-16                	# [0]  
+    beqz	a8,.LBB50_dspi_dotprod_off_u8_aes3 	# [1]  
+
+.Lt_0_28162:	# 0xcf
+.Lt_0_27650:	# 0xcf
+    // count_x == 64
+    addi	a9,a5,-64                	# [0]  
+    beqz	a9,.LBB57_dspi_dotprod_off_u8_aes3 	# [1]  
+
+.Lt_0_29698:	# 0xd5
+.Lt_0_29186:	# 0xd5
+    // count_x == 128 (that kernel stores the result and returns itself)
+    addi	a10,a5,-128              	# [0]  
+    beqz	a10,.LBB64_dspi_dotprod_off_u8_aes3 	# [1]  
+
+    // count_x <= 128: go to the result stage.
+    // NOTE(review): count_x of 80, 96 or 112 passes the multiple-of-16 check but
+    // matches none of the kernels above, so it appears to reach the result stage
+    // with nothing accumulated - confirm.
+    movi	a11,128                  	# [0]  
+    bge	a11,a5,.Lt_0_32514        	# [1]  
+
+    // Generic kernel for count_x > 128. a12 counts processed rows up to a3.
+    movi.n	a12,0                  	# [0]  
+    ee.ld.128.usar.ip	q1,a15,16   	# [1]  id:833
+    ee.ld.128.usar.ip	q2,a15,16   	# [2]  id:834
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [4]  id:835
+    beqz.n	a3,.Lt_0_32514         	# [5]  
+
+    // a13 = count_x / 32 - 1 (inner loop trip count);
+    // a14 = in_image stride_x * step_y - count_x + 16 (input pointer step used
+    // by one of the .xp loads below).
+    ld.qr	q0,a1,48                	# [0]  q0
+    l32i	a14,a1,64                	# [1]  gra_spill_temp_0
+    addi	a13,a5,31                	# [2]  
+    movgez	a13,a5,a5              	# [3]  
+    srai	a13,a13,5                	# [4]  
+    sub	a14,a14,a5                	# [5]  
+    addi	a14,a14,16               	# [6]  
+    addi.n	a13,a13,-1             	# [7]  
+
+.Lt_0_33282:	# 0x105
+    beqz.n	a13,.Lt_0_33538        	# [0]  
+
+    loopnez	a13,.LBB277_dspi_dotprod_off_u8_aes3 	# [0]  
+
+    // Steady state: instructions alternate between an accumulate that loads the
+    // next input vector (via a15) and an accumulate against q6 (the offset
+    // vector) that loads the next filter vector (via a2).
+.LBB275_dspi_dotprod_off_u8_aes3:	# 0x10a
+    ee.vmulas.u8.accx.ld.ip.qup	q0,a15,16,q0,q1,q2,q3 	# [0*II+0]  id:836
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q1,q6 	# [0*II+1]  id:837
+    ee.vmulas.u8.accx.ld.ip.qup	q1,a15,16,q1,q2,q3,q0 	# [0*II+3]  id:838
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q2,q6 	# [0*II+4]  id:839
+    ee.vmulas.u8.accx.ld.ip.qup	q2,a15,16,q4,q3,q0,q1 	# [0*II+6]  id:840
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q3,q6 	# [0*II+7]  id:841
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a15,16,q4,q0,q1,q2 	# [0*II+9]  id:842
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+10]  id:843
+
+.LBB277_dspi_dotprod_off_u8_aes3:	# 0x12a
+
+    // Tail of the row; the .xp loads step a15 by a14, -16 and 32.
+.Lt_0_33538:	# 0x12a
+    ee.vmulas.u8.accx.ld.ip.qup	q4,a15,16,q0,q1,q2,q3 	# [0]  id:844
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q1,q6 	# [1]  id:845
+    movi.n	a8,32                  	# [2]  
+    ee.vmulas.u8.accx.ld.xp.qup	q0,a15,a14,q1,q2,q3,q4 	# [3]  id:846
+    ee.vmulas.u8.accx.ld.ip	q7,a2,16,q2,q6 	# [4]  id:847
+    movi.n	a9,-16                 	# [5]  
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a15,a9,q7,q3,q4,q0 	# [6]  id:848
+    ee.vmulas.u8.accx.ld.ip	q5,a2,16,q3,q6 	# [7]  id:850
+    ee.ld.128.usar.xp	q1,a15,a8   	# [8]  id:849
+    addi.n	a12,a12,1              	# [9]  
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a15,16,q5,q4,q1,q2 	# [10]  id:851
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q4,q6 	# [11]  id:852
+    bne	a12,a3,.Lt_0_33282        	# [12]  
+
+    // Result stage: a2 = 0 is the return value; take the low 32 bits of ACCX,
+    // add 1 << (shift - 1), shift right arithmetically by shift and store the
+    // low byte to *out_value.
+.Lt_0_32514:	# 0x156
+.Lt_0_32258:	# 0x156
+    movi.n	a2,0                   	# [0]  
+    rur.accx_0	a10                	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    movi.n	a11,1                  	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    sra	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:854
+    retw.n                        	# [10]  
+
+// Fallback used when the input image layout, count_x or count_y does not fit
+// the vector kernels: forward the six register arguments to the C routine and
+// return its result.
+// NOTE(review): as with the first fallback, the callee is dspi_dotprod_u8_ansi
+// and the offset argument is not forwarded - confirm this is intended.
+.Lt_0_36098:	# 0x172
+.Lt_0_20226:	# 0x172
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_u8_ansi    	# [6]  dspi_dotprod_u8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+// Reached when count_x > 64: continue on the vector path when bit 0 of count_x
+// is clear, otherwise call the C routine. The only branch to this label comes
+// after the multiple-of-16 check on count_x, so bit 0 is expected to be clear
+// whenever this code runs.
+.LBB27_dspi_dotprod_off_u8_aes3:	# 0x185
+    extui	a14,a5,0,1              	# [0]  
+    beqz	a14,.Lt_0_36610          	# [1]  
+
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_u8_ansi    	# [6]  dspi_dotprod_u8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+// Specialised kernels, one per supported count_x. In all of them a15 (a8 in the
+// 128 case) walks the input image, a2 walks the filter, q6 holds the offset
+// bytes and [a1+64] holds in_image stride_x * step_y. Except for the 128 case,
+// each kernel saves a vector register at a1+48 and jumps back into the dispatch
+// chain.
+
+// count_x == 48: hardware loop runs count_y (a6) times.
+.LBB34_dspi_dotprod_off_u8_aes3:	# 0x19e
+    ee.ld.128.usar.ip	q0,a15,16   	# [0]  id:760
+    ee.ld.128.usar.ip	q2,a15,16   	# [1]  id:761
+    ee.src.q.ld.ip	q3,a15,16,q0,q2 	# [3]  id:762
+    beqz.n	a6,.Lt_0_24578         	# [4]  
+
+    // a12 = stride_x * step_y - 32, a11 = -16, a10 = 32: input pointer steps
+    movi.n	a10,32                 	# [0]  
+    l32i	a12,a1,64                	# [1]  gra_spill_temp_0
+    movi.n	a11,-16                	# [2]  
+    addi	a12,a12,-32              	# [3]  
+    loopgtz	a6,.LBB163_dspi_dotprod_off_u8_aes3 	# [4]  
+
+.LBB161_dspi_dotprod_off_u8_aes3:	# 0x1b6
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q0,q6 	# [0*II+0]  id:763
+    ee.vmulas.u8.accx.ld.xp.qup	q1,a15,a12,q1,q0,q2,q3 	# [0*II+2]  id:764
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q2,q6 	# [0*II+3]  id:765
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a15,a11,q0,q2,q3,q1 	# [0*II+5]  id:766
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q3,q6 	# [0*II+6]  id:768
+    ee.ld.128.usar.xp	q0,a15,a10  	# [0*II+7]  id:767
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a15,16,q1,q3,q0,q2 	# [0*II+9]  id:769
+
+.LBB163_dspi_dotprod_off_u8_aes3:	# 0x1d1
+    st.qr	q1,a1,48                	# [0]  q0
+    j	.Lt_0_24578                 	# [1]  
+
+// count_x == 32: hardware loop runs count_y >> 1 times.
+// NOTE(review): an odd count_y does not appear to be handled separately here -
+// confirm.
+.LBB43_dspi_dotprod_off_u8_aes3:	# 0x1d7
+    srli	a3,a6,1                  	# [0]  
+    l32i	a12,a1,64                	# [1]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a15,16   	# [2]  id:772
+    ee.ld.128.usar.ip	q2,a15,16   	# [3]  id:773
+    addi	a12,a12,-16              	# [5]  
+    ee.src.q.ld.xp	q3,a15,a12,q1,q2 	# [6]  id:774
+    beqz.n	a3,.Lt_0_26626         	# [7]  
+
+    ld.qr	q0,a1,48                	# [0]  q0
+    movi.n	a10,32                 	# [1]  
+    movi.n	a11,-16                	# [2]  
+    loopnez	a3,.LBB186_dspi_dotprod_off_u8_aes3 	# [3]  
+
+.LBB184_dspi_dotprod_off_u8_aes3:	# 0x1f5
+    ee.vmulas.u8.accx.ld.xp.qup	q0,a15,a11,q0,q1,q2,q3 	# [0*II+0]  id:775
+    ee.vmulas.u8.accx.ld.ip	q3,a2,16,q1,q6 	# [0*II+1]  id:776
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+2]  id:777
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a15,a12,q3,q2,q1,q0 	# [0*II+4]  id:778
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q2,q6 	# [0*II+5]  id:779
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a15,a11,q4,q1,q0,q3 	# [0*II+7]  id:780
+    ee.vmulas.u8.accx.ld.ip	q3,a2,16,q1,q6 	# [0*II+8]  id:781
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+9]  id:782
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a15,a12,q3,q0,q1,q2 	# [0*II+11]  id:783
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+12]  id:784
+
+.LBB186_dspi_dotprod_off_u8_aes3:	# 0x21b
+    st.qr	q0,a1,48                	# [0]  q0
+    j	.Lt_0_26626                 	# [1]  
+
+// count_x == 16: hardware loop runs a3 >> 2 times (a3 holds count_y on this path).
+// NOTE(review): a count_y that is not a multiple of 4 does not appear to be
+// handled separately here - confirm.
+.LBB50_dspi_dotprod_off_u8_aes3:	# 0x221
+    srli	a3,a3,2                  	# [0]  
+    movi.n	a13,-16                	# [1]  
+    l32i	a11,a1,64                	# [2]  gra_spill_temp_0
+    addi	a15,a15,16               	# [3]  
+    addi	a11,a11,16               	# [4]  
+    ee.ld.128.usar.xp	q2,a15,a13  	# [5]  id:785
+    ee.ld.128.usar.xp	q1,a15,a11  	# [6]  id:786
+    ee.src.q.ld.xp	q3,a15,a13,q1,q2 	# [8]  id:787
+    ee.ld.128.usar.xp	q2,a15,a11  	# [9]  id:788
+    beqz.n	a3,.Lt_0_28162         	# [10]  
+
+    ld.qr	q0,a1,48                	# [0]  q0
+    movi.n	a10,-16                	# [1]  
+    loopnez	a3,.LBB209_dspi_dotprod_off_u8_aes3 	# [2]  
+
+.LBB207_dspi_dotprod_off_u8_aes3:	# 0x245
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a15,a10,q0,q1,q2,q3 	# [0*II+0]  id:789
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q1,q6 	# [0*II+1]  id:790
+    ee.ld.128.usar.xp	q1,a15,a11  	# [0*II+2]  id:791
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a15,a10,q0,q2,q1,q3 	# [0*II+4]  id:792
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q2,q6 	# [0*II+5]  id:793
+    ee.ld.128.usar.xp	q4,a15,a11  	# [0*II+6]  id:794
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a15,a10,q0,q1,q4,q3 	# [0*II+8]  id:795
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q1,q6 	# [0*II+9]  id:796
+    ee.ld.128.usar.xp	q1,a15,a11  	# [0*II+10]  id:797
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a15,a10,q0,q4,q1,q3 	# [0*II+12]  id:798
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q4,q6 	# [0*II+13]  id:799
+    ee.ld.128.usar.xp	q2,a15,a11  	# [0*II+14]  id:800
+
+.LBB209_dspi_dotprod_off_u8_aes3:	# 0x271
+    st.qr	q0,a1,48                	# [0]  q0
+    j	.Lt_0_28162                 	# [1]  
+
+// count_x == 64: hardware loop runs a3 times.
+.LBB57_dspi_dotprod_off_u8_aes3:	# 0x277
+    ee.ld.128.usar.ip	q1,a15,16   	# [0]  id:801
+    ee.ld.128.usar.ip	q2,a15,16   	# [1]  id:802
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [3]  id:803
+    beqz.n	a3,.Lt_0_29698         	# [4]  
+
+    // a12 = stride_x * step_y - count_x + 16
+    ld.qr	q0,a1,48                	# [0]  q0
+    movi.n	a10,32                 	# [1]  
+    l32i	a12,a1,64                	# [2]  gra_spill_temp_0
+    movi.n	a11,-16                	# [3]  
+    sub	a12,a12,a5                	# [4]  
+    addi	a12,a12,16               	# [5]  
+    loopnez	a3,.LBB232_dspi_dotprod_off_u8_aes3 	# [6]  
+
+.LBB230_dspi_dotprod_off_u8_aes3:	# 0x295
+    ee.vmulas.u8.accx.ld.ip.qup	q0,a15,16,q0,q1,q2,q3 	# [0*II+0]  id:804
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q1,q6 	# [0*II+1]  id:805
+    ee.vmulas.u8.accx.ld.xp.qup	q4,a15,a12,q4,q2,q3,q0 	# [0*II+3]  id:806
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q2,q6 	# [0*II+4]  id:807
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a15,a11,q1,q3,q0,q4 	# [0*II+6]  id:808
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q3,q6 	# [0*II+7]  id:809
+    ee.ld.128.usar.xp	q1,a15,a10  	# [0*II+8]  id:810
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a15,16,q4,q0,q1,q2 	# [0*II+10]  id:811
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+11]  id:812
+
+.LBB232_dspi_dotprod_off_u8_aes3:	# 0x2b8
+    st.qr	q0,a1,48                	# [0]  q0
+    j	.Lt_0_29698                 	# [1]  
+
+// count_x == 128: hardware loop runs a3 times with a8 as the input pointer;
+// this kernel then rounds, shifts and stores the result itself and returns.
+.LBB64_dspi_dotprod_off_u8_aes3:	# 0x2be
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    l32i	a12,a1,64                	# [2]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a15,16   	# [3]  id:813
+    ee.ld.128.usar.ip	q2,a15,16   	# [4]  id:814
+    sub	a12,a12,a5                	# [6]  
+    addi	a12,a12,16               	# [7]  
+    ld.qr	q0,a1,48                	# [8]  q0
+    ee.src.q.ld.ip	q3,a15,16,q1,q2 	# [9]  id:815
+    mov.n	a8,a15                  	# [10]  
+    loopnez	a3,.LBB254_dspi_dotprod_off_u8_aes3 	# [11]  
+
+.LBB252_dspi_dotprod_off_u8_aes3:	# 0x2dc
+    ee.vmulas.u8.accx.ld.ip.qup	q0,a8,16,q0,q1,q2,q3 	# [0*II+0]  id:816
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q1,q6 	# [0*II+1]  id:817
+    ee.vmulas.u8.accx.ld.ip.qup	q4,a8,16,q4,q2,q3,q0 	# [0*II+3]  id:818
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q2,q6 	# [0*II+4]  id:819
+    ee.vmulas.u8.accx.ld.ip.qup	q1,a8,16,q1,q3,q0,q4 	# [0*II+6]  id:820
+    ee.vmulas.u8.accx.ld.ip	q5,a2,16,q3,q6 	# [0*II+7]  id:821
+    ee.vmulas.u8.accx.ld.ip.qup	q5,a8,16,q5,q0,q4,q1 	# [0*II+9]  id:822
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+10]  id:823
+    ee.vmulas.u8.accx.ld.ip.qup	q0,a8,16,q0,q4,q1,q5 	# [0*II+12]  id:824
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q4,q6 	# [0*II+13]  id:825
+    ee.vmulas.u8.accx.ld.xp.qup	q4,a8,a12,q4,q1,q5,q0 	# [0*II+15]  id:826
+    ee.vmulas.u8.accx.ld.ip	q1,a2,16,q1,q6 	# [0*II+16]  id:827
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a8,a11,q1,q5,q0,q4 	# [0*II+18]  id:828
+    ee.vmulas.u8.accx.ld.ip	q4,a2,16,q5,q6 	# [0*II+19]  id:829
+    ee.ld.128.usar.xp	q1,a8,a10   	# [0*II+20]  id:830
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a8,16,q4,q0,q1,q2 	# [0*II+22]  id:831
+    ee.vmulas.u8.accx.ld.ip	q0,a2,16,q0,q6 	# [0*II+23]  id:832
+
+    // a2 = 0 is the return value; low 32 bits of ACCX plus 1 << (shift - 1),
+    // shifted right arithmetically by shift, low byte stored to *out_value.
+.LBB254_dspi_dotprod_off_u8_aes3:	# 0x31f
+    movi.n	a2,0                   	# [0]  
+    movi.n	a11,1                  	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    rur.accx_0	a10                	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    sra	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:854
+    retw.n                        	# [10]  
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u8_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Portable reference implementation of the 2D dot product with offset for
+// unsigned 8-bit data:
+//   out = (sum over y, x of in[y][x] * (filter[y][x] + offset) + rounding) >> shift
+//
+// in_image, filter - image descriptors; data, step_x, step_y, stride_x and
+//                    stride_y are read from each of them
+// out_value        - receives the result, truncated to uint8_t
+// count_x, count_y - number of samples to process in each direction
+// shift            - right shift applied to the accumulated sum; when shift > 0
+//                    half of the divisor is added first (round to nearest),
+//                    when shift <= 0 the sum is stored unshifted
+// offset           - constant added to every filter sample before multiplying
+//
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the matching
+// stride for either image, ESP_OK otherwise.
+esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset)
+{
+    if (in_image->step_x * count_x > in_image->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (in_image->step_y * count_y > in_image->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_x * count_x > filter->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_y * count_y > filter->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    uint8_t *i_data =  (uint8_t *)in_image->data;
+    uint8_t *f_data =  (uint8_t *)filter->data;
+    // Distance between two processed rows, in samples.
+    int i_step = in_image->stride_x * in_image->step_y;
+    int f_step = filter->stride_x * filter->step_y;
+
+    int32_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // A shift by (shift - 1) is only defined for shift >= 1, so rounding and
+    // shifting are skipped for shift <= 0 instead of invoking undefined behaviour.
+    if (shift > 0) {
+        acc += (int32_t)1 << (shift - 1);    // round operation
+        acc >>= shift;
+    }
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u8_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_off_u8_arp4.S
@@ -0,0 +1,102 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_off_u8_arp4
+    .global dspi_dotprod_off_u8_ansi
+    .type   dspi_dotprod_off_u8_arp4,@function
+
+// esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
+//
+// Vectorised path is used only when (in_image->step_x | filter->step_x) == 1
+// and count_x is a multiple of 16; every other case tail-jumps to
+// dspi_dotprod_off_u8_ansi with the argument registers untouched.
+// Note: the vectorised path below does not repeat the step/stride range checks
+// that the C reference performs.
+dspi_dotprod_off_u8_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// offset       - a6
+
+// i_data       - t0
+// f_data       - t1
+// i_step       - t2
+// f_step       - t3
+// t4           - current i_data
+// t5           - current f_data
+// t6           - inner loop count (count_x / 16)
+
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // 0 only when (in_image->step_x | filter->step_x) == 1
+    andi t2, a3, 15 // count_x % 16
+    or   t1, t1, t2 // 0 only when, in addition, count_x is a multiple of 16
+    
+    beqz    t1, .dspi_dotprod_off_u8_arp4_body
+    j   dspi_dotprod_off_u8_ansi // unsupported layout: let the C reference handle the call
+
+.dspi_dotprod_off_u8_arp4_body:
+    add sp, sp, -16     // 16 bytes of scratch stack, released before ret
+
+    sw  a6, 0(sp)       // spill offset so it can be loaded into a vector register
+    mv  t6, sp
+    esp.vldbc.8.ip    q2, t6, 0 // q2 = 8-bit offset read from the stack (presumably broadcast to every lane - confirm against the ISA manual)
+
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2  // i_step in bytes (1 byte per sample)
+
+    lw  t3, 8(a1)   // step_y
+    lw  t5, 12(a1)  // stride_x
+    mul t3, t5, t3  // f_step in bytes (1 byte per sample)
+
+    srli t6, a3, 4  // t6 = count_x/16 (128-bit vectors per row)
+    
+
+    // Rounding term: t4 = 1 << (shift - 1), used to seed the accumulator.
+    // NOTE(review): for shift == 0 the shift amount is -1; RV32 sll uses only the
+    // low 5 bits, so this would seed 1 << 31 - confirm shift >= 1 is required.
+    addi    a7, a5, -1
+    li      t4, 1
+    sll     t4, t4, a7
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4 // presumably writes t4 into the low word of XACC
+
+.loop_count_y:
+        mv      t4, t0      // t4 = start of current input row
+        mv      t5, t1      // t5 = start of current filter row
+        esp.vld.128.ip      q1, t5, 16          // q1 - f_data (first vector of the row)
+
+        esp.lp.setup    0, t6, .loop_count_x    // loop through .loop_count_x, t6 times (presumably hardware loop 0)
+            esp.vld.128.ip          q0, t4, 16  // q0 - i_data
+            esp.vadd.u8             q3, q2, q1  // q3 = offset + f_data
+.loop_count_x:  esp.vmulas.u8.xacc.ld.ip        q1, t5, 16, q0, q3  // multiply-accumulate q0 and q3 into XACC; q1 - next f_data
+
+        add     t0, t0, t2  // next input row
+        add     t1, t1, t3  // next filter row
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.u.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sb  t5, 0(a2)               // out_value is uint8_t*: store one byte only (a halfword store would overwrite the byte after it)
+
+    li  a0,0                    // return 0 (the C reference returns ESP_OK on success)
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s16_aes3.S
@@ -0,0 +1,372 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_53, 458755
+
+    # Program Unit: dspi_dotprod_s16_aes3
+    .type	dspi_dotprod_s16_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_s16_aes3
+// dspi_dotprod_s16_aes3 - 2-D dot product of two int16 images (Xtensa windowed ABI).
+//
+// The six incoming arguments a2..a7 are forwarded unchanged (as a10..a15) to
+// dspi_dotprod_s16_ansi on every fallback path, so they follow that prototype:
+//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
+// Descriptor offsets used below: 0 = data, 4 = step_x, 8 = step_y, 12 = stride_x,
+// 16 = presumably stride_y (compared against step_y * count_y) - confirm against image2d_t.
+// Returns in a2: 0 on the vector path, the literal .LC0_1_53 (458755) when a range
+// check fails, or whatever dspi_dotprod_s16_ansi returned on a fallback path.
+// NOTE(review): 458755 is presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - confirm in dsp_err_codes.h.
+// Stack spill slots: sp+16 = row step in bytes, sp+20 = filter data pointer, sp+24 = filter step_y.
+dspi_dotprod_s16_aes3:	# 0x4
+.LBB1_dspi_dotprod_s16_aes3:	# 0x4
+    entry	a1,64                   	#  
+    // Range check 1: in_image step_x * count_x must not exceed in_image stride_x.
+    l32i.n	a10,a2,4               	# [0]  id:678
+    l32i.n	a11,a2,12              	# [1]  id:677
+    mull	a8,a10,a5                	# [2]  
+    blt	a11,a8,.LBB81_dspi_dotprod_s16_aes3 	# [4]  
+
+    // Range check 2: in_image step_y * count_y against the word at offset 16.
+    l32i.n	a12,a2,8               	# [0]  id:679
+    l32i.n	a9,a2,16               	# [1]  id:680
+    mull	a13,a12,a6               	# [2]  
+    blt	a9,a13,.LBB81_dspi_dotprod_s16_aes3 	# [4]  
+
+    // Range check 3: filter step_x * count_x against filter stride_x (product stays in a13).
+    l32i.n	a15,a3,4               	# [0]  id:682
+    l32i.n	a14,a3,12              	# [1]  id:681
+    mull	a13,a15,a5               	# [2]  
+    blt	a14,a13,.LBB81_dspi_dotprod_s16_aes3 	# [4]  
+
+    // Range check 4: filter step_y * count_y against the word at offset 16; step_y is spilled to sp+24.
+    l32i.n	a8,a3,16               	# [0]  id:684
+    l32i.n	a9,a3,8                	# [1]  id:683
+    s32i.n	a9,a1,24               	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  
+    blt	a8,a9,.LBB81_dspi_dotprod_s16_aes3 	# [5]  
+
+    // Vector-path preconditions on the filter: data pointer has bit 0 clear,
+    // stride_x == step_x * count_x, step_x == 1 and step_y == 1. Anything else falls back to C.
+    l32i.n	a8,a3,0                	# [0]  id:685
+    s32i.n	a8,a1,20               	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_34050         	# [2]  
+
+    bne	a14,a13,.Lt_0_34050       	# [0]  
+
+    bnei	a15,1,.Lt_0_34050        	# [0]  
+
+    l32i.n	a9,a1,24               	# [0]  gra_spill_temp_2
+    beqi	a9,1,.Lt_0_18178         	# [2]  
+
+    // Fallback: hand all six arguments to the C reference implementation and return its result.
+.Lt_0_34050:	# 0x43
+.Lt_0_18434:	# 0x43
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    .type	dspi_dotprod_s16_ansi, @function
+    call8	dspi_dotprod_s16_ansi   	# [6]  dspi_dotprod_s16_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // Range-check failure: return the error literal.
+.LBB81_dspi_dotprod_s16_aes3:	# 0x56
+    l32r	a2,.LC0_1_53             	# [0]  
+    retw.n                        	# [1]  
+
+    // Vector-path preconditions on the input image and sizes: in_image step_x == 1,
+    // in_image step_y == 1, count_x a multiple of 8, count_y >= 4.
+.Lt_0_18178:	# 0x5b
+    addi.n	a13,a10,-1             	# [0]  
+    bnez	a13,.Lt_0_34818          	# [1]  
+
+    addi.n	a14,a12,-1             	# [0]  
+    bnez	a14,.Lt_0_34818          	# [1]  
+
+    extui	a15,a5,0,3              	# [0]  
+    bnez.n	a15,.Lt_0_34818        	# [1]  
+
+    blti	a6,4,.Lt_0_34818         	# [0]  
+
+    // For count_x > 32 an odd count_x falls back to C.
+    // NOTE(review): bit 0 of count_x was already tested clear above, so this branch looks unreachable.
+    movi.n	a8,32                  	# [0]  
+    bge	a8,a5,.Lt_0_35330         	# [1]  
+
+    extui	a9,a5,0,1               	# [0]  
+    bnez	a9,.LBB28_dspi_dotprod_s16_aes3 	# [1]  
+
+    // Vector-path setup:
+    //   a3  = count_y (the filter pointer is no longer needed in a3)
+    //   a2  = in_image data pointer, a15 = filter data pointer (reloaded from sp+20)
+    //   a13 = in_image stride_x * step_y * 2 = row step in bytes, also spilled to sp+16
+    //   SAR_BYTE and both ACCX halves are cleared, q0 is preloaded with the first 16 filter bytes.
+.Lt_0_35330:	# 0x78
+.Lt_0_20226:	# 0x78
+    mov.n	a3,a6                   	# [0]  
+    addi	a10,a5,-24               	# [1]  
+    mull	a13,a11,a12              	# [2]  
+    l32i.n	a15,a1,20              	# [3]  gra_spill_temp_1
+    l32i.n	a2,a2,0                	# [4]  id:686
+    movi.n	a14,0                  	# [5]  
+    wur.sar_byte	a14              	# [6]  
+    wur.accx_0	a14                	# [8]  
+    wur.accx_1	a14                	# [9]  
+    ee.vld.128.ip	q0,a15,16       	# [10]  id:690
+    slli	a13,a13,1                	# [11]  
+    s32i.n	a13,a1,16              	# [12]  gra_spill_temp_0
+    // Dispatch on count_x: 24, 16, 8, 32 and 64 each have a dedicated loop; larger values use the generic loop.
+    beqz	a10,.LBB32_dspi_dotprod_s16_aes3 	# [13]  
+
+.Lt_0_23298:	# 0x99
+.Lt_0_22786:	# 0x99
+    addi	a8,a5,-16                	# [0]  
+    beqz	a8,.LBB38_dspi_dotprod_s16_aes3 	# [1]  
+
+.Lt_0_24834:	# 0x9f
+.Lt_0_24322:	# 0x9f
+    addi	a9,a5,-8                 	# [0]  
+    beqz	a9,.LBB44_dspi_dotprod_s16_aes3 	# [1]  
+
+.Lt_0_26370:	# 0xa5
+.Lt_0_25858:	# 0xa5
+    addi	a10,a5,-32               	# [0]  
+    beqz	a10,.LBB50_dspi_dotprod_s16_aes3 	# [1]  
+
+.Lt_0_27906:	# 0xab
+.Lt_0_27394:	# 0xab
+    addi	a11,a5,-64               	# [0]  
+    beqz	a11,.LBB56_dspi_dotprod_s16_aes3 	# [1]  
+
+    // count_x <= 64 here means one of the dedicated loops already ran (or nothing matched): go store the result.
+    movi.n	a12,64                 	# [0]  
+    bge	a12,a5,.Lt_0_30722        	# [1]  
+
+    // Generic path for count_x > 64. a12 counts processed rows up to a3 (count_y).
+    movi.n	a12,0                  	# [0]  
+    ee.ld.128.usar.ip	q1,a2,16    	# [1]  id:762
+    ee.ld.128.usar.ip	q2,a2,16    	# [2]  id:763
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [4]  id:764
+    beqz.n	a3,.Lt_0_30722         	# [5]  
+
+    // a13 = count_x / 32 - 1 (inner hardware-loop trip count),
+    // a14 = row step in bytes - 2 * count_x + 16 (pointer adjustment used at the end of a row).
+    slli	a8,a5,1                  	# [0]  
+    l32i.n	a14,a1,16              	# [1]  gra_spill_temp_0
+    addi	a13,a5,31                	# [2]  
+    movgez	a13,a5,a5              	# [3]  
+    srai	a13,a13,5                	# [4]  
+    sub	a14,a14,a8                	# [5]  
+    addi	a14,a14,16               	# [6]  
+    addi.n	a13,a13,-1             	# [7]  
+
+    // Per-row loop of the generic path.
+.Lt_0_31490:	# 0xd9
+    addi.n	a12,a12,1              	# [0]  
+    movi.n	a9,32                  	# [1]  
+    beqz.n	a13,.Lt_0_31746        	# [2]  
+
+    loopnez	a13,.LBB221_dspi_dotprod_s16_aes3 	# [0]  
+
+    // Zero-overhead loop body: four multiply-accumulate steps, each paired with a 16-byte filter load.
+.LBB219_dspi_dotprod_s16_aes3:	# 0xe2
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:766
+    ee.vmulas.s16.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:765
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:768
+    ee.vmulas.s16.accx.ld.ip.qup	q1,a2,16,q5,q2,q3,q4 	# [0*II+3]  id:767
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:770
+    ee.vmulas.s16.accx.ld.ip.qup	q2,a2,16,q0,q3,q4,q1 	# [0*II+5]  id:769
+    ee.vld.128.ip	q0,a15,16       	# [0*II+6]  id:772
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+7]  id:771
+
+.LBB221_dspi_dotprod_s16_aes3:	# 0xfe
+
+    // Row tail: four more multiply-accumulate steps; the .xp forms move a2 by a14 / a10 (-16) / a9 (32).
+.Lt_0_31746:	# 0xfe
+    ee.vmulas.s16.accx.ld.ip.qup	q5,a2,16,q0,q1,q2,q3 	# [0]  id:773
+    movi.n	a10,-16                	# [1]  
+    ee.vld.128.ip	q0,a15,16       	# [2]  id:774
+    ee.vld.128.ip	q6,a15,16       	# [3]  id:776
+    ee.vmulas.s16.accx.ld.xp.qup	q7,a2,a14,q0,q2,q3,q5 	# [4]  id:775
+    ee.vld.128.ip	q4,a15,16       	# [5]  id:779
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a10,q6,q3,q5,q7 	# [6]  id:777
+    ee.ld.128.usar.xp	q1,a2,a9    	# [7]  id:778
+    ee.vld.128.ip	q0,a15,16       	# [8]  id:781
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q4,q5,q1,q2 	# [9]  id:780
+    bne	a12,a3,.Lt_0_31490        	# [10]  
+
+    // Result store. a9 = ACCX low word, a10 = ACCX high word.
+    // For shift >= 1: take the accumulator shifted right by (shift - 1) (funnel shift of a10:a9,
+    // or the arithmetic shift of a10 alone when shift >= 33), add 1, shift right once more,
+    // store the low 16 bits to out_value and return 0. shift < 1 is handled at .Lt_0_33282.
+.Lt_0_30722:	# 0x122
+.Lt_0_30466:	# 0x122
+    rur.accx_0	a9                 	# [0]  
+    rur.accx_1	a10                	# [1]  
+    blti	a7,1,.Lt_0_33282         	# [2]  
+
+    movi.n	a2,0                   	# [0]  
+    addi	a13,a7,-33               	# [1]  
+    addi.n	a14,a7,-1              	# [2]  
+    ssr	a14                       	# [3]  
+    sra	a12,a10                   	# [4]  
+    src	a11,a10,a9                	# [5]  
+    movgez	a11,a12,a13            	# [6]  
+    addi.n	a11,a11,1              	# [7]  
+    srai	a11,a11,1                	# [8]  
+    s16i	a11,a4,0                 	# [9]  id:787
+    retw.n                        	# [10]  
+
+    // Fallback to the C implementation (input image / size preconditions not met).
+.Lt_0_34818:	# 0x148
+.Lt_0_19458:	# 0x148
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_s16_ansi   	# [6]  dspi_dotprod_s16_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // count_x == 24: hardware loop runs a6 (count_y) times, three multiply-accumulate steps per pass.
+    // a12 = row step in bytes - 32; control then returns to the dispatch chain.
+.LBB32_dspi_dotprod_s16_aes3:	# 0x15b
+    ee.ld.128.usar.ip	q1,a2,16    	# [0]  id:691
+    ee.ld.128.usar.ip	q2,a2,16    	# [1]  id:692
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [3]  id:693
+    beqz.n	a6,.Lt_0_23298         	# [4]  
+
+    addi	a12,a13,-32              	# [0]  
+    movi.n	a10,32                 	# [1]  
+    movi.n	a11,-16                	# [2]  
+    loopgtz	a6,.LBB107_dspi_dotprod_s16_aes3 	# [3]  
+
+.LBB105_dspi_dotprod_s16_aes3:	# 0x170
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:695
+    ee.vmulas.s16.accx.ld.xp.qup	q1,a2,a12,q0,q1,q2,q3 	# [0*II+1]  id:694
+    ee.vld.128.ip	q5,a15,16       	# [0*II+2]  id:697
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q4,q2,q3,q1 	# [0*II+3]  id:696
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+4]  id:698
+    ee.vld.128.ip	q0,a15,16       	# [0*II+5]  id:700
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q5,q3,q1,q2 	# [0*II+6]  id:699
+
+.LBB107_dspi_dotprod_s16_aes3:	# 0x188
+    j	.Lt_0_23298                 	# [0]  
+
+    // count_x == 16: hardware loop runs count_y >> 1 times, four multiply-accumulate steps per pass.
+    // NOTE(review): with an odd count_y the last row appears not to be accumulated - confirm.
+.LBB38_dspi_dotprod_s16_aes3:	# 0x18b
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    srli	a3,a6,1                  	# [2]  
+    l32i.n	a12,a1,16              	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:701
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:702
+    addi	a12,a12,-16              	# [7]  
+    ee.src.q.ld.xp	q3,a2,a12,q1,q2 	# [8]  id:703
+    loopnez	a3,.LBB130_dspi_dotprod_s16_aes3 	# [9]  
+
+.LBB128_dspi_dotprod_s16_aes3:	# 0x1a3
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:705
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a11,q0,q1,q2,q3 	# [0*II+1]  id:704
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+2]  id:706
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:708
+    ee.vmulas.s16.accx.ld.xp.qup	q4,a2,a12,q4,q2,q1,q3 	# [0*II+4]  id:707
+    ee.vld.128.ip	q5,a15,16       	# [0*II+5]  id:710
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q0,q1,q3,q4 	# [0*II+6]  id:709
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+7]  id:711
+    ee.vld.128.ip	q0,a15,16       	# [0*II+8]  id:713
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a2,a12,q5,q3,q1,q2 	# [0*II+9]  id:712
+
+.LBB130_dspi_dotprod_s16_aes3:	# 0x1c5
+    j	.Lt_0_24834                 	# [0]  
+
+    // count_x == 8: hardware loop runs count_y >> 2 times, four multiply-accumulate steps per pass.
+    // The input pointer is walked in a8 and copied back to a2 afterwards.
+    // NOTE(review): rows beyond a multiple of 4 appear not to be accumulated - confirm.
+.LBB44_dspi_dotprod_s16_aes3:	# 0x1c8
+    srli	a3,a3,2                  	# [0]  
+    movi.n	a10,-16                	# [1]  
+    l32i.n	a11,a1,16              	# [2]  gra_spill_temp_0
+    addi	a8,a2,16                 	# [3]  
+    addi	a11,a11,16               	# [4]  
+    ee.ld.128.usar.xp	q2,a8,a10   	# [5]  id:714
+    ee.ld.128.usar.xp	q1,a8,a11   	# [6]  id:715
+    ee.src.q.ld.xp	q3,a8,a10,q1,q2 	# [8]  id:716
+    ee.ld.128.usar.xp	q2,a8,a11   	# [9]  id:717
+    loopnez	a3,.LBB153_dspi_dotprod_s16_aes3 	# [10]  
+
+.LBB151_dspi_dotprod_s16_aes3:	# 0x1e4
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:719
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a8,a10,q0,q1,q2,q3 	# [0*II+1]  id:718
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+2]  id:720
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:722
+    ee.vmulas.s16.accx.ld.xp.qup	q4,a8,a10,q4,q2,q1,q3 	# [0*II+4]  id:721
+    ee.ld.128.usar.xp	q3,a8,a11   	# [0*II+5]  id:723
+    ee.vld.128.ip	q5,a15,16       	# [0*II+6]  id:725
+    ee.vmulas.s16.accx.ld.xp.qup	q4,a8,a10,q0,q1,q3,q4 	# [0*II+7]  id:724
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+8]  id:726
+    ee.vld.128.ip	q0,a15,16       	# [0*II+9]  id:728
+    ee.vmulas.s16.accx.ld.xp.qup	q3,a8,a10,q5,q3,q1,q4 	# [0*II+10]  id:727
+    ee.ld.128.usar.xp	q2,a8,a11   	# [0*II+11]  id:729
+
+.LBB153_dspi_dotprod_s16_aes3:	# 0x20c
+    mov.n	a2,a8                   	# [0]  
+    j	.Lt_0_26370                 	# [1]  
+
+    // count_x == 32: hardware loop runs a3 (count_y) times, four multiply-accumulate steps per pass.
+    // a12 = row step in bytes - 2 * count_x + 16.
+.LBB50_dspi_dotprod_s16_aes3:	# 0x211
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    slli	a13,a5,1                 	# [2]  
+    l32i.n	a12,a1,16              	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:730
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:731
+    sub	a12,a12,a13               	# [6]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [8]  id:732
+    addi	a12,a12,16               	# [9]  
+    loopnez	a3,.LBB176_dspi_dotprod_s16_aes3 	# [10]  
+
+.LBB174_dspi_dotprod_s16_aes3:	# 0x22c
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:734
+    ee.vmulas.s16.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:733
+    ee.vld.128.ip	q1,a15,16       	# [0*II+2]  id:736
+    ee.vmulas.s16.accx.ld.xp.qup	q0,a2,a12,q5,q2,q3,q4 	# [0*II+3]  id:735
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:739
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q1,q3,q4,q0 	# [0*II+5]  id:737
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+6]  id:738
+    ee.vld.128.ip	q0,a15,16       	# [0*II+7]  id:741
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+8]  id:740
+
+.LBB176_dspi_dotprod_s16_aes3:	# 0x24b
+    j	.Lt_0_27906                 	# [0]  
+
+    // count_x == 64: hardware loop runs a3 (count_y) times, eight multiply-accumulate steps per pass,
+    // then jumps straight to the result store.
+.LBB56_dspi_dotprod_s16_aes3:	# 0x24e
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    slli	a13,a5,1                 	# [2]  
+    l32i.n	a12,a1,16              	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:742
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:743
+    sub	a12,a12,a13               	# [7]  
+    addi	a12,a12,16               	# [8]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [9]  id:744
+    loopnez	a3,.LBB198_dspi_dotprod_s16_aes3 	# [10]  
+
+.LBB196_dspi_dotprod_s16_aes3:	# 0x269
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:746
+    ee.vmulas.s16.accx.ld.ip.qup	q1,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:745
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:748
+    ee.vmulas.s16.accx.ld.ip.qup	q4,a2,16,q4,q2,q3,q1 	# [0*II+3]  id:747
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:750
+    ee.vmulas.s16.accx.ld.ip.qup	q0,a2,16,q0,q3,q1,q4 	# [0*II+5]  id:749
+    ee.vld.128.ip	q6,a15,16       	# [0*II+6]  id:752
+    ee.vmulas.s16.accx.ld.ip.qup	q1,a2,16,q5,q1,q4,q0 	# [0*II+7]  id:751
+    ee.vld.128.ip	q5,a15,16       	# [0*II+8]  id:754
+    ee.vmulas.s16.accx.ld.ip.qup	q4,a2,16,q6,q4,q0,q1 	# [0*II+9]  id:753
+    ee.vld.128.ip	q6,a15,16       	# [0*II+10]  id:756
+    ee.vmulas.s16.accx.ld.xp.qup	q0,a2,a12,q5,q0,q1,q4 	# [0*II+11]  id:755
+    ee.vld.128.ip	q5,a15,16       	# [0*II+12]  id:759
+    ee.vmulas.s16.accx.ld.xp.qup	q2,a2,a11,q6,q1,q4,q0 	# [0*II+13]  id:757
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+14]  id:758
+    ee.vld.128.ip	q0,a15,16       	# [0*II+15]  id:761
+    ee.vmulas.s16.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+16]  id:760
+
+.LBB198_dspi_dotprod_s16_aes3:	# 0x2a4
+    j	.Lt_0_30722                 	# [0]  
+
+    // shift < 1: no rounding or scaling; store the low 16 bits of ACCX (sign-extended from bit 15) and return 0.
+.Lt_0_33282:	# 0x2a7
+    movi.n	a2,0                   	# [0]  
+    sext	a14,a9,15                	# [1]  
+    s16i	a14,a4,0                 	# [2]  id:788
+    retw.n                        	# [3]  
+
+    // Fallback to the C implementation (count_x > 32 and odd).
+.LBB28_dspi_dotprod_s16_aes3:	# 0x2b1
+    mov.n	a15,a7                  	# [0]  
+    mov.n	a14,a6                  	# [1]  
+    mov.n	a13,a5                  	# [2]  
+    mov.n	a12,a4                  	# [3]  
+    mov.n	a11,a3                  	# [4]  
+    mov.n	a10,a2                  	# [5]  
+    call8	dspi_dotprod_s16_ansi   	# [6]  dspi_dotprod_s16_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s16_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+/**
+ * @brief Reference C implementation of a 2-D dot product over int16_t images.
+ *
+ * Multiplies count_x * count_y elements of in_image with the corresponding
+ * elements of filter, accumulates the products in 64 bits, then rounds and
+ * scales the sum down by 2^shift before storing it (truncated to int16_t).
+ *
+ * @param in_image  input image descriptor; data is read as int16_t
+ * @param filter    filter descriptor; data is read as int16_t
+ * @param out_value receives the scaled result
+ * @param count_x   number of elements processed per row
+ * @param count_y   number of rows processed
+ * @param shift     right shift applied to the accumulated sum. Values <= 0
+ *                  store the sum unscaled (no rounding, no shift); values
+ *                  >= 64 are not supported (shift wider than the accumulator).
+ *
+ * @return ESP_OK on success, ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count
+ *         exceeds the matching stride of either image.
+ */
+esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift)
+{
+    if (in_image->step_x * count_x > in_image->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (in_image->step_y * count_y > in_image->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_x * count_x > filter->stride_x) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if (filter->step_y * count_y > filter->stride_y) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    int16_t *i_data =  (int16_t *)in_image->data;
+    int16_t *f_data =  (int16_t *)filter->data;
+    // Distance between two processed rows, in elements.
+    int i_step = in_image->stride_x * in_image->step_y;
+    int f_step = filter->stride_x * filter->step_y;
+    // Element spacing inside a row is loop-invariant, so read it once.
+    const int i_step_x = in_image->step_x;
+    const int f_step_x = filter->step_x;
+
+    int64_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int32_t)i_data[i_step_x * x] * (int32_t)f_data[f_step_x * x];
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // Round to nearest, then scale. The guard avoids the undefined
+    // "1 << -1" for shift == 0 (and negative shift counts in general); the
+    // rounding constant is built in 64 bits so that shift values of 32 and
+    // above do not overflow a plain int.
+    if (shift > 0) {
+        acc += (int64_t)1 << (shift - 1);
+        acc >>= shift;
+    }
+    *out_value = (int16_t)acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s16_arp4.S
@@ -0,0 +1,95 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_s16_arp4
+    .global dspi_dotprod_s16_ansi
+    .type   dspi_dotprod_s16_arp4,@function
+
+// esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
+//
+// Vector implementation of the int16 2-D dot product. The vector body is used
+// only when the OR of both step_x values is exactly 1 and count_x is a
+// multiple of 8; every other case tail-jumps to dspi_dotprod_s16_ansi with the
+// argument registers untouched.
+// NOTE(review): the vector body performs none of the step * count <= stride
+// range checks that dspi_dotprod_s16_ansi does - confirm callers validate first.
+// NOTE(review): the 128-bit loads presumably need 16-byte aligned data pointers
+// and row steps - confirm against the ESP32-P4 instruction documentation.
+dspi_dotprod_s16_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+
+// i_data       - t0   (start of the current input row)
+// f_data       - t1   (start of the current filter row)
+// i_step       - t2   (input row step, bytes)
+// f_step       - t3   (filter row step, bytes)
+// t4           - current i_data
+// t5           - current f_data
+// t6           - count_x / 8 (hardware-loop trip count)
+
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // zero only when (in step_x | filter step_x) == 1
+    andi t2, a3, 7  // non-zero when count_x is not a multiple of 8
+    or   t1, t1, t2
+    
+    beqz    t1, .dspi_dotprod_s16_arp4_body
+    j   dspi_dotprod_s16_ansi // tail call: a0..a5 still hold the caller's arguments
+
+.dspi_dotprod_s16_arp4_body:
+    add sp, sp, -16 // 16-byte frame; nothing is stored in it by this routine
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2
+    slli t2, t2, 1  // i_step = i_step<<1 (elements -> bytes, 2 bytes per int16)
+
+    lw  t3, 8(a1)   // step_y
+    lw  t5, 12(a1)  // stride_x
+    mul t3, t5, t3
+    slli t3, t3, 1  // f_step = f_step<<1 (elements -> bytes, 2 bytes per int16)
+
+    srli t6, a3, 3  // t6 = count_x/8 = number of 8-element vectors per row
+    
+    // Pre-load the accumulator with the rounding term 1 << (shift - 1).
+    // NOTE(review): for shift == 0 the shift amount is -1; sll uses only the low
+    // bits of a6, so the pre-load is then not a rounding term - confirm intent.
+    addi    a6, a5, -1
+    li      t4, 1
+    sll     t4, t4, a6
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4
+
+// One pass per row. The count is tested after the body, so the body runs at
+// least once even when count_y <= 0.
+.loop_count_y:
+        mv      t4, t0
+        mv      t5, t1
+        esp.vld.128.ip      q0, t4, 16  // q0 - i_data
+
+        // Hardware loop, t6 iterations; .loop_count_x marks its last instruction.
+        // NOTE(review): each multiply-accumulate also loads the next input vector,
+        // so one 16-byte vector past the last used input element is read per row.
+        esp.lp.setup    0, t6, .loop_count_x
+            esp.vld.128.ip  q1, t5, 16      // q1 - f_data
+.loop_count_x:  esp.vmulas.s16.xacc.ld.ip  q0, t4, 16, q0, q1 // q0 - i_data
+
+        add     t0, t0, t2  // next input row
+        add     t1, t1, t3  // next filter row
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.s.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sh  t5, 0(a2)               // store the low 16 bits of the result to *out_value
+
+    li  a0,0                    // return 0
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s8_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s8_aes3.S
@@ -0,0 +1,370 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_52, 458755
+
+    # Program Unit: dspi_dotprod_s8_aes3
+    .type	dspi_dotprod_s8_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_s8_aes3
+// dspi_dotprod_s8_aes3 - 2-D dot product of two int8 images (Xtensa windowed ABI).
+//
+// The six incoming arguments a2..a7 are forwarded unchanged (as a10..a15) to
+// dspi_dotprod_s8_ansi on every fallback path, so they follow that prototype:
+//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
+// Descriptor offsets used below: 0 = data, 4 = step_x, 8 = step_y, 12 = stride_x,
+// 16 = presumably stride_y (compared against step_y * count_y) - confirm against image2d_t.
+// Returns in a2: 0 on the vector path, the literal .LC0_1_52 (458755) when a range
+// check fails, or whatever dspi_dotprod_s8_ansi returned on a fallback path.
+// NOTE(review): 458755 is presumably ESP_ERR_DSP_PARAM_OUTOFRANGE - confirm in dsp_err_codes.h.
+// Stack spill slots: sp+0 = row step in bytes, sp+4 = filter data pointer, sp+8 = filter step_y.
+dspi_dotprod_s8_aes3:	# 0x4
+.LBB1_dspi_dotprod_s8_aes3:	# 0x4
+    entry	a1,48                   	#  
+    // Range check 1: in_image step_x * count_x must not exceed in_image stride_x.
+    l32i.n	a10,a2,4               	# [0]  id:668
+    l32i.n	a11,a2,12              	# [1]  id:667
+    mull	a8,a10,a5                	# [2]  
+    blt	a11,a8,.LBB78_dspi_dotprod_s8_aes3 	# [4]  
+
+    // Range check 2: in_image step_y * count_y against the word at offset 16.
+    l32i.n	a12,a2,8               	# [0]  id:669
+    l32i.n	a9,a2,16               	# [1]  id:670
+    mull	a13,a12,a6               	# [2]  
+    blt	a9,a13,.LBB78_dspi_dotprod_s8_aes3 	# [4]  
+
+    // Range check 3: filter step_x * count_x against filter stride_x (product stays in a13).
+    l32i.n	a15,a3,4               	# [0]  id:672
+    l32i.n	a14,a3,12              	# [1]  id:671
+    mull	a13,a15,a5               	# [2]  
+    blt	a14,a13,.LBB78_dspi_dotprod_s8_aes3 	# [4]  
+
+    // Range check 4: filter step_y * count_y against the word at offset 16; step_y is spilled to sp+8.
+    l32i.n	a8,a3,16               	# [0]  id:674
+    l32i.n	a9,a3,8                	# [1]  id:673
+    s32i.n	a9,a1,8                	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  
+    blt	a8,a9,.LBB78_dspi_dotprod_s8_aes3 	# [5]  
+
+    // Vector-path preconditions on the filter: data pointer has bit 0 clear,
+    // stride_x == step_x * count_x, step_x == 1 and step_y == 1. Anything else falls back to C.
+    l32i.n	a8,a3,0                	# [0]  id:675
+    s32i.n	a8,a1,4                	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_33026         	# [2]  
+
+    bne	a14,a13,.Lt_0_33026       	# [0]  
+
+    bnei	a15,1,.Lt_0_33026        	# [0]  
+
+    l32i.n	a13,a1,8               	# [0]  gra_spill_temp_2
+    beqi	a13,1,.Lt_0_17666        	# [2]  
+
+    // Fallback: hand all six arguments to the C reference implementation and return its result.
+.Lt_0_33026:	# 0x43
+.Lt_0_17922:	# 0x43
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    .type	dspi_dotprod_s8_ansi, @function
+    call8	dspi_dotprod_s8_ansi    	# [6]  dspi_dotprod_s8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // Range-check failure: return the error literal.
+.LBB78_dspi_dotprod_s8_aes3:	# 0x56
+    l32r	a2,.LC0_1_52             	# [0]  
+    retw.n                        	# [1]  
+
+    // Vector-path preconditions on the input image and sizes: in_image step_x == 1,
+    // in_image step_y == 1, count_x a multiple of 16, count_y >= 4.
+.Lt_0_17666:	# 0x5b
+    addi.n	a14,a10,-1             	# [0]  
+    bnez	a14,.Lt_0_33794          	# [1]  
+
+    addi.n	a15,a12,-1             	# [0]  
+    bnez	a15,.Lt_0_33794          	# [1]  
+
+    extui	a8,a5,0,4               	# [0]  
+    bnez.n	a8,.Lt_0_33794         	# [1]  
+
+    blti	a6,4,.Lt_0_33794         	# [0]  
+
+    // For count_x > 64 an odd count_x falls back to C.
+    // NOTE(review): bit 0 of count_x was already tested clear above, so this branch looks unreachable.
+    movi.n	a9,64                  	# [0]  
+    bge	a9,a5,.Lt_0_34306         	# [1]  
+
+    extui	a10,a5,0,1              	# [0]  
+    bnez	a10,.LBB28_dspi_dotprod_s8_aes3 	# [1]  
+
+    // Vector-path setup:
+    //   a3  = count_y (the filter pointer is no longer needed in a3)
+    //   a2  = in_image data pointer, a15 = filter data pointer (reloaded from sp+4)
+    //   sp+0 = in_image stride_x * step_y = row step in bytes
+    //   both ACCX halves are cleared, q0 is preloaded with the first 16 filter bytes.
+.Lt_0_34306:	# 0x78
+.Lt_0_19714:	# 0x78
+    mov.n	a3,a6                   	# [0]  
+    addi	a13,a5,-48               	# [1]  
+    movi.n	a14,0                  	# [2]  
+    mull	a15,a11,a12              	# [3]  
+    l32i.n	a2,a2,0                	# [4]  id:676
+    s32i.n	a15,a1,0               	# [6]  gra_spill_temp_0
+    wur.accx_0	a14                	# [7]  
+    l32i.n	a15,a1,4               	# [8]  gra_spill_temp_1
+    wur.accx_1	a14                	# [9]  
+    ee.vld.128.ip	q0,a15,16       	# [10]  id:679
+    // Dispatch on count_x: 48, 32, 16, 64 and 128 each have a dedicated loop; larger values use the generic loop.
+    beqz	a13,.LBB32_dspi_dotprod_s8_aes3 	# [11]  
+
+.Lt_0_22786:	# 0x93
+.Lt_0_22274:	# 0x93
+    addi	a8,a5,-32                	# [0]  
+    beqz	a8,.LBB38_dspi_dotprod_s8_aes3 	# [1]  
+
+.Lt_0_24322:	# 0x99
+.Lt_0_23810:	# 0x99
+    addi	a9,a5,-16                	# [0]  
+    beqz	a9,.LBB44_dspi_dotprod_s8_aes3 	# [1]  
+
+.Lt_0_25858:	# 0x9f
+.Lt_0_25346:	# 0x9f
+    addi	a10,a5,-64               	# [0]  
+    beqz	a10,.LBB50_dspi_dotprod_s8_aes3 	# [1]  
+
+.Lt_0_27394:	# 0xa5
+.Lt_0_26882:	# 0xa5
+    addi	a11,a5,-128              	# [0]  
+    beqz	a11,.LBB56_dspi_dotprod_s8_aes3 	# [1]  
+
+    // count_x <= 128 here means one of the dedicated loops already ran (or nothing matched): go store the result.
+    movi	a12,128                  	# [0]  
+    bge	a12,a5,.Lt_0_30210        	# [1]  
+
+    // Generic path for count_x > 128. a12 counts processed rows up to a3 (count_y).
+    movi.n	a12,0                  	# [0]  
+    ee.ld.128.usar.ip	q1,a2,16    	# [1]  id:751
+    ee.ld.128.usar.ip	q2,a2,16    	# [2]  id:752
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [4]  id:753
+    beqz.n	a3,.Lt_0_30210         	# [5]  
+
+    // a13 = count_x / 64 - 1 (inner hardware-loop trip count),
+    // a14 = row step in bytes - count_x + 16 (pointer adjustment used at the end of a row).
+    l32i.n	a14,a1,0               	# [0]  gra_spill_temp_0
+    addi	a13,a5,63                	# [1]  
+    movgez	a13,a5,a5              	# [2]  
+    srai	a13,a13,6                	# [3]  
+    sub	a14,a14,a5                	# [4]  
+    addi	a14,a14,16               	# [5]  
+    addi.n	a13,a13,-1             	# [6]  
+
+    // Per-row loop of the generic path.
+.Lt_0_30978:	# 0xd1
+    addi.n	a12,a12,1              	# [0]  
+    movi.n	a8,32                  	# [1]  
+    movi.n	a9,-16                 	# [2]  
+    beqz.n	a13,.Lt_0_31234        	# [3]  
+
+    loopnez	a13,.LBB218_dspi_dotprod_s8_aes3 	# [0]  
+
+    // Zero-overhead loop body: four multiply-accumulate steps, each paired with a 16-byte filter load.
+.LBB216_dspi_dotprod_s8_aes3:	# 0xdc
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:755
+    ee.vmulas.s8.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:754
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:757
+    ee.vmulas.s8.accx.ld.ip.qup	q1,a2,16,q5,q2,q3,q4 	# [0*II+3]  id:756
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:759
+    ee.vmulas.s8.accx.ld.ip.qup	q2,a2,16,q0,q3,q4,q1 	# [0*II+5]  id:758
+    ee.vld.128.ip	q0,a15,16       	# [0*II+6]  id:761
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+7]  id:760
+
+.LBB218_dspi_dotprod_s8_aes3:	# 0xf8
+
+    // Row tail: four more multiply-accumulate steps; the .xp forms move a2 by a14 / a9 (-16) / a8 (32).
+.Lt_0_31234:	# 0xf8
+    ee.vmulas.s8.accx.ld.ip.qup	q5,a2,16,q0,q1,q2,q3 	# [0]  id:762
+    ee.vld.128.ip	q0,a15,16       	# [1]  id:763
+    ee.vld.128.ip	q6,a15,16       	# [2]  id:765
+    ee.vmulas.s8.accx.ld.xp.qup	q7,a2,a14,q0,q2,q3,q5 	# [3]  id:764
+    ee.vld.128.ip	q4,a15,16       	# [4]  id:768
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a2,a9,q6,q3,q5,q7 	# [5]  id:766
+    ee.ld.128.usar.xp	q1,a2,a8    	# [6]  id:767
+    ee.vld.128.ip	q0,a15,16       	# [7]  id:770
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a2,16,q4,q5,q1,q2 	# [8]  id:769
+    bne	a12,a3,.Lt_0_30978        	# [9]  
+
+    // Result store: a10 = ACCX low word; add 1 << (shift - 1), arithmetic shift right by shift,
+    // store the low 8 bits to out_value and return 0. Only the low accumulator word is read.
+    // NOTE(review): there is no special case for shift < 1 on this path (the s16 variant has one) - confirm.
+.Lt_0_30210:	# 0x11a
+.Lt_0_29954:	# 0x11a
+    movi.n	a2,0                   	# [0]  
+    rur.accx_0	a10                	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    movi.n	a11,1                  	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    sra	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:772
+    retw.n                        	# [10]  
+
+    // Fallback to the C implementation (input image / size preconditions not met).
+.Lt_0_33794:	# 0x136
+.Lt_0_18946:	# 0x136
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_s8_ansi    	# [6]  dspi_dotprod_s8_ansi
+
+#.LBB25_dspi_dotprod_s8_aes3:	# 0x145
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+    // count_x == 48: hardware loop runs a6 (count_y) times, three multiply-accumulate steps per pass.
+    // a12 = row step in bytes - 32; control then returns to the dispatch chain.
+.LBB32_dspi_dotprod_s8_aes3:	# 0x149
+    ee.ld.128.usar.ip	q1,a2,16    	# [0]  id:680
+    ee.ld.128.usar.ip	q2,a2,16    	# [1]  id:681
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [3]  id:682
+    beqz.n	a6,.Lt_0_22786         	# [4]  
+
+    movi.n	a10,32                 	# [0]  
+    l32i.n	a12,a1,0               	# [1]  gra_spill_temp_0
+    movi.n	a11,-16                	# [2]  
+    addi	a12,a12,-32              	# [3]  
+    loopgtz	a6,.LBB104_dspi_dotprod_s8_aes3 	# [4]  
+
+.LBB102_dspi_dotprod_s8_aes3:	# 0x160
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:684
+    ee.vmulas.s8.accx.ld.xp.qup	q1,a2,a12,q0,q1,q2,q3 	# [0*II+1]  id:683
+    ee.vld.128.ip	q5,a15,16       	# [0*II+2]  id:686
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a2,a11,q4,q2,q3,q1 	# [0*II+3]  id:685
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+4]  id:687
+    ee.vld.128.ip	q0,a15,16       	# [0*II+5]  id:689
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a2,16,q5,q3,q1,q2 	# [0*II+6]  id:688
+
+.LBB104_dspi_dotprod_s8_aes3:	# 0x178
+    j	.Lt_0_22786                 	# [0]  
+
+    // count_x == 32: hardware loop runs count_y >> 1 times, four multiply-accumulate steps per pass.
+    // NOTE(review): with an odd count_y the last row appears not to be accumulated - confirm.
+.LBB38_dspi_dotprod_s8_aes3:	# 0x17b
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    srli	a3,a6,1                  	# [2]  
+    l32i.n	a12,a1,0               	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:690
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:691
+    addi	a12,a12,-16              	# [7]  
+    ee.src.q.ld.xp	q3,a2,a12,q1,q2 	# [8]  id:692
+    loopnez	a3,.LBB127_dspi_dotprod_s8_aes3 	# [9]  
+
+.LBB125_dspi_dotprod_s8_aes3:	# 0x193
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:694
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a2,a11,q0,q1,q2,q3 	# [0*II+1]  id:693
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+2]  id:695
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:697
+    ee.vmulas.s8.accx.ld.xp.qup	q4,a2,a12,q4,q2,q1,q3 	# [0*II+4]  id:696
+    ee.vld.128.ip	q5,a15,16       	# [0*II+5]  id:699
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a2,a11,q0,q1,q3,q4 	# [0*II+6]  id:698
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+7]  id:700
+    ee.vld.128.ip	q0,a15,16       	# [0*II+8]  id:702
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a2,a12,q5,q3,q1,q2 	# [0*II+9]  id:701
+
+.LBB127_dspi_dotprod_s8_aes3:	# 0x1b5
+    j	.Lt_0_24322                 	# [0]  
+
+    // count_x == 16: hardware loop runs count_y >> 2 times, four multiply-accumulate steps per pass.
+    // The input pointer is walked in a8 and copied back to a2 afterwards.
+    // NOTE(review): rows beyond a multiple of 4 appear not to be accumulated - confirm.
+.LBB44_dspi_dotprod_s8_aes3:	# 0x1b8
+    srli	a3,a3,2                  	# [0]  
+    movi.n	a10,-16                	# [1]  
+    l32i.n	a11,a1,0               	# [2]  gra_spill_temp_0
+    addi	a8,a2,16                 	# [3]  
+    addi	a11,a11,16               	# [4]  
+    ee.ld.128.usar.xp	q2,a8,a10   	# [5]  id:703
+    ee.ld.128.usar.xp	q1,a8,a11   	# [6]  id:704
+    ee.src.q.ld.xp	q3,a8,a10,q1,q2 	# [8]  id:705
+    ee.ld.128.usar.xp	q2,a8,a11   	# [9]  id:706
+    loopnez	a3,.LBB150_dspi_dotprod_s8_aes3 	# [10]  
+
+.LBB148_dspi_dotprod_s8_aes3:	# 0x1d4
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:708
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a8,a10,q0,q1,q2,q3 	# [0*II+1]  id:707
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+2]  id:709
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:711
+    ee.vmulas.s8.accx.ld.xp.qup	q4,a8,a10,q4,q2,q1,q3 	# [0*II+4]  id:710
+    ee.ld.128.usar.xp	q3,a8,a11   	# [0*II+5]  id:712
+    ee.vld.128.ip	q5,a15,16       	# [0*II+6]  id:714
+    ee.vmulas.s8.accx.ld.xp.qup	q4,a8,a10,q0,q1,q3,q4 	# [0*II+7]  id:713
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+8]  id:715
+    ee.vld.128.ip	q0,a15,16       	# [0*II+9]  id:717
+    ee.vmulas.s8.accx.ld.xp.qup	q3,a8,a10,q5,q3,q1,q4 	# [0*II+10]  id:716
+    ee.ld.128.usar.xp	q2,a8,a11   	# [0*II+11]  id:718
+
+.LBB150_dspi_dotprod_s8_aes3:	# 0x1fc
+    mov.n	a2,a8                   	# [0]  
+    j	.Lt_0_25858                 	# [1]  
+
+    // count_x == 64: hardware loop runs a3 (count_y) times, four multiply-accumulate steps per pass.
+    // a12 = row step in bytes - count_x + 16.
+.LBB50_dspi_dotprod_s8_aes3:	# 0x201
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    l32i.n	a12,a1,0               	# [2]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [3]  id:719
+    ee.ld.128.usar.ip	q2,a2,16    	# [4]  id:720
+    sub	a12,a12,a5                	# [5]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [7]  id:721
+    addi	a12,a12,16               	# [8]  
+    loopnez	a3,.LBB173_dspi_dotprod_s8_aes3 	# [9]  
+
+.LBB171_dspi_dotprod_s8_aes3:	# 0x219
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:723
+    ee.vmulas.s8.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:722
+    ee.vld.128.ip	q1,a15,16       	# [0*II+2]  id:725
+    ee.vmulas.s8.accx.ld.xp.qup	q0,a2,a12,q5,q2,q3,q4 	# [0*II+3]  id:724
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:728
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a2,a11,q1,q3,q4,q0 	# [0*II+5]  id:726
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+6]  id:727
+    ee.vld.128.ip	q0,a15,16       	# [0*II+7]  id:730
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+8]  id:729
+
+.LBB173_dspi_dotprod_s8_aes3:	# 0x238
+    j	.Lt_0_27394                 	# [0]  
+
+    // count_x == 128: hardware loop runs a3 (count_y) times, eight multiply-accumulate steps per pass,
+    // then falls through into its own copy of the result store.
+.LBB56_dspi_dotprod_s8_aes3:	# 0x23b
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    l32i.n	a12,a1,0               	# [2]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [3]  id:731
+    ee.ld.128.usar.ip	q2,a2,16    	# [4]  id:732
+    sub	a12,a12,a5                	# [6]  
+    addi	a12,a12,16               	# [7]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [8]  id:733
+    loopnez	a3,.LBB195_dspi_dotprod_s8_aes3 	# [9]  
+
+.LBB193_dspi_dotprod_s8_aes3:	# 0x253
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:735
+    ee.vmulas.s8.accx.ld.ip.qup	q1,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:734
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:737
+    ee.vmulas.s8.accx.ld.ip.qup	q4,a2,16,q4,q2,q3,q1 	# [0*II+3]  id:736
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:739
+    ee.vmulas.s8.accx.ld.ip.qup	q0,a2,16,q0,q3,q1,q4 	# [0*II+5]  id:738
+    ee.vld.128.ip	q6,a15,16       	# [0*II+6]  id:741
+    ee.vmulas.s8.accx.ld.ip.qup	q1,a2,16,q5,q1,q4,q0 	# [0*II+7]  id:740
+    ee.vld.128.ip	q5,a15,16       	# [0*II+8]  id:743
+    ee.vmulas.s8.accx.ld.ip.qup	q4,a2,16,q6,q4,q0,q1 	# [0*II+9]  id:742
+    ee.vld.128.ip	q6,a15,16       	# [0*II+10]  id:745
+    ee.vmulas.s8.accx.ld.xp.qup	q0,a2,a12,q5,q0,q1,q4 	# [0*II+11]  id:744
+    ee.vld.128.ip	q5,a15,16       	# [0*II+12]  id:748
+    ee.vmulas.s8.accx.ld.xp.qup	q2,a2,a11,q6,q1,q4,q0 	# [0*II+13]  id:746
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+14]  id:747
+    ee.vld.128.ip	q0,a15,16       	# [0*II+15]  id:750
+    ee.vmulas.s8.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+16]  id:749
+
+    // Result store for the count_x == 128 path (same sequence as at .Lt_0_30210).
+.LBB195_dspi_dotprod_s8_aes3:	# 0x28e
+    movi.n	a2,0                   	# [0]  
+    movi.n	a11,1                  	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    rur.accx_0	a10                	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    sra	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:772
+    retw.n                        	# [10]  
+
+    // Fallback to the C implementation (count_x > 64 and odd).
+.LBB28_dspi_dotprod_s8_aes3:	# 0x2aa
+    mov.n	a15,a7                  	# [0]  
+    mov.n	a14,a6                  	# [1]  
+    mov.n	a13,a5                  	# [2]  
+    mov.n	a12,a4                  	# [3]  
+    mov.n	a11,a3                  	# [4]  
+    mov.n	a10,a2                  	# [5]  
+    call8	dspi_dotprod_s8_ansi    	# [6]  dspi_dotprod_s8_ansi
+
+#.LBB29_dspi_dotprod_s8_aes3:	# 0x2b9
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s8_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Reference (plain C) 2D dot product of an int8 image and filter: accumulates count_x * count_y
+// products, rounds, shifts right by `shift` (only when shift > 0) and stores the result to *out_value.
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds a stride, otherwise ESP_OK.
+esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift)
+{
+    if ((in_image->step_x * count_x > in_image->stride_x) || (in_image->step_y * count_y > in_image->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if ((filter->step_x * count_x > filter->stride_x) || (filter->step_y * count_y > filter->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    int8_t *i_data =  (int8_t *)in_image->data;
+    int8_t *f_data =  (int8_t *)filter->data;
+    int i_step = in_image->stride_x * in_image->step_y;    // elements to advance per processed image row
+    int f_step = filter->stride_x * filter->step_y;        // elements to advance per processed filter row
+
+    int32_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // Round to nearest, then shift. With shift <= 0 the term 1 << (shift - 1) is undefined, so skip both.
+    if (shift > 0) {
+        acc += (int32_t)1 << (shift - 1);
+        acc >>= shift;
+    }
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s8_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_s8_arp4.S
@@ -0,0 +1,93 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_s8_arp4
+    .global dspi_dotprod_s8_ansi
+    .type   dspi_dotprod_s8_arp4,@function
+
+// esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
+dspi_dotprod_s8_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2 (int8_t *, exactly one byte is stored)
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// Returns 0 in a0. Tail-jumps to dspi_dotprod_s8_ansi unless both step_x are 1 and count_x % 16 == 0.
+// i_data       - t0
+// f_data       - t1
+// i_step       - t2
+// f_step       - t3
+// t4           - current i_data
+// t5           - current f_data
+// t6           - count_x / 16 (inner loop count); a6 - shift - 1 (scratch for the rounding term)
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // should be 0 now
+    andi t2, a3, 15 // count_x % 16: one 128-bit vector holds 16 int8 values
+    or   t1, t1, t2
+
+    beqz    t1, .dspi_dotprod_s8_arp4_body
+    j   dspi_dotprod_s8_ansi    // tail call: arguments are still untouched in a0..a5
+
+.dspi_dotprod_s8_arp4_body:
+    add sp, sp, -16
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2  // i_step = stride_x * step_y (bytes, elements are 1 byte)
+
+    lw  t3, 8(a1)       // step_y
+    lw  t5, 12(a1)      // stride_x
+    mul t3, t5, t3      // f_step = stride_x * step_y
+
+    srli t6, a3, 4      // t6 = count_x / 16
+    li      t4, 0       // rounding term is 0 when shift <= 0
+    blez    a5, .dspi_dotprod_s8_arp4_round_done // sll only uses the low 5 bits, so shift - 1 = -1 would give 1 << 31
+    addi    a6, a5, -1
+    li      t4, 1
+    sll     t4, t4, a6  // t4 = 1 << (shift - 1)
+.dspi_dotprod_s8_arp4_round_done:
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4      // preload the accumulator with the rounding term
+.loop_count_y:
+        mv      t4, t0
+        mv      t5, t1
+        esp.vld.128.ip      q0, t4, 16          // q0 - i_data
+
+        esp.lp.setup    0, t6, .loop_count_x
+            esp.vld.128.ip  q1, t5, 16          // q1 - f_data
+.loop_count_x:  esp.vmulas.s8.xacc.ld.ip     q0, t4, 16, q0, q1 // q0 - i_data
+        add     t0, t0, t2      // next image row
+        add     t1, t1, t3      // next filter row
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.s.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sb  t5, 0(a2)               // store one byte: out_value is int8_t *, a halfword store would clobber the next byte
+
+    li  a0,0
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u16_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u16_aes3.S
@@ -0,0 +1,371 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_55, 458755
+
+    # Program Unit: dspi_dotprod_u16_aes3
+    # uint16 2D dot product of in_image and filter using the ee.vmulas.u16.accx vector MAC instructions.
+    # In:  a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift
+    # Out: a2 = 0 after the vector path, literal .LC0_1_55 when a step * count > stride check fails,
+    #      otherwise the return value of dspi_dotprod_u16_ansi (called for every layout not handled here).
+    .type	dspi_dotprod_u16_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_u16_aes3
+dspi_dotprod_u16_aes3:	# 0x4
+.LBB1_dspi_dotprod_u16_aes3:	# 0x4
+    entry	a1,64                   	#
+    l32i.n	a10,a2,4               	# [0]  id:681  in_image->step_x
+    l32i.n	a11,a2,12              	# [1]  id:680  in_image->stride_x
+    mull	a8,a10,a5                	# [2]  step_x * count_x
+    blt	a11,a8,.LBB81_dspi_dotprod_u16_aes3 	# [4]  stride_x < step_x * count_x -> error
+
+    l32i.n	a12,a2,8               	# [0]  id:682  in_image->step_y
+    l32i.n	a9,a2,16               	# [1]  id:683  in_image->stride_y
+    mull	a13,a12,a6               	# [2]  step_y * count_y
+    blt	a9,a13,.LBB81_dspi_dotprod_u16_aes3 	# [4]  stride_y < step_y * count_y -> error
+
+    l32i.n	a15,a3,4               	# [0]  id:685  filter->step_x
+    l32i.n	a14,a3,12              	# [1]  id:684  filter->stride_x
+    mull	a13,a15,a5               	# [2]  step_x * count_x
+    blt	a14,a13,.LBB81_dspi_dotprod_u16_aes3 	# [4]  stride_x < step_x * count_x -> error
+
+    l32i.n	a8,a3,16               	# [0]  id:687  filter->stride_y
+    l32i.n	a9,a3,8                	# [1]  id:686  filter->step_y
+    s32i.n	a9,a1,24               	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  step_y * count_y
+    blt	a8,a9,.LBB81_dspi_dotprod_u16_aes3 	# [5]  stride_y < step_y * count_y -> error
+
+    l32i.n	a8,a3,0                	# [0]  id:688  filter->data
+    s32i.n	a8,a1,20               	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_34050         	# [2]  filter data pointer has bit 0 set -> ANSI version
+    bne	a14,a13,.Lt_0_34050       	# [0]  filter stride_x != step_x * count_x -> ANSI version
+    bnei	a15,1,.Lt_0_34050        	# [0]  filter step_x != 1 -> ANSI version
+    l32i.n	a9,a1,24               	# [0]  gra_spill_temp_2
+    beqi	a9,1,.Lt_0_18178         	# [2]  filter step_y == 1 -> continue with the image checks
+
+.Lt_0_34050:	# 0x43
+.Lt_0_18434:	# 0x43
+    mov.n	a10,a2                  	# [0]  forward the six arguments unchanged
+    mov.n	a11,a3                  	# [1]
+    mov.n	a12,a4                  	# [2]
+    mov.n	a13,a5                  	# [3]
+    mov.n	a14,a6                  	# [4]
+    mov.n	a15,a7                  	# [5]
+    .type	dspi_dotprod_u16_ansi, @function
+    call8	dspi_dotprod_u16_ansi   	# [6]  unsigned reference implementation (data is uint16, not int16)
+
+    mov.n	a2,a10                  	# [0]  return its result
+    retw.n                        	# [1]
+
+.LBB81_dspi_dotprod_u16_aes3:	# 0x56
+    l32r	a2,.LC0_1_55             	# [0]  error literal (presumably ESP_ERR_DSP_PARAM_OUTOFRANGE, as in the ANSI version)
+    retw.n                        	# [1]
+
+.Lt_0_18178:	# 0x5b
+    addi.n	a13,a10,-1             	# [0]
+    bnez	a13,.Lt_0_34818          	# [1]  in_image step_x != 1 -> ANSI version
+
+    addi.n	a14,a12,-1             	# [0]
+    bnez	a14,.Lt_0_34818          	# [1]  in_image step_y != 1 -> ANSI version
+
+    extui	a15,a5,0,3              	# [0]  count_x & 7
+    bnez.n	a15,.Lt_0_34818        	# [1]  count_x not a multiple of 8 -> ANSI version
+
+    blti	a6,4,.Lt_0_34818         	# [0]  count_y < 4 -> ANSI version
+
+    movi.n	a8,32                  	# [0]
+    bge	a8,a5,.Lt_0_35330         	# [1]  count_x <= 32: skip the odd-count test
+
+    extui	a9,a5,0,1               	# [0]  count_x & 1
+    bnez	a9,.LBB28_dspi_dotprod_u16_aes3 	# [1]
+
+.Lt_0_35330:	# 0x78
+.Lt_0_20226:	# 0x78
+    mov.n	a3,a6                   	# [0]  a3 = count_y (row loop count)
+    addi	a10,a5,-24               	# [1]
+    mull	a13,a11,a12              	# [2]  in_image stride_x * step_y
+    l32i.n	a15,a1,20              	# [3]  gra_spill_temp_1 (filter data pointer)
+    l32i.n	a2,a2,0                	# [4]  id:689  in_image->data
+    movi.n	a14,0                  	# [5]
+    wur.sar_byte	a14              	# [6]
+    wur.accx_0	a14                	# [8]  clear the accumulator
+    wur.accx_1	a14                	# [9]
+    ee.vld.128.ip	q0,a15,16       	# [10]  id:693  first filter vector
+    slli	a13,a13,1                	# [11]  image row step in bytes (2 bytes per element)
+    s32i.n	a13,a1,16              	# [12]  gra_spill_temp_0
+    beqz	a10,.LBB32_dspi_dotprod_u16_aes3 	# [13]  count_x == 24
+
+.Lt_0_23298:	# 0x99
+.Lt_0_22786:	# 0x99
+    addi	a8,a5,-16                	# [0]
+    beqz	a8,.LBB38_dspi_dotprod_u16_aes3 	# [1]  count_x == 16
+
+.Lt_0_24834:	# 0x9f
+.Lt_0_24322:	# 0x9f
+    addi	a9,a5,-8                 	# [0]
+    beqz	a9,.LBB44_dspi_dotprod_u16_aes3 	# [1]  count_x == 8
+
+.Lt_0_26370:	# 0xa5
+.Lt_0_25858:	# 0xa5
+    addi	a10,a5,-32               	# [0]
+    beqz	a10,.LBB50_dspi_dotprod_u16_aes3 	# [1]  count_x == 32
+
+.Lt_0_27906:	# 0xab
+.Lt_0_27394:	# 0xab
+    addi	a11,a5,-64               	# [0]
+    beqz	a11,.LBB56_dspi_dotprod_u16_aes3 	# [1]  count_x == 64
+
+    movi.n	a12,64                 	# [0]
+    bge	a12,a5,.Lt_0_30722        	# [1]  count_x <= 64: a specialised loop already ran, go store the result
+
+    movi.n	a12,0                  	# [0]  a12 = rows done (general loop for count_x > 64)
+    ee.ld.128.usar.ip	q1,a2,16    	# [1]  id:765
+    ee.ld.128.usar.ip	q2,a2,16    	# [2]  id:766
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [4]  id:767
+    beqz.n	a3,.Lt_0_30722         	# [5]
+
+    slli	a8,a5,1                  	# [0]  count_x in bytes
+    l32i.n	a14,a1,16              	# [1]  gra_spill_temp_0
+    addi	a13,a5,31                	# [2]
+    movgez	a13,a5,a5              	# [3]
+    srai	a13,a13,5                	# [4]  count_x / 32
+    sub	a14,a14,a8                	# [5]  row step minus row length, in bytes
+    addi	a14,a14,16               	# [6]
+    addi.n	a13,a13,-1             	# [7]  inner loop count; the last group is handled after the loop
+
+.Lt_0_31490:	# 0xd9
+    addi.n	a12,a12,1              	# [0]
+    movi.n	a9,32                  	# [1]
+    beqz.n	a13,.Lt_0_31746        	# [2]
+
+    loopnez	a13,.LBB221_dspi_dotprod_u16_aes3 	# [0]
+
+.LBB219_dspi_dotprod_u16_aes3:	# 0xe2
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:769
+    ee.vmulas.u16.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:768
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:771
+    ee.vmulas.u16.accx.ld.ip.qup	q1,a2,16,q5,q2,q3,q4 	# [0*II+3]  id:770
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:773
+    ee.vmulas.u16.accx.ld.ip.qup	q2,a2,16,q0,q3,q4,q1 	# [0*II+5]  id:772
+    ee.vld.128.ip	q0,a15,16       	# [0*II+6]  id:775
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+7]  id:774
+
+.LBB221_dspi_dotprod_u16_aes3:	# 0xfe
+.Lt_0_31746:	# 0xfe
+    ee.vmulas.u16.accx.ld.ip.qup	q5,a2,16,q0,q1,q2,q3 	# [0]  id:776
+    movi.n	a10,-16                	# [1]
+    ee.vld.128.ip	q0,a15,16       	# [2]  id:777
+    ee.vld.128.ip	q6,a15,16       	# [3]  id:779
+    ee.vmulas.u16.accx.ld.xp.qup	q7,a2,a14,q0,q2,q3,q5 	# [4]  id:778
+    ee.vld.128.ip	q4,a15,16       	# [5]  id:782
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a2,a10,q6,q3,q5,q7 	# [6]  id:780
+    ee.ld.128.usar.xp	q1,a2,a9    	# [7]  id:781
+    ee.vld.128.ip	q0,a15,16       	# [8]  id:784
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a2,16,q4,q5,q1,q2 	# [9]  id:783
+    bne	a12,a3,.Lt_0_31490        	# [10]  until all count_y rows are done
+
+.Lt_0_30722:	# 0x122
+.Lt_0_30466:	# 0x122
+    rur.accx_0	a9                 	# [0]  accumulator, low word
+    rur.accx_1	a10                	# [1]  accumulator, high part
+    blti	a7,1,.Lt_0_33282         	# [2]  shift < 1: store without rounding or shifting
+
+    movi.n	a2,0                   	# [0]  return value 0
+    addi	a13,a7,-33               	# [1]
+    addi.n	a14,a7,-1              	# [2]
+    ssr	a14                       	# [3]  shift amount = shift - 1
+    sra	a12,a10                   	# [4]
+    src	a11,a10,a9                	# [5]  (a10:a9) >> (shift - 1)
+    movgez	a11,a12,a13            	# [6]  shift >= 33: use the shifted high part instead
+    addi.n	a11,a11,1              	# [7]  round: add 1, then drop the last bit
+    srli	a11,a11,1                	# [8]
+    s16i	a11,a4,0                 	# [9]  id:790  *out_value
+    retw.n                        	# [10]
+
+.Lt_0_34818:	# 0x148
+.Lt_0_19458:	# 0x148
+    mov.n	a10,a2                  	# [0]  forward the six arguments unchanged
+    mov.n	a11,a3                  	# [1]
+    mov.n	a12,a4                  	# [2]
+    mov.n	a13,a5                  	# [3]
+    mov.n	a14,a6                  	# [4]
+    mov.n	a15,a7                  	# [5]
+    call8	dspi_dotprod_u16_ansi   	# [6]  unsigned reference implementation
+
+    mov.n	a2,a10                  	# [0]
+    retw.n                        	# [1]
+
+.LBB32_dspi_dotprod_u16_aes3:	# 0x15b  count_x == 24
+    ee.ld.128.usar.ip	q1,a2,16    	# [0]  id:694
+    ee.ld.128.usar.ip	q2,a2,16    	# [1]  id:695
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [3]  id:696
+    beqz.n	a6,.Lt_0_23298         	# [4]
+
+    addi	a12,a13,-32              	# [0]
+    movi.n	a10,32                 	# [1]
+    movi.n	a11,-16                	# [2]
+    loopgtz	a6,.LBB107_dspi_dotprod_u16_aes3 	# [3]  one iteration per row
+
+.LBB105_dspi_dotprod_u16_aes3:	# 0x170
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:698
+    ee.vmulas.u16.accx.ld.xp.qup	q1,a2,a12,q0,q1,q2,q3 	# [0*II+1]  id:697
+    ee.vld.128.ip	q5,a15,16       	# [0*II+2]  id:700
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a2,a11,q4,q2,q3,q1 	# [0*II+3]  id:699
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+4]  id:701
+    ee.vld.128.ip	q0,a15,16       	# [0*II+5]  id:703
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a2,16,q5,q3,q1,q2 	# [0*II+6]  id:702
+
+.LBB107_dspi_dotprod_u16_aes3:	# 0x188
+    j	.Lt_0_23298                 	# [0]
+
+.LBB38_dspi_dotprod_u16_aes3:	# 0x18b  count_x == 16
+    movi.n	a10,32                 	# [0]
+    movi.n	a11,-16                	# [1]
+    srli	a3,a6,1                  	# [2]  count_y / 2 loop iterations
+    l32i.n	a12,a1,16              	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:704
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:705
+    addi	a12,a12,-16              	# [7]
+    ee.src.q.ld.xp	q3,a2,a12,q1,q2 	# [8]  id:706
+    loopnez	a3,.LBB130_dspi_dotprod_u16_aes3 	# [9]
+
+.LBB128_dspi_dotprod_u16_aes3:	# 0x1a3
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:708
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a2,a11,q0,q1,q2,q3 	# [0*II+1]  id:707
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+2]  id:709
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:711
+    ee.vmulas.u16.accx.ld.xp.qup	q4,a2,a12,q4,q2,q1,q3 	# [0*II+4]  id:710
+    ee.vld.128.ip	q5,a15,16       	# [0*II+5]  id:713
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a2,a11,q0,q1,q3,q4 	# [0*II+6]  id:712
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+7]  id:714
+    ee.vld.128.ip	q0,a15,16       	# [0*II+8]  id:716
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a2,a12,q5,q3,q1,q2 	# [0*II+9]  id:715
+
+.LBB130_dspi_dotprod_u16_aes3:	# 0x1c5
+    j	.Lt_0_24834                 	# [0]
+
+.LBB44_dspi_dotprod_u16_aes3:	# 0x1c8  count_x == 8
+    srli	a3,a3,2                  	# [0]  count_y / 4 loop iterations
+    movi.n	a10,-16                	# [1]
+    l32i.n	a11,a1,16              	# [2]  gra_spill_temp_0
+    addi	a8,a2,16                 	# [3]
+    addi	a11,a11,16               	# [4]
+    ee.ld.128.usar.xp	q2,a8,a10   	# [5]  id:717
+    ee.ld.128.usar.xp	q1,a8,a11   	# [6]  id:718
+    ee.src.q.ld.xp	q3,a8,a10,q1,q2 	# [8]  id:719
+    ee.ld.128.usar.xp	q2,a8,a11   	# [9]  id:720
+    loopnez	a3,.LBB153_dspi_dotprod_u16_aes3 	# [10]
+
+.LBB151_dspi_dotprod_u16_aes3:	# 0x1e4
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:722
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a8,a10,q0,q1,q2,q3 	# [0*II+1]  id:721
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+2]  id:723
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:725
+    ee.vmulas.u16.accx.ld.xp.qup	q4,a8,a10,q4,q2,q1,q3 	# [0*II+4]  id:724
+    ee.ld.128.usar.xp	q3,a8,a11   	# [0*II+5]  id:726
+    ee.vld.128.ip	q5,a15,16       	# [0*II+6]  id:728
+    ee.vmulas.u16.accx.ld.xp.qup	q4,a8,a10,q0,q1,q3,q4 	# [0*II+7]  id:727
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+8]  id:729
+    ee.vld.128.ip	q0,a15,16       	# [0*II+9]  id:731
+    ee.vmulas.u16.accx.ld.xp.qup	q3,a8,a10,q5,q3,q1,q4 	# [0*II+10]  id:730
+    ee.ld.128.usar.xp	q2,a8,a11   	# [0*II+11]  id:732
+
+.LBB153_dspi_dotprod_u16_aes3:	# 0x20c
+    mov.n	a2,a8                   	# [0]
+    j	.Lt_0_26370                 	# [1]
+
+.LBB50_dspi_dotprod_u16_aes3:	# 0x211  count_x == 32
+    movi.n	a10,32                 	# [0]
+    movi.n	a11,-16                	# [1]
+    slli	a13,a5,1                 	# [2]  count_x in bytes
+    l32i.n	a12,a1,16              	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:733
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:734
+    sub	a12,a12,a13               	# [6]
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [8]  id:735
+    addi	a12,a12,16               	# [9]
+    loopnez	a3,.LBB176_dspi_dotprod_u16_aes3 	# [10]  one iteration per row
+
+.LBB174_dspi_dotprod_u16_aes3:	# 0x22c
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:737
+    ee.vmulas.u16.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:736
+    ee.vld.128.ip	q1,a15,16       	# [0*II+2]  id:739
+    ee.vmulas.u16.accx.ld.xp.qup	q0,a2,a12,q5,q2,q3,q4 	# [0*II+3]  id:738
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:742
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a2,a11,q1,q3,q4,q0 	# [0*II+5]  id:740
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+6]  id:741
+    ee.vld.128.ip	q0,a15,16       	# [0*II+7]  id:744
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+8]  id:743
+
+.LBB176_dspi_dotprod_u16_aes3:	# 0x24b
+    j	.Lt_0_27906                 	# [0]
+
+.LBB56_dspi_dotprod_u16_aes3:	# 0x24e  count_x == 64
+    movi.n	a10,32                 	# [0]
+    movi.n	a11,-16                	# [1]
+    slli	a13,a5,1                 	# [2]  count_x in bytes
+    l32i.n	a12,a1,16              	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:745
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:746
+    sub	a12,a12,a13               	# [7]
+    addi	a12,a12,16               	# [8]
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [9]  id:747
+    loopnez	a3,.LBB198_dspi_dotprod_u16_aes3 	# [10]  one iteration per row
+
+.LBB196_dspi_dotprod_u16_aes3:	# 0x269
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:749
+    ee.vmulas.u16.accx.ld.ip.qup	q1,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:748
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:751
+    ee.vmulas.u16.accx.ld.ip.qup	q4,a2,16,q4,q2,q3,q1 	# [0*II+3]  id:750
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:753
+    ee.vmulas.u16.accx.ld.ip.qup	q0,a2,16,q0,q3,q1,q4 	# [0*II+5]  id:752
+    ee.vld.128.ip	q6,a15,16           # [0*II+6]  id:755
+    ee.vmulas.u16.accx.ld.ip.qup	q1,a2,16,q5,q1,q4,q0 	# [0*II+7]  id:754
+    ee.vld.128.ip	q5,a15,16       	# [0*II+8]  id:757
+    ee.vmulas.u16.accx.ld.ip.qup	q4,a2,16,q6,q4,q0,q1 	# [0*II+9]  id:756
+    ee.vld.128.ip	q6,a15,16       	# [0*II+10]  id:759
+    ee.vmulas.u16.accx.ld.xp.qup	q0,a2,a12,q5,q0,q1,q4 	# [0*II+11]  id:758
+    ee.vld.128.ip	q5,a15,16       	# [0*II+12]  id:762
+    ee.vmulas.u16.accx.ld.xp.qup	q2,a2,a11,q6,q1,q4,q0 	# [0*II+13]  id:760
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+14]  id:761
+    ee.vld.128.ip	q0,a15,16       	# [0*II+15]  id:764
+    ee.vmulas.u16.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+16]  id:763
+
+.LBB198_dspi_dotprod_u16_aes3:	# 0x2a4
+    j	.Lt_0_30722                 	# [0]
+
+.Lt_0_33282:	# 0x2a7  shift < 1
+    movi.n	a2,0                   	# [0]  return value 0
+    sext	a14,a9,15                	# [1]
+    s16i	a14,a4,0                 	# [2]  id:791  low 16 bits of the accumulator, unshifted
+    retw.n                        	# [3]
+
+.LBB28_dspi_dotprod_u16_aes3:	# 0x2b1
+    mov.n	a15,a7                  	# [0]  forward the six arguments unchanged
+    mov.n	a14,a6                  	# [1]
+    mov.n	a13,a5                  	# [2]
+    mov.n	a12,a4                  	# [3]
+    mov.n	a11,a3                  	# [4]
+    mov.n	a10,a2                  	# [5]
+    call8	dspi_dotprod_u16_ansi   	# [6]  unsigned reference implementation
+
+    mov.n	a2,a10                  	# [0]
+    retw.n                        	# [1]
+
+#endif // dspi_dotprod_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u16_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Reference (plain C) 2D dot product of a uint16 image and filter: accumulates count_x * count_y
+// products in 64 bits, rounds, shifts right by `shift` (only when shift > 0) and stores the result to
+// *out_value. Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds a stride, else ESP_OK.
+esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift)
+{
+    if ((in_image->step_x * count_x > in_image->stride_x) || (in_image->step_y * count_y > in_image->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+    if ((filter->step_x * count_x > filter->stride_x) || (filter->step_y * count_y > filter->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    uint16_t *i_data =  (uint16_t *)in_image->data;
+    uint16_t *f_data =  (uint16_t *)filter->data;
+    int i_step = in_image->stride_x * in_image->step_y;    // elements to advance per processed image row
+    int f_step = filter->stride_x * filter->step_y;        // elements to advance per processed filter row
+
+    int64_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int64_t)i_data[in_image->step_x * x] * (int64_t)f_data[filter->step_x * x]; // 0xFFFF * 0xFFFF overflows int32_t
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // Round to nearest, then shift. The rounding term is built in 64 bits; shift <= 0 skips both steps.
+    if (shift > 0) {
+        acc += (int64_t)1 << (shift - 1);
+        acc >>= shift;
+    }
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u16_arp4.S
@@ -0,0 +1,95 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_u16_arp4
+    .global dspi_dotprod_u16_ansi
+    .type   dspi_dotprod_u16_arp4,@function
+
+// esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
+dspi_dotprod_u16_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// Returns 0 in a0. Tail-jumps to dspi_dotprod_u16_ansi unless both step_x are 1 and count_x % 8 == 0.
+// i_data       - t0
+// f_data       - t1
+// i_step       - t2
+// f_step       - t3
+// t4           - current i_data
+// t5           - current f_data
+// t6           - count_x / 8 (inner loop count); a6 - shift - 1 (scratch for the rounding term)
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // should be 0 now
+    andi t2, a3, 7  // count_x % 8: one 128-bit vector holds 8 uint16 values
+    or   t1, t1, t2
+
+    beqz    t1, .dspi_dotprod_u16_arp4_body
+    j   dspi_dotprod_u16_ansi   // tail call: arguments are still untouched in a0..a5
+
+.dspi_dotprod_u16_arp4_body:
+    add sp, sp, -16
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2
+    slli t2, t2, 1  // i_step = i_step<<1
+
+    lw  t3, 8(a1)   // step_y
+    lw  t5, 12(a1)  // stride_x
+    mul t3, t5, t3
+    slli t3, t3, 1  // f_step = f_step<<1
+
+    srli t6, a3, 3  // t6 = count_x / 8
+    li      t4, 0   // rounding term is 0 when shift <= 0
+    blez    a5, .dspi_dotprod_u16_arp4_round_done // sll only uses the low 5 bits, so shift - 1 = -1 would give 1 << 31
+    addi    a6, a5, -1
+    li      t4, 1
+    sll     t4, t4, a6  // t4 = 1 << (shift - 1)
+.dspi_dotprod_u16_arp4_round_done:
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4      // preload the accumulator with the rounding term
+.loop_count_y:
+        mv      t4, t0
+        mv      t5, t1
+        esp.vld.128.ip  q0, t4, 16  // q0 - i_data
+
+        esp.lp.setup    0, t6, .loop_count_x
+            esp.vld.128.ip  q1, t5, 16          // q1 - f_data
+.loop_count_x:      esp.vmulas.u16.xacc.ld.ip  q0, t4, 16, q0, q1 // q0 - i_data
+        add     t0, t0, t2      // next image row
+        add     t1, t1, t3      // next filter row
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.u.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sh  t5, 0(a2)               // store result to output buffer 
+
+    li  a0,0
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u8_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u8_aes3.S
@@ -0,0 +1,367 @@
+// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_aes3_enabled == 1)
+
+    .text
+    .align	4
+    .literal	.LC0_1_52, 458755
+
+    .type	dspi_dotprod_u8_aes3, @function
+    .align	 4
+    .global	dspi_dotprod_u8_aes3
+dspi_dotprod_u8_aes3:	# 0x4
+.LBB1_dspi_dotprod_u8_aes3:	# 0x4
+    entry	a1,48                   	#  
+    l32i.n	a10,a2,4               	# [0]  id:669
+    l32i.n	a11,a2,12              	# [1]  id:668
+    mull	a8,a10,a5                	# [2]  
+    blt	a11,a8,.LBB78_dspi_dotprod_u8_aes3 	# [4]  
+
+    l32i.n	a12,a2,8               	# [0]  id:670
+    l32i.n	a9,a2,16               	# [1]  id:671
+    mull	a13,a12,a6               	# [2]  
+    blt	a9,a13,.LBB78_dspi_dotprod_u8_aes3 	# [4]  
+
+    l32i.n	a15,a3,4               	# [0]  id:673
+    l32i.n	a14,a3,12              	# [1]  id:672
+    mull	a13,a15,a5               	# [2]  
+    blt	a14,a13,.LBB78_dspi_dotprod_u8_aes3 	# [4]  
+
+    l32i.n	a8,a3,16               	# [0]  id:675
+    l32i.n	a9,a3,8                	# [1]  id:674
+    s32i.n	a9,a1,8                	# [2]  gra_spill_temp_2
+    mull	a9,a9,a6                 	# [3]  
+    blt	a8,a9,.LBB78_dspi_dotprod_u8_aes3 	# [5]  
+
+    l32i.n	a8,a3,0                	# [0]  id:676
+    s32i.n	a8,a1,4                	# [1]  gra_spill_temp_1
+    bbsi	a8,0,.Lt_0_33026         	# [2]  
+
+    bne	a14,a13,.Lt_0_33026       	# [0]  
+
+    bnei	a15,1,.Lt_0_33026        	# [0]  
+
+    l32i.n	a13,a1,8               	# [0]  gra_spill_temp_2
+    beqi	a13,1,.Lt_0_17666        	# [2]  
+
+.Lt_0_33026:	# 0x43
+.Lt_0_17922:	# 0x43
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    .type	dspi_dotprod_u8_ansi, @function
+    call8	dspi_dotprod_u8_ansi    	# [6]  dspi_dotprod_u8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+.LBB78_dspi_dotprod_u8_aes3:	# 0x56
+    l32r	a2,.LC0_1_52             	# [0]  
+    retw.n                        	# [1]  
+
+.Lt_0_17666:	# 0x5b
+    addi.n	a14,a10,-1             	# [0]  
+    bnez	a14,.Lt_0_33794          	# [1]  
+
+    addi.n	a15,a12,-1             	# [0]  
+    bnez	a15,.Lt_0_33794          	# [1]  
+
+    extui	a8,a5,0,4               	# [0]  
+    bnez.n	a8,.Lt_0_33794         	# [1]  
+
+    blti	a6,4,.Lt_0_33794         	# [0]  
+
+    movi.n	a9,64                  	# [0]  
+    bge	a9,a5,.Lt_0_34306         	# [1]  
+
+    extui	a10,a5,0,1              	# [0]  
+    bnez	a10,.LBB28_dspi_dotprod_u8_aes3 	# [1]  
+
+.Lt_0_34306:	# 0x78
+.Lt_0_19714:	# 0x78
+    mov.n	a3,a6                   	# [0]  
+    addi	a13,a5,-48               	# [1]  
+    movi.n	a14,0                  	# [2]  
+    mull	a15,a11,a12              	# [3]  
+    l32i.n	a2,a2,0                	# [4]  id:677
+    s32i.n	a15,a1,0               	# [6]  gra_spill_temp_0
+    wur.accx_0	a14                	# [7]  
+    l32i.n	a15,a1,4               	# [8]  gra_spill_temp_1
+    wur.accx_1	a14                	# [9]  
+    ee.vld.128.ip	q0,a15,16       	# [10]  id:680
+    beqz	a13,.LBB32_dspi_dotprod_u8_aes3 	# [11]  
+
+.Lt_0_22786:	# 0x93
+.Lt_0_22274:	# 0x93
+    addi	a8,a5,-32                	# [0]  
+    beqz	a8,.LBB38_dspi_dotprod_u8_aes3 	# [1]  
+
+.Lt_0_24322:	# 0x99
+.Lt_0_23810:	# 0x99
+    addi	a9,a5,-16                	# [0]  
+    beqz	a9,.LBB44_dspi_dotprod_u8_aes3 	# [1]  
+
+.Lt_0_25858:	# 0x9f
+.Lt_0_25346:	# 0x9f
+    addi	a10,a5,-64               	# [0]  
+    beqz	a10,.LBB50_dspi_dotprod_u8_aes3 	# [1]  
+
+.Lt_0_27394:	# 0xa5
+.Lt_0_26882:	# 0xa5
+    addi	a11,a5,-128              	# [0]  
+    beqz	a11,.LBB56_dspi_dotprod_u8_aes3 	# [1]  
+
+    movi	a12,128                  	# [0]  
+    bge	a12,a5,.Lt_0_30210        	# [1]  
+
+    movi.n	a12,0                  	# [0]  
+    ee.ld.128.usar.ip	q1,a2,16    	# [1]  id:752
+    ee.ld.128.usar.ip	q2,a2,16    	# [2]  id:753
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [4]  id:754
+    beqz.n	a3,.Lt_0_30210         	# [5]  
+
+    l32i.n	a14,a1,0               	# [0]  gra_spill_temp_0
+    addi	a13,a5,31                	# [1]  
+    movgez	a13,a5,a5              	# [2]  
+    srai	a13,a13,5                	# [3]  
+    sub	a14,a14,a5                	# [4]  
+    addi	a14,a14,16               	# [5]  
+    addi.n	a13,a13,-1             	# [6]  
+
+.Lt_0_30978:	# 0xd1
+    addi.n	a12,a12,1              	# [0]  
+    movi.n	a8,32                  	# [1]  
+    movi.n	a9,-16                 	# [2]  
+    beqz.n	a13,.Lt_0_31234        	# [3]  
+
+    loopnez	a13,.LBB218_dspi_dotprod_u8_aes3 	# [0]  
+
+.LBB216_dspi_dotprod_u8_aes3:	# 0xdc
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:756
+    ee.vmulas.u8.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:755
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:758
+    ee.vmulas.u8.accx.ld.ip.qup	q1,a2,16,q5,q2,q3,q4 	# [0*II+3]  id:757
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:760
+    ee.vmulas.u8.accx.ld.ip.qup	q2,a2,16,q0,q3,q4,q1 	# [0*II+5]  id:759
+    ee.vld.128.ip	q0,a15,16       	# [0*II+6]  id:762
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+7]  id:761
+
+.LBB218_dspi_dotprod_u8_aes3:	# 0xf8
+
+.Lt_0_31234:	# 0xf8
+    ee.vmulas.u8.accx.ld.ip.qup	q5,a2,16,q0,q1,q2,q3 	# [0]  id:763
+    ee.vld.128.ip	q0,a15,16       	# [1]  id:764
+    ee.vld.128.ip	q6,a15,16       	# [2]  id:766
+    ee.vmulas.u8.accx.ld.xp.qup	q7,a2,a14,q0,q2,q3,q5 	# [3]  id:765
+    ee.vld.128.ip	q4,a15,16       	# [4]  id:769
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a2,a9,q6,q3,q5,q7 	# [5]  id:767
+    ee.ld.128.usar.xp	q1,a2,a8    	# [6]  id:768
+    ee.vld.128.ip	q0,a15,16       	# [7]  id:771
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a2,16,q4,q5,q1,q2 	# [8]  id:770
+    bne	a12,a3,.Lt_0_30978        	# [9]  
+
+.Lt_0_30210:	# 0x11a
+.Lt_0_29954:	# 0x11a
+    movi.n	a2,0                   	# [0]  
+    rur.accx_0	a10                	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    movi.n	a11,1                  	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    srl	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:773
+    retw.n                        	# [10]  
+
+.Lt_0_33794:	# 0x136
+.Lt_0_18946:	# 0x136
+    mov.n	a10,a2                  	# [0]  
+    mov.n	a11,a3                  	# [1]  
+    mov.n	a12,a4                  	# [2]  
+    mov.n	a13,a5                  	# [3]  
+    mov.n	a14,a6                  	# [4]  
+    mov.n	a15,a7                  	# [5]  
+    call8	dspi_dotprod_u8_ansi    	# [6]  dspi_dotprod_u8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+.LBB32_dspi_dotprod_u8_aes3:	# 0x149
+    ee.ld.128.usar.ip	q1,a2,16    	# [0]  id:681
+    ee.ld.128.usar.ip	q2,a2,16    	# [1]  id:682
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [3]  id:683
+    beqz.n	a6,.Lt_0_22786         	# [4]  
+
+    movi.n	a10,32                 	# [0]  
+    l32i.n	a12,a1,0               	# [1]  gra_spill_temp_0
+    movi.n	a11,-16                	# [2]  
+    addi	a12,a12,-32              	# [3]  
+    loopgtz	a6,.LBB104_dspi_dotprod_u8_aes3 	# [4]  
+
+.LBB102_dspi_dotprod_u8_aes3:	# 0x160
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:685
+    ee.vmulas.u8.accx.ld.xp.qup	q1,a2,a12,q0,q1,q2,q3 	# [0*II+1]  id:684
+    ee.vld.128.ip	q5,a15,16       	# [0*II+2]  id:687
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a2,a11,q4,q2,q3,q1 	# [0*II+3]  id:686
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+4]  id:688
+    ee.vld.128.ip	q0,a15,16       	# [0*II+5]  id:690
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a2,16,q5,q3,q1,q2 	# [0*II+6]  id:689
+
+.LBB104_dspi_dotprod_u8_aes3:	# 0x178
+    j	.Lt_0_22786                 	# [0]  
+
+.LBB38_dspi_dotprod_u8_aes3:	# 0x17b
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    srli	a3,a6,1                  	# [2]  
+    l32i.n	a12,a1,0               	# [3]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [4]  id:691
+    ee.ld.128.usar.ip	q2,a2,16    	# [5]  id:692
+    addi	a12,a12,-16              	# [7]  
+    ee.src.q.ld.xp	q3,a2,a12,q1,q2 	# [8]  id:693
+    loopnez	a3,.LBB127_dspi_dotprod_u8_aes3 	# [9]  
+
+.LBB125_dspi_dotprod_u8_aes3:	# 0x193
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:695
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a2,a11,q0,q1,q2,q3 	# [0*II+1]  id:694
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+2]  id:696
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:698
+    ee.vmulas.u8.accx.ld.xp.qup	q4,a2,a12,q4,q2,q1,q3 	# [0*II+4]  id:697
+    ee.vld.128.ip	q5,a15,16       	# [0*II+5]  id:700
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a2,a11,q0,q1,q3,q4 	# [0*II+6]  id:699
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+7]  id:701
+    ee.vld.128.ip	q0,a15,16       	# [0*II+8]  id:703
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a2,a12,q5,q3,q1,q2 	# [0*II+9]  id:702
+
+.LBB127_dspi_dotprod_u8_aes3:	# 0x1b5
+    j	.Lt_0_24322                 	# [0]  
+
+.LBB44_dspi_dotprod_u8_aes3:	# 0x1b8
+    srli	a3,a3,2                  	# [0]  
+    movi.n	a10,-16                	# [1]  
+    l32i.n	a11,a1,0               	# [2]  gra_spill_temp_0
+    addi	a8,a2,16                 	# [3]  
+    addi	a11,a11,16               	# [4]  
+    ee.ld.128.usar.xp	q2,a8,a10   	# [5]  id:704
+    ee.ld.128.usar.xp	q1,a8,a11   	# [6]  id:705
+    ee.src.q.ld.xp	q3,a8,a10,q1,q2 	# [8]  id:706
+    ee.ld.128.usar.xp	q2,a8,a11   	# [9]  id:707
+    loopnez	a3,.LBB150_dspi_dotprod_u8_aes3 	# [10]  
+
+.LBB148_dspi_dotprod_u8_aes3:	# 0x1d4
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:709
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a8,a10,q0,q1,q2,q3 	# [0*II+1]  id:708
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+2]  id:710
+    ee.vld.128.ip	q0,a15,16       	# [0*II+3]  id:712
+    ee.vmulas.u8.accx.ld.xp.qup	q4,a8,a10,q4,q2,q1,q3 	# [0*II+4]  id:711
+    ee.ld.128.usar.xp	q3,a8,a11   	# [0*II+5]  id:713
+    ee.vld.128.ip	q5,a15,16       	# [0*II+6]  id:715
+    ee.vmulas.u8.accx.ld.xp.qup	q4,a8,a10,q0,q1,q3,q4 	# [0*II+7]  id:714
+    ee.ld.128.usar.xp	q1,a8,a11   	# [0*II+8]  id:716
+    ee.vld.128.ip	q0,a15,16       	# [0*II+9]  id:718
+    ee.vmulas.u8.accx.ld.xp.qup	q3,a8,a10,q5,q3,q1,q4 	# [0*II+10]  id:717
+    ee.ld.128.usar.xp	q2,a8,a11   	# [0*II+11]  id:719
+
+.LBB150_dspi_dotprod_u8_aes3:	# 0x1fc
+    mov.n	a2,a8                   	# [0]  
+    j	.Lt_0_25858                 	# [1]  
+
+.LBB50_dspi_dotprod_u8_aes3:	# 0x201
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    l32i.n	a12,a1,0               	# [2]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [3]  id:720
+    ee.ld.128.usar.ip	q2,a2,16    	# [4]  id:721
+    sub	a12,a12,a5                	# [5]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [7]  id:722
+    addi	a12,a12,16               	# [8]  
+    loopnez	a3,.LBB173_dspi_dotprod_u8_aes3 	# [9]  
+
+.LBB171_dspi_dotprod_u8_aes3:	# 0x219
+    ee.vld.128.ip	q5,a15,16       	# [0*II+0]  id:724
+    ee.vmulas.u8.accx.ld.ip.qup	q4,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:723
+    ee.vld.128.ip	q1,a15,16       	# [0*II+2]  id:726
+    ee.vmulas.u8.accx.ld.xp.qup	q0,a2,a12,q5,q2,q3,q4 	# [0*II+3]  id:725
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:729
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a2,a11,q1,q3,q4,q0 	# [0*II+5]  id:727
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+6]  id:728
+    ee.vld.128.ip	q0,a15,16       	# [0*II+7]  id:731
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+8]  id:730
+
+.LBB173_dspi_dotprod_u8_aes3:	# 0x238
+    j	.Lt_0_27394                 	# [0]  
+
+.LBB56_dspi_dotprod_u8_aes3:	# 0x23b
+    movi.n	a10,32                 	# [0]  
+    movi.n	a11,-16                	# [1]  
+    l32i.n	a12,a1,0               	# [2]  gra_spill_temp_0
+    ee.ld.128.usar.ip	q1,a2,16    	# [3]  id:732
+    ee.ld.128.usar.ip	q2,a2,16    	# [4]  id:733
+    sub	a12,a12,a5                	# [6]  
+    addi	a12,a12,16               	# [7]  
+    ee.src.q.ld.ip	q3,a2,16,q1,q2 	# [8]  id:734
+    loopnez	a3,.LBB195_dspi_dotprod_u8_aes3 	# [9]  
+
+.LBB193_dspi_dotprod_u8_aes3:	# 0x253
+    ee.vld.128.ip	q4,a15,16       	# [0*II+0]  id:736
+    ee.vmulas.u8.accx.ld.ip.qup	q1,a2,16,q0,q1,q2,q3 	# [0*II+1]  id:735
+    ee.vld.128.ip	q0,a15,16       	# [0*II+2]  id:738
+    ee.vmulas.u8.accx.ld.ip.qup	q4,a2,16,q4,q2,q3,q1 	# [0*II+3]  id:737
+    ee.vld.128.ip	q5,a15,16       	# [0*II+4]  id:740
+    ee.vmulas.u8.accx.ld.ip.qup	q0,a2,16,q0,q3,q1,q4 	# [0*II+5]  id:739
+    ee.vld.128.ip	q6,a15,16       	# [0*II+6]  id:742
+    ee.vmulas.u8.accx.ld.ip.qup	q1,a2,16,q5,q1,q4,q0 	# [0*II+7]  id:741
+    ee.vld.128.ip	q5,a15,16       	# [0*II+8]  id:744
+    ee.vmulas.u8.accx.ld.ip.qup	q4,a2,16,q6,q4,q0,q1 	# [0*II+9]  id:743
+    ee.vld.128.ip	q6,a15,16       	# [0*II+10]  id:746
+    ee.vmulas.u8.accx.ld.xp.qup	q0,a2,a12,q5,q0,q1,q4 	# [0*II+11]  id:745
+    ee.vld.128.ip	q5,a15,16       	# [0*II+12]  id:749
+    ee.vmulas.u8.accx.ld.xp.qup	q2,a2,a11,q6,q1,q4,q0 	# [0*II+13]  id:747
+    ee.ld.128.usar.xp	q1,a2,a10   	# [0*II+14]  id:748
+    ee.vld.128.ip	q0,a15,16       	# [0*II+15]  id:751
+    ee.vmulas.u8.accx.ld.ip.qup	q3,a2,16,q5,q4,q1,q2 	# [0*II+16]  id:750
+
+.LBB195_dspi_dotprod_u8_aes3:	# 0x28e
+    movi.n	a2,0                   	# [0]  
+    movi.n	a11,1                  	# [1]  
+    addi.n	a12,a7,-1              	# [2]  
+    rur.accx_0	a10                	# [3]  
+    ssl	a12                       	# [4]  
+    sll	a11,a11                   	# [5]  
+    ssr	a7                        	# [6]  
+    add.n	a10,a10,a11             	# [7]  
+    srl	a10,a10                   	# [8]  
+    s8i	a10,a4,0                  	# [9]  id:773
+    retw.n                        	# [10]  
+
+.LBB28_dspi_dotprod_u8_aes3:	# 0x2aa
+    mov.n	a15,a7                  	# [0]  
+    mov.n	a14,a6                  	# [1]  
+    mov.n	a13,a5                  	# [2]  
+    mov.n	a12,a4                  	# [3]  
+    mov.n	a11,a3                  	# [4]  
+    mov.n	a10,a2                  	# [5]  
+    call8	dspi_dotprod_u8_ansi    	# [6]  dspi_dotprod_u8_ansi
+
+    mov.n	a2,a10                  	# [0]  
+    retw.n                        	# [1]  
+
+
+#endif // dsps_dotprod_s16_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u8_ansi.c
@@ -0,0 +1,49 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Reference C 2D dot product of uint8_t data: *out_value = low 8 bits of ((sum + round) >> shift).
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of either image (x or y),
+// ESP_OK otherwise. The sum is accumulated in int32_t; 'shift' is expected to be in 0..31.
+esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift)
+{
+    if ((in_image->step_x * count_x > in_image->stride_x) ||
+            (in_image->step_y * count_y > in_image->stride_y) ||
+            (filter->step_x * count_x > filter->stride_x) ||
+            (filter->step_y * count_y > filter->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    uint8_t *i_data =  (uint8_t *)in_image->data;
+    uint8_t *f_data =  (uint8_t *)filter->data;
+    int i_step = in_image->stride_x * in_image->step_y;    // distance between two processed rows
+    int f_step = filter->stride_x * filter->step_y;
+
+    int32_t acc = 0;
+    for (int y = 0; y < count_y; y++) {
+        for (int x = 0; x < count_x; x++) {
+            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
+        }
+        i_data += i_step;
+        f_data += f_step;
+    }
+    // Round to nearest; for shift == 0 there is nothing to round and "1 << (shift - 1)" would be undefined
+    if (shift > 0) {
+        acc += 1 << (shift - 1);
+    }
+    acc >>= shift;
+    *out_value = acc;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u8_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dspi_dotprod_u8_arp4.S
@@ -0,0 +1,93 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dspi_dotprod_platform.h"
+#if (dspi_dotprod_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dspi_dotprod_u8_arp4
+    .global dspi_dotprod_u8_ansi
+    .type   dspi_dotprod_u8_arp4,@function
+
+// esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
+dspi_dotprod_u8_arp4: 
+// in_image     - a0
+// filter       - a1
+// out_value    - a2
+// count_x      - a3
+// count_y      - a4
+// shift        - a5
+// Returns a0 = 0 from the vector path; every other case is tail-called into dspi_dotprod_u8_ansi.
+// i_data       - t0   (start of the current image row)
+// f_data       - t1   (start of the current filter row)
+// i_step       - t2   (image stride_x * step_y)
+// f_step       - t3   (filter stride_x * step_y)
+// t4           - current i_data
+// t5           - current f_data
+// t6           - count_x / 16, number of 16-byte vector steps per row
+    lw t1, 4(a0) // load  in_image->step_x
+    lw t2, 4(a1) // load  filter->step_x
+    or t1, t1, t2
+    addi t1, t1, -1 // should be 0 now
+    andi t2, a3, 15
+    or   t1, t1, t2
+    bnez    t1, .dspi_dotprod_u8_arp4_ansi  // step_x != 1 or count_x is not a multiple of 16
+    blez    a3, .dspi_dotprod_u8_arp4_ansi  // count_x <= 0: nothing for the hardware loop to do
+    bgtz    a4, .dspi_dotprod_u8_arp4_body  // count_y > 0: the row loop below always runs at least once
+.dspi_dotprod_u8_arp4_ansi:
+    j   dspi_dotprod_u8_ansi
+.dspi_dotprod_u8_arp4_body:
+    add sp, sp, -16
+    lw  t0, 0(a0)   // i_data
+    lw  t1, 0(a1)   // f_data
+
+    lw  t2, 8(a0)   // step_y
+    lw  t4, 12(a0)  // stride_x
+    mul t2, t4, t2  // i_step = stride_x * step_y
+
+    lw  t3, 8(a1)   // step_y
+    lw  t5, 12(a1)  // stride_x
+    mul t3, t5, t3  // f_step = stride_x * step_y
+
+    srli t6, a3, 4  // t6 = count_x/16
+    addi    a6, a5, -1
+    li      t4, 1
+    sll     t4, t4, a6          // t4 = 1 << (shift - 1), the rounding term
+    esp.zero.xacc
+    esp.movx.w.xacc.l   t4      // preload the accumulator with the rounding term
+
+.loop_count_y:
+        mv      t4, t0
+        mv      t5, t1
+        esp.vld.128.ip  q0, t4, 16          // q0 - i_data
+
+        esp.lp.setup    0, t6, .loop_count_x
+            esp.vld.128.ip  q1, t5, 16      // q1 - f_data
+.loop_count_x:  esp.vmulas.u8.xacc.ld.ip  q0, t4, 16, q0, q1 // q0 - i_data
+
+        add     t0, t0, t2
+        add     t1, t1, t3
+        add     a4,a4, -1
+    bgtz a4, .loop_count_y
+
+    esp.srs.u.xacc       t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
+    sb  t5, 0(a2)               // out_value is uint8_t*: store the low byte only, a halfword store would overwrite the next byte
+
+    li  a0,0
+    add sp,sp,16
+    ret
+
+#endif // dspi_dotprod_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_ae32.S
@@ -0,0 +1,80 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dsps_dotprod_s16_ae32_enabled == 1)
+#include "dsps_dotprod_s16_m_ae32.S"
+#include "dsp_err_codes.h"
+
+	.text
+	.align  4
+	.global dsps_dotprod_s16_ae32
+	.type   dsps_dotprod_s16_ae32,@function
+
+
+//esp_err_t dsps_dotprod_s16_ae32(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
+dsps_dotprod_s16_ae32: 
+// src1 - a2
+// src2 - a3
+// dest - a4
+// len  - a5
+// shift - a6
+// Returns a2 = 0 on success, or ESP_ERR_DSP_INVALID_LENGTH when len < 4 (dest is not written then).
+	entry	a1, 16
+
+	// Check minimum length: the MAC macro below always processes at least 4 samples
+	movi a8, 4
+	blt a5, a8, dsps_dotprod_s16_ae32_error
+	
+	// Clear accumulator
+	movi a8, 0
+	wsr a8, acchi
+		
+	// Prepare and load round value
+	movi a8, 0x7fff
+	ssr a6
+	srl a8, a8	// a8 = 0x7fff >> shift
+	wsr a8, acclo // initialize acc with shifted round value
+
+	// Final right shift applied to acchi:acclo before the store
+	// Right shift to 16 bits
+	// RS = -shift + 15
+	neg  a6, a6 
+	addi a6, a6, 15
+	
+	/* number of loop iterations (see below):
+	 * a7 = count / 4 - 1
+	 */
+	
+	srli a7, a5, 2
+	addi a7, a7, -1
+
+	movi.n	a10, 0 // NOTE(review): a10 is not an argument of dotprod_s16_ae32_full below and is not read again in this function
+
+	dotprod_s16_ae32_full a2, a3, a7, a5	// accumulate all len products into acchi:acclo
+
+	/* Get accumulator: a2 = (acchi:acclo) >> RS, low 32 bits */
+	ssr a6
+	rsr a2, acchi
+	rsr a3, acclo
+	src a2, a2, a3
+		
+	s16i	a2, a4, 0	// store the low 16 bits to *dest
+	movi.n	a2, 0
+	retw.n
+dsps_dotprod_s16_ae32_error:
+	movi.n	a2, ESP_ERR_DSP_INVALID_LENGTH
+	retw.n
+
+#endif // dsps_dotprod_s16_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_ansi.c
@@ -0,0 +1,33 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_dotprod.h"
+
+// Reference C dot product of two int16_t arrays of len samples, accumulated in 64 bits.
+// A rounding term of 0x7fff >> shift is added first, the sum is then scaled by 2^(shift - 15)
+// and converted to int16_t when stored to *dest. Always returns ESP_OK.
+esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift)
+{
+    long long sum = 0x7fff >> shift;    // rounding term, pre-scaled by shift
+    for (int left = len; left > 0; left--) {
+        sum += (int32_t)(*src1++) * (int32_t)(*src2++);
+    }
+    int net_shift = shift - 15;
+    if (net_shift <= 0) {
+        *dest = (sum >> (-net_shift));
+    } else {
+        *dest = (sum << net_shift);
+    }
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_arp4.S
@@ -0,0 +1,74 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dsps_dotprod_s16_arp4_enabled == 1)
+#include "dsp_err_codes.h"
+
+    .text
+    .align  4
+    .global dsps_dotprod_s16_arp4
+    .global dsps_dotprod_s16_ansi
+    .type   dsps_dotprod_s16_arp4,@function
+
+//esp_err_t dsps_dotprod_s16_arp4(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
+dsps_dotprod_s16_arp4: 
+// src1 - a0
+// src2 - a1
+// dest - a2
+// len  - a3
+// shift - a4
+// Vector path needs len > 0 and len % 8 == 0 and returns a0 = 0; any other len is tail-called into dsps_dotprod_s16_ansi.
+    andi a5, a3, 7
+    bnez    a5, .dsps_dotprod_s16_arp4_ansi  // len is not a multiple of 8
+    bgtz    a3, .dsps_dotprod_s16_arp4_body  // len > 0: the loop below always runs at least once
+.dsps_dotprod_s16_arp4_ansi:
+    j   dsps_dotprod_s16_ansi
+.dsps_dotprod_s16_arp4_body:
+    add sp,sp,-16
+    // Enable unaligned data access
+    esp.movx.r.cfg t6
+    or t6, t6, 2
+    esp.movx.w.cfg t6
+
+    add t6, a4, -15
+    neg t6, t6      // t6 - real_shift = 15 - shift
+
+    li      t3, 0x7fff
+    srl     t3, t3, a4          // rounding term 0x7fff >> shift
+    esp.zero.xacc
+    esp.movx.w.xacc.l t3        // preload the accumulator with the rounding term
+
+    mv      t3, a0
+    mv      t4, a1
+    esp.vld.128.ip  q0, t3, 16  //q0 - src1
+    srli t5, a3, 3              // t5 = len>>3
+#     esp.lp.setup    0, t5, .main_loop
+#         esp.vld.128.ip  q1, t4, 16        // q1 - src1
+# .main_loop:       esp.vmulas.s16.xacc.ld.ip     q0, t3, 16, q0, q1 // q0 - src2
+    // Each pass loads the next src1 block while multiplying the current one, so 16 bytes past the end of src1 are read
+    .main_loop:        
+        esp.vld.128.ip  q1, t4, 16      // q1 - src2
+        esp.vmulas.s16.xacc.ld.ip     q0, t3, 16, q0, q1 // accumulate q0*q1, q0 - next src1 block
+        add  t5, t5, -1
+    bgtz t5, .main_loop
+
+    esp.srs.s.xacc       t5, t6 // shift accx register by real_shift amount (t6), save the lower 32bits to t5
+    sh  t5, 0(a2)               // store result to output buffer 
+
+    li  a0,0
+    add sp,sp,16
+    ret
+
+#endif // dsps_dotprod_s16_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_m_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/fixed/dsps_dotprod_s16_m_ae32.S
@@ -0,0 +1,104 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dotprod_s16_ae32 x1, x2, count
+// This macro calculates fixed point dot product for ((count + 1)*4) int16 samples
+// x1 - input array1 register (for example a2)
+// x2 - input array2 register (for example a3)
+// count - counter register (for example a7)
+// count -   samples_count / 4 - 1
+// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
+// acchi, and acclo have to be initialized before
+// Result - acchi || acclo
+// Modifies: 
+// m0, m1, m2, m3, and the x1 / x2 registers (left pointing at the last 32-bit word loaded)
+// acchi || acclo - must be loaded before (for example 0x3fff to acclo). 
+
+		/*
+		 * Data schedule. Each line represents instruction, columns represent
+		 * register contents. Last column (MUL) shows the multiplication which
+		 * takes place. Values loaded in the given cycle are shown in square brackets.
+		 *
+		 *  m0     m1     m2     m3     MUL
+		 * ---------  pre-load  ------------
+		 *[x0 x1]								(no MULs in the first 3 instructions)
+		 * x0 x1        [y0 y1]
+		 * x0 x1 [x2 x3] y0 y1
+		 * x0 x1  x2 x3  y0 y1 [y2 y3] x0*y0
+		 * ----------   loop  --------------	(the following 4 instructions are
+		 *[x4 x5] x2 x3  y0 y1  y2 y3  x1*y1     repeated as much as needed)
+		 * x4 x5  x2 x3 [y4 y5] y2 y3  x2*y2
+		 * x4 x5 [x6 x7] y4 y5  y2 y3  x3*y3
+		 * x4 x5  x6 x7  y4 y5 [y6 y7] x4*y4
+		 * ---------  finalize  ------------
+		 * x4 x5  x6 x7  y4 y5  y6 y7  x5*y5	(nothing is loaded)
+		 * x4 x5  x6 x7  y4 y5  y6 y7  x6*y6
+		 * x4 x5  x6 x7  y4 y5  y6 y7  x7*y7
+		 */
+
+		addi  \x1, \x1, -4 // ldinc increments the pointer before loading, so start one word early
+		addi  \x2, \x2, -4 // same adjustment for the second array
+		//lddec m0, \x1 
+		//lddec m2, \x2 // To arrange first pointer
+
+		ldinc m0, \x1
+		ldinc m2, \x2
+		ldinc m1, \x1
+	
+		mula.dd.ll.ldinc m3, \x2, m0, m2
+		loopnez \count, .loop_end\@	// labels carry the macro expansion counter so the macro can be expanded more than once
+		.loop\@:
+			mula.dd.hh.ldinc m0, \x1, m0, m2
+			mula.dd.ll.ldinc m2, \x2, m1, m3
+			mula.dd.hh.ldinc m1, \x1, m1, m3
+			mula.dd.ll.ldinc m3, \x2, m0, m2
+		.loop_end\@:
+	
+		mula.dd.hh m0, m2
+		mula.dd.ll m1, m3
+		mula.dd.hh m1, m3
+
+.endm // dotprod_s16_ae32
+
+
+.macro dotprod_s16_ae32_full x1, x2, count, full_count
+// This macro calculates fixed point dot product for full_count int16 samples:
+// x1 - input array1 register (for example a2)
+// x2 - input array2 register (for example a3)
+// count - counter register (for example a7)
+// count -   samples_count / 4 - 1
+// full_count - samples_count, bits 1 and 0 select the extra 2-sample and 1-sample steps
+// The first (count + 1)*4 samples are accumulated by dotprod_s16_ae32, the remainder below
+// acchi, and acclo have to be initialized before
+// Result - acchi || acclo
+// Modifies: 
+// m0, m1, m2, m3, and the x1 / x2 registers
+// acchi || acclo - must be loaded before (for example 0x3fff to acclo). 
+
+		dotprod_s16_ae32 \x1, \x2, \count
+
+		bbci  \full_count, 1, .mod2chk\@	// bit 1 clear: no 2-sample remainder
+		ldinc m0, \x1
+		ldinc m2, \x2
+		mula.dd.hh m0, m2
+		mula.dd.ll m0, m2
+	.mod2chk\@:
+		bbci  \full_count, 0, .mod1chk\@	// bit 0 clear: no single-sample remainder
+		ldinc m0, \x1
+		ldinc m2, \x2
+		mula.dd.ll m0, m2	// only the low halves are multiplied, the high halves of the loaded words are ignored
+	.mod1chk\@:
+
+.endm // dotprod_s16_ae32_full
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dspi_dotprod_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dspi_dotprod_f32_ansi.c
@@ -0,0 +1,47 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Reference C implementation of the 2D floating point dot product.
+// Multiplies count_x * count_y samples of in_image by the matching samples of filter
+// and stores the sum of the products to *out_value.
+// Samples of one row are step_x elements apart; processed rows are
+// stride_x * step_y elements apart.
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of either
+// image (in x or y), in which case *out_value is left untouched;
+// returns ESP_OK otherwise.
+esp_err_t dspi_dotprod_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y)
+{
+    if ((in_image->step_x * count_x > in_image->stride_x) ||
+            (in_image->step_y * count_y > in_image->stride_y) ||
+            (filter->step_x * count_x > filter->stride_x) ||
+            (filter->step_y * count_y > filter->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    const int image_row_step = in_image->stride_x * in_image->step_y;
+    const int filter_row_step = filter->stride_x * filter->step_y;
+    float *image_row = (float *)in_image->data;
+    float *filter_row = (float *)filter->data;
+
+    float sum = 0;
+    for (int row = 0; row < count_y; row++, image_row += image_row_step, filter_row += filter_row_step) {
+        for (int col = 0; col < count_x; col++) {
+            sum += image_row[in_image->step_x * col] * filter_row[filter->step_x * col];
+        }
+    }
+    *out_value = sum;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dspi_dotprod_off_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dspi_dotprod_off_f32_ansi.c
@@ -0,0 +1,47 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dspi_dotprod.h"
+
+// Reference C implementation of the 2D floating point dot product with offset.
+// Multiplies count_x * count_y samples of in_image by (matching filter sample + offset)
+// and stores the sum of the products to *out_value.
+// Samples of one row are step_x elements apart; processed rows are
+// stride_x * step_y elements apart.
+// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of either
+// image (in x or y), in which case *out_value is left untouched;
+// returns ESP_OK otherwise.
+esp_err_t dspi_dotprod_off_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y, float offset)
+{
+    if ((in_image->step_x * count_x > in_image->stride_x) ||
+            (in_image->step_y * count_y > in_image->stride_y) ||
+            (filter->step_x * count_x > filter->stride_x) ||
+            (filter->step_y * count_y > filter->stride_y)) {
+        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
+    }
+
+    const int image_row_step = in_image->stride_x * in_image->step_y;
+    const int filter_row_step = filter->stride_x * filter->step_y;
+    float *image_row = (float *)in_image->data;
+    float *filter_row = (float *)filter->data;
+
+    float sum = 0;
+    for (int row = 0; row < count_y; row++, image_row += image_row_step, filter_row += filter_row_step) {
+        for (int col = 0; col < count_x; col++) {
+            sum += image_row[in_image->step_x * col] * (filter_row[filter->step_x * col] + offset);
+        }
+    }
+    *out_value = sum;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_ae32.S
@@ -0,0 +1,62 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dotprod_f32_ae32_enabled == 1)
+
+#include "dsps_dotprod_f32_m_ae32.S"
+
+// This is dot product function for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_dotprod_f32_ae32
+	.global .dsps_dotprod_f32_ae32_body
+	.type   dsps_dotprod_f32_ae32,@function
+// The function implements the following C code:
+//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
+//{
+//    float acc = 0;
+//    for (int i=0 ; i< len ; i++)
+//    {
+//        acc += src1[i]*src2[i];
+//    }
+//    *dest = acc;
+//    return ESP_OK;
+//}
+
+dsps_dotprod_f32_ae32: 
+// src1 - a2
+// src2 - a3
+// dest - a4
+// len  - a5
+// Returns a2 = 0 (ESP_OK); the value of f1 is stored to the address in a4.
+	entry	a1, 16
+.dsps_dotprod_f32_ae32_body:	// exported label: dsps_dotprod_f32_aes3 jumps here after executing the same entry instruction
+	// Array increment for floating point data should be 4
+	movi.n	a8, 4
+	// Clear initial state of the result register
+	movi.n	a9, 0
+	wfr	    f1, a9
+	// a2 - input1
+	// a3 - input2
+	// a5 - length
+	// a8 - 4,  step in arrays
+	dotprod_f32_ae32 a2, a3, a5, a9, a8;
+	// NOTE(review): the macro comes from dsps_dotprod_f32_m_ae32.S (not shown here); presumably it leaves the sum in f1 - confirm
+	ssi	f1, a4, 0 // Store result from f1 to memory at a4
+	
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif // dotprod_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_aes3.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_aes3.S
@@ -0,0 +1,85 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dsps_dotprod_f32_aes3_enabled == 1)
+
+// This is dot product function for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_dotprod_f32_aes3
+	.global .dsps_dotprod_f32_ae32_body
+	.type   dsps_dotprod_f32_aes3,@function
+// The function implements the following C code:
+//esp_err_t dsps_dotprod_f32_aes3(const float* src1, const float* src2, float* dest, int len)
+//{
+//    float acc = 0;
+//    for (int i=0 ; i< len ; i++)
+//    {
+//        acc += src1[i]*src2[i];
+//    }
+//    *dest = acc;
+//    return ESP_OK;
+//}
+
+dsps_dotprod_f32_aes3: 
+// src1 - a2
+// src2 - a3
+// dest - a4
+// len  - a5
+// Returns a2 = 0 (ESP_OK) from the vector path; other inputs continue at .dsps_dotprod_f32_ae32_body.
+	entry	a1, 16
+	// Vector path requires len % 4 == 0 and both src1 and src2 aligned to 16 bytes
+	movi.n	a10, 3
+	and a10, a10, a5
+	movi.n	a9, 15
+	or  a11, a3, a2
+	and a11, a9, a11
+	or  a10, a10, a11
+	beqz  a10, .dsps_dotprod_f32_aes3_body
+	// Otherwise continue inside the Esp32 function, after its own entry instruction
+	J 	.dsps_dotprod_f32_ae32_body
+	
+.dsps_dotprod_f32_aes3_body:
+	// Clear the four partial sums f0..f3
+	movi.n	a9, 0
+	wfr	    f0, a9
+	wfr	    f1, a9
+	wfr	    f2, a9
+	wfr	    f3, a9
+	// a2 - input1
+	// a3 - input2
+	// a5 - length
+
+	srli	    a6, a5, 2		// a6 = len / 4, four floats are processed per iteration
+//	lsx	    f0, a2,  a9
+	loopnez a6, .loop_mac_end_m_ae32
+		EE.LDF.128.IP  f11, f10, f9, f8, a2, 16	// four floats from src1, a2 += 16
+		EE.LDF.128.IP  f7, f6, f5, f4, a3, 16	// four floats from src2, a3 += 16
+		madd.s	f0, f4, f8		// f0 += f4*f8
+		madd.s	f1, f5, f9		// f1 += f5*f9
+		madd.s	f2, f6, f10		// f2 += f6*f10
+		madd.s	f3, f7, f11		// f3 += f7*f11
+	.loop_mac_end_m_ae32:
+	// Fold the four partial sums into f0
+	add.s   f0, f0, f1
+	add.s   f0, f0, f2
+	add.s   f0, f0, f3
+
+	ssi	f0, a4, 0 // Store result from f0 to memory at a4
+	
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif // dsps_dotprod_f32_aes3_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_ansi.c
@@ -0,0 +1,25 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_dotprod.h"
+
+/* Portable reference: *dest = sum of src1[i] * src2[i] for i in [0, len). Always returns ESP_OK. */
+esp_err_t dsps_dotprod_f32_ansi(const float *src1, const float *src2, float *dest, int len)
+{
+    float sum = 0;
+    const float *a = src1;
+    const float *b = src2;
+    /* Walk both arrays front to back so the accumulation order is unchanged. */
+    for (int remaining = len ; remaining > 0 ; remaining--) {
+        sum += (*a) * (*b);
+        a++;
+        b++;
+    }
+    *dest = sum;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_arp4.S
@@ -0,0 +1,77 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dsps_dotprod_f32_arp4_enabled == 1)
+
+    .text
+    .align  4
+    .global dsps_dotprod_f32_arp4
+    .type   dsps_dotprod_f32_arp4,@function
+// The function implements the following C code:
+//esp_err_t dsps_dotprod_f32(const float* src1, const float* src2, float* dest, int len)
+//{
+//    float acc = 0;
+//    for (int i=0 ; i< len ; i++)
+//    {
+//        acc += src1[i]*src2[i];
+//    }
+//    *dest = acc;
+//    return ESP_OK;
+//}
+
+dsps_dotprod_f32_arp4:
+// Dot product of two contiguous float arrays: *dest = sum(src1[i] * src2[i]), i = 0..len-1
+// src1 - a0
+// src2 - a1
+// dest - a2
+// len  - a3
+// Returns 0 in a0 (the ESP_OK status of the C reference).
+// Scratch: a4, fa0, fa1, fa2 (accumulator).
+    add sp,sp,-16
+
+    fmv.w.x fa2,zero            // acc = 0.0f
+
+    // len <= 0: nothing to accumulate. Store 0.0f exactly like the C reference,
+    // without reading the inputs. Without this guard the countdown loop below
+    // would decrement a3 past zero and keep running far out of bounds.
+    blez    a3, .dotprod_f32_arp4_store
+
+    // Preload the first pair. Both loops below multiply the pair loaded by the
+    // previous step and then load the next one.
+    // NOTE: because of this look-ahead, src1[len] and src2[len] are read
+    // (and discarded) after the last multiply.
+    flw     fa0, 0(a0)
+    flw     fa1, 0(a1)
+    add     a0, a0, 4
+    add     a1, a1, 4
+    li      a4, 2
+    ble     a3, a4, .loop_less_2 
+
+// Loop when len > 2
+    // Hardware loop 0; presumably repeats the instructions up to and including
+    // the one at .dotprod_loop a3 times - confirm against the ESP32-P4 ISA docs.
+    esp.lp.setup    0, a3, .dotprod_loop
+        fmadd.s fa2, fa0, fa1, fa2      // acc += fa0 * fa1
+        flw     fa0, 0(a0)
+        flw     fa1, 0(a1)
+        add     a0, a0, 4
+.dotprod_loop:  add     a1, a1, 4
+    fsw     fa2, 0(a2)
+
+    add sp,sp,16
+    li  a0,0
+    ret
+// Loop when len <=2 (here len is 1 or 2)
+.loop_less_2:
+    fmadd.s fa2, fa0, fa1, fa2          // acc += fa0 * fa1
+    flw     fa0, 0(a0)
+    flw     fa1, 0(a1)
+    add     a0, a0, 4
+    add     a1, a1, 4
+    add     a3, a3, -1
+    bnez    a3, .loop_less_2
+.dotprod_f32_arp4_store:
+    fsw     fa2, 0(a2)                  // *dest = acc
+    add sp,sp,16
+    li  a0,0
+    ret
+
+#endif // dotprode_f32_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_m_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprod_f32_m_ae32.S
@@ -0,0 +1,42 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dotprod_f32_ae32 x1 x2 count step1  step2
+// This macro accumulates a floating point dot product over count float samples
+// x1, x2 - address registers with the base addresses of the input arrays (not modified)
+// count  - register with the amount of samples (number of loop iterations)
+// step1  - address register with the starting byte offset. The same offset is used
+//          for both x1 and x2, and it is advanced by step2 on every iteration.
+// step2  - address register with the byte increment per sample (4 for contiguous floats)
+// f1     - contains the initial value on entry
+//
+// result in f1
+// 
+// Macro body:
+// f1 += x1[offset]*x2[offset]; offset starts at step1 and grows by step2, count times
+// affected: f0, f1, f2, step1
+// Example: dotprod_f32_ae32 a2 a3 a5 a8 a9
+// a8 - start offset in bytes
+// a9 == 4, step is 4 bytes
+// a5 == 32, length of array is 32
+// NOTE: x2 is read one sample ahead: the first load happens even when count is 0,
+// and the sample following the last one is loaded (and discarded) as well.
+// NOTE: the loop end label has a fixed name, so expanding this macro more than once
+// in the same assembly unit would define the label twice.
+// 
+//    mov     \step1, \step2
+	lsx	    f0, \x2,  \step1
+//    sub	    \x1, \x1, \step1 // To compensate first increment
+	loopnez \count, .loop_mac_end_m_ae32
+		lsx    f2, \x1, \step1
+		madd.s  f1, f2, f0
+		add.n	\step1, \step1, \step2
+		lsx    f0, \x2, \step1
+	.loop_mac_end_m_ae32:
+.endm
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_ae32.S
@@ -0,0 +1,64 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dotprode_f32_ae32_enabled == 1)
+
+#include "dsps_dotprode_f32_m_ae32.S"
+
+// This is dot product function for ESP32 processor.
+	.text
+	.align  4
+	.global dsps_dotprode_f32_ae32
+	.type   dsps_dotprode_f32_ae32,@function
+// The function implements the following C code:
+//esp_err_t dsps_dotprode_f32_ae32(const float* src1, const float* src2, float* dest, int len, int step1, int step2)
+//{
+//    float acc = 0;
+//    for (int i=0 ; i< len ; i++)
+//    {
+//        acc += src1[i*step1]*src2[i*step2];
+//    }
+//    *dest = acc;
+//    return ESP_OK;
+//}
+
+dsps_dotprode_f32_ae32: 
+// Dot product of two float arrays read with independent element steps.
+// src1 - a2
+// src2 - a3
+// dest - a4
+// len  - a5
+// step1- a6 (in elements)
+// step2- a7 (in elements)
+// Returns 0 (ESP_OK) in a2.
+
+	entry	a1, 16
+	// Convert the element steps to byte steps: a float occupies 4 bytes
+	
+	slli    a6,a6, 2
+	slli    a7,a7, 2
+	// Clear initial state of the result register
+	movi.n	a9, 0
+	wfr	    f1, a9
+	// a2 - input1
+	// a3 - input2
+	// a5 - length
+	// a6,a7,  step in arrays
+	// The macro accumulates into f1 and advances a2/a3 while it runs
+	dotprode_f32_ae32 a2, a3, a5, a6, a7;
+	
+	ssi	f1, a4, 0 // Store result from f1 to memory at a4
+	
+	movi.n	a2, 0 // return status ESP_OK
+	retw.n
+
+#endif //dotprode_f32_ae32_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_ansi.c
@@ -0,0 +1,25 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dsps_dotprod.h"
+
+/* Portable reference: *dest = sum of src1[i * step1] * src2[i * step2] for i in [0, len). Always returns ESP_OK. */
+esp_err_t dsps_dotprode_f32_ansi(const float *src1, const float *src2, float *dest, int len, int step1, int step2)
+{
+    float sum = 0;
+    int i = 0;
+    while (i < len) {
+        /* Each array is sampled with its own element stride. */
+        const float a = src1[i * step1];
+        const float b = src2[i * step2];
+        sum += a * b;
+        i++;
+    }
+    *dest = sum;
+    return ESP_OK;
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_arp4.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_arp4.S
@@ -0,0 +1,78 @@
+// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+#include "dsps_dotprod_platform.h"
+#if (dsps_dotprod_f32_arp4_enabled == 1)
+
+	.text
+	.align  4
+	.global dsps_dotprode_f32_arp4
+	.type   dsps_dotprode_f32_arp4,@function
+// The function implements the following C code:
+//esp_err_t dsps_dotprode_f32(const float *src1, const float *src2, float *dest, int len, int step1, int step2)
+//{
+//    float acc = 0;
+//    for (int i = 0 ; i < len ; i++) {
+//        acc += src1[i * step1] * src2[i * step2];
+//    }
+//    *dest = acc;
+//    return ESP_OK;
+//}
+
+dsps_dotprode_f32_arp4:
+// Strided dot product: *dest = sum(src1[i*step1] * src2[i*step2]), i = 0..len-1
+// src1  - a0
+// src2  - a1
+// dest  - a2
+// len   - a3
+// step1 - a4 (in elements, converted to bytes below)
+// step2 - a5 (in elements, converted to bytes below)
+// Returns 0 in a0 (the ESP_OK status of the C reference).
+// Scratch: a6, fa0, fa1, fa2 (accumulator).
+    add	sp,sp,-16
+
+    fmv.w.x	fa2,zero           // acc = 0.0f
+
+    // len <= 0: nothing to accumulate. Store 0.0f exactly like the C reference,
+    // without reading the inputs. Without this guard the countdown loop below
+    // would decrement a3 past zero and keep running far out of bounds.
+    blez    a3, .dotprode_f32_arp4_store
+
+    slli    a4, a4, 2  // step address increment by 4
+    slli    a5, a5, 2  // step address increment by 4
+
+	// Preload the first pair. Both loops below multiply the pair loaded by the
+	// previous step and then load the next one.
+	// NOTE: because of this look-ahead, one extra element of each array
+	// (index len*step) is read and discarded after the last multiply.
+	flw		fa0, 0(a0)
+	flw		fa1, 0(a1)
+	add		a0, a0, a4
+	add		a1, a1, a5
+	li		a6, 2
+	ble		a3, a6, .loop_less_2 
+
+// Loop when len > 2
+    // Hardware loop 0; presumably repeats the instructions up to and including
+    // the one at .dotprod_loop a3 times - confirm against the ESP32-P4 ISA docs.
+    esp.lp.setup    0, a3, .dotprod_loop
+		fmadd.s fa2, fa0, fa1, fa2      // acc += fa0 * fa1
+		flw		fa0, 0(a0)
+		flw		fa1, 0(a1)
+		add		a0, a0, a4
+.dotprod_loop:	add		a1, a1, a5
+	fsw		fa2, 0(a2)
+
+    add	sp,sp,16
+    li	a0,0
+    ret
+// Loop when len <=2 (here len is 1 or 2)
+.loop_less_2:
+	fmadd.s fa2, fa0, fa1, fa2          // acc += fa0 * fa1
+	flw		fa0, 0(a0)
+	flw		fa1, 0(a1)
+	add		a0, a0, a4
+	add		a1, a1, a5
+	add		a3, a3, -1
+	bnez	a3, .loop_less_2
+
+.dotprode_f32_arp4_store:
+	fsw		fa2, 0(a2)                  // *dest = acc
+    add	sp,sp,16
+    li	a0,0
+    ret
+
+#endif // dotprode_f32_arp4_enabled
--- a/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_m_ae32.S
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/float/dsps_dotprode_f32_m_ae32.S
@@ -0,0 +1,41 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. 
+
+
+.macro dotprode_f32_ae32 x1 x2 count step1  step2
+// This macro accumulates a floating point dot product over count float samples,
+// reading each array with its own step.
+// x1, x2 - address registers pointing to the input arrays (both are modified)
+// count - register with the amount of samples (number of loop iterations)
+// step1,step2 - address registers with the step of x1 and x2 in bytes
+//               (the element step multiplied by 4)
+// f1 - contains initial value 
+//
+// result in f1
+// 
+// Macro body:
+// f1 += x1[i*step1]*x2[i*step2]; i: 0..count-1
+// affected: f0, f1, f2, x1, x2
+// Example: dotprode_f32_ae32 a2 a3 a5 a8 a9
+// a8 == 4, step is 4 bytes
+// a5 == 32, length of array is 32
+// NOTE: x1 is moved back by step1 before the loop, also when count is 0.
+// NOTE: x2 is read one sample ahead: the first load happens even when count is 0,
+// and the sample following the last one is loaded (and discarded) as well.
+// NOTE: the loop end label has a fixed name, so expanding this macro more than once
+// in the same assembly unit would define the label twice.
+// 
+	lsi	    f0, \x2, 0
+	sub	    \x1, \x1, \step1 // To compensate first increment
+	loopnez \count, .loop_mace_end_m_ae32
+		add.n	\x1, \x1, \step1
+		lsi	    f2, \x1, 0
+		madd.s	f1, f2, f0
+		add.n	\x2, \x2, \step2
+		lsi	     f0, \x2, 0
+	.loop_mace_end_m_ae32:
+.endm
--- a/managed_components/espressif__esp-dsp/modules/dotprod/include/dspi_dotprod.h
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/include/dspi_dotprod.h
@@ -0,0 +1,191 @@
+
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef _dspi_dotprod_H_
+#define _dspi_dotprod_H_
+
+#include "esp_log.h"
+#include "dsp_err.h"
+#include "dsp_types.h"
+#include "dspi_dotprod_platform.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**@{*/
+/**
+ * @brief      dot product of two images
+ * Dot product calculation for two floating point images: *out_value += image[i*...] * filter[i*...]; i = [0..count_x*count_y)
+ * The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] in_image  descriptor of the image
+ * @param[in] filter  descriptor of the filter
+ * @param[out] out_value   pointer to the output value
+ * @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
+ * @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspi_dotprod_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y);
+/**@}*/
+
+/**@{*/
+/**
+ * @brief      dot product of two images
+ * Dot product calculation for two integer images (int16_t, uint16_t, int8_t or uint8_t): *out_value += image[i*...] * filter[i*...]; i = [0..count_x*count_y)
+ * The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
+ * The extensions (_aes3) and (_arp4) are optimized for ESP32-S3 and ESP32-P4 chips.
+ *
+ * @param[in] in_image  descriptor of the image
+ * @param[in] filter  descriptor of the filter
+ * @param[out] out_value   pointer to the output value
+ * @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
+ * @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
+ * @param[in] shift - result shift to right, by default must be 15 for int16_t or 7 for int8_t
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
+
+esp_err_t dspi_dotprod_s16_aes3(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_s8_aes3(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
+
+esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
+esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
+
+
+/**@}*/
+
+/**@{*/
+/**
+ * @brief      dot product of two images with input offset
+ * Dot product calculation for two floating point images: *out_value += (image[i*...] + offset) * filter[i*...]; i = [0..count_x*count_y)
+ * The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] in_image  descriptor of the image
+ * @param[in] filter  descriptor of the filter
+ * @param[out] out_value   pointer to the output value
+ * @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
+ * @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
+ * @param[in] offset - input offset value.
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspi_dotprod_off_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y, float offset);
+/**@}*/
+
+/**@{*/
+/**
+ * @brief      dot product of two images with input offset
+ * Dot product calculation for two integer images (int16_t, uint16_t, int8_t or uint8_t): *out_value += (image[i*...] + offset) * filter[i*...]; i = [0..count_x*count_y)
+ * The extension (_ansi) uses ANSI C and can be compiled and run on any platform.
+ * The extensions (_aes3) and (_arp4) are optimized for ESP32-S3 and ESP32-P4 chips.
+ *
+ * @param[in] in_image  descriptor of the image
+ * @param[in] filter  descriptor of the filter
+ * @param[out] out_value   pointer to the output value
+ * @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
+ * @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
+ * @param[in] shift - result shift to right, by default must be 15 for int16_t or 7 for int8_t
+ * @param[in] offset - input offset value.
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
+esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
+esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
+esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
+
+esp_err_t dspi_dotprod_off_s16_aes3(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
+esp_err_t dspi_dotprod_off_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
+esp_err_t dspi_dotprod_off_s8_aes3(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
+esp_err_t dspi_dotprod_off_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
+
+esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
+esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
+esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
+esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
+
+/**@}*/
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+// Map the generic dspi_dotprod_* names onto one concrete implementation.
+// NOTE(review): CONFIG_DSP_OPTIMIZED and CONFIG_DSP_ANSI are presumably mutually
+// exclusive; if both were defined the names below would be defined twice - confirm.
+#ifdef CONFIG_DSP_OPTIMIZED
+// The float variants always use the ANSI C implementation
+#define dspi_dotprod_f32 dspi_dotprod_f32_ansi
+#define dspi_dotprod_off_f32 dspi_dotprod_off_f32_ansi
+#if (dspi_dotprod_aes3_enabled == 1)
+// Integer variants: _aes3 implementations
+#define dspi_dotprod_s16 dspi_dotprod_s16_aes3
+#define dspi_dotprod_u16 dspi_dotprod_u16_aes3
+#define dspi_dotprod_s8 dspi_dotprod_s8_aes3
+#define dspi_dotprod_u8 dspi_dotprod_u8_aes3
+#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_aes3
+#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_aes3
+#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_aes3
+#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_aes3
+#elif (dspi_dotprod_arp4_enabled == 1)
+// Integer variants: _arp4 implementations
+#define dspi_dotprod_s16 dspi_dotprod_s16_arp4
+#define dspi_dotprod_s8 dspi_dotprod_s8_arp4
+#define dspi_dotprod_u16 dspi_dotprod_u16_arp4
+#define dspi_dotprod_u8 dspi_dotprod_u8_arp4
+#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_arp4
+#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_arp4
+#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_arp4
+#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_arp4
+#else
+// No optimized integer implementation enabled: fall back to ANSI C
+#define dspi_dotprod_s16 dspi_dotprod_s16_ansi
+#define dspi_dotprod_s8 dspi_dotprod_s8_ansi
+#define dspi_dotprod_u16 dspi_dotprod_u16_ansi
+#define dspi_dotprod_u8 dspi_dotprod_u8_ansi
+#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_ansi
+#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_ansi
+#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_ansi
+#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_ansi
+#endif
+#endif
+#ifdef CONFIG_DSP_ANSI
+// ANSI build: every generic name maps to the ANSI C implementation
+#define dspi_dotprod_f32 dspi_dotprod_f32_ansi
+#define dspi_dotprod_off_f32 dspi_dotprod_off_f32_ansi
+#define dspi_dotprod_s16 dspi_dotprod_s16_ansi
+#define dspi_dotprod_s8 dspi_dotprod_s8_ansi
+#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_ansi
+#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_ansi
+#define dspi_dotprod_u16 dspi_dotprod_u16_ansi
+#define dspi_dotprod_u8 dspi_dotprod_u8_ansi
+#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_ansi
+#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_ansi
+#endif
+
+
+#endif // _dspi_dotprod_H_
--- a/managed_components/espressif__esp-dsp/modules/dotprod/include/dspi_dotprod_platform.h
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/include/dspi_dotprod_platform.h
@@ -0,0 +1,24 @@
+// Feature switches that select the optimized dspi_dotprod implementations.
+#ifndef _dspi_dotprod_platform_H_
+#define _dspi_dotprod_platform_H_
+
+#include "sdkconfig.h"
+
+#ifdef __XTENSA__
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+
+// The _aes3 implementations are enabled only for the ESP32-S3 target.
+// On other targets the macro stays undefined, which evaluates to 0 inside #if.
+#if CONFIG_IDF_TARGET_ESP32S3
+#define dspi_dotprod_aes3_enabled 1
+#endif
+#endif // __XTENSA__
+
+// The _arp4 implementations are enabled for the ESP32-P4 target,
+// and only when CONFIG_DSP_OPTIMIZED is defined.
+#if CONFIG_IDF_TARGET_ESP32P4
+#ifdef CONFIG_DSP_OPTIMIZED
+#define dspi_dotprod_arp4_enabled 1
+#else
+#define dspi_dotprod_arp4_enabled 0
+#endif // CONFIG_DSP_OPTIMIZED
+#endif // CONFIG_IDF_TARGET_ESP32P4
+
+#endif // _dspi_dotprod_platform_H_
--- a/managed_components/espressif__esp-dsp/modules/dotprod/include/dsps_dotprod.h
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/include/dsps_dotprod.h
@@ -0,0 +1,128 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _DSPI_DOTPROD_H_
+#define _DSPI_DOTPROD_H_
+
+#include "esp_log.h"
+#include "dsp_err.h"
+
+#include "dsps_dotprod_platform.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+// These functions calculates dotproduct of two vectors.
+
+/**@{*/
+/**
+ * @brief      dot product of two 16 bit vectors
+ * Dot product calculation for two signed 16 bit arrays: *dest += (src1[i] * src2[i]) >> (15-shift); i= [0..N)
+ * The extension (_ansi) use ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] src1  source array 1
+ * @param[in] src2  source array 2
+ * @param dest  destination pointer
+ * @param[in] len   length of input arrays
+ * @param[in] shift shift of the result.
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
+esp_err_t dsps_dotprod_s16_ae32(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
+esp_err_t dsps_dotprod_s16_arp4(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
+/**@}*/
+
+
+/**@{*/
+/**
+ * @brief      dot product of two float vectors
+ * Dot product calculation for two floating point arrays: *dest += (src1[i] * src2[i]); i= [0..N)
+ * The extension (_ansi) use ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] src1  source array 1
+ * @param[in] src2  source array 2
+ * @param dest  destination pointer
+ * @param[in] len   length of input arrays
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_dotprod_f32_ansi(const float *src1, const float *src2, float *dest, int len);
+esp_err_t dsps_dotprod_f32_ae32(const float *src1, const float *src2, float *dest, int len);
+esp_err_t dsps_dotprod_f32_aes3(const float *src1, const float *src2, float *dest, int len);
+esp_err_t dsps_dotprod_f32_arp4(const float *src1, const float *src2, float *dest, int len);
+/**@}*/
+
+/**@{*/
+/**
+ * @brief      dot product of two float vectors with step
+ * Dot product calculation for two floating point arrays: *dest += (src1[i*step1] * src2[i*step2]); i= [0..N)
+ * The extension (_ansi) use ANSI C and could be compiled and run on any platform.
+ * The extension (_ae32) is optimized for ESP32 chip.
+ *
+ * @param[in] src1  source array 1
+ * @param[in] src2  source array 2
+ * @param dest  destination pointer
+ * @param[in] len   length of input arrays
+ * @param[in] step1 step over elements in first array
+ * @param[in] step2 step over elements in second array
+ * @return
+ *      - ESP_OK on success
+ *      - One of the error codes from DSP library
+ */
+esp_err_t dsps_dotprode_f32_ansi(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
+esp_err_t dsps_dotprode_f32_ae32(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
+esp_err_t dsps_dotprode_f32_arp4(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
+/**@}*/
+
+#ifdef __cplusplus
+}
+#endif
+
+// Map the generic dsps_dotprod_* names onto one concrete implementation.
+#if CONFIG_DSP_OPTIMIZED
+
+// 16 bit dot product: _ae32 first, then _arp4, otherwise ANSI C
+#if (dsps_dotprod_s16_ae32_enabled == 1)
+#define dsps_dotprod_s16 dsps_dotprod_s16_ae32
+#elif (dsps_dotprod_s16_arp4_enabled == 1)
+#define dsps_dotprod_s16 dsps_dotprod_s16_arp4
+#else
+#define dsps_dotprod_s16 dsps_dotprod_s16_ansi
+#endif // dsps_dotprod_s16 selection
+
+// Float dot product: _aes3 is checked first, then _arp4, then _ae32, otherwise ANSI C.
+// With _aes3 the strided variant (dotprode) still maps to the _ae32 implementation.
+#if (dsps_dotprod_f32_aes3_enabled == 1)
+#define dsps_dotprod_f32 dsps_dotprod_f32_aes3
+#define dsps_dotprode_f32 dsps_dotprode_f32_ae32
+#elif (dsps_dotprod_f32_arp4_enabled == 1)
+#define dsps_dotprod_f32 dsps_dotprod_f32_arp4
+#define dsps_dotprode_f32 dsps_dotprode_f32_arp4
+#elif (dotprod_f32_ae32_enabled == 1)
+#define dsps_dotprod_f32 dsps_dotprod_f32_ae32
+#define dsps_dotprode_f32 dsps_dotprode_f32_ae32
+#else
+#define dsps_dotprod_f32 dsps_dotprod_f32_ansi
+#define dsps_dotprode_f32 dsps_dotprode_f32_ansi
+#endif // dsps_dotprod_f32 selection
+
+#else // CONFIG_DSP_OPTIMIZED
+// Non-optimized build: everything maps to the ANSI C implementations
+#define dsps_dotprod_s16 dsps_dotprod_s16_ansi
+#define dsps_dotprod_f32 dsps_dotprod_f32_ansi
+#define dsps_dotprode_f32 dsps_dotprode_f32_ansi
+#endif // CONFIG_DSP_OPTIMIZED
+
+#endif // _DSPI_DOTPROD_H_
--- a/managed_components/espressif__esp-dsp/modules/dotprod/include/dsps_dotprod_platform.h
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/include/dsps_dotprod_platform.h
@@ -0,0 +1,42 @@
+// Feature switches that select the optimized dsps_dotprod implementations.
+#ifndef _dsps_dotprod_platform_H_
+#define _dsps_dotprod_platform_H_
+
+#include "sdkconfig.h"
+
+#ifdef __XTENSA__
+#include <xtensa/config/core-isa.h>
+#include <xtensa/config/core-matmap.h>
+
+
+// Float _ae32 implementations are enabled when the core reports both
+// floating point (XCHAL_HAVE_FP) and loop (XCHAL_HAVE_LOOPS) support
+#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
+
+#define dotprod_f32_ae32_enabled  1
+#define dotprode_f32_ae32_enabled 1
+
+#endif // XCHAL_HAVE_FP && XCHAL_HAVE_LOOPS
+
+// The 16 bit _ae32 implementation is enabled when the core reports both
+// loop (XCHAL_HAVE_LOOPS) and MAC16 (XCHAL_HAVE_MAC16) support
+#if ((XCHAL_HAVE_LOOPS == 1) && (XCHAL_HAVE_MAC16 == 1))
+
+#define dsps_dotprod_s16_ae32_enabled 1
+
+#endif // XCHAL_HAVE_LOOPS && XCHAL_HAVE_MAC16
+#endif // __XTENSA__
+
+
+// The _aes3 implementations are enabled for the ESP32-S3 target
+#if CONFIG_IDF_TARGET_ESP32S3
+#define dsps_dotprod_s16_aes3_enabled 1
+#define dsps_dotprod_f32_aes3_enabled 1
+#endif
+
+// The _arp4 implementations are enabled for the ESP32-P4 target,
+// and only when CONFIG_DSP_OPTIMIZED is defined
+#if CONFIG_IDF_TARGET_ESP32P4
+#ifdef CONFIG_DSP_OPTIMIZED
+#define dsps_dotprod_s16_arp4_enabled 1
+#define dsps_dotprod_f32_arp4_enabled 1
+#else
+#define dsps_dotprod_s16_arp4_enabled 0
+#define dsps_dotprod_f32_arp4_enabled 0
+#endif // CONFIG_DSP_OPTIMIZED
+#endif // CONFIG_IDF_TARGET_ESP32P4
+
+
+#endif // _dsps_dotprod_platform_H_
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dotprod_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dotprod_f32.c
@@ -0,0 +1,167 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "esp_dsp.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dsps_dotprod.h"
+#include "dsp_tests.h"
+
+// Runs the dispatched dsps_dotprod_f32 for every length 1..max_N-1 on 16-byte aligned
+// inputs. z[0] and z[2] are guard values around the destination z[1]: they must stay
+// untouched, which shows that only one float is written.
+TEST_CASE("dsps_dotprod_f32_aexx functionality", "[dsps]")
+{
+    float check_value = 1235;
+    int max_N = 1024;
+    float *x = (float *)memalign(16, max_N * sizeof(float));
+    float *y = (float *)memalign(16, max_N * sizeof(float));
+    float *z = (float *)memalign(16, max_N * sizeof(float));
+    // Fail with a clear message instead of dereferencing NULL when the heap is exhausted
+    TEST_ASSERT_NOT_NULL(x);
+    TEST_ASSERT_NOT_NULL(y);
+    TEST_ASSERT_NOT_NULL(z);
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+
+    z[0] = check_value;
+    z[2] = check_value + 1;
+
+    // x is all zeros, so the result must be exactly 0 for every length
+    for (int i = 1 ; i < max_N ; i++) {
+        esp_err_t status = dsps_dotprod_f32(x, y, &z[1], i);
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        // Float comparison: the integer assert would truncate the values before comparing
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT(0, z[1]);
+    }
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 1;
+        y[i] = 3;
+    }
+    // Every product is 3, so the result must be 3 * length (exactly representable)
+    for (int i = 1 ; i < max_N ; i++) {
+        esp_err_t status = dsps_dotprod_f32(x, y, &z[1], i);
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT((float)(i * 3), z[1]);
+    }
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+// Measures the average CPU cycles of one dispatched dsps_dotprod_f32 call over
+// max_N samples and checks that it lies inside the expected range.
+TEST_CASE("dsps_dotprod_f32_aexx benchmark", "[dsps]")
+{
+    int max_N = 1024;
+    float *x = (float *)memalign(16, max_N * sizeof(float));
+    float *y = (float *)memalign(16, max_N * sizeof(float));
+    float *z = (float *)memalign(16, max_N * sizeof(float));
+    // Fail with a clear message instead of dereferencing NULL when the heap is exhausted
+    TEST_ASSERT_NOT_NULL(x);
+    TEST_ASSERT_NOT_NULL(y);
+    TEST_ASSERT_NOT_NULL(z);
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+    printf("Benchmark dsps_dotprod_f32_aexx - x=%8.8"PRIx32", y=%8.8"PRIx32", len=%8.8x\n", (uint32_t)x, (uint32_t)y, max_N);
+
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dsps_dotprod_f32(x, y, &z[1], max_N);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+
+    // Average cycles per call (each call processes max_N samples)
+    float total_b = end_b - start_b;
+    float cycles = total_b / (repeat_count);
+    printf("Benchmark dsps_dotprod_f32_aexx - %f per 1024 samples + overhead.\n", cycles);
+    float min_exec = 1024;
+    float max_exec = 6 * 1024;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+
+// Runs dsps_dotprod_f32_ansi for every length 1..max_N-1. z[0] and z[2] are guard
+// values around the destination z[1]: they must stay untouched, which shows that
+// only one float is written.
+TEST_CASE("dsps_dotprod_f32_ansi functionality", "[dsps]")
+{
+    float check_value = 1235;
+    int max_N = 1024;
+    float *x = (float *)malloc(max_N * sizeof(float));
+    float *y = (float *)malloc(max_N * sizeof(float));
+    float *z = (float *)malloc(max_N * sizeof(float));
+    // Fail with a clear message instead of dereferencing NULL when the heap is exhausted
+    TEST_ASSERT_NOT_NULL(x);
+    TEST_ASSERT_NOT_NULL(y);
+    TEST_ASSERT_NOT_NULL(z);
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+
+    z[0] = check_value;
+    z[2] = check_value + 1;
+
+    // x is all zeros, so the result must be exactly 0 for every length
+    for (int i = 1 ; i < max_N ; i++) {
+        esp_err_t status = dsps_dotprod_f32_ansi(x, y, &z[1], i);
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        // Float comparison: the integer assert would truncate the values before comparing
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT(0, z[1]);
+    }
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 1;
+        y[i] = 3;
+    }
+    // Every product is 3, so the result must be 3 * length (exactly representable)
+    for (int i = 1 ; i < max_N ; i++) {
+        esp_err_t status = dsps_dotprod_f32_ansi(x, y, &z[1], i);
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT((float)(i * 3), z[1]);
+    }
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+// Measures the average CPU cycles of one dsps_dotprod_f32_ansi call over max_N
+// samples and checks that it lies inside the expected range.
+TEST_CASE("dsps_dotprod_f32_ansi benchmark", "[dsps]")
+{
+    int max_N = 1024;
+    float *x = (float *)malloc(max_N * sizeof(float));
+    float *y = (float *)malloc(max_N * sizeof(float));
+    float *z = (float *)malloc(max_N * sizeof(float));
+    // Fail with a clear message instead of dereferencing NULL when the heap is exhausted
+    TEST_ASSERT_NOT_NULL(x);
+    TEST_ASSERT_NOT_NULL(y);
+    TEST_ASSERT_NOT_NULL(z);
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dsps_dotprod_f32_ansi(x, y, &z[1], max_N);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+
+    // Average cycles per call (each call processes max_N samples)
+    float total_b = end_b - start_b;
+    float cycles = total_b / (repeat_count);
+    printf("Benchmark dsps_dotprod_f32_ansi - %f per sample + overhead.\n", cycles);
+    float min_exec = 1024;
+    float max_exec = 20 * 1024;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dotprod_s16.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dotprod_s16.c
@@ -0,0 +1,216 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dsps_dotprod.h"
+#include "dsp_tests.h"
+
+// Functional test of dsps_dotprod_s16_ansi: status code, result value and no writes next to the output slot.
+TEST_CASE("dsps_dotprod_s16_ansi functionality", "[dsps]")
+{
+    int16_t check_value = 1235;
+    int max_N = 1024;
+    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+    // Guard values on both sides of the output slot z[1]; they must survive every call.
+    z[0] = check_value;
+    z[2] = check_value + 1;
+
+    // x is all zeros, so the result must be 0 for every tested length.
+    for (int i = 4; i < 1024; i++) {
+        esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, 0);
+        TEST_ASSERT_EQUAL(ESP_OK, status); // Unity takes (expected, actual)
+        TEST_ASSERT_EQUAL(check_value, z[0]);
+        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL(0, z[1]);
+    }
+
+    int16_t val_x = 0x080;
+    int16_t val_y = 0x100;
+    int16_t val_shift = 0;
+
+    for (int i = 0; i < max_N; i++) {
+        x[i] = val_x;
+        y[i] = val_y;
+    }
+
+    // Constant inputs, shift = 0: expected value is (sum + (0x7fff >> shift)) >> (15 - shift).
+    for (int i = 4 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, val_shift);
+
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        TEST_ASSERT_EQUAL(check_value, z[0]);
+        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL((i * (val_x * val_y) + (0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
+    }
+    val_shift = 2; // repeat the same check with a non-zero shift
+    for (int i = 4 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, val_shift);
+
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        TEST_ASSERT_EQUAL(check_value, z[0]);
+        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL(((long long)i * ((long long)val_x * (long long)val_y) + ((long long)0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
+    }
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+// Functional test of dsps_dotprod_s16 (the test name tags it "aexx"); same checks as the _ansi test above.
+TEST_CASE("dsps_dotprod_s16_aexx functionality", "[dsps]")
+{
+    int16_t check_value = 1235;
+    int max_N = 1024;
+    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+    // Guard values on both sides of the output slot z[1]; they must survive every call.
+    z[0] = check_value;
+    z[2] = check_value + 1;
+
+    // x is all zeros, so the result must be 0 for every tested length.
+    for (int i = 4 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, 0);
+        {
+            TEST_ASSERT_EQUAL(ESP_OK, status); // Unity takes (expected, actual)
+            TEST_ASSERT_EQUAL(check_value, z[0]);
+            TEST_ASSERT_EQUAL(check_value + 1, z[2]);
+            TEST_ASSERT_EQUAL(0, z[1]);
+        }
+    }
+
+    int16_t val_x = 0x080;
+    int16_t val_y = 0x100;
+    int16_t val_shift = 0;
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = val_x;
+        y[i] = val_y;
+    }
+    // Constant inputs, shift = 0: expected value is (sum + (0x7fff >> shift)) >> (15 - shift).
+    for (int i = 4 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, val_shift);
+        {
+            TEST_ASSERT_EQUAL(ESP_OK, status);
+            TEST_ASSERT_EQUAL(check_value, z[0]);
+            TEST_ASSERT_EQUAL(check_value + 1, z[2]);
+            TEST_ASSERT_EQUAL((i * (val_x * val_y) + (0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
+        }
+    }
+    val_shift = 2; // repeat the same check with a non-zero shift
+    for (int i = 4 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, val_shift);
+        {
+            TEST_ASSERT_EQUAL(ESP_OK, status);
+            TEST_ASSERT_EQUAL(check_value, z[0]);
+            TEST_ASSERT_EQUAL(check_value + 1, z[2]);
+            TEST_ASSERT_EQUAL((i * (val_x * val_y) + ((int)0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
+        }
+    }
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
+TEST_CASE("dsps_dotprod_s16 benchmark", "[dsps]")
+{
+    int n_samples = 1024;
+    // Benchmark of dsps_dotprod_s16: constant input data, only the execution time is asserted.
+    int16_t *in_a = (int16_t *)memalign(16, n_samples * sizeof(int16_t));
+    int16_t *in_b = (int16_t *)memalign(16, n_samples * sizeof(int16_t));
+    int16_t *out = (int16_t *)memalign(16, n_samples * sizeof(int16_t));
+
+    for (int k = 0; k < n_samples; ++k) {
+        in_a[k] = 0x100;
+        in_b[k] = 0x200;
+    }
+    // Measure inside a critical section so the cycle count is exact.
+
+    portENTER_CRITICAL(&testnlock);
+
+    unsigned int t_begin = dsp_get_cpu_cycle_count();
+    int runs = 1024;
+    for (int r = 0; r < runs; ++r) {
+        dsps_dotprod_s16(in_a, in_b, &out[1], 1024, 0);
+    }
+    unsigned int t_end = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+
+    // Average cycle count of one 1024-sample call.
+    float avg_cycles = (float)(t_end - t_begin) / runs;
+    printf("Benchmark dsps_dotprod_s16 - %f cycles for 1024 samples + overhead. Result = %08x\n", avg_cycles, out[1]);
+    float lower_bound = 256;
+    float upper_bound = 8 * 1024;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_bound, upper_bound, avg_cycles);
+
+    free(in_a);
+    free(in_b);
+    free(out);
+}
+
+TEST_CASE("dsps_dotprod_s16_ansi benchmark", "[dsps]")
+{
+    int max_N = 1024;
+    // Benchmark of dsps_dotprod_s16_ansi: constant input data, only the execution time is asserted.
+    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0x100;
+        y[i] = 0x200;
+    }
+    // Disable interrupts (critical section) to get an exact cycle count
+
+    portENTER_CRITICAL(&testnlock);
+
+    unsigned int start_b = dsp_get_cpu_cycle_count();
+    int repeat_count = 1024;
+    for (int i = 0 ; i < repeat_count ; i++) {
+        dsps_dotprod_s16_ansi(x, y, &z[1], 1024, 0);
+    }
+    unsigned int end_b = dsp_get_cpu_cycle_count();
+    portEXIT_CRITICAL(&testnlock);
+    // cycles = average cost of one 1024-sample call; the label names the _ansi variant actually measured.
+    float total_b = end_b - start_b;
+    float cycles = total_b / (repeat_count);
+    printf("Benchmark dsps_dotprod_s16_ansi - %f cycles for 1024 samples + overhead. Result = %08x\n", cycles, z[1]);
+    float min_exec = 1024 * 10;
+    float max_exec = 1024 * 30;
+    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
+
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dotprode_f32.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dotprode_f32.c
@@ -0,0 +1,165 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dsps_dotprod.h"
+#include "dsp_tests.h"
+
+TEST_CASE("dsps_dotprode_f32 functionality", "[dsps]")
+{
+    float check_value = 1235;
+    int max_N = 1024;
+    float *x = (float *)memalign(16, max_N * sizeof(float));
+    float *y = (float *)memalign(16, max_N * sizeof(float));
+    float *z = (float *)memalign(16, max_N * sizeof(float));
+    // Functional test of dsps_dotprode_f32: status code, result value and no writes next to the output slot.
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+    // Guard values on both sides of the output slot z[1]; they must survive every call.
+    z[0] = check_value;
+    z[2] = check_value + 1;
+    // Phase 1: x is all zeros, so the result must be 0 for every tested length.
+    for (int i = 1 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprode_f32(x, y, &z[1], i, 1, 1);
+        TEST_ASSERT_EQUAL(ESP_OK, status); // Unity takes (expected, actual)
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]); // float-aware: TEST_ASSERT_EQUAL would compare as integers
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT(0, z[1]);
+    }
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 1;
+        y[i] = 3;
+    }
+    for (int i = 1 ; i < 1024 ; i++) { // Phase 2: 1 * 3 accumulated i times must give i * 3
+        esp_err_t status = dsps_dotprode_f32(x, y, &z[1], i, 1, 1);
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT(i * 3, z[1]);
+    }
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+TEST_CASE("dsps_dotprode_f32 benchmark", "[dsps]")
+{
+    int n_samples = 1024;
+    float *in_a = (float *)memalign(16, n_samples * sizeof(float));
+    float *in_b = (float *)memalign(16, n_samples * sizeof(float));
+    float *out = (float *)memalign(16, n_samples * sizeof(float));
+    // Benchmark of dsps_dotprode_f32: constant input data, only the execution time is asserted.
+    for (int k = 0; k < n_samples; ++k) {
+        in_a[k] = 0;
+        in_b[k] = 1000;
+    }
+
+    unsigned int t_begin = dsp_get_cpu_cycle_count();
+    int runs = 1024;
+    for (int r = 0; r < runs; ++r) {
+        dsps_dotprode_f32(in_a, in_b, &out[1], 1024, 1, 1);
+    }
+    unsigned int t_end = dsp_get_cpu_cycle_count();
+
+    // Average cycle count of one 1024-sample call.
+    float avg_cycles = (float)(t_end - t_begin) / runs;
+    printf("Benchmark dsps_dotprode_f32_aexx - %f per 1024 samples + overhead.\n", avg_cycles);
+    float lower_bound = 1024;
+    float upper_bound = 6 * 1024;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_bound, upper_bound, avg_cycles);
+
+    free(in_a);
+    free(in_b);
+    free(out);
+}
+
+
+TEST_CASE("dsps_dotprode_f32_ansi functionality", "[dsps]")
+{
+    float check_value = 1235;
+    int max_N = 1024;
+    float *x = (float *)memalign(16, max_N * sizeof(float));
+    float *y = (float *)memalign(16, max_N * sizeof(float));
+    float *z = (float *)memalign(16, max_N * sizeof(float));
+    // Functional test of dsps_dotprode_f32_ansi: status code, result value and no writes next to the output slot.
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 0;
+        y[i] = 1000;
+    }
+    // Guard values on both sides of the output slot z[1]; they must survive every call.
+    z[0] = check_value;
+    z[2] = check_value + 1;
+    // Phase 1: x is all zeros, so the result must be 0 for every tested length.
+    for (int i = 1 ; i < 1024 ; i++) {
+        esp_err_t status = dsps_dotprode_f32_ansi(x, y, &z[1], i, 1, 1);
+        TEST_ASSERT_EQUAL(ESP_OK, status); // Unity takes (expected, actual)
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]); // float-aware: TEST_ASSERT_EQUAL would compare as integers
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT(0, z[1]);
+    }
+    for (int i = 0 ; i < max_N ; i++) {
+        x[i] = 1;
+        y[i] = 3;
+    }
+    for (int i = 1 ; i < 1024 ; i++) { // Phase 2: 1 * 3 accumulated i times must give i * 3
+        esp_err_t status = dsps_dotprode_f32_ansi(x, y, &z[1], i, 1, 1);
+        TEST_ASSERT_EQUAL(ESP_OK, status);
+        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
+        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
+        TEST_ASSERT_EQUAL_FLOAT(i * 3, z[1]);
+    }
+
+    free(x);
+    free(y);
+    free(z);
+}
+
+TEST_CASE("dsps_dotprode_f32_ansi benchmark", "[dsps]")
+{
+    int n_samples = 1024;
+    float *in_a = (float *)memalign(16, n_samples * sizeof(float));
+    float *in_b = (float *)memalign(16, n_samples * sizeof(float));
+    float *out = (float *)memalign(16, n_samples * sizeof(float));
+    // Benchmark of dsps_dotprode_f32_ansi: constant input data, only the execution time is asserted.
+    for (int k = 0; k < n_samples; ++k) {
+        in_a[k] = 0;
+        in_b[k] = 1000;
+    }
+
+    unsigned int t_begin = dsp_get_cpu_cycle_count();
+    int runs = 1024;
+    for (int r = 0; r < runs; ++r) {
+        dsps_dotprode_f32_ansi(in_a, in_b, &out[1], 1024, 1, 1);
+    }
+    unsigned int t_end = dsp_get_cpu_cycle_count();
+
+    // Unlike the benchmark above, the total is also divided by 1024, giving cycles per sample.
+    float cycles_per_sample = (float)(t_end - t_begin) / (1024 * runs);
+    printf("Benchmark dsps_dotprode_f32_ansi - %f per sample + overhead.\n", cycles_per_sample);
+    float lower_bound = 5;
+    float upper_bound = 25;
+    TEST_ASSERT_EXEC_IN_RANGE(lower_bound, upper_bound, cycles_per_sample);
+
+    free(in_a);
+    free(in_b);
+    free(out);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_f32_ansi.c
@@ -0,0 +1,67 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_f32_ansi";
+// Functional test of dspi_dotprod_f32_ansi: four start positions inside the same data, checked against fixed values.
+TEST_CASE("dspi_dotprod_f32_ansi functionality", "[dspi]")
+{
+    float check_value1 = 336;
+    float check_value2 = 480;
+    int max_N = 1024;
+    float *x = (float *)memalign(16, max_N * sizeof(float));
+    float *y = (float *)memalign(16, max_N * sizeof(float));
+    float *z = (float *)memalign(16, max_N * sizeof(float)); // zero-filled below, not passed to any call
+    for (size_t i = 0; i < 256; i++) {
+        x[i] = i % 8 + 1; // repeating pattern 1..8
+        y[i] = i % 8 + 1;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    float result = -1;
+    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
+    ESP_LOGI(TAG, "result 1 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value1, result); // float-aware, (expected, actual) order
+    image1.data = &x[1]; // start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
+    ESP_LOGI(TAG, "result 2 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value2, result);
+    image1.data = &x[image1.stride_x]; // start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
+    ESP_LOGI(TAG, "result 3 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // both shifts combined
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
+    ESP_LOGI(TAG, "result 4 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value2, result);
+
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_f32_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_f32_ansi.c
@@ -0,0 +1,68 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_f32_ansi";
+// Functional test of dspi_dotprod_off_f32_ansi: four start positions inside the same data, checked against fixed values.
+TEST_CASE("dspi_dotprod_off_f32_ansi functionality", "[dspi]")
+{
+    float check_value1 = 976;
+    float check_value2 = 1280;
+    float offset = 10;
+    int max_N = 1024;
+    float *x = (float *)memalign(16, max_N * sizeof(float));
+    float *y = (float *)memalign(16, max_N * sizeof(float));
+    float *z = (float *)memalign(16, max_N * sizeof(float)); // zero-filled below, not passed to any call
+    for (size_t i = 0; i < 256; i++) {
+        x[i] = i % 8 + 1; // repeating pattern 1..8
+        y[i] = i % 8 + 1;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    float result = -1;
+    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
+    ESP_LOGI(TAG, "result 1 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value1, result); // float-aware, (expected, actual) order
+    image1.data = &x[1]; // start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
+    ESP_LOGI(TAG, "result 2 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value2, result);
+    image1.data = &x[image1.stride_x]; // start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
+    ESP_LOGI(TAG, "result 3 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // both shifts combined
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
+    ESP_LOGI(TAG, "result 4 = %f", result);
+    TEST_ASSERT_EQUAL_FLOAT(check_value2, result);
+
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s16_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s16_aes3.c
@@ -0,0 +1,107 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_s16";
+// Compares dspi_dotprod_off_s16 with the dspi_dotprod_off_s16_ansi reference for 8x8 .. 32x32 windows.
+TEST_CASE("dspi_dotprod_off_s16_aexx functionality", "[dspi]")
+{
+    int shift = 2;
+    int16_t offset = 7;
+
+    int max_N = 8192;
+    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t)); // zero-filled below, not passed to any call
+    // Print the buffer addresses, then fill both inputs with the repeating pattern 0..6.
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (size_t i = 0; i < max_N; i++) {
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s16 8x8");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 8, 8, 8, 8}; // Image 8x8
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s16(&image1, &image2, &result, 8, 8, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b); // the difference is unsigned
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 8, 8, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // reference value is the expected one
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s16 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s16(&image1, &image2, &result, 16, 16, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s16 24x24");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 24, 24, 24, 24}; // Image 24x24
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s16(&image1, &image2, &result, 24, 24, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 24, 24, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s16 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s16(&image1, &image2, &result, 32, 32, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, 32, 32, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_off_s16 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s16_ansi.c
@@ -0,0 +1,69 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_s16_ansi";
+// Functional test of dspi_dotprod_off_s16_ansi: four start positions inside the same data, checked against fixed values.
+TEST_CASE("dspi_dotprod_off_s16_ansi functionality", "[dspi]")
+{
+    int16_t check_value1 = 8676;
+    int16_t check_value2 = 8742;
+    int shift = 7;
+    int16_t offset = 11;
+
+    int max_N = 1024;
+    int16_t *x = (int16_t *)malloc(max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)malloc(max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)malloc(max_N * sizeof(int16_t)); // zero-filled below, not passed to any call
+    for (size_t i = 0; i < 256; i++) {
+        x[i] = i % 8 + 255; // repeating pattern 255..262
+        y[i] = i % 8 + 255;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    int16_t result = -1;
+    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity takes (expected, actual)
+    image1.data = &x[1]; // start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x]; // start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // both shifts combined
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s8_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s8_aes3.c
@@ -0,0 +1,123 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_s8";
+// Compares dspi_dotprod_off_s8 with the dspi_dotprod_off_s8_ansi reference for several window and image sizes.
+TEST_CASE("dspi_dotprod_off_s8_aexx functionality", "[dspi]")
+{
+    int shift = 2;
+    int8_t offset = 5;
+
+    int max_N = 16384;
+    int8_t *x = (int8_t *)memalign(16, (max_N) * sizeof(int8_t));
+    int8_t *y = (int8_t *)memalign(16, (max_N) * sizeof(int8_t));
+    int8_t *z = (int8_t *)memalign(16, max_N * sizeof(int8_t)); // zero-filled below, not passed to any call
+
+    // Print the buffer addresses, then fill both inputs with the repeating pattern 0..6.
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (size_t i = 0; i < max_N; i++) {
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s8 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s8(&image1, &image2, &result, 16, 16, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b); // the difference is unsigned
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // reference value is the expected one
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s8 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s8(&image1, &image2, &result, 32, 32, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, 32, 32, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s8 48x48");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 48, 48, 48, 48}; // Image 48x48
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s8(&image1, &image2, &result, 48, 48, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, 48, 48, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s8 64x64");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 64, 64, 64, 64}; // Image 64x64
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s8(&image1, &image2, &result, 64, 64, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, 64, 64, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_s8 128x128");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128, starting at x[3]
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16 (the log label refers to image1)
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_s8(&image1, &image2, &result, 16, 16, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_off_s8 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_s8_ansi.c
@@ -0,0 +1,70 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_s8_ansi";
+
+TEST_CASE("dspi_dotprod_off_s8_ansi functionality", "[dspi]")
+{
+    // Calls dspi_dotprod_off_s8_ansi with a 4x4 count at four start offsets and compares each result with a fixed expected value.
+    int8_t check_value1 = 98;  // expected result for cases 1 and 3
+    int8_t check_value2 = 106; // expected result for cases 2 and 4
+    int shift = 7;
+    int8_t offset = 11;
+    int max_N = 1024;
+    int8_t *x = (int8_t *)malloc(max_N * sizeof(int8_t));
+    int8_t *y = (int8_t *)malloc(max_N * sizeof(int8_t));
+    int8_t *z = (int8_t *)malloc(max_N * sizeof(int8_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    for (size_t i = 0; i < 256; i++) { // only the first 256 of max_N elements are initialised
+        x[i] = i % 8 + 20;
+        y[i] = i % 8 + 20;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    int8_t result = -1;
+    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity order: (expected, actual)
+    image1.data = &x[1]; // case 2: both images start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x]; // case 3: both images start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // case 4: both images start stride_x + 1 elements later
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u16_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u16_aes3.c
@@ -0,0 +1,107 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_u16";
+
+TEST_CASE("dspi_dotprod_off_u16_aexx functionality", "[dspi]")
+{
+    // For each window size: time dspi_dotprod_off_u16 and require the same result as dspi_dotprod_off_u16_ansi on identical inputs.
+    int shift = 2;
+    uint16_t offset = 7;
+    int max_N = 8192;
+    uint16_t *x = (uint16_t *)memalign(16, max_N * sizeof(int16_t));
+    uint16_t *y = (uint16_t *)memalign(16, max_N * sizeof(int16_t));
+    uint16_t *z = (uint16_t *)memalign(16, max_N * sizeof(int16_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (size_t i = 0; i < max_N; i++) {
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u16 8x8");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64; &x[3] is 6 bytes past the 16-byte aligned buffer start
+        image2d_t image2 = {y, 1, 1, 8, 8, 8, 8}; // Image 8x8
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u16(&image1, &image2, &result, 8, 8, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b); // the difference is unsigned int, hence %u
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_off_u16_ansi(&image1, &image2, &result_ref, 8, 8, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // Unity order: (expected, actual)
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u16 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u16(&image1, &image2, &result, 16, 16, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_off_u16_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u16 24x24");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 24, 24, 24, 24}; // Image 24x24
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u16(&image1, &image2, &result, 24, 24, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_off_u16_ansi(&image1, &image2, &result_ref, 24, 24, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u16 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u16(&image1, &image2, &result, 32, 32, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_off_u16_ansi(&image1, &image2, &result_ref, 32, 32, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_off_u16 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u16_ansi.c
@@ -0,0 +1,70 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_u16_ansi";
+
+TEST_CASE("dspi_dotprod_off_u16_ansi functionality", "[dspi]")
+{
+    // Calls dspi_dotprod_off_u16_ansi with a 4x4 count at four start offsets and compares each result with a fixed expected value.
+    uint16_t check_value1 = 8676; // expected result for cases 1 and 3
+    uint16_t check_value2 = 8742; // expected result for cases 2 and 4
+    int shift = 7;
+    uint16_t offset = 11;
+    int max_N = 1024;
+    uint16_t *x = (uint16_t *)malloc(max_N * sizeof(uint16_t));
+    uint16_t *y = (uint16_t *)malloc(max_N * sizeof(uint16_t));
+    uint16_t *z = (uint16_t *)malloc(max_N * sizeof(uint16_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    for (size_t i = 0; i < 256; i++) { // only the first 256 of max_N elements are initialised
+        x[i] = i % 8 + 255;
+        y[i] = i % 8 + 255;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    uint16_t result = -1;
+    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity order: (expected, actual)
+    image1.data = &x[1]; // case 2: both images start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x]; // case 3: both images start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // case 4: both images start stride_x + 1 elements later
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u8_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u8_aes3.c
@@ -0,0 +1,122 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_u8";
+
+TEST_CASE("dspi_dotprod_off_u8_aexx functionality", "[dspi]")
+{
+    // For each window size: time dspi_dotprod_off_u8 and require the same result as dspi_dotprod_off_u8_ansi on identical inputs.
+    int shift = 2;
+    uint8_t offset = 7;
+    int max_N = 16384;
+    uint8_t *x = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
+    uint8_t *y = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
+    uint8_t *z = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (size_t i = 0; i < max_N; i++) {
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u8 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64; &x[3] is 3 bytes past the 16-byte aligned buffer start
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u8(&image1, &image2, &result, 16, 16, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b); // the difference is unsigned int, hence %u
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // Unity order: (expected, actual)
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u8 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u8(&image1, &image2, &result, 32, 32, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, 32, 32, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u8 48x48");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 48, 48, 48, 48}; // Image 48x48
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u8(&image1, &image2, &result, 48, 48, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, 48, 48, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u8 64x64");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128
+        image2d_t image2 = {y, 1, 1, 64, 64, 64, 64}; // Image 64x64
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u8(&image1, &image2, &result, 64, 64, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, 64, 64, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_off_u8 128x128");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16 (the count passed below is 16x16 as well)
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_off_u8(&image1, &image2, &result, 16, 16, shift, offset);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, 16, 16, shift, offset);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_off_u8 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_off_u8_ansi.c
@@ -0,0 +1,70 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_off_u8_ansi";
+
+TEST_CASE("dspi_dotprod_off_u8_ansi functionality", "[dspi]")
+{
+    // Calls dspi_dotprod_off_u8_ansi with a 4x4 count at four start offsets and compares each result with a fixed expected value.
+    uint8_t check_value1 = 98;  // expected result for cases 1 and 3
+    uint8_t check_value2 = 106; // expected result for cases 2 and 4
+    int shift = 7;
+    uint8_t offset = 11;
+    int max_N = 1024;
+    uint8_t *x = (uint8_t *)malloc(max_N * sizeof(uint8_t));
+    uint8_t *y = (uint8_t *)malloc(max_N * sizeof(uint8_t));
+    uint8_t *z = (uint8_t *)malloc(max_N * sizeof(uint8_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    for (size_t i = 0; i < 256; i++) { // only the first 256 of max_N elements are initialised
+        x[i] = i % 8 + 20;
+        y[i] = i % 8 + 20;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    uint8_t result = -1;
+    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity order: (expected, actual)
+    image1.data = &x[1]; // case 2: both images start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x]; // case 3: both images start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // case 4: both images start stride_x + 1 elements later
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s16_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s16_aes3.c
@@ -0,0 +1,106 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_s16";
+
+TEST_CASE("dspi_dotprod_s16_aexx functionality", "[dspi]")
+{
+    // For each window size: time dspi_dotprod_s16 and require the same result as dspi_dotprod_s16_ansi on identical inputs.
+    int shift = 2;
+    int max_N = 8192;
+    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (size_t i = 0; i < max_N; i++) {
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s16 8x8");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64; &x[3] is 6 bytes past the 16-byte aligned buffer start
+        image2d_t image2 = {y, 1, 1, 8, 8, 8, 8}; // Image 8x8
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s16(&image1, &image2, &result, 8, 8, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b); // the difference is unsigned int, hence %u
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_s16_ansi(&image1, &image2, &result_ref, 8, 8, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // Unity order: (expected, actual)
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s16 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s16(&image1, &image2, &result, 16, 16, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_s16_ansi(&image1, &image2, &result_ref, 16, 16, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s16 24x24");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 24, 24, 24, 24}; // Image 24x24
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s16(&image1, &image2, &result, 24, 24, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_s16_ansi(&image1, &image2, &result_ref, 24, 24, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s16 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        int16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s16(&image1, &image2, &result, 32, 32, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int16_t result_ref = -1;
+        dspi_dotprod_s16_ansi(&image1, &image2, &result_ref, 32, 32, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_s16 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s16_ansi.c
@@ -0,0 +1,68 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_s16_ansi";
+
+TEST_CASE("dspi_dotprod_s16_ansi functionality", "[dspi]")
+{
+    // Calls dspi_dotprod_s16_ansi with a 4x4 count at four start offsets and compares each result with a fixed expected value.
+    int16_t check_value1 = 8321; // expected result for cases 1 and 3
+    int16_t check_value2 = 8386; // expected result for cases 2 and 4
+    int shift = 7;
+    int max_N = 1024;
+    int16_t *x = (int16_t *)malloc(max_N * sizeof(int16_t));
+    int16_t *y = (int16_t *)malloc(max_N * sizeof(int16_t));
+    int16_t *z = (int16_t *)malloc(max_N * sizeof(int16_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    for (size_t i = 0; i < 256; i++) { // only the first 256 of max_N elements are initialised
+        x[i] = i % 8 + 255;
+        y[i] = i % 8 + 255;
+        z[i] = 0;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    int16_t result = -1;
+    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity order: (expected, actual)
+    image1.data = &x[1]; // case 2: both images start one element later
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x]; // case 3: both images start stride_x elements later
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1]; // case 4: both images start stride_x + 1 elements later
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s8_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s8_aes3.c
@@ -0,0 +1,121 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_s8";
+
+TEST_CASE("dspi_dotprod_s8_aexx functionality", "[dspi]")
+{
+    // For each window size: time dspi_dotprod_s8 and require the same result as dspi_dotprod_s8_ansi on identical inputs.
+    int shift = 2;
+    int max_N = 16384;
+    int8_t *x = (int8_t *)memalign(16, max_N * sizeof(int8_t));
+    int8_t *y = (int8_t *)memalign(16, max_N * sizeof(int8_t));
+    int8_t *z = (int8_t *)memalign(16, max_N * sizeof(int8_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (size_t i = 0; i < max_N; i++) {
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s8 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64; &x[3] is 3 bytes past the 16-byte aligned buffer start
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s8(&image1, &image2, &result, 16, 16, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b); // the difference is unsigned int, hence %u
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, 16, 16, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // Unity order: (expected, actual)
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s8 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s8(&image1, &image2, &result, 32, 32, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, 32, 32, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s8 48x48");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 48, 48, 48, 48}; // Image 48x48
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s8(&image1, &image2, &result, 48, 48, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, 48, 48, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s8 64x64");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128
+        image2d_t image2 = {y, 1, 1, 64, 64, 64, 64}; // Image 64x64
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s8(&image1, &image2, &result, 64, 64, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, 64, 64, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_s8 128x128");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16 (the count passed below is 16x16 as well)
+        int8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_s8(&image1, &image2, &result, 16, 16, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        int8_t result_ref = -1;
+        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, 16, 16, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_s8 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_s8_ansi.c
@@ -0,0 +1,68 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "";
+
+// Runs dspi_dotprod_s8_ansi (4, 4, shift 7) from four start offsets and checks fixed expected values.
+TEST_CASE("dspi_dotprod_s8_ansi functionality", "[dspi]")
+{
+    int8_t check_value1 = 67; // expected when data starts at [0] or [stride_x]
+    int8_t check_value2 = 73; // expected when data starts at [1] or [stride_x + 1]
+    int shift = 7;
+
+    int max_N = 1024;
+    int8_t *x = (int8_t *)malloc(max_N * sizeof(int8_t));
+    int8_t *y = (int8_t *)malloc(max_N * sizeof(int8_t));
+    TEST_ASSERT_NOT_NULL(x); // fail the test instead of writing through a NULL pointer
+    TEST_ASSERT_NOT_NULL(y);
+    for (size_t i = 0; i < 256; i++) { // only the first 256 elements are initialised
+        x[i] = i % 8 + 20;
+        y[i] = i % 8 + 20;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    int8_t result = -1; // sentinel, re-armed before every call
+    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity argument order is (expected, actual)
+    image1.data = &x[1];
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x];
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1];
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+
+    free(x);
+    free(y);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u16_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u16_aes3.c
@@ -0,0 +1,106 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_u16";
+
+// Compares dspi_dotprod_u16 with the dspi_dotprod_u16_ansi reference for four window sizes and logs cycle counts.
+TEST_CASE("dspi_dotprod_u16_aexx functionality", "[dspi]")
+{
+    int shift = 2;
+    int max_N = 8192;
+    uint16_t *x = (uint16_t *)memalign(16, max_N * sizeof(int16_t));
+    uint16_t *y = (uint16_t *)memalign(16, max_N * sizeof(int16_t));
+    uint16_t *z = (uint16_t *)memalign(16, max_N * sizeof(int16_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (int i = 0; i < max_N; i++) { // int index: avoids comparing size_t against the signed max_N
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u16 8x8");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64; data starts at x[3] (presumably to exercise unaligned input - confirm)
+        image2d_t image2 = {y, 1, 1, 8, 8, 8, 8}; // Image 8x8
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u16(&image1, &image2, &result, 8, 8, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", (end_b - start_b));
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_u16_ansi(&image1, &image2, &result_ref, 8, 8, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // Unity argument order is (expected, actual)
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u16 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u16(&image1, &image2, &result, 16, 16, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_u16_ansi(&image1, &image2, &result_ref, 16, 16, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u16 24x24");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 24, 24, 24, 24}; // Image 24x24
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u16(&image1, &image2, &result, 24, 24, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_u16_ansi(&image1, &image2, &result_ref, 24, 24, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u16 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        uint16_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u16(&image1, &image2, &result, 32, 32, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint16_t result_ref = -1;
+        dspi_dotprod_u16_ansi(&image1, &image2, &result_ref, 32, 32, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_u16 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u16_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u16_ansi.c
@@ -0,0 +1,68 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_u16_ansi";
+
+// Runs dspi_dotprod_u16_ansi (4, 4, shift 7) from four start offsets and checks fixed expected values.
+TEST_CASE("dspi_dotprod_u16_ansi functionality", "[dspi]")
+{
+    uint16_t check_value1 = 8321; // expected when data starts at [0] or [stride_x]
+    uint16_t check_value2 = 8386; // expected when data starts at [1] or [stride_x + 1]
+    int shift = 7;
+
+    int max_N = 1024;
+    uint16_t *x = (uint16_t *)malloc(max_N * sizeof(uint16_t));
+    uint16_t *y = (uint16_t *)malloc(max_N * sizeof(uint16_t));
+    TEST_ASSERT_NOT_NULL(x); // fail the test instead of writing through a NULL pointer
+    TEST_ASSERT_NOT_NULL(y);
+    for (size_t i = 0; i < 256; i++) { // only the first 256 elements are initialised
+        x[i] = i % 8 + 255;
+        y[i] = i % 8 + 255;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    uint16_t result = -1; // sentinel, re-armed before every call
+    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity argument order is (expected, actual)
+    image1.data = &x[1];
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x];
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1];
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+
+    free(x);
+    free(y);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u8_aes3.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u8_aes3.c
@@ -0,0 +1,121 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+#include <malloc.h>
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "dspi_dotprod_u8";
+
+// Compares dspi_dotprod_u8 with the dspi_dotprod_u8_ansi reference for five image/window setups and logs cycle counts.
+TEST_CASE("dspi_dotprod_u8_aexx functionality", "[dspi]")
+{
+    int shift = 2;
+    int max_N = 16384;
+    uint8_t *x = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
+    uint8_t *y = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
+    uint8_t *z = (uint8_t *)memalign(16, max_N * sizeof(uint8_t));
+    TEST_ASSERT_TRUE(x != NULL && y != NULL && z != NULL); // fail the test instead of writing through NULL
+    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
+    for (int i = 0; i < max_N; i++) { // int index: avoids comparing size_t against the signed max_N
+        x[i] = i % 7;
+        y[i] = i % 7;
+        z[i] = 0;
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u8 16x16");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64; data starts at x[3] (presumably to exercise unaligned input - confirm)
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u8(&image1, &image2, &result, 16, 16, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_u8_ansi(&image1, &image2, &result_ref, 16, 16, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result); // Unity argument order is (expected, actual)
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u8 32x32");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 32, 32, 32, 32}; // Image 32x32
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u8(&image1, &image2, &result, 32, 32, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_u8_ansi(&image1, &image2, &result_ref, 32, 32, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u8 48x48");
+        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64
+        image2d_t image2 = {y, 1, 1, 48, 48, 48, 48}; // Image 48x48
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u8(&image1, &image2, &result, 48, 48, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_u8_ansi(&image1, &image2, &result_ref, 48, 48, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u8 64x64");
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128
+        image2d_t image2 = {y, 1, 1, 64, 64, 64, 64}; // Image 64x64
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u8(&image1, &image2, &result, 64, 64, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_u8_ansi(&image1, &image2, &result_ref, 64, 64, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+    {
+        ESP_LOGI(TAG, "dspi_dotprod_u8 128x128"); // NOTE(review): label says 128x128, but the computed window is 16x16
+        image2d_t image1 = {&x[3], 1, 1, 128, 128, 128, 128}; // Image 128x128
+        image2d_t image2 = {y, 1, 1, 16, 16, 16, 16}; // Image 16x16
+        uint8_t result = -1;
+        unsigned int start_b = dsp_get_cpu_cycle_count();
+        dspi_dotprod_u8(&image1, &image2, &result, 16, 16, shift);
+        unsigned int end_b = dsp_get_cpu_cycle_count();
+        ESP_LOGI(TAG, "cycles = %i", end_b - start_b);
+        ESP_LOGI(TAG, "result 1 = %i", result);
+        uint8_t result_ref = -1;
+        dspi_dotprod_u8_ansi(&image1, &image2, &result_ref, 16, 16, shift);
+        ESP_LOGI(TAG, "result ref = %i", result_ref);
+        TEST_ASSERT_EQUAL(result_ref, result);
+    }
+
+    ESP_LOGI(TAG, "dspi_dotprod_u8 done");
+    free(x);
+    free(y);
+    free(z);
+}
--- a/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u8_ansi.c
+++ b/managed_components/espressif__esp-dsp/modules/dotprod/test/test_dspi_dotprod_u8_ansi.c
@@ -0,0 +1,68 @@
+// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include "unity.h"
+#include "dsp_platform.h"
+#include "esp_log.h"
+
+#include "dspi_dotprod.h"
+#include "dsp_tests.h"
+
+static const char *TAG = "";
+
+// Runs dspi_dotprod_u8_ansi (4, 4, shift 7) from four start offsets and checks fixed expected values.
+TEST_CASE("dspi_dotprod_u8_ansi functionality", "[dspi]")
+{
+    uint8_t check_value1 = 67; // expected when data starts at [0] or [stride_x]
+    uint8_t check_value2 = 73; // expected when data starts at [1] or [stride_x + 1]
+    int shift = 7;
+
+    int max_N = 1024;
+    uint8_t *x = (uint8_t *)malloc(max_N * sizeof(uint8_t));
+    uint8_t *y = (uint8_t *)malloc(max_N * sizeof(uint8_t));
+    TEST_ASSERT_NOT_NULL(x); // fail the test instead of writing through a NULL pointer
+    TEST_ASSERT_NOT_NULL(y);
+    for (size_t i = 0; i < 256; i++) { // only the first 256 elements are initialised
+        x[i] = i % 8 + 20;
+        y[i] = i % 8 + 20;
+    }
+    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
+    uint8_t result = -1; // sentinel, re-armed before every call
+    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 1 = %i", result);
+    TEST_ASSERT_EQUAL(check_value1, result); // Unity argument order is (expected, actual)
+    image1.data = &x[1];
+    image2.data = &y[1];
+    result = -1;
+    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 2 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+    image1.data = &x[image1.stride_x];
+    image2.data = &y[image2.stride_x];
+    result = -1;
+    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 3 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value1, result);
+    image1.data = &x[image1.stride_x + 1];
+    image2.data = &y[image2.stride_x + 1];
+    result = -1;
+    dspi_dotprod_u8_ansi(&image1, &image2, &result, 4, 4, shift);
+    ESP_LOGI(TAG, "result 4 = %i", (int)result);
+    TEST_ASSERT_EQUAL(check_value2, result);
+
+    free(x);
+    free(y);
+}