add some code

This commit is contained in:
2025-09-05 13:25:11 +08:00
parent 9ff0a99e7a
commit 3cf1229a85
8911 changed files with 2535396 additions and 0 deletions

View File

@@ -0,0 +1,75 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_3x3x1_f32_ae32_enabled == 1)
// This is a matrix multiplication function for the ESP32 processor (Xtensa, windowed ABI).
.text
.align 4
.global dspm_mult_3x3x1_f32_ae32
.type dspm_mult_3x3x1_f32_ae32,@function
// Fixed-size product C[3x1] = A[3x3] * B[3x1] on single-precision floats.
// The sizes are hard-coded: the loop count is the constant 3 and three
// elements of B are preloaded, so only the A, B and C pointers are used.
//
// The function implements the following C code with m = 3, n = 3, k = 1:
// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_3x3x1_f32_ae32:
// Register usage:
// a2 - A, pointer to the current row of A (advanced by 12 bytes per row); also the return value
// a3 - B, pointer to the 3-element vector
// a4 - C, pointer to the next output element (advanced by 4 bytes per row)
// a5 - constant 0, bit pattern used to clear the accumulator
// a6 - constant 3, number of rows processed by the hardware loop
// f13..f15 - B[0..2], loaded once before the loop
// f0 - accumulator for the current row, f2..f4 - elements of the current row of A
entry a1, 16
movi a5, 0
movi a6, 3
lsi f13,a3, 0 // B[0]
lsi f14,a3, 4 // B[1]
lsi f15,a3, 8 // B[2]
// addi a2, a2, -12 // To compensate first increment
// Zero-overhead loop: one iteration per row of A.
loopnez a6, loop_mac_3x3x1_end_m_ae32
wfr f0, a5 // f0 = 0.0f (a5 holds the all-zero bit pattern)
lsi f2, a2, 0 // A[i][0]
madd.s f0, f2, f13 // f0 += A[i][0] * B[0]
lsi f3, a2, 4 // A[i][1]
madd.s f0, f3, f14 // f0 += A[i][1] * B[1]
lsi f4, a2, 8 // A[i][2]
madd.s f0, f4, f15 // f0 += A[i][2] * B[2]
addi a2, a2, 12 // advance A to the next row (3 floats)
ssi f0, a4, 0 // C[i] = f0
addi a4, a4, 4 // advance C to the next element
loop_mac_3x3x1_end_m_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //

View File

@@ -0,0 +1,85 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_3x3x3_f32_ae32_enabled == 1)
// This is a matrix multiplication function for the ESP32 processor (Xtensa, windowed ABI).
.text
.align 4
.global dspm_mult_3x3x3_f32_ae32
.type dspm_mult_3x3x3_f32_ae32,@function
// Fixed-size product C[3x3] = A[3x3] * B[3x3] on single-precision floats,
// all matrices stored row by row (row stride 12 bytes). The sizes are
// hard-coded, so only the A, B and C pointers are used.
//
// The function implements the following generic C code with m = n = k = 3:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
// Note: the loops are interchanged compared to the C code. The outer loop
// walks the columns j of B/C and the inner hardware loop walks the rows i of A.
dspm_mult_3x3x3_f32_ae32:
// Register usage:
// a2 - A (base pointer, unchanged until the return value is written)
// a3 - B, advanced by 4 bytes per outer iteration (points at column j of row 0)
// a4 - C, advanced by 4 bytes per outer iteration (points at column j of row 0)
// a5 - constant 0, bit pattern used to clear the accumulator
// a6 - constant 3 - inner hardware loop count (rows of A)
// a7 - 3..1 - outer loop down-counter (columns of B/C)
// a12 - pointer to the current row of A inside the inner loop
// a14 - pointer to C[i][j] inside the inner loop (stride 12 bytes)
// f12..f14 - column j of B, f0 - accumulator, f2..f4 - current row of A
entry a1, 16
movi a5, 0
movi a6, 3
movi a7, 3 // outer loop count (columns)
m_loop_3x3x3:
mov a12, a2 // restart at the first row of A
mov a14, a4 // output pointer: C[0][j]
lsi f12, a3, 0 // B[0][j]
lsi f13, a3, 12 // B[1][j]
lsi f14, a3, 24 // B[2][j]
// Zero-overhead loop: one iteration per row of A.
loopnez a6, loop_mac_3x3x3_end_m_ae32
wfr f0, a5 // f0 = 0.0f
lsi f2, a12, 0 // A[i][0]
madd.s f0, f2, f12 // f0 += A[i][0] * B[0][j]
lsi f3, a12, 4 // A[i][1]
madd.s f0, f3, f13 // f0 += A[i][1] * B[1][j]
lsi f4, a12, 8 // A[i][2]
madd.s f0, f4, f14 // f0 += A[i][2] * B[2][j]
addi a12, a12, 12 // next row of A
ssi f0, a14, 0 // C[i][j] = f0
addi a14, a14, 12 // next row of C, same column
loop_mac_3x3x3_end_m_ae32:
addi a3, a3, 4 // increment input pointer B to the next column
addi a4, a4, 4 // increment output pointer C to the next column
addi a7, a7, -1 // one column done
bnez a7, m_loop_3x3x3
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //

View File

@@ -0,0 +1,77 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_4x4x1_f32_ae32_enabled == 1)
// This is a matrix multiplication function for the ESP32 processor (Xtensa, windowed ABI).
.text
.align 4
.global dspm_mult_4x4x1_f32_ae32
.type dspm_mult_4x4x1_f32_ae32,@function
// Fixed-size product C[4x1] = A[4x4] * B[4x1] on single-precision floats.
// The sizes are hard-coded: the loop count is the constant 4 and four
// elements of B are preloaded, so only the A, B and C pointers are used.
//
// The function implements the following generic C code with m = 4, n = 4, k = 1:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_4x4x1_f32_ae32:
// Register usage:
// a2 - A, pointer to the current row of A (advanced by 16 bytes per row); also the return value
// a3 - B, pointer to the 4-element vector
// a4 - C, pointer to the next output element (advanced by 4 bytes per row)
// a5 - constant 0, bit pattern used to clear the accumulator
// a6 - constant 4, number of rows processed by the hardware loop
// f12..f15 - B[0..3], loaded once before the loop
// f0 - accumulator for the current row, f2..f5 - elements of the current row of A
entry a1, 16
movi a5, 0
movi a6, 4
lsi f12,a3, 0 // B[0]
lsi f13,a3, 4 // B[1]
lsi f14,a3, 8 // B[2]
lsi f15,a3, 12 // B[3]
// Zero-overhead loop: one iteration per row of A.
loopnez a6, loop_mac_4x4x1_end_m_ae32
wfr f0, a5 // f0 = 0.0f (a5 holds the all-zero bit pattern)
lsi f2, a2, 0 // A[i][0]
madd.s f0, f2, f12 // f0 += A[i][0] * B[0]
lsi f3, a2, 4 // A[i][1]
madd.s f0, f3, f13 // f0 += A[i][1] * B[1]
lsi f4, a2, 8 // A[i][2]
madd.s f0, f4, f14 // f0 += A[i][2] * B[2]
lsi f5, a2, 12 // A[i][3]
madd.s f0, f5, f15 // f0 += A[i][3] * B[3]
addi a2, a2, 16 // advance A to the next row (4 floats)
ssi f0, a4, 0 // C[i] = f0
addi a4, a4, 4 // advance C to the next element
loop_mac_4x4x1_end_m_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //

View File

@@ -0,0 +1,88 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_4x4x4_f32_ae32_enabled == 1)
// This is a matrix multiplication function for the ESP32 processor (Xtensa, windowed ABI).
.text
.align 4
.global dspm_mult_4x4x4_f32_ae32
.type dspm_mult_4x4x4_f32_ae32,@function
// Fixed-size product C[4x4] = A[4x4] * B[4x4] on single-precision floats,
// all matrices stored row by row (row stride 16 bytes). The sizes are
// hard-coded, so only the A, B and C pointers are used.
//
// The function implements the following generic C code with m = n = k = 4:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
// Note: the loops are interchanged compared to the C code. The outer loop
// walks the columns j of B/C and the inner hardware loop walks the rows i of A.
dspm_mult_4x4x4_f32_ae32:
// Register usage:
// a2 - A (base pointer, unchanged until the return value is written)
// a3 - B, advanced by 4 bytes per outer iteration (points at column j of row 0)
// a4 - C, advanced by 4 bytes per outer iteration (points at column j of row 0)
// a5 - constant 0, bit pattern used to clear the accumulator
// a6 - constant 4 - inner hardware loop count (rows of A)
// a7 - 4..1 - outer loop down-counter (columns of B/C)
// a12 - pointer to the current row of A inside the inner loop
// a14 - pointer to C[i][j] inside the inner loop (stride 16 bytes)
// f12..f15 - column j of B, f0 - accumulator, f2..f5 - current row of A
entry a1, 16
movi a5, 0
movi a6, 4
movi a7, 4 // outer loop count (columns)
m_loop_4x4x4:
mov a12, a2 // restart at the first row of A
mov a14, a4 // output pointer: C[0][j]
lsi f12, a3, 0 // B[0][j]
lsi f13, a3, 16 // B[1][j]
lsi f14, a3, 32 // B[2][j]
lsi f15, a3, 48 // B[3][j]
// Zero-overhead loop: one iteration per row of A.
loopnez a6, loop_mac_4x4x4_end_m_ae32
wfr f0, a5 // f0 = 0.0f
lsi f2, a12, 0 // A[i][0]
madd.s f0, f2, f12 // f0 += A[i][0] * B[0][j]
lsi f3, a12, 4 // A[i][1]
madd.s f0, f3, f13 // f0 += A[i][1] * B[1][j]
lsi f4, a12, 8 // A[i][2]
madd.s f0, f4, f14 // f0 += A[i][2] * B[2][j]
lsi f5, a12, 12 // A[i][3]
madd.s f0, f5, f15 // f0 += A[i][3] * B[3][j]
addi a12, a12, 16 // next row of A
ssi f0, a14, 0 // C[i][j] = f0
addi a14, a14, 16 // next row of C, same column
loop_mac_4x4x4_end_m_ae32:
addi a3, a3, 4 // increment input pointer B to the next column
addi a4, a4, 4 // increment output pointer C to the next column
addi a7, a7, -1 // one column done
bnez a7, m_loop_4x4x4
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //

View File

@@ -0,0 +1,88 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dspm_mult_platform.h"
#if (dspm_mult_f32_ae32_enabled == 1)
#include "dsps_dotprode_f32_m_ae32.S"
// This is the matrix multiplication function with row padding for the ESP32 processor.
// C[m x k] = A[m x n] * B[n x k], where each row of A, B and C is followed by
// A_padd, B_padd and C_padd extra float elements respectively.
.text
.align 4
.global dspm_mult_ex_f32_ae32
// The body label is exported so that another routine can branch straight into
// the loop code after executing its own `entry` (the aes3 variant does this).
.global .dspm_mult_ex_f32_ae32_body
.type dspm_mult_ex_f32_ae32,@function
// The function has the following C prototype:
//esp_err_t dspm_mult_ex_f32_ae32(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd);
dspm_mult_ex_f32_ae32:
// Register usage:
// a2 - A, pointer to the current row of A; also the return value
// a3 - B
// a4 - C, pointer to the next output element
// m - a5
// n - a6
// k - a7
// a14 - A_padding, then A_step = A_padding + n (in elements)
// a15 - B_padding, then B row stride in bytes = (B_padding + k) * 4
// a8 - C_padding (in elements)
// a10 = 4, element stride in bytes inside a row of A
// a9 - counter loop1: 0..m (rows of A / C)
// a11 - counter loop2: 0..k (columns of B / C)
// a12 - working pointer into A handed to the dot-product macro
// a13 - working pointer into B handed to the dot-product macro
// f3 - constant 0.0, f1 - dot-product accumulator
entry a1, 16
// Array increment for floating point data should be 4
.dspm_mult_ex_f32_ae32_body:
// Arguments 7..9 do not fit in registers and are read from the caller's
// stack area, which starts at a1 + 16 after `entry a1, 16`.
l32i.n a14, a1, 16 // A_padding
l32i.n a15, a1, 20 // B_padding
l32i.n a8, a1, 24 // C_padding
add a14, a14, a6 // A_step = A_padding + A_cols (n)
add a15, a15, a7 // B_step = B_padding + B_cols (k)
slli a15, a15, 2 // Pointer increment for B (B_step * 4)
movi.n a10, 4 // Increment = 4
movi.n a9, 0 // counter loop1
const.s f3, 0 // Initial state of accumulator, f3 = 0
.mult_ex_loop1:
movi.n a11, 0 // reset counter for loop2
.mult_ex_loop2:
// Set up the operands of one dot product:
// a12 - start of row i of A
// a13 - address of B[0][j]
// a6 - n, number of products
// a10 - step through A == 4 bytes
// a15 - step through B == B_step * 4 bytes
mov a12, a2 // a12 = &A[i][0]
addx4 a13, a11, a3 // a13 = B + j * 4 = &B[0][j]
mov.s f1, f3 // reset f1
// Calculating dotproduct...
// The macro comes from dsps_dotprode_f32_m_ae32.S (not visible here).
// Judging by the reset above and the store below it accumulates into f1.
//dotprode_f32_ae32( x1 x2 count step1 step2)
dotprode_f32_ae32 a12, a13, a6, a10, a15;
addi.n a11, a11, 1 // Increment loop2 counter
ssip f1, a4, 4 // Store result from f1 to memory at a4 and increment a4
// check loop 2
blt a11, a7, .mult_ex_loop2
// check loop 1
addx4 a2, a14, a2 // A += (A_step << 2)
addx4 a4, a8, a4 // output += (C_padding << 2), skips the padding after the k stored values
addi.n a9, a9, 1 // Increment loop1 counter
blt a9, a5, .mult_ex_loop1
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //dspm_mult_f32_ae32_enabled

View File

@@ -0,0 +1,166 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dspm_mult_platform.h"
#if (dspm_mult_f32_aes3_enabled == 1)
// This is the matrix multiplication function with row padding for the ESP32S3 processor.
// When the sizes, paddings and pointers allow it, four columns of C are
// computed at once with 128-bit loads/stores; otherwise the routine branches
// into the body of the ESP32 implementation.
.text
.align 4
.global dspm_mult_ex_f32_aes3
.global .dspm_mult_ex_f32_ae32_body
.type dspm_mult_ex_f32_aes3,@function
// The function implements the following C code:
//esp_err_t dspm_mult_ex_f32_ansi(const float* A, const float* B, float* C, int A_rows, int A_cols, int B_cols, int A_padding, int B_padding, int C_padding)
//{
// const int A_step = A_cols + A_padding;
// const int B_step = B_cols + B_padding;
// const int C_step = B_cols + C_padding;
//
// for (int i = 0; i < A_rows; i++) {
// for (int j = 0; j < B_cols; j++) {
// C[i * C_step + j] = A[i * A_step] * B[j];
// for (int s = 1; s < A_cols; s++) {
// C[i * C_step + j] += A[i * A_step + s] * B[s * B_step + j];
// }
// }
// }
// return ESP_OK;
//}
// Register usage on entry:
// A - a2
// B - a3
// C - a4
// m - a5
// n - a6
// k - a7
// A_padd = a8
// B_padd = a9
// C_padd = a15
dspm_mult_ex_f32_aes3:
entry a1, 16
// Arguments 7..9 are read from the caller's stack area at a1 + 16.
l32i.n a8, a1, 16 // A_padding
l32i.n a9, a1, 20 // B_padding
l32i.n a15, a1, 24 // C_padding
// Check if we can use S3 memory model
// Check matrix dimensions and paddings: all of them must be divisible by 4
or a12, a5, a6 // a12 = m OR n
or a14, a8, a9 // a14 = A_padd OR B_padd
or a12, a12, a7 // a12 = m OR n OR k
or a14, a14, a15 // a14 = A_padd OR B_padd OR C_padd
or a12, a12, a14 // a12 = m OR n OR k OR A_padd OR B_padd OR C_padd
movi.n a11, 3 // a11 = mask of the two low bits
and a12, a12, a11 // a12 = a12 AND 3, non-zero if any value is not a multiple of 4
// Check alignment of A B C matrices data pointers (must be 16-byte aligned)
movi.n a11, 15 // a11 = mask of the four low bits
or a10, a3, a2 // a10 = A pointer OR B pointer
or a10, a10, a4 // a10 = A pointer OR B pointer OR C pointer
and a10, a10, a11 // a10 = a10 AND 15
or a12, a12, a10 // a12 = mat_dim OR alignment
beqz a12, .s3_mmult_ex // if zero, jump to s3_mult
// Otherwise continue in the ESP32 function body. a2..a7 still hold the
// original arguments and that body reloads the paddings from the stack.
J .dspm_mult_ex_f32_ae32_body
.s3_mmult_ex:
// f0, f1, f2, f3 - multiplication result (four adjacent columns of one row of C)
// f4, f5, f6, f7 - input for matrix B (four adjacent columns of one row of B)
// f8, f9, f10,f11- input for matrix A (four adjacent elements of one row of A)
movi.n a14, 0 // byte offset of the current 4-column tile inside a row of B
add a15, a15, a7 // a15 = k + C_padding
slli a10, a15, 2 // a10 = (K + C_padding) * 4 - C row stride in bytes
mov a15, a9 // a15 = B_padd
slli a15, a15, 2 // a15 = B_padd * 4 - start value of the outer loop counter
add a7, a7, a9 // a7 = k + B_padding
slli a12, a7, 2 // a12 = (K + B_padding) * 4 - B row stride in bytes, also the outer loop limit
srli a11, a6, 2 // a11 = n / 4
addi.n a11, a11, -1 // a11 = inner loop count: groups of 4 after the first group
slli a6, a8, 2 // a6 = A_padding * 4 - bytes to skip at the end of each row of A
mov a13, a3 // backup B pointer (a13 is reassigned at the top of the row loop)
mov a7, a4 // backup C pointer: start of the current tile in row 0 of C
// Outer loop: one iteration per tile of 4 columns (k / 4 iterations).
.loop_x_mult_ex:
movi.n a9, 0 // reset row counter
mov a8, a2 // move A matrix back to the beginning
// Row loop: one iteration per row of A / C (m iterations).
.loop_y_mult_ex:
add a13, a3, a14 // Reload Y pointer to Y11 + a14 (row 0 of B, current tile)
// First group of 4 products initialises the accumulators with mul.s.
// Xij = element of the current row of A, Yij = element of B.
EE.LDF.128.IP f11, f10, f9, f8, a8, 16 // Load A values: X11, X12, X13, X14; a8 += 16
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y11, Y12, Y13, Y14; a13 += B row stride
mul.s f0, f4, f8 // f0 = X11*Y11
mul.s f1, f5, f8 // f1 = X11*Y12
mul.s f2, f6, f8 // f2 = X11*Y13
mul.s f3, f7, f8 // f3 = X11*Y14
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y21, Y22, Y23, Y24
madd.s f0, f4, f9 // f0 = X11*Y11 + X12*Y21
madd.s f1, f5, f9 // f1 = X11*Y12 + X12*Y22
madd.s f2, f6, f9 // f2 = X11*Y13 + X12*Y23
madd.s f3, f7, f9 // f3 = X11*Y14 + X12*Y24
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y31, Y32, Y33, Y34
madd.s f0, f4, f10 // f0 = X11*Y11 + X12*Y21 + X13*Y31
madd.s f1, f5, f10 // f1 = X11*Y12 + X12*Y22 + X13*Y32
madd.s f2, f6, f10 // f2 = X11*Y13 + X12*Y23 + X13*Y33
madd.s f3, f7, f10 // f3 = X11*Y14 + X12*Y24 + X13*Y34
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y41, Y42, Y43, Y44
madd.s f0, f4, f11 // f0 = X11*Y11 + X12*Y21 + X13*Y31 + X14*Y41
madd.s f1, f5, f11 // f1 = X11*Y12 + X12*Y22 + X13*Y32 + X14*Y42
madd.s f2, f6, f11 // f2 = X11*Y13 + X12*Y23 + X13*Y33 + X14*Y43
madd.s f3, f7, f11 // f3 = X11*Y14 + X12*Y24 + X13*Y34 + X14*Y44
// Zero-overhead loop over the remaining n/4 - 1 groups of 4 products
// (skipped entirely when n == 4).
loopnez a11, .iner_loop_mult_ex
EE.LDF.128.IP f11, f10, f9, f8, a8, 16 // Load A values: X15, X16, X17, X18
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y51, Y52, Y53, Y54
madd.s f0, f4, f8 // f0 += X15*Y51
madd.s f1, f5, f8 // f1 += X15*Y52
madd.s f2, f6, f8 // f2 += X15*Y53
madd.s f3, f7, f8 // f3 += X15*Y54
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y61, Y62, Y63, Y64
madd.s f0, f4, f9 // f0 += X16*Y61
madd.s f1, f5, f9 // f1 += X16*Y62
madd.s f2, f6, f9 // f2 += X16*Y63
madd.s f3, f7, f9 // f3 += X16*Y64
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y71, Y72, Y73, Y74
madd.s f0, f4, f10 // f0 += X17*Y71
madd.s f1, f5, f10 // f1 += X17*Y72
madd.s f2, f6, f10 // f2 += X17*Y73
madd.s f3, f7, f10 // f3 += X17*Y74
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y81, Y82, Y83, Y84
madd.s f0, f4, f11 // f0 += X18*Y81
madd.s f1, f5, f11 // f1 += X18*Y82
madd.s f2, f6, f11 // f2 += X18*Y83
madd.s f3, f7, f11 // f3 += X18*Y84
.iner_loop_mult_ex:
EE.STF.128.XP f3, f2, f1, f0, a4, a10 // Store 4 results, then a4 += C row stride (next row, same tile)
addi.n a9, a9, 1 // Increment row counter
add a8, a8, a6 // skip the A padding: a8 += A_padding * 4, now at the next row of A
blt a9, a5, .loop_y_mult_ex
addi.n a7, a7, 16 // Increase C tile pointer by 16 bytes (4 columns)
mov a4, a7 // restart C at row 0 of the next tile
addi.n a14, a14, 16 // Increase B tile offset by 16 bytes (4 columns)
addi.n a15, a15, 16 // Increment outer loop counter by 16
blt a15, a12, .loop_x_mult_ex // loop while counter < (k + B_padd) * 4, i.e. k / 4 tiles
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //dspm_mult_f32_aes3_enabled

View File

@@ -0,0 +1,57 @@
/*
* SPDX-FileCopyrightText: 2023 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include "dspm_mult.h"
// Matrix A(m,n): m - number of rows, n - number of columns
// C(m,k) = A(m,n)*B(n,k)
// c(i * c_step,j) = sum(a(i * a_step,s)*b(s * b_step,j)) , s=1..n
esp_err_t dspm_mult_ex_f32_ansi(const float *A, const float *B, float *C, int A_rows, int A_cols, int B_cols, int A_padding, int B_padding, int C_padding)
{
    /* All three buffers are mandatory. */
    if ((A == NULL) || (B == NULL) || (C == NULL)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    /* Every dimension has to be strictly positive. */
    if ((A_rows <= 0) || (A_cols <= 0) || (B_cols <= 0)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    /* Paddings may be zero but never negative. */
    if ((A_padding < 0) || (B_padding < 0) || (C_padding < 0)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    /* Distance, in elements, between the starts of two consecutive rows. */
    const int a_stride = A_cols + A_padding;
    const int b_stride = B_cols + B_padding;
    const int c_stride = B_cols + C_padding;

    int row = 0;
    while (row < A_rows) {
        int col = 0;
        while (col < B_cols) {
            const float *a_row = &A[row * a_stride];
            float *out = &C[row * c_stride + col];

            /* The first product initialises the output element ... */
            *out = a_row[0] * B[col];
            /* ... the remaining products are accumulated directly in C. */
            for (int idx = 1; idx < A_cols; ++idx) {
                *out += a_row[idx] * B[idx * b_stride + col];
            }
            ++col;
        }
        ++row;
    }
    return ESP_OK;
}

View File

@@ -0,0 +1,115 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_f32_arp4_enabled == 1)
// This is the matrix multiplication function with row padding, RISC-V implementation
// (arp4 variant; presumably for the ESP32-P4 - TODO confirm the target name).
.text
.align 4
.global dspm_mult_ex_f32_arp4
.global .dspm_mult_ex_f32_arp4_body
.type dspm_mult_ex_f32_arp4,@function
// C[m x k] = A[m x n] * B[n x k], where each row of A, B and C is followed by
// A_padd, B_padd and C_padd extra float elements respectively.
// Prototype:
// esp_err_t dspm_mult_ex_f32_arp4(const float *A, const float *B, float *C, int m, int n, int k, int A_padd, int B_padd, int C_padd)
//
// The code below is the padded form of this generic loop nest (here the row
// strides are n + A_padd, k + B_padd and k + C_padd instead of n, k and k):
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_ex_f32_arp4:
// Register usage (RISC-V names):
// a0 - A, pointer to the current row of A; also the return value
// a1 - B
// a2 - C, pointer to the next output element
// a3 - m
// a4 - n
// a5 - k
// a6 - A_padd on entry, later C_padd * 4 (bytes)
// a7 - B_padd on entry, later counter loop1: 0..m
// t1 - counter loop2: 0..k
// t2 - working pointer into A
// t3 - working pointer into B
// t4 - A row stride in bytes = (A_padd + n) * 4
// t5 - B row stride in bytes = (B_padd + k) * 4
// fa0, fa1 - operands, fa2 - dot-product accumulator
add sp,sp,-16
// Array increment for floating point data should be 4
.dspm_mult_ex_f32_arp4_body:
mv t5, a7 // t5 = B_padding (a7 is reused as a counter below)
add t4, a6, a4 // A_step = A_padding + A_cols (n)
add t5, t5, a5 // B_step = B_padding + B_cols (k)
slli t5, t5, 2 // Pointer increment for B (B_step * 4)
slli t4, t4, 2 // A_step << 2
lw a6, 16(sp) // C_padding, the 9th argument, from the caller's stack (sp was lowered by 16 above)
slli a6, a6, 2 // C_padding << 2
li a7, 0 // counter loop1
.dpf_loop1:
li t1, 0 // reset counter for loop2
.dpf_loop2:
// Set up one dot product of row i of A with column j of B:
// t2 - start of row i of A, stepped by 4 bytes
// t3 - address of B[0][j], stepped by t5 bytes
// a4 - n, number of products
mv t2, a0 // t2 = &A[i][0]
slli t3, t1, 2 // loop count to byte offset
add t3, a1, t3 // t3 = &B[0][j]
fmv.w.x fa2,zero // reset fa2
// Calculating dotproduct...
// NOTE(review): esp.lp.setup is a vendor hardware-loop instruction. It
// presumably repeats the instructions up to and including the one at
// .matrix_mul_loop a4 (= n) times - confirm against the ISA documentation.
esp.lp.setup 0, a4, .matrix_mul_loop
flw fa0, 0(t2) // fa0 = A[i][s]
add t2, t2, 4 // next element of the A row
flw fa1, 0(t3) // fa1 = B[s][j]
fmadd.s fa2, fa1, fa0, fa2 // fa2 += fa1 * fa0
.matrix_mul_loop: add t3, t3, t5 // next row of B, same column
fsw fa2, 0(a2) // C[i][j] = fa2
addi a2, a2, 4 // increment a2 for next time
// check loop 2
addi t1, t1, 1 // Increment loop2 counter
blt t1, a5, .dpf_loop2
// check loop 1
add a0, a0, t4 // A += (A_step << 2)
add a2, a2, a6 // output += (C_padding << 2), skips the padding after the k stored values
add a7, a7, 1 // Increment loop1 counter
blt a7, a3, .dpf_loop1
// Exit
li a0, 0 // return status ESP_OK
add sp,sp,16
ret
#endif //dspm_mult_ex_f32_arp4_enabled

View File

@@ -0,0 +1,104 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_f32_ae32_enabled == 1)
#include "dsps_dotprode_f32_m_ae32.S"
// This is a matrix multiplication function for the ESP32 processor (Xtensa, windowed ABI).
.text
.align 4
.global dspm_mult_f32_ae32
// The body label is exported so that another routine can branch straight into
// the loop code after executing its own `entry` (the aes3 variant does this).
.global .dspm_mult_f32_ae32_body
.type dspm_mult_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_f32_ae32:
// Register usage:
// a2 - A, pointer to the current row of A; also the return value
// a3 - B
// a4 - C, pointer to the next output element
// m - a5
// n - a6
// k - a7
// a8 = n*4, row stride of A in bytes
// a15 = k*4, row stride of B in bytes
// a10 = 4, element stride in bytes inside a row of A
// a14 = 0, bit pattern used to clear the accumulator f1
// a9 - counter loop1: 0..m
// a11 - counter loop2: 0..k
// a12 - working pointer into A handed to the dot-product macro
// a13 - working pointer into B handed to the dot-product macro
entry a1, 16
// Array increment for floating point data should be 4
.dspm_mult_f32_ae32_body:
slli a8, a6, 2 // Pointer increment for A
slli a15,a7, 2 // Pointer increment for B
movi.n a14, 0 // Initial state of accumulator f1
movi.n a10, 4 // Increment = 4
movi.n a9, 0 // counter loop1
.dpf_loop1:
movi.n a11, 0 // reset counter for loop2
.dpf_loop2:
// Set up the operands of one dot product:
// a12 - start of row i of A
// a13 - address of B[0][j]
// a6 - n, number of products
// a10 - step through A == 4 bytes
// a15 - step through B == k*4 bytes
mov a12, a2 // a12 = &A[i][0]
slli a13, a11, 2 // loop count to byte offset
add.n a13, a3, a13 // a13 = &B[0][j]
wfr f1, a14 // reset f1
// Calculating dotproduct...
// The macro comes from dsps_dotprode_f32_m_ae32.S (not visible here).
// Judging by the reset above and the store below it accumulates into f1.
dotprode_f32_ae32 a12, a13, a6, a10, a15;
ssi f1, a4, 0 // Store result from f1 to memory at a4
addi a4, a4, 4 // increment a4 for next time
// check loop 2
addi a11, a11, 1 // Increment loop2 counter
blt a11, a7, .dpf_loop2
// check loop 1
add.n a2, a2, a8 // Increment A, A = A[i*n]
addi a9, a9, 1 // Increment loop1 counter
blt a9, a5, .dpf_loop1
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //dspm_mult_f32_ae32_enabled

View File

@@ -0,0 +1,150 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_f32_aes3_enabled == 1)
// This is a matrix multiplication function using the 128-bit EE.* load/store
// instructions (aes3 variant; presumably for the ESP32-S3 - TODO confirm).
// When the sizes and pointers allow it, four columns of C are computed at
// once; otherwise the routine branches into the body of the ESP32 implementation.
.text
.align 4
.global dspm_mult_f32_aes3
.global .dspm_mult_f32_ae32_body
.type dspm_mult_f32_aes3,@function
// The function implements the following C code:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_f32_aes3:
entry a1, 16
// A - a2
// B - a3
// C - a4
// m - a5
// n - a6
// k - a7
// Check if we can use S3 memory model:
// m, n and k must all be multiples of 4 ...
or a12, a5, a6
or a12, a7, a12
movi.n a11, 3
and a12, a12, a11 // non-zero if any dimension is not a multiple of 4
// ... and A, B and C must all be 16-byte aligned.
movi.n a11, 15
or a10, a3, a2
or a10, a10, a4
and a10, a10, a11 // non-zero if any pointer is not 16-byte aligned
or a12, a12, a10
beqz a12, .s3_mmult
// Otherwise continue in the ESP32 function body; a2..a7 still hold the
// original arguments.
J .dspm_mult_f32_ae32_body
.s3_mmult:
// f0, f1, f2, f3 - multiplication result (four adjacent columns of one row of C)
// f4, f5, f6, f7 - input for matrix B (four adjacent columns of one row of B)
// f8, f9, f10,f11- input for matrix A (four adjacent elements of one row of A)
movi.n a14, 0 // byte offset of the current 4-column tile inside a row of B
slli a12, a7, 2 // a12 = K*4 - B row stride in bytes, also the outer loop limit
slli a10, a7, 2 // a10 = K*4 - C row stride in bytes
srli a11, a6, 2 // N count: n / 4 groups of 4 products
addi.n a11, a11, -1 // the first group is handled outside the hardware loop
movi.n a15, 0 // outer loop counter in bytes
mov a13, a3 // (a13 is reassigned at the top of the row loop)
mov a7, a4 // a7 = start of the current tile in row 0 of C
// Outer loop: one iteration per tile of 4 columns (k / 4 iterations).
.loop_x_aes3:
movi.n a9, 0 // reset row counter
mov a8, a2 // A matrix, back to the first row
// Row loop: one iteration per row of A / C (m iterations). Rows of A are
// contiguous, so a8 simply runs on into the next row.
.loop_y_aes3:
add a13, a3, a14 // Reload Y pointer to Y11 + a14 (row 0 of B, current tile)
// First group of 4 products initialises the accumulators with mul.s.
// Xij = element of the current row of A, Yij = element of B.
EE.LDF.128.IP f11, f10, f9, f8, a8, 16 // Load A values: X11, X12, X13, X14; a8 += 16
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y11, Y12, Y13, Y14; a13 += B row stride
mul.s f0, f4, f8 // f0 = X11*Y11
mul.s f1, f5, f8 // f1 = X11*Y12
mul.s f2, f6, f8 // f2 = X11*Y13
mul.s f3, f7, f8 // f3 = X11*Y14
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y21, Y22, Y23, Y24
madd.s f0, f4, f9 // f0 = X11*Y11 + X12*Y21
madd.s f1, f5, f9 // f1 = X11*Y12 + X12*Y22
madd.s f2, f6, f9 // f2 = X11*Y13 + X12*Y23
madd.s f3, f7, f9 // f3 = X11*Y14 + X12*Y24
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y31, Y32, Y33, Y34
madd.s f0, f4, f10 // f0 = X11*Y11 + X12*Y21 + X13*Y31
madd.s f1, f5, f10 // f1 = X11*Y12 + X12*Y22 + X13*Y32
madd.s f2, f6, f10 // f2 = X11*Y13 + X12*Y23 + X13*Y33
madd.s f3, f7, f10 // f3 = X11*Y14 + X12*Y24 + X13*Y34
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y41, Y42, Y43, Y44
madd.s f0, f4, f11 // f0 = X11*Y11 + X12*Y21 + X13*Y31 + X14*Y41
madd.s f1, f5, f11 // f1 = X11*Y12 + X12*Y22 + X13*Y32 + X14*Y42
madd.s f2, f6, f11 // f2 = X11*Y13 + X12*Y23 + X13*Y33 + X14*Y43
madd.s f3, f7, f11 // f3 = X11*Y14 + X12*Y24 + X13*Y34 + X14*Y44
// Zero-overhead loop over the remaining n/4 - 1 groups of 4 products
// (skipped entirely when n == 4).
loopnez a11, .loop_end_m_aes3
EE.LDF.128.IP f11, f10, f9, f8, a8, 16 // Load A values: X15, X16, X17, X18
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y51, Y52, Y53, Y54
madd.s f0, f4, f8 // f0 += X15*Y51
madd.s f1, f5, f8 // f1 += X15*Y52
madd.s f2, f6, f8 // f2 += X15*Y53
madd.s f3, f7, f8 // f3 += X15*Y54
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y61, Y62, Y63, Y64
madd.s f0, f4, f9 // f0 += X16*Y61
madd.s f1, f5, f9 // f1 += X16*Y62
madd.s f2, f6, f9 // f2 += X16*Y63
madd.s f3, f7, f9 // f3 += X16*Y64
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y71, Y72, Y73, Y74
madd.s f0, f4, f10 // f0 += X17*Y71
madd.s f1, f5, f10 // f1 += X17*Y72
madd.s f2, f6, f10 // f2 += X17*Y73
madd.s f3, f7, f10 // f3 += X17*Y74
EE.LDF.128.XP f7, f6, f5, f4, a13, a12 // Load B value: Y81, Y82, Y83, Y84
madd.s f0, f4, f11 // f0 += X18*Y81
madd.s f1, f5, f11 // f1 += X18*Y82
madd.s f2, f6, f11 // f2 += X18*Y83
madd.s f3, f7, f11 // f3 += X18*Y84
.loop_end_m_aes3:
EE.STF.128.XP f3, f2, f1, f0, a4, a10 // Store 4 results, then a4 += C row stride (next row, same tile)
addi a9, a9, 1 // Increment row counter
blt a9, a5, .loop_y_aes3
addi.n a7, a7, 16 // next tile of C: 4 columns further
mov a4, a7 // restart C at row 0 of the next tile
addi.n a14, a14, 16 // B shift for 4 columns
addi a15, a15, 16 // Increment outer loop counter by 16 bytes
blt a15, a12, .loop_x_aes3 // loop while counter < k * 4, i.e. k / 4 tiles
movi.n a2, 0 // return status ESP_OK
retw.n
#endif //dspm_mult_f32_aes3_enabled

View File

@@ -0,0 +1,33 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dsps_dotprod.h"
#include "dspm_mult.h"
// Matrix A(m,n): m - number of rows, n - number of columns
// C(m,k) = A(m,n)*B(n,k)
// c(i,j) = sum(a(i,s)*b(s,j)) , s=1..n
esp_err_t dspm_mult_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k)
{
    /*
     * Plain reference product C(m,k) = A(m,n) * B(n,k) on densely packed
     * matrices stored row by row. No argument checking is performed.
     */
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < k) {
            const float *a_row = &A[row * n];
            float *out = &C[row * k + col];

            /* The first product initialises the output element ... */
            *out = a_row[0] * B[col];
            /* ... the remaining n - 1 products are accumulated directly in C. */
            for (int idx = 1; idx < n; ++idx) {
                *out += a_row[idx] * B[idx * k + col];
            }
            ++col;
        }
        ++row;
    }
    return ESP_OK;
}

View File

@@ -0,0 +1,109 @@
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "dspm_mult_platform.h"
#if (dspm_mult_f32_arp4_enabled == 1)
// This is a matrix multiplication function, RISC-V implementation
// (arp4 variant; presumably for the ESP32-P4 - TODO confirm the target name).
.text
.align 4
.global dspm_mult_f32_arp4
.global .dspm_mult_f32_arp4_body
.type dspm_mult_f32_arp4,@function
// The function implements the following C code:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_f32_arp4:
// Register usage (RISC-V names):
// a0 - A, pointer to the current row of A; also the return value
// a1 - B
// a2 - C, pointer to the next output element
// a3 - m
// a4 - n
// a5 - k
// a6 - n*4, row stride of A in bytes
// a7 - counter loop1: 0..m
// t0 - 4, element stride in bytes inside a row of A
// t1 - counter loop2: 0..k
// t2 - working pointer into A
// t3 - working pointer into B
// t5 - k*4, row stride of B in bytes
// fa0, fa1 - operands, fa2 - dot-product accumulator
    add sp,sp,-16
// Array increment for floating point data should be 4
.dspm_mult_f32_arp4_body:
    slli a6, a4, 2 // Pointer increment for A
    slli t5,a5, 2 // Pointer increment for B
    li t0, 4 // Increment = 4
    li a7, 0 // counter loop1
.dpf_loop1:
    li t1, 0 // reset counter for loop2
.dpf_loop2:
// Set up one dot product of row i of A with column j of B:
// t2 - start of row i of A, stepped by t0 (4) bytes
// t3 - address of B[0][j], stepped by t5 (k*4) bytes
// a4 - n, number of products
    mv t2, a0 // t2 = &A[i][0]
    slli t3, t1, 2 // loop count to byte offset
    add t3, a1, t3 // t3 = &B[0][j]
    fmv.w.x fa2,zero // reset fa2
// Calculating dotproduct...
// NOTE(review): esp.lp.setup is a vendor hardware-loop instruction. It
// presumably repeats the instructions up to and including the one at
// .matrix_mul_loop a4 (= n) times - confirm against the ISA documentation.
    esp.lp.setup 0, a4, .matrix_mul_loop
    flw fa0, 0(t2) // fa0 = A[i][s]
    add t2, t2, t0 // next element of the A row
    flw fa1, 0(t3) // fa1 = B[s][j]
    fmadd.s fa2, fa1, fa0, fa2 // fa2 += fa1 * fa0
.matrix_mul_loop: add t3, t3, t5 // next row of B, same column
    fsw fa2, 0(a2) // C[i][j] = fa2
    addi a2, a2, 4 // increment a2 for next time
// check loop 2
    addi t1, t1, 1 // Increment loop2 counter
    blt t1, a5, .dpf_loop2
// check loop 1
    add a0, a0, a6 // Increment A, A = A[i*n]
    add a7, a7, 1 // Increment loop1 counter
    blt a7, a3, .dpf_loop1
// Exit
// The status must be a literal 0: a6 holds n*4 here, so copying it into a0
// would report a non-zero error code for every successful call.
    li a0, 0 // return status ESP_OK
    add sp,sp,16
    ret
#endif //dspm_mult_f32_arp4_enabled