add some code
This commit is contained in:
@@ -0,0 +1,90 @@
|
||||
// Copyright 2018-2022 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dsp_common_H_
|
||||
#define _dsp_common_H_
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include "dsp_err.h"
|
||||
#include "esp_idf_version.h"
|
||||
|
||||
#if defined(__XTENSA__) || defined(__riscv)
|
||||
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
|
||||
#include "esp_cpu.h"
|
||||
#else
|
||||
#include "soc/cpu.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief check power of two
|
||||
* The function check if the argument is power of 2.
|
||||
* The implementation use ANSI C and could be compiled and run on any platform
|
||||
*
|
||||
* @return
|
||||
* - true if x is power of two
|
||||
* - false if no
|
||||
*/
|
||||
bool dsp_is_power_of_two(int x);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Power of two
|
||||
* The function return power of 2 for values 2^N.
|
||||
* The implementation use ANSI C and could be compiled and run on any platform
|
||||
*
|
||||
* @return
|
||||
* - power of two
|
||||
*/
|
||||
int dsp_power_of_two(int x);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Logging for esp32s3 TIE core
|
||||
* Registers covered q0 to q7, ACCX and SAR_BYTE
|
||||
*
|
||||
* @param n_regs: number of registers to be logged at once
|
||||
* @param ...: register codes 0, 1, 2, 3, 4, 5, 6, 7, 'a', 's'
|
||||
*
|
||||
* @return ESP_OK
|
||||
*
|
||||
*/
|
||||
esp_err_t tie_log(int n_regs, ...);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// esp_cpu_get_ccount function is implemented in IDF 4.1 and later
|
||||
#if defined(__XTENSA__) || defined(__riscv)
|
||||
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(5, 0, 0)
|
||||
#define dsp_get_cpu_cycle_count esp_cpu_get_cycle_count
|
||||
#else
|
||||
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 1, 0)
|
||||
#define dsp_get_cpu_cycle_count esp_cpu_get_ccount
|
||||
#else
|
||||
#define dsp_get_cpu_cycle_count xthal_get_ccount
|
||||
#endif
|
||||
#endif // ESP_IDF_VERSION
|
||||
#else
|
||||
// Linux Target
|
||||
#include <x86intrin.h>
|
||||
#define dsp_get_cpu_cycle_count __rdtsc
|
||||
#endif
|
||||
#endif // _dsp_common_H_
|
||||
@@ -0,0 +1,23 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef _DSP_ERR_H_
|
||||
#define _DSP_ERR_H_
|
||||
|
||||
#include "stdint.h"
|
||||
#include "esp_err.h"
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
#endif // _DSP_ERR_H_
|
||||
@@ -0,0 +1,28 @@
|
||||
// Copyright 2018-2022 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dsp_error_codes_H_
|
||||
#define _dsp_error_codes_H_
|
||||
|
||||
#define DSP_OK 0 // For internal use only. Please use ESP_OK instead
|
||||
#define ESP_ERR_DSP_BASE 0x70000
|
||||
#define ESP_ERR_DSP_INVALID_LENGTH (ESP_ERR_DSP_BASE + 1)
|
||||
#define ESP_ERR_DSP_INVALID_PARAM (ESP_ERR_DSP_BASE + 2)
|
||||
#define ESP_ERR_DSP_PARAM_OUTOFRANGE (ESP_ERR_DSP_BASE + 3)
|
||||
#define ESP_ERR_DSP_UNINITIALIZED (ESP_ERR_DSP_BASE + 4)
|
||||
#define ESP_ERR_DSP_REINITIALIZED (ESP_ERR_DSP_BASE + 5)
|
||||
#define ESP_ERR_DSP_ARRAY_NOT_ALIGNED (ESP_ERR_DSP_BASE + 6)
|
||||
|
||||
|
||||
#endif // _dsp_error_codes_H_
|
||||
@@ -0,0 +1,33 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef dsp_platform_h_
|
||||
#define dsp_platform_h_
|
||||
#include "esp_idf_version.h"
|
||||
|
||||
#if defined(__XTENSA__) || defined(__riscv)
|
||||
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
|
||||
#include "esp_cpu.h"
|
||||
#else
|
||||
#include "soc/cpu.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/portable.h"
|
||||
#include "freertos/task.h"
|
||||
#include "freertos/semphr.h"
|
||||
|
||||
#endif // dsp_platform_h_
|
||||
@@ -0,0 +1,38 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _DSP_TESTS_H_
|
||||
#define _DSP_TESTS_H_
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "esp_idf_version.h"
|
||||
#include "esp_dsp.h"
|
||||
|
||||
// Fails the running test when `actual` is not inside [min_exec, max_exec):
// logs the violated bound with ESP_LOGE, then raises TEST_ASSERT_MESSAGE.
// Wrapped in do { } while (0) so the macro behaves as a single statement
// (safe under if/else). Arguments are evaluated more than once, so pass
// side-effect-free expressions.
#define TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, actual) \
    do { \
        if ((actual) >= (max_exec)) { \
            ESP_LOGE("", "Time error. Expected max: %i, reached: %i", (int)(max_exec), (int)(actual)); \
            TEST_ASSERT_MESSAGE (false, "Exec time takes more than expected! "); \
        } \
        if ((actual) < (min_exec)) { \
            ESP_LOGE("", "Time error. Expected min: %i, reached: %i", (int)(min_exec), (int)(actual)); \
            TEST_ASSERT_MESSAGE (false, "Exec time takes less then expected!"); \
        } \
    } while (0)
|
||||
|
||||
|
||||
// memalign function is implemented in IDF 4.3 and later
|
||||
#if ESP_IDF_VERSION <= ESP_IDF_VERSION_VAL(4, 3, 0)
|
||||
#define memalign(align_, size_) malloc(size_)
|
||||
#endif
|
||||
|
||||
#endif // _DSP_TESTS_H_
|
||||
@@ -0,0 +1,36 @@
|
||||
#ifndef _dsp_types_H_
|
||||
#define _dsp_types_H_
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
// Complex sample with 16-bit signed real/imaginary parts.
// The union lets the pair be accessed either field by field (re, im)
// or as one 32-bit word (data).
typedef union sc16_u {
    struct {
        int16_t re;
        int16_t im;
    };
    uint32_t data;
} sc16_t;
|
||||
|
||||
// Complex sample with 32-bit float real/imaginary parts.
// The union lets the pair be accessed either field by field (re, im)
// or as one 64-bit word (data).
typedef union fc32_u {
    struct {
        float re;
        float im;
    };
    uint64_t data;
} fc32_t;
|
||||
|
||||
// Descriptor of a 2D image (or 2D filter kernel) stored in a flat buffer.
typedef struct image2d_s {
    void *data; // could be int8_t, uint8_t, int16_t, uint16_t, float
    int step_x; // step of elements by X
    int step_y; // step of elements by Y, usually is 1
    int stride_x; // stride width: size of the elements in X axis * by step_x + padding
    int stride_y; // stride height: size of the elements in Y axis * by step_y + padding
    // Point[x,y] = data[width*y*step_y + x*step_x];
    // Full data size = width*height
    int size_x; // image width
    int size_y; // image height
} image2d_t;
|
||||
|
||||
#endif // _dsp_types_H_
|
||||
@@ -0,0 +1,65 @@
|
||||
// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _esp_dsp_H_
|
||||
#define _esp_dsp_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
// Common includes
|
||||
#include "dsp_common.h"
|
||||
#include "dsp_types.h"
|
||||
|
||||
// Signal processing
|
||||
#include "dsps_dotprod.h"
|
||||
#include "dsps_math.h"
|
||||
#include "dsps_fir.h"
|
||||
#include "dsps_biquad.h"
|
||||
#include "dsps_biquad_gen.h"
|
||||
#include "dsps_wind.h"
|
||||
#include "dsps_conv.h"
|
||||
#include "dsps_corr.h"
|
||||
|
||||
#include "dsps_d_gen.h"
|
||||
#include "dsps_h_gen.h"
|
||||
#include "dsps_tone_gen.h"
|
||||
#include "dsps_snr.h"
|
||||
#include "dsps_sfdr.h"
|
||||
|
||||
#include "dsps_fft2r.h"
|
||||
#include "dsps_fft4r.h"
|
||||
#include "dsps_dct.h"
|
||||
|
||||
// Matrix operations
|
||||
#include "dspm_matrix.h"
|
||||
|
||||
// Support functions
|
||||
#include "dsps_view.h"
|
||||
|
||||
// Image processing functions:
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dspi_conv.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include "mat.h"
|
||||
#endif
|
||||
|
||||
#endif // _esp_dsp_H_
|
||||
@@ -0,0 +1,21 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This file includes definitions that emulate esp-idf error codes
|
||||
|
||||
#ifndef _esp_attr_h_
|
||||
#define _esp_attr_h_
|
||||
|
||||
|
||||
#endif // _esp_attr_h_
|
||||
@@ -0,0 +1,29 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This file includes definitions that emulate esp-idf error codes
|
||||
|
||||
#ifndef _esp_err_h_
|
||||
#define _esp_err_h_
|
||||
|
||||
#include <stdlib.h>
|
||||
typedef int esp_err_t;
|
||||
|
||||
#define ESP_OK 0
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.14159265358979323846
|
||||
#endif // M_PI
|
||||
|
||||
#endif // _esp_err_h_
|
||||
@@ -0,0 +1,24 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This file includes definitions that emulate esp-idf error codes
|
||||
|
||||
#ifndef _esp_log_h_
|
||||
#define _esp_log_h_
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#define ESP_LOGD
|
||||
|
||||
#endif // _esp_log_h_
|
||||
@@ -0,0 +1,4 @@
|
||||
#ifndef _sdkconfig_h_
|
||||
#define _sdkconfig_h_
|
||||
|
||||
#endif // _sdkconfig_h_
|
||||
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: 2022 Espressif Systems (Shanghai) CO LTD
|
||||
*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
#include "dsp_common.h"
|
||||
#include <stdarg.h>
|
||||
|
||||
#define TIE_LOG_ENABLED 1
|
||||
|
||||
#if (CONFIG_IDF_TARGET_ESP32S3)
|
||||
|
||||
esp_err_t tie_log(int n_regs, ...)
|
||||
{
|
||||
|
||||
#if !TIE_LOG_ENABLED
|
||||
return ESP_OK;
|
||||
#else
|
||||
|
||||
va_list list;
|
||||
va_start(list, n_regs);
|
||||
|
||||
uint32_t reg_128_bits[4] = {0, 0, 0, 0};
|
||||
int reg_code;
|
||||
|
||||
for (int i = 0; i < n_regs; i++) {
|
||||
reg_code = va_arg(list, int);
|
||||
|
||||
// ACCX register
|
||||
if ( reg_code == 'a') {
|
||||
asm volatile("rur.accx_0 %0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("rur.accx_1 %0" : "=a" (reg_128_bits[1]));
|
||||
printf("ACCX - %02x %08x", (unsigned int)reg_128_bits[1], (unsigned int)reg_128_bits[0]);
|
||||
printf(" --- %llu\n", (long long unsigned)reg_128_bits[1] << 32 | (unsigned int)reg_128_bits[0]);
|
||||
}
|
||||
|
||||
// SAR:_BYTE register
|
||||
else if ( reg_code == 's') {
|
||||
asm volatile("rur.sar_byte %0" : "=a" (reg_128_bits[0]));
|
||||
printf("SAR_BYTE - %d\n", (unsigned int)reg_128_bits[0]);
|
||||
}
|
||||
|
||||
// Q0 - Q7 registers
|
||||
else if ((reg_code >= 0) && (reg_code <= 7)) {
|
||||
switch (reg_code) {
|
||||
case 0 : {
|
||||
asm volatile("ee.movi.32.a q0, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q0, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q0, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q0, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q0");
|
||||
break;
|
||||
}
|
||||
case 1 : {
|
||||
asm volatile("ee.movi.32.a q1, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q1, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q1, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q1, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q1");
|
||||
break;
|
||||
}
|
||||
case 2 : {
|
||||
asm volatile("ee.movi.32.a q2, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q2, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q2, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q2, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q2");
|
||||
break;
|
||||
}
|
||||
case 3 : {
|
||||
asm volatile("ee.movi.32.a q3, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q3, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q3, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q3, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q3");
|
||||
break;
|
||||
}
|
||||
case 4 : {
|
||||
asm volatile("ee.movi.32.a q4, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q4, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q4, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q4, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q4");
|
||||
break;
|
||||
}
|
||||
case 5 : {
|
||||
asm volatile("ee.movi.32.a q5, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q5, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q5, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q5, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q5");
|
||||
break;
|
||||
}
|
||||
case 6 : {
|
||||
asm volatile("ee.movi.32.a q6, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q6, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q6, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q6, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q6");
|
||||
break;
|
||||
}
|
||||
case 7 : {
|
||||
asm volatile("ee.movi.32.a q7, %0, 0" : "=a" (reg_128_bits[0]));
|
||||
asm volatile("ee.movi.32.a q7, %0, 1" : "=a" (reg_128_bits[1]));
|
||||
asm volatile("ee.movi.32.a q7, %0, 2" : "=a" (reg_128_bits[2]));
|
||||
asm volatile("ee.movi.32.a q7, %0, 3" : "=a" (reg_128_bits[3]));
|
||||
printf("Q7");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf(" - 0x%08X %08X %08X %08X --- ", (unsigned int)reg_128_bits[3], (unsigned int)reg_128_bits[2], (unsigned int)reg_128_bits[1], (unsigned int)reg_128_bits[0]);
|
||||
printf("%u %u %u %u %u %u %u %u\n", (unsigned int)reg_128_bits[3] >> 16, (unsigned int)reg_128_bits[3] & 0x0000FFFF,
|
||||
(unsigned int)reg_128_bits[2] >> 16, (unsigned int)reg_128_bits[2] & 0x0000FFFF,
|
||||
(unsigned int)reg_128_bits[1] >> 16, (unsigned int)reg_128_bits[1] & 0x0000FFFF,
|
||||
(unsigned int)reg_128_bits[0] >> 16, (unsigned int)reg_128_bits[0] & 0x0000FFFF);
|
||||
} else {
|
||||
printf("Bad register code");
|
||||
}
|
||||
}
|
||||
printf("------------------------------------------------------------------------------------\n");
|
||||
|
||||
return ESP_OK;
|
||||
#endif //TIE_LOG_ENABLED
|
||||
}
|
||||
|
||||
#endif // CONFIG_IDF_TARGET_ESP32S3
|
||||
@@ -0,0 +1,31 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsp_common.h"
|
||||
|
||||
/**
 * Check whether x is a power of two.
 *
 * @param x value to test
 * @return true when x is positive and has exactly one bit set, false otherwise
 *         (zero and all negative values, including INT_MIN, yield false)
 */
bool dsp_is_power_of_two(int x)
{
    // x > 0 rejects zero and negatives first, so x - 1 below can never overflow
    return (x > 0) && ((x & (x - 1)) == 0);
}
|
||||
|
||||
/**
 * Return the exponent N of the highest set bit of x, i.e. N for x == 2^N.
 *
 * @param x input value; for positive values that are not an exact power of
 *          two the result is rounded down (e.g. 3 -> 1)
 * @return the exponent, or 0 when x is zero or negative
 */
int dsp_power_of_two(int x)
{
    // Handle non-positive input explicitly instead of relying on how the
    // platform right-shifts negative values.
    if (x <= 0) {
        return 0;
    }

    int exponent = 0;
    // Count how many times x can be halved before it reaches zero
    while ((x >>= 1) != 0) {
        exponent++;
    }
    return exponent;
}
|
||||
@@ -0,0 +1,184 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_conv.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
esp_err_t dspi_conv_f32_ansi(const image2d_t *in_image, const image2d_t *filter, image2d_t *out_image)
|
||||
{
|
||||
out_image->size_x = in_image->size_x;
|
||||
out_image->size_y = in_image->size_y;
|
||||
float *i_data = (float *)in_image->data;
|
||||
float *out_data = (float *)out_image->data;
|
||||
|
||||
int rest_x = (filter->size_x - 1) >> 1;
|
||||
int rest_y = (filter->size_y - 1) >> 1;
|
||||
|
||||
int i_pos = 0;
|
||||
int i_step = in_image->stride_x * in_image->step_y;
|
||||
int f_step = filter->stride_x * filter->step_y;
|
||||
|
||||
// Up side of image
|
||||
for (int y = 0 ; y < rest_y; y++ ) {
|
||||
int i_pos_y = i_pos;
|
||||
for (int x = 0 ; x < rest_x; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = rest_y - y ; m < filter->size_y ; m++) {
|
||||
for (int n = rest_x - x ; n < filter->size_x ; n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
for (int x = rest_x ; x < in_image->size_x - filter->size_x / 2; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = rest_y - y ; m < filter->size_y ; m++) {
|
||||
for (int n = 0 ; n < filter->size_x ; n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
for (int x = in_image->size_x - filter->size_x / 2 - 1; x < in_image->size_x; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = rest_y - y ; m < filter->size_y ; m++) {
|
||||
for (int n = 0 ; n < filter->size_x - (x - in_image->size_x + filter->size_x / 2 + 1); n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
i_pos += in_image->stride_x * in_image->step_y;
|
||||
}
|
||||
// Middle side of image
|
||||
i_pos = 0;
|
||||
for (int y = rest_y ; y < in_image->size_y - filter->size_y / 2; y++ ) {
|
||||
int i_pos_y = i_pos;
|
||||
for (int x = 0 ; x < rest_x; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = 0 ; m < filter->size_y ; m++) {
|
||||
for (int n = rest_x - x ; n < filter->size_x ; n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
for (int x = in_image->size_x - filter->size_x / 2 - 1; x < in_image->size_x; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = 0 ; m < filter->size_y ; m++) {
|
||||
for (int n = 0 ; n < filter->size_x - (x - in_image->size_x + filter->size_x / 2 + 1); n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
|
||||
i_pos += in_image->stride_x * in_image->step_y;
|
||||
}
|
||||
// Down side of image
|
||||
i_pos = 0;
|
||||
for (int y = in_image->size_y - filter->size_y / 2 ; y < in_image->size_y; y++ ) {
|
||||
int i_pos_y = i_pos;
|
||||
for (int x = 0 ; x < rest_x; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = 0 ; m < filter->size_y - (y - in_image->size_y + filter->size_y / 2 + 1); m++) {
|
||||
for (int n = rest_x - x ; n < filter->size_x ; n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
for (int x = rest_x ; x < in_image->size_x - filter->size_x / 2; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = 0 ; m < filter->size_y - (y - in_image->size_y + filter->size_y / 2 + 1); m++) {
|
||||
for (int n = 0 ; n < filter->size_x ; n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
for (int x = in_image->size_x - filter->size_x / 2 ; x < in_image->size_x; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = 0 ; m < filter->size_y - (y - in_image->size_y + filter->size_y / 2 + 1); m++) {
|
||||
for (int n = 0 ; n < filter->size_x - (x - in_image->size_x + filter->size_x / 2 + 1); n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
|
||||
i_pos += in_image->stride_x * in_image->step_y;
|
||||
}
|
||||
// Main image block
|
||||
i_pos = 0;
|
||||
for (int y = rest_y ; y < in_image->size_y - filter->size_y / 2; y++ ) {
|
||||
int i_pos_y = i_pos;
|
||||
for (int x = rest_x ; x < in_image->size_x - filter->size_x / 2; x++) {
|
||||
int i_pos_x = i_pos_y;
|
||||
float acc = 0;
|
||||
float *f_data = (float *)filter->data;
|
||||
for (int m = 0 ; m < filter->size_y ; m++) {
|
||||
for (int n = 0 ; n < filter->size_x ; n++) {
|
||||
acc += i_data[i_pos_x + n * in_image->step_x] * f_data[filter->step_x * n];
|
||||
}
|
||||
f_data += f_step;
|
||||
i_pos_x += i_step;
|
||||
}
|
||||
i_pos_y += in_image->step_x;
|
||||
out_data[x * out_image->step_x + y * out_image->stride_x * out_image->step_y] = acc;
|
||||
}
|
||||
i_pos += in_image->stride_x * in_image->step_y;
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
#if (dsps_ccorr_f32_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_conv_f32_m_ae32.S"
|
||||
|
||||
// This is the cross-correlation function for the ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_ccorr_f32_ae32
|
||||
.type dsps_ccorr_f32_ae32,@function
|
||||
// The function implements the C code from dsps_ccorr_f32_ansi:
|
||||
//esp_err_t dsps_ccorr_f32_ansi(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *corrout);
|
||||
//
|
||||
dsps_ccorr_f32_ae32:
// Signal  - a2
// siglen  - a3
// Kernel  - a4
// kernlen - a5
// corrout - a6
//
// a11 - loop length
//
// The output is produced in three phases, one result per iteration:
//   ccorr_loop1: kernlen results, window length growing 1..kernlen
//   ccorr_loop2: siglen - kernlen results, full window (skipped when 0)
//   ccorr_loop3: kernlen - 1 results, window length shrinking (skipped when 0)
// conv_f32_ae32 is a macro from dsps_conv_f32_m_ae32.S; it is presumably a
// multiply-accumulate into f1 over (ptr, ptr, count, step, step) - confirm there.

    entry   a1, 16
    // Array increment for floating point data should be 4
    sub     a10, a3, a5
    bgez    a10, dsps_ccorr_positive
    // siglen < kernlen: swap Signal<->Kernel and siglen<->kernlen (a10 is the temporary)
    addi    a10, a2, 0
    addi    a2, a4, 0
    addi    a4, a10, 0

    addi    a10, a3, 0
    addi    a3, a5, 0
    addi    a5, a10, 0

dsps_ccorr_positive:
    movi.n  a8, 4
    addi    a11, a5, 0      // lkern - loop counter
    movi.n  a14, 0
    addi    a9, a14, 1

    movi.n  a7, 4
    movi.n  a8, -4

    mull    a13, a5, a7     // a13 - kernlen*4
    add     a13, a13, a4    // a13 - Kernel[kernlen]
    addi    a13, a13, -4    // a13 - Kernel[kernlen - 1]
ccorr_loop1:
    // Clear initial state of the result register
    addi    a10, a13, 0     // a10 - Kernel
    addi    a12, a2, 0      // a12 - Signal
    wfr     f1, a14         // clear output: convout[n] = 0;

    // a12 - sig[0]
    // a10 - kern[n];
    // a9  - n+1
    // a7  - 4,
    // a8  - -4,
    conv_f32_ae32 a12, a10, a9, a7, a7, loop1

    addi    a9, a9, 1       // (n+1)++
    addi    a13, a13, -4    // kern[n] - a4--

    ssi     f1, a6, 0       // Store result from f1 to memory at a6
    addi    a6, a6, 4       // convout++ - increment output pointer

    addi    a11, a11, -1
    bnez    a11, ccorr_loop1

    // a11 - loop counter = siglen - kernlen
    addi    a9, a2, 4       // sig[1] - sig[kmin]
    addi    a13, a5, 0

    // skip loop if 0
    sub     a11, a3, a5     // a11 - loop counter
    beqz    a11, skip_ccorr_loop2

ccorr_loop2:

    // Clear initial state of the result register
    addi    a12, a9, 0      // a12 - Signal[kmin]
    addi    a10, a4, 0      // a10 - Kernel
    wfr     f1, a14         // clear output: convout[n] = 0;

    // a12 - sig[kmin]
    // a10 - kern[0];
    // a13 - kernlen
    // a7  - 4,
    conv_f32_ae32 a12, a10, a13, a7, a7, loop2

    addi    a9, a9, 4       // in1++

    ssi     f1, a6, 0       // Store result from f1 to memory at a6
    addi    a6, a6, 4       // convout++ - increment output pointer

    addi    a11, a11, -1
    bnez    a11, ccorr_loop2


skip_ccorr_loop2:

    // a9 - the same
    addi    a11, a5, -1
    addi    a13, a5, -1
    // kernlen == 1 leaves nothing for the tail phase; without this branch the
    // counter would start at 0, drop to -1 and bnez would keep looping.
    beqz    a11, skip_ccorr_loop3
ccorr_loop3:

    // Clear initial state of the result register
    addi    a12, a9, 0      // a12 - Signal[kmin]
    addi    a10, a4, 0      // a10 - Kernel
    wfr     f1, a14         // clear output: convout[n] = 0;

    // a12 - sig[kmin]
    // a10 - kern[n - kmin];
    // a11 - length
    // a7  - 4,
    // a8  - -4,
    conv_f32_ae32 a12, a10, a11, a7, a7, loop3

    addi    a9, a9, 4       // n++

    ssi     f1, a6, 0       // Store result from f1 to memory at a6
    addi    a6, a6, 4       // convout++ - increment output pointer

    addi    a11, a11, -1
    bnez    a11, ccorr_loop3
skip_ccorr_loop3:

    movi.n  a2, 0 // return status ESP_OK
    retw.n
|
||||
|
||||
#endif // dsps_ccorr_f32_ae32_enabled
|
||||
@@ -0,0 +1,81 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_conv.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
static const char *TAG = "dsps_conv";
|
||||
|
||||
esp_err_t dsps_ccorr_f32_ansi(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *corrvout)
|
||||
{
|
||||
if (NULL == Signal) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == Kernel) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == corrvout) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
|
||||
float *sig = (float *)Signal;
|
||||
float *kern = (float *)Kernel;
|
||||
int lsig = siglen;
|
||||
int lkern = kernlen;
|
||||
|
||||
if (siglen < kernlen) {
|
||||
sig = (float *)Kernel;
|
||||
kern = (float *)Signal;
|
||||
lsig = kernlen;
|
||||
lkern = siglen;
|
||||
}
|
||||
|
||||
for (int n = 0; n < lkern; n++) {
|
||||
int k;
|
||||
int kmin = lkern - 1 - n;
|
||||
corrvout[n] = 0;
|
||||
|
||||
for (k = 0; k <= n; k++) {
|
||||
corrvout[n] += sig[k] * kern[kmin + k];
|
||||
}
|
||||
ESP_LOGV(TAG, "L1 k = %i, n = %i , kmin= %i, kmax= %i", 0, n, kmin, kmin + n);
|
||||
}
|
||||
for (int n = lkern; n < lsig; n++) {
|
||||
int kmin, kmax, k;
|
||||
|
||||
corrvout[n] = 0;
|
||||
|
||||
kmin = n - lkern + 1;
|
||||
kmax = n;
|
||||
for (k = kmin; k <= kmax; k++) {
|
||||
corrvout[n] += sig[k] * kern[k - kmin];
|
||||
}
|
||||
ESP_LOGV(TAG, "L2 n=%i, kmin = %i, kmax = %i , k-kmin = %i", n, kmin, kmax, 0);
|
||||
}
|
||||
|
||||
for (int n = lsig; n < lsig + lkern - 1; n++) {
|
||||
int kmin, kmax, k;
|
||||
|
||||
corrvout[n] = 0;
|
||||
|
||||
kmin = n - lkern + 1;
|
||||
kmax = lsig - 1;
|
||||
|
||||
for (k = kmin; k <= kmax; k++) {
|
||||
corrvout[n] += sig[k] * kern[k - kmin];
|
||||
}
|
||||
ESP_LOGV(TAG, "L3 n=%i, kmin = %i, kmax = %i , k - kmin = %i", n, kmin, kmax, kmax - kmin);
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,147 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
#if (dsps_conv_f32_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_conv_f32_m_ae32.S"
|
||||
|
||||
// This is the convolution function for the ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_conv_f32_ae32
|
||||
.type dsps_conv_f32_ae32,@function
|
||||
// The function implements the C code from dsps_conv_f32_ansi:
|
||||
//esp_err_t dsps_conv_f32_ansi(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *convout);
|
||||
//
|
||||
dsps_conv_f32_ae32:
// Convolution of two float arrays, assembly counterpart of dsps_conv_f32_ansi.
//
// Arguments:
//   Signal  - a2
//   siglen  - a3
//   Kernel  - a4
//   kernlen - a5
//   convout - a6
//
// Working registers:
//   a7  - 4, byte step used to walk forward through the signal
//   a8  - -4, byte step used to walk backward through the kernel
//   a9  - inner length (n + 1) in conv_loop1, afterwards base pointer into the signal
//   a10 - kernel pointer handed to the conv_f32_ae32 macro
//   a11 - outer loop counter
//   a12 - signal pointer handed to the conv_f32_ae32 macro
//   a13 - inner length for conv_loop2 / conv_loop3
//   a14 - 0, written to f1 to clear the accumulator
//
// Returns 0 (ESP_OK) in a2.

    entry   a1, 16

    // The loops below need siglen >= kernlen. When the kernel is the longer
    // array, swap pointers and lengths (same swap as in dsps_conv_f32_ansi).
    sub     a10, a3, a5
    bgez    a10, dsps_conv_positive
    addi    a10, a2, 0
    addi    a2, a4, 0
    addi    a4, a10, 0

    addi    a10, a3, 0
    addi    a3, a5, 0
    addi    a5, a10, 0

dsps_conv_positive:
    addi    a11, a5, 0      // lkern - loop counter
    movi.n  a14, 0
    addi    a9, a14, 1      // inner length starts at 1 (n + 1 for n == 0)

    // Array increment for floating point data should be 4
    movi.n  a7, 4
    movi.n  a8, -4

    // Stage 1: convout[n] for n = 0 .. lkern-1, inner length grows from 1 to lkern
conv_loop1:
    // Clear initial state of the result register
    addi    a10, a4, 0      // a10 - Kernel
    addi    a12, a2, 0      // a12 - Signal
    wfr     f1, a14         // clear output: convout[n] = 0;

    // a12 - sig[0]
    // a10 - kern[n];
    // a9  - n+1
    // a7  - 4,
    // a8  - -4,
    conv_f32_ae32 a12, a10, a9, a7, a8, loop1

    addi    a9, a9, 1       // (n+1)++
    addi    a4, a4, 4       // kern[n] - a4++

    ssi     f1, a6, 0       // Store result from f1 to memory at a6
    addi    a6, a6, 4       // convout++ - increment output pointer

    addi    a11, a11, -1
    bnez    a11, conv_loop1

    // Stage 2: (siglen - kernlen) outputs, every one uses the full kernel.
    // a4 now points one element past the end of the kernel.
    addi    a9, a2, 0       // a9 - signal base, a12 = a9 + 4 gives sig[kmin]
    addi    a13, a5, 0      // a13 - inner length = lkern

    // skip loop if 0
    sub     a11, a3, a5     // a11 - loop counter
    beqz    a11, skip_conv_loop2

conv_loop2:

    // Clear initial state of the result register
    addi    a12, a9, 4      // a12 - Signal[kmin]
    addi    a10, a4, -4     // a10 - last element of the Kernel
    wfr     f1, a14         // clear output: convout[n] = 0;

    // a12 - sig[kmin]
    // a10 - kern[n - kmin];
    // a13 - length
    // a7  - 4,
    // a8  - -4,
    conv_f32_ae32 a12, a10, a13, a7, a8, loop2

    addi    a9, a9, 4       // next signal position

    ssi     f1, a6, 0       // Store result from f1 to memory at a6
    addi    a6, a6, 4       // convout++ - increment output pointer

    addi    a11, a11, -1
    bnez    a11, conv_loop2

skip_conv_loop2:

    // Stage 3: the last (lkern - 1) outputs, inner length shrinks from lkern-1 to 1.
    // a9 - the same
    addi    a11, a5, -1
    addi    a13, a5, -1
    // For kernlen == 1 there is no tail. Without this check the loop body
    // would run with a11 == 0, the decrement would make it negative and
    // bnez would keep looping while storing past the end of convout.
    beqz    a11, skip_conv_loop3
conv_loop3:

    // Clear initial state of the result register
    addi    a12, a9, 4      // a12 - Signal[kmin]
    addi    a10, a4, -4     // a10 - last element of the Kernel
    wfr     f1, a14         // clear output: convout[n] = 0;

    // a12 - sig[kmin]
    // a10 - kern[n - kmin];
    // a13 - length
    // a7  - 4,
    // a8  - -4,
    conv_f32_ae32 a12, a10, a13, a7, a8, loop3

    addi    a9, a9, 4       // next signal position

    ssi     f1, a6, 0       // Store result from f1 to memory at a6
    addi    a6, a6, 4       // convout++ - increment output pointer

    addi    a13, a13, -1    // one sample less for the next output

    addi    a11, a11, -1
    bnez    a11, conv_loop3
skip_conv_loop3:

    movi.n  a2, 0           // return status ESP_OK
    retw.n
|
||||
|
||||
#endif // dsps_conv_f32_ae32_enabled
|
||||
@@ -0,0 +1,81 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_conv.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
static const char *TAG = "dsps_conv";
|
||||
|
||||
esp_err_t dsps_conv_f32_ansi(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *convout)
|
||||
{
|
||||
if (NULL == Signal) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == Kernel) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == convout) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
|
||||
float *sig = (float *)Signal;
|
||||
float *kern = (float *)Kernel;
|
||||
int lsig = siglen;
|
||||
int lkern = kernlen;
|
||||
|
||||
if (siglen < kernlen) {
|
||||
sig = (float *)Kernel;
|
||||
kern = (float *)Signal;
|
||||
lsig = kernlen;
|
||||
lkern = siglen;
|
||||
}
|
||||
|
||||
for (int n = 0; n < lkern; n++) {
|
||||
size_t k;
|
||||
|
||||
convout[n] = 0;
|
||||
|
||||
for (k = 0; k <= n; k++) {
|
||||
convout[n] += sig[k] * kern[n - k];
|
||||
}
|
||||
ESP_LOGV(TAG, "L1 kmin = %i, kmax = %i , n-kmin = %i", 0, n, n);
|
||||
}
|
||||
for (int n = lkern; n < lsig; n++) {
|
||||
int kmin, kmax, k;
|
||||
|
||||
convout[n] = 0;
|
||||
|
||||
kmin = n - lkern + 1;
|
||||
kmax = n;
|
||||
ESP_LOGV(TAG, "L2 n=%i, kmin = %i, kmax = %i , n-kmin = %i", n, kmin, kmax, n - kmin);
|
||||
for (k = kmin; k <= kmax; k++) {
|
||||
convout[n] += sig[k] * kern[n - k];
|
||||
}
|
||||
}
|
||||
|
||||
for (int n = lsig; n < lsig + lkern - 1; n++) {
|
||||
int kmin, kmax, k;
|
||||
|
||||
convout[n] = 0;
|
||||
|
||||
kmin = n - lkern + 1;
|
||||
kmax = lsig - 1;
|
||||
|
||||
for (k = kmin; k <= kmax; k++) {
|
||||
convout[n] += sig[k] * kern[n - k];
|
||||
}
|
||||
ESP_LOGV(TAG, "L3 n=%i, kmin = %i, kmax = %i , n-kmin = %i", n, kmin, kmax, n - kmin);
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro conv_f32_ae32 x1 x2 count step1 step2 name
|
||||
// This macro accumulates a floating point dot product of count float samples into f1
|
||||
// x1, x2 - address registers of the two input arrays (callers reload them before every expansion)
|
||||
// count - register with the amount of samples; the loop body is skipped when it is 0
|
||||
// step1 - register with the address step used for the x1 loads
|
||||
// step2 - register with the address step used for the x2 loads (callers in this commit pass 4 or -4)
|
||||
// f1 - contains initial value
|
||||
// name - suffix appended to the loop end label so that every expansion gets a unique label
|
||||
// result in f1
|
||||
//
|
||||
// Macros body:
|
||||
// f1 += x1[i]*x2[i]; i: 0..count-1
|
||||
// affected: f0, f1, f2
|
||||
// Example: conv_f32_ae32 a2, a3, a5, a8, a9, loop1
|
||||
// a8 == 4, step is 4 bytes
|
||||
// a5 == 32, length of array is 32
|
||||
// NOTE(review): x2 is loaded once before the loop and once per iteration, i.e. count+1 loads; the last loaded value is not accumulated
|
||||
lsxp f0, \x2, \step2
|
||||
loopnez \count, loop_mac_end_m_ae32\name
|
||||
lsxp f2, \x1, \step1
|
||||
madd.s f1, f2, f0
|
||||
lsxp f0, \x2, \step2
|
||||
loop_mac_end_m_ae32\name:
|
||||
.endm
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
#if (dsps_corr_f32_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_dotprod_f32_m_ae32.S"
|
||||
|
||||
// This is the correlation function for the ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_corr_f32_ae32
|
||||
.type dsps_corr_f32_ae32,@function
|
||||
// The function implements the following C code:
|
||||
//esp_err_t dsps_corr_f32_ansi(const float *Signal, const int siglen, const float *Pattern, const int patlen, float *dest)
|
||||
//{
|
||||
// for (size_t n = 0; n <= (siglen - patlen); n++) {
|
||||
// float k_corr = 0;
|
||||
// for (size_t m = 0; m < patlen; m++) {
|
||||
// k_corr += Signal[n + m] * Pattern[m];
|
||||
// }
|
||||
// dest[n] = k_corr;
|
||||
// }
|
||||
// return ESP_OK;
|
||||
//}
|
||||
|
||||
dsps_corr_f32_ae32:
|
||||
// Correlates Signal with Pattern: one dot product of patlen samples is stored to dest per loop pass.
// NOTE(review): unlike dsps_corr_f32_ansi, no NULL or (siglen < patlen) checks are done here;
// the loop only exits when a11 reaches exactly 0, so a start value <= 0 is not handled.
// Signal - a2
|
||||
// siglen - a3
|
||||
// Pattern - a4
|
||||
// patlen - a5
|
||||
// dest - a6
|
||||
// a11 - loop length (number of output samples)
|
||||
|
||||
entry a1, 16
|
||||
// Array increment for floating point data should be 4
|
||||
movi.n a8, 4
|
||||
movi.n a13, 4 // NOTE(review): a13 is not referenced again in the code visible here
|
||||
sub a11, a3, a5 // a11 = siglen - patlen
|
||||
addi a11, a11, 1 // a11 = siglen - patlen + 1 = loop length
|
||||
addi a12, a2, 0 // move input pointer to the a12
|
||||
movi.n a9, 0
|
||||
movi.n a14, 0 // NOTE(review): a14 is not referenced again in the code visible here
|
||||
|
||||
|
||||
corr_loop:
|
||||
// Clear initial state of the result register
|
||||
addi a10, a4, 0 // a10 - pattern, reloaded for every output sample
|
||||
movi.n a9, 0 // clear a9
|
||||
wfr f1, a9 // clear f1
|
||||
// a12 - input1
|
||||
// a10 - input2
|
||||
// a5 - length
|
||||
// a8 - 4, step in arrays
|
||||
// a9 - 0
|
||||
dotprod_f32_ae32 a12, a10, a5, a9, a8;
|
||||
|
||||
ssi f1, a6, 0 // Store result from f1 to memory at a6
|
||||
addi a6, a6, 4 // y++ - increment output pointer
|
||||
addi a12, a12, 4 // Signal++
|
||||
addi a11, a11, -1 // one output sample done
|
||||
bnez a11, corr_loop
|
||||
|
||||
movi.n a2, 0 // return status ESP_OK
|
||||
retw.n
|
||||
|
||||
#endif // dsps_corr_f32_ae32_enabled
|
||||
@@ -0,0 +1,40 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_corr.h"
|
||||
|
||||
esp_err_t dsps_corr_f32_ansi(const float *Signal, const int siglen, const float *Pattern, const int patlen, float *dest)
|
||||
{
|
||||
if (NULL == Signal) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == Pattern) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == dest) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (siglen < patlen) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
|
||||
for (size_t n = 0; n <= (siglen - patlen); n++) {
|
||||
float k_corr = 0;
|
||||
for (size_t m = 0; m < patlen; m++) {
|
||||
k_corr += Signal[n + m] * Pattern[m];
|
||||
}
|
||||
dest[n] = k_corr;
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dspi_conv_H_
|
||||
#define _dspi_conv_H_
|
||||
#include "dsp_err.h"
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
#include "dsp_types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief 2D Convolution
|
||||
*
|
||||
* The function convolves the input image with the filter (kernel) image.
|
||||
* The implementation uses ANSI C and can be compiled and run on any platform.
|
||||
*
|
||||
* @param[in] in_image: input image
|
||||
* @param[in] filter: input image with the convolution kernel
|
||||
* @param[out] out_image: output image. The stride and step parameters must be set.
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dspi_conv_f32_ansi(const image2d_t *in_image, const image2d_t *filter, image2d_t *out_image);
|
||||
/**@}*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// Only the ANSI C implementation is declared in this header, so both branches map dspi_conv_f32 to it.
#ifdef CONFIG_DSP_OPTIMIZED
|
||||
#define dspi_conv_f32 dspi_conv_f32_ansi
|
||||
#else
|
||||
#define dspi_conv_f32 dspi_conv_f32_ansi
|
||||
#endif // CONFIG_DSP_OPTIMIZED
|
||||
|
||||
#endif // _dspi_conv_H_
|
||||
@@ -0,0 +1,63 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dsps_ccorr_H_
|
||||
#define _dsps_ccorr_H_
|
||||
#include "dsp_err.h"
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief Cross correlation
|
||||
*
|
||||
* The function calculates the cross correlation between two signals.
|
||||
* The _ansi implementation uses ANSI C and can be compiled and run on any platform.
|
||||
*
|
||||
* @param[in] Signal: input array with the values of the first signal
|
||||
* @param[in] siglen: length of the first signal array
|
||||
* @param[in] Pattern: input array with the values of the second signal
|
||||
* @param[in] patlen: length of the second signal array
|
||||
* @param corrout: output array with result of cross correlation. The size of this array must be (siglen + patlen - 1) !!!
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library (one of the arrays is NULL). If (siglen < patlen) the ANSI implementation swaps the two inputs instead of failing.
|
||||
*/
|
||||
esp_err_t dsps_ccorr_f32_ansi(const float *Signal, const int siglen, const float *Pattern, const int patlen, float *corrout);
|
||||
esp_err_t dsps_ccorr_f32_ae32(const float *Signal, const int siglen, const float *Pattern, const int patlen, float *corrout);
|
||||
/**@}*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// dsps_ccorr_f32 selects the ae32 version only when CONFIG_DSP_OPTIMIZED is set and the platform header enables it.
#ifdef CONFIG_DSP_OPTIMIZED
|
||||
#if (dsps_ccorr_f32_ae32_enabled == 1)
|
||||
#define dsps_ccorr_f32 dsps_ccorr_f32_ae32
|
||||
#else
|
||||
#define dsps_ccorr_f32 dsps_ccorr_f32_ansi
|
||||
#endif // dsps_ccorr_f32_ae32_enabled
|
||||
#else
|
||||
#define dsps_ccorr_f32 dsps_ccorr_f32_ansi
|
||||
#endif
|
||||
|
||||
#endif // _dsps_ccorr_H_
|
||||
@@ -0,0 +1,65 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dsps_conv_H_
|
||||
#define _dsps_conv_H_
|
||||
#include "dsp_err.h"
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief Convolution
|
||||
*
|
||||
* The function convolves the Signal array with the Kernel array.
|
||||
* The _ansi implementation uses ANSI C and can be compiled and run on any platform.
|
||||
*
|
||||
* @param[in] Signal: input array with signal
|
||||
* @param[in] siglen: length of the input signal
|
||||
* @param[in] Kernel: input array with convolution kernel
|
||||
* @param[in] kernlen: length of the Kernel array
|
||||
* @param convout: output array with the convolution result, its length must be (siglen + kernlen - 1)
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_conv_f32_ae32(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *convout);
|
||||
esp_err_t dsps_conv_f32_ansi(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *convout);
|
||||
/**@}*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// dsps_conv_f32 selects the ae32 version only when CONFIG_DSP_OPTIMIZED is set and the platform header enables it.
#ifdef CONFIG_DSP_OPTIMIZED
|
||||
|
||||
#if (dsps_conv_f32_ae32_enabled == 1)
|
||||
#define dsps_conv_f32 dsps_conv_f32_ae32
|
||||
#else
|
||||
#define dsps_conv_f32 dsps_conv_f32_ansi
|
||||
#endif // dsps_conv_f32_ae32_enabled
|
||||
|
||||
#else
|
||||
#define dsps_conv_f32 dsps_conv_f32_ansi
|
||||
#endif
|
||||
|
||||
#endif // _dsps_conv_H_
|
||||
@@ -0,0 +1,20 @@
|
||||
// Platform switches for the convolution / correlation functions:
// defines the *_ae32_enabled macros that the public headers and the .S files test.
#ifndef _dsps_conv_platform_H_
|
||||
#define _dsps_conv_platform_H_
|
||||
|
||||
#include "sdkconfig.h"
|
||||
|
||||
#ifdef __XTENSA__
|
||||
#include <xtensa/config/core-isa.h>
|
||||
#include <xtensa/config/core-matmap.h>
|
||||
|
||||
|
||||
// The ae32 implementations are enabled only when the core reports both XCHAL_HAVE_FP and XCHAL_HAVE_LOOPS.
#if ((XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1))
|
||||
|
||||
#define dsps_conv_f32_ae32_enabled 1
|
||||
#define dsps_ccorr_f32_ae32_enabled 1
|
||||
#define dsps_corr_f32_ae32_enabled 1
|
||||
|
||||
#endif
|
||||
#endif // __XTENSA__
|
||||
|
||||
#endif // _dsps_conv_platform_H_
|
||||
@@ -0,0 +1,63 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dsps_corr_H_
|
||||
#define _dsps_corr_H_
|
||||
#include "dsp_err.h"
|
||||
|
||||
#include "dsps_conv_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief Correlation with pattern
|
||||
*
|
||||
* The function correlates the input signal array with the pattern array.
|
||||
* The _ansi implementation uses ANSI C and can be compiled and run on any platform.
|
||||
*
|
||||
* @param[in] Signal: input array with signal values
|
||||
* @param[in] siglen: length of the signal array
|
||||
* @param[in] Pattern: input array with pattern values
|
||||
* @param[in] patlen: length of the pattern array. The siglen must not be smaller than patlen!
|
||||
* @param dest: output array with result of correlation, (siglen - patlen + 1) values are written
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library (the ANSI implementation fails if one of the arrays is NULL, or if (siglen < patlen))
|
||||
*/
|
||||
esp_err_t dsps_corr_f32_ansi(const float *Signal, const int siglen, const float *Pattern, const int patlen, float *dest);
|
||||
esp_err_t dsps_corr_f32_ae32(const float *Signal, const int siglen, const float *Pattern, const int patlen, float *dest);
|
||||
/**@}*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
// dsps_corr_f32 selects the ae32 version only when CONFIG_DSP_OPTIMIZED is set and the platform header enables it.
#ifdef CONFIG_DSP_OPTIMIZED
|
||||
#if (dsps_corr_f32_ae32_enabled == 1)
|
||||
#define dsps_corr_f32 dsps_corr_f32_ae32
|
||||
#else
|
||||
#define dsps_corr_f32 dsps_corr_f32_ansi
|
||||
#endif // dsps_corr_f32_ae32_enabled
|
||||
#else
|
||||
#define dsps_corr_f32 dsps_corr_f32_ansi
|
||||
#endif
|
||||
|
||||
#endif // _dsps_corr_H_
|
||||
@@ -0,0 +1,118 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "esp_attr.h"
|
||||
#include "esp_dsp.h"
|
||||
#include <malloc.h>
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_conv";
|
||||
|
||||
// Convolves an all-ones image with an all-ones filter and checks selected
// output samples against the expected overlap counts.
TEST_CASE("dspi_conv_f32_ansi functionality", "[dspi]")
{
    int max_N = 8192;

    // Fail early instead of dereferencing NULL when the heap is exhausted.
    float *data1 = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(data1);
    float *data2 = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(data2);
    float *data3 = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(data3);

    image2d_t image1 = {data1, 1, 1, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {data2, 1, 1, 4, 4, 4, 4}; // Image 4x4
    image2d_t image3 = {data3, 1, 1, 10, 10, 0, 0}; // Output image

    for (int i = 0 ; i < max_N ; i++) {
        data1[i] = 0;
        data2[i] = 0;
        data3[i] = 0;
    }

    // Fill the input image and the filter with ones
    for (int y = 0 ; y < image1.stride_y / image1.step_y ; y++) {
        for (int x = 0 ; x < image1.stride_x / image1.step_x ; x++) {
            data1[y * image1.stride_x * image1.step_y + x * image1.step_x] = 1;
        }
    }
    for (int y = 0 ; y < image2.stride_y / image2.step_y ; y++) {
        for (int x = 0 ; x < image2.stride_x / image2.step_x ; x++) {
            data2[y * image2.stride_x * image2.step_y + x * image2.step_x] = 1;
        }
    }

    dspi_conv_f32_ansi(&image1, &image2, &image3);
    // Index is [row * stride_x * step_y + column * step_x]
    TEST_ASSERT_EQUAL(data3[0 * image3.stride_x * image3.step_y + 0 * image3.step_x], 9);
    TEST_ASSERT_EQUAL(data3[0 * image3.stride_x * image3.step_y + 6 * image3.step_x], 9);
    TEST_ASSERT_EQUAL(data3[6 * image3.stride_x * image3.step_y + 6 * image3.step_x], 9);
    // Mirror of the (0, 6) check above; the original repeated (0, 6) here.
    TEST_ASSERT_EQUAL(data3[6 * image3.stride_x * image3.step_y + 0 * image3.step_x], 9);

    TEST_ASSERT_EQUAL(data3[7 * image3.stride_x * image3.step_y + 0 * image3.step_x], 6);
    TEST_ASSERT_EQUAL(data3[7 * image3.stride_x * image3.step_y + 6 * image3.step_x], 6);
    TEST_ASSERT_EQUAL(data3[0 * image3.stride_x * image3.step_y + 7 * image3.step_x], 6);
    TEST_ASSERT_EQUAL(data3[7 * image3.stride_x * image3.step_y + 7 * image3.step_x], 4);

    TEST_ASSERT_EQUAL(data3[1 * image3.stride_x * image3.step_y + 1 * image3.step_x], 16);
    TEST_ASSERT_EQUAL(data3[5 * image3.stride_x * image3.step_y + 1 * image3.step_x], 16);
    TEST_ASSERT_EQUAL(data3[1 * image3.stride_x * image3.step_y + 5 * image3.step_x], 16);
    TEST_ASSERT_EQUAL(data3[5 * image3.stride_x * image3.step_y + 5 * image3.step_x], 16);
    TEST_ASSERT_EQUAL(data3[3 * image3.stride_x * image3.step_y + 3 * image3.step_x], 16);

    free(data1);
    free(data2);
    free(data3);
}
|
||||
|
||||
// Measures the CPU cycles spent in one dspi_conv_f32_ansi call
// (all-ones input image and all-ones filter) and logs the result.
TEST_CASE("dspi_conv_f32_ansi benchmark", "[dspi]")
{
    int max_N = 8192;

    // Fail early instead of dereferencing NULL when the heap is exhausted.
    float *data1 = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(data1);
    float *data2 = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(data2);
    float *data3 = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(data3);

    image2d_t image1 = {data1, 1, 1, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {data2, 1, 1, 4, 4, 4, 4}; // Image 4x4
    image2d_t image3 = {data3, 1, 1, 10, 10, 0, 0}; // Output image

    for (int i = 0 ; i < max_N ; i++) {
        data1[i] = 0;
        data2[i] = 0;
        data3[i] = 0;
    }

    // Fill the input image and the filter with ones
    for (int y = 0 ; y < image1.stride_y / image1.step_y ; y++) {
        for (int x = 0 ; x < image1.stride_x / image1.step_x ; x++) {
            data1[y * image1.stride_x * image1.step_y + x * image1.step_x] = 1;
        }
    }
    for (int y = 0 ; y < image2.stride_y / image2.step_y ; y++) {
        for (int x = 0 ; x < image2.stride_x / image2.step_x ; x++) {
            data2[y * image2.stride_x * image2.step_y + x * image2.step_x] = 1;
        }
    }

    unsigned int start_b = dsp_get_cpu_cycle_count();
    dspi_conv_f32_ansi(&image1, &image2, &image3);
    unsigned int end_b = dsp_get_cpu_cycle_count();
    float cycles = end_b - start_b;
    ESP_LOGI(TAG, "dspi_conv_f32_ansi - %f cycles", cycles);

    free(data1);
    free(data2);
    free(data3);
}
|
||||
@@ -0,0 +1,82 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsp_tests.h"
|
||||
#include "dsps_ccorr.h"
|
||||
#include "esp_attr.h"
|
||||
|
||||
static const char *TAG = "dsps_ccorr";
|
||||
|
||||
#define lenA 8
|
||||
#define lenB 4
|
||||
|
||||
static float inputA[lenA];
|
||||
static float inputB[lenB];
|
||||
static float output[lenA + lenB - 1 + 2];
|
||||
static float output_ref[lenA + lenB - 1 + 2];
|
||||
|
||||
// Runs dsps_ccorr_f32 and the ANSI C reference dsps_ccorr_f32_ansi on the same
// small integer-valued inputs and requires all lenA + lenB - 1 + 2 elements of
// the two output buffers to agree.
TEST_CASE("dsps_ccorr_f32 functionality", "[dsps]")
{
    const int out_len = lenA + lenB - 1 + 2;
    int k;

    // Ramp inputs: A = 3, 4, 5, ...   B = 10, 11, 12, ...
    for (k = 0; k < lenA; ++k) {
        inputA[k] = k + 3;
    }
    for (k = 0; k < lenB; ++k) {
        inputB[k] = k + 10;
    }
    // Both buffers start with the same marker, so slots neither call writes still compare equal.
    for (k = 0; k < out_len; ++k) {
        output[k] = -1;
        output_ref[k] = -1;
    }

    dsps_ccorr_f32(inputA, lenA, inputB, lenB, output);
    dsps_ccorr_f32_ansi(inputA, lenA, inputB, lenB, output_ref);

    // Log every element before asserting so a failing run still shows the full buffers.
    for (k = 0; k < out_len; ++k) {
        ESP_LOGI(TAG, "Data[%i] = %2.2f, expected = %2.2f", k, output[k], output_ref[k]);
    }
    for (k = 0; k < out_len; ++k) {
        TEST_ASSERT_EQUAL(output_ref[k], output[k]);
    }
}
|
||||
|
||||
// Measures the CPU cycles spent in one dsps_ccorr_f32 call on a 1024-sample
// signal with a 64-sample pattern and logs the cycle count.
TEST_CASE("dsps_ccorr_f32 benchmark", "[dsps]")
{
    const int max_N = 1024;
    const int ccorr_size = 64;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc((max_N + ccorr_size - 1) * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Constant inputs: the values are irrelevant, only the run time is measured.
    int n = 0;
    while (n < max_N) {
        x[n] = 0;
        y[n] = 1000;
        n++;
    }

    const unsigned int start_b = dsp_get_cpu_cycle_count();
    dsps_ccorr_f32(x, max_N, y, ccorr_size, z);
    const unsigned int end_b = dsp_get_cpu_cycle_count();

    const float cycles = end_b - start_b;
    ESP_LOGI(TAG, "dsps_ccorr_f32 - %f cycles for signal %i and pattern %i", cycles, max_N, ccorr_size);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,116 @@
|
||||
// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsps_ccorr.h"
|
||||
#include "esp_attr.h"
|
||||
#include "esp_dsp.h"
|
||||
|
||||
static const char *TAG = "dsps_ccorr";
|
||||
|
||||
#define lenA 20
|
||||
#define lenB 20
|
||||
|
||||
static float inputA[lenA];
|
||||
static float inputB[lenB];
|
||||
static float output_fwd[lenA + lenB - 1 + 2];
|
||||
static float output_back[lenA + lenB - 1 + 2];
|
||||
|
||||
// Sweeps every (la, lb) length pair below lenA / lenB, runs
// dsps_ccorr_f32_ansi in both argument orders into buffers offset by one
// element, and asserts that the marker values at index 0 and at index
// la + lb of each buffer are left untouched.
TEST_CASE("dsps_ccorr_f32_ansi functionality", "[dsps]")
{
    const int guarded_len = lenA + lenB - 1 + 2;

    for (size_t la = 1; la < lenA; la++) {
        for (size_t lb = 1; lb < lenB; lb++) {
            int k;

            // Fresh random inputs for every length pair (A is drawn first, then B).
            for (k = 0; k < lenA; ++k) {
                inputA[k] = (float)rand() / (float)INT32_MAX;
            }
            for (k = 0; k < lenB; ++k) {
                inputB[k] = (float)rand() / (float)INT32_MAX;
            }
            // Marker value used to detect writes outside the result range.
            for (k = 0; k < guarded_len; ++k) {
                output_fwd[k] = -1;
                output_back[k] = -1;
            }

            // Results are written starting at index 1, leaving index 0 as a guard.
            dsps_ccorr_f32_ansi(inputA, la, inputB, lb, output_fwd + 1);
            dsps_ccorr_f32_ansi(inputB, lb, inputA, la, output_back + 1);

            TEST_ASSERT_EQUAL(output_fwd[0], -1);
            TEST_ASSERT_EQUAL(output_fwd[la + lb], -1);
            TEST_ASSERT_EQUAL(output_back[0], -1);
            TEST_ASSERT_EQUAL(output_back[la + lb], -1);
        }
    }
}
|
||||
|
||||
// Feeds two sparse sequences (two non-zero samples each) through
// dsps_ccorr_f32_ansi, draws the first l1 + l2 output samples with dsps_view
// and logs the first l1 + l2 - 1 values.
TEST_CASE("dsps_ccorr_f32_ansi draw", "[dsps]")
{
    const int max_N = 1024;
    const int l1 = 8;
    const int l2 = 4;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc((max_N * 2 + 1) * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Only the first max_N entries of each buffer are cleared.
    int n = 0;
    while (n < max_N) {
        x[n] = 0;
        y[n] = 0;
        z[n] = 0;
        n++;
    }

    // Two impulses in the signal, two in the pattern.
    x[0] = 20;
    x[7] = 30;
    y[0] = 10;
    y[3] = 8;

    dsps_ccorr_f32_ansi(x, l1, y, l2, z);

    const int view_len = l1 + l2;
    dsps_view(z, view_len, view_len, 10, -1, 400, '+');
    for (n = 0; n < (view_len - 1); n++) {
        ESP_LOGI(TAG, "Z[%i] = %2.2f", n, z[n]);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
// Measures the CPU cycles spent in one dsps_ccorr_f32_ansi call on a
// 1024-sample signal with a 64-sample pattern and logs the cycle count.
TEST_CASE("dsps_ccorr_f32_ansi benchmark", "[dsps]")
{
    int max_N = 1024;
    int conv_size = 64;
    float *x = (float *)malloc(max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc((max_N * 2 + 1) * sizeof(float));
    TEST_ASSERT_NOT_NULL(z);

    // Constant inputs: the values are irrelevant, only the run time is measured.
    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    unsigned int start_b = dsp_get_cpu_cycle_count();
    dsps_ccorr_f32_ansi(x, max_N, y, conv_size, &z[0]);
    unsigned int end_b = dsp_get_cpu_cycle_count();

    float cycles = end_b - start_b;
    // The message names the function that is actually timed (it previously said dsps_conv_f32_ansi).
    ESP_LOGI(TAG, "dsps_ccorr_f32_ansi - %f cycles for signal %i and pattern %i", cycles, max_N, conv_size);
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,143 @@
|
||||
// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <malloc.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsp_tests.h"
|
||||
#include "dsps_conv.h"
|
||||
#include "esp_attr.h"
|
||||
|
||||
static const char *TAG = "dsps_conv";
|
||||
|
||||
#define lenA 30
|
||||
#define lenB 30
|
||||
|
||||
// Fixed-size check (la = 3, lb = 2) of dsps_conv_f32 against the ANSI C
// reference dsps_conv_f32_ansi. Both results are written starting at index 1
// of buffers pre-filled with -1, and indices 0 .. la + lb are compared, so
// the marker slots at both ends take part in the comparison.
TEST_CASE("dsps_conv_f32 test output", "[dsps]")
{
    const int buf_len = lenA + lenB - 1 + 2;

    // Every allocation is checked before use so an out-of-memory condition
    // fails the test instead of dereferencing NULL.
    float *inputA = (float *)memalign(16, lenA * sizeof(float));
    TEST_ASSERT_NOT_NULL(inputA);
    float *inputB = (float *)memalign(16, lenB * sizeof(float));
    TEST_ASSERT_NOT_NULL(inputB);
    float *output_ref = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_ref);
    float *output_fwd = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_fwd);

    int la = 3;
    int lb = 2;

    for (int i = 0; i < lenA; i++) {
        inputA[i] = 10 + i;
    }
    for (int i = 0; i < lenB; i++) {
        inputB[i] = 20 + i;
    }
    for (int i = 0; i < buf_len; i++) {
        output_ref[i] = -1;
        output_fwd[i] = -1;
    }

    dsps_conv_f32_ansi(inputA, la, inputB, lb, &output_ref[1]);
    dsps_conv_f32(inputA, la, inputB, lb, &output_fwd[1]);

    for (int i = 0; i < (la + lb + 1); i++) {
        ESP_LOGD(TAG, "la=%i, lb=%i, i=%i, ref=%2.3f, fwd=%2.3f", la, lb, i, output_ref[i], output_fwd[i]);
    }

    float max_eps = 0.000001;
    for (int i = 0; i < (la + lb + 1); i++) {
        if (fabs(output_ref[i] - output_fwd[i]) > max_eps) {
            ESP_LOGI(TAG, "la=%i, lb=%i, i=%i, ref=%2.3f, fwd=%2.3f", la, lb, i, output_ref[i], output_fwd[i]);
        }
        // Float-aware assertion: TEST_ASSERT_EQUAL would compare the values as integers.
        TEST_ASSERT_EQUAL_FLOAT(output_ref[i], output_fwd[i]);
    }

    free(inputA);
    free(inputB);
    free(output_ref);
    free(output_fwd);
}
|
||||
|
||||
// Sweeps every (la, lb) length pair from 2 up to lenA - 1 / lenB - 1 with
// random inputs and compares three results element by element: the ANSI C
// reference, dsps_conv_f32(A, B) and dsps_conv_f32(B, A). Results are written
// starting at index 1 of buffers pre-filled with -1, and indices 0 .. la + lb
// are compared, so the marker slots at both ends take part in the comparison.
TEST_CASE("dsps_conv_f32 functionality", "[dsps]")
{
    const int buf_len = lenA + lenB - 1 + 2;

    // Every allocation is checked before use so an out-of-memory condition
    // fails the test instead of dereferencing NULL.
    float *inputA = (float *)memalign(16, lenA * sizeof(float));
    TEST_ASSERT_NOT_NULL(inputA);
    float *inputB = (float *)memalign(16, lenB * sizeof(float));
    TEST_ASSERT_NOT_NULL(inputB);
    float *output_ref = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_ref);
    float *output_fwd = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_fwd);
    float *output_back = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_back);

    for (int la = 2; la < lenA; la++) {
        for (int lb = 2; lb < lenB; lb++) {
            for (int i = 0 ; i < lenA ; i++) {
                inputA[i] = (float)rand() / (float)INT32_MAX;
            }
            for (int i = 0 ; i < lenB ; i++) {
                inputB[i] = (float)rand() / (float)INT32_MAX;
            }
            for (int i = 0 ; i < buf_len; i++) {
                output_ref[i] = -1;
                output_fwd[i] = -1;
                output_back[i] = -1;
            }

            dsps_conv_f32_ansi(inputA, la, inputB, lb, &output_ref[1]);
            dsps_conv_f32(inputA, la, inputB, lb, &output_fwd[1]);
            dsps_conv_f32(inputB, lb, inputA, la, &output_back[1]);

            float max_eps = 0.000001;
            for (int i = 0; i < (la + lb + 1); i++) {
                // Informational log for any deviation above max_eps; the pass/fail
                // decision is made by the float assertions below.
                if ((fabs(output_ref[i] - output_fwd[i]) > max_eps) || (fabs(output_ref[i] - output_back[i]) > max_eps) || (fabs(output_back[i] - output_fwd[i]) > max_eps)) {
                    ESP_LOGI(TAG, "la=%i, lb=%i, i=%i, ref=%2.3f, fwd=%2.3f, back=%2.3f", la, lb, i, output_ref[i], output_fwd[i], output_back[i]);
                }
                // Float-aware assertions: TEST_ASSERT_EQUAL would compare the values
                // as integers and miss any difference smaller than 1.
                TEST_ASSERT_EQUAL_FLOAT(output_ref[i], output_fwd[i]);
                TEST_ASSERT_EQUAL_FLOAT(output_ref[i], output_back[i]);
                TEST_ASSERT_EQUAL_FLOAT(output_back[i], output_fwd[i]);
            }
        }
    }

    free(inputA);
    free(inputB);
    free(output_ref);
    free(output_fwd);
    free(output_back);
}
|
||||
|
||||
|
||||
// Measures the CPU cycles spent in one dsps_conv_f32 call on a 1024-sample
// signal with a 64-sample kernel and logs the cycle count.
TEST_CASE("dsps_conv_f32 benchmark", "[dsps]")
{
    const int max_N = 1024;
    const int conv_size = 64;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc((max_N * 2 + 1) * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Constant inputs: the values are irrelevant, only the run time is measured.
    int n = 0;
    while (n < max_N) {
        x[n] = 0;
        y[n] = 1000;
        n++;
    }

    const unsigned int start_b = dsp_get_cpu_cycle_count();
    dsps_conv_f32(x, max_N, y, conv_size, z);
    const unsigned int end_b = dsp_get_cpu_cycle_count();

    const float cycles = end_b - start_b;
    ESP_LOGI(TAG, "dsps_conv_f32 - %f cycles for signal %i and pattern %i", cycles, max_N, conv_size);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,152 @@
|
||||
// Copyright 2018-2023 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <malloc.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsp_tests.h"
|
||||
#include "dsps_conv.h"
|
||||
#include "esp_attr.h"
|
||||
#include "esp_dsp.h"
|
||||
|
||||
static const char *TAG = "dsps_conv";
|
||||
|
||||
#define lenA 20
|
||||
#define lenB 20
|
||||
|
||||
esp_err_t dsps_conv_f32_ref(const float *Signal, const int siglen, const float *Kernel, const int kernlen, float *convout)
|
||||
{
|
||||
if (NULL == Signal) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == Kernel) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
if (NULL == convout) {
|
||||
return ESP_ERR_DSP_PARAM_OUTOFRANGE;
|
||||
}
|
||||
|
||||
for (int n = 0; n < siglen + kernlen - 1; n++) {
|
||||
size_t kmin, kmax, k;
|
||||
|
||||
convout[n] = 0;
|
||||
|
||||
kmin = (n >= kernlen - 1) ? n - (kernlen - 1) : 0;
|
||||
kmax = (n < siglen - 1) ? n : siglen - 1;
|
||||
|
||||
for (k = kmin; k <= kmax; k++) {
|
||||
convout[n] += Signal[k] * Kernel[n - k];
|
||||
}
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
|
||||
// Sweeps every (la, lb) length pair from 1 up to lenA - 1 / lenB - 1 with
// random inputs and compares three results element by element: the local
// reference dsps_conv_f32_ref, dsps_conv_f32_ansi(A, B) and
// dsps_conv_f32_ansi(B, A). Results are written starting at index 1 of
// buffers pre-filled with -1, and indices 0 .. la + lb are compared, so the
// marker slots at both ends take part in the comparison.
TEST_CASE("dsps_conv_f32_ansi functionality", "[dsps]")
{
    const int buf_len = lenA + lenB - 1 + 2;

    // Every allocation is checked before use so an out-of-memory condition
    // fails the test instead of dereferencing NULL.
    float *inputA = (float *)memalign(16, lenA * sizeof(float));
    TEST_ASSERT_NOT_NULL(inputA);
    float *inputB = (float *)memalign(16, lenB * sizeof(float));
    TEST_ASSERT_NOT_NULL(inputB);
    float *output_ref = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_ref);
    float *output_fwd = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_fwd);
    float *output_back = (float *)memalign(16, buf_len * sizeof(float));
    TEST_ASSERT_NOT_NULL(output_back);

    for (int la = 1; la < lenA; la++) {
        for (int lb = 1; lb < lenB; lb++) {
            for (int i = 0 ; i < lenA ; i++) {
                inputA[i] = (float)rand() / (float)INT32_MAX;
            }
            for (int i = 0 ; i < lenB ; i++) {
                inputB[i] = (float)rand() / (float)INT32_MAX;
            }
            for (int i = 0 ; i < buf_len; i++) {
                output_ref[i] = -1;
                output_fwd[i] = -1;
                output_back[i] = -1;
            }

            dsps_conv_f32_ref(inputA, la, inputB, lb, &output_ref[1]);
            dsps_conv_f32_ansi(inputA, la, inputB, lb, &output_fwd[1]);
            dsps_conv_f32_ansi(inputB, lb, inputA, la, &output_back[1]);

            float max_eps = 0.000001;
            for (int i = 0; i < (la + lb + 1); i++) {
                // Informational log for any deviation above max_eps; the pass/fail
                // decision is made by the float assertions below.
                if ((fabs(output_ref[i] - output_fwd[i]) > max_eps) || (fabs(output_ref[i] - output_back[i]) > max_eps) || (fabs(output_back[i] - output_fwd[i]) > max_eps)) {
                    ESP_LOGI(TAG, "la=%i, lb=%i, i=%i, ref=%2.3f, fwd=%2.3f, back=%2.3f", la, lb, i, output_ref[i], output_fwd[i], output_back[i]);
                }
                // Float-aware assertions: TEST_ASSERT_EQUAL would compare the values
                // as integers and miss any difference smaller than 1.
                TEST_ASSERT_EQUAL_FLOAT(output_ref[i], output_fwd[i]);
                TEST_ASSERT_EQUAL_FLOAT(output_ref[i], output_back[i]);
                TEST_ASSERT_EQUAL_FLOAT(output_back[i], output_fwd[i]);
            }
        }
    }

    free(inputA);
    free(inputB);
    free(output_ref);
    free(output_fwd);
    free(output_back);
}
|
||||
|
||||
// Convolves a constant 32-sample signal with a constant 16-sample kernel
// using dsps_conv_f32_ansi and draws the first 32 + 16 output samples with
// dsps_view.
TEST_CASE("dsps_conv_f32_ansi draw", "[dsps]")
{
    const int max_N = 1024;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc((max_N * 2 + 1) * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Only the first max_N entries of each buffer are initialised.
    int n = 0;
    while (n < max_N) {
        x[n] = 10;
        y[n] = 20;
        z[n] = 0;
        n++;
    }

    dsps_conv_f32_ansi(x, 32, y, 16, z);

    dsps_view(z, 32 + 16, 32 + 16, 10, -1, 4000, '+');

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
// Measures the CPU cycles spent in one dsps_conv_f32_ansi call on a
// 1024-sample signal with a 64-sample kernel and logs the cycle count.
TEST_CASE("dsps_conv_f32_ansi benchmark", "[dsps]")
{
    const int max_N = 1024;
    const int conv_size = 64;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc((max_N * 2 + 1) * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Constant inputs: the values are irrelevant, only the run time is measured.
    int n = 0;
    while (n < max_N) {
        x[n] = 0;
        y[n] = 1000;
        n++;
    }

    const unsigned int start_b = dsp_get_cpu_cycle_count();
    dsps_conv_f32_ansi(x, max_N, y, conv_size, z);
    const unsigned int end_b = dsp_get_cpu_cycle_count();

    const float cycles = end_b - start_b;
    ESP_LOGI(TAG, "dsps_conv_f32_ansi - %f cycles for signal %i and pattern %i", cycles, max_N, conv_size);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,83 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsp_tests.h"
|
||||
#include "dsps_corr.h"
|
||||
#include "esp_attr.h"
|
||||
|
||||
static const char *TAG = "dsps_corr";
|
||||
|
||||
#define lenA 15
|
||||
#define lenB 10
|
||||
|
||||
static float inputA[lenA];
|
||||
static float inputB[lenB];
|
||||
static float output[lenA + lenB - 1 + 2];
|
||||
static float output_ref[lenA + lenB - 1 + 2];
|
||||
|
||||
// Runs dsps_corr_f32 and the ANSI C reference dsps_corr_f32_ansi on the same
// integer-valued inputs, both writing from index 1 of their output buffer,
// and requires the first lenA - lenB + 2 elements of the two buffers to agree.
TEST_CASE("dsps_corr_f32_aexx functionality", "[dsps]")
{
    const int checked_len = lenA - lenB + 2;
    int k;

    for (k = 0; k < lenA; ++k) {
        inputA[k] = k;
    }
    for (k = 0; k < lenB; ++k) {
        inputB[k] = 10 + k;
    }
    // The first pattern sample is overridden after the ramp has been written.
    inputB[0] = 1;

    // Both buffers start with the same marker in the compared range.
    for (k = 0; k < checked_len; ++k) {
        output[k] = -1;
        output_ref[k] = -1;
    }

    dsps_corr_f32(inputA, lenA, inputB, lenB, output + 1);
    dsps_corr_f32_ansi(inputA, lenA, inputB, lenB, output_ref + 1);

    for (k = 0; k < checked_len; ++k) {
        ESP_LOGD(TAG, "Data[%i] = %2.2f, expected = %2.2f", k, output[k], output_ref[k]);
    }
    for (k = 0; k < checked_len; ++k) {
        TEST_ASSERT_EQUAL(output_ref[k], output[k]);
    }
}
|
||||
|
||||
// Measures the CPU cycles spent in one dsps_corr_f32 call on a 1024-sample
// signal with a 64-sample pattern and logs the cycle count.
TEST_CASE("dsps_corr_f32_aexx benchmark", "[dsps]")
{
    const int max_N = 1024;
    const int corr_size = 64;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc(max_N * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Constant inputs: the values are irrelevant, only the run time is measured.
    int n = 0;
    while (n < max_N) {
        x[n] = 0;
        y[n] = 1000;
        n++;
    }

    const unsigned int start_b = dsp_get_cpu_cycle_count();
    dsps_corr_f32(x, max_N, y, corr_size, z);
    const unsigned int end_b = dsp_get_cpu_cycle_count();

    const float cycles = end_b - start_b;
    ESP_LOGI(TAG, "dsps_corr_f32_ae32 - %f cycles for signal %i and pattern %i", cycles, max_N, corr_size);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,83 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsp_tests.h"
|
||||
#include "dsps_corr.h"
|
||||
#include "esp_attr.h"
|
||||
|
||||
static const char *TAG = "dsps_corr";
|
||||
|
||||
#define lenA 15
|
||||
#define lenB 10
|
||||
|
||||
static float inputA[lenA];
|
||||
static float inputB[lenB];
|
||||
static float output[lenA + lenB + 2];
|
||||
|
||||
// Correlates the ramp 0, 1, 2, ... with the pattern {1, 0, 0, ...} using
// dsps_corr_f32_ansi, writing from index 1 of the output buffer. The test
// asserts that output[i + 1] == i for i in [0, lenA - lenB] and that the
// marker values at index 0 and index lenA - lenB + 2 are left untouched.
TEST_CASE("dsps_corr_f32_ansi functionality", "[dsps]")
{
    const int last_guard = lenA - lenB + 2;
    int k;

    for (k = 0; k < lenA; ++k) {
        inputA[k] = k;
    }
    for (k = 0; k < lenB; ++k) {
        inputB[k] = 0;
    }
    for (k = 0; k <= last_guard; ++k) {
        output[k] = -1;
    }
    // Single non-zero pattern sample.
    inputB[0] = 1;

    dsps_corr_f32_ansi(inputA, lenA, inputB, lenB, output + 1);

    for (k = 0; k < lenA + lenB; ++k) {
        ESP_LOGD(TAG, "output[%i] = %2.2f", k, output[k]);
    }

    TEST_ASSERT_EQUAL(output[0], -1);
    TEST_ASSERT_EQUAL(output[last_guard], -1);
    for (size_t i = 0; i <= (lenA - lenB); i++) {
        TEST_ASSERT_EQUAL(output[i + 1], i);
    }
}
|
||||
|
||||
// Measures the CPU cycles spent in one dsps_corr_f32_ansi call on a
// 1024-sample signal with a 64-sample pattern and logs the cycle count.
TEST_CASE("dsps_corr_f32_ansi benchmark", "[dsps]")
{
    const int max_N = 1024;
    const int corr_size = 64;

    float *x = (float *)malloc(max_N * sizeof(*x));
    TEST_ASSERT_NOT_NULL(x);
    float *y = (float *)malloc(max_N * sizeof(*y));
    TEST_ASSERT_NOT_NULL(y);
    float *z = (float *)malloc(max_N * sizeof(*z));
    TEST_ASSERT_NOT_NULL(z);

    // Constant inputs: the values are irrelevant, only the run time is measured.
    int n = 0;
    while (n < max_N) {
        x[n] = 0;
        y[n] = 1000;
        n++;
    }

    const unsigned int start_b = dsp_get_cpu_cycle_count();
    dsps_corr_f32_ansi(x, max_N, y, corr_size, z);
    const unsigned int end_b = dsp_get_cpu_cycle_count();

    const float cycles = end_b - start_b;
    ESP_LOGI(TAG, "dsps_corr_f32_ansi - %f cycles for signal %i and pattern %i", cycles, max_N, corr_size);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,118 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsp_common.h"
|
||||
#include <math.h>
|
||||
|
||||
#include "dsps_dct.h"
|
||||
#include "dsps_fft2r.h"
|
||||
|
||||
esp_err_t dsps_dct_f32_ref(float *data, int N, float *result)
|
||||
{
|
||||
float factor = M_PI / N;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
float sum = 0;
|
||||
for (size_t j = 0; j < N; j++) {
|
||||
sum += data[j] * cosf((j + 0.5) * i * factor);
|
||||
}
|
||||
result[i] = sum;
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
|
||||
esp_err_t dsps_dct_inverce_f32_ref(float *data, int N, float *result)
|
||||
{
|
||||
float factor = M_PI / N;
|
||||
for (size_t i = 0; i < N; i++) {
|
||||
float sum = data[0] / 2;
|
||||
for (size_t j = 0; j < N; j++) {
|
||||
sum += data[j] * cosf(j * (i + 0.5) * factor);
|
||||
}
|
||||
result[i] = sum;
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
|
||||
esp_err_t dsps_dct_f32(float *data, int N)
|
||||
{
|
||||
esp_err_t ret = ESP_OK;
|
||||
if (dsps_fft2r_initialized == 0) {
|
||||
return ESP_ERR_DSP_REINITIALIZED;
|
||||
}
|
||||
|
||||
for (int i = 0; i < N / 2; i++) {
|
||||
data[(N - 1 - i) * 2] = data[i * 2 + 1];
|
||||
data[i * 2 + 1] = 0;
|
||||
data[N + i * 2 + 1] = 0;
|
||||
}
|
||||
|
||||
ret = dsps_fft2r_fc32(data, N);
|
||||
if (ret != ESP_OK) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
// // The follows code do the same as this one:
|
||||
// //
|
||||
// float factor = M_PI / (N * 2);
|
||||
// ret = dsps_bit_rev_fc32(data, N);
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// float temp = i * factor;
|
||||
// data[i] = data[i*2] * cosf(temp) + data[i*2 + 1] * sinf(temp);
|
||||
// }
|
||||
int table_step = 2;
|
||||
for (int i = 0; i < N; i++) {
|
||||
float c = dsps_fft_w_table_fc32[i * 2 * table_step];
|
||||
float s = dsps_fft_w_table_fc32[i * 2 * table_step + 1];
|
||||
data[i * 2] = data[i * 2] * c;
|
||||
data[i * 2 + 1] = data[i * 2 + 1] * s;
|
||||
}
|
||||
ret = dsps_bit_rev_fc32(data, N);
|
||||
if (ret != ESP_OK) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
data[i] = data[i * 2] + data[i * 2 + 1];
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
|
||||
esp_err_t dsps_dct_inv_f32(float *data, int N)
|
||||
{
|
||||
esp_err_t ret = ESP_OK;
|
||||
if (dsps_fft2r_initialized == 0) {
|
||||
return ESP_ERR_DSP_REINITIALIZED;
|
||||
}
|
||||
|
||||
float factor = M_PI / (N * 2);
|
||||
data[0] *= 0.5;
|
||||
for (int i = N - 1; i >= 0; i--) {
|
||||
float temp = i * factor;
|
||||
data[i * 2] = data[i] * cosf(temp);
|
||||
data[i * 2 + 1] = data[i] * -sinf(temp);
|
||||
}
|
||||
ret = dsps_fft2r_fc32(data, N);
|
||||
if (ret != ESP_OK) {
|
||||
return ret;
|
||||
}
|
||||
ret = dsps_bit_rev_fc32(data, N);
|
||||
if (ret != ESP_OK) {
|
||||
return ret;
|
||||
}
|
||||
for (size_t i = 0; i < N / 2; i++) {
|
||||
data[i * 2 + 1] = data[(N - 1 - i) * 2];
|
||||
}
|
||||
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsp_common.h"
|
||||
#include <math.h>
|
||||
|
||||
#include "dsps_dct.h"
|
||||
#include "dsps_fft2r.h"
|
||||
|
||||
esp_err_t dsps_dctiv_f32(float *data, int ndct)
|
||||
{
|
||||
if (dsps_fft2r_initialized == 0) {
|
||||
return ESP_ERR_DSP_REINITIALIZED;
|
||||
}
|
||||
|
||||
float factor = M_PI / (ndct * 2);
|
||||
float in1, in2, in3, in4;
|
||||
for (int i = 0; i < ndct / 4; i++) {
|
||||
in1 = data[i * 2 + 0];
|
||||
in2 = data[i * 2 + 1];
|
||||
in3 = data[ndct - i * 2 - 1];
|
||||
in4 = data[ndct - i * 2 - 2];
|
||||
|
||||
data[i * 2 + 0] = (
|
||||
in1 * cos(factor * (i * 2 + 0))
|
||||
+ in3 * cos(factor * ((ndct - i * 2)))
|
||||
);
|
||||
|
||||
data[i * 2 + 1] = (
|
||||
-in1 * sin(factor * (i * 2))
|
||||
+ in3 * sin(factor * ((ndct - i * 2)))
|
||||
);
|
||||
|
||||
data[ndct - i * 2 + 0 - 2] = (
|
||||
in2 * cos(factor * (i * 2 + 1 + 0.5) )
|
||||
+ in4 * cos(factor * ((ndct - i * 2 - 1) - 0.5) )
|
||||
);
|
||||
|
||||
data[ndct - i * 2 + 1 - 2] = (
|
||||
in2 * sin(factor * (i * 2 + 1))
|
||||
+ in4 * sin(-factor * ((ndct - i * 2 - 1)) )
|
||||
);
|
||||
|
||||
}
|
||||
esp_err_t error = ESP_OK;
|
||||
error = dsps_fft2r_fc32(data, ndct / 2);
|
||||
if (error != ESP_OK) {
|
||||
return error;
|
||||
}
|
||||
error = dsps_bit_rev_fc32(data, ndct / 2);
|
||||
if (error != ESP_OK) {
|
||||
return error;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ndct / 4; i++) {
|
||||
in1 = data[2 * i + 0];
|
||||
in2 = data[2 * i + 1];
|
||||
|
||||
in3 = data[ndct - 2 * i - 2];
|
||||
in4 = data[ndct - 2 * i - 1];
|
||||
|
||||
data[i * 2 + 0] = (
|
||||
in1 * cos(factor * (0 + i * 2))
|
||||
+ in2 * sin(factor * (0 + i * 2))
|
||||
);
|
||||
|
||||
data[ndct - i * 2 - 1] = (
|
||||
in1 * cos(factor * (ndct - i * 2))
|
||||
- in2 * sin(factor * (ndct - i * 2))
|
||||
);
|
||||
|
||||
data[i * 2 + 1] = (
|
||||
in3 * cos(factor * (2 + i * 2))
|
||||
- in4 * sin(factor * (2 + i * 2))
|
||||
);
|
||||
|
||||
data[ndct - i * 2 - 2] = (
|
||||
in3 * cos(factor * (ndct - i * 2 - 2) )
|
||||
+ in4 * sin(factor * (ndct - i * 2 - 2) )
|
||||
);
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsp_common.h"
|
||||
#include <math.h>
|
||||
|
||||
#include "dsps_dct.h"
|
||||
#include "dsps_fft2r.h"
|
||||
|
||||
esp_err_t dsps_dstiv_f32(float *data, int ndst)
|
||||
{
|
||||
if (dsps_fft2r_initialized == 0) {
|
||||
return ESP_ERR_DSP_REINITIALIZED;
|
||||
}
|
||||
|
||||
float in1, in2, in3, in4;
|
||||
float factor = M_PI / (ndst);
|
||||
|
||||
for (int i = 0; i < ndst / 4; i++) {
|
||||
|
||||
in1 = data[2 * i + 0];
|
||||
in2 = data[2 * i + 1];
|
||||
|
||||
in3 = data[ndst - 2 * i - 2];
|
||||
in4 = data[ndst - 2 * i - 1];
|
||||
|
||||
data[i * 2 + 1] = (
|
||||
in1 * cos(factor * (i + 0))
|
||||
- in4 * sin(factor * ((ndst - i - 1)))
|
||||
);
|
||||
|
||||
data[i * 2 + 0] = (
|
||||
in1 * sin(factor * (i))
|
||||
- in4 * cos(factor * ((ndst - i - 1)))
|
||||
);
|
||||
|
||||
data[ndst - i * 2 - 2] = (
|
||||
-in3 * cos(factor * (ndst - i - 1))
|
||||
+ in2 * sin(factor * (ndst - i - 1))
|
||||
);
|
||||
|
||||
data[ndst - i * 2 - 1] = (
|
||||
+in3 * sin(factor * (i + 1))
|
||||
- in2 * cos(-factor * (i + 1))
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
esp_err_t error = ESP_OK;
|
||||
error = dsps_fft2r_fc32(data, ndst / 2);
|
||||
if (error != ESP_OK) {
|
||||
return error;
|
||||
}
|
||||
error = dsps_bit_rev_fc32(data, ndst / 2);
|
||||
if (error != ESP_OK) {
|
||||
return error;
|
||||
}
|
||||
|
||||
for (int i = 0; i < ndst / 4; i++) {
|
||||
in1 = data[2 * i + 0];
|
||||
in2 = data[2 * i + 1];
|
||||
|
||||
in3 = data[ndst - 2 * i - 2];
|
||||
in4 = data[ndst - 2 * i - 1];
|
||||
|
||||
data[i * 2 + 0] = (
|
||||
in1 * cos(factor * (0 + i))
|
||||
+ in2 * sin(factor * (0 + i))
|
||||
);
|
||||
|
||||
data[ndst - i * 2 - 2 + 1] = (
|
||||
-in1 * cos(factor * (ndst / 2 - i))
|
||||
+ in2 * sin(factor * (ndst / 2 - i))
|
||||
);
|
||||
|
||||
data[i * 2 + 1] = (
|
||||
-in3 * cos(factor * (1 + i))
|
||||
+ in4 * sin(factor * (1 + i))
|
||||
);
|
||||
|
||||
data[ndst - i * 2 - 2 + 0] = (
|
||||
+in3 * cos(factor * (ndst / 2 - i - 1))
|
||||
+ in4 * sin(factor * (ndst / 2 - i - 1))
|
||||
);
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _dsps_dct_H_
|
||||
#define _dsps_dct_H_
|
||||
#include "dsp_err.h"
|
||||
#include "sdkconfig.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief DCT of radix 2, unscaled
|
||||
*
|
||||
* Discrete Cosine Transform type II of radix 2, unscaled
|
||||
* Function is FFT based
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[inout] data: input/output array with size of N*2. An elements located: Re[0],Re[1], , ... Re[N-1], any data... up to N*2
|
||||
* result of DCT will be stored to this array from 0...N-1.
|
||||
* Size of data array must be N*2!!!
|
||||
* @param[in] N: Size of DCT transform. Size of data array must be N*2!!!
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dct_f32(float *data, int N);
|
||||
/**@}*/
|
||||
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief DCT of radix 2, type IV, unscaled
|
||||
*
|
||||
* Discrete Cosine Transform type IV of radix 2, unscaled
|
||||
* Function is FFT based
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[inout] data: input/output array with size of N. An elements located: Re[0],Re[1], , ... Re[N-1]
|
||||
* result of DCT will be stored to this array from 0...N-1.
|
||||
* Size of data array must be N
|
||||
* @param[in] N: Size of DCT transform. Size of data array must be N
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dctiv_f32(float *data, int N);
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief DST of radix 2, type IV, unscaled
|
||||
*
|
||||
* Discrete Sine Transform type IV of radix 2, unscaled
|
||||
* Function is FFT based
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[inout] data: input/output array with size of N. Elements are located as: Re[0], Re[1], ... Re[N-1]
|
||||
* result of DST will be stored to this array from 0...N-1.
|
||||
* Size of data array must be N
|
||||
* @param[in] N: Size of DST transform. Size of data array must be N
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dstiv_f32(float *data, int N);
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief Inverse DCT of radix 2
|
||||
*
|
||||
* Inverse Discrete Cosine Transform type II of radix 2, unscaled
|
||||
* Function is FFT based
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[inout] data: input/output array with size of N*2. An elements located: Re[0],Re[1], , ... Re[N-1], any data... up to N*2
|
||||
* result of DCT will be stored to this array from 0...N-1.
|
||||
* Size of data array must be N*2!!!
|
||||
* @param[in] N: Size of DCT transform. Size of data array must be N*2!!!
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dct_inv_f32(float *data, int N);
|
||||
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief DCTs
|
||||
*
|
||||
* Direct DCT type II and Inverse DCT type III, unscaled
|
||||
* These functions are used as a general-purpose reference. They are not optimized!
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] data: input array with size of N. Elements are located as: Re[0], Re[1], ... Re[N-1]
|
||||
* @param[in] N: Size of DCT transform. The data and result arrays must each hold N elements
|
||||
* @param[out] result: output result array with size of N.
|
||||
*
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dct_f32_ref(float *data, int N, float *result);
|
||||
esp_err_t dsps_dct_inverce_f32_ref(float *data, int N, float *result);
|
||||
/**@}*/
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _dsps_dct_H_
|
||||
@@ -0,0 +1,166 @@
|
||||
// Copyright 2018-2020 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dsps_view.h"
|
||||
#include "dsps_dct.h"
|
||||
#include "dsps_fft2r.h"
|
||||
#include "dsp_tests.h"
|
||||
#include <malloc.h>
|
||||
|
||||
|
||||
static const char *TAG = "dsps_dct";
|
||||
|
||||
/**
 * Round-trip test of the reference (non-FFT) transforms.
 *
 * A sine wave is transformed with dsps_dct_f32_ref() and brought back with
 * dsps_dct_inverce_f32_ref(). The test expects the round trip to reproduce the
 * input scaled by N / 2, within an absolute tolerance normalised by N / 2.
 * Both transforms must report ESP_OK; a failing transform aborts the test
 * instead of letting the comparison run on stale data.
 */
TEST_CASE("dsps_dct_f32 functionality", "[dsps]")
{
    const int N = 64;
    const int check_bin = 4;

    // data[0..N-1] holds the signal, data[N..2N-1] receives the DCT coefficients.
    float *data = calloc(1024 * 2, sizeof(float));
    TEST_ASSERT_NOT_NULL(data);

    // Untouched copy of the input, used as the expected value after the round trip.
    float *data_ref = calloc(1024 * 2, sizeof(float));
    TEST_ASSERT_NOT_NULL(data_ref);

    for (int i = 0 ; i < N ; i++) {
        data[i] = 2 * sinf(M_PI / N * check_bin * 2 * i);
        data_ref[i] = data[i];
        data[i + N] = 0;
        data_ref[i + N] = 0;
    }

    TEST_ESP_OK(dsps_dct_f32_ref(data, N, &data[N]));
    dsps_view(&data[N], 32, 32, 10, -2, 2, '.');

    TEST_ESP_OK(dsps_dct_inverce_f32_ref(&data[N], N, data));
    dsps_view(&data[0], 32, 32, 10, -2, 2, '.');

    for (int i = 0; i < N; i++) {
        ESP_LOGD(TAG, "DCT data[%i] = %2.3f\n", i, data[N + i]);
    }

    float abs_tol = 1e-5;
    // NOTE(review): the comparison starts at index 1, so sample 0 is never checked - confirm this is intended.
    for (int i = 1; i < N; i++) {
        ESP_LOGD(TAG, "data[%i] = %f, ref_data = %f\n", i, data[i], data_ref[i]*N / 2);
        float error = fabs(data[i] - data_ref[i] * N / 2) / (N / 2);
        if (error > abs_tol) {
            ESP_LOGE(TAG, "data[%i] = %f, ref_data = %f, error= %f\n", i, data[i], data_ref[i]*N / 2, error);
            TEST_ASSERT_MESSAGE (false, "Result out of range!\n");
        }
    }

    free(data);
    free(data_ref);
}
|
||||
|
||||
/**
 * Compares the FFT-based DCT against the reference implementation.
 *
 * Step 1: dsps_dct_f32() must match dsps_dct_f32_ref() on the same sine input.
 * Step 2: dsps_dct_inv_f32() applied to that result must give back the input
 *         once divided by N / 2.
 * Every transform's return code is checked so a failing call cannot be
 * mistaken for a numerical mismatch (or silently pass).
 */
TEST_CASE("dsps_dct_f32 functionality Fast DCT", "[dsps]")
{
    esp_err_t ret = dsps_fft2r_init_fc32(NULL, CONFIG_DSP_MAX_FFT_SIZE);
    TEST_ESP_OK(ret);

    // data[0..N-1] is the signal, data[N..2N-1] receives the reference DCT.
    float *data = (float *)memalign(16, sizeof(float) * 1024 * 2);
    TEST_ASSERT_NOT_NULL(data);

    // Working buffer for the fast transforms, transformed in place.
    float *data_fft = (float *)memalign(16, sizeof(float) * 1024 * 2);
    TEST_ASSERT_NOT_NULL(data_fft);

    int N = 64;
    int check_bin = 4;
    for (int i = 0 ; i < N ; i++) {
        data[i] = 2 * sin(M_PI / N * check_bin * 2 * i);
        data_fft[i] = data[i];
        data[i + N] = 0;
        data_fft[i + N] = 0;
    }

    ret = dsps_dct_f32_ref(data, N, &data[N]);
    TEST_ESP_OK(ret);
    ret = dsps_dct_f32(data_fft, N);
    TEST_ESP_OK(ret);

    float abs_tol = 1e-5;

    // Forward transform: fast result versus reference result.
    for (int i = 0; i < N; i++) {
        ESP_LOGD(TAG, "DCT data[%i] = %2.3f, data_fft = %2.3f\n", i, data[N + i], data_fft[i]);
        float error = fabs(data[N + i] - data_fft[i]) / (N / 2);
        if (error > abs_tol) {
            ESP_LOGE(TAG, "DCT data[%i] = %f, data_fft = %f, error = %f\n", i, data[N + i], data_fft[i], error);
            TEST_ASSERT_MESSAGE (false, "Result out of range!\n");
        }
    }

    ret = dsps_dct_inv_f32(data_fft, N);
    TEST_ESP_OK(ret);

    // Inverse transform: rescaled fast result versus the original signal.
    for (int i = 0; i < N; i++) {
        ESP_LOGD(TAG, "IDCT data[%i] = %2.3f, data_fft = %2.3f\n", i, data[i], data_fft[i] / N * 2);
        float error = fabs(data[i] - data_fft[i] / N * 2) / (N / 2);
        if (error > abs_tol) {
            ESP_LOGE(TAG, "IDCT data[%i] = %f, data_fft = %f, error = %f\n", i, data[i], data_fft[i] / N * 2, error);
            TEST_ASSERT_MESSAGE (false, "Result out of range!\n");
        }
    }

    dsps_fft2r_deinit_fc32();
    free(data);
    free(data_fft);
}
|
||||
|
||||
/**
 * Measures the CPU cycles one dsps_dct_f32() call takes for an N-point DCT and
 * logs the result. Only the transform call sits between the two cycle-counter
 * reads; its return code is checked after the measurement.
 */
TEST_CASE("dsps_dct_f32 benchmark", "[dsps]")
{
    esp_err_t ret = dsps_fft2r_init_fc32(NULL, CONFIG_DSP_MAX_FFT_SIZE);
    TEST_ESP_OK(ret);

    // Single working buffer: N samples followed by N zeroed entries.
    float *data = calloc(1024 * 2, sizeof(float));
    TEST_ASSERT_NOT_NULL(data);

    int N = 64;
    int check_bin = 4;
    for (int i = 0 ; i < N ; i++) {
        data[i] = 2 * sin(M_PI / N * check_bin * 2 * i);
        data[i + N] = 0;
    }

    unsigned int start_b = dsp_get_cpu_cycle_count();
    ret = dsps_dct_f32(data, N);
    unsigned int end_b = dsp_get_cpu_cycle_count();

    TEST_ESP_OK(ret);

    // Unsigned subtraction stays correct across a counter wrap; no float round trip needed.
    unsigned int cycles = end_b - start_b;
    ESP_LOGI(TAG, "Benchmark dsps_dct_f32 - %6i cycles for %6i DCT points FFT.", (int)cycles, N);

    dsps_fft2r_deinit_fc32();
    free(data);
}
|
||||
@@ -0,0 +1,398 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_61, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_s16_aes3
|
||||
.type dspi_dotprod_off_s16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s16_aes3
|
||||
dspi_dotprod_off_s16_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_off_s16_aes3: # 0x4
|
||||
entry a1,128 #
|
||||
l32i.n a10,a2,4 # [0] id:760
|
||||
l32i.n a12,a2,12 # [1] id:759
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a12,a8,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
|
||||
|
||||
l32i.n a13,a2,8 # [0] id:761
|
||||
l32i.n a9,a2,16 # [1] id:762
|
||||
mull a11,a13,a6 # [2]
|
||||
blt a9,a11,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
|
||||
|
||||
l32i.n a15,a3,4 # [0] id:764
|
||||
l32i.n a14,a3,12 # [1] id:763
|
||||
mull a11,a15,a5 # [2]
|
||||
blt a14,a11,.LBB83_dspi_dotprod_off_s16_aes3 # [4]
|
||||
|
||||
l32i.n a8,a3,16 # [0] id:766
|
||||
l32i.n a9,a3,8 # [1] id:765
|
||||
s32i a9,a1,88 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB83_dspi_dotprod_off_s16_aes3 # [5]
|
||||
|
||||
l32i.n a8,a3,0 # [0] id:767
|
||||
s32i a8,a1,84 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_36354 # [2]
|
||||
|
||||
bne a14,a11,.Lt_0_36354 # [0]
|
||||
|
||||
bnei a15,1,.Lt_0_36354 # [0]
|
||||
|
||||
l32i a9,a1,88 # [0] gra_spill_temp_2
|
||||
beqi a9,1,.Lt_0_19458 # [2]
|
||||
|
||||
.Lt_0_36354: # 0x46
|
||||
.Lt_0_19714: # 0x46
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
l16si a8,a1,128 # [6] id:768 offset+0x0
|
||||
s32i.n a8,a1,0 # [7] id:875
|
||||
.type dspi_dotprod_off_s16_ansi, @function
|
||||
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB83_dspi_dotprod_off_s16_aes3: # 0x5e
|
||||
l32r a2,.LC0_1_61 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.Lt_0_19458: # 0x63
|
||||
addi.n a9,a10,-1 # [0]
|
||||
bnez a9,.Lt_0_37122 # [1]
|
||||
|
||||
addi.n a10,a13,-1 # [0]
|
||||
bnez a10,.Lt_0_37122 # [1]
|
||||
|
||||
extui a11,a5,0,3 # [0]
|
||||
bnez.n a11,.Lt_0_37122 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_37122 # [0]
|
||||
|
||||
movi.n a14,32 # [0]
|
||||
blt a14,a5,.LBB27_dspi_dotprod_off_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_37634: # 0x7a
|
||||
.Lt_0_21506: # 0x7a
|
||||
l32i a15,a1,84 # [0] gra_spill_temp_1
|
||||
l32i.n a2,a2,0 # [1] id:769
|
||||
l16si a9,a1,128 # [2] id:768 offset+0x0
|
||||
mull a10,a12,a13 # [3]
|
||||
addi a8,a1,16 # [4] temp_offset
|
||||
slli a10,a10,1 # [5]
|
||||
s32i a10,a1,80 # [6] gra_spill_temp_0
|
||||
movi.n a10,2 # [7]
|
||||
# loop-count fixed at 2
|
||||
loop a10,.LBB137_dspi_dotprod_off_s16_aes3 # [8]
|
||||
|
||||
.LBB132_dspi_dotprod_off_s16_aes3: # 0x93
|
||||
s16i a9,a8,0 # [0*II+0] id:770 temp_offset+0x0
|
||||
s16i a9,a8,2 # [0*II+1] id:770 temp_offset+0x0
|
||||
s16i a9,a8,4 # [0*II+2] id:770 temp_offset+0x0
|
||||
s16i a9,a8,6 # [0*II+3] id:770 temp_offset+0x0
|
||||
s16i a9,a8,8 # [0*II+4] id:770 temp_offset+0x0
|
||||
s16i a9,a8,10 # [0*II+5] id:770 temp_offset+0x0
|
||||
s16i a9,a8,12 # [0*II+6] id:770 temp_offset+0x0
|
||||
s16i a9,a8,14 # [0*II+7] id:770 temp_offset+0x0
|
||||
addi a8,a8,16 # [0*II+8]
|
||||
|
||||
.LBB137_dspi_dotprod_off_s16_aes3: # 0xae
|
||||
mov.n a3,a6 # [0]
|
||||
addi a11,a5,-24 # [1]
|
||||
addi a12,a1,24 # [3] temp_offset+8
|
||||
movi.n a13,0 # [4]
|
||||
wur.sar_byte a13 # [5]
|
||||
wur.accx_0 a13 # [6]
|
||||
wur.accx_1 a13 # [7]
|
||||
ee.vld.128.ip q6,a12,0 # [8] id:771
|
||||
s32i.n a12,a1,48 # [9] offset_data_ptr
|
||||
beqz a11,.LBB34_dspi_dotprod_off_s16_aes3 # [10]
|
||||
|
||||
.Lt_0_25602: # 0xc8
|
||||
.Lt_0_25090: # 0xc8
|
||||
ee.vld.128.ip q0,a15,16 # [0] id:786
|
||||
addi a14,a5,-16 # [1]
|
||||
beqz a14,.LBB40_dspi_dotprod_off_s16_aes3 # [2]
|
||||
|
||||
.Lt_0_27138: # 0xd1
|
||||
.Lt_0_26626: # 0xd1
|
||||
addi a8,a5,-8 # [0]
|
||||
beqz a8,.LBB46_dspi_dotprod_off_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_28674: # 0xd7
|
||||
.Lt_0_28162: # 0xd7
|
||||
addi a9,a5,-32 # [0]
|
||||
beqz a9,.LBB52_dspi_dotprod_off_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_30210: # 0xdd
|
||||
.Lt_0_29698: # 0xdd
|
||||
addi a10,a5,-64 # [0]
|
||||
beqz a10,.LBB58_dspi_dotprod_off_s16_aes3 # [1]
|
||||
|
||||
movi.n a11,64 # [0]
|
||||
bge a11,a5,.Lt_0_33026 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a2,16 # [1] id:848
|
||||
ee.ld.128.usar.ip q2,a2,16 # [2] id:849
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:850
|
||||
beqz.n a3,.Lt_0_33026 # [5]
|
||||
|
||||
slli a8,a5,1 # [0]
|
||||
l32i a14,a1,80 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a8 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
.Lt_0_33794: # 0x10c
|
||||
beqz.n a13,.Lt_0_34050 # [0]
|
||||
|
||||
loopnez a13,.LBB273_dspi_dotprod_off_s16_aes3 # [0]
|
||||
|
||||
.LBB271_dspi_dotprod_off_s16_aes3: # 0x111
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0*II+0] id:851
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [0*II+1] id:852
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q1,q2,q3,q0 # [0*II+3] id:853
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6 # [0*II+4] id:854
|
||||
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q4,q3,q0,q1 # [0*II+6] id:855
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [0*II+7] id:856
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [0*II+9] id:857
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+10] id:858
|
||||
|
||||
.LBB273_dspi_dotprod_off_s16_aes3: # 0x131
|
||||
|
||||
.Lt_0_34050: # 0x131
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0] id:859
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [1] id:860
|
||||
movi.n a9,32 # [2]
|
||||
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q1,q2,q3,q0 # [3] id:861
|
||||
ee.vmulas.s16.accx.ld.ip q5,a15,16,q2,q6 # [4] id:862
|
||||
movi.n a10,-16 # [5]
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q5,q3,q0,q7 # [6] id:863
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [7] id:865
|
||||
ee.ld.128.usar.xp q1,a2,a9 # [8] id:864
|
||||
addi.n a12,a12,1 # [9]
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [10] id:866
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [11] id:867
|
||||
bne a12,a3,.Lt_0_33794 # [12]
|
||||
|
||||
.Lt_0_33026: # 0x15d
|
||||
.Lt_0_32770: # 0x15d
|
||||
rur.accx_0 a9 # [0]
|
||||
rur.accx_1 a10 # [1]
|
||||
blti a7,1,.Lt_0_35586 # [2]
|
||||
|
||||
movi.n a2,0 # [0]
|
||||
addi a13,a7,-33 # [1]
|
||||
addi.n a14,a7,-1 # [2]
|
||||
ssr a14 # [3]
|
||||
sra a12,a10 # [4]
|
||||
src a11,a10,a9 # [5]
|
||||
movgez a11,a12,a13 # [6]
|
||||
addi.n a11,a11,1 # [7]
|
||||
srai a11,a11,1 # [8]
|
||||
s16i a11,a4,0 # [9] id:873
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_37122: # 0x183
|
||||
.Lt_0_20738: # 0x183
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
l16si a8,a1,128 # [6] id:768 offset+0x0
|
||||
s32i.n a8,a1,0 # [7] id:876
|
||||
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB27_dspi_dotprod_off_s16_aes3: # 0x19b
|
||||
extui a9,a5,0,1 # [0]
|
||||
beqz a9,.Lt_0_37634 # [1]
|
||||
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
l16si a8,a1,128 # [6] id:768 offset+0x0
|
||||
s32i.n a8,a1,0 # [7] id:877
|
||||
call8 dspi_dotprod_off_s16_ansi # [8] dspi_dotprod_off_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB34_dspi_dotprod_off_s16_aes3: # 0x1b9
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i a12,a1,80 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q0,a2,16 # [3] id:776
|
||||
ee.ld.128.usar.ip q2,a2,16 # [4] id:777
|
||||
addi a12,a12,-32 # [5]
|
||||
ee.src.q.ld.ip q3,a2,16,q0,q2 # [6] id:778
|
||||
loopgtz a6,.LBB159_dspi_dotprod_off_s16_aes3 # [7]
|
||||
|
||||
.LBB157_dspi_dotprod_off_s16_aes3: # 0x1cf
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q0,q6 # [0*II+0] id:779
|
||||
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q1,q0,q2,q3 # [0*II+2] id:780
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6 # [0*II+3] id:781
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q2,q3,q1 # [0*II+5] id:782
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q3,q6 # [0*II+6] id:784
|
||||
ee.ld.128.usar.xp q0,a2,a10 # [0*II+7] id:783
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q1,q3,q0,q2 # [0*II+9] id:785
|
||||
|
||||
.LBB159_dspi_dotprod_off_s16_aes3: # 0x1ea
|
||||
j .Lt_0_25602 # [0]
|
||||
|
||||
.LBB40_dspi_dotprod_off_s16_aes3: # 0x1ed
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
srli a3,a6,1 # [2]
|
||||
l32i a12,a1,80 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:787
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:788
|
||||
addi a12,a12,-16 # [7]
|
||||
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:789
|
||||
loopnez a3,.LBB182_dspi_dotprod_off_s16_aes3 # [9]
|
||||
|
||||
.LBB180_dspi_dotprod_off_s16_aes3: # 0x206
|
||||
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a11,q0,q1,q2,q3 # [0*II+0] id:790
|
||||
ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6 # [0*II+1] id:791
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:792
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q2,q1,q0 # [0*II+4] id:793
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q2,q6 # [0*II+5] id:794
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q1,q0,q3 # [0*II+7] id:795
|
||||
ee.vmulas.s16.accx.ld.ip q3,a15,16,q1,q6 # [0*II+8] id:796
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+9] id:797
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q3,q0,q1,q2 # [0*II+11] id:798
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+12] id:799
|
||||
|
||||
.LBB182_dspi_dotprod_off_s16_aes3: # 0x22c
|
||||
j .Lt_0_27138 # [0]
|
||||
|
||||
.LBB46_dspi_dotprod_off_s16_aes3: # 0x22f
|
||||
movi.n a10,-16 # [0]
|
||||
l32i a11,a1,80 # [1] gra_spill_temp_0
|
||||
addi a8,a2,16 # [2]
|
||||
addi a11,a11,16 # [3]
|
||||
ee.ld.128.usar.xp q2,a8,a10 # [4] id:800
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [5] id:801
|
||||
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [7] id:802
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [8] id:803
|
||||
srli a3,a3,2 # [9]
|
||||
mov.n a2,a8 # [10]
|
||||
loopnez a3,.LBB205_dspi_dotprod_off_s16_aes3 # [11]
|
||||
|
||||
.LBB203_dspi_dotprod_off_s16_aes3: # 0x24e
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q2,q3 # [0*II+0] id:804
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6 # [0*II+1] id:805
|
||||
ee.ld.128.usar.xp q1,a2,a11 # [0*II+2] id:806
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q2,q1,q3 # [0*II+4] id:807
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q2,q6 # [0*II+5] id:808
|
||||
ee.ld.128.usar.xp q4,a2,a11 # [0*II+6] id:809
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q1,q4,q3 # [0*II+8] id:810
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q1,q6 # [0*II+9] id:811
|
||||
ee.ld.128.usar.xp q1,a2,a11 # [0*II+10] id:812
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a10,q0,q4,q1,q3 # [0*II+12] id:813
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q4,q6 # [0*II+13] id:814
|
||||
ee.ld.128.usar.xp q2,a2,a11 # [0*II+14] id:815
|
||||
|
||||
.LBB205_dspi_dotprod_off_s16_aes3: # 0x27a
|
||||
j .Lt_0_28674 # [0]
|
||||
|
||||
.LBB52_dspi_dotprod_off_s16_aes3: # 0x27d
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i a12,a1,80 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:816
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:817
|
||||
sub a12,a12,a13 # [6]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:818
|
||||
addi a12,a12,16 # [9]
|
||||
loopnez a3,.LBB228_dspi_dotprod_off_s16_aes3 # [10]
|
||||
|
||||
.LBB226_dspi_dotprod_off_s16_aes3: # 0x299
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q1,q2,q3 # [0*II+0] id:819
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6 # [0*II+1] id:820
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q3,q0 # [0*II+3] id:821
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6 # [0*II+4] id:822
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q0,q4 # [0*II+6] id:823
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q3,q6 # [0*II+7] id:825
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+8] id:824
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q0,q1,q2 # [0*II+10] id:826
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+11] id:827
|
||||
|
||||
.LBB228_dspi_dotprod_off_s16_aes3: # 0x2bc
|
||||
j .Lt_0_30210 # [0]
|
||||
|
||||
.LBB58_dspi_dotprod_off_s16_aes3: # 0x2bf
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i a12,a1,80 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:828
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:829
|
||||
sub a12,a12,a13 # [7]
|
||||
addi a12,a12,16 # [8]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:830
|
||||
mov.n a8,a2 # [10]
|
||||
loopnez a3,.LBB250_dspi_dotprod_off_s16_aes3 # [11]
|
||||
|
||||
.LBB248_dspi_dotprod_off_s16_aes3: # 0x2dd
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:831
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q1,q6 # [0*II+1] id:832
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:833
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q2,q6 # [0*II+4] id:834
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:835
|
||||
ee.vmulas.s16.accx.ld.ip q5,a15,16,q3,q6 # [0*II+7] id:836
|
||||
ee.vmulas.s16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:837
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+10] id:838
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:839
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q4,q6 # [0*II+13] id:840
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:841
|
||||
ee.vmulas.s16.accx.ld.ip q1,a15,16,q1,q6 # [0*II+16] id:842
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:843
|
||||
ee.vmulas.s16.accx.ld.ip q4,a15,16,q5,q6 # [0*II+19] id:845
|
||||
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:844
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:846
|
||||
ee.vmulas.s16.accx.ld.ip q0,a15,16,q0,q6 # [0*II+23] id:847
|
||||
|
||||
.LBB250_dspi_dotprod_off_s16_aes3: # 0x320
|
||||
j .Lt_0_33026 # [0]
|
||||
|
||||
.Lt_0_35586: # 0x323
|
||||
movi.n a2,0 # [0]
|
||||
sext a14,a9,15 # [1]
|
||||
s16i a14,a4,0 # [2] id:874
|
||||
retw.n # [3]
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Dot product of a 2D int16 image with a 2D int16 filter plus a constant offset (ANSI C)
 *
 * Accumulates in_image[x, y] * (filter[x, y] + offset) over a count_x by count_y
 * window into a 64-bit accumulator, rounds, shifts right by @p shift bits and stores
 * the result, truncated to 16 bits, in @p out_value.
 *
 * @param[in]  in_image  input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param[in]  filter    filter descriptor with the same layout as @p in_image
 * @param[out] out_value receives the shifted result
 * @param[in]  count_x   number of elements processed per row
 * @param[in]  count_y   number of rows processed
 * @param[in]  shift     right shift applied to the accumulated sum; values <= 0 store
 *                       the sum without rounding or shifting (must be below 64)
 * @param[in]  offset    constant added to every filter element before multiplying
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if the window does not fit into either image
 */
esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset)
{
    // The requested window, walked at each image's own step, must fit inside its stride.
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    const int16_t *i_data = (const int16_t *)in_image->data;
    const int16_t *f_data = (const int16_t *)filter->data;
    // Distance, in elements, between two consecutive processed rows.
    const int i_step = in_image->stride_x * in_image->step_y;
    const int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // Multiply in 64 bits: (-32768) * (-32768 + -32768) does not fit in int32_t.
            acc += (int64_t)i_data[in_image->step_x * x] * ((int32_t)f_data[filter->step_x * x] + (int32_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest, then scale. A shift of zero (or less) has no rounding term;
    // evaluating 1 << (shift - 1) in that case would be undefined behaviour.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s16_arp4
|
||||
.global dspi_dotprod_off_s16_ansi
|
||||
.type dspi_dotprod_off_s16_arp4,@function
|
||||
|
||||
// Offset dot product of two int16 images. When the fast-path condition below holds, the
// vector loop is used; otherwise control jumps to dspi_dotprod_off_s16_ansi.
// NOTE(review): no stride/window range check is visible in the vector path - confirm
// callers validate the arguments.
// esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
|
||||
dspi_dotprod_off_s16_arp4:
|
||||
// in_image - a0
|
||||
// filter - a1
|
||||
// out_value - a2
|
||||
// count_x - a3
|
||||
// count_y - a4
|
||||
// shift - a5
|
||||
// offset - a6
|
||||
|
||||
// i_data - t0
|
||||
// f_data - t1
|
||||
// i_step - t2
|
||||
// f_step - t3
|
||||
// current i_data - t4
|
||||
// current f_data - t5
|
||||
|
||||
lw t1, 4(a0) // load in_image->step_x
|
||||
lw t2, 4(a1) // load filter->step_x
|
||||
or t1, t1, t2 // combine both step_x values
|
||||
addi t1, t1, -1 // 0 only when (in_image->step_x | filter->step_x) == 1
|
||||
andi t2, a3, 7 // count_x % 8
|
||||
or t1, t1, t2 // t1 == 0 <=> steps OR to 1 and count_x is a multiple of 8
|
||||
|
||||
beqz t1, .dspi_dotprod_off_s16_arp4_body
|
||||
j dspi_dotprod_off_s16_ansi // fast-path condition not met: jump to the ANSI version, a0..a6 still hold the arguments
|
||||
|
||||
.dspi_dotprod_off_s16_arp4_body:
|
||||
add sp, sp, -16 // reserve 16 bytes of stack scratch space
|
||||
|
||||
sw a6, 0(sp) // put offset in memory so it can be loaded into a vector register
|
||||
mv t6, sp
|
||||
esp.vldbc.16.ip q2, t6, 0 // presumably broadcasts the 16-bit offset into every lane of q2 - TODO confirm
|
||||
|
||||
lw t0, 0(a0) // i_data
|
||||
lw t1, 0(a1) // f_data
|
||||
|
||||
|
||||
lw t2, 8(a0) // step_y
|
||||
lw t4, 12(a0) // stride_x
|
||||
mul t2, t4, t2 // i_step = stride_x * step_y, in elements
|
||||
slli t2, t2, 1 // i_step in bytes (elements are 2 bytes wide)
|
||||
|
||||
lw t3, 8(a1) // step_y
|
||||
lw t5, 12(a1) // stride_x
|
||||
mul t3, t5, t3 // f_step = stride_x * step_y, in elements
|
||||
slli t3, t3, 1 // f_step in bytes (elements are 2 bytes wide)
|
||||
|
||||
srli t6, a3, 3 // t6 = count_x / 8, the count handed to esp.lp.setup below
|
||||
|
||||
|
||||
addi a7, a5, -1 // a7 = shift - 1
|
||||
li t4, 1
|
||||
sll t4, t4, a7 // t4 = 1 << (shift - 1), the rounding term. NOTE(review): shift == 0 makes a7 == -1 - confirm callers never pass it
|
||||
esp.zero.xacc // clear the accumulator (per mnemonic)
|
||||
esp.movx.w.xacc.l t4 // seed the accumulator's low word with the rounding term (per mnemonic)
|
||||
|
||||
.loop_count_y:
|
||||
mv t4, t0 // t4 = start of the current image row
|
||||
mv t5, t1 // t5 = start of the current filter row
|
||||
esp.vld.128.ip q1, t5, 16 // q1 - f_data (t5 walks the filter row)
|
||||
|
||||
esp.lp.setup 0, t6, .loop_count_x // presumably repeats the instructions up to and including .loop_count_x t6 times - TODO confirm
|
||||
esp.vld.128.ip q0, t4, 16 // q0 - i_data (t4 walks the image row)
|
||||
esp.vadd.s16 q3, q2, q1 // q3 = filter values + offset
|
||||
.loop_count_x: esp.vmulas.s16.xacc.ld.ip q1, t5, 16, q0, q3 // accumulate q0 * q3 and load the next f_data vector into q1 (per mnemonic)
|
||||
|
||||
add t0, t0, t2 // next image row
|
||||
add t1, t1, t3 // next filter row
|
||||
add a4,a4, -1 // one row done
|
||||
bgtz a4, .loop_count_y
|
||||
|
||||
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
|
||||
sh t5, 0(a2) // store result to output buffer
|
||||
|
||||
li a0,0 // return 0
|
||||
add sp,sp,16 // release the stack scratch space
|
||||
ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,408 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_57, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_s8_aes3
|
||||
.type dspi_dotprod_off_s8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s8_aes3
|
||||
dspi_dotprod_off_s8_aes3:
.LBB1_dspi_dotprod_off_s8_aes3:
	# Entry, argument validation and dispatch.
	# Register arguments (windowed ABI): a2 = in_image, a3 = filter, a4 = out_value,
	# a5 = count_x, a6 = count_y, a7 = shift. The seventh argument (int8_t offset)
	# is passed on the stack and is read at a1+112 after the entry instruction.
	entry	a1,112

	# Range checks: same four conditions as dspi_dotprod_off_s8_ansi.
	# Any failure returns the literal .LC0_1_57.
	l32i.n	a10,a2,4			# in_image->step_x
	l32i.n	a12,a2,12			# in_image->stride_x
	mull	a8,a10,a5			# step_x * count_x
	blt	a12,a8,.LBB86_dspi_dotprod_off_s8_aes3

	l32i.n	a13,a2,8			# in_image->step_y
	l32i.n	a9,a2,16			# in_image field at +16 (presumably stride_y)
	mull	a11,a13,a6			# step_y * count_y
	blt	a9,a11,.LBB86_dspi_dotprod_off_s8_aes3

	l32i.n	a15,a3,4			# filter->step_x
	l32i.n	a14,a3,12			# filter->stride_x
	mull	a11,a15,a5			# step_x * count_x
	blt	a14,a11,.LBB86_dspi_dotprod_off_s8_aes3

	l32i.n	a8,a3,16			# filter field at +16 (presumably stride_y)
	l32i.n	a9,a3,8				# filter->step_y
	s32i	a9,a1,72			# spill filter->step_y
	mull	a9,a9,a6			# step_y * count_y
	blt	a8,a9,.LBB86_dspi_dotprod_off_s8_aes3

	# Vector path preconditions on the filter: even data address,
	# stride_x == step_x * count_x, step_x == 1 and step_y == 1.
	l32i.n	a8,a3,0				# filter->data
	s32i	a8,a1,68			# spill filter->data
	bbsi	a8,0,.Lt_0_35330		# bit 0 of the address set -> fallback
	bne	a14,a11,.Lt_0_35330		# filter rows not densely packed -> fallback
	bnei	a15,1,.Lt_0_35330		# filter->step_x != 1 -> fallback
	l32i	a11,a1,72			# reload filter->step_y
	beqi	a11,1,.Lt_0_18946		# filter is usable, go check the image

.Lt_0_35330:
.Lt_0_19202:
	# Fallback to the C reference implementation. The offset argument must be
	# forwarded as well: it is reloaded from the incoming stack slot, sign-extended
	# and placed in the outgoing stack slot at a1+0, the same way the u16 variant
	# forwards its offset. Calling the plain (non-offset) dot product here would
	# silently ignore offset.
	mov.n	a10,a2				# in_image
	mov.n	a11,a3				# filter
	mov.n	a12,a4				# out_value
	mov.n	a13,a5				# count_x
	mov.n	a14,a6				# count_y
	mov.n	a15,a7				# shift
	l8ui	a8,a1,112			# incoming offset byte
	sext	a8,a8,7				# sign-extend int8_t
	s32i.n	a8,a1,0				# outgoing seventh argument
	.type	dspi_dotprod_off_s8_ansi, @function
	call8	dspi_dotprod_off_s8_ansi

	mov.n	a2,a10				# propagate the callee return value
	retw.n

.LBB86_dspi_dotprod_off_s8_aes3:
	# Range check failed: return the error constant held in the literal pool.
	l32r	a2,.LC0_1_57
	retw.n
|
||||
|
||||
.Lt_0_18946: # 0x5e
|
||||
addi.n a14,a10,-1 # [0]
|
||||
bnez a14,.Lt_0_36098 # [1]
|
||||
|
||||
addi.n a15,a13,-1 # [0]
|
||||
bnez a15,.Lt_0_36098 # [1]
|
||||
|
||||
extui a8,a5,0,4 # [0]
|
||||
bnez.n a8,.Lt_0_36098 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_36098 # [0]
|
||||
|
||||
movi.n a9,64 # [0]
|
||||
blt a9,a5,.LBB27_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_36610: # 0x75
|
||||
.Lt_0_20994: # 0x75
|
||||
mov.n a8,a1 # [0]
|
||||
l8ui a9,a1,112 # [1] id:754 offset+0x0
|
||||
l32i.n a15,a2,0 # [2] id:753
|
||||
mull a10,a12,a13 # [3]
|
||||
l32i a2,a1,68 # [4] gra_spill_temp_1
|
||||
s32i a10,a1,64 # [5] gra_spill_temp_0
|
||||
sext a9,a9,7 # [6]
|
||||
movi.n a10,4 # [7]
|
||||
# loop-count fixed at 4
|
||||
loop a10,.LBB140_dspi_dotprod_off_s8_aes3 # [8]
|
||||
|
||||
.LBB135_dspi_dotprod_off_s8_aes3: # 0x8d
|
||||
s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
|
||||
s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
|
||||
s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
|
||||
s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
|
||||
s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
|
||||
s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
|
||||
s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
|
||||
s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
|
||||
addi.n a8,a8,8 # [0*II+8]
|
||||
|
||||
.LBB140_dspi_dotprod_off_s8_aes3: # 0xa7
|
||||
mov.n a3,a6 # [0]
|
||||
addi a11,a5,-48 # [1]
|
||||
|
||||
addi.n a12,a1,8 # [3] temp_offset+8
|
||||
movi.n a13,0 # [4]
|
||||
wur.accx_0 a13 # [5]
|
||||
wur.accx_1 a13 # [6]
|
||||
ee.vld.128.ip q6,a12,0 # [7] id:756
|
||||
s32i.n a12,a1,32 # [8] offset_data_ptr
|
||||
beqz a11,.LBB34_dspi_dotprod_off_s8_aes3 # [9]
|
||||
|
||||
l32i a2,a1,68 # [0] gra_spill_temp_1
|
||||
ee.vld.128.ip q0,a2,16 # [2] id:771
|
||||
st.qr q0,a1,48 # [3] q0
|
||||
|
||||
.Lt_0_24578: # 0xc6
|
||||
addi a14,a5,-32 # [0]
|
||||
beqz a14,.LBB43_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_26626: # 0xcc
|
||||
.Lt_0_26114: # 0xcc
|
||||
addi a8,a5,-16 # [0]
|
||||
beqz a8,.LBB50_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_28162: # 0xd2
|
||||
.Lt_0_27650: # 0xd2
|
||||
addi a9,a5,-64 # [0]
|
||||
beqz a9,.LBB57_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_29698: # 0xd8
|
||||
.Lt_0_29186: # 0xd8
|
||||
addi a10,a5,-128 # [0]
|
||||
beqz a10,.LBB64_dspi_dotprod_off_s8_aes3 # [1]
|
||||
|
||||
movi a11,128 # [0]
|
||||
bge a11,a5,.Lt_0_32514 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a15,16 # [1] id:833
|
||||
ee.ld.128.usar.ip q2,a15,16 # [2] id:834
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
|
||||
beqz.n a3,.Lt_0_32514 # [5]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
l32i a14,a1,64 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a5 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
.Lt_0_33282: # 0x108
|
||||
beqz.n a13,.Lt_0_33538 # [0]
|
||||
|
||||
loopnez a13,.LBB277_dspi_dotprod_off_s8_aes3 # [0]
|
||||
|
||||
.LBB275_dspi_dotprod_off_s8_aes3: # 0x10d
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
|
||||
ee.vmulas.s8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843
|
||||
|
||||
.LBB277_dspi_dotprod_off_s8_aes3: # 0x12d
|
||||
|
||||
.Lt_0_33538: # 0x12d
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
|
||||
movi.n a8,32 # [2]
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
|
||||
ee.vmulas.s8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
|
||||
movi.n a9,-16 # [5]
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
|
||||
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
|
||||
ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
|
||||
addi.n a12,a12,1 # [9]
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
|
||||
bne a12,a3,.Lt_0_33282 # [12]
|
||||
|
||||
.Lt_0_32514: # 0x159
|
||||
.Lt_0_32258: # 0x159
|
||||
movi.n a2,0 # [0]
|
||||
rur.accx_0 a10 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
movi.n a11,1 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:854
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_36098:
.Lt_0_20226:
	# Fallback taken when the image layout or the counts do not fit the vector
	# kernels. Forward all seven arguments to the C reference implementation.
	# The offset is reloaded from the incoming stack slot (a1+112), sign-extended
	# and written to the outgoing stack slot (a1+0); the non-offset dot product
	# must not be used here because it would drop offset from the result.
	mov.n	a10,a2				# in_image
	mov.n	a11,a3				# filter
	mov.n	a12,a4				# out_value
	mov.n	a13,a5				# count_x
	mov.n	a14,a6				# count_y
	mov.n	a15,a7				# shift
	l8ui	a8,a1,112			# incoming offset byte
	sext	a8,a8,7				# sign-extend int8_t
	s32i.n	a8,a1,0				# outgoing seventh argument
	.type	dspi_dotprod_off_s8_ansi, @function
	call8	dspi_dotprod_off_s8_ansi

	mov.n	a2,a10				# propagate the callee return value
	retw.n
|
||||
|
||||
.LBB27_dspi_dotprod_off_s8_aes3:
	# Reached when count_x is greater than 64. An even count_x returns to the
	# vector path; an odd one is handed to the C reference implementation.
	extui	a14,a5,0,1			# count_x & 1
	beqz	a14,.Lt_0_36610			# even -> back to the vector path

	# Forward all seven arguments. The offset is reloaded from the incoming stack
	# slot (a1+112), sign-extended and written to the outgoing stack slot (a1+0);
	# the non-offset dot product must not be used here because it would drop
	# offset from the result.
	mov.n	a10,a2				# in_image
	mov.n	a11,a3				# filter
	mov.n	a12,a4				# out_value
	mov.n	a13,a5				# count_x
	mov.n	a14,a6				# count_y
	mov.n	a15,a7				# shift
	l8ui	a8,a1,112			# incoming offset byte
	sext	a8,a8,7				# sign-extend int8_t
	s32i.n	a8,a1,0				# outgoing seventh argument
	.type	dspi_dotprod_off_s8_ansi, @function
	call8	dspi_dotprod_off_s8_ansi

	mov.n	a2,a10				# propagate the callee return value
	retw.n
|
||||
|
||||
.LBB34_dspi_dotprod_off_s8_aes3: # 0x1a1
|
||||
ee.ld.128.usar.ip q0,a15,16 # [0] id:760
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:761
|
||||
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
|
||||
beqz.n a6,.Lt_0_24578 # [4]
|
||||
|
||||
movi.n a10,32 # [0]
|
||||
l32i a12,a1,64 # [1] gra_spill_temp_0
|
||||
movi.n a11,-16 # [2]
|
||||
addi a12,a12,-32 # [3]
|
||||
loopgtz a6,.LBB163_dspi_dotprod_off_s8_aes3 # [4]
|
||||
|
||||
.LBB161_dspi_dotprod_off_s8_aes3: # 0x1b9
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
|
||||
ee.vmulas.s8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
|
||||
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769
|
||||
|
||||
.LBB163_dspi_dotprod_off_s8_aes3: # 0x1d4
|
||||
st.qr q1,a1,48 # [0] q0
|
||||
j .Lt_0_24578 # [1]
|
||||
|
||||
.LBB43_dspi_dotprod_off_s8_aes3: # 0x1da
|
||||
srli a3,a6,1 # [0]
|
||||
l32i a12,a1,64 # [1] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [2] id:772
|
||||
ee.ld.128.usar.ip q2,a15,16 # [3] id:773
|
||||
addi a12,a12,-16 # [5]
|
||||
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
|
||||
beqz.n a3,.Lt_0_26626 # [7]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopnez a3,.LBB186_dspi_dotprod_off_s8_aes3 # [3]
|
||||
|
||||
.LBB184_dspi_dotprod_off_s8_aes3: # 0x1f8
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
|
||||
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
|
||||
ee.vmulas.s8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784
|
||||
|
||||
.LBB186_dspi_dotprod_off_s8_aes3: # 0x21e
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_26626 # [1]
|
||||
|
||||
.LBB50_dspi_dotprod_off_s8_aes3: # 0x224
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a13,-16 # [1]
|
||||
l32i a11,a1,64 # [2] gra_spill_temp_0
|
||||
addi a15,a15,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
|
||||
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
|
||||
beqz.n a3,.Lt_0_28162 # [10]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,-16 # [1]
|
||||
loopnez a3,.LBB209_dspi_dotprod_off_s8_aes3 # [2]
|
||||
|
||||
.LBB207_dspi_dotprod_off_s8_aes3: # 0x248
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
|
||||
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800
|
||||
|
||||
.LBB209_dspi_dotprod_off_s8_aes3: # 0x274
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_28162 # [1]
|
||||
|
||||
.LBB57_dspi_dotprod_off_s8_aes3: # 0x27a
|
||||
ee.ld.128.usar.ip q1,a15,16 # [0] id:801
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:802
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
|
||||
beqz.n a3,.Lt_0_29698 # [4]
|
||||
|
||||
ld.qr q0,a1,48 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
l32i a12,a1,64 # [2] gra_spill_temp_0
|
||||
movi.n a11,-16 # [3]
|
||||
sub a12,a12,a5 # [4]
|
||||
addi a12,a12,16 # [5]
|
||||
loopnez a3,.LBB232_dspi_dotprod_off_s8_aes3 # [6]
|
||||
|
||||
.LBB230_dspi_dotprod_off_s8_aes3: # 0x298
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812
|
||||
|
||||
.LBB232_dspi_dotprod_off_s8_aes3: # 0x2bb
|
||||
st.qr q0,a1,48 # [0] q0
|
||||
j .Lt_0_29698 # [1]
|
||||
|
||||
.LBB64_dspi_dotprod_off_s8_aes3: # 0x2c1
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i a12,a1,64 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [3] id:813
|
||||
ee.ld.128.usar.ip q2,a15,16 # [4] id:814
|
||||
sub a12,a12,a5 # [6]
|
||||
addi a12,a12,16 # [7]
|
||||
ld.qr q0,a1,48 # [8] q0
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
|
||||
mov.n a8,a15 # [10]
|
||||
loopnez a3,.LBB254_dspi_dotprod_off_s8_aes3 # [11]
|
||||
|
||||
.LBB252_dspi_dotprod_off_s8_aes3: # 0x2df
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
|
||||
ee.vmulas.s8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
|
||||
ee.vmulas.s8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
|
||||
ee.vmulas.s8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
|
||||
ee.vmulas.s8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
|
||||
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
|
||||
ee.vmulas.s8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832
|
||||
|
||||
.LBB254_dspi_dotprod_off_s8_aes3: # 0x322
|
||||
movi.n a2,0 # [0]
|
||||
movi.n a11,1 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
rur.accx_0 a10 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:854
|
||||
retw.n # [10]
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Dot product of an int8 image window with an offset-shifted int8 filter (ANSI C reference).
 *
 * Computes sum(in[y][x] * (filter[y][x] + offset)) over count_x by count_y
 * elements, rounds to nearest, shifts the sum right by @p shift and stores the
 * result, truncated to int8_t, in @p out_value.
 *
 * @param in_image  input image descriptor (data, step_x, step_y, stride_x, stride_y)
 * @param filter    filter descriptor, same layout as @p in_image
 * @param out_value destination for the single int8 result
 * @param count_x   number of elements to process per row
 * @param count_y   number of rows to process
 * @param shift     right shift applied to the accumulated sum; values <= 0 leave the
 *                  sum unshifted and unrounded, values above 31 are not supported
 * @param offset    constant added to every filter element before multiplication
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride in x or y
 *        for either the image or the filter
 */
esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int8_t *i_data = (int8_t *)in_image->data;
    int8_t *f_data = (int8_t *)filter->data;
    // Distance, in elements, between the starts of two consecutive processed rows.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest and scale. The guard avoids the undefined `1 << -1` when
    // shift is 0, and the 64-bit intermediate keeps the rounding add from
    // overflowing int32_t.
    if (shift > 0) {
        int64_t rounded = (int64_t)acc + ((int64_t)1 << (shift - 1));
        acc = (int32_t)(rounded >> shift);
    }
    *out_value = (int8_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,102 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_s8_arp4
|
||||
.global dspi_dotprod_off_s8_ansi
|
||||
.type dspi_dotprod_off_s8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
//
// Accumulates in_image[y][x] * (filter[y][x] + offset) in XACC, 16 int8 lanes per
// inner iteration, then shifts the sum right by shift and stores ONE byte at
// out_value. Inputs that do not fit the vector path are tail-called into
// dspi_dotprod_off_s8_ansi with the argument registers untouched.
// NOTE(review): unlike the ANSI version, the vector path performs no stride range checks.
dspi_dotprod_off_s8_arp4:
// in_image  - a0
// filter    - a1
// out_value - a2
// count_x   - a3
// count_y   - a4
// shift     - a5
// offset    - a6

// i_data - t0 (start of the current image row)
// f_data - t1 (start of the current filter row)
// i_step - t2 (image row advance, stride_x * step_y)
// f_step - t3 (filter row advance, stride_x * step_y)
// t4 - current i_data
// t5 - current f_data
// t6 - number of 16-element blocks per row

    // Vector path is taken only when (in_image->step_x | filter->step_x) == 1
    // and count_x is a multiple of 16.
    lw      t1, 4(a0)               // load in_image->step_x
    lw      t2, 4(a1)               // load filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // should be 0 now
    andi    t2, a3, 15              // count_x % 16
    or      t1, t1, t2

    beqz    t1, .dspi_dotprod_off_s8_arp4_body
    j       dspi_dotprod_off_s8_ansi    // tail call, a0..a6 still hold the arguments

.dspi_dotprod_off_s8_arp4_body:
    add     sp, sp, -16

    // Spill offset to the stack so its low byte can be broadcast into q2.
    sw      a6, 0(sp)
    mv      t6, sp
    esp.vldbc.8.ip  q2, t6, 0       // q2 - offset replicated in every int8 lane

    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data

    lw      t2, 8(a0)               // step_y
    lw      t4, 12(a0)              // stride_x
    mul     t2, t4, t2              // i_step = stride_x * step_y (bytes, int8 elements)

    lw      t3, 8(a1)               // step_y
    lw      t5, 12(a1)              // stride_x
    mul     t3, t5, t3              // f_step = stride_x * step_y (bytes, int8 elements)

    srli    t6, a3, 4               // t6 = count_x / 16

    // Pre-load the accumulator with the rounding constant 1 << (shift - 1).
    addi    a7, a5, -1
    li      t4, 1
    sll     t4, t4, a7
    esp.zero.xacc
    esp.movx.w.xacc.l   t4

.loop_count_y:
    mv      t4, t0                  // t4 - running image pointer for this row
    mv      t5, t1                  // t5 - running filter pointer for this row
    esp.vld.128.ip  q1, t5, 16      // q1 - first 16 filter values of the row

    // Hardware loop 0: t6 iterations, body ends at .loop_count_x (inclusive).
    esp.lp.setup    0, t6, .loop_count_x
    esp.vld.128.ip  q0, t4, 16      // q0 - 16 image values
    esp.vadd.s8     q3, q2, q1      // q3 - filter + offset (NOTE(review): confirm int8 overflow behaviour of this add)
.loop_count_x: esp.vmulas.s8.xacc.ld.ip q1, t5, 16, q0, q3 // xacc += q0 * q3, q1 - next 16 filter values

    add     t0, t0, t2              // next image row
    add     t1, t1, t3              // next filter row
    add     a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.s.xacc  t5, a5          // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sb      t5, 0(a2)               // out_value is int8_t*: store exactly one byte (a halfword store would write past the output)

    li      a0, 0                   // return 0
    add     sp, sp, 16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,417 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_61, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_u16_aes3
|
||||
.type dspi_dotprod_off_u16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u16_aes3
|
||||
dspi_dotprod_off_u16_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_off_u16_aes3: # 0x4
|
||||
# Entry, argument validation and dispatch.
# Register arguments (windowed ABI): a2 = in_image, a3 = filter, a4 = out_value,
# a5 = count_x, a6 = count_y, a7 = shift. The seventh argument (uint16_t offset)
# is on the stack and is read at a1+144 after the entry instruction.
entry a1,144 #
|
||||
# Range checks: same four conditions as dspi_dotprod_off_u16_ansi.
# Any failure returns the literal .LC0_1_61.
l32i.n a10,a2,4 # [0] id:760 in_image->step_x
|
||||
l32i.n a12,a2,12 # [1] id:759 in_image->stride_x
|
||||
mull a8,a10,a5 # [2] step_x * count_x
|
||||
blt a12,a8,.LBB89_dspi_dotprod_off_u16_aes3 # [4] stride_x too small -> error
|
||||
|
||||
l32i.n a13,a2,8 # [0] id:761 in_image->step_y
|
||||
l32i.n a9,a2,16 # [1] id:762 in_image field at +16 (presumably stride_y)
|
||||
mull a11,a13,a6 # [2] step_y * count_y
|
||||
blt a9,a11,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
|
||||
|
||||
l32i.n a15,a3,4 # [0] id:764 filter->step_x
|
||||
l32i.n a14,a3,12 # [1] id:763 filter->stride_x
|
||||
mull a11,a15,a5 # [2] step_x * count_x
|
||||
blt a14,a11,.LBB89_dspi_dotprod_off_u16_aes3 # [4]
|
||||
|
||||
l32i.n a8,a3,16 # [0] id:766 filter field at +16 (presumably stride_y)
|
||||
l32i.n a9,a3,8 # [1] id:765 filter->step_y
|
||||
s32i a9,a1,104 # [2] gra_spill_temp_2 (filter->step_y)
|
||||
mull a9,a9,a6 # [3] step_y * count_y
|
||||
blt a8,a9,.LBB89_dspi_dotprod_off_u16_aes3 # [5]
|
||||
|
||||
# Vector path preconditions on the filter: bit 0 of the data address clear,
# stride_x == step_x * count_x, step_x == 1 and step_y == 1.
# Anything else goes to the C reference implementation below.
l32i.n a8,a3,0 # [0] id:767 filter->data
|
||||
s32i a8,a1,100 # [1] gra_spill_temp_1 (filter->data)
|
||||
bbsi a8,0,.Lt_0_36354 # [2] odd filter address -> fallback
|
||||
|
||||
bne a14,a11,.Lt_0_36354 # [0] filter stride_x != step_x * count_x -> fallback
|
||||
|
||||
bnei a15,1,.Lt_0_36354 # [0] filter step_x != 1 -> fallback
|
||||
|
||||
l32i a9,a1,104 # [0] gra_spill_temp_2 (reload filter->step_y)
|
||||
beqi a9,1,.Lt_0_19458 # [2] filter step_y == 1 -> continue with the image checks
|
||||
|
||||
.Lt_0_36354: # 0x46
|
||||
.Lt_0_19714: # 0x46
|
||||
# Fallback: forward all seven arguments to dspi_dotprod_off_u16_ansi.
# call8 maps a10..a15 of this frame onto a2..a7 of the callee; the offset is
# copied from the incoming stack slot (a1+144) to the outgoing one (a1+0).
mov.n a10,a2 # [0] in_image
|
||||
mov.n a11,a3 # [1] filter
|
||||
mov.n a12,a4 # [2] out_value
|
||||
mov.n a13,a5 # [3] count_x
|
||||
mov.n a14,a6 # [4] count_y
|
||||
mov.n a15,a7 # [5] shift
|
||||
l16ui a8,a1,144 # [6] id:768 offset+0x0
|
||||
s32i.n a8,a1,0 # [7] id:876 outgoing seventh argument
|
||||
.type dspi_dotprod_off_u16_ansi, @function
|
||||
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
|
||||
|
||||
mov.n a2,a10 # [0] propagate the callee return value
|
||||
retw.n # [1]
|
||||
|
||||
.LBB89_dspi_dotprod_off_u16_aes3: # 0x5e
|
||||
# Range check failed: return the constant from the literal pool (458755,
# presumably the numeric value of ESP_ERR_DSP_PARAM_OUTOFRANGE - confirm).
l32r a2,.LC0_1_61 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.Lt_0_19458: # 0x63
|
||||
addi.n a9,a10,-1 # [0]
|
||||
bnez a9,.Lt_0_37122 # [1]
|
||||
|
||||
addi.n a10,a13,-1 # [0]
|
||||
bnez a10,.Lt_0_37122 # [1]
|
||||
|
||||
extui a11,a5,0,3 # [0]
|
||||
bnez.n a11,.Lt_0_37122 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_37122 # [0]
|
||||
|
||||
movi.n a14,32 # [0]
|
||||
blt a14,a5,.LBB27_dspi_dotprod_off_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_37634: # 0x7a
|
||||
.Lt_0_21506: # 0x7a
|
||||
l16ui a9,a1,144 # [0] id:768 offset+0x0
|
||||
addi a8,a1,16 # [1] temp_offset
|
||||
l32i.n a15,a2,0 # [2] id:769
|
||||
mull a10,a12,a13 # [3]
|
||||
l32i a2,a1,100 # [4] gra_spill_temp_1
|
||||
slli a10,a10,1 # [5]
|
||||
s32i a10,a1,96 # [6] gra_spill_temp_0
|
||||
movi.n a10,2 # [7]
|
||||
# loop-count fixed at 2
|
||||
loop a10,.LBB143_dspi_dotprod_off_u16_aes3 # [8]
|
||||
|
||||
.LBB138_dspi_dotprod_off_u16_aes3: # 0x93
|
||||
s16i a9,a8,0 # [0*II+0] id:770 temp_offset+0x0
|
||||
s16i a9,a8,2 # [0*II+1] id:770 temp_offset+0x0
|
||||
s16i a9,a8,4 # [0*II+2] id:770 temp_offset+0x0
|
||||
s16i a9,a8,6 # [0*II+3] id:770 temp_offset+0x0
|
||||
s16i a9,a8,8 # [0*II+4] id:770 temp_offset+0x0
|
||||
s16i a9,a8,10 # [0*II+5] id:770 temp_offset+0x0
|
||||
s16i a9,a8,12 # [0*II+6] id:770 temp_offset+0x0
|
||||
s16i a9,a8,14 # [0*II+7] id:770 temp_offset+0x0
|
||||
addi a8,a8,16 # [0*II+8]
|
||||
|
||||
.LBB143_dspi_dotprod_off_u16_aes3: # 0xae
|
||||
mov.n a3,a6 # [0]
|
||||
addi a11,a5,-24 # [1]
|
||||
addi a12,a1,24 # [3] temp_offset+8
|
||||
movi.n a13,0 # [4]
|
||||
wur.sar_byte a13 # [5]
|
||||
wur.accx_0 a13 # [6]
|
||||
wur.accx_1 a13 # [7]
|
||||
ee.vld.128.ip q6,a12,0 # [8] id:771
|
||||
s32i.n a12,a1,48 # [9] offset_data_ptr
|
||||
beqz a11,.LBB34_dspi_dotprod_off_u16_aes3 # [10]
|
||||
|
||||
l32i a2,a1,100 # [0] gra_spill_temp_1
|
||||
ee.vld.128.ip q0,a2,16 # [2] id:787
|
||||
st.qr q0,a1,64 # [3] q0
|
||||
|
||||
.Lt_0_25090: # 0xd1
|
||||
addi a14,a5,-16 # [0]
|
||||
beqz a14,.LBB43_dspi_dotprod_off_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_27138: # 0xd7
|
||||
.Lt_0_26626: # 0xd7
|
||||
addi a8,a5,-8 # [0]
|
||||
beqz a8,.LBB50_dspi_dotprod_off_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_28674: # 0xdd
|
||||
.Lt_0_28162: # 0xdd
|
||||
addi a9,a5,-32 # [0]
|
||||
beqz a9,.LBB57_dspi_dotprod_off_u16_aes3 # [1]
|
||||
|
||||
.Lt_0_30210: # 0xe3
|
||||
.Lt_0_29698: # 0xe3
|
||||
addi a10,a5,-64 # [0]
|
||||
beqz a10,.LBB64_dspi_dotprod_off_u16_aes3 # [1]
|
||||
|
||||
movi.n a11,64 # [0]
|
||||
bge a11,a5,.Lt_0_33026 # [1]
|
||||
|
||||
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a15,16 # [1] id:849
|
||||
ee.ld.128.usar.ip q2,a15,16 # [2] id:850
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:851
|
||||
beqz.n a3,.Lt_0_33026 # [5]
|
||||
|
||||
ld.qr q0,a1,64 # [0] q0
|
||||
slli a8,a5,1 # [1]
|
||||
l32i a14,a1,96 # [2] gra_spill_temp_0
|
||||
addi a13,a5,31 # [3]
|
||||
movgez a13,a5,a5 # [4]
|
||||
srai a13,a13,5 # [5]
|
||||
sub a14,a14,a8 # [6]
|
||||
addi a14,a14,16 # [7]
|
||||
addi.n a13,a13,-1 # [8]
|
||||
|
||||
.Lt_0_33794: # 0x115
|
||||
beqz.n a13,.Lt_0_34050 # [0]
|
||||
|
||||
loopnez a13,.LBB280_dspi_dotprod_off_u16_aes3 # [0]
|
||||
|
||||
.LBB278_dspi_dotprod_off_u16_aes3: # 0x11a
|
||||
ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:852
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:853
|
||||
ee.vmulas.u16.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:854
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:855
|
||||
ee.vmulas.u16.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:856
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:857
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:858
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:859
|
||||
|
||||
.LBB280_dspi_dotprod_off_u16_aes3: # 0x13a
|
||||
|
||||
.Lt_0_34050: # 0x13a
|
||||
ee.vmulas.u16.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:860
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [1] id:861
|
||||
movi.n a9,32 # [2]
|
||||
ee.vmulas.u16.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:862
|
||||
ee.vmulas.u16.accx.ld.ip q7,a2,16,q2,q6 # [4] id:863
|
||||
movi.n a10,-16 # [5]
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a10,q7,q3,q4,q0 # [6] id:864
|
||||
ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6 # [7] id:866
|
||||
ee.ld.128.usar.xp q1,a15,a9 # [8] id:865
|
||||
addi.n a12,a12,1 # [9]
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:867
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6 # [11] id:868
|
||||
bne a12,a3,.Lt_0_33794 # [12]
|
||||
|
||||
.Lt_0_33026: # 0x166
|
||||
.Lt_0_32770: # 0x166
|
||||
rur.accx_0 a9 # [0]
|
||||
rur.accx_1 a10 # [1]
|
||||
blti a7,1,.Lt_0_35586 # [2]
|
||||
|
||||
movi.n a2,0 # [0]
|
||||
addi a13,a7,-33 # [1]
|
||||
addi.n a14,a7,-1 # [2]
|
||||
ssr a14 # [3]
|
||||
sra a12,a10 # [4]
|
||||
src a11,a10,a9 # [5]
|
||||
movgez a11,a12,a13 # [6]
|
||||
addi.n a11,a11,1 # [7]
|
||||
srli a11,a11,1 # [8]
|
||||
s16i a11,a4,0 # [9] id:874
|
||||
retw.n # [10]
|
||||
|
||||
.Lt_0_37122: # 0x18c
|
||||
.Lt_0_20738: # 0x18c
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
l16ui a8,a1,144 # [6] id:768 offset+0x0
|
||||
s32i.n a8,a1,0 # [7] id:877
|
||||
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB27_dspi_dotprod_off_u16_aes3: # 0x1a4
|
||||
extui a9,a5,0,1 # [0]
|
||||
beqz a9,.Lt_0_37634 # [1]
|
||||
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
l16ui a8,a1,144 # [6] id:768 offset+0x0
|
||||
s32i.n a8,a1,0 # [7] id:878
|
||||
call8 dspi_dotprod_off_u16_ansi # [8] dspi_dotprod_off_u16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
.LBB34_dspi_dotprod_off_u16_aes3: # 0x1c2
|
||||
ee.ld.128.usar.ip q0,a15,16 # [0] id:776
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:777
|
||||
ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:778
|
||||
beqz.n a6,.Lt_0_25090 # [4]
|
||||
|
||||
movi.n a10,32 # [0]
|
||||
l32i a12,a1,96 # [1] gra_spill_temp_0
|
||||
movi.n a11,-16 # [2]
|
||||
addi a12,a12,-32 # [3]
|
||||
loopgtz a6,.LBB166_dspi_dotprod_off_u16_aes3 # [4]
|
||||
|
||||
.LBB164_dspi_dotprod_off_u16_aes3: # 0x1da
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:779
|
||||
ee.vmulas.u16.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:780
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:781
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:782
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:784
|
||||
ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:783
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:785
|
||||
|
||||
.LBB166_dspi_dotprod_off_u16_aes3: # 0x1f5
|
||||
st.qr q1,a1,64 # [0] q0
|
||||
j .Lt_0_25090 # [1]
|
||||
|
||||
.LBB43_dspi_dotprod_off_u16_aes3: # 0x1fb
|
||||
srli a3,a6,1 # [0]
|
||||
l32i a12,a1,96 # [1] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [2] id:788
|
||||
ee.ld.128.usar.ip q2,a15,16 # [3] id:789
|
||||
addi a12,a12,-16 # [5]
|
||||
ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:790
|
||||
beqz.n a3,.Lt_0_27138 # [7]
|
||||
|
||||
ld.qr q0,a1,64 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopnez a3,.LBB189_dspi_dotprod_off_u16_aes3 # [3]
|
||||
|
||||
.LBB187_dspi_dotprod_off_u16_aes3: # 0x219
|
||||
ee.vmulas.u16.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:791
|
||||
ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:792
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:793
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:794
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:795
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:796
|
||||
ee.vmulas.u16.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:797
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:798
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:799
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:800
|
||||
|
||||
.LBB189_dspi_dotprod_off_u16_aes3: # 0x23f
|
||||
st.qr q0,a1,64 # [0] q0
|
||||
j .Lt_0_27138 # [1]
|
||||
|
||||
.LBB50_dspi_dotprod_off_u16_aes3: # 0x245
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a13,-16 # [1]
|
||||
l32i a11,a1,96 # [2] gra_spill_temp_0
|
||||
addi a15,a15,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a15,a13 # [5] id:801
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [6] id:802
|
||||
ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:803
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [9] id:804
|
||||
beqz.n a3,.Lt_0_28674 # [10]
|
||||
|
||||
ld.qr q0,a1,64 # [0] q0
|
||||
movi.n a10,-16 # [1]
|
||||
loopnez a3,.LBB212_dspi_dotprod_off_u16_aes3 # [2]
|
||||
|
||||
.LBB210_dspi_dotprod_off_u16_aes3: # 0x269
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:805
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:806
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:807
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:808
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:809
|
||||
ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:810
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:811
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:812
|
||||
ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:813
|
||||
ee.vmulas.u16.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:814
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:815
|
||||
ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:816
|
||||
|
||||
.LBB212_dspi_dotprod_off_u16_aes3: # 0x295
|
||||
st.qr q0,a1,64 # [0] q0
|
||||
j .Lt_0_28674 # [1]
|
||||
|
||||
.LBB57_dspi_dotprod_off_u16_aes3: # 0x29b
|
||||
ee.ld.128.usar.ip q1,a15,16 # [0] id:817
|
||||
ee.ld.128.usar.ip q2,a15,16 # [1] id:818
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:819
|
||||
beqz.n a3,.Lt_0_30210 # [4]
|
||||
|
||||
ld.qr q0,a1,64 # [0] q0
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
l32i a12,a1,96 # [3] gra_spill_temp_0
|
||||
slli a13,a5,1 # [4]
|
||||
sub a12,a12,a13 # [5]
|
||||
addi a12,a12,16 # [6]
|
||||
loopnez a3,.LBB235_dspi_dotprod_off_u16_aes3 # [7]
|
||||
|
||||
.LBB233_dspi_dotprod_off_u16_aes3: # 0x2bc
|
||||
ee.vmulas.u16.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:820
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:821
|
||||
ee.vmulas.u16.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:822
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:823
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:824
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:826
|
||||
ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:825
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:827
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:828
|
||||
|
||||
.LBB235_dspi_dotprod_off_u16_aes3: # 0x2df
|
||||
st.qr q0,a1,64 # [0] q0
|
||||
j .Lt_0_30210 # [1]
|
||||
|
||||
.LBB64_dspi_dotprod_off_u16_aes3: # 0x2e5
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i a12,a1,96 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a15,16 # [4] id:829
|
||||
ee.ld.128.usar.ip q2,a15,16 # [5] id:830
|
||||
sub a12,a12,a13 # [7]
|
||||
addi a12,a12,16 # [8]
|
||||
ld.qr q0,a1,64 # [9] q0
|
||||
ee.src.q.ld.ip q3,a15,16,q1,q2 # [10] id:831
|
||||
mov.n a8,a15 # [11]
|
||||
loopnez a3,.LBB257_dspi_dotprod_off_u16_aes3 # [12]
|
||||
|
||||
.LBB255_dspi_dotprod_off_u16_aes3: # 0x306
|
||||
ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:832
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:833
|
||||
ee.vmulas.u16.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:834
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:835
|
||||
ee.vmulas.u16.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:836
|
||||
ee.vmulas.u16.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:837
|
||||
ee.vmulas.u16.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:838
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:839
|
||||
ee.vmulas.u16.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:840
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:841
|
||||
ee.vmulas.u16.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:842
|
||||
ee.vmulas.u16.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:843
|
||||
ee.vmulas.u16.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:844
|
||||
ee.vmulas.u16.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:846
|
||||
ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:845
|
||||
ee.vmulas.u16.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:847
|
||||
ee.vmulas.u16.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:848
|
||||
|
||||
.LBB257_dspi_dotprod_off_u16_aes3: # 0x349
|
||||
j .Lt_0_33026 # [0]
|
||||
|
||||
.Lt_0_35586: # 0x34c
|
||||
movi.n a2,0 # [0]
|
||||
sext a14,a9,15 # [1]
|
||||
s16i a14,a4,0 # [2] id:875
|
||||
retw.n # [3]
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Reference (ANSI C) 2-D dot product with offset, unsigned 16-bit data
 *
 * Sums in_image[y][x] * (filter[y][x] + offset) over a count_x by count_y
 * window, adds a round-to-nearest term, shifts the sum right by `shift` and
 * writes the low 16 bits of the result to *out_value.
 *
 * @param in_image   input image; data, step_x, step_y, stride_x, stride_y are read
 * @param filter     filter image; same fields are read
 * @param out_value  receives the shifted sum, truncated to uint16_t
 * @param count_x    number of elements processed per row
 * @param count_y    number of rows processed
 * @param shift      right shift applied to the accumulated sum; must be
 *                   non-negative and below 64 for the shift to be defined
 * @param offset     value added to every filter element before multiplying
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of
 *        either image in either dimension
 */
esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint16_t *i_data = (uint16_t *)in_image->data;
    uint16_t *f_data = (uint16_t *)filter->data;
    // Row-to-row advance, expressed in uint16_t elements.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            // filter + offset is at most 131070 and fits in int32_t, but the
            // product with a 16-bit sample can exceed INT32_MAX, so the
            // multiplication is carried out in 64 bits.
            int64_t sample = i_data[in_image->step_x * x];
            int32_t weight = (int32_t)f_data[filter->step_x * x] + (int32_t)offset;
            acc += sample * weight;
        }
        i_data += i_step;
        f_data += f_step;
    }
    // Round to nearest. With shift == 0 there is nothing to round, and
    // 1 << (shift - 1) would be a shift by a negative count (undefined).
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
    }
    acc >>= shift;
    *out_value = (uint16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u16_arp4
|
||||
.global dspi_dotprod_off_u16_ansi
|
||||
.type dspi_dotprod_off_u16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
//
// 2-D dot product with offset for unsigned 16-bit data, using the esp.* vector
// instructions. Takes the vector path only when (in_image->step_x | filter->step_x) == 1
// and count_x is a multiple of 8; otherwise it tail-jumps to dspi_dotprod_off_u16_ansi
// with the argument registers untouched. Unlike the ANSI version, the vector path
// performs no step/stride range checks and always returns 0.
dspi_dotprod_off_u16_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6

// i_data - t0   (start of the current input row)
// f_data - t1   (start of the current filter row)
// i_step - t2   (row advance for i_data, in bytes)
// f_step - t3   (row advance for f_data, in bytes)
// t4 - current i_data
// t5 - current f_data
// t6 - number of 128-bit vectors per row (count_x / 8)

    lw t1, 4(a0) // load in_image->step_x
    lw t2, 4(a1) // load filter->step_x
    or t1, t1, t2
    addi t1, t1, -1 // zero only when (in_image->step_x | filter->step_x) == 1
    andi t2, a3, 7 // count_x % 8
    or t1, t1, t2 // non-zero if either condition for the vector path fails

    beqz t1, .dspi_dotprod_off_u16_arp4_body
    j dspi_dotprod_off_u16_ansi // tail call: a0..a6 still hold the original arguments

.dspi_dotprod_off_u16_arp4_body:
    add sp, sp, -16 // scratch slot so the offset can be loaded from memory

    sw a6, 0(sp) // spill offset
    mv t6, sp
    esp.vldbc.16.ip q2, t6, 0 // presumably broadcasts the 16-bit offset into every lane of q2 - confirm against the PIE documentation

    lw t0, 0(a0) // i_data
    lw t1, 0(a1) // f_data


    lw t2, 8(a0) // step_y
    lw t4, 12(a0) // stride_x
    mul t2, t4, t2
    slli t2, t2, 1 // i_step in bytes (2 bytes per element)

    lw t3, 8(a1) // step_y
    lw t5, 12(a1) // stride_x
    mul t3, t5, t3
    slli t3, t3, 1 // f_step in bytes (2 bytes per element)

    srli t6, a3, 3 // t6 = count_x / 8 = inner loop count


    addi a7, a5, -1 // shift - 1
    li t4, 1
    sll t4, t4, a7 // rounding term 1 << (shift - 1); NOTE(review): for shift == 0 the shift count is -1 - confirm callers never pass 0
    esp.zero.xacc
    esp.movx.w.xacc.l t4 // seed the low word of the accumulator with the rounding term

.loop_count_y:
    mv t4, t0
    mv t5, t1
    esp.vld.128.ip q1, t5, 16 // q1 - first f_data vector of the row

    esp.lp.setup 0, t6, .loop_count_x // NOTE(review): count_x == 0 reaches here with t6 == 0 - confirm hardware-loop behaviour
    esp.vld.128.ip q0, t4, 16 // q0 - i_data
    esp.vadd.u16 q3, q2, q1 // q3 = offset + f_data
.loop_count_x: esp.vmulas.u16.xacc.ld.ip q1, t5, 16, q0, q3 // accumulate q0 * q3, then load the next f_data vector into q1

    add t0, t0, t2 // next input row
    add t1, t1, t3 // next filter row
    add a4,a4, -1
    bgtz a4, .loop_count_y // the row body runs before count_y is tested

    esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sh t5, 0(a2) // store the low 16 bits of the result to *out_value

    li a0,0 // return 0
    add sp,sp,16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,407 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_57, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_off_u8_aes3
|
||||
.type dspi_dotprod_off_u8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u8_aes3
|
||||
// esp_err_t dspi_dotprod_off_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value,
//                                    int count_x, int count_y, int shift, uint8_t offset);
//
// Compiler-generated Xtensa code (windowed ABI) using the ee.* vector instructions.
//   a2 = in_image, a3 = filter, a4 = out_value, a5 = count_x, a6 = count_y, a7 = shift,
//   offset = 7th argument, read from the caller's stack at a1+112 after `entry a1,112`.
//
// Flow:
//   1. step * count > stride for either image in either dimension
//        -> return the literal .LC0_1_57 (458755).
//   2. Fall back to the C reference unless: filter data address is even,
//      filter stride_x == filter step_x * count_x, all four step values are 1,
//      count_x is a multiple of 16 and count_y >= 4.
//   3. Otherwise replicate `offset` into a 32-byte stack buffer, load it into q6,
//      clear ACCX and run the loop specialised for count_x == 48/32/16/64/128/>128.
//   4. Result = (accx_0 + (1 << (shift - 1))) >> shift, stored as one byte.
//
// Fix relative to the generated listing: the fallbacks used to call the 6-argument
// dspi_dotprod_u8_ansi, which ignores `offset`. They now forward `offset` as the
// 7th (stack) argument and call dspi_dotprod_off_u8_ansi. The "# 0x.." byte offsets
// in the trailing comments come from the original listing and are not updated.
dspi_dotprod_off_u8_aes3: 	# 0x4

.LBB1_dspi_dotprod_off_u8_aes3:	# 0x4
    entry a1,112 #
    l32i.n a10,a2,4 # [0] id:745
    l32i.n a12,a2,12 # [1] id:744
    mull a8,a10,a5 # [2]
    blt a12,a8,.LBB86_dspi_dotprod_off_u8_aes3 # [4]

    l32i.n a13,a2,8 # [0] id:746
    l32i.n a9,a2,16 # [1] id:747
    mull a11,a13,a6 # [2]
    blt a9,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]

    l32i.n a15,a3,4 # [0] id:749
    l32i.n a14,a3,12 # [1] id:748
    mull a11,a15,a5 # [2]
    blt a14,a11,.LBB86_dspi_dotprod_off_u8_aes3 # [4]

    l32i.n a8,a3,16 # [0] id:751
    l32i.n a9,a3,8 # [1] id:750
    s32i a9,a1,72 # [2] gra_spill_temp_2
    mull a9,a9,a6 # [3]
    blt a8,a9,.LBB86_dspi_dotprod_off_u8_aes3 # [5]

    l32i.n a8,a3,0 # [0] id:752
    s32i a8,a1,68 # [1] gra_spill_temp_1
    bbsi a8,0,.Lt_0_35330 # [2]

    bne a14,a11,.Lt_0_35330 # [0]

    bnei a15,1,.Lt_0_35330 # [0]

    l32i a11,a1,72 # [0] gra_spill_temp_2
    beqi a11,1,.Lt_0_18946 # [2]

.Lt_0_35330:	# 0x46
.Lt_0_19202:	# 0x46
    mov.n a10,a2 # [0]
    mov.n a11,a3 # [1]
    mov.n a12,a4 # [2]
    mov.n a13,a5 # [3]
    mov.n a14,a6 # [4]
    mov.n a15,a7 # [5]
    l8ui a8,a1,112 # incoming offset (7th argument)
    s32i a8,a1,0 # outgoing 7th argument lives at sp+0; the temp buffer there is not in use on this path
    .type dspi_dotprod_off_u8_ansi, @function
    call8 dspi_dotprod_off_u8_ansi # [6] dspi_dotprod_off_u8_ansi

    mov.n a2,a10 # [0]
    retw.n # [1]

.LBB86_dspi_dotprod_off_u8_aes3:	# 0x59
    l32r a2,.LC0_1_57 # [0]
    retw.n # [1]

.Lt_0_18946:	# 0x5e
    addi.n a14,a10,-1 # [0]
    bnez a14,.Lt_0_36098 # [1]

    addi.n a15,a13,-1 # [0]
    bnez a15,.Lt_0_36098 # [1]

    extui a8,a5,0,4 # [0]
    bnez.n a8,.Lt_0_36098 # [1]

    blti a6,4,.Lt_0_36098 # [0]

    movi.n a9,64 # [0]
    blt a9,a5,.LBB27_dspi_dotprod_off_u8_aes3 # [1]

.Lt_0_36610:	# 0x75
.Lt_0_20994:	# 0x75
    l8ui a9,a1,112 # [0] id:754 offset+0x0
    mov.n a8,a1 # [1]
    l32i.n a15,a2,0 # [2] id:753
    mull a10,a12,a13 # [3]
    l32i a2,a1,68 # [4] gra_spill_temp_1
    s32i a10,a1,64 # [5] gra_spill_temp_0
    movi.n a10,4 # [6]
    # loop-count fixed at 4
    loop a10,.LBB140_dspi_dotprod_off_u8_aes3 # [7]

.LBB135_dspi_dotprod_off_u8_aes3:	# 0x8a
    s8i a9,a8,0 # [0*II+0] id:755 temp_offset+0x0
    s8i a9,a8,1 # [0*II+1] id:755 temp_offset+0x0
    s8i a9,a8,2 # [0*II+2] id:755 temp_offset+0x0
    s8i a9,a8,3 # [0*II+3] id:755 temp_offset+0x0
    s8i a9,a8,4 # [0*II+4] id:755 temp_offset+0x0
    s8i a9,a8,5 # [0*II+5] id:755 temp_offset+0x0
    s8i a9,a8,6 # [0*II+6] id:755 temp_offset+0x0
    s8i a9,a8,7 # [0*II+7] id:755 temp_offset+0x0
    addi.n a8,a8,8 # [0*II+8]

.LBB140_dspi_dotprod_off_u8_aes3:	# 0xa4
    mov.n a3,a6 # [0]
    addi a11,a5,-48 # [1]
    addi.n a12,a1,8 # [3] temp_offset+8
    movi.n a13,0 # [4]
    wur.accx_0 a13 # [5]
    wur.accx_1 a13 # [6]
    ee.vld.128.ip q6,a12,0 # [7] id:756
    s32i.n a12,a1,32 # [8] offset_data_ptr
    beqz a11,.LBB34_dspi_dotprod_off_u8_aes3 # [9]

    l32i a2,a1,68 # [0] gra_spill_temp_1
    ee.vld.128.ip q0,a2,16 # [2] id:771
    st.qr q0,a1,48 # [3] q0

.Lt_0_24578:	# 0xc3
    addi a14,a5,-32 # [0]
    beqz a14,.LBB43_dspi_dotprod_off_u8_aes3 # [1]

.Lt_0_26626:	# 0xc9
.Lt_0_26114:	# 0xc9
    addi a8,a5,-16 # [0]
    beqz a8,.LBB50_dspi_dotprod_off_u8_aes3 # [1]

.Lt_0_28162:	# 0xcf
.Lt_0_27650:	# 0xcf
    addi a9,a5,-64 # [0]
    beqz a9,.LBB57_dspi_dotprod_off_u8_aes3 # [1]

.Lt_0_29698:	# 0xd5
.Lt_0_29186:	# 0xd5
    addi a10,a5,-128 # [0]
    beqz a10,.LBB64_dspi_dotprod_off_u8_aes3 # [1]

    movi a11,128 # [0]
    bge a11,a5,.Lt_0_32514 # [1]

    movi.n a12,0 # [0]
    ee.ld.128.usar.ip q1,a15,16 # [1] id:833
    ee.ld.128.usar.ip q2,a15,16 # [2] id:834
    ee.src.q.ld.ip q3,a15,16,q1,q2 # [4] id:835
    beqz.n a3,.Lt_0_32514 # [5]

    ld.qr q0,a1,48 # [0] q0
    l32i a14,a1,64 # [1] gra_spill_temp_0
    addi a13,a5,31 # [2]
    movgez a13,a5,a5 # [3]
    srai a13,a13,5 # [4]
    sub a14,a14,a5 # [5]
    addi a14,a14,16 # [6]
    addi.n a13,a13,-1 # [7]

.Lt_0_33282:	# 0x105
    beqz.n a13,.Lt_0_33538 # [0]

    loopnez a13,.LBB277_dspi_dotprod_off_u8_aes3 # [0]

.LBB275_dspi_dotprod_off_u8_aes3:	# 0x10a
    ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:836
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+1] id:837
    ee.vmulas.u8.accx.ld.ip.qup q1,a15,16,q1,q2,q3,q0 # [0*II+3] id:838
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+4] id:839
    ee.vmulas.u8.accx.ld.ip.qup q2,a15,16,q4,q3,q0,q1 # [0*II+6] id:840
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:841
    ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+9] id:842
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:843

.LBB277_dspi_dotprod_off_u8_aes3:	# 0x12a

.Lt_0_33538:	# 0x12a
    ee.vmulas.u8.accx.ld.ip.qup q4,a15,16,q0,q1,q2,q3 # [0] id:844
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [1] id:845
    movi.n a8,32 # [2]
    ee.vmulas.u8.accx.ld.xp.qup q0,a15,a14,q1,q2,q3,q4 # [3] id:846
    ee.vmulas.u8.accx.ld.ip q7,a2,16,q2,q6 # [4] id:847
    movi.n a9,-16 # [5]
    ee.vmulas.u8.accx.ld.xp.qup q2,a15,a9,q7,q3,q4,q0 # [6] id:848
    ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [7] id:850
    ee.ld.128.usar.xp q1,a15,a8 # [8] id:849
    addi.n a12,a12,1 # [9]
    ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q5,q4,q1,q2 # [10] id:851
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [11] id:852
    bne a12,a3,.Lt_0_33282 # [12]

.Lt_0_32514:	# 0x156
.Lt_0_32258:	# 0x156
    movi.n a2,0 # [0]
    rur.accx_0 a10 # [1]
    addi.n a12,a7,-1 # [2]
    movi.n a11,1 # [3]
    ssl a12 # [4]
    sll a11,a11 # [5]
    ssr a7 # [6]
    add.n a10,a10,a11 # [7]
    sra a10,a10 # [8]
    s8i a10,a4,0 # [9] id:854
    retw.n # [10]

.Lt_0_36098:	# 0x172
.Lt_0_20226:	# 0x172
    mov.n a10,a2 # [0]
    mov.n a11,a3 # [1]
    mov.n a12,a4 # [2]
    mov.n a13,a5 # [3]
    mov.n a14,a6 # [4]
    mov.n a15,a7 # [5]
    l8ui a8,a1,112 # incoming offset (7th argument)
    s32i a8,a1,0 # outgoing 7th argument at sp+0
    call8 dspi_dotprod_off_u8_ansi # [6] dspi_dotprod_off_u8_ansi

    mov.n a2,a10 # [0]
    retw.n # [1]

.LBB27_dspi_dotprod_off_u8_aes3:	# 0x185
    extui a14,a5,0,1 # [0]
    beqz a14,.Lt_0_36610 # [1]

    mov.n a10,a2 # [0]
    mov.n a11,a3 # [1]
    mov.n a12,a4 # [2]
    mov.n a13,a5 # [3]
    mov.n a14,a6 # [4]
    mov.n a15,a7 # [5]
    l8ui a8,a1,112 # incoming offset (7th argument)
    s32i a8,a1,0 # outgoing 7th argument at sp+0
    call8 dspi_dotprod_off_u8_ansi # [6] dspi_dotprod_off_u8_ansi

    mov.n a2,a10 # [0]
    retw.n # [1]

.LBB34_dspi_dotprod_off_u8_aes3:	# 0x19e
    ee.ld.128.usar.ip q0,a15,16 # [0] id:760
    ee.ld.128.usar.ip q2,a15,16 # [1] id:761
    ee.src.q.ld.ip q3,a15,16,q0,q2 # [3] id:762
    beqz.n a6,.Lt_0_24578 # [4]

    movi.n a10,32 # [0]
    l32i a12,a1,64 # [1] gra_spill_temp_0
    movi.n a11,-16 # [2]
    addi a12,a12,-32 # [3]
    loopgtz a6,.LBB163_dspi_dotprod_off_u8_aes3 # [4]

.LBB161_dspi_dotprod_off_u8_aes3:	# 0x1b6
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q0,q6 # [0*II+0] id:763
    ee.vmulas.u8.accx.ld.xp.qup q1,a15,a12,q1,q0,q2,q3 # [0*II+2] id:764
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+3] id:765
    ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q0,q2,q3,q1 # [0*II+5] id:766
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q3,q6 # [0*II+6] id:768
    ee.ld.128.usar.xp q0,a15,a10 # [0*II+7] id:767
    ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q1,q3,q0,q2 # [0*II+9] id:769

.LBB163_dspi_dotprod_off_u8_aes3:	# 0x1d1
    st.qr q1,a1,48 # [0] q0
    j .Lt_0_24578 # [1]

.LBB43_dspi_dotprod_off_u8_aes3:	# 0x1d7
    srli a3,a6,1 # [0]
    l32i a12,a1,64 # [1] gra_spill_temp_0
    ee.ld.128.usar.ip q1,a15,16 # [2] id:772
    ee.ld.128.usar.ip q2,a15,16 # [3] id:773
    addi a12,a12,-16 # [5]
    ee.src.q.ld.xp q3,a15,a12,q1,q2 # [6] id:774
    beqz.n a3,.Lt_0_26626 # [7]

    ld.qr q0,a1,48 # [0] q0
    movi.n a10,32 # [1]
    movi.n a11,-16 # [2]
    loopnez a3,.LBB186_dspi_dotprod_off_u8_aes3 # [3]

.LBB184_dspi_dotprod_off_u8_aes3:	# 0x1f5
    ee.vmulas.u8.accx.ld.xp.qup q0,a15,a11,q0,q1,q2,q3 # [0*II+0] id:775
    ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+1] id:776
    ee.ld.128.usar.xp q1,a15,a10 # [0*II+2] id:777
    ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q2,q1,q0 # [0*II+4] id:778
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q2,q6 # [0*II+5] id:779
    ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q4,q1,q0,q3 # [0*II+7] id:780
    ee.vmulas.u8.accx.ld.ip q3,a2,16,q1,q6 # [0*II+8] id:781
    ee.ld.128.usar.xp q1,a15,a10 # [0*II+9] id:782
    ee.vmulas.u8.accx.ld.xp.qup q3,a15,a12,q3,q0,q1,q2 # [0*II+11] id:783
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+12] id:784

.LBB186_dspi_dotprod_off_u8_aes3:	# 0x21b
    st.qr q0,a1,48 # [0] q0
    j .Lt_0_26626 # [1]

.LBB50_dspi_dotprod_off_u8_aes3:	# 0x221
    srli a3,a3,2 # [0]
    movi.n a13,-16 # [1]
    l32i a11,a1,64 # [2] gra_spill_temp_0
    addi a15,a15,16 # [3]
    addi a11,a11,16 # [4]
    ee.ld.128.usar.xp q2,a15,a13 # [5] id:785
    ee.ld.128.usar.xp q1,a15,a11 # [6] id:786
    ee.src.q.ld.xp q3,a15,a13,q1,q2 # [8] id:787
    ee.ld.128.usar.xp q2,a15,a11 # [9] id:788
    beqz.n a3,.Lt_0_28162 # [10]

    ld.qr q0,a1,48 # [0] q0
    movi.n a10,-16 # [1]
    loopnez a3,.LBB209_dspi_dotprod_off_u8_aes3 # [2]

.LBB207_dspi_dotprod_off_u8_aes3:	# 0x245
    ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q2,q3 # [0*II+0] id:789
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+1] id:790
    ee.ld.128.usar.xp q1,a15,a11 # [0*II+2] id:791
    ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q2,q1,q3 # [0*II+4] id:792
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q2,q6 # [0*II+5] id:793
    ee.ld.128.usar.xp q4,a15,a11 # [0*II+6] id:794
    ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q1,q4,q3 # [0*II+8] id:795
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q1,q6 # [0*II+9] id:796
    ee.ld.128.usar.xp q1,a15,a11 # [0*II+10] id:797
    ee.vmulas.u8.accx.ld.xp.qup q3,a15,a10,q0,q4,q1,q3 # [0*II+12] id:798
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q4,q6 # [0*II+13] id:799
    ee.ld.128.usar.xp q2,a15,a11 # [0*II+14] id:800

.LBB209_dspi_dotprod_off_u8_aes3:	# 0x271
    st.qr q0,a1,48 # [0] q0
    j .Lt_0_28162 # [1]

.LBB57_dspi_dotprod_off_u8_aes3:	# 0x277
    ee.ld.128.usar.ip q1,a15,16 # [0] id:801
    ee.ld.128.usar.ip q2,a15,16 # [1] id:802
    ee.src.q.ld.ip q3,a15,16,q1,q2 # [3] id:803
    beqz.n a3,.Lt_0_29698 # [4]

    ld.qr q0,a1,48 # [0] q0
    movi.n a10,32 # [1]
    l32i a12,a1,64 # [2] gra_spill_temp_0
    movi.n a11,-16 # [3]
    sub a12,a12,a5 # [4]
    addi a12,a12,16 # [5]
    loopnez a3,.LBB232_dspi_dotprod_off_u8_aes3 # [6]

.LBB230_dspi_dotprod_off_u8_aes3:	# 0x295
    ee.vmulas.u8.accx.ld.ip.qup q0,a15,16,q0,q1,q2,q3 # [0*II+0] id:804
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:805
    ee.vmulas.u8.accx.ld.xp.qup q4,a15,a12,q4,q2,q3,q0 # [0*II+3] id:806
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:807
    ee.vmulas.u8.accx.ld.xp.qup q2,a15,a11,q1,q3,q0,q4 # [0*II+6] id:808
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q3,q6 # [0*II+7] id:809
    ee.ld.128.usar.xp q1,a15,a10 # [0*II+8] id:810
    ee.vmulas.u8.accx.ld.ip.qup q3,a15,16,q4,q0,q1,q2 # [0*II+10] id:811
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+11] id:812

.LBB232_dspi_dotprod_off_u8_aes3:	# 0x2b8
    st.qr q0,a1,48 # [0] q0
    j .Lt_0_29698 # [1]

.LBB64_dspi_dotprod_off_u8_aes3:	# 0x2be
    movi.n a10,32 # [0]
    movi.n a11,-16 # [1]
    l32i a12,a1,64 # [2] gra_spill_temp_0
    ee.ld.128.usar.ip q1,a15,16 # [3] id:813
    ee.ld.128.usar.ip q2,a15,16 # [4] id:814
    sub a12,a12,a5 # [6]
    addi a12,a12,16 # [7]
    ld.qr q0,a1,48 # [8] q0
    ee.src.q.ld.ip q3,a15,16,q1,q2 # [9] id:815
    mov.n a8,a15 # [10]
    loopnez a3,.LBB254_dspi_dotprod_off_u8_aes3 # [11]

.LBB252_dspi_dotprod_off_u8_aes3:	# 0x2dc
    ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q1,q2,q3 # [0*II+0] id:816
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q1,q6 # [0*II+1] id:817
    ee.vmulas.u8.accx.ld.ip.qup q4,a8,16,q4,q2,q3,q0 # [0*II+3] id:818
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q2,q6 # [0*II+4] id:819
    ee.vmulas.u8.accx.ld.ip.qup q1,a8,16,q1,q3,q0,q4 # [0*II+6] id:820
    ee.vmulas.u8.accx.ld.ip q5,a2,16,q3,q6 # [0*II+7] id:821
    ee.vmulas.u8.accx.ld.ip.qup q5,a8,16,q5,q0,q4,q1 # [0*II+9] id:822
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+10] id:823
    ee.vmulas.u8.accx.ld.ip.qup q0,a8,16,q0,q4,q1,q5 # [0*II+12] id:824
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q4,q6 # [0*II+13] id:825
    ee.vmulas.u8.accx.ld.xp.qup q4,a8,a12,q4,q1,q5,q0 # [0*II+15] id:826
    ee.vmulas.u8.accx.ld.ip q1,a2,16,q1,q6 # [0*II+16] id:827
    ee.vmulas.u8.accx.ld.xp.qup q2,a8,a11,q1,q5,q0,q4 # [0*II+18] id:828
    ee.vmulas.u8.accx.ld.ip q4,a2,16,q5,q6 # [0*II+19] id:829
    ee.ld.128.usar.xp q1,a8,a10 # [0*II+20] id:830
    ee.vmulas.u8.accx.ld.ip.qup q3,a8,16,q4,q0,q1,q2 # [0*II+22] id:831
    ee.vmulas.u8.accx.ld.ip q0,a2,16,q0,q6 # [0*II+23] id:832

.LBB254_dspi_dotprod_off_u8_aes3:	# 0x31f
    movi.n a2,0 # [0]
    movi.n a11,1 # [1]
    addi.n a12,a7,-1 # [2]
    rur.accx_0 a10 # [3]
    ssl a12 # [4]
    sll a11,a11 # [5]
    ssr a7 # [6]
    add.n a10,a10,a11 # [7]
    sra a10,a10 # [8]
    s8i a10,a4,0 # [9] id:854
    retw.n # [10]
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Reference (ANSI C) 2-D dot product with offset, unsigned 8-bit data
 *
 * Sums in_image[y][x] * (filter[y][x] + offset) over a count_x by count_y
 * window, adds a round-to-nearest term, shifts the sum right by `shift` and
 * writes the low 8 bits of the result to *out_value.
 *
 * @param in_image   input image; data, step_x, step_y, stride_x, stride_y are read
 * @param filter     filter image; same fields are read
 * @param out_value  receives the shifted sum, truncated to uint8_t
 * @param count_x    number of elements processed per row
 * @param count_y    number of rows processed
 * @param shift      right shift applied to the accumulated sum; must be
 *                   non-negative and below 32 for the shift to be defined
 * @param offset     value added to every filter element before multiplying
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of
 *        either image in either dimension
 */
esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;
    // Row-to-row advance, expressed in elements (bytes for 8-bit data).
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * ((int16_t)f_data[filter->step_x * x] + (int16_t)offset);
        }
        i_data += i_step;
        f_data += f_step;
    }
    // Round to nearest. With shift == 0 there is nothing to round, and
    // 1 << (shift - 1) would be a shift by a negative count (undefined).
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);
    }
    acc >>= shift;
    *out_value = (uint8_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,102 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_off_u8_arp4
|
||||
.global dspi_dotprod_off_u8_ansi
|
||||
.type dspi_dotprod_off_u8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
//
// 2-D dot product with offset for unsigned 8-bit data, using the esp.* vector
// instructions. Takes the vector path only when (in_image->step_x | filter->step_x) == 1
// and count_x is a multiple of 16; otherwise it tail-jumps to dspi_dotprod_off_u8_ansi
// with the argument registers untouched. Unlike the ANSI version, the vector path
// performs no step/stride range checks and always returns 0.
dspi_dotprod_off_u8_arp4:
// in_image - a0
// filter - a1
// out_value - a2
// count_x - a3
// count_y - a4
// shift - a5
// offset - a6

// i_data - t0   (start of the current input row)
// f_data - t1   (start of the current filter row)
// i_step - t2   (row advance for i_data, in bytes)
// f_step - t3   (row advance for f_data, in bytes)
// t4 - current i_data
// t5 - current f_data
// t6 - number of 128-bit vectors per row (count_x / 16)

    lw t1, 4(a0) // load in_image->step_x
    lw t2, 4(a1) // load filter->step_x
    or t1, t1, t2
    addi t1, t1, -1 // zero only when (in_image->step_x | filter->step_x) == 1
    andi t2, a3, 15 // count_x % 16
    or t1, t1, t2 // non-zero if either condition for the vector path fails

    beqz t1, .dspi_dotprod_off_u8_arp4_body
    j dspi_dotprod_off_u8_ansi // tail call: a0..a6 still hold the original arguments

.dspi_dotprod_off_u8_arp4_body:
    add sp, sp, -16 // scratch slot so the offset can be loaded from memory

    sw a6, 0(sp) // spill offset
    mv t6, sp
    esp.vldbc.8.ip q2, t6, 0 // presumably broadcasts the 8-bit offset into every lane of q2 - confirm against the PIE documentation

    lw t0, 0(a0) // i_data
    lw t1, 0(a1) // f_data


    lw t2, 8(a0) // step_y
    lw t4, 12(a0) // stride_x
    mul t2, t4, t2 // i_step (1 byte per element, no scaling needed)

    lw t3, 8(a1) // step_y
    lw t5, 12(a1) // stride_x
    mul t3, t5, t3 // f_step (1 byte per element, no scaling needed)

    srli t6, a3, 4 // t6 = count_x / 16 = inner loop count


    addi a7, a5, -1 // shift - 1
    li t4, 1
    sll t4, t4, a7 // rounding term 1 << (shift - 1); NOTE(review): for shift == 0 the shift count is -1 - confirm callers never pass 0
    esp.zero.xacc
    esp.movx.w.xacc.l t4 // seed the low word of the accumulator with the rounding term

.loop_count_y:
    mv t4, t0
    mv t5, t1
    esp.vld.128.ip q1, t5, 16 // q1 - first f_data vector of the row

    esp.lp.setup 0, t6, .loop_count_x // NOTE(review): count_x == 0 reaches here with t6 == 0 - confirm hardware-loop behaviour
    esp.vld.128.ip q0, t4, 16 // q0 - i_data
    esp.vadd.u8 q3, q2, q1 // q3 = offset + f_data
.loop_count_x: esp.vmulas.u8.xacc.ld.ip q1, t5, 16, q0, q3 // accumulate q0 * q3, then load the next f_data vector into q1

    add t0, t0, t2 // next input row
    add t1, t1, t3 // next filter row
    add a4,a4, -1
    bgtz a4, .loop_count_y // the row body runs before count_y is tested

    esp.srs.u.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    sb t5, 0(a2) // out_value is uint8_t*: store one byte only (a halfword store would overrun the output)

    li a0,0 // return 0
    add sp,sp,16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,372 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_53, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_s16_aes3
|
||||
.type dspi_dotprod_s16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_s16_aes3
|
||||
// dspi_dotprod_s16_aes3 - 2D dot product of int16 data using the ee.* vector
// instructions, with dspi_dotprod_s16_ansi as the fallback implementation.
//
// Incoming registers (inferred from how they are forwarded, in order, to
// dspi_dotprod_s16_ansi through a10..a15 before each call8):
//   a2 = in_image, a3 = filter, a4 = out_value,
//   a5 = count_x,  a6 = count_y, a7 = shift
// Image descriptor words read here: +0 (data pointer), +4, +8, +12, +16.
// The four compares below mirror the step_x/step_y versus stride_x/stride_y
// checks of the ANSI version, so +4/+8 are presumably step_x/step_y and
// +12/+16 stride_x/stride_y - confirm against the image2d_t declaration.
//
// Result: a2 = 0 on the vector path, the literal .LC0_1_53 when a range check
// fails, or whatever dspi_dotprod_s16_ansi returned on the fallback paths.
// This listing carries compiler-style annotations ("# [n] id:", gra_spill_temp);
// the instruction order inside the loops is software-pipelined and must not be
// rearranged.
dspi_dotprod_s16_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_s16_aes3: # 0x4
|
||||
entry a1,64 #
|
||||
// Range check 1: in_image word +4 times count_x must not exceed word +12.
l32i.n a10,a2,4 # [0] id:678
|
||||
l32i.n a11,a2,12 # [1] id:677
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a11,a8,.LBB81_dspi_dotprod_s16_aes3 # [4]
|
||||
|
||||
// Range check 2: in_image word +8 times count_y must not exceed word +16.
l32i.n a12,a2,8 # [0] id:679
|
||||
l32i.n a9,a2,16 # [1] id:680
|
||||
mull a13,a12,a6 # [2]
|
||||
blt a9,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
|
||||
|
||||
// Range check 3: filter word +4 times count_x must not exceed word +12.
// a13 keeps the product; it is compared for equality further down.
l32i.n a15,a3,4 # [0] id:682
|
||||
l32i.n a14,a3,12 # [1] id:681
|
||||
mull a13,a15,a5 # [2]
|
||||
blt a14,a13,.LBB81_dspi_dotprod_s16_aes3 # [4]
|
||||
|
||||
// Range check 4: filter word +8 times count_y must not exceed word +16.
// The filter word +8 is spilled to sp+24 before a9 is reused for the product.
l32i.n a8,a3,16 # [0] id:684
|
||||
l32i.n a9,a3,8 # [1] id:683
|
||||
s32i.n a9,a1,24 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB81_dspi_dotprod_s16_aes3 # [5]
|
||||
|
||||
// Filter data pointer (word +0) is spilled to sp+20. The vector path is only
// considered when bit 0 of that pointer is clear, the filter word +12 equals
// word +4 times count_x, and the filter words +4 and +8 are both 1.
l32i.n a8,a3,0 # [0] id:685
|
||||
s32i.n a8,a1,20 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_34050 # [2]
|
||||
|
||||
bne a14,a13,.Lt_0_34050 # [0]
|
||||
|
||||
bnei a15,1,.Lt_0_34050 # [0]
|
||||
|
||||
l32i.n a9,a1,24 # [0] gra_spill_temp_2
|
||||
beqi a9,1,.Lt_0_18178 # [2]
|
||||
|
||||
// Fallback: forward the six arguments unchanged to the ANSI implementation and
// return its result.
.Lt_0_34050: # 0x43
|
||||
.Lt_0_18434: # 0x43
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
.type dspi_dotprod_s16_ansi, @function
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Range-check failure: return the literal 458755 (0x70003); presumably
// ESP_ERR_DSP_PARAM_OUTOFRANGE, as returned by the ANSI version - confirm.
.LBB81_dspi_dotprod_s16_aes3: # 0x56
|
||||
l32r a2,.LC0_1_53 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Remaining vector-path conditions: in_image words +4 and +8 must both be 1,
// count_x must have its low 3 bits clear (multiple of 8) and count_y must be
// at least 4; otherwise fall back to the ANSI implementation.
.Lt_0_18178: # 0x5b
|
||||
addi.n a13,a10,-1 # [0]
|
||||
bnez a13,.Lt_0_34818 # [1]
|
||||
|
||||
addi.n a14,a12,-1 # [0]
|
||||
bnez a14,.Lt_0_34818 # [1]
|
||||
|
||||
extui a15,a5,0,3 # [0]
|
||||
bnez.n a15,.Lt_0_34818 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_34818 # [0]
|
||||
|
||||
movi.n a8,32 # [0]
|
||||
bge a8,a5,.Lt_0_35330 # [1]
|
||||
|
||||
// For count_x > 32 an odd count_x would also fall back; bit 0 is already known
// to be clear here because of the low-3-bit test above.
extui a9,a5,0,1 # [0]
|
||||
bnez a9,.LBB28_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
// Vector-path setup: a3 = count_y, a2 = in_image data pointer, a15 = filter
// data pointer, SAR_BYTE and ACCX_0/ACCX_1 cleared, q0 preloaded from the
// filter. sp+16 receives (in_image word +12 * word +8) << 1, i.e. the product
// scaled by 2 (bytes per int16 element).
.Lt_0_35330: # 0x78
|
||||
.Lt_0_20226: # 0x78
|
||||
mov.n a3,a6 # [0]
|
||||
addi a10,a5,-24 # [1]
|
||||
mull a13,a11,a12 # [2]
|
||||
l32i.n a15,a1,20 # [3] gra_spill_temp_1
|
||||
l32i.n a2,a2,0 # [4] id:686
|
||||
movi.n a14,0 # [5]
|
||||
wur.sar_byte a14 # [6]
|
||||
wur.accx_0 a14 # [8]
|
||||
wur.accx_1 a14 # [9]
|
||||
ee.vld.128.ip q0,a15,16 # [10] id:690
|
||||
slli a13,a13,1 # [11]
|
||||
s32i.n a13,a1,16 # [12] gra_spill_temp_0
|
||||
beqz a10,.LBB32_dspi_dotprod_s16_aes3 # [13]
|
||||
|
||||
// Dispatch on count_x: 24, 16, 8, 32 and 64 each have a dedicated loop; the
// specialised loops for 24, 16, 8 and 32 jump back into this chain when done.
.Lt_0_23298: # 0x99
|
||||
.Lt_0_22786: # 0x99
|
||||
addi a8,a5,-16 # [0]
|
||||
beqz a8,.LBB38_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_24834: # 0x9f
|
||||
.Lt_0_24322: # 0x9f
|
||||
addi a9,a5,-8 # [0]
|
||||
beqz a9,.LBB44_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_26370: # 0xa5
|
||||
.Lt_0_25858: # 0xa5
|
||||
addi a10,a5,-32 # [0]
|
||||
beqz a10,.LBB50_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
.Lt_0_27906: # 0xab
|
||||
.Lt_0_27394: # 0xab
|
||||
addi a11,a5,-64 # [0]
|
||||
beqz a11,.LBB56_dspi_dotprod_s16_aes3 # [1]
|
||||
|
||||
// Generic case, taken only when count_x > 64; any other count_x goes straight
// to the result stage.
movi.n a12,64 # [0]
|
||||
bge a12,a5,.Lt_0_30722 # [1]
|
||||
|
||||
// a12 counts processed rows. The three loads prime q1..q3 from the input row
// (the usar/src.q forms presumably handle an unaligned input pointer - confirm
// against the instruction reference).
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a2,16 # [1] id:762
|
||||
ee.ld.128.usar.ip q2,a2,16 # [2] id:763
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:764
|
||||
beqz.n a3,.Lt_0_30722 # [5]
|
||||
|
||||
// a13 = count_x / 32 - 1 (inner loop trip count; the movgez never replaces the
// biased value with a different quotient for non-negative count_x).
// a14 = value at sp+16 - 2 * count_x + 16, used as an input pointer step.
slli a8,a5,1 # [0]
|
||||
l32i.n a14,a1,16 # [1] gra_spill_temp_0
|
||||
addi a13,a5,31 # [2]
|
||||
movgez a13,a5,a5 # [3]
|
||||
srai a13,a13,5 # [4]
|
||||
sub a14,a14,a8 # [5]
|
||||
addi a14,a14,16 # [6]
|
||||
addi.n a13,a13,-1 # [7]
|
||||
|
||||
// Outer loop over rows.
.Lt_0_31490: # 0xd9
|
||||
addi.n a12,a12,1 # [0]
|
||||
movi.n a9,32 # [1]
|
||||
beqz.n a13,.Lt_0_31746 # [2]
|
||||
|
||||
// Inner zero-overhead loop: four multiply-accumulate steps per iteration.
loopnez a13,.LBB221_dspi_dotprod_s16_aes3 # [0]
|
||||
|
||||
.LBB219_dspi_dotprod_s16_aes3: # 0xe2
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:766
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:765
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:768
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:767
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:770
|
||||
ee.vmulas.s16.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:769
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+6] id:772
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:771
|
||||
|
||||
.LBB221_dspi_dotprod_s16_aes3: # 0xfe
|
||||
|
||||
// Row tail: the last four multiply-accumulate steps of the row, stepping the
// input pointer by a14, -16 and 32 along the way.
.Lt_0_31746: # 0xfe
|
||||
ee.vmulas.s16.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:773
|
||||
movi.n a10,-16 # [1]
|
||||
ee.vld.128.ip q0,a15,16 # [2] id:774
|
||||
ee.vld.128.ip q6,a15,16 # [3] id:776
|
||||
ee.vmulas.s16.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [4] id:775
|
||||
ee.vld.128.ip q4,a15,16 # [5] id:779
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a10,q6,q3,q5,q7 # [6] id:777
|
||||
ee.ld.128.usar.xp q1,a2,a9 # [7] id:778
|
||||
ee.vld.128.ip q0,a15,16 # [8] id:781
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [9] id:780
|
||||
bne a12,a3,.Lt_0_31490 # [10]
|
||||
|
||||
// Result stage: a9 = ACCX_0, a10 = ACCX_1. For shift >= 1 the accumulator is
// shifted right by (shift - 1) - taking the arithmetic shift of ACCX_1 alone
// when shift >= 33 - then 1 is added and the value is shifted right once more
// before its low 16 bits are stored to out_value. Returns 0.
.Lt_0_30722: # 0x122
|
||||
.Lt_0_30466: # 0x122
|
||||
rur.accx_0 a9 # [0]
|
||||
rur.accx_1 a10 # [1]
|
||||
blti a7,1,.Lt_0_33282 # [2]
|
||||
|
||||
movi.n a2,0 # [0]
|
||||
addi a13,a7,-33 # [1]
|
||||
addi.n a14,a7,-1 # [2]
|
||||
ssr a14 # [3]
|
||||
sra a12,a10 # [4]
|
||||
src a11,a10,a9 # [5]
|
||||
movgez a11,a12,a13 # [6]
|
||||
addi.n a11,a11,1 # [7]
|
||||
srai a11,a11,1 # [8]
|
||||
s16i a11,a4,0 # [9] id:787
|
||||
retw.n # [10]
|
||||
|
||||
// Fallback (vector-path conditions not met): call the ANSI implementation.
.Lt_0_34818: # 0x148
|
||||
.Lt_0_19458: # 0x148
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// count_x == 24: loop runs count_y times with three multiply-accumulate steps
// per iteration. a13 still holds the value that was stored at sp+16.
.LBB32_dspi_dotprod_s16_aes3: # 0x15b
|
||||
ee.ld.128.usar.ip q1,a2,16 # [0] id:691
|
||||
ee.ld.128.usar.ip q2,a2,16 # [1] id:692
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:693
|
||||
beqz.n a6,.Lt_0_23298 # [4]
|
||||
|
||||
addi a12,a13,-32 # [0]
|
||||
movi.n a10,32 # [1]
|
||||
movi.n a11,-16 # [2]
|
||||
loopgtz a6,.LBB107_dspi_dotprod_s16_aes3 # [3]
|
||||
|
||||
.LBB105_dspi_dotprod_s16_aes3: # 0x170
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:695
|
||||
ee.vmulas.s16.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:694
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+2] id:697
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:696
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:698
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+5] id:700
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:699
|
||||
|
||||
.LBB107_dspi_dotprod_s16_aes3: # 0x188
|
||||
j .Lt_0_23298 # [0]
|
||||
|
||||
// count_x == 16: loop count is count_y >> 1, four multiply-accumulate steps per
// iteration.
// NOTE(review): that looks like two rows per iteration, so an odd count_y would
// appear to leave the last row unprocessed - confirm.
.LBB38_dspi_dotprod_s16_aes3: # 0x18b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
srli a3,a6,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:701
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:702
|
||||
addi a12,a12,-16 # [7]
|
||||
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:703
|
||||
loopnez a3,.LBB130_dspi_dotprod_s16_aes3 # [9]
|
||||
|
||||
.LBB128_dspi_dotprod_s16_aes3: # 0x1a3
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:705
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:704
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:706
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:708
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:707
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+5] id:710
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:709
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:711
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+8] id:713
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:712
|
||||
|
||||
.LBB130_dspi_dotprod_s16_aes3: # 0x1c5
|
||||
j .Lt_0_24834 # [0]
|
||||
|
||||
// count_x == 8: loop count is count_y >> 2 (a3 holds count_y on entry), four
// multiply-accumulate steps per iteration; the input pointer is walked in a8
// and copied back to a2 afterwards.
// NOTE(review): that looks like four rows per iteration, so a count_y that is
// not a multiple of 4 would appear to leave rows unprocessed - confirm.
.LBB44_dspi_dotprod_s16_aes3: # 0x1c8
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a10,-16 # [1]
|
||||
l32i.n a11,a1,16 # [2] gra_spill_temp_0
|
||||
addi a8,a2,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a8,a10 # [5] id:714
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [6] id:715
|
||||
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:716
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [9] id:717
|
||||
loopnez a3,.LBB153_dspi_dotprod_s16_aes3 # [10]
|
||||
|
||||
.LBB151_dspi_dotprod_s16_aes3: # 0x1e4
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:719
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:718
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:720
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:722
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:721
|
||||
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:723
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+6] id:725
|
||||
ee.vmulas.s16.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:724
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:726
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+9] id:728
|
||||
ee.vmulas.s16.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:727
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:729
|
||||
|
||||
.LBB153_dspi_dotprod_s16_aes3: # 0x20c
|
||||
mov.n a2,a8 # [0]
|
||||
j .Lt_0_26370 # [1]
|
||||
|
||||
// count_x == 32: loop runs a3 (= count_y) times with four multiply-accumulate
// steps per iteration. a12 = value at sp+16 - 2 * count_x + 16.
.LBB50_dspi_dotprod_s16_aes3: # 0x211
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:730
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:731
|
||||
sub a12,a12,a13 # [6]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:732
|
||||
addi a12,a12,16 # [9]
|
||||
loopnez a3,.LBB176_dspi_dotprod_s16_aes3 # [10]
|
||||
|
||||
.LBB174_dspi_dotprod_s16_aes3: # 0x22c
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:734
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:733
|
||||
ee.vld.128.ip q1,a15,16 # [0*II+2] id:736
|
||||
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:735
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:737
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:738
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+7] id:741
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:740
|
||||
|
||||
.LBB176_dspi_dotprod_s16_aes3: # 0x24b
|
||||
j .Lt_0_27906 # [0]
|
||||
|
||||
// count_x == 64: loop runs a3 (= count_y) times with eight multiply-accumulate
// steps per iteration, then jumps to the result stage.
.LBB56_dspi_dotprod_s16_aes3: # 0x24e
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
slli a13,a5,1 # [2]
|
||||
l32i.n a12,a1,16 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:742
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:743
|
||||
sub a12,a12,a13 # [7]
|
||||
addi a12,a12,16 # [8]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [9] id:744
|
||||
loopnez a3,.LBB198_dspi_dotprod_s16_aes3 # [10]
|
||||
|
||||
.LBB196_dspi_dotprod_s16_aes3: # 0x269
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:746
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:745
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:748
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:747
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:750
|
||||
ee.vmulas.s16.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:749
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+6] id:752
|
||||
ee.vmulas.s16.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:751
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+8] id:754
|
||||
ee.vmulas.s16.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:753
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+10] id:756
|
||||
ee.vmulas.s16.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:755
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+12] id:759
|
||||
ee.vmulas.s16.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:757
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:758
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+15] id:761
|
||||
ee.vmulas.s16.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:760
|
||||
|
||||
.LBB198_dspi_dotprod_s16_aes3: # 0x2a4
|
||||
j .Lt_0_30722 # [0]
|
||||
|
||||
// shift < 1: no rounding or shifting; the sign-extended low 16 bits of ACCX_0
// are stored to out_value and 0 is returned.
.Lt_0_33282: # 0x2a7
|
||||
movi.n a2,0 # [0]
|
||||
sext a14,a9,15 # [1]
|
||||
s16i a14,a4,0 # [2] id:788
|
||||
retw.n # [3]
|
||||
|
||||
// Fallback for count_x > 32 with bit 0 set: call the ANSI implementation.
.LBB28_dspi_dotprod_s16_aes3: # 0x2b1
|
||||
mov.n a15,a7 # [0]
|
||||
mov.n a14,a6 # [1]
|
||||
mov.n a13,a5 # [2]
|
||||
mov.n a12,a4 # [3]
|
||||
mov.n a11,a3 # [4]
|
||||
mov.n a10,a2 # [5]
|
||||
call8 dspi_dotprod_s16_ansi # [6] dspi_dotprod_s16_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Portable (ANSI C) 2D dot product of two int16 images.
 *
 * Multiplies count_x * count_y samples of in_image with the matching samples
 * of filter, accumulates the products in 64 bits, then rounds and scales the
 * sum down by `shift` bits.
 *
 * @param[in]  in_image   input image descriptor (data, step_x/step_y, stride_x/stride_y)
 * @param[in]  filter     filter image descriptor, same layout as in_image
 * @param[out] out_value  receives the low 16 bits of the scaled sum
 * @param[in]  count_x    number of samples taken per row
 * @param[in]  count_y    number of rows
 * @param[in]  shift      right shift applied to the sum; for shift <= 0 the
 *                        sum is stored without rounding or shifting
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of
 *        either image in x or y
 */
esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    int16_t *i_data = (int16_t *)in_image->data;
    int16_t *f_data = (int16_t *)filter->data;
    // Distance between two processed rows, in samples.
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * (int32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Round to nearest, then scale. Shifting by a negative amount is undefined
    // behaviour, so shift <= 0 skips this step entirely. The rounding constant
    // is built in 64 bits so that shift values above 31 stay well defined.
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,95 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_s16_arp4
|
||||
.global dspi_dotprod_s16_ansi
|
||||
.type dspi_dotprod_s16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
|
||||
// 2D dot product of int16 data using the esp.* vector instructions.
// Falls back to dspi_dotprod_s16_ansi (tail jump, arguments untouched) unless
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 8.
// On the vector path a0 is set to 0 before returning.
// NOTE(review): the vector path performs none of the step/stride range checks
// that the ANSI version does - confirm callers validate the arguments.
dspi_dotprod_s16_arp4:
|
||||
// in_image - a0
|
||||
// filter - a1
|
||||
// out_value - a2
|
||||
// count_x - a3
|
||||
// count_y - a4
|
||||
// shift - a5
|
||||
|
||||
// i_data - t0
|
||||
// f_data - t1
|
||||
// i_step - t2
|
||||
// f_step - t3
|
||||
// t4 - current i_data
|
||||
// t5 - current f_data
|
||||
|
||||
lw t1, 4(a0) // load in_image->step_x
|
||||
lw t2, 4(a1) // load filter->step_x
|
||||
or t1, t1, t2
|
||||
addi t1, t1, -1 // zero only when (in step_x | filter step_x) == 1
|
||||
andi t2, a3, 7 // count_x modulo 8
|
||||
or t1, t1, t2 // non-zero if either condition for the vector path fails
|
||||
|
||||
beqz t1, .dspi_dotprod_s16_arp4_body
|
||||
j dspi_dotprod_s16_ansi // fallback: a0..a5 still hold the original arguments
|
||||
|
||||
.dspi_dotprod_s16_arp4_body:
|
||||
add sp, sp, -16 // reserve 16 bytes of stack; nothing below stores to it
|
||||
lw t0, 0(a0) // i_data
|
||||
lw t1, 0(a1) // f_data
|
||||
|
||||
lw t2, 8(a0) // step_y
|
||||
lw t4, 12(a0) // stride_x
|
||||
mul t2, t4, t2
|
||||
slli t2, t2, 1 // i_step in bytes (2 bytes per int16 sample)
|
||||
|
||||
lw t3, 8(a1) // step_y
|
||||
lw t5, 12(a1) // stride_x
|
||||
mul t3, t5, t3
|
||||
slli t3, t3, 1 // f_step in bytes (2 bytes per int16 sample)
|
||||
|
||||
srli t6, a3, 3 // t6 = count_x / 8, the inner loop count
|
||||
|
||||
addi a6, a5, -1
|
||||
li t4, 1
|
||||
sll t4, t4, a6 // t4 = 1 << (shift - 1), the rounding constant
|
||||
esp.zero.xacc
|
||||
esp.movx.w.xacc.l t4 // presumably seeds the low word of XACC with the rounding constant - confirm
|
||||
|
||||
.loop_count_y:
|
||||
mv t4, t0
|
||||
mv t5, t1
|
||||
esp.vld.128.ip q0, t4, 16 // q0 - first 128 bits of the i_data row
|
||||
|
||||
esp.lp.setup 0, t6, .loop_count_x // presumably a hardware loop of t6 iterations ending at .loop_count_x - confirm
|
||||
esp.vld.128.ip q1, t5, 16 // q1 - f_data
|
||||
.loop_count_x: esp.vmulas.s16.xacc.ld.ip q0, t4, 16, q0, q1 // accumulate q0 * q1 into XACC and load the next q0 from i_data
|
||||
|
||||
add t0, t0, t2 // advance i_data to the next row
|
||||
add t1, t1, t3 // advance f_data to the next row
|
||||
add a4,a4, -1
|
||||
bgtz a4, .loop_count_y
|
||||
|
||||
esp.srs.s.xacc t5, a5 // shift accx register by final_shift amount (a5), save the lower 32bits to t5
|
||||
sh t5, 0(a2) // store the low 16 bits of the result to out_value
|
||||
|
||||
li a0,0 // return 0
|
||||
add sp,sp,16 // release the stack reserved at entry
|
||||
ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,370 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_52, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_s8_aes3
|
||||
.type dspi_dotprod_s8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_s8_aes3
|
||||
// dspi_dotprod_s8_aes3 - 2D dot product of int8 data using the ee.* vector
// instructions, with dspi_dotprod_s8_ansi as the fallback implementation.
//
// Incoming registers (inferred from how they are forwarded, in order, to
// dspi_dotprod_s8_ansi through a10..a15 before each call8):
//   a2 = in_image, a3 = filter, a4 = out_value,
//   a5 = count_x,  a6 = count_y, a7 = shift
// Image descriptor words read here: +0 (data pointer), +4, +8, +12, +16.
// The four compares below mirror the step_x/step_y versus stride_x/stride_y
// checks of the ANSI version, so +4/+8 are presumably step_x/step_y and
// +12/+16 stride_x/stride_y - confirm against the image2d_t declaration.
//
// Result: a2 = 0 on the vector path, the literal .LC0_1_52 when a range check
// fails, or whatever dspi_dotprod_s8_ansi returned on the fallback paths.
// This listing carries compiler-style annotations ("# [n] id:", gra_spill_temp);
// the instruction order inside the loops is software-pipelined and must not be
// rearranged.
dspi_dotprod_s8_aes3: # 0x4
|
||||
.LBB1_dspi_dotprod_s8_aes3: # 0x4
|
||||
entry a1,48 #
|
||||
// Range check 1: in_image word +4 times count_x must not exceed word +12.
l32i.n a10,a2,4 # [0] id:668
|
||||
l32i.n a11,a2,12 # [1] id:667
|
||||
mull a8,a10,a5 # [2]
|
||||
blt a11,a8,.LBB78_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
// Range check 2: in_image word +8 times count_y must not exceed word +16.
l32i.n a12,a2,8 # [0] id:669
|
||||
l32i.n a9,a2,16 # [1] id:670
|
||||
mull a13,a12,a6 # [2]
|
||||
blt a9,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
// Range check 3: filter word +4 times count_x must not exceed word +12.
// a13 keeps the product; it is compared for equality further down.
l32i.n a15,a3,4 # [0] id:672
|
||||
l32i.n a14,a3,12 # [1] id:671
|
||||
mull a13,a15,a5 # [2]
|
||||
blt a14,a13,.LBB78_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
// Range check 4: filter word +8 times count_y must not exceed word +16.
// The filter word +8 is spilled to sp+8 before a9 is reused for the product.
l32i.n a8,a3,16 # [0] id:674
|
||||
l32i.n a9,a3,8 # [1] id:673
|
||||
s32i.n a9,a1,8 # [2] gra_spill_temp_2
|
||||
mull a9,a9,a6 # [3]
|
||||
blt a8,a9,.LBB78_dspi_dotprod_s8_aes3 # [5]
|
||||
|
||||
// Filter data pointer (word +0) is spilled to sp+4. The vector path is only
// considered when bit 0 of that pointer is clear, the filter word +12 equals
// word +4 times count_x, and the filter words +4 and +8 are both 1.
l32i.n a8,a3,0 # [0] id:675
|
||||
s32i.n a8,a1,4 # [1] gra_spill_temp_1
|
||||
bbsi a8,0,.Lt_0_33026 # [2]
|
||||
|
||||
bne a14,a13,.Lt_0_33026 # [0]
|
||||
|
||||
bnei a15,1,.Lt_0_33026 # [0]
|
||||
|
||||
l32i.n a13,a1,8 # [0] gra_spill_temp_2
|
||||
beqi a13,1,.Lt_0_17666 # [2]
|
||||
|
||||
// Fallback: forward the six arguments unchanged to the ANSI implementation and
// return its result.
.Lt_0_33026: # 0x43
|
||||
.Lt_0_17922: # 0x43
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
.type dspi_dotprod_s8_ansi, @function
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Range-check failure: return the literal 458755 (0x70003); presumably
// ESP_ERR_DSP_PARAM_OUTOFRANGE, as returned by the ANSI version - confirm.
.LBB78_dspi_dotprod_s8_aes3: # 0x56
|
||||
l32r a2,.LC0_1_52 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// Remaining vector-path conditions: in_image words +4 and +8 must both be 1,
// count_x must have its low 4 bits clear (multiple of 16) and count_y must be
// at least 4; otherwise fall back to the ANSI implementation.
.Lt_0_17666: # 0x5b
|
||||
addi.n a14,a10,-1 # [0]
|
||||
bnez a14,.Lt_0_33794 # [1]
|
||||
|
||||
addi.n a15,a12,-1 # [0]
|
||||
bnez a15,.Lt_0_33794 # [1]
|
||||
|
||||
extui a8,a5,0,4 # [0]
|
||||
bnez.n a8,.Lt_0_33794 # [1]
|
||||
|
||||
blti a6,4,.Lt_0_33794 # [0]
|
||||
|
||||
movi.n a9,64 # [0]
|
||||
bge a9,a5,.Lt_0_34306 # [1]
|
||||
|
||||
// For count_x > 64 an odd count_x would also fall back; bit 0 is already known
// to be clear here because of the low-4-bit test above.
extui a10,a5,0,1 # [0]
|
||||
bnez a10,.LBB28_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
// Vector-path setup: a3 = count_y, a2 = in_image data pointer, a15 = filter
// data pointer, ACCX_0/ACCX_1 cleared, q0 preloaded from the filter.
// sp+0 receives in_image word +12 times word +8 (no scaling: 1 byte per sample).
.Lt_0_34306: # 0x78
|
||||
.Lt_0_19714: # 0x78
|
||||
mov.n a3,a6 # [0]
|
||||
addi a13,a5,-48 # [1]
|
||||
movi.n a14,0 # [2]
|
||||
mull a15,a11,a12 # [3]
|
||||
l32i.n a2,a2,0 # [4] id:676
|
||||
s32i.n a15,a1,0 # [6] gra_spill_temp_0
|
||||
wur.accx_0 a14 # [7]
|
||||
l32i.n a15,a1,4 # [8] gra_spill_temp_1
|
||||
wur.accx_1 a14 # [9]
|
||||
ee.vld.128.ip q0,a15,16 # [10] id:679
|
||||
beqz a13,.LBB32_dspi_dotprod_s8_aes3 # [11]
|
||||
|
||||
// Dispatch on count_x: 48, 32, 16, 64 and 128 each have a dedicated loop; the
// specialised loops for 48, 32, 16 and 64 jump back into this chain when done.
.Lt_0_22786: # 0x93
|
||||
.Lt_0_22274: # 0x93
|
||||
addi a8,a5,-32 # [0]
|
||||
beqz a8,.LBB38_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_24322: # 0x99
|
||||
.Lt_0_23810: # 0x99
|
||||
addi a9,a5,-16 # [0]
|
||||
beqz a9,.LBB44_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_25858: # 0x9f
|
||||
.Lt_0_25346: # 0x9f
|
||||
addi a10,a5,-64 # [0]
|
||||
beqz a10,.LBB50_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
.Lt_0_27394: # 0xa5
|
||||
.Lt_0_26882: # 0xa5
|
||||
addi a11,a5,-128 # [0]
|
||||
beqz a11,.LBB56_dspi_dotprod_s8_aes3 # [1]
|
||||
|
||||
// Generic case, taken only when count_x > 128; any other count_x goes straight
// to the result stage.
movi a12,128 # [0]
|
||||
bge a12,a5,.Lt_0_30210 # [1]
|
||||
|
||||
// a12 counts processed rows. The three loads prime q1..q3 from the input row
// (the usar/src.q forms presumably handle an unaligned input pointer - confirm
// against the instruction reference).
movi.n a12,0 # [0]
|
||||
ee.ld.128.usar.ip q1,a2,16 # [1] id:751
|
||||
ee.ld.128.usar.ip q2,a2,16 # [2] id:752
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [4] id:753
|
||||
beqz.n a3,.Lt_0_30210 # [5]
|
||||
|
||||
// a13 = count_x / 64 - 1 (inner loop trip count; the movgez never replaces the
// biased value with a different quotient for non-negative count_x).
// a14 = value at sp+0 - count_x + 16, used as an input pointer step.
l32i.n a14,a1,0 # [0] gra_spill_temp_0
|
||||
addi a13,a5,63 # [1]
|
||||
movgez a13,a5,a5 # [2]
|
||||
srai a13,a13,6 # [3]
|
||||
sub a14,a14,a5 # [4]
|
||||
addi a14,a14,16 # [5]
|
||||
addi.n a13,a13,-1 # [6]
|
||||
|
||||
// Outer loop over rows.
.Lt_0_30978: # 0xd1
|
||||
addi.n a12,a12,1 # [0]
|
||||
movi.n a8,32 # [1]
|
||||
movi.n a9,-16 # [2]
|
||||
beqz.n a13,.Lt_0_31234 # [3]
|
||||
|
||||
// Inner zero-overhead loop: four multiply-accumulate steps per iteration.
loopnez a13,.LBB218_dspi_dotprod_s8_aes3 # [0]
|
||||
|
||||
.LBB216_dspi_dotprod_s8_aes3: # 0xdc
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:755
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:754
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:757
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4 # [0*II+3] id:756
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:759
|
||||
ee.vmulas.s8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1 # [0*II+5] id:758
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+6] id:761
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+7] id:760
|
||||
|
||||
.LBB218_dspi_dotprod_s8_aes3: # 0xf8
|
||||
|
||||
// Row tail: the last four multiply-accumulate steps of the row, stepping the
// input pointer by a14, -16 and 32 along the way.
.Lt_0_31234: # 0xf8
|
||||
ee.vmulas.s8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3 # [0] id:762
|
||||
ee.vld.128.ip q0,a15,16 # [1] id:763
|
||||
ee.vld.128.ip q6,a15,16 # [2] id:765
|
||||
ee.vmulas.s8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5 # [3] id:764
|
||||
ee.vld.128.ip q4,a15,16 # [4] id:768
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7 # [5] id:766
|
||||
ee.ld.128.usar.xp q1,a2,a8 # [6] id:767
|
||||
ee.vld.128.ip q0,a15,16 # [7] id:770
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2 # [8] id:769
|
||||
bne a12,a3,.Lt_0_30978 # [9]
|
||||
|
||||
// Result stage: a10 = ACCX_0; a11 = 1 shifted left by (shift - 1) is added as
// the rounding term, the sum is shifted right arithmetically by shift and its
// low 8 bits are stored to out_value. Returns 0.
// NOTE(review): unlike the s16 variant there is no separate branch for
// shift < 1 here - confirm the intended behaviour for shift == 0.
.Lt_0_30210: # 0x11a
|
||||
.Lt_0_29954: # 0x11a
|
||||
movi.n a2,0 # [0]
|
||||
rur.accx_0 a10 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
movi.n a11,1 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:772
|
||||
retw.n # [10]
|
||||
|
||||
// Fallback (vector-path conditions not met): call the ANSI implementation.
.Lt_0_33794: # 0x136
|
||||
.Lt_0_18946: # 0x136
|
||||
mov.n a10,a2 # [0]
|
||||
mov.n a11,a3 # [1]
|
||||
mov.n a12,a4 # [2]
|
||||
mov.n a13,a5 # [3]
|
||||
mov.n a14,a6 # [4]
|
||||
mov.n a15,a7 # [5]
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
#.LBB25_dspi_dotprod_s8_aes3: # 0x145
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
// count_x == 48: loop runs count_y times with three multiply-accumulate steps
// per iteration.
.LBB32_dspi_dotprod_s8_aes3: # 0x149
|
||||
ee.ld.128.usar.ip q1,a2,16 # [0] id:680
|
||||
ee.ld.128.usar.ip q2,a2,16 # [1] id:681
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [3] id:682
|
||||
beqz.n a6,.Lt_0_22786 # [4]
|
||||
|
||||
movi.n a10,32 # [0]
|
||||
l32i.n a12,a1,0 # [1] gra_spill_temp_0
|
||||
movi.n a11,-16 # [2]
|
||||
addi a12,a12,-32 # [3]
|
||||
loopgtz a6,.LBB104_dspi_dotprod_s8_aes3 # [4]
|
||||
|
||||
.LBB102_dspi_dotprod_s8_aes3: # 0x160
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:684
|
||||
ee.vmulas.s8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3 # [0*II+1] id:683
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+2] id:686
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1 # [0*II+3] id:685
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+4] id:687
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+5] id:689
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2 # [0*II+6] id:688
|
||||
|
||||
.LBB104_dspi_dotprod_s8_aes3: # 0x178
|
||||
j .Lt_0_22786 # [0]
|
||||
|
||||
// count_x == 32: loop count is count_y >> 1, four multiply-accumulate steps per
// iteration.
// NOTE(review): that looks like two rows per iteration, so an odd count_y would
// appear to leave the last row unprocessed - confirm.
.LBB38_dspi_dotprod_s8_aes3: # 0x17b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
srli a3,a6,1 # [2]
|
||||
l32i.n a12,a1,0 # [3] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [4] id:690
|
||||
ee.ld.128.usar.ip q2,a2,16 # [5] id:691
|
||||
addi a12,a12,-16 # [7]
|
||||
ee.src.q.ld.xp q3,a2,a12,q1,q2 # [8] id:692
|
||||
loopnez a3,.LBB127_dspi_dotprod_s8_aes3 # [9]
|
||||
|
||||
.LBB125_dspi_dotprod_s8_aes3: # 0x193
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:694
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3 # [0*II+1] id:693
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+2] id:695
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:697
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3 # [0*II+4] id:696
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+5] id:699
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4 # [0*II+6] id:698
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+7] id:700
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+8] id:702
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2 # [0*II+9] id:701
|
||||
|
||||
.LBB127_dspi_dotprod_s8_aes3: # 0x1b5
|
||||
j .Lt_0_24322 # [0]
|
||||
|
||||
// count_x == 16: loop count is count_y >> 2 (a3 holds count_y on entry), four
// multiply-accumulate steps per iteration; the input pointer is walked in a8
// and copied back to a2 afterwards.
// NOTE(review): that looks like four rows per iteration, so a count_y that is
// not a multiple of 4 would appear to leave rows unprocessed - confirm.
.LBB44_dspi_dotprod_s8_aes3: # 0x1b8
|
||||
srli a3,a3,2 # [0]
|
||||
movi.n a10,-16 # [1]
|
||||
l32i.n a11,a1,0 # [2] gra_spill_temp_0
|
||||
addi a8,a2,16 # [3]
|
||||
addi a11,a11,16 # [4]
|
||||
ee.ld.128.usar.xp q2,a8,a10 # [5] id:703
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [6] id:704
|
||||
ee.src.q.ld.xp q3,a8,a10,q1,q2 # [8] id:705
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [9] id:706
|
||||
loopnez a3,.LBB150_dspi_dotprod_s8_aes3 # [10]
|
||||
|
||||
.LBB148_dspi_dotprod_s8_aes3: # 0x1d4
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:708
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3 # [0*II+1] id:707
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+2] id:709
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+3] id:711
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3 # [0*II+4] id:710
|
||||
ee.ld.128.usar.xp q3,a8,a11 # [0*II+5] id:712
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+6] id:714
|
||||
ee.vmulas.s8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4 # [0*II+7] id:713
|
||||
ee.ld.128.usar.xp q1,a8,a11 # [0*II+8] id:715
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+9] id:717
|
||||
ee.vmulas.s8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4 # [0*II+10] id:716
|
||||
ee.ld.128.usar.xp q2,a8,a11 # [0*II+11] id:718
|
||||
|
||||
.LBB150_dspi_dotprod_s8_aes3: # 0x1fc
|
||||
mov.n a2,a8 # [0]
|
||||
j .Lt_0_25858 # [1]
|
||||
|
||||
// count_x == 64: loop runs a3 (= count_y) times with four multiply-accumulate
// steps per iteration. a12 = value at sp+0 - count_x + 16.
.LBB50_dspi_dotprod_s8_aes3: # 0x201
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i.n a12,a1,0 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [3] id:719
|
||||
ee.ld.128.usar.ip q2,a2,16 # [4] id:720
|
||||
sub a12,a12,a5 # [5]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [7] id:721
|
||||
addi a12,a12,16 # [8]
|
||||
loopnez a3,.LBB173_dspi_dotprod_s8_aes3 # [9]
|
||||
|
||||
.LBB171_dspi_dotprod_s8_aes3: # 0x219
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+0] id:723
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3 # [0*II+1] id:722
|
||||
ee.vld.128.ip q1,a15,16 # [0*II+2] id:725
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4 # [0*II+3] id:724
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:728
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0 # [0*II+5] id:726
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+6] id:727
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+7] id:730
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+8] id:729
|
||||
|
||||
.LBB173_dspi_dotprod_s8_aes3: # 0x238
|
||||
j .Lt_0_27394 # [0]
|
||||
|
||||
// count_x == 128: loop runs a3 (= count_y) times with eight multiply-accumulate
// steps per iteration, followed by an inline copy of the result stage.
.LBB56_dspi_dotprod_s8_aes3: # 0x23b
|
||||
movi.n a10,32 # [0]
|
||||
movi.n a11,-16 # [1]
|
||||
l32i.n a12,a1,0 # [2] gra_spill_temp_0
|
||||
ee.ld.128.usar.ip q1,a2,16 # [3] id:731
|
||||
ee.ld.128.usar.ip q2,a2,16 # [4] id:732
|
||||
sub a12,a12,a5 # [6]
|
||||
addi a12,a12,16 # [7]
|
||||
ee.src.q.ld.ip q3,a2,16,q1,q2 # [8] id:733
|
||||
loopnez a3,.LBB195_dspi_dotprod_s8_aes3 # [9]
|
||||
|
||||
.LBB193_dspi_dotprod_s8_aes3: # 0x253
|
||||
ee.vld.128.ip q4,a15,16 # [0*II+0] id:735
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3 # [0*II+1] id:734
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+2] id:737
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1 # [0*II+3] id:736
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+4] id:739
|
||||
ee.vmulas.s8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4 # [0*II+5] id:738
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+6] id:741
|
||||
ee.vmulas.s8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0 # [0*II+7] id:740
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+8] id:743
|
||||
ee.vmulas.s8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1 # [0*II+9] id:742
|
||||
ee.vld.128.ip q6,a15,16 # [0*II+10] id:745
|
||||
ee.vmulas.s8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4 # [0*II+11] id:744
|
||||
ee.vld.128.ip q5,a15,16 # [0*II+12] id:748
|
||||
ee.vmulas.s8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0 # [0*II+13] id:746
|
||||
ee.ld.128.usar.xp q1,a2,a10 # [0*II+14] id:747
|
||||
ee.vld.128.ip q0,a15,16 # [0*II+15] id:750
|
||||
ee.vmulas.s8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2 # [0*II+16] id:749
|
||||
|
||||
// Same rounding, shifting and 8-bit store as the shared result stage above.
.LBB195_dspi_dotprod_s8_aes3: # 0x28e
|
||||
movi.n a2,0 # [0]
|
||||
movi.n a11,1 # [1]
|
||||
addi.n a12,a7,-1 # [2]
|
||||
rur.accx_0 a10 # [3]
|
||||
ssl a12 # [4]
|
||||
sll a11,a11 # [5]
|
||||
ssr a7 # [6]
|
||||
add.n a10,a10,a11 # [7]
|
||||
sra a10,a10 # [8]
|
||||
s8i a10,a4,0 # [9] id:772
|
||||
retw.n # [10]
|
||||
|
||||
// Fallback for count_x > 64 with bit 0 set: call the ANSI implementation.
.LBB28_dspi_dotprod_s8_aes3: # 0x2aa
|
||||
mov.n a15,a7 # [0]
|
||||
mov.n a14,a6 # [1]
|
||||
mov.n a13,a5 # [2]
|
||||
mov.n a12,a4 # [3]
|
||||
mov.n a11,a3 # [4]
|
||||
mov.n a10,a2 # [5]
|
||||
call8 dspi_dotprod_s8_ansi # [6] dspi_dotprod_s8_ansi
|
||||
|
||||
#.LBB29_dspi_dotprod_s8_aes3: # 0x2b9
|
||||
mov.n a2,a10 # [0]
|
||||
retw.n # [1]
|
||||
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Reference (portable C) 2D dot product for signed 8-bit images.
 *
 * Multiplies count_x * count_y samples of in_image with the matching samples
 * of filter, accumulates the products in 32 bits, then rounds and shifts the
 * sum right by `shift` before storing it (truncated to 8 bits) in *out_value.
 *
 * @param[in]  in_image  input image descriptor (data, step_x/y, stride_x/y)
 * @param[in]  filter    filter image descriptor, same layout as in_image
 * @param[out] out_value receives the scaled dot product
 * @param[in]  count_x   number of samples per row
 * @param[in]  count_y   number of rows
 * @param[in]  shift     right shift applied to the accumulator; values <= 0
 *                       store the accumulator unscaled, values >= 32 are not
 *                       supported
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of
 *        either image in either direction
 */
esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    const int8_t *i_data = (const int8_t *)in_image->data;
    const int8_t *f_data = (const int8_t *)filter->data;
    /* Distance, in samples, between two consecutive processed rows. */
    const int i_step = in_image->stride_x * in_image->step_y;
    const int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    /*
     * Round to nearest, then scale. The guard keeps shift <= 0 well defined:
     * `1 << (shift - 1)` and `acc >>= shift` are undefined behaviour for a
     * negative shift count, so in that case the sum is stored unscaled.
     */
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (int8_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_s8_arp4
|
||||
.global dspi_dotprod_s8_ansi
|
||||
.type dspi_dotprod_s8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
//
// Vectorised signed 8-bit 2D dot product. The fast path below is taken only
// when (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of
// 16; every other call is forwarded unchanged to dspi_dotprod_s8_ansi.
dspi_dotprod_s8_arp4:
// Arguments:
//   in_image  - a0
//   filter    - a1
//   out_value - a2
//   count_x   - a3
//   count_y   - a4
//   shift     - a5
//
// Working registers:
//   t0 - i_data, start of the current in_image row
//   t1 - f_data, start of the current filter row
//   t2 - i_step = in_image->stride_x * in_image->step_y (bytes per row)
//   t3 - f_step = filter->stride_x * filter->step_y   (bytes per row)
//   t4 - running in_image pointer inside a row
//   t5 - running filter pointer inside a row
//   t6 - count_x / 16, number of 128-bit vectors per row

    lw      t1, 4(a0)               // in_image->step_x
    lw      t2, 4(a1)               // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // 0 when the OR of both steps is exactly 1
    andi    t2, a3, 15              // count_x % 16
    or      t1, t1, t2

    beqz    t1, .dspi_dotprod_s8_arp4_body
    j       dspi_dotprod_s8_ansi    // tail call: arguments are still in a0..a5

.dspi_dotprod_s8_arp4_body:
    add     sp, sp, -16
    lw      t0, 0(a0)               // i_data
    lw      t1, 0(a1)               // f_data

    lw      t2, 8(a0)               // in_image->step_y
    lw      t4, 12(a0)              // in_image->stride_x
    mul     t2, t4, t2              // i_step

    lw      t3, 8(a1)               // filter->step_y
    lw      t5, 12(a1)              // filter->stride_x
    mul     t3, t5, t3              // f_step

    srli    t6, a3, 4               // t6 = count_x / 16

    // Preload the accumulator with the rounding constant 1 << (shift - 1).
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.zero.xacc
    esp.movx.w.xacc.l t4

.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16       // q0 - first 16 i_data samples of the row

    // Hardware loop, t6 iterations, ending on the .loop_count_x instruction.
    // NOTE(review): the last iteration of each row also loads the 16 bytes
    // following the row into q0 (never multiplied) - confirm the buffer
    // allows that read.
    esp.lp.setup 0, t6, .loop_count_x
        esp.vld.128.ip q1, t5, 16   // q1 - f_data
    .loop_count_x: esp.vmulas.s8.xacc.ld.ip q0, t4, 16, q0, q1 // accumulate q0*q1, reload q0 with i_data

    add     t0, t0, t2              // next in_image row
    add     t1, t1, t3              // next filter row
    add     a4, a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.s.xacc t5, a5           // shift the accumulator right by `shift`, result in t5
    // out_value points to a single int8_t: store one byte only. A halfword
    // store here would also overwrite the byte following the output.
    sb      t5, 0(a2)

    li      a0, 0                   // return value 0
    add     sp, sp, 16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,371 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_55, 458755
|
||||
|
||||
# Program Unit: dspi_dotprod_u16_aes3
|
||||
.type dspi_dotprod_u16_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_u16_aes3
|
||||
// esp_err_t dspi_dotprod_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
//
// Unsigned 16-bit 2D dot product using the 128-bit ee.* vector instructions.
// Register use after `entry` (windowed ABI):
//   a2 = in_image, a3 = filter, a4 = out_value,
//   a5 = count_x,  a6 = count_y, a7 = shift
// Descriptor offsets used below: +0 data, +4 step_x, +8 step_y,
// +12 stride_x, +16 stride_y.
//
// Flow:
//   1. Range checks (step * count > stride) return the constant stored in
//      .LC0_1_55.
//   2. Anything the vector kernels cannot handle is forwarded to the
//      reference implementation dspi_dotprod_u16_ansi.
//   3. Dedicated kernels exist for count_x == 8, 16, 24, 32, 64, plus a
//      general kernel for larger multiples of 32.
//   4. The 40-bit accumulator (ACCX) is rounded, shifted and stored as
//      16 bits.
//
// NOTE(review): the count_x == 16 and count_x == 8 kernels run count_y/2 and
// count_y/4 iterations, so they appear to assume count_y is a multiple of
// 2 / 4 - confirm against callers.
dspi_dotprod_u16_aes3:
.LBB1_dspi_dotprod_u16_aes3:
    entry   a1,64

    // ---- parameter range checks -------------------------------------
    l32i.n  a10,a2,4                // a10 = in_image->step_x
    l32i.n  a11,a2,12               // a11 = in_image->stride_x
    mull    a8,a10,a5
    blt     a11,a8,.LBB81_dspi_dotprod_u16_aes3

    l32i.n  a12,a2,8                // a12 = in_image->step_y
    l32i.n  a9,a2,16                // a9  = in_image->stride_y
    mull    a13,a12,a6
    blt     a9,a13,.LBB81_dspi_dotprod_u16_aes3

    l32i.n  a15,a3,4                // a15 = filter->step_x
    l32i.n  a14,a3,12               // a14 = filter->stride_x
    mull    a13,a15,a5              // a13 = filter->step_x * count_x
    blt     a14,a13,.LBB81_dspi_dotprod_u16_aes3

    l32i.n  a8,a3,16                // a8 = filter->stride_y
    l32i.n  a9,a3,8                 // a9 = filter->step_y
    s32i.n  a9,a1,24                // spill filter->step_y
    mull    a9,a9,a6
    blt     a8,a9,.LBB81_dspi_dotprod_u16_aes3

    // ---- filter layout checks for the vector path ---------------------
    l32i.n  a8,a3,0                 // a8 = filter->data
    s32i.n  a8,a1,20                // spill filter->data
    bbsi    a8,0,.Lt_0_34050        // bit 0 of filter->data set -> reference code

    bne     a14,a13,.Lt_0_34050     // filter rows must be exactly count_x wide

    bnei    a15,1,.Lt_0_34050       // filter->step_x must be 1

    l32i.n  a9,a1,24                // filter->step_y
    beqi    a9,1,.Lt_0_18178        // ... and must be 1 to continue

.Lt_0_34050:
.Lt_0_18434:
    // Fallback: forward all six arguments to the unsigned reference code.
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    .type   dspi_dotprod_u16_ansi, @function
    call8   dspi_dotprod_u16_ansi

    mov.n   a2,a10                  // propagate its return value
    retw.n

.LBB81_dspi_dotprod_u16_aes3:
    l32r    a2,.LC0_1_55            // error code constant
    retw.n

.Lt_0_18178:
    // ---- in_image layout / size checks ---------------------------------
    addi.n  a13,a10,-1
    bnez    a13,.Lt_0_34818         // in_image->step_x != 1

    addi.n  a14,a12,-1
    bnez    a14,.Lt_0_34818         // in_image->step_y != 1

    extui   a15,a5,0,3
    bnez.n  a15,.Lt_0_34818         // count_x not a multiple of 8

    blti    a6,4,.Lt_0_34818        // count_y < 4

    movi.n  a8,32
    bge     a8,a5,.Lt_0_35330       // count_x <= 32: 8/16/24/32 all have kernels

    // Above 32 only count_x == 64 and the general kernel (whole groups of
    // 32 samples) exist, so every count_x that is not a multiple of 32 must
    // use the reference code. Testing only bit 0 here would let e.g. 40, 48
    // or 56 through, which match no kernel and yield a zero accumulator.
    extui   a9,a5,0,5
    bnez    a9,.LBB28_dspi_dotprod_u16_aes3

.Lt_0_35330:
.Lt_0_20226:
    // ---- common vector setup -------------------------------------------
    mov.n   a3,a6                   // a3 = row counter (count_y)
    addi    a10,a5,-24
    mull    a13,a11,a12             // in_image->stride_x * step_y
    l32i.n  a15,a1,20               // a15 = filter data pointer
    l32i.n  a2,a2,0                 // a2  = in_image data pointer
    movi.n  a14,0
    wur.sar_byte    a14
    wur.accx_0  a14                 // clear the accumulator
    wur.accx_1  a14
    ee.vld.128.ip   q0,a15,16       // preload first filter vector
    slli    a13,a13,1               // row pitch in bytes (2 bytes per sample)
    s32i.n  a13,a1,16               // spill row pitch
    beqz    a10,.LBB32_dspi_dotprod_u16_aes3    // count_x == 24

.Lt_0_23298:
.Lt_0_22786:
    addi    a8,a5,-16
    beqz    a8,.LBB38_dspi_dotprod_u16_aes3     // count_x == 16

.Lt_0_24834:
.Lt_0_24322:
    addi    a9,a5,-8
    beqz    a9,.LBB44_dspi_dotprod_u16_aes3     // count_x == 8

.Lt_0_26370:
.Lt_0_25858:
    addi    a10,a5,-32
    beqz    a10,.LBB50_dspi_dotprod_u16_aes3    // count_x == 32

.Lt_0_27906:
.Lt_0_27394:
    addi    a11,a5,-64
    beqz    a11,.LBB56_dspi_dotprod_u16_aes3    // count_x == 64

    movi.n  a12,64
    bge     a12,a5,.Lt_0_30722      // count_x <= 64: a kernel above already ran

    // ---- general kernel: count_x > 64, multiple of 32 -------------------
    movi.n  a12,0                   // a12 = rows done
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    beqz.n  a3,.Lt_0_30722

    slli    a8,a5,1                 // row length in bytes
    l32i.n  a14,a1,16               // row pitch in bytes
    addi    a13,a5,31               // a13 = count_x / 32 (signed division) ...
    movgez  a13,a5,a5
    srai    a13,a13,5
    sub     a14,a14,a8
    addi    a14,a14,16              // a14 = pitch - row bytes + 16: step to next row
    addi.n  a13,a13,-1              // ... minus the group handled by the row tail

.Lt_0_31490:
    addi.n  a12,a12,1
    movi.n  a9,32
    beqz.n  a13,.Lt_0_31746

    loopnez a13,.LBB221_dspi_dotprod_u16_aes3

.LBB219_dspi_dotprod_u16_aes3:
    // 4 vectors = 32 samples per iteration
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q4,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q1,a2,16,q5,q2,q3,q4
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q2,a2,16,q0,q3,q4,q1
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q3,a2,16,q5,q4,q1,q2

.LBB221_dspi_dotprod_u16_aes3:

.Lt_0_31746:
    // Row tail: last 32 samples of the row, then re-prime q1..q3 from the
    // start of the next row.
    ee.vmulas.u16.accx.ld.ip.qup    q5,a2,16,q0,q1,q2,q3
    movi.n  a10,-16
    ee.vld.128.ip   q0,a15,16
    ee.vld.128.ip   q6,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q7,a2,a14,q0,q2,q3,q5
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q2,a2,a10,q6,q3,q5,q7
    ee.ld.128.usar.xp   q1,a2,a9
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q3,a2,16,q4,q5,q1,q2
    bne     a12,a3,.Lt_0_31490

.Lt_0_30722:
.Lt_0_30466:
    // ---- round, shift and store the result ------------------------------
    rur.accx_0  a9                  // accumulator bits 31..0
    rur.accx_1  a10                 // accumulator upper bits
    blti    a7,1,.Lt_0_33282        // shift < 1: store without scaling

    movi.n  a2,0                    // return value 0
    addi    a13,a7,-33
    addi.n  a14,a7,-1
    ssr     a14                     // shift amount = shift - 1
    sra     a12,a10                 // upper word shifted (used when shift >= 33)
    src     a11,a10,a9              // 64-bit funnel shift of upper:lower
    movgez  a11,a12,a13
    addi.n  a11,a11,1               // round: ((acc >> (shift-1)) + 1) >> 1
    srli    a11,a11,1
    s16i    a11,a4,0                // *out_value
    retw.n

.Lt_0_34818:
.Lt_0_19458:
    // Fallback: forward all six arguments to the unsigned reference code.
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    call8   dspi_dotprod_u16_ansi

    mov.n   a2,a10
    retw.n

.LBB32_dspi_dotprod_u16_aes3:
    // ---- kernel: count_x == 24 (3 vectors per row, one row per iteration)
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    beqz.n  a6,.Lt_0_23298

    addi    a12,a13,-32             // a13 still holds the row pitch in bytes
    movi.n  a10,32
    movi.n  a11,-16
    loopgtz a6,.LBB107_dspi_dotprod_u16_aes3

.LBB105_dspi_dotprod_u16_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q1,a2,a12,q0,q1,q2,q3
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q2,a2,a11,q4,q2,q3,q1
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q3,a2,16,q5,q3,q1,q2

.LBB107_dspi_dotprod_u16_aes3:
    j       .Lt_0_23298

.LBB38_dspi_dotprod_u16_aes3:
    // ---- kernel: count_x == 16 (2 vectors per row, two rows per iteration)
    movi.n  a10,32
    movi.n  a11,-16
    srli    a3,a6,1                 // count_y / 2 iterations
    l32i.n  a12,a1,16               // row pitch in bytes
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    addi    a12,a12,-16
    ee.src.q.ld.xp  q3,a2,a12,q1,q2
    loopnez a3,.LBB130_dspi_dotprod_u16_aes3

.LBB128_dspi_dotprod_u16_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q3,a2,a11,q0,q1,q2,q3
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q4,a2,a12,q4,q2,q1,q3
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q2,a2,a11,q0,q1,q3,q4
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q3,a2,a12,q5,q3,q1,q2

.LBB130_dspi_dotprod_u16_aes3:
    j       .Lt_0_24834

.LBB44_dspi_dotprod_u16_aes3:
    // ---- kernel: count_x == 8 (1 vector per row, four rows per iteration)
    srli    a3,a3,2                 // count_y / 4 iterations
    movi.n  a10,-16
    l32i.n  a11,a1,16               // row pitch in bytes
    addi    a8,a2,16
    addi    a11,a11,16
    ee.ld.128.usar.xp   q2,a8,a10
    ee.ld.128.usar.xp   q1,a8,a11
    ee.src.q.ld.xp  q3,a8,a10,q1,q2
    ee.ld.128.usar.xp   q2,a8,a11
    loopnez a3,.LBB153_dspi_dotprod_u16_aes3

.LBB151_dspi_dotprod_u16_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q3,a8,a10,q0,q1,q2,q3
    ee.ld.128.usar.xp   q1,a8,a11
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q4,a8,a10,q4,q2,q1,q3
    ee.ld.128.usar.xp   q3,a8,a11
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q4,a8,a10,q0,q1,q3,q4
    ee.ld.128.usar.xp   q1,a8,a11
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q3,a8,a10,q5,q3,q1,q4
    ee.ld.128.usar.xp   q2,a8,a11

.LBB153_dspi_dotprod_u16_aes3:
    mov.n   a2,a8
    j       .Lt_0_26370

.LBB50_dspi_dotprod_u16_aes3:
    // ---- kernel: count_x == 32 (4 vectors per row, one row per iteration)
    movi.n  a10,32
    movi.n  a11,-16
    slli    a13,a5,1                // row length in bytes
    l32i.n  a12,a1,16               // row pitch in bytes
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    sub     a12,a12,a13
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    addi    a12,a12,16
    loopnez a3,.LBB176_dspi_dotprod_u16_aes3

.LBB174_dspi_dotprod_u16_aes3:
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q4,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q1,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q0,a2,a12,q5,q2,q3,q4
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q2,a2,a11,q1,q3,q4,q0
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q3,a2,16,q5,q4,q1,q2

.LBB176_dspi_dotprod_u16_aes3:
    j       .Lt_0_27906

.LBB56_dspi_dotprod_u16_aes3:
    // ---- kernel: count_x == 64 (8 vectors per row, one row per iteration)
    movi.n  a10,32
    movi.n  a11,-16
    slli    a13,a5,1                // row length in bytes
    l32i.n  a12,a1,16               // row pitch in bytes
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    sub     a12,a12,a13
    addi    a12,a12,16
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    loopnez a3,.LBB198_dspi_dotprod_u16_aes3

.LBB196_dspi_dotprod_u16_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q1,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q4,a2,16,q4,q2,q3,q1
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q0,a2,16,q0,q3,q1,q4
    ee.vld.128.ip   q6,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q1,a2,16,q5,q1,q4,q0
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q4,a2,16,q6,q4,q0,q1
    ee.vld.128.ip   q6,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q0,a2,a12,q5,q0,q1,q4
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u16.accx.ld.xp.qup    q2,a2,a11,q6,q1,q4,q0
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u16.accx.ld.ip.qup    q3,a2,16,q5,q4,q1,q2

.LBB198_dspi_dotprod_u16_aes3:
    j       .Lt_0_30722

.Lt_0_33282:
    // shift < 1: store the low 16 accumulator bits unscaled
    movi.n  a2,0                    // return value 0
    sext    a14,a9,15
    s16i    a14,a4,0
    retw.n

.LBB28_dspi_dotprod_u16_aes3:
    // Fallback for count_x > 32 that is not a multiple of 32.
    mov.n   a15,a7
    mov.n   a14,a6
    mov.n   a13,a5
    mov.n   a12,a4
    mov.n   a11,a3
    mov.n   a10,a2
    call8   dspi_dotprod_u16_ansi

    mov.n   a2,a10
    retw.n
|
||||
|
||||
#endif // dspi_dotprod_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Reference (portable C) 2D dot product for unsigned 16-bit images.
 *
 * Multiplies count_x * count_y samples of in_image with the matching samples
 * of filter, accumulates the products in 64 bits, then rounds and shifts the
 * sum right by `shift` before storing it (truncated to 16 bits) in
 * *out_value.
 *
 * @param[in]  in_image  input image descriptor (data, step_x/y, stride_x/y)
 * @param[in]  filter    filter image descriptor, same layout as in_image
 * @param[out] out_value receives the scaled dot product
 * @param[in]  count_x   number of samples per row
 * @param[in]  count_y   number of rows
 * @param[in]  shift     right shift applied to the accumulator; values <= 0
 *                       store the accumulator unscaled, values >= 64 are not
 *                       supported
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of
 *        either image in either direction
 */
esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    const uint16_t *i_data = (const uint16_t *)in_image->data;
    const uint16_t *f_data = (const uint16_t *)filter->data;
    /* Distance, in samples, between two consecutive processed rows. */
    const int i_step = in_image->stride_x * in_image->step_y;
    const int f_step = filter->stride_x * filter->step_y;

    int64_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int32_t)i_data[in_image->step_x * x] * (int32_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    /*
     * Round to nearest, then scale. The rounding constant is built in 64 bits
     * so that shift >= 32 does not overflow a plain int, and the guard keeps
     * shift <= 0 well defined (a negative shift count is undefined
     * behaviour); in that case the sum is stored unscaled.
     */
    if (shift > 0) {
        acc += (int64_t)1 << (shift - 1);
        acc >>= shift;
    }
    *out_value = (uint16_t)acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,95 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_u16_arp4
|
||||
.global dspi_dotprod_u16_ansi
|
||||
.type dspi_dotprod_u16_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
//
// Vectorised unsigned 16-bit 2D dot product. The fast path is used only when
// (in_image->step_x | filter->step_x) == 1 and count_x is a multiple of 8;
// every other call is forwarded unchanged to dspi_dotprod_u16_ansi.
dspi_dotprod_u16_arp4:
// Arguments:  a0 in_image, a1 filter, a2 out_value,
//             a3 count_x,  a4 count_y, a5 shift
// Row state:  t0 in_image row start, t1 filter row start,
//             t2 in_image row pitch (bytes), t3 filter row pitch (bytes)
// Inner loop: t4 in_image cursor, t5 filter cursor,
//             t6 vectors per row (count_x / 8)

    lw      t1, 4(a0)               // in_image->step_x
    lw      t2, 4(a1)               // filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1              // zero when the OR of both steps is exactly 1
    andi    t2, a3, 7               // count_x % 8
    or      t1, t1, t2

    beqz    t1, .dspi_dotprod_u16_arp4_body
    j       dspi_dotprod_u16_ansi   // tail call with a0..a5 untouched

.dspi_dotprod_u16_arp4_body:
    addi    sp, sp, -16

    srli    t6, a3, 3               // t6 = count_x / 8

    // Seed the accumulator with the rounding term 1 << (shift - 1).
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.zero.xacc
    esp.movx.w.xacc.l t4

    lw      t1, 0(a1)               // filter->data
    lw      t0, 0(a0)               // in_image->data

    lw      t3, 8(a1)               // filter->step_y
    lw      t5, 12(a1)              // filter->stride_x
    mul     t3, t5, t3
    slli    t3, t3, 1               // samples -> bytes

    lw      t2, 8(a0)               // in_image->step_y
    lw      t4, 12(a0)              // in_image->stride_x
    mul     t2, t4, t2
    slli    t2, t2, 1               // samples -> bytes

.dspi_dotprod_u16_arp4_row:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16       // first 8 in_image samples of this row

    // Hardware loop of t6 iterations; its last instruction carries the label.
    esp.lp.setup 0, t6, .dspi_dotprod_u16_arp4_vec
        esp.vld.128.ip q1, t5, 16   // next 8 filter samples
    .dspi_dotprod_u16_arp4_vec: esp.vmulas.u16.xacc.ld.ip q0, t4, 16, q0, q1 // accumulate q0*q1, fetch next in_image vector

    add     t0, t0, t2              // advance both row starts
    add     t1, t1, t3
    addi    a4, a4, -1
    bgtz    a4, .dspi_dotprod_u16_arp4_row

    esp.srs.u.xacc t5, a5           // accumulator >> shift, result in t5
    sh      t5, 0(a2)               // store 16 bits to *out_value

    li      a0, 0                   // return value 0
    addi    sp, sp, 16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,367 @@
|
||||
// Copyright 2018-2021 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_aes3_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.literal .LC0_1_52, 458755
|
||||
|
||||
.type dspi_dotprod_u8_aes3, @function
|
||||
.align 4
|
||||
.global dspi_dotprod_u8_aes3
|
||||
// esp_err_t dspi_dotprod_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
//
// Unsigned 8-bit 2D dot product using the 128-bit ee.* vector instructions.
// Register use after `entry` (windowed ABI):
//   a2 = in_image, a3 = filter, a4 = out_value,
//   a5 = count_x,  a6 = count_y, a7 = shift
// Descriptor offsets used below: +0 data, +4 step_x, +8 step_y,
// +12 stride_x, +16 stride_y.
//
// Flow:
//   1. Range checks (step * count > stride) return the constant stored in
//      .LC0_1_52.
//   2. Anything the vector kernels cannot handle is forwarded to
//      dspi_dotprod_u8_ansi.
//   3. Dedicated kernels exist for count_x == 16, 32, 48, 64, 128, plus a
//      general kernel for larger multiples of 64.
//   4. The low accumulator word gets 1 << (shift - 1) added, is shifted
//      right by `shift` and stored as one byte.
//
// NOTE(review): the count_x == 32 and count_x == 16 kernels run count_y/2 and
// count_y/4 iterations, so they appear to assume count_y is a multiple of
// 2 / 4 - confirm against callers.
dspi_dotprod_u8_aes3:
.LBB1_dspi_dotprod_u8_aes3:
    entry   a1,48

    // ---- parameter range checks -------------------------------------
    l32i.n  a10,a2,4                // a10 = in_image->step_x
    l32i.n  a11,a2,12               // a11 = in_image->stride_x
    mull    a8,a10,a5
    blt     a11,a8,.LBB78_dspi_dotprod_u8_aes3

    l32i.n  a12,a2,8                // a12 = in_image->step_y
    l32i.n  a9,a2,16                // a9  = in_image->stride_y
    mull    a13,a12,a6
    blt     a9,a13,.LBB78_dspi_dotprod_u8_aes3

    l32i.n  a15,a3,4                // a15 = filter->step_x
    l32i.n  a14,a3,12               // a14 = filter->stride_x
    mull    a13,a15,a5              // a13 = filter->step_x * count_x
    blt     a14,a13,.LBB78_dspi_dotprod_u8_aes3

    l32i.n  a8,a3,16                // a8 = filter->stride_y
    l32i.n  a9,a3,8                 // a9 = filter->step_y
    s32i.n  a9,a1,8                 // spill filter->step_y
    mull    a9,a9,a6
    blt     a8,a9,.LBB78_dspi_dotprod_u8_aes3

    // ---- filter layout checks for the vector path ---------------------
    l32i.n  a8,a3,0                 // a8 = filter->data
    s32i.n  a8,a1,4                 // spill filter->data
    bbsi    a8,0,.Lt_0_33026        // bit 0 of filter->data set -> reference code

    bne     a14,a13,.Lt_0_33026     // filter rows must be exactly count_x wide

    bnei    a15,1,.Lt_0_33026       // filter->step_x must be 1

    l32i.n  a13,a1,8                // filter->step_y
    beqi    a13,1,.Lt_0_17666       // ... and must be 1 to continue

.Lt_0_33026:
.Lt_0_17922:
    // Fallback: forward all six arguments to the reference code.
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    .type   dspi_dotprod_u8_ansi, @function
    call8   dspi_dotprod_u8_ansi

    mov.n   a2,a10                  // propagate its return value
    retw.n

.LBB78_dspi_dotprod_u8_aes3:
    l32r    a2,.LC0_1_52            // error code constant
    retw.n

.Lt_0_17666:
    // ---- in_image layout / size checks ---------------------------------
    addi.n  a14,a10,-1
    bnez    a14,.Lt_0_33794         // in_image->step_x != 1

    addi.n  a15,a12,-1
    bnez    a15,.Lt_0_33794         // in_image->step_y != 1

    extui   a8,a5,0,4
    bnez.n  a8,.Lt_0_33794          // count_x not a multiple of 16

    blti    a6,4,.Lt_0_33794        // count_y < 4

    movi.n  a9,64
    bge     a9,a5,.Lt_0_34306       // count_x <= 64: 16/32/48/64 all have kernels

    // Above 64 only count_x == 128 and the general kernel (whole groups of
    // 64 samples) exist, so every count_x that is not a multiple of 64 must
    // use the reference code. Testing only bit 0 here would let e.g. 80, 96
    // or 112 through, which match no kernel and yield a zero accumulator.
    extui   a10,a5,0,6
    bnez    a10,.LBB28_dspi_dotprod_u8_aes3

.Lt_0_34306:
.Lt_0_19714:
    // ---- common vector setup -------------------------------------------
    mov.n   a3,a6                   // a3 = row counter (count_y)
    addi    a13,a5,-48
    movi.n  a14,0
    mull    a15,a11,a12             // in_image->stride_x * step_y = row pitch (bytes)
    l32i.n  a2,a2,0                 // a2 = in_image data pointer
    s32i.n  a15,a1,0                // spill row pitch
    wur.accx_0  a14                 // clear the accumulator
    l32i.n  a15,a1,4                // a15 = filter data pointer
    wur.accx_1  a14
    ee.vld.128.ip   q0,a15,16       // preload first filter vector
    beqz    a13,.LBB32_dspi_dotprod_u8_aes3     // count_x == 48

.Lt_0_22786:
.Lt_0_22274:
    addi    a8,a5,-32
    beqz    a8,.LBB38_dspi_dotprod_u8_aes3      // count_x == 32

.Lt_0_24322:
.Lt_0_23810:
    addi    a9,a5,-16
    beqz    a9,.LBB44_dspi_dotprod_u8_aes3      // count_x == 16

.Lt_0_25858:
.Lt_0_25346:
    addi    a10,a5,-64
    beqz    a10,.LBB50_dspi_dotprod_u8_aes3     // count_x == 64

.Lt_0_27394:
.Lt_0_26882:
    addi    a11,a5,-128
    beqz    a11,.LBB56_dspi_dotprod_u8_aes3     // count_x == 128

    movi    a12,128
    bge     a12,a5,.Lt_0_30210      // count_x <= 128: a kernel above already ran

    // ---- general kernel: count_x > 128, multiple of 64 ------------------
    movi.n  a12,0                   // a12 = rows done
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    beqz.n  a3,.Lt_0_30210

    l32i.n  a14,a1,0                // row pitch in bytes
    // Each pass of the inner loop and the row tail consume 4 vectors, i.e.
    // 64 one-byte samples, so a row needs count_x / 64 groups in total.
    // Dividing by 32 (correct only for 2-byte samples) would multiply
    // 2 * count_x samples per row and advance the image pointer by
    // pitch + count_x instead of pitch.
    addi    a13,a5,63               // a13 = count_x / 64 (signed division) ...
    movgez  a13,a5,a5
    srai    a13,a13,6
    sub     a14,a14,a5
    addi    a14,a14,16              // a14 = pitch - row bytes + 16: step to next row
    addi.n  a13,a13,-1              // ... minus the group handled by the row tail

.Lt_0_30978:
    addi.n  a12,a12,1
    movi.n  a8,32
    movi.n  a9,-16
    beqz.n  a13,.Lt_0_31234

    loopnez a13,.LBB218_dspi_dotprod_u8_aes3

.LBB216_dspi_dotprod_u8_aes3:
    // 4 vectors = 64 samples per iteration
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q2,q3,q4
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q2,a2,16,q0,q3,q4,q1
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2

.LBB218_dspi_dotprod_u8_aes3:

.Lt_0_31234:
    // Row tail: last 64 samples of the row, then re-prime q1..q3 from the
    // start of the next row.
    ee.vmulas.u8.accx.ld.ip.qup q5,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q0,a15,16
    ee.vld.128.ip   q6,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q7,a2,a14,q0,q2,q3,q5
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a9,q6,q3,q5,q7
    ee.ld.128.usar.xp   q1,a2,a8
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q4,q5,q1,q2
    bne     a12,a3,.Lt_0_30978

.Lt_0_30210:
.Lt_0_29954:
    // ---- round, shift and store the result ------------------------------
    movi.n  a2,0                    // return value 0
    rur.accx_0  a10                 // accumulator bits 31..0
    addi.n  a12,a7,-1
    movi.n  a11,1
    ssl     a12
    sll     a11,a11                 // a11 = 1 << (shift - 1)
    ssr     a7
    add.n   a10,a10,a11
    srl     a10,a10                 // (acc + rounding) >> shift
    s8i     a10,a4,0                // *out_value
    retw.n

.Lt_0_33794:
.Lt_0_18946:
    // Fallback: forward all six arguments to the reference code.
    mov.n   a10,a2
    mov.n   a11,a3
    mov.n   a12,a4
    mov.n   a13,a5
    mov.n   a14,a6
    mov.n   a15,a7
    call8   dspi_dotprod_u8_ansi

    mov.n   a2,a10
    retw.n

.LBB32_dspi_dotprod_u8_aes3:
    // ---- kernel: count_x == 48 (3 vectors per row, one row per iteration)
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    beqz.n  a6,.Lt_0_22786

    movi.n  a10,32
    l32i.n  a12,a1,0                // row pitch in bytes
    movi.n  a11,-16
    addi    a12,a12,-32
    loopgtz a6,.LBB104_dspi_dotprod_u8_aes3

.LBB102_dspi_dotprod_u8_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q1,a2,a12,q0,q1,q2,q3
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q4,q2,q3,q1
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q3,q1,q2

.LBB104_dspi_dotprod_u8_aes3:
    j       .Lt_0_22786

.LBB38_dspi_dotprod_u8_aes3:
    // ---- kernel: count_x == 32 (2 vectors per row, two rows per iteration)
    movi.n  a10,32
    movi.n  a11,-16
    srli    a3,a6,1                 // count_y / 2 iterations
    l32i.n  a12,a1,0                // row pitch in bytes
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    addi    a12,a12,-16
    ee.src.q.ld.xp  q3,a2,a12,q1,q2
    loopnez a3,.LBB127_dspi_dotprod_u8_aes3

.LBB125_dspi_dotprod_u8_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a2,a11,q0,q1,q2,q3
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q4,a2,a12,q4,q2,q1,q3
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q0,q1,q3,q4
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a2,a12,q5,q3,q1,q2

.LBB127_dspi_dotprod_u8_aes3:
    j       .Lt_0_24322

.LBB44_dspi_dotprod_u8_aes3:
    // ---- kernel: count_x == 16 (1 vector per row, four rows per iteration)
    srli    a3,a3,2                 // count_y / 4 iterations
    movi.n  a10,-16
    l32i.n  a11,a1,0                // row pitch in bytes
    addi    a8,a2,16
    addi    a11,a11,16
    ee.ld.128.usar.xp   q2,a8,a10
    ee.ld.128.usar.xp   q1,a8,a11
    ee.src.q.ld.xp  q3,a8,a10,q1,q2
    ee.ld.128.usar.xp   q2,a8,a11
    loopnez a3,.LBB150_dspi_dotprod_u8_aes3

.LBB148_dspi_dotprod_u8_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q0,q1,q2,q3
    ee.ld.128.usar.xp   q1,a8,a11
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q4,q2,q1,q3
    ee.ld.128.usar.xp   q3,a8,a11
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q4,a8,a10,q0,q1,q3,q4
    ee.ld.128.usar.xp   q1,a8,a11
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q3,a8,a10,q5,q3,q1,q4
    ee.ld.128.usar.xp   q2,a8,a11

.LBB150_dspi_dotprod_u8_aes3:
    mov.n   a2,a8
    j       .Lt_0_25858

.LBB50_dspi_dotprod_u8_aes3:
    // ---- kernel: count_x == 64 (4 vectors per row, one row per iteration)
    movi.n  a10,32
    movi.n  a11,-16
    l32i.n  a12,a1,0                // row pitch in bytes
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    sub     a12,a12,a5
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    addi    a12,a12,16
    loopnez a3,.LBB173_dspi_dotprod_u8_aes3

.LBB171_dspi_dotprod_u8_aes3:
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q1,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q2,q3,q4
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q1,q3,q4,q0
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2

.LBB173_dspi_dotprod_u8_aes3:
    j       .Lt_0_27394

.LBB56_dspi_dotprod_u8_aes3:
    // ---- kernel: count_x == 128 (8 vectors per row, one row per iteration)
    movi.n  a10,32
    movi.n  a11,-16
    l32i.n  a12,a1,0                // row pitch in bytes
    ee.ld.128.usar.ip   q1,a2,16
    ee.ld.128.usar.ip   q2,a2,16
    sub     a12,a12,a5
    addi    a12,a12,16
    ee.src.q.ld.ip  q3,a2,16,q1,q2
    loopnez a3,.LBB195_dspi_dotprod_u8_aes3

.LBB193_dspi_dotprod_u8_aes3:
    ee.vld.128.ip   q4,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q0,q1,q2,q3
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q4,q2,q3,q1
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q0,a2,16,q0,q3,q1,q4
    ee.vld.128.ip   q6,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q1,a2,16,q5,q1,q4,q0
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q4,a2,16,q6,q4,q0,q1
    ee.vld.128.ip   q6,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q0,a2,a12,q5,q0,q1,q4
    ee.vld.128.ip   q5,a15,16
    ee.vmulas.u8.accx.ld.xp.qup q2,a2,a11,q6,q1,q4,q0
    ee.ld.128.usar.xp   q1,a2,a10
    ee.vld.128.ip   q0,a15,16
    ee.vmulas.u8.accx.ld.ip.qup q3,a2,16,q5,q4,q1,q2

.LBB195_dspi_dotprod_u8_aes3:
    // Same round / shift / store sequence as at .Lt_0_30210.
    movi.n  a2,0                    // return value 0
    movi.n  a11,1
    addi.n  a12,a7,-1
    rur.accx_0  a10
    ssl     a12
    sll     a11,a11                 // a11 = 1 << (shift - 1)
    ssr     a7
    add.n   a10,a10,a11
    srl     a10,a10
    s8i     a10,a4,0                // *out_value
    retw.n

.LBB28_dspi_dotprod_u8_aes3:
    // Fallback for count_x > 64 that is not a multiple of 64.
    mov.n   a15,a7
    mov.n   a14,a6
    mov.n   a13,a5
    mov.n   a12,a4
    mov.n   a11,a3
    mov.n   a10,a2
    call8   dspi_dotprod_u8_ansi

    mov.n   a2,a10
    retw.n
|
||||
|
||||
|
||||
#endif // dsps_dotprod_s16_aes3_enabled
|
||||
@@ -0,0 +1,49 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
/**
 * @brief Dot product of two 2-D uint8_t images with a rounding right shift (ANSI C version).
 *
 * Multiplies count_x * count_y samples of in_image with the corresponding samples of
 * filter, accumulates the products in a 32-bit accumulator, rounds, shifts right by
 * `shift` and stores the result, converted to uint8_t, in *out_value.
 *
 * @param[in]  in_image  input image; data, step_x, step_y, stride_x and stride_y are read
 * @param[in]  filter    filter image; the same fields are read
 * @param[out] out_value receives the shifted accumulator converted to uint8_t
 * @param[in]  count_x   number of samples taken from each row
 * @param[in]  count_y   number of rows
 * @param[in]  shift     right shift applied to the accumulator; for shift <= 0 the
 *                       accumulator is stored without rounding or shifting
 *
 * @return
 *      - ESP_OK on success
 *      - ESP_ERR_DSP_PARAM_OUTOFRANGE if step * count exceeds the stride of either image
 */
esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift)
{
    if (in_image->step_x * count_x > in_image->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (in_image->step_y * count_y > in_image->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_x * count_x > filter->stride_x) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }
    if (filter->step_y * count_y > filter->stride_y) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    uint8_t *i_data = (uint8_t *)in_image->data;
    uint8_t *f_data = (uint8_t *)filter->data;
    // Distance between two processed rows, in elements
    int i_step = in_image->stride_x * in_image->step_y;
    int f_step = filter->stride_x * filter->step_y;

    int32_t acc = 0;
    for (int y = 0; y < count_y; y++) {
        for (int x = 0; x < count_x; x++) {
            acc += (int16_t)i_data[in_image->step_x * x] * (int16_t)f_data[filter->step_x * x];
        }
        i_data += i_step;
        f_data += f_step;
    }

    // Shifting by a negative count is undefined in C, so the rounding term
    // 1 << (shift - 1) and the shift itself are only applied for shift > 0.
    if (shift > 0) {
        acc += 1 << (shift - 1); // round operation
        acc >>= shift;
    }
    *out_value = acc;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod_platform.h"
|
||||
#if (dspi_dotprod_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dspi_dotprod_u8_arp4
|
||||
.global dspi_dotprod_u8_ansi
|
||||
.type dspi_dotprod_u8_arp4,@function
|
||||
|
||||
// esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
|
||||
dspi_dotprod_u8_arp4:
// Dot product of two 2-D uint8_t images using the PIE vector unit.
// Falls back to dspi_dotprod_u8_ansi unless the OR of both step_x values is 1
// and count_x is a multiple of 16.
//
// in_image  - a0
// filter    - a1
// out_value - a2 (uint8_t *)
// count_x   - a3
// count_y   - a4
// shift     - a5
//
// i_data - t0
// f_data - t1
// i_step - t2
// f_step - t3
// t4 - current i_data
// t5 - current f_data
// t6 - count_x / 16 (vector iterations per row)

    lw      t1, 4(a0)       // load in_image->step_x
    lw      t2, 4(a1)       // load filter->step_x
    or      t1, t1, t2
    addi    t1, t1, -1      // should be 0 now
    andi    t2, a3, 15      // count_x % 16
    or      t1, t1, t2

    beqz    t1, .dspi_dotprod_u8_arp4_body
    j       dspi_dotprod_u8_ansi    // tail call, a0..a5 are still the original arguments

.dspi_dotprod_u8_arp4_body:
    add     sp, sp, -16
    lw      t0, 0(a0)       // i_data
    lw      t1, 0(a1)       // f_data

    lw      t2, 8(a0)       // step_y
    lw      t4, 12(a0)      // stride_x
    mul     t2, t4, t2      // i_step = stride_x * step_y

    lw      t3, 8(a1)       // step_y
    lw      t5, 12(a1)      // stride_x
    mul     t3, t5, t3      // f_step = stride_x * step_y

    srli    t6, a3, 4       // t6 = count_x / 16

    // Preload the accumulator with the rounding term 1 << (shift - 1)
    addi    a6, a5, -1
    li      t4, 1
    sll     t4, t4, a6
    esp.zero.xacc
    esp.movx.w.xacc.l t4

.loop_count_y:
    mv      t4, t0
    mv      t5, t1
    esp.vld.128.ip q0, t4, 16                       // q0 - i_data

    esp.lp.setup 0, t6, .loop_count_x
        esp.vld.128.ip q1, t5, 16                   // q1 - f_data
.loop_count_x:  esp.vmulas.u8.xacc.ld.ip q0, t4, 16, q0, q1 // q0 - i_data

    add     t0, t0, t2      // next input row
    add     t1, t1, t3      // next filter row
    add     a4,a4, -1
    bgtz    a4, .loop_count_y

    esp.srs.u.xacc t5, a5   // shift accx register by final_shift amount (a5), save the lower 32bits to t5
    // out_value points to a single uint8_t: store one byte only.
    // A halfword store would also overwrite the byte following the output.
    sb      t5, 0(a2)       // store result to output buffer

    li      a0,0
    add     sp,sp,16
    ret
|
||||
|
||||
#endif // dspi_dotprod_arp4_enabled
|
||||
@@ -0,0 +1,80 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_s16_ae32_enabled == 1)
|
||||
#include "dsps_dotprod_s16_m_ae32.S"
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_s16_ae32
|
||||
.type dsps_dotprod_s16_ae32,@function
|
||||
|
||||
|
||||
//esp_err_t dsps_dotprod_s16_ae32(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
|
||||
dsps_dotprod_s16_ae32:
|
||||
// Fixed-point dot product of two int16 arrays; the result is stored as 16 bits at dest.
// Returns ESP_ERR_DSP_INVALID_LENGTH in a2 when len < 4, otherwise 0.
// Register usage on entry:
// src1 - a2
|
||||
// src2 - a3
|
||||
// dest - a4
|
||||
// len - a5
|
||||
// shift - a6
|
||||
|
||||
entry a1, 16
|
||||
|
||||
// Check minimum length (len < 4 is rejected)
|
||||
movi a8, 4
|
||||
blt a5, a8, dsps_dotprod_s16_ae32_error
|
||||
|
||||
// Clear the upper part of the accumulator
|
||||
movi a8, 0
|
||||
wsr a8, acchi
|
||||
|
||||
|
||||
// Prepare and load round value: acclo = 0x7fff >> shift
|
||||
movi a8, 0x7fff
|
||||
ssr a6
|
||||
srl a8, a8
|
||||
wsr a8, acclo // initialize acc with shifted round value
|
||||
|
||||
|
||||
// Compensate for pre-increment
|
||||
// Right shift to 16 bits
|
||||
// RS = -shift + 15 (a6 is reused below as the final shift amount)
|
||||
neg a6, a6
|
||||
addi a6, a6, 15
|
||||
|
||||
|
||||
/* number of loop iterations (see below):
|
||||
* a7 = count / 4 - 1
|
||||
*/
|
||||
|
||||
|
||||
srli a7, a5, 2
|
||||
addi a7, a7, -1
|
||||
|
||||
|
||||
// NOTE(review): a10 is not passed to the macro invoked below - confirm whether this load is still needed
movi.n a10, 0 // load 0 to the a10 to increment second array
|
||||
|
||||
// Accumulate all len products into acchi:acclo (macro arguments: x1, x2, count, full_count)
dotprod_s16_ae32_full a2, a3, a7, a5
|
||||
|
||||
|
||||
/* Get accumulator: funnel-shift acchi:acclo right by a6 (15 - shift) */
|
||||
ssr a6
|
||||
rsr a2, acchi
|
||||
rsr a3, acclo
|
||||
src a2, a2, a3
|
||||
|
||||
|
||||
// Store the low 16 bits of the shifted result and return 0
s16i a2, a4, 0
|
||||
movi.n a2, 0
|
||||
retw.n
|
||||
dsps_dotprod_s16_ae32_error:
|
||||
movi.n a2, ESP_ERR_DSP_INVALID_LENGTH
|
||||
retw.n
|
||||
|
||||
#endif // dsps_dotprod_s16_ae32_enabled
|
||||
@@ -0,0 +1,33 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
|
||||
esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift)
|
||||
{
|
||||
// To make correct round operation we have to shift round value
|
||||
long long acc = 0x7fff >> shift;
|
||||
|
||||
for (int i = 0 ; i < len ; i++) {
|
||||
acc += (int32_t)src1[i] * (int32_t)src2[i];
|
||||
}
|
||||
|
||||
int final_shift = shift - 15;
|
||||
if (final_shift > 0) {
|
||||
*dest = (acc << final_shift);
|
||||
} else {
|
||||
*dest = (acc >> (-final_shift));
|
||||
}
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_s16_arp4_enabled == 1)
|
||||
#include "dsp_err_codes.h"
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_s16_arp4
|
||||
.global dsps_dotprod_s16_ansi
|
||||
.type dsps_dotprod_s16_arp4,@function
|
||||
|
||||
//esp_err_t dsps_dotprod_s16_arp4(const int16_t* src1, const int16_t* src2, int16_t* dest, int len, int8_t shift);
|
||||
dsps_dotprod_s16_arp4:
// Fixed-point dot product of two int16 arrays using the PIE vector unit.
// Only positive lengths that are a multiple of 8 are handled here; every
// other length is forwarded to dsps_dotprod_s16_ansi.
//
// src1  - a0
// src2  - a1
// dest  - a2
// len   - a3
// shift - a4
    andi    a5, a3, 7
    bnez    a5, .dsps_dotprod_s16_arp4_ansi     // len is not a multiple of 8
    // The vector loop below always runs at least once, so len <= 0 must not
    // reach it: it would accumulate 8 samples that do not exist.
    bgtz    a3, .dsps_dotprod_s16_arp4_body
.dsps_dotprod_s16_arp4_ansi:
    j       dsps_dotprod_s16_ansi               // tail call, a0..a4 are unchanged

.dsps_dotprod_s16_arp4_body:
    add     sp,sp,-16

    // Enable unaligned data access (sets bit 1 of the cfg register;
    // the register is not restored before returning)
    esp.movx.r.cfg t6
    or      t6, t6, 2
    esp.movx.w.cfg t6

    add     t6, a4, -15
    neg     t6, t6              // t6 - real_shift = 15 - shift

    // Preload the accumulator with the rounding term 0x7fff >> shift
    li      t3, 0x7fff
    srl     t3, t3, a4
    esp.zero.xacc
    esp.movx.w.xacc.l t3

    mv      t3, a0              // t3 - running src1 pointer
    mv      t4, a1              // t4 - running src2 pointer

    esp.vld.128.ip q0, t3, 16   // q0 - first 8 samples of src1
    srli    t5, a3, 3           // t5 = len>>3, number of 8-sample iterations

.main_loop:
    esp.vld.128.ip q1, t4, 16   // q1 - next 8 samples of src2
    esp.vmulas.s16.xacc.ld.ip q0, t3, 16, q0, q1 // accumulate q0*q1, reload q0 from src1
    add     t5, t5, -1
    bgtz    t5, .main_loop

    esp.srs.s.xacc t5, t6       // shift accx register by real_shift (t6), save the lower 32bits to t5
    sh      t5, 0(a2)           // store result to output buffer

    li      a0,0
    add     sp,sp,16
    ret
|
||||
|
||||
#endif // dsps_dotprod_s16_arp4_enabled
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro dotprod_s16_ae32 x1, x2, count
|
||||
// This macro calculates fixed point dot product for ((count + 1)*4) int16 samples
|
||||
// x1 - input array1 register (for example a2)
|
||||
// x2 - input array2 register (for example a3)
|
||||
// count - counter register (for example a7)
|
||||
// count - samples_count / 4 - 1
|
||||
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
|
||||
// acchi and acclo have to be initialized before
|
||||
// Result - acchi || acclo
|
||||
// Modifies:
|
||||
// m0, m1, m2, m3
|
||||
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
|
||||
|
||||
/*
|
||||
* Data schedule. Each line represents an instruction, columns represent
|
||||
* register contents. Last column (MUL) shows the multiplication which
|
||||
* takes place. Values loaded in the given cycle are shown in square brackets.
|
||||
*
|
||||
* m0 m1 m2 m3 MUL
|
||||
* --------- pre-load ------------
|
||||
*[x0 x1] (no MULs in the first 3 instructions)
|
||||
* x0 x1 [y0 y1]
|
||||
* x0 x1 [x2 x3] y0 y1
|
||||
* x0 x1 x2 x3 y0 y1 [y2 y3] x0*y0
|
||||
* ---------- loop -------------- (the following 4 instructions are
|
||||
*[x4 x5] x2 x3 y0 y1 y2 y3 x1*y1 repeated as much as needed)
|
||||
* x4 x5 x2 x3 [y4 y5] y2 y3 x2*y2
|
||||
* x4 x5 [x6 x7] y4 y5 y2 y3 x3*y3
|
||||
* x4 x5 x6 x7 y4 y5 [y6 y7] x4*y4
|
||||
* --------- finalize ------------
|
||||
* x4 x5 x6 x7 y4 y5 y6 y7 x5*y5 (nothing is loaded)
|
||||
* x4 x5 x6 x7 y4 y5 y6 y7 x6*y6
|
||||
* x4 x5 x6 x7 y4 y5 y6 y7 x7*y7
|
||||
*/
|
||||
|
||||
addi \x1, \x1, -4 // Step back one word to arrange the first pointer for the ldinc loads below
|
||||
addi \x2, \x2, -4 // Step back one word to arrange the first pointer for the ldinc loads below
|
||||
//lddec m0, \x1
|
||||
//lddec m2, \x2 // To arrange first pointer
|
||||
|
||||
// Pre-load: three loads without multiplication (see the schedule above)
ldinc m0, \x1
|
||||
ldinc m2, \x2
|
||||
ldinc m1, \x1
|
||||
|
||||
mula.dd.ll.ldinc m3, \x2, m0, m2
|
||||
// The four instructions between .loop and .loop_end are repeated count times
loopnez \count, .loop_end
|
||||
.loop:
|
||||
mula.dd.hh.ldinc m0, \x1, m0, m2
|
||||
mula.dd.ll.ldinc m2, \x2, m1, m3
|
||||
mula.dd.hh.ldinc m1, \x1, m1, m3
|
||||
mula.dd.ll.ldinc m3, \x2, m0, m2
|
||||
.loop_end:
|
||||
|
||||
// Finalize: the remaining three multiplications, no further loads
mula.dd.hh m0, m2
|
||||
mula.dd.ll m1, m3
|
||||
mula.dd.hh m1, m3
|
||||
|
||||
.endm // dotprod_s16_ae32
|
||||
|
||||
|
||||
.macro dotprod_s16_ae32_full x1, x2, count, full_count
|
||||
// This macro calculates fixed point dot product for full_count int16 samples:
// ((count + 1)*4) samples through dotprod_s16_ae32, then the 0..3 remaining
// samples selected by the two low bits of full_count.
|
||||
// x1 - input array1 register (for example a2)
|
||||
// x2 - input array2 register (for example a3)
|
||||
// count - counter register (for example a7)
|
||||
// count - samples_count / 4 - 1
|
||||
// full_count - samples_count
|
||||
// acc += x1[i + 0]*x2[i + 0] + x1[i + 1]*x2[i + 1] + x1[i + 2]*x2[i + 2] + x1[i + 3]*x2[i + 3]; i: 0..count
|
||||
// acchi and acclo have to be initialized before
|
||||
// Result - acchi || acclo
|
||||
// Modifies:
|
||||
// m0, m1, m2, m3
|
||||
// acchi || acclo - must be loaded before (for example 0x3fff to acclo).
|
||||
|
||||
dotprod_s16_ae32 \x1, \x2, \count
|
||||
|
||||
// Bit 1 of full_count set: two more samples (one word from each array)
bbci \full_count, 1, .mod2chk
|
||||
ldinc m0, \x1
|
||||
ldinc m2, \x2
|
||||
mula.dd.hh m0, m2
|
||||
mula.dd.ll m0, m2
|
||||
.mod2chk:
|
||||
// Bit 0 of full_count set: one more sample (only the low halves are multiplied)
bbci \full_count, 0, .mod1chk
|
||||
ldinc m0, \x1
|
||||
ldinc m2, \x2
|
||||
mula.dd.ll m0, m2
|
||||
.mod1chk:
|
||||
|
||||
.endm // dotprod_s16_ae32_full
|
||||
@@ -0,0 +1,47 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
// Dot product of two 2-D float images (ANSI C version).
// Sums in_image[y][x] * filter[y][x] over count_y rows of count_x samples and
// stores the sum in *out_value. Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when
// step * count exceeds the stride of either image, otherwise ESP_OK.
esp_err_t dspi_dotprod_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y)
{
    // The requested window has to fit into the stride of both images
    if ((in_image->step_x * count_x > in_image->stride_x) ||
            (in_image->step_y * count_y > in_image->stride_y) ||
            (filter->step_x * count_x > filter->stride_x) ||
            (filter->step_y * count_y > filter->stride_y)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    // Distance between two processed rows, in elements
    int in_row_step = in_image->stride_x * in_image->step_y;
    int flt_row_step = filter->stride_x * filter->step_y;

    float *in_row = (float *)in_image->data;
    float *flt_row = (float *)filter->data;
    float sum = 0;

    int row = 0;
    while (row < count_y) {
        int col = 0;
        while (col < count_x) {
            sum += in_row[in_image->step_x * col] * flt_row[filter->step_x * col];
            col++;
        }
        in_row += in_row_step;
        flt_row += flt_row_step;
        row++;
    }

    *out_value = sum;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,47 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
|
||||
// Dot product of two 2-D float images with an offset added to every filter
// sample (ANSI C version): sums in_image[y][x] * (filter[y][x] + offset) over
// count_y rows of count_x samples and stores the sum in *out_value.
// Returns ESP_ERR_DSP_PARAM_OUTOFRANGE when step * count exceeds the stride
// of either image, otherwise ESP_OK.
esp_err_t dspi_dotprod_off_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y, float offset)
{
    // The requested window has to fit into the stride of both images
    if ((in_image->step_x * count_x > in_image->stride_x) ||
            (in_image->step_y * count_y > in_image->stride_y) ||
            (filter->step_x * count_x > filter->stride_x) ||
            (filter->step_y * count_y > filter->stride_y)) {
        return ESP_ERR_DSP_PARAM_OUTOFRANGE;
    }

    // Distance between two processed rows, in elements
    int in_row_step = in_image->stride_x * in_image->step_y;
    int flt_row_step = filter->stride_x * filter->step_y;

    float *in_row = (float *)in_image->data;
    float *flt_row = (float *)filter->data;
    float sum = 0;

    int row = 0;
    while (row < count_y) {
        int col = 0;
        while (col < count_x) {
            sum += in_row[in_image->step_x * col] * (flt_row[filter->step_x * col] + offset);
            col++;
        }
        in_row += in_row_step;
        flt_row += flt_row_step;
        row++;
    }

    *out_value = sum;
    return ESP_OK;
}
|
||||
@@ -0,0 +1,62 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dotprod_f32_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_dotprod_f32_m_ae32.S"
|
||||
|
||||
// This is dot product function for ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_f32_ae32
|
||||
.global .dsps_dotprod_f32_ae32_body
|
||||
.type dsps_dotprod_f32_ae32,@function
|
||||
// The function implements the following C code:
|
||||
//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
|
||||
//{
|
||||
// float acc = 0;
|
||||
// for (int i=0 ; i< len ; i++)
|
||||
// {
|
||||
// acc += src1[i]*src2[i];
|
||||
// }
|
||||
// *dest = acc;
|
||||
// return ESP_OK;
|
||||
//}
|
||||
|
||||
dsps_dotprod_f32_ae32:
|
||||
// Floating point dot product of len samples; the result is stored at dest and 0 is returned in a2.
// Register usage on entry:
// src1 - a2
|
||||
// src2 - a3
|
||||
// dest - a4
|
||||
// len - a5
|
||||
|
||||
entry a1, 16
|
||||
// Exported label (declared .global in this file) located after the entry instruction
.dsps_dotprod_f32_ae32_body:
|
||||
// Array increment for floating point data should be 4
|
||||
movi.n a8, 4
|
||||
// Clear initial state of the result register (f1 = 0; a9 = 0 is also the start offset)
|
||||
movi.n a9, 0
|
||||
wfr f1, a9
|
||||
// a2 - input1
|
||||
// a3 - input2
|
||||
// a5 - length
|
||||
// a9 - 0, start offset in arrays
// a8 - 4, step in arrays
|
||||
dotprod_f32_ae32 a2, a3, a5, a9, a8;
|
||||
|
||||
ssi f1, a4, 0 // Store result from f1 to memory at a4
|
||||
|
||||
movi.n a2, 0 // return status ESP_OK
|
||||
retw.n
|
||||
|
||||
#endif // dotprod_f32_ae32_enabled
|
||||
@@ -0,0 +1,85 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_f32_aes3_enabled == 1)
|
||||
|
||||
// This is the dot product function for the ESP32-S3 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_f32_aes3
|
||||
.global .dsps_dotprod_f32_ae32_body
|
||||
.type dsps_dotprod_f32_aes3,@function
|
||||
// The function implements the following C code:
|
||||
//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
|
||||
//{
|
||||
// float acc = 0;
|
||||
// for (int i=0 ; i< len ; i++)
|
||||
// {
|
||||
// acc += src1[i]*src2[i];
|
||||
// }
|
||||
// *dest = acc;
|
||||
// return ESP_OK;
|
||||
//}
|
||||
|
||||
dsps_dotprod_f32_aes3:
|
||||
// Floating point dot product of len samples, four samples per loop iteration.
// Used only when len is a multiple of 4 and both src1 and src2 are 16-byte
// aligned; otherwise execution continues at .dsps_dotprod_f32_ae32_body.
// Register usage on entry:
// src1 - a2
|
||||
// src2 - a3
|
||||
// dest - a4
|
||||
// len - a5
|
||||
|
||||
entry a1, 16
|
||||
// Check length and align: a10 = (len & 3) | ((src1 | src2) & 15)
|
||||
movi.n a10, 3
|
||||
and a10, a10, a5
|
||||
movi.n a9, 15
|
||||
or a11, a3, a2
|
||||
and a11, a9, a11
|
||||
or a10, a10, a11
|
||||
beqz a10, .dsps_dotprod_f32_aes3_body
|
||||
// Continue in the ESP32 (ae32) implementation; entry has already been executed here
|
||||
J .dsps_dotprod_f32_ae32_body
|
||||
|
||||
.dsps_dotprod_f32_aes3_body:
|
||||
// Clear initial state of the four partial-sum registers f0..f3
|
||||
movi.n a9, 0
|
||||
wfr f0, a9
|
||||
wfr f1, a9
|
||||
wfr f2, a9
|
||||
wfr f3, a9
|
||||
// a2 - input1
|
||||
// a3 - input2
|
||||
// a5 - length
|
||||
|
||||
srli a6, a5, 2 // a6 = len / 4, number of loop iterations
|
||||
// lsx f0, a2, a9
|
||||
loopnez a6, .loop_mac_end_m_ae32
|
||||
EE.LDF.128.IP f11, f10, f9, f8, a2, 16
|
||||
EE.LDF.128.IP f7, f6, f5, f4, a3, 16
|
||||
madd.s f0, f4, f8 // f0 += f4 * f8
|
||||
madd.s f1, f5, f9 // f1 += f5 * f9
|
||||
madd.s f2, f6, f10 // f2 += f6 * f10
|
||||
madd.s f3, f7, f11 // f3 += f7 * f11
|
||||
.loop_mac_end_m_ae32:
|
||||
|
||||
// Reduce the four partial sums into f0
add.s f0, f0, f1
|
||||
add.s f0, f0, f2
|
||||
add.s f0, f0, f3
|
||||
|
||||
ssi f0, a4, 0 // Store result from f0 to memory at a4
|
||||
|
||||
movi.n a2, 0 // return status ESP_OK
|
||||
retw.n
|
||||
|
||||
#endif // dsps_dotprod_f32_aes3_enabled
|
||||
@@ -0,0 +1,25 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
|
||||
esp_err_t dsps_dotprod_f32_ansi(const float *src1, const float *src2, float *dest, int len)
|
||||
{
|
||||
float acc = 0;
|
||||
for (int i = 0 ; i < len ; i++) {
|
||||
acc += src1[i] * src2[i];
|
||||
}
|
||||
*dest = acc;
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_f32_arp4_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprod_f32_arp4
|
||||
.type dsps_dotprod_f32_arp4,@function
|
||||
// The function implements the following C code:
|
||||
//esp_err_t dsps_dotprod_f32(const float* src1, const float* src2, float* dest, int len)
|
||||
//{
|
||||
// float acc = 0;
|
||||
// for (int i=0 ; i< len ; i++)
|
||||
// {
|
||||
// acc += src1[i]*src2[i];
|
||||
// }
|
||||
// *dest = acc;
|
||||
// return ESP_OK;
|
||||
//}
|
||||
|
||||
dsps_dotprod_f32_arp4:
// Floating point dot product of len samples; the sum is stored at dest and
// 0 is returned in a0.
//
// src1 - a0
// src2 - a1
// dest - a2
// len  - a3
//
// fa2 - accumulator, fa0/fa1 - samples loaded for the next multiply-add.
// Both loops load the next pair of samples right after each multiply-add,
// so one element beyond the last sample of each array is read.
    add     sp,sp,-16

    fmv.w.x fa2,zero

    // len <= 0: there is nothing to accumulate. Without this check the
    // count-down loop below would multiply one sample pair that does not
    // exist and then keep running while a3 counts down through negative values.
    blez    a3, .dsps_dotprod_f32_arp4_done

    flw     fa0, 0(a0)
    flw     fa1, 0(a1)
    add     a0, a0, 4
    add     a1, a1, 4
    li      a4, 2
    ble     a3, a4, .loop_less_2

    // Loop when len > 2
    esp.lp.setup 0, a3, .dotprod_loop
        fmadd.s fa2, fa0, fa1, fa2
        flw     fa0, 0(a0)
        flw     fa1, 0(a1)
        add     a0, a0, 4
.dotprod_loop:  add     a1, a1, 4
    fsw     fa2, 0(a2)

    add     sp,sp,16
    li      a0,0
    ret
// Loop when len <=2
.loop_less_2:
    fmadd.s fa2, fa0, fa1, fa2
    flw     fa0, 0(a0)
    flw     fa1, 0(a1)
    add     a0, a0, 4
    add     a1, a1, 4
    add     a3, a3, -1
    bnez    a3, .loop_less_2
.dsps_dotprod_f32_arp4_done:
    fsw     fa2, 0(a2)
    add     sp,sp,16
    li      a0,0
    ret
|
||||
|
||||
#endif // dsps_dotprod_f32_arp4_enabled
|
||||
@@ -0,0 +1,42 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro dotprod_f32_ae32 x1 x2 count step1 step2
|
||||
// This macro calculates floating point dot product for count float samples
|
||||
// x1, x2 - input arrays
|
||||
// count - amount of samples
|
||||
// step1 - A register holding the start byte offset; it is used as the running offset into both arrays and is modified
|
||||
// step2 - A register holding the byte increment added to step1 after every sample
|
||||
// f1 - contains initial value
|
||||
//
|
||||
// result in f1
|
||||
//
|
||||
// Macros body:
|
||||
// f1 += x1[off]*x2[off]; off = step1 + i*step2 (byte offset), i: 0..count-1
|
||||
// affected: f0, f1, f2 and the step1 register
|
||||
// Example: dotprod_f32_ae32 a2 a3 a5 a9 a8
|
||||
// a9 == 0, start offset; a8 == 4, step is 4 bytes
|
||||
// a5 == 32, length of array is 32
|
||||
//
|
||||
// mov \step1, \step2
|
||||
// Pre-load the first x2 sample; every iteration loads the x2 sample for the next one
lsx f0, \x2, \step1
|
||||
// sub \x1, \x1, \step1 // To compensate first increment
|
||||
loopnez \count, .loop_mac_end_m_ae32
|
||||
lsx f2, \x1, \step1
|
||||
madd.s f1, f2, f0
|
||||
add.n \step1, \step1, \step2
|
||||
lsx f0, \x2, \step1
|
||||
.loop_mac_end_m_ae32:
|
||||
.endm
|
||||
@@ -0,0 +1,64 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dotprode_f32_ae32_enabled == 1)
|
||||
|
||||
#include "dsps_dotprode_f32_m_ae32.S"
|
||||
|
||||
// This is dot product function for ESP32 processor.
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprode_f32_ae32
|
||||
.type dsps_dotprode_f32_ae32,@function
|
||||
// The function implements the following C code:
|
||||
//esp_err_t dsps_dotprod_f32_ae32(const float* src1, const float* src2, float* dest, int len)
|
||||
//{
|
||||
// float acc = 0;
|
||||
// for (int i=0 ; i< len ; i++)
|
||||
// {
|
||||
// acc += src1[i]*src2[i];
|
||||
// }
|
||||
// *dest = acc;
|
||||
// return ESP_OK;
|
||||
//}
|
||||
|
||||
dsps_dotprode_f32_ae32:
|
||||
// Floating point dot product with separate element steps for the two arrays;
// the result is stored at dest and 0 is returned in a2.
// Register usage on entry:
// src1 - a2
|
||||
// src2 - a3
|
||||
// dest - a4
|
||||
// len - a5
|
||||
// step1- a6
|
||||
// step2- a7
|
||||
|
||||
entry a1, 16
|
||||
// Array increment for floating point data should be 4:
// convert both steps from elements to bytes
|
||||
|
||||
slli a6,a6, 2
|
||||
slli a7,a7, 2
|
||||
// Clear initial state of the result register
|
||||
movi.n a9, 0
|
||||
wfr f1, a9
|
||||
// a2 - input1
|
||||
// a3 - input2
|
||||
// a5 - length
|
||||
// a6,a7, step in arrays (in bytes after the shifts above)
|
||||
dotprode_f32_ae32 a2, a3, a5, a6, a7;
|
||||
|
||||
ssi f1, a4, 0 // Store result from f1 to memory at a4
|
||||
|
||||
movi.n a2, 0 // return status ESP_OK
|
||||
retw.n
|
||||
|
||||
#endif //dotprode_f32_ae32_enabled
|
||||
@@ -0,0 +1,25 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
|
||||
esp_err_t dsps_dotprode_f32_ansi(const float *src1, const float *src2, float *dest, int len, int step1, int step2)
|
||||
{
|
||||
float acc = 0;
|
||||
for (int i = 0 ; i < len ; i++) {
|
||||
acc += src1[i * step1] * src2[i * step2];
|
||||
}
|
||||
*dest = acc;
|
||||
return ESP_OK;
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
// Copyright 2024 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
#if (dsps_dotprod_f32_arp4_enabled == 1)
|
||||
|
||||
.text
|
||||
.align 4
|
||||
.global dsps_dotprode_f32_arp4
|
||||
.type dsps_dotprode_f32_arp4,@function
|
||||
// The function implements the following C code:
|
||||
//esp_err_t dsps_dotprode_f32(const float *src1, const float *src2, float *dest, int len, int step1, int step2)
|
||||
//{
|
||||
// float acc = 0;
|
||||
// for (int i = 0 ; i < len ; i++) {
|
||||
// acc += src1[i * step1] * src2[i * step2];
|
||||
// }
|
||||
// *dest = acc;
|
||||
// return ESP_OK;
|
||||
//}
|
||||
|
||||
// RISC-V implementation of the strided float dot product:
//   *dest = sum(src1[i*step1] * src2[i*step2]), i = 0..len-1
// The sum is accumulated in fa2 and 0 is returned in a0.
// NOTE(review): the first pair is loaded and fmadd'ed before len is checked against 0,
// so len <= 0 does not look like a supported input - confirm with callers.
dsps_dotprode_f32_arp4:
|
||||
// src1 - a0
|
||||
// src2 - a1
|
||||
// dest - a2
|
||||
// len - a3
// step1 - a4 (in elements on entry, converted to bytes below)
// step2 - a5 (in elements on entry, converted to bytes below)
|
||||
// Stack pointer is moved down by 16 bytes; nothing is stored in the frame by this routine
add sp,sp,-16
|
||||
|
||||
// fa2 = 0.0f, the accumulator
fmv.w.x fa2,zero
|
||||
slli a4, a4, 2 // step address increment by 4
|
||||
slli a5, a5, 2 // step address increment by 4
|
||||
|
||||
// Preload the first pair of operands and advance both pointers by one step
flw fa0, 0(a0)
|
||||
flw fa1, 0(a1)
|
||||
add a0, a0, a4
|
||||
add a1, a1, a5
|
||||
// Lengths of 2 or less take the branch-based loop at .loop_less_2
li a6, 2
|
||||
ble a3, a6, .loop_less_2
|
||||
|
||||
// Loop when len > 2
|
||||
// NOTE(review): presumably a hardware loop that repeats the body up to and including
// the instruction at .dotprod_loop a3 times - confirm against the ESP32-P4 ISA documentation
esp.lp.setup 0, a3, .dotprod_loop
|
||||
// Accumulate the pair loaded previously, then fetch the next pair.
// NOTE(review): the loads follow the fmadd, so the last pass reads one step
// beyond the final element used from each array.
fmadd.s fa2, fa0, fa1, fa2
|
||||
flw fa0, 0(a0)
|
||||
flw fa1, 0(a1)
|
||||
add a0, a0, a4
|
||||
.dotprod_loop: add a1, a1, a5
|
||||
// Store the sum to *dest
fsw fa2, 0(a2)
|
||||
|
||||
// Restore the stack pointer and return 0
add sp,sp,16
|
||||
li a0,0
|
||||
ret
|
||||
// Loop when len <=2
|
||||
.loop_less_2:
|
||||
// Same body as the loop above, with a3 used as a down-counter
fmadd.s fa2, fa0, fa1, fa2
|
||||
flw fa0, 0(a0)
|
||||
flw fa1, 0(a1)
|
||||
add a0, a0, a4
|
||||
add a1, a1, a5
|
||||
add a3, a3, -1
|
||||
// Repeat until the counter reaches exactly zero
bnez a3, .loop_less_2
|
||||
|
||||
// Store the sum to *dest
fsw fa2, 0(a2)
|
||||
// Restore the stack pointer and return 0
add sp,sp,16
|
||||
li a0,0
|
||||
ret
|
||||
|
||||
#endif // dotprode_f32_arp4_enabled
|
||||
@@ -0,0 +1,41 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
.macro dotprode_f32_ae32 x1 x2 count step1 step2
|
||||
// This macro calculates a strided floating point dot product for count float samples
|
||||
// x1, x2 - A registers with the addresses of the input arrays (modified by the macro)
|
||||
// count - A register with the amount of samples
|
||||
// step1,step2 - A registers with the array steps in bytes (4 * step in elements for float data)
|
||||
// f1 - contains initial value
|
||||
//
|
||||
// result in f1
|
||||
//
|
||||
// Macros body:
|
||||
// f1 += x1[i*step1]*x2[i*step2]; i: 0..count-1
|
||||
// affected: f0, f1, f2, and the x1 and x2 registers
|
||||
// Example: dotprode_f32_ae32 a2 a3 a5 a8 a9
|
||||
// a8 == 4, step is 4 bytes
|
||||
// a5 == 32, length of array is 32
|
||||
// NOTE(review): the loop-end label below is a fixed name, so a second expansion of this
// macro in the same source file would presumably produce a duplicate label - confirm.
//
|
||||
// Preload the first x2 element; this load is issued before the loop, regardless of count
lsi f0, \x2, 0
|
||||
sub \x1, \x1, \step1 // To compensate first increment
|
||||
// Loop over the body up to the end label; loopnez skips it when count is zero
loopnez \count, .loop_mace_end_m_ae32
|
||||
add.n \x1, \x1, \step1
|
||||
lsi f2, \x1, 0
|
||||
// f1 += f2 * f0
madd.s f1, f2, f0
|
||||
add.n \x2, \x2, \step2
|
||||
// Fetch the x2 element for the next pass; on the last pass this reads one step past the last used element
lsi f0, \x2, 0
|
||||
.loop_mace_end_m_ae32:
|
||||
.endm
|
||||
@@ -0,0 +1,191 @@
|
||||
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
#ifndef _dspi_dotprod_H_
|
||||
#define _dspi_dotprod_H_
|
||||
|
||||
#include "esp_log.h"
|
||||
#include "dsp_err.h"
|
||||
#include "dsp_types.h"
|
||||
#include "dspi_dotprod_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two images
|
||||
* Dot product calculation for two floating point images: *out_value += image[i*...] * src2[i*...]); i= [0..count_x*count_y)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] in_image descriptor of the image
|
||||
* @param[in] filter descriptor of the filter
|
||||
* @param[out] out_value pointer to the output value
|
||||
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
|
||||
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dspi_dotprod_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y);
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two images
|
||||
* Dot product calculation for two integer (int16_t/uint16_t/int8_t/uint8_t) images: *out_value += (image[i*...] * filter[i*...]); i= [0..count_x*count_y)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] in_image descriptor of the image
|
||||
* @param[in] filter descriptor of the filter
|
||||
* @param[out] out_value pointer to the output value
|
||||
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
|
||||
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
|
||||
* @param[in] shift - result shift to right, by default must be 15 for int16_t or 7 for int8_t
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dspi_dotprod_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
|
||||
|
||||
esp_err_t dspi_dotprod_s16_aes3(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_s8_aes3(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
|
||||
|
||||
esp_err_t dspi_dotprod_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift);
|
||||
esp_err_t dspi_dotprod_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift);
|
||||
|
||||
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two images with input offset
|
||||
* Dot product calculation for two floating point images: *out_value += (image[i*...] + offset) * src2[i*...]); i= [0..count_x*count_y)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] in_image descriptor of the image
|
||||
* @param[in] filter descriptor of the filter
|
||||
* @param[out] out_value pointer to the output value
|
||||
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
|
||||
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
|
||||
* @param[in] offset - input offset value.
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dspi_dotprod_off_f32_ansi(image2d_t *in_image, image2d_t *filter, float *out_value, int count_x, int count_y, float offset);
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two images with input offset
|
||||
* Dot product calculation for two integer (int16_t/uint16_t/int8_t/uint8_t) images: *out_value += ((image[i*...] + offset) * filter[i*...]); i= [0..count_x*count_y)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] in_image descriptor of the image
|
||||
* @param[in] filter descriptor of the filter
|
||||
* @param[out] out_value pointer to the output value
|
||||
* @param[in] count_x amount of samples by X axis (count_x*step_X <= width)
|
||||
* @param[in] count_y amount of samples by Y axis (count_y*step_Y <= height)
|
||||
* @param[in] shift - result shift to right, by default must be 15 for int16_t or 7 for int8_t
|
||||
* @param[in] offset - input offset value.
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dspi_dotprod_off_s16_ansi(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
|
||||
esp_err_t dspi_dotprod_off_u16_ansi(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
|
||||
esp_err_t dspi_dotprod_off_s8_ansi(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
|
||||
esp_err_t dspi_dotprod_off_u8_ansi(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
|
||||
|
||||
esp_err_t dspi_dotprod_off_s16_aes3(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
|
||||
esp_err_t dspi_dotprod_off_u16_aes3(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
|
||||
esp_err_t dspi_dotprod_off_s8_aes3(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
|
||||
esp_err_t dspi_dotprod_off_u8_aes3(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
|
||||
|
||||
esp_err_t dspi_dotprod_off_s16_arp4(image2d_t *in_image, image2d_t *filter, int16_t *out_value, int count_x, int count_y, int shift, int16_t offset);
|
||||
esp_err_t dspi_dotprod_off_u16_arp4(image2d_t *in_image, image2d_t *filter, uint16_t *out_value, int count_x, int count_y, int shift, uint16_t offset);
|
||||
esp_err_t dspi_dotprod_off_s8_arp4(image2d_t *in_image, image2d_t *filter, int8_t *out_value, int count_x, int count_y, int shift, int8_t offset);
|
||||
esp_err_t dspi_dotprod_off_u8_arp4(image2d_t *in_image, image2d_t *filter, uint8_t *out_value, int count_x, int count_y, int shift, uint8_t offset);
|
||||
|
||||
/**@}*/
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Map the generic dspi_dotprod_* names onto a concrete implementation.
 *
 * - The float variants always use the ANSI C implementation.
 * - The integer variants use the _aes3 or _arp4 implementation when
 *   CONFIG_DSP_OPTIMIZED is defined and the matching *_enabled flag is 1.
 * - In every other configuration (CONFIG_DSP_ANSI, or no DSP option defined
 *   at all) the ANSI C implementation is used, so the generic names are
 *   always defined exactly once.
 */
#define dspi_dotprod_f32 dspi_dotprod_f32_ansi
#define dspi_dotprod_off_f32 dspi_dotprod_off_f32_ansi

#if defined(CONFIG_DSP_OPTIMIZED) && (dspi_dotprod_aes3_enabled == 1)
#define dspi_dotprod_s16 dspi_dotprod_s16_aes3
#define dspi_dotprod_u16 dspi_dotprod_u16_aes3
#define dspi_dotprod_s8 dspi_dotprod_s8_aes3
#define dspi_dotprod_u8 dspi_dotprod_u8_aes3
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_aes3
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_aes3
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_aes3
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_aes3
#elif defined(CONFIG_DSP_OPTIMIZED) && (dspi_dotprod_arp4_enabled == 1)
#define dspi_dotprod_s16 dspi_dotprod_s16_arp4
#define dspi_dotprod_s8 dspi_dotprod_s8_arp4
#define dspi_dotprod_u16 dspi_dotprod_u16_arp4
#define dspi_dotprod_u8 dspi_dotprod_u8_arp4
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_arp4
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_arp4
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_arp4
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_arp4
#else // ANSI C fallback
#define dspi_dotprod_s16 dspi_dotprod_s16_ansi
#define dspi_dotprod_s8 dspi_dotprod_s8_ansi
#define dspi_dotprod_u16 dspi_dotprod_u16_ansi
#define dspi_dotprod_u8 dspi_dotprod_u8_ansi
#define dspi_dotprod_off_s16 dspi_dotprod_off_s16_ansi
#define dspi_dotprod_off_s8 dspi_dotprod_off_s8_ansi
#define dspi_dotprod_off_u16 dspi_dotprod_off_u16_ansi
#define dspi_dotprod_off_u8 dspi_dotprod_off_u8_ansi
#endif
|
||||
|
||||
|
||||
#endif // _dspi_dotprod_H_
|
||||
@@ -0,0 +1,24 @@
|
||||
#ifndef _dspi_dotprod_platform_H_
#define _dspi_dotprod_platform_H_

// Platform feature flags for the image (dspi) dot product functions.

#include "sdkconfig.h"

#if defined(__XTENSA__)
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>

// ESP32-S3 target: the _aes3 implementations are available
#if CONFIG_IDF_TARGET_ESP32S3
#define dspi_dotprod_aes3_enabled 1
#endif
#endif // __XTENSA__

// ESP32-P4 target: the _arp4 flag is always defined and follows CONFIG_DSP_OPTIMIZED
#if CONFIG_IDF_TARGET_ESP32P4
#if defined(CONFIG_DSP_OPTIMIZED)
#define dspi_dotprod_arp4_enabled 1
#else
#define dspi_dotprod_arp4_enabled 0
#endif // CONFIG_DSP_OPTIMIZED
#endif // CONFIG_IDF_TARGET_ESP32P4

#endif // _dspi_dotprod_platform_H_
|
||||
@@ -0,0 +1,128 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef _DSPI_DOTPROD_H_
|
||||
#define _DSPI_DOTPROD_H_
|
||||
|
||||
#include "esp_log.h"
|
||||
#include "dsp_err.h"
|
||||
|
||||
#include "dsps_dotprod_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
// These functions calculate the dot product of two vectors.
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two 16 bit vectors
|
||||
* Dot product calculation for two signed 16 bit arrays: *dest += (src1[i] * src2[i]) >> (15-shift); i= [0..N)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] src1 source array 1
|
||||
* @param[in] src2 source array 2
|
||||
* @param dest destination pointer
|
||||
* @param[in] len length of input arrays
|
||||
* @param[in] shift shift of the result.
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dotprod_s16_ansi(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
|
||||
esp_err_t dsps_dotprod_s16_ae32(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
|
||||
esp_err_t dsps_dotprod_s16_arp4(const int16_t *src1, const int16_t *src2, int16_t *dest, int len, int8_t shift);
|
||||
/**@}*/
|
||||
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two float vectors
|
||||
* Dot product calculation for two floating point arrays: *dest += (src1[i] * src2[i]); i= [0..N)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] src1 source array 1
|
||||
* @param[in] src2 source array 2
|
||||
* @param dest destination pointer
|
||||
* @param[in] len length of input arrays
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dotprod_f32_ansi(const float *src1, const float *src2, float *dest, int len);
|
||||
esp_err_t dsps_dotprod_f32_ae32(const float *src1, const float *src2, float *dest, int len);
|
||||
esp_err_t dsps_dotprod_f32_aes3(const float *src1, const float *src2, float *dest, int len);
|
||||
esp_err_t dsps_dotprod_f32_arp4(const float *src1, const float *src2, float *dest, int len);
|
||||
/**@}*/
|
||||
|
||||
/**@{*/
|
||||
/**
|
||||
* @brief dot product of two float vectors with step
|
||||
* Dot product calculation for two floating point arrays: *dest += (src1[i*step1] * src2[i*step2]); i= [0..N)
|
||||
* The extension (_ansi) use ANSI C and could be compiled and run on any platform.
|
||||
* The extension (_ae32) is optimized for ESP32 chip.
|
||||
*
|
||||
* @param[in] src1 source array 1
|
||||
* @param[in] src2 source array 2
|
||||
* @param dest destination pointer
|
||||
* @param[in] len length of input arrays
|
||||
* @param[in] step1 step over elements in first array
|
||||
* @param[in] step2 step over elements in second array
|
||||
* @return
|
||||
* - ESP_OK on success
|
||||
* - One of the error codes from DSP library
|
||||
*/
|
||||
esp_err_t dsps_dotprode_f32_ansi(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
|
||||
esp_err_t dsps_dotprode_f32_ae32(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
|
||||
esp_err_t dsps_dotprode_f32_arp4(const float *src1, const float *src2, float *dest, int len, int step1, int step2);
|
||||
/**@}*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Bind the generic dsps_dotprod_* names to one implementation.
 * Without CONFIG_DSP_OPTIMIZED everything maps to the portable _ansi versions.
 * With it, the *_enabled flags checked below pick the variant, and _ansi is
 * the fallback when no flag is set to 1.
 */
#if !CONFIG_DSP_OPTIMIZED

#define dsps_dotprod_s16 dsps_dotprod_s16_ansi
#define dsps_dotprod_f32 dsps_dotprod_f32_ansi
#define dsps_dotprode_f32 dsps_dotprode_f32_ansi

#else // CONFIG_DSP_OPTIMIZED

// 16 bit dot product
#if dsps_dotprod_s16_ae32_enabled == 1
#define dsps_dotprod_s16 dsps_dotprod_s16_ae32
#elif dsps_dotprod_s16_arp4_enabled == 1
#define dsps_dotprod_s16 dsps_dotprod_s16_arp4
#else
#define dsps_dotprod_s16 dsps_dotprod_s16_ansi
#endif // 16 bit selection

// float dot product, plain and strided ("dotprode")
#if dsps_dotprod_f32_aes3_enabled == 1
#define dsps_dotprod_f32 dsps_dotprod_f32_aes3
// the strided variant is bound to the _ae32 routine in this branch
#define dsps_dotprode_f32 dsps_dotprode_f32_ae32
#elif dsps_dotprod_f32_arp4_enabled == 1
#define dsps_dotprod_f32 dsps_dotprod_f32_arp4
#define dsps_dotprode_f32 dsps_dotprode_f32_arp4
#elif dotprod_f32_ae32_enabled == 1
#define dsps_dotprod_f32 dsps_dotprod_f32_ae32
#define dsps_dotprode_f32 dsps_dotprode_f32_ae32
#else
#define dsps_dotprod_f32 dsps_dotprod_f32_ansi
#define dsps_dotprode_f32 dsps_dotprode_f32_ansi
#endif // float selection

#endif // CONFIG_DSP_OPTIMIZED
|
||||
|
||||
#endif // _DSPI_DOTPROD_H_
|
||||
@@ -0,0 +1,42 @@
|
||||
#ifndef _dsps_dotprod_platform_H_
#define _dsps_dotprod_platform_H_

// Platform feature flags for the signal (dsps) dot product functions.

#include "sdkconfig.h"

#if defined(__XTENSA__)
#include <xtensa/config/core-isa.h>
#include <xtensa/config/core-matmap.h>

// Float routines (_ae32) need the FPU and the loop option
#if (XCHAL_HAVE_FP == 1) && (XCHAL_HAVE_LOOPS == 1)
#define dotprod_f32_ae32_enabled 1
#define dotprode_f32_ae32_enabled 1
#endif

// 16 bit routine (_ae32) needs the loop option and MAC16
#if (XCHAL_HAVE_LOOPS == 1) && (XCHAL_HAVE_MAC16 == 1)
#define dsps_dotprod_s16_ae32_enabled 1
#endif

#endif // __XTENSA__

// ESP32-S3 target: _aes3 flags
#if CONFIG_IDF_TARGET_ESP32S3
#define dsps_dotprod_s16_aes3_enabled 1
#define dsps_dotprod_f32_aes3_enabled 1
#endif // CONFIG_IDF_TARGET_ESP32S3

// ESP32-P4 target: _arp4 flags are always defined and follow CONFIG_DSP_OPTIMIZED
#if CONFIG_IDF_TARGET_ESP32P4
#if defined(CONFIG_DSP_OPTIMIZED)
#define dsps_dotprod_s16_arp4_enabled 1
#define dsps_dotprod_f32_arp4_enabled 1
#else
#define dsps_dotprod_s16_arp4_enabled 0
#define dsps_dotprod_f32_arp4_enabled 0
#endif // CONFIG_DSP_OPTIMIZED
#endif // CONFIG_IDF_TARGET_ESP32P4

#endif // _dsps_dotprod_platform_H_
|
||||
@@ -0,0 +1,167 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "esp_dsp.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
TEST_CASE("dsps_dotprod_f32_aexx functionality", "[dsps]")
{
    // Runs the platform-selected dsps_dotprod_f32 for every length in [1, max_N).
    // The result is written to z[1]; z[0] and z[2] hold guard values that must
    // survive every call, which catches writes outside the destination.
    const float check_value = 1235;
    const int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    z[0] = check_value;
    z[2] = check_value + 1;

    // One all-zero input: the product must be exactly 0 for every length
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT(0, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }

    // Constant inputs 1 and 3: the product of length i is i * 3
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT(i * 3, z[1]);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
TEST_CASE("dsps_dotprod_f32_aexx benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one platform-selected
    // dsps_dotprod_f32 call over max_N samples and checks it against a range.
    const int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }
    printf("Benchmark dsps_dotprod_f32_aexx - x=%8.8"PRIx32", y=%8.8"PRIx32", len=%8.8x\n", (uint32_t)x, (uint32_t)y, max_N);

    unsigned int start_b = dsp_get_cpu_cycle_count();
    int repeat_count = 1024;
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_f32(x, y, &z[1], max_N);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Average cycles per call, i.e. per max_N samples
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprod_f32_aexx - %f per 1024 samples + overhead.\n", cycles);
    float min_exec = 1024;
    float max_exec = 6 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
|
||||
TEST_CASE("dsps_dotprod_f32_ansi functionality", "[dsps]")
{
    // Runs the ANSI C dsps_dotprod_f32_ansi for every length in [1, max_N).
    // The result is written to z[1]; z[0] and z[2] hold guard values that must
    // survive every call, which catches writes outside the destination.
    const float check_value = 1235;
    const int max_N = 1024;
    float *x = (float *)malloc(max_N * sizeof(float));
    float *y = (float *)malloc(max_N * sizeof(float));
    float *z = (float *)malloc(max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    z[0] = check_value;
    z[2] = check_value + 1;

    // One all-zero input: the product must be exactly 0 for every length
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32_ansi(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT(0, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }

    // Constant inputs 1 and 3: the product of length i is i * 3
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_f32_ansi(x, y, &z[1], i);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT(i * 3, z[1]);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
TEST_CASE("dsps_dotprod_f32_ansi benchmark", "[dsps]")
{
    // Measures the average CPU cycle count of one dsps_dotprod_f32_ansi call
    // over max_N samples and checks it against an expected range.
    const int max_N = 1024;
    float *x = (float *)malloc(max_N * sizeof(float));
    float *y = (float *)malloc(max_N * sizeof(float));
    float *z = (float *)malloc(max_N * sizeof(float));
    TEST_ASSERT_NOT_NULL(x);
    TEST_ASSERT_NOT_NULL(y);
    TEST_ASSERT_NOT_NULL(z);

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    unsigned int start_b = dsp_get_cpu_cycle_count();
    int repeat_count = 1024;
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_f32_ansi(x, y, &z[1], max_N);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Note: "cycles" is the average per call (max_N samples), not per single sample
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprod_f32_ansi - %f per sample + overhead.\n", cycles);
    float min_exec = 1024;
    float max_exec = 20 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,216 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Test dsps_dotprod_s16_ansi function
|
||||
TEST_CASE("dsps_dotprod_s16_ansi functionality", "[dsps]")
|
||||
{
|
||||
int16_t check_value = 1235;
|
||||
int max_N = 1024;
|
||||
int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
|
||||
int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
|
||||
int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
|
||||
|
||||
for (int i = 0 ; i < max_N ; i++) {
|
||||
x[i] = 0;
|
||||
y[i] = 1000;
|
||||
}
|
||||
|
||||
z[0] = check_value;
|
||||
z[2] = check_value + 1;
|
||||
|
||||
// Check result == 0
|
||||
for (int i = 4; i < 1024; i++) {
|
||||
esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, 0);
|
||||
TEST_ASSERT_EQUAL(status, ESP_OK);
|
||||
TEST_ASSERT_EQUAL(check_value, z[0]);
|
||||
TEST_ASSERT_EQUAL(check_value + 1, z[2]);
|
||||
TEST_ASSERT_EQUAL(0, z[1]);
|
||||
}
|
||||
|
||||
int16_t val_x = 0x080;
|
||||
int16_t val_y = 0x100;
|
||||
int16_t val_shift = 0;
|
||||
|
||||
for (int i = 0; i < max_N; i++) {
|
||||
x[i] = val_x;
|
||||
y[i] = val_y;
|
||||
}
|
||||
|
||||
// We check that dotproduct working with shift = 0;
|
||||
for (int i = 4 ; i < 1024 ; i++) {
|
||||
esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, val_shift);
|
||||
|
||||
TEST_ASSERT_EQUAL(status, ESP_OK);
|
||||
TEST_ASSERT_EQUAL(check_value, z[0]);
|
||||
TEST_ASSERT_EQUAL(check_value + 1, z[2]);
|
||||
TEST_ASSERT_EQUAL((i * (val_x * val_y) + (0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
|
||||
}
|
||||
val_shift = 2;
|
||||
for (int i = 4 ; i < 1024 ; i++) {
|
||||
esp_err_t status = dsps_dotprod_s16_ansi(x, y, &z[1], i, val_shift);
|
||||
|
||||
TEST_ASSERT_EQUAL(status, ESP_OK);
|
||||
TEST_ASSERT_EQUAL(check_value, z[0]);
|
||||
TEST_ASSERT_EQUAL(check_value + 1, z[2]);
|
||||
TEST_ASSERT_EQUAL(((long long)i * ((long long)val_x * (long long)val_y) + ((long long)0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
|
||||
}
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
free(z);
|
||||
}
|
||||
|
||||
// Test dsps_dotprod_s16_ansi function
|
||||
// Functional test of dsps_dotprod_s16: the result is written to z[1] while
// z[0] and z[2] hold guard values that must stay untouched.
TEST_CASE("dsps_dotprod_s16_aexx functionality", "[dsps]")
{
    int16_t check_value = 1235;
    int max_N = 1024;
    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    // Guard values on both sides of the output sample
    z[0] = check_value;
    z[2] = check_value + 1;

    // Check result == 0 (x is all zeros)
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, 0);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL(0, z[1]);
    }

    int16_t val_x = 0x080;
    int16_t val_y = 0x100;
    int16_t val_shift = 0;

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = val_x;
        y[i] = val_y;
    }

    // Check the dot product with shift = 0
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, val_shift);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL((i * (val_x * val_y) + (0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
    }

    // Check the dot product with shift = 2
    val_shift = 2;
    for (int i = 4 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprod_s16(x, y, &z[1], i, val_shift);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL(check_value, z[0]);
        TEST_ASSERT_EQUAL(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL((i * (val_x * val_y) + ((int)0x7fff >> val_shift)) >> (15 - val_shift), z[1]);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
static portMUX_TYPE testnlock = portMUX_INITIALIZER_UNLOCKED;
|
||||
TEST_CASE("dsps_dotprod_s16 benchmark", "[dsps]")
|
||||
{
|
||||
int max_N = 1024;
|
||||
|
||||
int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
|
||||
int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
|
||||
int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));
|
||||
|
||||
for (int i = 0 ; i < max_N ; i++) {
|
||||
x[i] = 0x100;
|
||||
y[i] = 0x200;
|
||||
}
|
||||
// Disable interrupt to get exect count
|
||||
|
||||
portENTER_CRITICAL(&testnlock);
|
||||
|
||||
unsigned int start_b = dsp_get_cpu_cycle_count();
|
||||
int repeat_count = 1024;
|
||||
for (int i = 0 ; i < repeat_count ; i++) {
|
||||
dsps_dotprod_s16(x, y, &z[1], 1024, 0);
|
||||
}
|
||||
unsigned int end_b = dsp_get_cpu_cycle_count();
|
||||
portEXIT_CRITICAL(&testnlock);
|
||||
|
||||
float total_b = end_b - start_b;
|
||||
float cycles = total_b / (repeat_count);
|
||||
printf("Benchmark dsps_dotprod_s16 - %f cycles for 1024 samples + overhead. Result = %08x\n", cycles, z[1]);
|
||||
float min_exec = 256;
|
||||
float max_exec = 8 * 1024;
|
||||
TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);
|
||||
|
||||
free(x);
|
||||
free(y);
|
||||
free(z);
|
||||
}
|
||||
|
||||
// Measures the average cycle count of dsps_dotprod_s16_ansi on 1024 samples
// and checks that it falls inside the expected range.
TEST_CASE("dsps_dotprod_s16_ansi benchmark", "[dsps]")
{
    int max_N = 1024;

    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0x100;
        y[i] = 0x200;
    }

    // Disable interrupts to get an exact cycle count
    portENTER_CRITICAL(&testnlock);

    unsigned int start_b = dsp_get_cpu_cycle_count();
    int repeat_count = 1024;
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprod_s16_ansi(x, y, &z[1], max_N, 0);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    portEXIT_CRITICAL(&testnlock);

    // Average cycles per call
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    // Report under the name of the function actually measured here.
    printf("Benchmark dsps_dotprod_s16_ansi - %f cycles for 1024 samples + overhead. Result = %08x\n", cycles, z[1]);
    float min_exec = 1024 * 10;
    float max_exec = 1024 * 30;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,165 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dsps_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Functional test of dsps_dotprode_f32 with unit steps: the result is written
// to z[1] while z[0] and z[2] hold guard values that must stay untouched.
TEST_CASE("dsps_dotprode_f32 functionality", "[dsps]")
{
    float check_value = 1235;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    // Guard values on both sides of the output sample
    z[0] = check_value;
    z[2] = check_value + 1;

    // x is all zeros, so every length must give 0.
    // Float asserts are used because TEST_ASSERT_EQUAL compares as integers.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT(0.0f, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }

    // Every product is 1 * 3, so a length-i dot product must be 3 * i.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT((float)(i * 3), z[1]);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
// Measures the average cycle count of dsps_dotprode_f32 on 1024 samples and
// checks that it falls inside the expected range.
TEST_CASE("dsps_dotprode_f32 benchmark", "[dsps]")
{
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    unsigned int start_b = dsp_get_cpu_cycle_count();
    int repeat_count = 1024;
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprode_f32(x, y, &z[1], max_N, 1, 1);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Average cycles per call
    float total_b = end_b - start_b;
    float cycles = total_b / (repeat_count);
    printf("Benchmark dsps_dotprode_f32_aexx - %f per 1024 samples + overhead.\n", cycles);
    float min_exec = 1024;
    float max_exec = 6 * 1024;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
|
||||
// Functional test of dsps_dotprode_f32_ansi with unit steps: the result is
// written to z[1] while z[0] and z[2] hold guard values that must stay untouched.
TEST_CASE("dsps_dotprode_f32_ansi functionality", "[dsps]")
{
    float check_value = 1235;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    // Guard values on both sides of the output sample
    z[0] = check_value;
    z[2] = check_value + 1;

    // x is all zeros, so every length must give 0.
    // Float asserts are used because TEST_ASSERT_EQUAL compares as integers.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32_ansi(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT(0.0f, z[1]);
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 1;
        y[i] = 3;
    }

    // Every product is 1 * 3, so a length-i dot product must be 3 * i.
    for (int i = 1 ; i < max_N ; i++) {
        esp_err_t status = dsps_dotprode_f32_ansi(x, y, &z[1], i, 1, 1);
        TEST_ASSERT_EQUAL(ESP_OK, status);
        TEST_ASSERT_EQUAL_FLOAT(check_value, z[0]);
        TEST_ASSERT_EQUAL_FLOAT(check_value + 1, z[2]);
        TEST_ASSERT_EQUAL_FLOAT((float)(i * 3), z[1]);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
|
||||
// Measures the average per-sample cycle count of dsps_dotprode_f32_ansi and
// checks that it falls inside the expected range.
TEST_CASE("dsps_dotprode_f32_ansi benchmark", "[dsps]")
{
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    for (int i = 0 ; i < max_N ; i++) {
        x[i] = 0;
        y[i] = 1000;
    }

    unsigned int start_b = dsp_get_cpu_cycle_count();
    int repeat_count = 1024;
    for (int i = 0 ; i < repeat_count ; i++) {
        dsps_dotprode_f32_ansi(x, y, &z[1], max_N, 1, 1);
    }
    unsigned int end_b = dsp_get_cpu_cycle_count();

    // Average cycles per sample: total / (samples per call * number of calls)
    float total_b = end_b - start_b;
    float cycles = total_b / (max_N * repeat_count);
    printf("Benchmark dsps_dotprode_f32_ansi - %f per sample + overhead.\n", cycles);
    float min_exec = 5;
    float max_exec = 25;
    TEST_ASSERT_EXEC_IN_RANGE(min_exec, max_exec, cycles);

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,67 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_f32_ansi";
|
||||
|
||||
// Functional test of dspi_dotprod_f32_ansi: a 4x4 window sampled with step 2
// from two identical 8x8 images, evaluated at four different start positions.
TEST_CASE("dspi_dotprod_f32_ansi functionality", "[dspi]")
{
    float check_value1 = 336;   // window starts on an even column: 4 * (1 + 9 + 25 + 49)
    float check_value2 = 480;   // window starts on an odd column:  4 * (4 + 16 + 36 + 64)
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    // Each value depends only on the column index (i % 8).
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 1;
        y[i] = i % 8 + 1;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start offsets into the buffers and the expected result for each of them.
    // Both images are initialised with the same stride_x.
    const int start[4] = {0, 1, image1.stride_x, image1.stride_x + 1};
    const float expected[4] = {check_value1, check_value2, check_value1, check_value2};

    for (int k = 0; k < 4; k++) {
        image1.data = &x[start[k]];
        image2.data = &y[start[k]];
        float result = -1;
        dspi_dotprod_f32_ansi(&image1, &image2, &result, 4, 4);
        ESP_LOGI(TAG, "result %i = %f", k + 1, result);
        // Float assert: TEST_ASSERT_EQUAL would compare the values as integers.
        TEST_ASSERT_EQUAL_FLOAT(expected[k], result);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_off_f32_ansi";
|
||||
|
||||
// Functional test of dspi_dotprod_off_f32_ansi: a 4x4 window sampled with
// step 2 from two identical 8x8 images, evaluated at four start positions.
TEST_CASE("dspi_dotprod_off_f32_ansi functionality", "[dspi]")
{
    float check_value1 = 976;   // expected when the window starts on an even column
    float check_value2 = 1280;  // expected when the window starts on an odd column
    float offset = 10;
    int max_N = 1024;
    float *x = (float *)memalign(16, max_N * sizeof(float));
    float *y = (float *)memalign(16, max_N * sizeof(float));
    float *z = (float *)memalign(16, max_N * sizeof(float));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    // Each value depends only on the column index (i % 8).
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 1;
        y[i] = i % 8 + 1;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start offsets into the buffers and the expected result for each of them.
    // Both images are initialised with the same stride_x.
    const int start[4] = {0, 1, image1.stride_x, image1.stride_x + 1};
    const float expected[4] = {check_value1, check_value2, check_value1, check_value2};

    for (int k = 0; k < 4; k++) {
        image1.data = &x[start[k]];
        image2.data = &y[start[k]];
        float result = -1;
        dspi_dotprod_off_f32_ansi(&image1, &image2, &result, 4, 4, offset);
        ESP_LOGI(TAG, "result %i = %f", k + 1, result);
        // Float assert: TEST_ASSERT_EQUAL would compare the values as integers.
        TEST_ASSERT_EQUAL_FLOAT(expected[k], result);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,107 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_off_s16";
|
||||
|
||||
// Compares dspi_dotprod_off_s16 against dspi_dotprod_off_s16_ansi for several
// window sizes and logs the cycle count of each dspi_dotprod_off_s16 call.
TEST_CASE("dspi_dotprod_off_s16_aexx functionality", "[dspi]")
{
    int shift = 2;
    int16_t offset = 7;

    int max_N = 8192;
    int16_t *x = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)memalign(16, max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)memalign(16, max_N * sizeof(int16_t));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    for (int i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    // One entry per window size; the label is logged before each run.
    static const struct {
        const char *label;
        int size;
    } cases[] = {
        {"dspi_dotprod_off_s16 8x8",   8},
        {"dspi_dotprod_off_s16 16x16", 16},
        {"dspi_dotprod_off_s16 24x24", 24},
        {"dspi_dotprod_off_s16 32x32", 32},
    };

    for (size_t k = 0; k < sizeof(cases) / sizeof(cases[0]); k++) {
        const int n = cases[k].size;
        ESP_LOGI(TAG, "%s", cases[k].label);
        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64}; // Image 64x64, data starts at x[3]
        image2d_t image2 = {y, 1, 1, n, n, n, n};         // Image n x n

        int16_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_s16(&image1, &image2, &result, n, n, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %i", (int)(end_b - start_b));
        ESP_LOGI(TAG, "result 1 = %i", result);

        // The ANSI implementation supplies the expected value.
        int16_t result_ref = -1;
        dspi_dotprod_off_s16_ansi(&image1, &image2, &result_ref, n, n, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        TEST_ASSERT_EQUAL(result_ref, result);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_s16 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,69 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_off_s16_ansi";
|
||||
|
||||
// Functional test of dspi_dotprod_off_s16_ansi: a 4x4 window sampled with
// step 2 from two identical 8x8 images, evaluated at four start positions.
TEST_CASE("dspi_dotprod_off_s16_ansi functionality", "[dspi]")
{
    int16_t check_value1 = 8676;    // expected when the window starts on an even column
    int16_t check_value2 = 8742;    // expected when the window starts on an odd column
    int shift = 7;
    int16_t offset = 11;

    int max_N = 1024;
    int16_t *x = (int16_t *)malloc(max_N * sizeof(int16_t));
    int16_t *y = (int16_t *)malloc(max_N * sizeof(int16_t));
    int16_t *z = (int16_t *)malloc(max_N * sizeof(int16_t));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("malloc failed");
    }

    // Each value depends only on the column index (i % 8).
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start offsets into the buffers and the expected result for each of them.
    // Both images are initialised with the same stride_x.
    const int start[4] = {0, 1, image1.stride_x, image1.stride_x + 1};
    const int16_t expected[4] = {check_value1, check_value2, check_value1, check_value2};

    for (int k = 0; k < 4; k++) {
        image1.data = &x[start[k]];
        image2.data = &y[start[k]];
        int16_t result = -1;
        dspi_dotprod_off_s16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
        ESP_LOGI(TAG, "result %i = %i", k + 1, (int)result);
        TEST_ASSERT_EQUAL(expected[k], result);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,123 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_off_s8";
|
||||
|
||||
// Compares dspi_dotprod_off_s8 against dspi_dotprod_off_s8_ansi for several
// image/window sizes and logs the cycle count of each dspi_dotprod_off_s8 call.
TEST_CASE("dspi_dotprod_off_s8_aexx functionality", "[dspi]")
{
    int shift = 2;
    int8_t offset = 5;

    int max_N = 16384;
    int8_t *x = (int8_t *)memalign(16, (max_N) * sizeof(int8_t));
    int8_t *y = (int8_t *)memalign(16, (max_N) * sizeof(int8_t));
    int8_t *z = (int8_t *)memalign(16, max_N * sizeof(int8_t));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memalign failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    for (int i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    // One entry per run: logged label, edge of the first image, edge of the
    // second image (which is also the window size passed to the functions).
    // Note: the "128x128" label names the first image; that run uses a 16x16 window.
    static const struct {
        const char *label;
        int image_size;
        int window_size;
    } cases[] = {
        {"dspi_dotprod_off_s8 16x16",   64,  16},
        {"dspi_dotprod_off_s8 32x32",   64,  32},
        {"dspi_dotprod_off_s8 48x48",   64,  48},
        {"dspi_dotprod_off_s8 64x64",   128, 64},
        {"dspi_dotprod_off_s8 128x128", 128, 16},
    };

    for (size_t k = 0; k < sizeof(cases) / sizeof(cases[0]); k++) {
        const int m = cases[k].image_size;
        const int n = cases[k].window_size;
        ESP_LOGI(TAG, "%s", cases[k].label);
        image2d_t image1 = {&x[3], 1, 1, m, m, m, m}; // Image m x m, data starts at x[3]
        image2d_t image2 = {y, 1, 1, n, n, n, n};     // Image n x n

        int8_t result = -1;
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_s8(&image1, &image2, &result, n, n, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %i", (int)(end_b - start_b));
        ESP_LOGI(TAG, "result 1 = %i", result);

        // The ANSI implementation supplies the expected value.
        int8_t result_ref = -1;
        dspi_dotprod_off_s8_ansi(&image1, &image2, &result_ref, n, n, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        TEST_ASSERT_EQUAL(result_ref, result);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_s8 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_off_s8_ansi";
|
||||
|
||||
// Functional test of dspi_dotprod_off_s8_ansi: a 4x4 window sampled with
// step 2 from two identical 8x8 images, evaluated at four start positions.
TEST_CASE("dspi_dotprod_off_s8_ansi functionality", "[dspi]")
{
    int8_t check_value1 = 98;   // expected when the window starts on an even column
    int8_t check_value2 = 106;  // expected when the window starts on an odd column
    int shift = 7;
    int8_t offset = 11;

    int max_N = 1024;
    int8_t *x = (int8_t *)malloc(max_N * sizeof(int8_t));
    int8_t *y = (int8_t *)malloc(max_N * sizeof(int8_t));
    int8_t *z = (int8_t *)malloc(max_N * sizeof(int8_t));

    // Do not touch the buffers if any allocation failed.
    if (x == NULL || y == NULL || z == NULL) {
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("malloc failed");
    }

    // Each value depends only on the column index (i % 8).
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
        z[i] = 0;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start offsets into the buffers and the expected result for each of them.
    // Both images are initialised with the same stride_x.
    const int start[4] = {0, 1, image1.stride_x, image1.stride_x + 1};
    const int8_t expected[4] = {check_value1, check_value2, check_value1, check_value2};

    for (int k = 0; k < 4; k++) {
        image1.data = &x[start[k]];
        image2.data = &y[start[k]];
        int8_t result = -1;
        dspi_dotprod_off_s8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
        ESP_LOGI(TAG, "result %i = %i", k + 1, (int)result);
        TEST_ASSERT_EQUAL(expected[k], result);
    }

    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,107 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_off_u16";
|
||||
|
||||
// Checks that dspi_dotprod_off_u16() returns the same value as the reference
// implementation dspi_dotprod_off_u16_ansi() for several kernel sizes, and logs
// the cycle count of each dspi_dotprod_off_u16() call.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_off_u16_aexx functionality", "[dspi]")
{
    // One sub-test per kernel size; the label is logged verbatim and reused
    // as the failure message so a failing size is identifiable.
    static const struct {
        int kernel;         // width and height of the second image
        const char *label;
    } cases[] = {
        {8,  "dspi_dotprod_off_u16 8x8"},
        {16, "dspi_dotprod_off_u16 16x16"},
        {24, "dspi_dotprod_off_u16 24x24"},
        {32, "dspi_dotprod_off_u16 32x32"},
    };
    const int shift = 2;
    const uint16_t offset = 7;
    const size_t max_N = 8192;

    // 16-byte aligned buffers. z is not passed to the functions under test;
    // it is only zeroed and its address printed.
    uint16_t *x = memalign(16, max_N * sizeof *x);
    uint16_t *y = memalign(16, max_N * sizeof *y);
    uint16_t *z = memalign(16, max_N * sizeof *z);
    if (x == NULL || y == NULL || z == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    // Repeating 0..6 pattern in both inputs.
    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        const int n = cases[c].kernel;
        ESP_LOGI(TAG, "%s", cases[c].label);
        // 64x64 image that starts 3 elements into the aligned buffer
        // (presumably to exercise an unaligned source - confirm).
        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64};
        // n x n kernel
        image2d_t image2 = {y, 1, 1, n, n, n, n};

        uint16_t result = -1;   // sentinel so an unwritten output is noticeable
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_u16(&image1, &image2, &result, n, n, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint16_t result_ref = -1;
        dspi_dotprod_off_u16_ansi(&image1, &image2, &result_ref, n, n, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity argument order is (expected, actual, message).
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_u16 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_off_u16_ansi test case.
static const char *TAG = "dspi_dotprod_off_u16_ansi";
|
||||
|
||||
// Checks dspi_dotprod_off_u16_ansi() against fixed expected values for four
// different start positions inside the same pair of input buffers.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_off_u16_ansi functionality", "[dspi]")
{
    const uint16_t check_value1 = 8676;
    const uint16_t check_value2 = 8742;
    const int shift = 7;
    const uint16_t offset = 11;
    const size_t max_N = 1024;

    uint16_t *x = malloc(max_N * sizeof *x);
    uint16_t *y = malloc(max_N * sizeof *y);
    if (x == NULL || y == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    // Only the first 256 elements are initialised; the pattern is 255..262 repeating.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    uint16_t result = -1;   // sentinel so an unwritten output is noticeable

    // Start at element 0. Unity argument order is (expected, actual).
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 1 = %i", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element further along.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements in; expected to match the first result.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements in; expected to match the second result.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_u16_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
}
|
||||
@@ -0,0 +1,122 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_off_u8 test case.
static const char *TAG = "dspi_dotprod_off_u8";
|
||||
|
||||
// Checks that dspi_dotprod_off_u8() returns the same value as the reference
// implementation dspi_dotprod_off_u8_ansi() for several image/kernel sizes,
// and logs the cycle count of each dspi_dotprod_off_u8() call.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_off_u8_aexx functionality", "[dspi]")
{
    // One sub-test per row; the label is logged verbatim and reused as the
    // failure message so a failing configuration is identifiable.
    static const struct {
        int image;          // stride and size of the first image
        int kernel;         // width and height of the second image
        const char *label;
    } cases[] = {
        {64,  16, "dspi_dotprod_off_u8 16x16"},
        {64,  32, "dspi_dotprod_off_u8 32x32"},
        {64,  48, "dspi_dotprod_off_u8 48x48"},
        {128, 64, "dspi_dotprod_off_u8 64x64"},
        // NOTE(review): this label names the 128x128 image, the kernel is 16x16.
        {128, 16, "dspi_dotprod_off_u8 128x128"},
    };
    const int shift = 2;
    const uint8_t offset = 7;
    const size_t max_N = 16384;

    // 16-byte aligned buffers. z is not passed to the functions under test;
    // it is only zeroed and its address printed.
    uint8_t *x = memalign(16, max_N * sizeof *x);
    uint8_t *y = memalign(16, max_N * sizeof *y);
    uint8_t *z = memalign(16, max_N * sizeof *z);
    if (x == NULL || y == NULL || z == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    // Repeating 0..6 pattern in both inputs.
    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        const int img = cases[c].image;
        const int n = cases[c].kernel;
        ESP_LOGI(TAG, "%s", cases[c].label);
        // img x img image that starts 3 elements into the aligned buffer
        // (presumably to exercise an unaligned source - confirm).
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        // n x n kernel
        image2d_t image2 = {y, 1, 1, n, n, n, n};

        uint8_t result = -1;    // sentinel so an unwritten output is noticeable
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_off_u8(&image1, &image2, &result, n, n, shift, offset);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint8_t result_ref = -1;
        dspi_dotprod_off_u8_ansi(&image1, &image2, &result_ref, n, n, shift, offset);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity argument order is (expected, actual, message).
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_off_u8 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,70 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_off_u8_ansi test case.
static const char *TAG = "dspi_dotprod_off_u8_ansi";
|
||||
|
||||
// Checks dspi_dotprod_off_u8_ansi() against fixed expected values for four
// different start positions inside the same pair of input buffers.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_off_u8_ansi functionality", "[dspi]")
{
    const uint8_t check_value1 = 98;
    const uint8_t check_value2 = 106;
    const int shift = 7;
    const uint8_t offset = 11;
    const size_t max_N = 1024;

    uint8_t *x = malloc(max_N * sizeof *x);
    uint8_t *y = malloc(max_N * sizeof *y);
    if (x == NULL || y == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    // Only the first 256 elements are initialised; the pattern is 20..27 repeating.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    uint8_t result = -1;    // sentinel so an unwritten output is noticeable

    // Start at element 0. Unity argument order is (expected, actual).
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 1 = %i", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element further along.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements in; expected to match the first result.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements in; expected to match the second result.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_off_u8_ansi(&image1, &image2, &result, 4, 4, shift, offset);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
}
|
||||
@@ -0,0 +1,106 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_s16 test case.
static const char *TAG = "dspi_dotprod_s16";
|
||||
|
||||
// Checks that dspi_dotprod_s16() returns the same value as the reference
// implementation dspi_dotprod_s16_ansi() for several kernel sizes, and logs
// the cycle count of each dspi_dotprod_s16() call.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_s16_aexx functionality", "[dspi]")
{
    // One sub-test per kernel size; the label is logged verbatim and reused
    // as the failure message so a failing size is identifiable.
    static const struct {
        int kernel;         // width and height of the second image
        const char *label;
    } cases[] = {
        {8,  "dspi_dotprod_s16 8x8"},
        {16, "dspi_dotprod_s16 16x16"},
        {24, "dspi_dotprod_s16 24x24"},
        {32, "dspi_dotprod_s16 32x32"},
    };
    const int shift = 2;
    const size_t max_N = 8192;

    // 16-byte aligned buffers. z is not passed to the functions under test;
    // it is only zeroed and its address printed.
    int16_t *x = memalign(16, max_N * sizeof *x);
    int16_t *y = memalign(16, max_N * sizeof *y);
    int16_t *z = memalign(16, max_N * sizeof *z);
    if (x == NULL || y == NULL || z == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    // Repeating 0..6 pattern in both inputs.
    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        const int n = cases[c].kernel;
        ESP_LOGI(TAG, "%s", cases[c].label);
        // 64x64 image that starts 3 elements into the aligned buffer
        // (presumably to exercise an unaligned source - confirm).
        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64};
        // n x n kernel
        image2d_t image2 = {y, 1, 1, n, n, n, n};

        int16_t result = -1;    // sentinel so an unwritten output is noticeable
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_s16(&image1, &image2, &result, n, n, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        int16_t result_ref = -1;
        dspi_dotprod_s16_ansi(&image1, &image2, &result_ref, n, n, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity argument order is (expected, actual, message).
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_s16 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_s16_ansi test case.
static const char *TAG = "dspi_dotprod_s16_ansi";
|
||||
|
||||
// Checks dspi_dotprod_s16_ansi() against fixed expected values for four
// different start positions inside the same pair of input buffers.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_s16_ansi functionality", "[dspi]")
{
    const int16_t check_value1 = 8321;
    const int16_t check_value2 = 8386;
    const int shift = 7;
    const size_t max_N = 1024;

    int16_t *x = malloc(max_N * sizeof *x);
    int16_t *y = malloc(max_N * sizeof *y);
    if (x == NULL || y == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    // Only the first 256 elements are initialised; the pattern is 255..262 repeating.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    int16_t result = -1;    // sentinel so an unwritten output is noticeable

    // Start at element 0. Unity argument order is (expected, actual).
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 1 = %i", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element further along.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements in; expected to match the first result.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements in; expected to match the second result.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_s16_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
}
|
||||
@@ -0,0 +1,121 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_s8 test case.
static const char *TAG = "dspi_dotprod_s8";
|
||||
|
||||
// Checks that dspi_dotprod_s8() returns the same value as the reference
// implementation dspi_dotprod_s8_ansi() for several image/kernel sizes, and
// logs the cycle count of each dspi_dotprod_s8() call.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_s8_aexx functionality", "[dspi]")
{
    // One sub-test per row; the label is logged verbatim and reused as the
    // failure message so a failing configuration is identifiable.
    static const struct {
        int image;          // stride and size of the first image
        int kernel;         // width and height of the second image
        const char *label;
    } cases[] = {
        {64,  16, "dspi_dotprod_s8 16x16"},
        {64,  32, "dspi_dotprod_s8 32x32"},
        {64,  48, "dspi_dotprod_s8 48x48"},
        {128, 64, "dspi_dotprod_s8 64x64"},
        // NOTE(review): this label names the 128x128 image, the kernel is 16x16.
        {128, 16, "dspi_dotprod_s8 128x128"},
    };
    const int shift = 2;
    const size_t max_N = 16384;

    // 16-byte aligned buffers. z is not passed to the functions under test;
    // it is only zeroed and its address printed.
    int8_t *x = memalign(16, max_N * sizeof *x);
    int8_t *y = memalign(16, max_N * sizeof *y);
    int8_t *z = memalign(16, max_N * sizeof *z);
    if (x == NULL || y == NULL || z == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    // Repeating 0..6 pattern in both inputs.
    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        const int img = cases[c].image;
        const int n = cases[c].kernel;
        ESP_LOGI(TAG, "%s", cases[c].label);
        // img x img image that starts 3 elements into the aligned buffer
        // (presumably to exercise an unaligned source - confirm).
        image2d_t image1 = {&x[3], 1, 1, img, img, img, img};
        // n x n kernel
        image2d_t image2 = {y, 1, 1, n, n, n, n};

        int8_t result = -1;     // sentinel so an unwritten output is noticeable
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_s8(&image1, &image2, &result, n, n, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        int8_t result_ref = -1;
        dspi_dotprod_s8_ansi(&image1, &image2, &result_ref, n, n, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity argument order is (expected, actual, message).
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_s8 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_s8_ansi test case.
// It was an empty string, which left this file's log lines without a tag.
static const char *TAG = "dspi_dotprod_s8_ansi";
|
||||
|
||||
// Checks dspi_dotprod_s8_ansi() against fixed expected values for four
// different start positions inside the same pair of input buffers.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_s8_ansi functionality", "[dspi]")
{
    const int8_t check_value1 = 67;
    const int8_t check_value2 = 73;
    const int shift = 7;
    const size_t max_N = 1024;

    int8_t *x = malloc(max_N * sizeof *x);
    int8_t *y = malloc(max_N * sizeof *y);
    if (x == NULL || y == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    // Only the first 256 elements are initialised; the pattern is 20..27 repeating.
    for (size_t i = 0; i < 256; i++) {
        x[i] = i % 8 + 20;
        y[i] = i % 8 + 20;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8
    int8_t result = -1;     // sentinel so an unwritten output is noticeable

    // Start at element 0. Unity argument order is (expected, actual).
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 1 = %i", result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start one element further along.
    image1.data = &x[1];
    image2.data = &y[1];
    result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 2 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    // Start stride_x elements in; expected to match the first result.
    image1.data = &x[image1.stride_x];
    image2.data = &y[image2.stride_x];
    result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 3 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value1, result);

    // Start stride_x + 1 elements in; expected to match the second result.
    image1.data = &x[image1.stride_x + 1];
    image2.data = &y[image2.stride_x + 1];
    result = -1;
    dspi_dotprod_s8_ansi(&image1, &image2, &result, 4, 4, shift);
    ESP_LOGI(TAG, "result 4 = %i", (int)result);
    TEST_ASSERT_EQUAL(check_value2, result);

    free(x);
    free(y);
}
|
||||
@@ -0,0 +1,106 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
#include <malloc.h>
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
// Log tag passed to every ESP_LOGI() call in the dspi_dotprod_u16 test case.
static const char *TAG = "dspi_dotprod_u16";
|
||||
|
||||
// Checks that dspi_dotprod_u16() returns the same value as the reference
// implementation dspi_dotprod_u16_ansi() for several kernel sizes, and logs
// the cycle count of each dspi_dotprod_u16() call.
//
// NOTE(review): the image2d_t initialisers are assumed to follow the order
// {data, step_x, step_y, stride_x, stride_y, size_x, size_y} - confirm against
// dspi_dotprod.h.
TEST_CASE("dspi_dotprod_u16_aexx functionality", "[dspi]")
{
    // One sub-test per kernel size; the label is logged verbatim and reused
    // as the failure message so a failing size is identifiable.
    static const struct {
        int kernel;         // width and height of the second image
        const char *label;
    } cases[] = {
        {8,  "dspi_dotprod_u16 8x8"},
        {16, "dspi_dotprod_u16 16x16"},
        {24, "dspi_dotprod_u16 24x24"},
        {32, "dspi_dotprod_u16 32x32"},
    };
    const int shift = 2;
    const size_t max_N = 8192;

    // 16-byte aligned buffers. z is not passed to the functions under test;
    // it is only zeroed and its address printed.
    uint16_t *x = memalign(16, max_N * sizeof *x);
    uint16_t *y = memalign(16, max_N * sizeof *y);
    uint16_t *z = memalign(16, max_N * sizeof *z);
    if (x == NULL || y == NULL || z == NULL) {
        // Release whatever was obtained before failing: TEST_FAIL_MESSAGE does not return.
        free(x);
        free(y);
        free(z);
        TEST_FAIL_MESSAGE("memory allocation failed");
    }

    printf("Data: x=%8.8"PRIx32", y=%8.8"PRIx32", z=%8.8"PRIx32" \n", (uint32_t)x, (uint32_t)y, (uint32_t)z);
    // Repeating 0..6 pattern in both inputs.
    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 7;
        y[i] = i % 7;
        z[i] = 0;
    }

    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        const int n = cases[c].kernel;
        ESP_LOGI(TAG, "%s", cases[c].label);
        // 64x64 image that starts 3 elements into the aligned buffer
        // (presumably to exercise an unaligned source - confirm).
        image2d_t image1 = {&x[3], 1, 1, 64, 64, 64, 64};
        // n x n kernel
        image2d_t image2 = {y, 1, 1, n, n, n, n};

        uint16_t result = -1;   // sentinel so an unwritten output is noticeable
        unsigned int start_b = dsp_get_cpu_cycle_count();
        dspi_dotprod_u16(&image1, &image2, &result, n, n, shift);
        unsigned int end_b = dsp_get_cpu_cycle_count();
        ESP_LOGI(TAG, "cycles = %u", end_b - start_b);
        ESP_LOGI(TAG, "result 1 = %i", result);

        uint16_t result_ref = -1;
        dspi_dotprod_u16_ansi(&image1, &image2, &result_ref, n, n, shift);
        ESP_LOGI(TAG, "result ref = %i", result_ref);
        // Unity argument order is (expected, actual, message).
        TEST_ASSERT_EQUAL_MESSAGE(result_ref, result, cases[c].label);
    }

    ESP_LOGI(TAG, "dspi_dotprod_u16 done");
    free(x);
    free(y);
    free(z);
}
|
||||
@@ -0,0 +1,68 @@
|
||||
// Copyright 2018-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h>
|
||||
#include "unity.h"
|
||||
#include "dsp_platform.h"
|
||||
#include "esp_log.h"
|
||||
|
||||
#include "dspi_dotprod.h"
|
||||
#include "dsp_tests.h"
|
||||
|
||||
static const char *TAG = "dspi_dotprod_u16_ansi";
|
||||
|
||||
/**
 * Verifies dspi_dotprod_u16_ansi() against precomputed expected values.
 *
 * Both inputs hold the repeating pattern 255..262 and are viewed as 8x8
 * images with a step of 2 in each direction. A 4x4 dot product with
 * shift 7 is evaluated at four start offsets into the buffers
 * (0, 1, stride_x and stride_x + 1) and each result is compared with the
 * expected constant.
 */
TEST_CASE("dspi_dotprod_u16_ansi functionality", "[dspi]")
{
    const uint16_t check_value1 = 8321;
    const uint16_t check_value2 = 8386;
    const int shift = 7;
    const size_t max_N = 1024;

    uint16_t *x = malloc(max_N * sizeof(*x));
    uint16_t *y = malloc(max_N * sizeof(*y));
    if (x == NULL || y == NULL) {
        // Release whatever was allocated before failing the test.
        free(x);
        free(y);
        TEST_FAIL_MESSAGE("failed to allocate test buffers");
    }

    // Fill the whole buffers so no element is ever read uninitialised.
    for (size_t i = 0; i < max_N; i++) {
        x[i] = i % 8 + 255;
        y[i] = i % 8 + 255;
    }

    image2d_t image1 = {x, 2, 2, 8, 8, 8, 8}; // Image 8x8
    image2d_t image2 = {y, 2, 2, 8, 8, 8, 8}; // Image 8x8

    // Start offset into x/y and the value expected for that offset.
    const struct {
        int offset;
        uint16_t expected;
    } cases[] = {
        {0, check_value1},
        {1, check_value2},
        {image1.stride_x, check_value1},
        {image1.stride_x + 1, check_value2},
    };

    for (size_t k = 0; k < sizeof(cases) / sizeof(cases[0]); k++) {
        image1.data = &x[cases[k].offset];
        image2.data = &y[cases[k].offset];

        // Sentinel value, overwritten by the function under test.
        uint16_t result = UINT16_MAX;
        dspi_dotprod_u16_ansi(&image1, &image2, &result, 4, 4, shift);
        ESP_LOGI(TAG, "result %i = %i", (int)(k + 1), (int)result);

        // Unity convention: expected value first, actual value second.
        TEST_ASSERT_EQUAL(cases[k].expected, result);
    }

    free(x);
    free(y);
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user