Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 83 additions & 53 deletions modules/fir/float/dsps_fird_f32_aes3.S
Original file line number Diff line number Diff line change
Expand Up @@ -24,46 +24,40 @@
//esp_err_t dsps_fird_f32_aes3(fir_f32_t* fir, const float* input, float* output, int len);

dsps_fird_f32_aes3:
// fir - a2
// input - a3
// output - a4
// len - a5

// a2 - fir structure
// a3 - input
// a4 - output
// a5 - length

// a6 - fir length
// a7 - position in delay line
// a6 - fir->N - amount of coefficients
// a7 - fir->pos - position in delay line
// a8 - temp
// a10 - coeffs ptr
// a11 - delay line ptr
// a12 - const
// a13 -
// a9 - return value (= length)
// a10 - fir->coeffs - pointer to constant coefficients
// a11 - fir->delay - pointer to delay line
// a12 - constant: -16 (= 0xFFFFFFF0)
// a13 - constant: 15 (= 0x0000000F)
// a14 - temp for loops
// a15 - delay line rounded to 16
// a15 - delay line rounded down to 16

entry a1, 16
// Array increment for floating point data should be 4
l32i a7, a2, 12 // a7 - pos

l32i a6, a2, 8 // a6 - N - amount of coefficients
l32i a10, a2, 0 // a10 - coeffs
l32i a11, a2, 4 // a11 - delay line
addx4 a11, a7, a11 // a11 = a11 + a7*4
l32i a6, a2, 8 // a6 - N

mov.n a9, a5
movi.n a12, 3
mov.n a9, a5

movi.n a12, -16
movi.n a13, 15
// Main loop for input samples
.fird_loop_len:
// Store K values from input to delay line:

l32i a14, a2, 16 // a14 - decimation
l32i a14, a2, 16 // a14 - fir->decim = K (decimation factor)
loopnez a14, .fird_load_data // K loops
// Store to delay line
lsip f15, a3, 4 // a3 += 4, f15 = input[n]
Expand All @@ -79,16 +73,16 @@ dsps_fird_f32_aes3:
//
// Process data
//
// Load rounded delay line address

l32i a10, a2, 0 // a10 - coeffs

// Clear f4, f5 for multiplications
// Clear f4, f5, f6, f7 for multiplications
const.s f4, 0
const.s f5, 0
const.s f6, 0
const.s f7, 0

// Branch according to the current position (modulo 4) in delay line
and a8, a11, a13 // a8 = a11 & 15
beqz a8, .offset_0
addi a8, a8, -4
Expand All @@ -101,123 +95,158 @@ dsps_fird_f32_aes3:
// a10 - coeffs
// a11 - delay line
.offset_0:
// a14 = (N - pos) / 4
sub a14, a6, a7 // a14 = N-pos
srli a14, a14, 2
loopnez a14, .first_fir_loop_0 // pos...N-1
loopnez a14, .first_fir_loop_0 // d in [pos,N[ (stride of 4) ; c in [0,N-pos[ (stride of 4)
EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f4, f0, f8
madd.s f5, f1, f9
madd.s f4, f0, f8 // f4 += coeffs[c] * delay[d]
madd.s f5, f1, f9 // f5 += coeffs[c+1] * delay[d+1]
madd.s f6, f2, f10
madd.s f7, f3, f11
.first_fir_loop_0:

l32i a15, a2, 4 // a11 - delay line [0]
// Reset delay line pointer
l32i a15, a2, 4 // a15 - delay line [0]

// a14 = pos / 4
srli a14, a7, 2
loopnez a14, .second_fir_loop_0 // 0..pos
loopnez a14, .second_fir_loop_0 // i in [0,pos[ (stride of 4) ; c in [N-pos,N[ (stride of 4)
EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f4, f0, f8
madd.s f5, f1, f9
madd.s f4, f0, f8 // f4 += coeffs[c] * delay[d]
madd.s f5, f1, f9 // f5 += coeffs[c+1] * delay[d+1]
madd.s f6, f2, f10
madd.s f7, f3, f11
.second_fir_loop_0:
j .store_fir_result;

.offset_1:
// a14 = (N - pos + 3) / 4
sub a14, a6, a7 // a14 = N-pos
addi a14, a14, 3
srli a14, a14, 2

const.s f3, 0 // f3 = 0
EE.LDF.128.IP f11, f10, f9, f12, a15, 16 // Load data from delay line

// f12 - delay[N-1], store for the last operation
// f9..f11 - delay[0..2]
loopnez a14, .first_fir_loop_1 // pos...N-1
madd.s f4, f3, f8 // multiplies f8 by 0 in the first iteration

EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
madd.s f4, f0, f9
madd.s f5, f1, f10
madd.s f6, f2, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f7, f3, f8
madd.s f5, f0, f9
madd.s f6, f1, f10
madd.s f7, f2, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line (out of bounds access on last iteration! - delay[N], delay[N+1], delay[N+2], delay[N+3])
.first_fir_loop_1:

l32i a15, a2, 4 // a11 - delay line [0]
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
// Reset delay line pointer
l32i a15, a2, 4 // a15 - delay line [0]

// a14 = pos / 4
srli a14, a7, 2
loopnez a14, .second_fir_loop_1 // 0..pos
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f4, f3, f8

EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
madd.s f5, f0, f9
madd.s f6, f1, f10
madd.s f7, f2, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
.second_fir_loop_1:

// Both loops together evaluate to N/4 iterations (N madds)

madd.s f4, f3, f12
j .store_fir_result;

.offset_2:
// a14 = (N - pos + 3) / 4
sub a14, a6, a7 // a14 = N-pos
addi a14, a14, 3
srli a14, a14, 2

const.s f2, 0 // f2 = 0
const.s f3, 0 // f3 = 0
EE.LDF.128.IP f11, f10, f13, f12, a15, 16 // Load data from delay line

// f12, f13 - delay[N-1], delay[N-2], store for the last operation
// f10..f11 - delay[0..1]
loopnez a14, .first_fir_loop_2 // pos...N-1
madd.s f4, f2, f8 // multiplies f8 by 0 in the first iteration
madd.s f5, f3, f9 // multiplies f9 by 0 in the first iteration

EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
madd.s f4, f0, f10
madd.s f5, f1, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f6, f2, f8
madd.s f7, f3, f9
madd.s f6, f0, f10
madd.s f7, f1, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line (out of bounds access on last iteration! - delay[N], delay[N+1], delay[N+2], delay[N+3])
.first_fir_loop_2:

l32i a15, a2, 4 // a11 - delay line [0]
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
// Reset delay line pointer
l32i a15, a2, 4 // a11 - delay line [0]

srli a14, a7, 2
loopnez a14, .second_fir_loop_2 // 0..pos
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f4, f2, f8
madd.s f5, f3, f9

EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
madd.s f6, f0, f10
madd.s f7, f1, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
.second_fir_loop_2:

// Both loops together evaluate to N/4 iterations (N madds)

madd.s f4, f2, f12
madd.s f5, f3, f13
j .store_fir_result;

.offset_3:
// a14 = (N - pos + 3) / 4
sub a14, a6, a7 // a14 = N-pos
addi a14, a14, 3
srli a14, a14, 2

const.s f1, 0 // f1 = 0
const.s f2, 0 // f2 = 0
const.s f3, 0 // f3 = 0
EE.LDF.128.IP f11, f14, f13, f12, a15, 16 // Load data from delay line

// f12, f13, f14 - delay[N-1], delay[N-2], delay[N-3], store for the last operation
// f11 - delay[0]
loopnez a14, .first_fir_loop_3 // pos...N-1
madd.s f4, f1, f8
madd.s f5, f2, f9
madd.s f6, f3, f10

EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
madd.s f4, f0, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f5, f1, f8
madd.s f6, f2, f9
madd.s f7, f3, f10
madd.s f7, f0, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line (out of bounds access on last iteration! - delay[N], delay[N+1], delay[N+2], delay[N+3])
.first_fir_loop_3:

l32i a15, a2, 4 // a11 - delay line [0]
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
// Reset delay line pointer
l32i a15, a2, 4 // a11 - delay line [0]

srli a14, a7, 2
loopnez a14, .second_fir_loop_3 // 0..pos
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
madd.s f4, f1, f8
madd.s f5, f2, f9
madd.s f6, f3, f10

EE.LDF.128.IP f3, f2, f1, f0, a10, 16 // Load coeffs
madd.s f7, f0, f11
EE.LDF.128.IP f11, f10, f9, f8, a15, 16 // Load data from delay line
.second_fir_loop_3:

// Both loops together evaluate to N/4 iterations (N madds)

madd.s f4, f1, f12
madd.s f5, f2, f13
madd.s f4, f3, f14
madd.s f6, f3, f14

.store_fir_result:

Expand All @@ -227,13 +256,14 @@ dsps_fird_f32_aes3:

// Store result
ssip f4, a4, 4 // y++ - save result and increment output pointer
// Check loop length

// Check loop break condition
addi a5, a5, -1
bnez a5, .fird_loop_len
// store state

// Store state
s32i a7, a2, 12 // pos = a7
mov.n a2, a9
retw.n

#endif // dsps_fir_f32_aes3_enabled
#endif // dsps_fir_f32_aes3_enabled