Delaying allocation of the SILK temporary output buffer to reduce peak stack
diff --git a/silk/dec_API.c b/silk/dec_API.c
index 660f93d..1087c67 100644
--- a/silk/dec_API.c
+++ b/silk/dec_API.c
@@ -31,6 +31,7 @@
#include "API.h"
#include "main.h"
#include "stack_alloc.h"
+#include "os_support.h"
/************************/
/* Decoder Super Struct */
@@ -90,7 +91,8 @@
opus_int i, n, decode_only_middle = 0, ret = SILK_NO_ERROR;
opus_int32 nSamplesOutDec, LBRR_symbol;
opus_int16 *samplesOut1_tmp[ 2 ];
- VARDECL( opus_int16, samplesOut1_tmp_storage );
+ VARDECL( opus_int16, samplesOut1_tmp_storage1 );
+ VARDECL( opus_int16, samplesOut1_tmp_storage2 );
VARDECL( opus_int16, samplesOut2_tmp );
opus_int32 MS_pred_Q13[ 2 ] = { 0 };
opus_int16 *resample_out_ptr;
@@ -98,6 +100,7 @@
silk_decoder_state *channel_state = psDec->channel_state;
opus_int has_side;
opus_int stereo_to_mono;
+ int delay_stack_alloc;
SAVE_STACK;
silk_assert( decControl->nChannelsInternal == 1 || decControl->nChannelsInternal == 2 );
@@ -251,13 +254,22 @@
psDec->channel_state[ 1 ].first_frame_after_reset = 1;
}
- ALLOC( samplesOut1_tmp_storage,
- decControl->nChannelsInternal*(
- channel_state[ 0 ].frame_length + 2 ),
+ /* Check if the temp buffer fits into the output PCM buffer. If it fits,
+ we can delay allocating the temp buffer until after the SILK peak stack
+ usage. We need to use a < and not a <= because of the two extra samples. */
+ delay_stack_alloc = decControl->internalSampleRate*decControl->nChannelsInternal
+ < decControl->API_sampleRate*decControl->nChannelsAPI;
+ ALLOC( samplesOut1_tmp_storage1, delay_stack_alloc ? ALLOC_NONE
+ : decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 ),
opus_int16 );
- samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage;
- samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage
- + channel_state[ 0 ].frame_length + 2;
+ if ( delay_stack_alloc )
+ {
+ samplesOut1_tmp[ 0 ] = samplesOut;
+ samplesOut1_tmp[ 1 ] = samplesOut + channel_state[ 0 ].frame_length + 2;
+ } else {
+ samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage1;
+ samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage1 + channel_state[ 0 ].frame_length + 2;
+ }
if( lostFlag == FLAG_DECODE_NORMAL ) {
has_side = !decode_only_middle;
@@ -312,6 +324,15 @@
resample_out_ptr = samplesOut;
}
+ ALLOC( samplesOut1_tmp_storage2, delay_stack_alloc
+ ? decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2 )
+ : ALLOC_NONE,
+ opus_int16 );
+ if ( delay_stack_alloc ) {
+ OPUS_COPY(samplesOut1_tmp_storage2, samplesOut, decControl->nChannelsInternal*(channel_state[ 0 ].frame_length + 2));
+ samplesOut1_tmp[ 0 ] = samplesOut1_tmp_storage2;
+ samplesOut1_tmp[ 1 ] = samplesOut1_tmp_storage2 + channel_state[ 0 ].frame_length + 2;
+ }
for( n = 0; n < silk_min( decControl->nChannelsAPI, decControl->nChannelsInternal ); n++ ) {
/* Resample decoded signal to API_sampleRate */