fork FLAC__bitreader_read_rice_signed_block() into MSVC and non-MSVC versions; minor optimizations to non-MSVC version
diff --git a/src/libFLAC/bitreader.c b/src/libFLAC/bitreader.c
index 47aa403..93f7fbb 100644
--- a/src/libFLAC/bitreader.c
+++ b/src/libFLAC/bitreader.c
@@ -48,14 +48,13 @@
#include "FLAC/assert.h"
/* Things should be fastest when this matches the machine word size */
-/* WATCHOUT: if you change this you must also change the following #defines down to ALIGNED_UNARY_BITS below to match */
+/* WATCHOUT: if you change this you must also change the following #defines down to COUNT_ZERO_MSBS below to match */
/* WATCHOUT: there are a few places where the code will not work unless brword is >= 32 bits wide */
/* also, some sections currently only have fast versions for 4 or 8 bytes per word */
typedef FLAC__uint32 brword;
#define FLAC__BYTES_PER_WORD 4
#define FLAC__BITS_PER_WORD 32
#define FLAC__WORD_ALL_ONES ((FLAC__uint32)0xffffffff)
-#define FLAC__WORD_TOP_BIT_ONE ((FLAC__uint32)0x80000000)
/* SWAP_BE_WORD_TO_HOST swaps bytes in a brword (which is always big-endian) if necessary to match host byte order */
#if WORDS_BIGENDIAN
#define SWAP_BE_WORD_TO_HOST(x) (x)
@@ -67,13 +66,13 @@
#endif
#endif
/* counts the # of zero MSBs in a word */
-#define ALIGNED_UNARY_BITS(word) ( \
+#define COUNT_ZERO_MSBS(word) ( \
(word) <= 0xffff ? \
( (word) <= 0xff? byte_to_unary_table[word] + 24 : byte_to_unary_table[(word) >> 8] + 16 ) : \
( (word) <= 0xffffff? byte_to_unary_table[word >> 16] + 8 : byte_to_unary_table[(word) >> 24] ) \
)
/* this alternate might be slightly faster on some systems/compilers: */
-#define ALIGNED_UNARY_BITS2(word) ( (word) <= 0xff ? byte_to_unary_table[word] + 24 : ((word) <= 0xffff ? byte_to_unary_table[(word) >> 8] + 16 : ((word) <= 0xffffff ? byte_to_unary_table[(word) >> 16] + 8 : byte_to_unary_table[(word) >> 24])) )
+#define COUNT_ZERO_MSBS2(word) ( (word) <= 0xff ? byte_to_unary_table[word] + 24 : ((word) <= 0xffff ? byte_to_unary_table[(word) >> 8] + 16 : ((word) <= 0xffffff ? byte_to_unary_table[(word) >> 16] + 8 : byte_to_unary_table[(word) >> 24])) )
/*
@@ -724,12 +723,7 @@
while(br->consumed_words < br->words) { /* if we've not consumed up to a partial tail word... */
brword b = br->buffer[br->consumed_words] << br->consumed_bits;
if(b) {
-#if 0 /* too slow, but this is the idea: */
- for(i = 0; !(b & FLAC__WORD_TOP_BIT_ONE); i++)
- b <<= 1;
-#else
- i = ALIGNED_UNARY_BITS(b);
-#endif
+ i = COUNT_ZERO_MSBS(b);
*val += i;
i++;
br->consumed_bits += i;
@@ -759,12 +753,7 @@
const unsigned end = br->bytes * 8;
brword b = (br->buffer[br->consumed_words] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << br->consumed_bits;
if(b) {
-#if 0 /* too slow, but this is the idea: */
- for(i = 0; !(b & FLAC__WORD_TOP_BIT_ONE); i++)
- b <<= 1;
-#else
- i = ALIGNED_UNARY_BITS(b);
-#endif
+ i = COUNT_ZERO_MSBS(b);
*val += i;
i++;
br->consumed_bits += i;
@@ -814,6 +803,189 @@
/* this is by far the most heavily used reader call. it ain't pretty but it's fast */
/* a lot of the logic is copied, then adapted, from FLAC__bitreader_read_unary_unsigned() and FLAC__bitreader_read_raw_uint32() */
FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
+#ifdef _MSC_VER
+{
+	unsigned i;
+	unsigned uval = 0;
+	unsigned bits; /* the # of binary LSBs left to read to finish a rice codeword */
+
+	/* try and get br->consumed_words and br->consumed_bits into register;
+	 * must remember to flush them back to *br before calling other
+	 * bitreader functions that use them, and before returning */
+	register unsigned cwords;
+	register unsigned cbits;
+
+	FLAC__ASSERT(0 != br);
+	FLAC__ASSERT(0 != br->buffer);
+	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
+	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
+	FLAC__ASSERT(parameter < 32);
+	/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
+
+	if(nvals == 0)
+		return true;
+
+	cbits = br->consumed_bits;
+	cwords = br->consumed_words;
+
+	while(1) {
+		/* each pass decodes one codeword into uval: unary count first, then 'parameter' binary LSBs are shifted in */
+		/* read unary part */
+		while(1) {
+			while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+				brword b = br->buffer[cwords] << cbits;
+				if(b) {
+#if 0 /* slower, probably due to bad register allocation... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32
+					__asm {
+						bsr eax, b
+						not eax
+						and eax, 31
+						mov i, eax
+					}
+#else
+					i = COUNT_ZERO_MSBS(b);
+#endif
+					uval += i;
+					bits = parameter;
+					i++;
+					cbits += i;
+					if(cbits == FLAC__BITS_PER_WORD) {
+						crc16_update_word_(br, br->buffer[cwords]);
+						cwords++;
+						cbits = 0;
+					}
+					goto break1;
+				}
+				else {
+					uval += FLAC__BITS_PER_WORD - cbits;
+					crc16_update_word_(br, br->buffer[cwords]);
+					cwords++;
+					cbits = 0;
+					/* didn't find stop bit yet, have to keep going... */
+				}
+			}
+			/* at this point we've eaten up all the whole words; have to try
+			 * reading through any tail bytes before calling the read callback.
+			 * this is a repeat of the above logic adjusted for the fact we
+			 * don't have a whole word. note though if the client is feeding
+			 * us data a byte at a time (unlikely), br->consumed_bits may not
+			 * be zero.
+			 */
+			if(br->bytes) {
+				const unsigned end = br->bytes * 8;
+				brword b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << cbits;
+				if(b) {
+					i = COUNT_ZERO_MSBS(b);
+					uval += i;
+					bits = parameter;
+					i++;
+					cbits += i;
+					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
+					goto break1;
+				}
+				else {
+					uval += end - cbits;
+					cbits = end; /* 'end' is an absolute bit position; cbits may already be nonzero here, so assign rather than add */
+					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
+					/* didn't find stop bit yet, have to keep going... */
+				}
+			}
+			/* flush registers and read; bitreader_read_from_client_() does
+			 * not touch br->consumed_bits at all but we still need to set
+			 * it in case it fails and we have to return false.
+			 */
+			br->consumed_bits = cbits;
+			br->consumed_words = cwords;
+			if(!bitreader_read_from_client_(br))
+				return false;
+			cwords = br->consumed_words;
+		}
+break1:
+		/* read binary part */
+		FLAC__ASSERT(cwords <= br->words);
+
+		if(bits) {
+			while((br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits < bits) {
+				/* flush registers and read; bitreader_read_from_client_() does
+				 * not touch br->consumed_bits at all but we still need to set
+				 * it in case it fails and we have to return false.
+				 */
+				br->consumed_bits = cbits;
+				br->consumed_words = cwords;
+				if(!bitreader_read_from_client_(br))
+					return false;
+				cwords = br->consumed_words;
+			}
+			if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
+				if(cbits) {
+					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+					const unsigned n = FLAC__BITS_PER_WORD - cbits;
+					const brword word = br->buffer[cwords];
+					if(bits < n) {
+						uval <<= bits;
+						uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-bits);
+						cbits += bits;
+						goto break2;
+					}
+					uval <<= n;
+					uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
+					bits -= n;
+					crc16_update_word_(br, word);
+					cwords++;
+					cbits = 0;
+					if(bits) { /* if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
+						uval <<= bits;
+						uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits));
+						cbits = bits;
+					}
+					goto break2;
+				}
+				else {
+					FLAC__ASSERT(bits < FLAC__BITS_PER_WORD);
+					uval <<= bits;
+					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
+					cbits = bits;
+					goto break2;
+				}
+			}
+			else {
+				/* in this case we're starting our read at a partial tail word;
+				 * the reader has guaranteed that we have at least 'bits' bits
+				 * available to read, which makes this case simpler.
+				 */
+				uval <<= bits;
+				if(cbits) {
+					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+					FLAC__ASSERT(cbits + bits <= br->bytes*8);
+					uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-bits);
+					cbits += bits;
+					goto break2;
+				}
+				else {
+					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
+					cbits += bits;
+					goto break2;
+				}
+			}
+		}
+break2:
+		/* compose the value: undo the sign folding (even uval -> uval/2, odd uval -> -(uval/2)-1) */
+		*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+
+		/* are we done? */
+		--nvals;
+		if(nvals == 0) {
+			br->consumed_bits = cbits;
+			br->consumed_words = cwords;
+			return true;
+		}
+
+		uval = 0;
+		++vals;
+
+	}
+}
+#else
{
unsigned i;
unsigned uval = 0;
@@ -846,11 +1018,16 @@
while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
brword b = br->buffer[cwords] << cbits;
if(b) {
-#if 0 /* too slow, but this is the idea: */
- for(i = 0; !(b & FLAC__WORD_TOP_BIT_ONE); i++)
- b <<= 1;
+#if 0 /* is not discernably faster... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32 && defined __GNUC__
+ asm volatile (
+ "bsrl %1, %0;"
+ "notl %0;"
+ "andl $31, %0;"
+ : "=r"(i)
+ : "r"(b)
+ );
#else
- i = ALIGNED_UNARY_BITS(b);
+ i = COUNT_ZERO_MSBS(b);
#endif
uval += i;
cbits += i;
@@ -881,12 +1058,7 @@
const unsigned end = br->bytes * 8;
brword b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
if(b) {
-#if 0 /* too slow, but this is the idea: */
- for(i = 0; !(b & FLAC__WORD_TOP_BIT_ONE); i++)
- b <<= 1;
-#else
- i = ALIGNED_UNARY_BITS(b);
-#endif
+ i = COUNT_ZERO_MSBS(b);
uval += i;
cbits += i;
cbits++; /* skip over stop bit */
@@ -937,31 +1109,30 @@
}
if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
if(cbits) {
- /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
+ /* this also works when consumed_bits==0, it's just slower than necessary for that case */
const unsigned n = FLAC__BITS_PER_WORD - cbits;
const brword word = br->buffer[cwords];
if(parameter < n) {
uval <<= parameter;
uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
cbits += parameter;
- goto break2;
}
- uval <<= n;
- uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
- crc16_update_word_(br, word);
- cwords++;
- cbits = parameter - n;
- if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
- uval <<= cbits;
- uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
+ else {
+ uval <<= n;
+ uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
+ crc16_update_word_(br, word);
+ cwords++;
+ cbits = parameter - n;
+ if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
+ uval <<= cbits;
+ uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
+ }
}
- goto break2;
}
else {
cbits = parameter;
uval <<= parameter;
uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
- goto break2;
}
}
else {
@@ -975,16 +1146,14 @@
FLAC__ASSERT(cbits + parameter <= br->bytes*8);
uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
cbits += parameter;
- goto break2;
}
else {
cbits = parameter;
uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
- goto break2;
}
}
}
-break2:
+
ucbits -= parameter;
/* compose the value */
@@ -1003,6 +1172,7 @@
}
}
+#endif
#if 0 /* UNUSED */
FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter)