| /* |
| * Copyright (C) 2013, The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LATINIME_BYTE_ARRAY_UTILS_H |
| #define LATINIME_BYTE_ARRAY_UTILS_H |
| |
| #include <cstdint> |
| |
| #include "defines.h" |
| |
| namespace latinime { |
| |
| /** |
| * Utility methods for reading byte arrays. |
| */ |
| class ByteArrayUtils { |
| public: |
| /** |
| * Integer writing |
| * |
| * Each method write a corresponding size integer in a big endian manner. |
| */ |
| static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, |
| const uint32_t data, const int size, int *const pos) { |
| // size must be in 1 to 4. |
| ASSERT(size >= 1 && size <= 4); |
| switch (size) { |
| case 1: |
| ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); |
| return; |
| case 2: |
| ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); |
| return; |
| case 3: |
| ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); |
| return; |
| case 4: |
| ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); |
| return; |
| default: |
| break; |
| } |
| } |
| |
| /** |
| * Integer reading |
| * |
| * Each method read a corresponding size integer in a big endian manner. |
| */ |
| static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { |
| return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) |
| ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; |
| } |
| |
| static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { |
| return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; |
| } |
| |
| static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { |
| return (buffer[pos] << 8) ^ buffer[pos + 1]; |
| } |
| |
| static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { |
| return buffer[pos]; |
| } |
| |
| static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( |
| const uint8_t *const buffer, int *const pos) { |
| const uint32_t value = readUint32(buffer, *pos); |
| *pos += 4; |
| return value; |
| } |
| |
| static AK_FORCE_INLINE int readSint24AndAdvancePosition( |
| const uint8_t *const buffer, int *const pos) { |
| const uint8_t value = readUint8(buffer, *pos); |
| if (value < 0x80) { |
| return readUint24AndAdvancePosition(buffer, pos); |
| } else { |
| (*pos)++; |
| return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); |
| } |
| } |
| |
| static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( |
| const uint8_t *const buffer, int *const pos) { |
| const uint32_t value = readUint24(buffer, *pos); |
| *pos += 3; |
| return value; |
| } |
| |
| static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( |
| const uint8_t *const buffer, int *const pos) { |
| const uint16_t value = readUint16(buffer, *pos); |
| *pos += 2; |
| return value; |
| } |
| |
| static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( |
| const uint8_t *const buffer, int *const pos) { |
| return buffer[(*pos)++]; |
| } |
| |
| static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, |
| const int size, const int pos) { |
| // size must be in 1 to 4. |
| ASSERT(size >= 1 && size <= 4); |
| switch (size) { |
| case 1: |
| return ByteArrayUtils::readUint8(buffer, pos); |
| case 2: |
| return ByteArrayUtils::readUint16(buffer, pos); |
| case 3: |
| return ByteArrayUtils::readUint24(buffer, pos); |
| case 4: |
| return ByteArrayUtils::readUint32(buffer, pos); |
| default: |
| return 0; |
| } |
| } |
| |
| /** |
| * Code Point Reading |
| * |
| * 1 byte = bbbbbbbb match |
| * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte |
| * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because |
| * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with |
| * 00011111 would be outside unicode. |
| * else: iso-latin-1 code |
| * This allows for the whole unicode range to be encoded, including chars outside of |
| * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control |
| * characters which should never happen anyway (and still work, but take 3 bytes). |
| */ |
| static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { |
| int p = pos; |
| return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); |
| } |
| |
| static AK_FORCE_INLINE int readCodePointAndAdvancePosition( |
| const uint8_t *const buffer, const int *const codePointTable, int *const pos) { |
| /* |
| * codePointTable is an array to convert the most frequent characters in this dictionary to |
| * 1 byte code points. It is only made of the original code points of the most frequent |
| * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. |
| * The original code points are restored by picking the code points at the indices of the |
| * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. |
| */ |
| const uint8_t firstByte = readUint8(buffer, *pos); |
| if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { |
| if (firstByte == CHARACTER_ARRAY_TERMINATOR) { |
| *pos += 1; |
| return NOT_A_CODE_POINT; |
| } else { |
| return readUint24AndAdvancePosition(buffer, pos); |
| } |
| } else { |
| *pos += 1; |
| if (codePointTable) { |
| return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; |
| } |
| return firstByte; |
| } |
| } |
| |
| /** |
| * String (array of code points) Reading |
| * |
| * Reads code points until the terminator is found. |
| */ |
| // Returns the length of the string. |
| static int readStringAndAdvancePosition(const uint8_t *const buffer, |
| const int maxLength, const int *const codePointTable, int *const outBuffer, |
| int *const pos) { |
| int length = 0; |
| int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); |
| while (NOT_A_CODE_POINT != codePoint && length < maxLength) { |
| outBuffer[length++] = codePoint; |
| codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); |
| } |
| return length; |
| } |
| |
| // Advances the position and returns the length of the string. |
| static int advancePositionToBehindString( |
| const uint8_t *const buffer, const int maxLength, int *const pos) { |
| int length = 0; |
| int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); |
| while (NOT_A_CODE_POINT != codePoint && length < maxLength) { |
| codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); |
| length++; |
| } |
| return length; |
| } |
| |
| /** |
| * String (array of code points) Writing |
| */ |
| static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, |
| const int *const codePoints, const int codePointCount, const bool writesTerminator, |
| int *const pos) { |
| for (int i = 0; i < codePointCount; ++i) { |
| const int codePoint = codePoints[i]; |
| if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { |
| break; |
| } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE |
| || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { |
| // three bytes character. |
| writeUint24AndAdvancePosition(buffer, codePoint, pos); |
| } else { |
| // one byte character. |
| writeUint8AndAdvancePosition(buffer, codePoint, pos); |
| } |
| } |
| if (writesTerminator) { |
| writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); |
| } |
| } |
| |
| static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, |
| const int codePointCount, const bool writesTerminator) { |
| int byteCount = 0; |
| for (int i = 0; i < codePointCount; ++i) { |
| const int codePoint = codePoints[i]; |
| if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { |
| break; |
| } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE |
| || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { |
| // three bytes character. |
| byteCount += 3; |
| } else { |
| // one byte character. |
| byteCount += 1; |
| } |
| } |
| if (writesTerminator) { |
| // The terminator is one byte. |
| byteCount += 1; |
| } |
| return byteCount; |
| } |
| |
| private: |
| DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); |
| |
| static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; |
| static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; |
| static const uint8_t CHARACTER_ARRAY_TERMINATOR; |
| |
| static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, |
| const uint32_t data, int *const pos) { |
| buffer[(*pos)++] = (data >> 24) & 0xFF; |
| buffer[(*pos)++] = (data >> 16) & 0xFF; |
| buffer[(*pos)++] = (data >> 8) & 0xFF; |
| buffer[(*pos)++] = data & 0xFF; |
| } |
| |
| static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, |
| const uint32_t data, int *const pos) { |
| buffer[(*pos)++] = (data >> 16) & 0xFF; |
| buffer[(*pos)++] = (data >> 8) & 0xFF; |
| buffer[(*pos)++] = data & 0xFF; |
| } |
| |
| static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, |
| const uint16_t data, int *const pos) { |
| buffer[(*pos)++] = (data >> 8) & 0xFF; |
| buffer[(*pos)++] = data & 0xFF; |
| } |
| |
| static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, |
| const uint8_t data, int *const pos) { |
| buffer[(*pos)++] = data & 0xFF; |
| } |
| }; |
| } // namespace latinime |
| #endif /* LATINIME_BYTE_ARRAY_UTILS_H */ |