Fredrik Roubert | 0596fae | 2017-04-18 21:34:02 +0200 | [diff] [blame] | 1 | // © 2016 and later: Unicode, Inc. and others. |
Fredrik Roubert | 64339d3 | 2016-10-21 19:43:16 +0200 | [diff] [blame] | 2 | // License & terms of use: http://www.unicode.org/copyright.html |
Claire Ho | 85bf2e2 | 2009-11-24 14:23:02 -0800 | [diff] [blame] | 3 | /* |
| 4 | ****************************************************************************** |
| 5 | * |
| 6 | * Copyright (C) 2001-2008, International Business Machines |
| 7 | * Corporation and others. All Rights Reserved. |
| 8 | * |
| 9 | ****************************************************************************** |
| 10 | * file name: utrie2_impl.h |
Fredrik Roubert | 0596fae | 2017-04-18 21:34:02 +0200 | [diff] [blame] | 11 | * encoding: UTF-8 |
Claire Ho | 85bf2e2 | 2009-11-24 14:23:02 -0800 | [diff] [blame] | 12 | * tab size: 8 (not used) |
| 13 | * indentation:4 |
| 14 | * |
| 15 | * created on: 2008sep26 (split off from utrie2.c) |
| 16 | * created by: Markus W. Scherer |
| 17 | * |
| 18 | * Definitions needed for both runtime and builder code for UTrie2, |
| 19 | * used by utrie2.c and utrie2_builder.c. |
| 20 | */ |
| 21 | |
| 22 | #ifndef __UTRIE2_IMPL_H__ |
| 23 | #define __UTRIE2_IMPL_H__ |
| 24 | |
Victor Chang | 4578a1c | 2018-10-22 04:26:58 +0100 | [diff] [blame] | 25 | #ifdef UCPTRIE_DEBUG |
| 26 | #include "unicode/umutablecptrie.h" |
| 27 | #endif |
Claire Ho | 85bf2e2 | 2009-11-24 14:23:02 -0800 | [diff] [blame] | 28 | #include "utrie2.h" |
| 29 | |
| 30 | /* Public UTrie2 API implementation ----------------------------------------- */ |
| 31 | |
| 32 | /* |
Victor Chang | 4578a1c | 2018-10-22 04:26:58 +0100 | [diff] [blame] | 33 | * These definitions are mostly needed by utrie2.cpp, |
Claire Ho | 85bf2e2 | 2009-11-24 14:23:02 -0800 | [diff] [blame] | 34 | * but also by utrie2_serialize() and utrie2_swap(). |
| 35 | */ |
| 36 | |
// Signature values for a UTrie2.
// UTRIE2_SIG is the signature as seen in platform endianness;
// UTRIE2_OE_SIG is the same four bytes as seen in the opposite endianness.
// Byte by byte, 0x54 0x72 0x69 0x32 are the ASCII codes for 'T' 'r' 'i' '2'.
#define UTRIE2_SIG      0x54726932
#define UTRIE2_OE_SIG   0x32697254
| 41 | |
/**
 * Fixed-size header at the start of a serialized UTrie2.
 *
 * Serialized layout:
 *
 *     UTrie2Header header;
 *     uint16_t index[header.indexLength];
 *     uint16_t data[header.shiftedDataLength<<2];  -- or uint32_t data[...]
 *
 * @internal
 */
typedef struct UTrie2Header {
    /** Signature: "Tri2" in big-endian US-ASCII (0x54726932). */
    uint32_t signature;

    /**
     * Options bit field:
     *   bits 15..4  reserved (0)
     *   bits  3..0  UTrie2ValueBits valueBits
     */
    uint16_t options;

    /** Index length, in the range UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH. */
    uint16_t indexLength;

    /**
     * Data length shifted right by UTRIE2_INDEX_SHIFT; the unshifted length is
     * in the range UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH.
     */
    uint16_t shiftedDataLength;

    /** Offset of the null index-2 block, not shifted. */
    uint16_t index2NullOffset;

    /** Offset of the null data block, not shifted. */
    uint16_t dataNullOffset;

    /**
     * First code point of the single-value range that ends with U+10ffff,
     * rounded up and then shifted right by UTRIE2_SHIFT_1.
     */
    uint16_t shiftedHighStart;
} UTrie2Header;
| 76 | |
/**
 * Bit masks for the UTrie2Header.options field.
 * @internal
 */
enum {
    /** Selects the low four bits of options, which hold the UTrie2ValueBits valueBits. */
    UTRIE2_OPTIONS_VALUE_BITS_MASK = 0x000f
};
| 85 | |
| 86 | /* Building a trie ---------------------------------------------------------- */ |
| 87 | |
| 88 | /* |
| 89 | * These definitions are mostly needed by utrie2_builder.c, but also by |
| 90 | * utrie2_get32() and utrie2_enum(). |
| 91 | */ |
| 92 | |
| 93 | enum { |
| 94 | /** |
| 95 | * At build time, leave a gap in the index-2 table, |
| 96 | * at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table |
| 97 | * and the supplementary index-1 table. |
| 98 | * Round up to UTRIE2_INDEX_2_BLOCK_LENGTH for proper compacting. |
| 99 | */ |
| 100 | UNEWTRIE2_INDEX_GAP_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH, |
| 101 | UNEWTRIE2_INDEX_GAP_LENGTH= |
| 102 | ((UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH)+UTRIE2_INDEX_2_MASK)& |
| 103 | ~UTRIE2_INDEX_2_MASK, |
| 104 | |
| 105 | /** |
| 106 | * Maximum length of the build-time index-2 array. |
| 107 | * Maximum number of Unicode code points (0x110000) shifted right by UTRIE2_SHIFT_2, |
| 108 | * plus the part of the index-2 table for lead surrogate code points, |
| 109 | * plus the build-time index gap, |
| 110 | * plus the null index-2 block. |
| 111 | */ |
| 112 | UNEWTRIE2_MAX_INDEX_2_LENGTH= |
| 113 | (0x110000>>UTRIE2_SHIFT_2)+ |
| 114 | UTRIE2_LSCP_INDEX_2_LENGTH+ |
| 115 | UNEWTRIE2_INDEX_GAP_LENGTH+ |
| 116 | UTRIE2_INDEX_2_BLOCK_LENGTH, |
| 117 | |
| 118 | UNEWTRIE2_INDEX_1_LENGTH=0x110000>>UTRIE2_SHIFT_1 |
| 119 | }; |
| 120 | |
/**
 * Upper bound on the length of the build-time data array:
 * - 0x110000 entries, one per code point,
 * - 0x40 + 0x40 entries for the illegal-UTF-8 block and the null block,
 * - 0x400 entries holding values for the surrogate code units.
 */
#define UNEWTRIE2_MAX_DATA_LENGTH (0x110000 + 0x40 + 0x40 + 0x400)
| 127 | |
| 128 | /* |
| 129 | * Build-time trie structure. |
| 130 | * |
| 131 | * Just using a boolean flag for "repeat use" could lead to data array overflow |
| 132 | * because we would not be able to detect when a data block becomes unused. |
| 133 | * It also leads to orphan data blocks that are kept through serialization. |
| 134 | * |
| 135 | * Need to use reference counting for data blocks, |
| 136 | * and allocDataBlock() needs to look for a free block before increasing dataLength. |
| 137 | * |
| 138 | * This scheme seems like overkill for index-2 blocks since the whole index array is |
| 139 | * preallocated anyway (unlike the growable data array). |
| 140 | * Just allocating multiple index-2 blocks as needed. |
| 141 | */ |
| 142 | struct UNewTrie2 { |
| 143 | int32_t index1[UNEWTRIE2_INDEX_1_LENGTH]; |
| 144 | int32_t index2[UNEWTRIE2_MAX_INDEX_2_LENGTH]; |
| 145 | uint32_t *data; |
Victor Chang | 4578a1c | 2018-10-22 04:26:58 +0100 | [diff] [blame] | 146 | #ifdef UCPTRIE_DEBUG |
| 147 | UMutableCPTrie *t3; |
| 148 | #endif |
Claire Ho | 85bf2e2 | 2009-11-24 14:23:02 -0800 | [diff] [blame] | 149 | |
| 150 | uint32_t initialValue, errorValue; |
| 151 | int32_t index2Length, dataCapacity, dataLength; |
| 152 | int32_t firstFreeBlock; |
| 153 | int32_t index2NullOffset, dataNullOffset; |
| 154 | UChar32 highStart; |
| 155 | UBool isCompacted; |
| 156 | |
| 157 | /** |
| 158 | * Multi-purpose per-data-block table. |
| 159 | * |
| 160 | * Before compacting: |
| 161 | * |
| 162 | * Per-data-block reference counters/free-block list. |
| 163 | * 0: unused |
| 164 | * >0: reference counter (number of index-2 entries pointing here) |
| 165 | * <0: next free data block in free-block list |
| 166 | * |
| 167 | * While compacting: |
| 168 | * |
| 169 | * Map of adjusted indexes, used in compactData() and compactIndex2(). |
| 170 | * Maps from original indexes to new ones. |
| 171 | */ |
| 172 | int32_t map[UNEWTRIE2_MAX_DATA_LENGTH>>UTRIE2_SHIFT_2]; |
| 173 | }; |
| 174 | |
| 175 | #endif |