blob: c526a4f14f2df823aa2e45e340936c0b8a241ae3 [file] [log] [blame]
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationkeys.h
*
* created on: 2012sep02
* created by: Markus W. Scherer
*/
13
14#ifndef __COLLATIONKEYS_H__
15#define __COLLATIONKEYS_H__
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_COLLATION
20
21#include "unicode/bytestream.h"
22#include "unicode/ucol.h"
23#include "charstr.h"
24#include "collation.h"
25
26U_NAMESPACE_BEGIN
27
28class CollationIterator;
29struct CollationDataReader;
30struct CollationSettings;
31
32class SortKeyByteSink : public ByteSink {
33public:
34 SortKeyByteSink(char *dest, int32_t destCapacity)
35 : buffer_(dest), capacity_(destCapacity),
36 appended_(0), ignore_(0) {}
37 virtual ~SortKeyByteSink();
38
39 void IgnoreBytes(int32_t numIgnore) { ignore_ = numIgnore; }
40
41 virtual void Append(const char *bytes, int32_t n);
42 void Append(uint32_t b) {
43 if (ignore_ > 0) {
44 --ignore_;
45 } else {
46 if (appended_ < capacity_ || Resize(1, appended_)) {
47 buffer_[appended_] = (char)b;
48 }
49 ++appended_;
50 }
51 }
52 virtual char *GetAppendBuffer(int32_t min_capacity,
53 int32_t desired_capacity_hint,
54 char *scratch, int32_t scratch_capacity,
55 int32_t *result_capacity);
56 int32_t NumberOfBytesAppended() const { return appended_; }
57
58 /**
59 * @return how many bytes can be appended (including ignored ones)
60 * without reallocation
61 */
62 int32_t GetRemainingCapacity() const {
63 // Either ignore_ or appended_ should be 0.
64 return ignore_ + capacity_ - appended_;
65 }
66
67 UBool Overflowed() const { return appended_ > capacity_; }
Victor Chang978167a2021-01-18 17:56:33 +000068 /** @return false if memory allocation failed */
ccorneliusfceb3982014-04-16 12:27:14 -070069 UBool IsOk() const { return buffer_ != NULL; }
70
71protected:
72 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
73 virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
74
75 void SetNotOk() {
76 buffer_ = NULL;
77 capacity_ = 0;
78 }
79
80 char *buffer_;
81 int32_t capacity_;
82 int32_t appended_;
83 int32_t ignore_;
84
85private:
86 SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
87 SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
88};
89
90class U_I18N_API CollationKeys /* not : public UObject because all methods are static */ {
91public:
92 class LevelCallback : public UMemory {
93 public:
94 virtual ~LevelCallback();
95 /**
96 * @param level The next level about to be written to the ByteSink.
Victor Chang978167a2021-01-18 17:56:33 +000097 * @return true if the level is to be written
98 * (the base class implementation always returns true)
ccorneliusfceb3982014-04-16 12:27:14 -070099 */
100 virtual UBool needToWrite(Collation::Level level);
101 };
102
103 /**
104 * Writes the sort key bytes for minLevel up to the iterator data's strength.
105 * Optionally writes the case level.
Victor Chang978167a2021-01-18 17:56:33 +0000106 * Stops writing levels when callback.needToWrite(level) returns false.
ccorneliusfceb3982014-04-16 12:27:14 -0700107 * Separates levels with the LEVEL_SEPARATOR_BYTE
108 * but does not write a TERMINATOR_BYTE.
109 */
110 static void writeSortKeyUpToQuaternary(CollationIterator &iter,
111 const UBool *compressibleBytes,
112 const CollationSettings &settings,
113 SortKeyByteSink &sink,
114 Collation::Level minLevel, LevelCallback &callback,
115 UBool preflight, UErrorCode &errorCode);
116private:
117 friend struct CollationDataReader;
118
119 CollationKeys(); // no instantiation
120
121 // Secondary level: Compress up to 33 common weights as 05..25 or 25..45.
122 static const uint32_t SEC_COMMON_LOW = Collation::COMMON_BYTE;
123 static const uint32_t SEC_COMMON_MIDDLE = SEC_COMMON_LOW + 0x20;
124 static const uint32_t SEC_COMMON_HIGH = SEC_COMMON_LOW + 0x40;
125 static const int32_t SEC_COMMON_MAX_COUNT = 0x21;
126
127 // Case level, lowerFirst: Compress up to 7 common weights as 1..7 or 7..13.
128 static const uint32_t CASE_LOWER_FIRST_COMMON_LOW = 1;
129 static const uint32_t CASE_LOWER_FIRST_COMMON_MIDDLE = 7;
130 static const uint32_t CASE_LOWER_FIRST_COMMON_HIGH = 13;
131 static const int32_t CASE_LOWER_FIRST_COMMON_MAX_COUNT = 7;
132
133 // Case level, upperFirst: Compress up to 13 common weights as 3..15.
134 static const uint32_t CASE_UPPER_FIRST_COMMON_LOW = 3;
135 static const uint32_t CASE_UPPER_FIRST_COMMON_HIGH = 15;
136 static const int32_t CASE_UPPER_FIRST_COMMON_MAX_COUNT = 13;
137
138 // Tertiary level only (no case): Compress up to 97 common weights as 05..65 or 65..C5.
139 static const uint32_t TER_ONLY_COMMON_LOW = Collation::COMMON_BYTE;
140 static const uint32_t TER_ONLY_COMMON_MIDDLE = TER_ONLY_COMMON_LOW + 0x60;
141 static const uint32_t TER_ONLY_COMMON_HIGH = TER_ONLY_COMMON_LOW + 0xc0;
142 static const int32_t TER_ONLY_COMMON_MAX_COUNT = 0x61;
143
144 // Tertiary with case, lowerFirst: Compress up to 33 common weights as 05..25 or 25..45.
145 static const uint32_t TER_LOWER_FIRST_COMMON_LOW = Collation::COMMON_BYTE;
146 static const uint32_t TER_LOWER_FIRST_COMMON_MIDDLE = TER_LOWER_FIRST_COMMON_LOW + 0x20;
147 static const uint32_t TER_LOWER_FIRST_COMMON_HIGH = TER_LOWER_FIRST_COMMON_LOW + 0x40;
148 static const int32_t TER_LOWER_FIRST_COMMON_MAX_COUNT = 0x21;
149
150 // Tertiary with case, upperFirst: Compress up to 33 common weights as 85..A5 or A5..C5.
151 static const uint32_t TER_UPPER_FIRST_COMMON_LOW = Collation::COMMON_BYTE + 0x80;
152 static const uint32_t TER_UPPER_FIRST_COMMON_MIDDLE = TER_UPPER_FIRST_COMMON_LOW + 0x20;
153 static const uint32_t TER_UPPER_FIRST_COMMON_HIGH = TER_UPPER_FIRST_COMMON_LOW + 0x40;
154 static const int32_t TER_UPPER_FIRST_COMMON_MAX_COUNT = 0x21;
155
156 // Quaternary level: Compress up to 113 common weights as 1C..8C or 8C..FC.
157 static const uint32_t QUAT_COMMON_LOW = 0x1c;
158 static const uint32_t QUAT_COMMON_MIDDLE = QUAT_COMMON_LOW + 0x70;
159 static const uint32_t QUAT_COMMON_HIGH = QUAT_COMMON_LOW + 0xE0;
160 static const int32_t QUAT_COMMON_MAX_COUNT = 0x71;
161 // Primary weights shifted to quaternary level must be encoded with
162 // a lead byte below the common-weight compression range.
163 static const uint32_t QUAT_SHIFTED_LIMIT_BYTE = QUAT_COMMON_LOW - 1; // 0x1b
164};
165
166U_NAMESPACE_END
167
168#endif // !UCONFIG_NO_COLLATION
169#endif // __COLLATIONKEYS_H__