blob: e003e9410f54bc99a900a457f863955ac85ac70e [file] [log] [blame]
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Transformations on dictionary words.
16
17#ifndef BROTLI_ENC_TRANSFORM_H_
18#define BROTLI_ENC_TRANSFORM_H_
19
#include <stdint.h>

#include <string>

#include "./dictionary.h"
23
24namespace brotli {
25
// Kinds of modification that can be applied to the bytes of a dictionary
// word (see ApplyTransform).
//
// The numeric values are load-bearing:
//   * kIdentity..kOmitLast9 (0..9): the value itself is the number of bytes
//     dropped from the end of the word.
//   * kOmitFirst1..kOmitFirst9 (12..20): value - (kOmitFirst1 - 1) is the
//     number of bytes dropped from the start of the word.
// Do not renumber or reorder the enumerators.
enum WordTransformType {
  kIdentity = 0,
  kOmitLast1 = 1,
  kOmitLast2 = 2,
  kOmitLast3 = 3,
  kOmitLast4 = 4,
  kOmitLast5 = 5,
  kOmitLast6 = 6,
  kOmitLast7 = 7,
  kOmitLast8 = 8,
  kOmitLast9 = 9,
  kUppercaseFirst = 10,
  kUppercaseAll = 11,
  kOmitFirst1 = 12,
  kOmitFirst2 = 13,
  kOmitFirst3 = 14,
  kOmitFirst4 = 15,
  kOmitFirst5 = 16,
  kOmitFirst6 = 17,
  kOmitFirst7 = 18,
  kOmitFirst8 = 19,
  kOmitFirst9 = 20,
};
49
50struct Transform {
51 const char* prefix;
52 WordTransformType word_transform;
53 const char* suffix;
54};
55
56static const Transform kTransforms[] = {
Zoltan Szabadka34778192014-03-25 16:48:25 +010057 { "", kIdentity, "" },
58 { "", kIdentity, " " },
59 { " ", kIdentity, " " },
60 { "", kOmitFirst1, "" },
61 { "", kUppercaseFirst, " " },
62 { "", kIdentity, " the " },
63 { " ", kIdentity, "" },
64 { "s ", kIdentity, " " },
65 { "", kIdentity, " of " },
66 { "", kUppercaseFirst, "" },
67 { "", kIdentity, " and " },
68 { "", kOmitFirst2, "" },
69 { "", kOmitLast1, "" },
70 { ", ", kIdentity, " " },
71 { "", kIdentity, ", " },
72 { " ", kUppercaseFirst, " " },
73 { "", kIdentity, " in " },
74 { "", kIdentity, " to " },
75 { "e ", kIdentity, " " },
76 { "", kIdentity, "\"" },
77 { "", kIdentity, "." },
78 { "", kIdentity, "\">" },
79 { "", kIdentity, "\n" },
80 { "", kOmitLast3, "" },
81 { "", kIdentity, "]" },
82 { "", kIdentity, " for " },
83 { "", kOmitFirst3, "" },
84 { "", kOmitLast2, "" },
85 { "", kIdentity, " a " },
86 { "", kIdentity, " that " },
87 { " ", kUppercaseFirst, "" },
88 { "", kIdentity, ". " },
89 { ".", kIdentity, "" },
90 { " ", kIdentity, ", " },
91 { "", kOmitFirst4, "" },
92 { "", kIdentity, " with " },
93 { "", kIdentity, "'" },
94 { "", kIdentity, " from " },
95 { "", kIdentity, " by " },
96 { "", kOmitFirst5, "" },
97 { "", kOmitFirst6, "" },
98 { " the ", kIdentity, "" },
99 { "", kOmitLast4, "" },
100 { "", kIdentity, ". The " },
101 { "", kUppercaseAll, "" },
102 { "", kIdentity, " on " },
103 { "", kIdentity, " as " },
104 { "", kIdentity, " is " },
105 { "", kOmitLast7, "" },
106 { "", kOmitLast1, "ing " },
107 { "", kIdentity, "\n\t" },
108 { "", kIdentity, ":" },
109 { " ", kIdentity, ". " },
110 { "", kIdentity, "ed " },
111 { "", kOmitFirst9, "" },
112 { "", kOmitFirst7, "" },
113 { "", kOmitLast6, "" },
114 { "", kIdentity, "(" },
115 { "", kUppercaseFirst, ", " },
116 { "", kOmitLast8, "" },
117 { "", kIdentity, " at " },
118 { "", kIdentity, "ly " },
119 { " the ", kIdentity, " of " },
120 { "", kOmitLast5, "" },
121 { "", kOmitLast9, "" },
122 { " ", kUppercaseFirst, ", " },
123 { "", kUppercaseFirst, "\"" },
124 { ".", kIdentity, "(" },
125 { "", kUppercaseAll, " " },
126 { "", kUppercaseFirst, "\">" },
127 { "", kIdentity, "=\"" },
128 { " ", kIdentity, "." },
129 { ".com/", kIdentity, "" },
130 { " the ", kIdentity, " of the " },
131 { "", kUppercaseFirst, "'" },
132 { "", kIdentity, ". This " },
133 { "", kIdentity, "," },
134 { ".", kIdentity, " " },
135 { "", kUppercaseFirst, "(" },
136 { "", kUppercaseFirst, "." },
137 { "", kIdentity, " not " },
138 { " ", kIdentity, "=\"" },
139 { "", kIdentity, "er " },
140 { " ", kUppercaseAll, " " },
141 { "", kIdentity, "al " },
142 { " ", kUppercaseAll, "" },
143 { "", kIdentity, "='" },
144 { "", kUppercaseAll, "\"" },
145 { "", kUppercaseFirst, ". " },
146 { " ", kIdentity, "(" },
147 { "", kIdentity, "ful " },
148 { " ", kUppercaseFirst, ". " },
149 { "", kIdentity, "ive " },
150 { "", kIdentity, "less " },
151 { "", kUppercaseAll, "'" },
152 { "", kIdentity, "est " },
153 { " ", kUppercaseFirst, "." },
154 { "", kUppercaseAll, "\">" },
155 { " ", kIdentity, "='" },
156 { "", kUppercaseFirst, "," },
157 { "", kIdentity, "ize " },
158 { "", kUppercaseAll, "." },
159 { "\xc2\xa0", kIdentity, "" },
160 { " ", kIdentity, "," },
161 { "", kUppercaseFirst, "=\"" },
162 { "", kUppercaseAll, "=\"" },
163 { "", kIdentity, "ous " },
164 { "", kUppercaseAll, ", " },
165 { "", kUppercaseFirst, "='" },
166 { " ", kUppercaseFirst, "," },
167 { " ", kUppercaseAll, "=\"" },
168 { " ", kUppercaseAll, ", " },
169 { "", kUppercaseAll, "," },
170 { "", kUppercaseAll, "(" },
171 { "", kUppercaseAll, ". " },
172 { " ", kUppercaseAll, "." },
173 { "", kUppercaseAll, "='" },
174 { " ", kUppercaseAll, ". " },
175 { " ", kUppercaseFirst, "=\"" },
176 { " ", kUppercaseAll, "='" },
177 { " ", kUppercaseFirst, "='" },
Zoltan Szabadka2f268ad2014-02-17 14:25:36 +0100178};
179
180static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
181
// kOmitFirstNTransforms[N] is the position in kTransforms of the entry that
// drops the first N bytes of the word and adds no prefix or suffix.
// Index 0 maps to the plain identity transform. Index 8 also holds 0 because
// kTransforms contains no kOmitFirst8 entry, so 0 there is a placeholder and
// not a real "omit first 8" transform.
static const int kOmitFirstNTransforms[10] = {
  0, 3, 11, 26, 34, 39, 40, 55, 0, 54
};
185
// kOmitLastNTransforms[N] is the position in kTransforms of the entry that
// drops the last N bytes of the word and adds no prefix or suffix.
// Index 0 maps to the plain identity transform.
static const int kOmitLastNTransforms[10] = {
  0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
};
189
// Upper-cases, in place, the single (possibly multi-byte) character that
// starts at p, and returns how many bytes the caller should advance by.
//
// The lead byte p[0] selects the rule:
//   * p[0] < 0xc0 (or only one byte is available): a byte in 'a'..'z' has
//     bit 5 flipped, any other byte is left alone; returns 1.
//   * 0xc0 <= p[0] < 0xe0: p[1] ^= 32; returns 2.
//   * p[0] >= 0xe0: p[2] ^= 5 and returns 3, unless only two bytes are
//     available, in which case nothing is modified and 2 is returned.
// This is a fixed bit-twiddling rule, not a real Unicode case mapping.
//
// p must point to at least `len` writable bytes and len must be >= 1.
// The return value never exceeds len.
//
// Declared `static inline` so that translation units which include this
// header without calling the function get no unused-function warning.
static inline int ToUpperCase(uint8_t *p, int len) {
  if (len == 1 || p[0] < 0xc0) {
    if (p[0] >= 'a' && p[0] <= 'z') {
      p[0] ^= 32;
    }
    return 1;
  }
  // From here on len >= 2, so p[1] is readable.
  if (p[0] < 0xe0) {
    p[1] ^= 32;
    return 2;
  }
  if (len == 2) {
    // Truncated three-byte sequence: consume what is left, change nothing.
    return 2;
  }
  p[2] ^= 5;
  return 3;
}
207
208inline std::string ApplyTransform(
209 const Transform& t, const uint8_t* word, int len) {
210 std::string ret(t.prefix);
Zoltan Szabadka34778192014-03-25 16:48:25 +0100211 if (t.word_transform <= kOmitLast9) {
Zoltan Szabadka2f268ad2014-02-17 14:25:36 +0100212 len -= t.word_transform;
213 }
214 if (len > 0) {
Zoltan Szabadka34778192014-03-25 16:48:25 +0100215 if (t.word_transform >= kOmitFirst1) {
216 const int skip = t.word_transform - (kOmitFirst1 - 1);
217 if (len > skip) {
218 ret += std::string(word + skip, word + len);
219 }
220 } else {
221 ret += std::string(word, word + len);
222 uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]);
223 if (t.word_transform == kUppercaseFirst) {
224 ToUpperCase(uppercase, len);
225 } else if (t.word_transform == kUppercaseAll) {
226 while (len > 0) {
227 int step = ToUpperCase(uppercase, len);
228 uppercase += step;
229 len -= step;
230 }
Zoltan Szabadka2f268ad2014-02-17 14:25:36 +0100231 }
232 }
233 }
234 ret += std::string(t.suffix);
235 return ret;
236}
237
238inline std::string GetTransformedDictionaryWord(int len_code, int word_id) {
239 int num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code];
240 int offset = kBrotliDictionaryOffsetsByLength[len_code];
241 int t = word_id / num_words;
242 int word_idx = word_id % num_words;
243 offset += len_code * word_idx;
244 const uint8_t* word = &kBrotliDictionary[offset];
245 return ApplyTransform(kTransforms[t], word, len_code);
246}
247
248} // namespace brotli
249
250#endif // BROTLI_ENC_TRANSFORM_H_