// blob: 67b1a73310876d1e3cd74ae700e79690dce95930 [file] [log] [blame]
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Transformations on dictionary words.

#ifndef BROTLI_ENC_TRANSFORM_H_
#define BROTLI_ENC_TRANSFORM_H_

#include <stdint.h>

#include <string>

#include "./dictionary.h"

namespace brotli {

// Operation applied to the dictionary word itself by a Transform.
//
// The numeric values are significant and must not be renumbered:
//   * ApplyTransform() subtracts the value from the word length for every
//     value up to kOmitLast9, so kIdentity must be 0 and kOmitLastN must be N.
//   * ApplyTransform() computes the number of leading bytes to skip as
//     value - (kOmitFirst1 - 1), so kOmitFirst1..kOmitFirst9 must be
//     consecutive and must be the largest values in the enum.
enum WordTransformType {
  kIdentity = 0,
  kOmitLast1 = 1,
  kOmitLast2 = 2,
  kOmitLast3 = 3,
  kOmitLast4 = 4,
  kOmitLast5 = 5,
  kOmitLast6 = 6,
  kOmitLast7 = 7,
  kOmitLast8 = 8,
  kOmitLast9 = 9,
  kUppercaseFirst = 10,
  kUppercaseAll = 11,
  kOmitFirst1 = 12,
  kOmitFirst2 = 13,
  kOmitFirst3 = 14,
  kOmitFirst4 = 15,
  kOmitFirst5 = 16,
  kOmitFirst6 = 17,
  kOmitFirst7 = 18,
  kOmitFirst8 = 19,
  kOmitFirst9 = 20,
};

50struct Transform {
51 const char* prefix;
52 WordTransformType word_transform;
53 const char* suffix;
54};
55
56static const Transform kTransforms[] = {
Zoltan Szabadka0829e372014-03-25 16:48:25 +010057 { "", kIdentity, "" },
58 { "", kIdentity, " " },
59 { " ", kIdentity, " " },
60 { "", kOmitFirst1, "" },
61 { "", kUppercaseFirst, " " },
62 { "", kIdentity, " the " },
63 { " ", kIdentity, "" },
64 { "s ", kIdentity, " " },
65 { "", kIdentity, " of " },
66 { "", kUppercaseFirst, "" },
67 { "", kIdentity, " and " },
68 { "", kOmitFirst2, "" },
69 { "", kOmitLast1, "" },
70 { ", ", kIdentity, " " },
71 { "", kIdentity, ", " },
72 { " ", kUppercaseFirst, " " },
73 { "", kIdentity, " in " },
74 { "", kIdentity, " to " },
75 { "e ", kIdentity, " " },
76 { "", kIdentity, "\"" },
77 { "", kIdentity, "." },
78 { "", kIdentity, "\">" },
79 { "", kIdentity, "\n" },
80 { "", kOmitLast3, "" },
81 { "", kIdentity, "]" },
82 { "", kIdentity, " for " },
83 { "", kOmitFirst3, "" },
84 { "", kOmitLast2, "" },
85 { "", kIdentity, " a " },
86 { "", kIdentity, " that " },
87 { " ", kUppercaseFirst, "" },
88 { "", kIdentity, ". " },
89 { ".", kIdentity, "" },
90 { " ", kIdentity, ", " },
91 { "", kOmitFirst4, "" },
92 { "", kIdentity, " with " },
93 { "", kIdentity, "'" },
94 { "", kIdentity, " from " },
95 { "", kIdentity, " by " },
96 { "", kOmitFirst5, "" },
97 { "", kOmitFirst6, "" },
98 { " the ", kIdentity, "" },
99 { "", kOmitLast4, "" },
100 { "", kIdentity, ". The " },
101 { "", kUppercaseAll, "" },
102 { "", kIdentity, " on " },
103 { "", kIdentity, " as " },
104 { "", kIdentity, " is " },
105 { "", kOmitLast7, "" },
106 { "", kOmitLast1, "ing " },
107 { "", kIdentity, "\n\t" },
108 { "", kIdentity, ":" },
109 { " ", kIdentity, ". " },
110 { "", kIdentity, "ed " },
111 { "", kOmitFirst9, "" },
112 { "", kOmitFirst7, "" },
113 { "", kOmitLast6, "" },
114 { "", kIdentity, "(" },
115 { "", kUppercaseFirst, ", " },
116 { "", kOmitLast8, "" },
117 { "", kIdentity, " at " },
118 { "", kIdentity, "ly " },
119 { " the ", kIdentity, " of " },
120 { "", kOmitLast5, "" },
121 { "", kOmitLast9, "" },
122 { " ", kUppercaseFirst, ", " },
123 { "", kUppercaseFirst, "\"" },
124 { ".", kIdentity, "(" },
125 { "", kUppercaseAll, " " },
126 { "", kUppercaseFirst, "\">" },
127 { "", kIdentity, "=\"" },
128 { " ", kIdentity, "." },
129 { ".com/", kIdentity, "" },
130 { " the ", kIdentity, " of the " },
131 { "", kUppercaseFirst, "'" },
132 { "", kIdentity, ". This " },
133 { "", kIdentity, "," },
134 { ".", kIdentity, " " },
135 { "", kUppercaseFirst, "(" },
136 { "", kUppercaseFirst, "." },
137 { "", kIdentity, " not " },
138 { " ", kIdentity, "=\"" },
139 { "", kIdentity, "er " },
140 { " ", kUppercaseAll, " " },
141 { "", kIdentity, "al " },
142 { " ", kUppercaseAll, "" },
143 { "", kIdentity, "='" },
144 { "", kUppercaseAll, "\"" },
145 { "", kUppercaseFirst, ". " },
146 { " ", kIdentity, "(" },
147 { "", kIdentity, "ful " },
148 { " ", kUppercaseFirst, ". " },
149 { "", kIdentity, "ive " },
150 { "", kIdentity, "less " },
151 { "", kUppercaseAll, "'" },
152 { "", kIdentity, "est " },
153 { " ", kUppercaseFirst, "." },
154 { "", kUppercaseAll, "\">" },
155 { " ", kIdentity, "='" },
156 { "", kUppercaseFirst, "," },
157 { "", kIdentity, "ize " },
158 { "", kUppercaseAll, "." },
159 { "\xc2\xa0", kIdentity, "" },
160 { " ", kIdentity, "," },
161 { "", kUppercaseFirst, "=\"" },
162 { "", kUppercaseAll, "=\"" },
163 { "", kIdentity, "ous " },
164 { "", kUppercaseAll, ", " },
165 { "", kUppercaseFirst, "='" },
166 { " ", kUppercaseFirst, "," },
167 { " ", kUppercaseAll, "=\"" },
168 { " ", kUppercaseAll, ", " },
169 { "", kUppercaseAll, "," },
170 { "", kUppercaseAll, "(" },
171 { "", kUppercaseAll, ". " },
172 { " ", kUppercaseAll, "." },
173 { "", kUppercaseAll, "='" },
174 { " ", kUppercaseAll, ". " },
175 { " ", kUppercaseFirst, "=\"" },
176 { " ", kUppercaseAll, "='" },
177 { " ", kUppercaseFirst, "='" },
Zoltan Szabadka2733d6c2014-02-17 14:25:36 +0100178};
179
180static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
181
// Upper-cases one character at the start of a byte sequence, in place, and
// returns the number of bytes that character occupies (1, 2 or 3).
//
//   p   - first byte of the character; p[0] is always read, so the buffer must
//         hold at least one byte and be writable.
//   len - number of bytes available at p.
//
// The first byte is classified with the same thresholds UTF-8 uses for lead
// bytes, and a fixed bit flip is applied per class:
//   * p[0] < 0xc0, or only one byte available: if p[0] is 'a'..'z' it is
//     XOR-ed with 32 (giving 'A'..'Z'); any other byte is left alone.
//     Returns 1.
//   * p[0] < 0xe0: p[1] is XOR-ed with 32. Returns 2.
//   * otherwise: p[2] is XOR-ed with 5 and 3 is returned, unless only two
//     bytes are available, in which case nothing is modified and 2 is
//     returned.
//
// Declared `inline` rather than `static`: this header also defines the
// external-linkage inline function ApplyTransform(), which calls this one. A
// `static` helper would be a distinct function in every translation unit,
// making the inline caller's definition differ between units.
inline int ToUpperCase(uint8_t* p, int len) {
  if (len == 1 || p[0] < 0xc0) {
    if (p[0] >= 'a' && p[0] <= 'z') {
      p[0] ^= 32;
    }
    return 1;
  }
  if (p[0] < 0xe0) {
    p[1] ^= 32;
    return 2;
  }
  if (len == 2) {
    // Truncated three-byte sequence: skip what is there without touching it.
    return 2;
  }
  p[2] ^= 5;
  return 3;
}

200inline std::string ApplyTransform(
201 const Transform& t, const uint8_t* word, int len) {
202 std::string ret(t.prefix);
Zoltan Szabadka0829e372014-03-25 16:48:25 +0100203 if (t.word_transform <= kOmitLast9) {
Zoltan Szabadka2733d6c2014-02-17 14:25:36 +0100204 len -= t.word_transform;
205 }
206 if (len > 0) {
Zoltan Szabadka0829e372014-03-25 16:48:25 +0100207 if (t.word_transform >= kOmitFirst1) {
208 const int skip = t.word_transform - (kOmitFirst1 - 1);
209 if (len > skip) {
210 ret += std::string(word + skip, word + len);
211 }
212 } else {
213 ret += std::string(word, word + len);
214 uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]);
215 if (t.word_transform == kUppercaseFirst) {
216 ToUpperCase(uppercase, len);
217 } else if (t.word_transform == kUppercaseAll) {
218 while (len > 0) {
219 int step = ToUpperCase(uppercase, len);
220 uppercase += step;
221 len -= step;
222 }
Zoltan Szabadka2733d6c2014-02-17 14:25:36 +0100223 }
224 }
225 }
226 ret += std::string(t.suffix);
227 return ret;
228}
229
230inline std::string GetTransformedDictionaryWord(int len_code, int word_id) {
231 int num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code];
232 int offset = kBrotliDictionaryOffsetsByLength[len_code];
233 int t = word_id / num_words;
234 int word_idx = word_id % num_words;
235 offset += len_code * word_idx;
236 const uint8_t* word = &kBrotliDictionary[offset];
237 return ApplyTransform(kTransforms[t], word, len_code);
238}
239
}  // namespace brotli

#endif  // BROTLI_ENC_TRANSFORM_H_