Zoltan Szabadka | 2f268ad | 2014-02-17 14:25:36 +0100 | [diff] [blame] | 1 | // Copyright 2010 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | // |
| 15 | // Transformations on dictionary words. |
| 16 | |
| 17 | #ifndef BROTLI_ENC_TRANSFORM_H_ |
| 18 | #define BROTLI_ENC_TRANSFORM_H_ |
| 19 | |
| 20 | #include <string> |
| 21 | |
| 22 | #include "./dictionary.h" |
| 23 | |
| 24 | namespace brotli { |
| 25 | |
// Kinds of modification a Transform applies to the dictionary word itself
// (the prefix and suffix strings are handled separately).
//
// The numeric values are significant because ApplyTransform() does
// arithmetic on them:
//   * kOmitLastN must equal N (kIdentity doubles as "omit last 0"), and
//   * kOmitFirstN must equal kOmitFirst1 + (N - 1).
enum WordTransformType {
  // Copy the word unchanged.
  kIdentity = 0,

  // Drop the last N bytes of the word.
  kOmitLast1 = 1, kOmitLast2 = 2, kOmitLast3 = 3,
  kOmitLast4 = 4, kOmitLast5 = 5, kOmitLast6 = 6,
  kOmitLast7 = 7, kOmitLast8 = 8, kOmitLast9 = 9,

  // Uppercase only the first character / every character of the word
  // (see ToUpperCase()).
  kUppercaseFirst = 10,
  kUppercaseAll = 11,

  // Drop the first N bytes of the word.
  kOmitFirst1 = 12, kOmitFirst2 = 13, kOmitFirst3 = 14,
  kOmitFirst4 = 15, kOmitFirst5 = 16, kOmitFirst6 = 17,
  kOmitFirst7 = 18, kOmitFirst8 = 19, kOmitFirst9 = 20,
};
| 49 | |
| 50 | struct Transform { |
| 51 | const char* prefix; |
| 52 | WordTransformType word_transform; |
| 53 | const char* suffix; |
| 54 | }; |
| 55 | |
| 56 | static const Transform kTransforms[] = { |
Zoltan Szabadka | 3477819 | 2014-03-25 16:48:25 +0100 | [diff] [blame] | 57 | { "", kIdentity, "" }, |
| 58 | { "", kIdentity, " " }, |
| 59 | { " ", kIdentity, " " }, |
| 60 | { "", kOmitFirst1, "" }, |
| 61 | { "", kUppercaseFirst, " " }, |
| 62 | { "", kIdentity, " the " }, |
| 63 | { " ", kIdentity, "" }, |
| 64 | { "s ", kIdentity, " " }, |
| 65 | { "", kIdentity, " of " }, |
| 66 | { "", kUppercaseFirst, "" }, |
| 67 | { "", kIdentity, " and " }, |
| 68 | { "", kOmitFirst2, "" }, |
| 69 | { "", kOmitLast1, "" }, |
| 70 | { ", ", kIdentity, " " }, |
| 71 | { "", kIdentity, ", " }, |
| 72 | { " ", kUppercaseFirst, " " }, |
| 73 | { "", kIdentity, " in " }, |
| 74 | { "", kIdentity, " to " }, |
| 75 | { "e ", kIdentity, " " }, |
| 76 | { "", kIdentity, "\"" }, |
| 77 | { "", kIdentity, "." }, |
| 78 | { "", kIdentity, "\">" }, |
| 79 | { "", kIdentity, "\n" }, |
| 80 | { "", kOmitLast3, "" }, |
| 81 | { "", kIdentity, "]" }, |
| 82 | { "", kIdentity, " for " }, |
| 83 | { "", kOmitFirst3, "" }, |
| 84 | { "", kOmitLast2, "" }, |
| 85 | { "", kIdentity, " a " }, |
| 86 | { "", kIdentity, " that " }, |
| 87 | { " ", kUppercaseFirst, "" }, |
| 88 | { "", kIdentity, ". " }, |
| 89 | { ".", kIdentity, "" }, |
| 90 | { " ", kIdentity, ", " }, |
| 91 | { "", kOmitFirst4, "" }, |
| 92 | { "", kIdentity, " with " }, |
| 93 | { "", kIdentity, "'" }, |
| 94 | { "", kIdentity, " from " }, |
| 95 | { "", kIdentity, " by " }, |
| 96 | { "", kOmitFirst5, "" }, |
| 97 | { "", kOmitFirst6, "" }, |
| 98 | { " the ", kIdentity, "" }, |
| 99 | { "", kOmitLast4, "" }, |
| 100 | { "", kIdentity, ". The " }, |
| 101 | { "", kUppercaseAll, "" }, |
| 102 | { "", kIdentity, " on " }, |
| 103 | { "", kIdentity, " as " }, |
| 104 | { "", kIdentity, " is " }, |
| 105 | { "", kOmitLast7, "" }, |
| 106 | { "", kOmitLast1, "ing " }, |
| 107 | { "", kIdentity, "\n\t" }, |
| 108 | { "", kIdentity, ":" }, |
| 109 | { " ", kIdentity, ". " }, |
| 110 | { "", kIdentity, "ed " }, |
| 111 | { "", kOmitFirst9, "" }, |
| 112 | { "", kOmitFirst7, "" }, |
| 113 | { "", kOmitLast6, "" }, |
| 114 | { "", kIdentity, "(" }, |
| 115 | { "", kUppercaseFirst, ", " }, |
| 116 | { "", kOmitLast8, "" }, |
| 117 | { "", kIdentity, " at " }, |
| 118 | { "", kIdentity, "ly " }, |
| 119 | { " the ", kIdentity, " of " }, |
| 120 | { "", kOmitLast5, "" }, |
| 121 | { "", kOmitLast9, "" }, |
| 122 | { " ", kUppercaseFirst, ", " }, |
| 123 | { "", kUppercaseFirst, "\"" }, |
| 124 | { ".", kIdentity, "(" }, |
| 125 | { "", kUppercaseAll, " " }, |
| 126 | { "", kUppercaseFirst, "\">" }, |
| 127 | { "", kIdentity, "=\"" }, |
| 128 | { " ", kIdentity, "." }, |
| 129 | { ".com/", kIdentity, "" }, |
| 130 | { " the ", kIdentity, " of the " }, |
| 131 | { "", kUppercaseFirst, "'" }, |
| 132 | { "", kIdentity, ". This " }, |
| 133 | { "", kIdentity, "," }, |
| 134 | { ".", kIdentity, " " }, |
| 135 | { "", kUppercaseFirst, "(" }, |
| 136 | { "", kUppercaseFirst, "." }, |
| 137 | { "", kIdentity, " not " }, |
| 138 | { " ", kIdentity, "=\"" }, |
| 139 | { "", kIdentity, "er " }, |
| 140 | { " ", kUppercaseAll, " " }, |
| 141 | { "", kIdentity, "al " }, |
| 142 | { " ", kUppercaseAll, "" }, |
| 143 | { "", kIdentity, "='" }, |
| 144 | { "", kUppercaseAll, "\"" }, |
| 145 | { "", kUppercaseFirst, ". " }, |
| 146 | { " ", kIdentity, "(" }, |
| 147 | { "", kIdentity, "ful " }, |
| 148 | { " ", kUppercaseFirst, ". " }, |
| 149 | { "", kIdentity, "ive " }, |
| 150 | { "", kIdentity, "less " }, |
| 151 | { "", kUppercaseAll, "'" }, |
| 152 | { "", kIdentity, "est " }, |
| 153 | { " ", kUppercaseFirst, "." }, |
| 154 | { "", kUppercaseAll, "\">" }, |
| 155 | { " ", kIdentity, "='" }, |
| 156 | { "", kUppercaseFirst, "," }, |
| 157 | { "", kIdentity, "ize " }, |
| 158 | { "", kUppercaseAll, "." }, |
| 159 | { "\xc2\xa0", kIdentity, "" }, |
| 160 | { " ", kIdentity, "," }, |
| 161 | { "", kUppercaseFirst, "=\"" }, |
| 162 | { "", kUppercaseAll, "=\"" }, |
| 163 | { "", kIdentity, "ous " }, |
| 164 | { "", kUppercaseAll, ", " }, |
| 165 | { "", kUppercaseFirst, "='" }, |
| 166 | { " ", kUppercaseFirst, "," }, |
| 167 | { " ", kUppercaseAll, "=\"" }, |
| 168 | { " ", kUppercaseAll, ", " }, |
| 169 | { "", kUppercaseAll, "," }, |
| 170 | { "", kUppercaseAll, "(" }, |
| 171 | { "", kUppercaseAll, ". " }, |
| 172 | { " ", kUppercaseAll, "." }, |
| 173 | { "", kUppercaseAll, "='" }, |
| 174 | { " ", kUppercaseAll, ". " }, |
| 175 | { " ", kUppercaseFirst, "=\"" }, |
| 176 | { " ", kUppercaseAll, "='" }, |
| 177 | { " ", kUppercaseFirst, "='" }, |
Zoltan Szabadka | 2f268ad | 2014-02-17 14:25:36 +0100 | [diff] [blame] | 178 | }; |
| 179 | |
| 180 | static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]); |
| 181 | |
// kOmitFirstNTransforms[N] is the position in kTransforms of the entry that
// applies kOmitFirstN with an empty prefix and an empty suffix.
// Slot 0 ("omit nothing") is the identity transform at position 0. Slot 8 is
// also 0 because kTransforms contains no kOmitFirst8 entry, so callers cannot
// tell "omit first 8" apart from identity through this table.
static const int kOmitFirstNTransforms[10] = {
  0,   // N = 0: identity
  3,   // N = 1
  11,  // N = 2
  26,  // N = 3
  34,  // N = 4
  39,  // N = 5
  40,  // N = 6
  55,  // N = 7
  0,   // N = 8: no such transform in kTransforms
  54   // N = 9
};
| 185 | |
// kOmitLastNTransforms[N] is the position in kTransforms of the entry that
// applies kOmitLastN with an empty prefix and an empty suffix.
// Slot 0 ("omit nothing") is the identity transform at position 0.
static const int kOmitLastNTransforms[10] = {
  0,   // N = 0: identity
  12,  // N = 1
  27,  // N = 2
  23,  // N = 3
  42,  // N = 4
  63,  // N = 5
  56,  // N = 6
  48,  // N = 7
  59,  // N = 8
  64,  // N = 9
};
| 189 | |
// Uppercases, in place, the single character that starts at `p` and returns
// the number of bytes that character occupies (1, 2 or 3), so a caller can
// advance to the next character.
//
// `len` is the number of bytes available at `p` and must be at least 1,
// because p[0] is read unconditionally.
//
// The rule is chosen by the lead byte p[0]:
//   * len == 1 or p[0] < 0xc0: one-byte character. ASCII 'a'..'z' is
//     uppercased by flipping bit 5; anything else is left untouched.
//     Returns 1.
//   * 0xc0 <= p[0] < 0xe0: two-byte character. Bit 5 of the second byte is
//     flipped. Returns 2.
//   * p[0] >= 0xe0: three-byte character. If only two bytes are available
//     nothing is modified and 2 is returned; otherwise the third byte is
//     XORed with 5 and 3 is returned.
// NOTE(review): the multi-byte rules look like a deliberately simple
// approximation of upper-casing UTF-8 text that the format fixes bit-for-bit;
// confirm against the Brotli specification before changing any constant.
//
// Declared `inline` rather than `static`: this header's external-linkage
// inline function ApplyTransform() calls it, and an inline function must
// refer to the same entity in every translation unit (One Definition Rule).
// `inline` also avoids unused-function warnings in files that include this
// header without calling it.
inline int ToUpperCase(uint8_t *p, int len) {
  if (len == 1 || p[0] < 0xc0) {
    if (p[0] >= 'a' && p[0] <= 'z') {
      p[0] ^= 32;  // 'a' - 'A' == 32: clear the lowercase bit.
    }
    return 1;
  }
  if (p[0] < 0xe0) {
    // len >= 2 is guaranteed here because len == 1 was handled above.
    p[1] ^= 32;
    return 2;
  }
  if (len == 2) {
    // Truncated three-byte sequence: consume what is there, change nothing.
    return 2;
  }
  p[2] ^= 5;
  return 3;
}
| 207 | |
| 208 | inline std::string ApplyTransform( |
| 209 | const Transform& t, const uint8_t* word, int len) { |
| 210 | std::string ret(t.prefix); |
Zoltan Szabadka | 3477819 | 2014-03-25 16:48:25 +0100 | [diff] [blame] | 211 | if (t.word_transform <= kOmitLast9) { |
Zoltan Szabadka | 2f268ad | 2014-02-17 14:25:36 +0100 | [diff] [blame] | 212 | len -= t.word_transform; |
| 213 | } |
| 214 | if (len > 0) { |
Zoltan Szabadka | 3477819 | 2014-03-25 16:48:25 +0100 | [diff] [blame] | 215 | if (t.word_transform >= kOmitFirst1) { |
| 216 | const int skip = t.word_transform - (kOmitFirst1 - 1); |
| 217 | if (len > skip) { |
| 218 | ret += std::string(word + skip, word + len); |
| 219 | } |
| 220 | } else { |
| 221 | ret += std::string(word, word + len); |
| 222 | uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]); |
| 223 | if (t.word_transform == kUppercaseFirst) { |
| 224 | ToUpperCase(uppercase, len); |
| 225 | } else if (t.word_transform == kUppercaseAll) { |
| 226 | while (len > 0) { |
| 227 | int step = ToUpperCase(uppercase, len); |
| 228 | uppercase += step; |
| 229 | len -= step; |
| 230 | } |
Zoltan Szabadka | 2f268ad | 2014-02-17 14:25:36 +0100 | [diff] [blame] | 231 | } |
| 232 | } |
| 233 | } |
| 234 | ret += std::string(t.suffix); |
| 235 | return ret; |
| 236 | } |
| 237 | |
| 238 | inline std::string GetTransformedDictionaryWord(int len_code, int word_id) { |
| 239 | int num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code]; |
| 240 | int offset = kBrotliDictionaryOffsetsByLength[len_code]; |
| 241 | int t = word_id / num_words; |
| 242 | int word_idx = word_id % num_words; |
| 243 | offset += len_code * word_idx; |
| 244 | const uint8_t* word = &kBrotliDictionary[offset]; |
| 245 | return ApplyTransform(kTransforms[t], word, len_code); |
| 246 | } |
| 247 | |
| 248 | } // namespace brotli |
| 249 | |
| 250 | #endif // BROTLI_ENC_TRANSFORM_H_ |