halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2011 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "SkPDFMakeToUnicodeCmap.h" |
Hal Canary | c640d0d | 2018-06-13 09:59:02 -0400 | [diff] [blame^] | 9 | |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 10 | #include "SkPDFUtils.h" |
Hal Canary | c640d0d | 2018-06-13 09:59:02 -0400 | [diff] [blame^] | 11 | #include "SkTo.h" |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 12 | #include "SkUtils.h" |
| 13 | |
| 14 | static void append_tounicode_header(SkDynamicMemoryWStream* cmap, |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 15 | bool multibyte) { |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 16 | // 12 dict begin: 12 is an Adobe-suggested value. Shall not change. |
| 17 | // It's there to prevent old version Adobe Readers from malfunctioning. |
| 18 | const char* kHeader = |
| 19 | "/CIDInit /ProcSet findresource begin\n" |
| 20 | "12 dict begin\n" |
| 21 | "begincmap\n"; |
| 22 | cmap->writeText(kHeader); |
| 23 | |
| 24 | // The /CIDSystemInfo must be consistent to the one in |
| 25 | // SkPDFFont::populateCIDFont(). |
| 26 | // We can not pass over the system info object here because the format is |
| 27 | // different. This is not a reference object. |
| 28 | const char* kSysInfo = |
| 29 | "/CIDSystemInfo\n" |
halcanary | 59be20c | 2016-09-01 14:10:00 -0700 | [diff] [blame] | 30 | "<< /Registry (Adobe)\n" |
| 31 | "/Ordering (UCS)\n" |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 32 | "/Supplement 0\n" |
| 33 | ">> def\n"; |
| 34 | cmap->writeText(kSysInfo); |
| 35 | |
| 36 | // The CMapName must be consistent to /CIDSystemInfo above. |
| 37 | // /CMapType 2 means ToUnicode. |
| 38 | // Codespace range just tells the PDF processor the valid range. |
| 39 | const char* kTypeInfoHeader = |
halcanary | 59be20c | 2016-09-01 14:10:00 -0700 | [diff] [blame] | 40 | "/CMapName /Adobe-Identity-UCS def\n" |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 41 | "/CMapType 2 def\n" |
| 42 | "1 begincodespacerange\n"; |
| 43 | cmap->writeText(kTypeInfoHeader); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 44 | if (multibyte) { |
| 45 | cmap->writeText("<0000> <FFFF>\n"); |
| 46 | } else { |
| 47 | cmap->writeText("<00> <FF>\n"); |
| 48 | } |
| 49 | cmap->writeText("endcodespacerange\n"); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 50 | } |
| 51 | |
| 52 | static void append_cmap_footer(SkDynamicMemoryWStream* cmap) { |
| 53 | const char kFooter[] = |
| 54 | "endcmap\n" |
| 55 | "CMapName currentdict /CMap defineresource pop\n" |
| 56 | "end\n" |
| 57 | "end"; |
| 58 | cmap->writeText(kFooter); |
| 59 | } |
| 60 | |
| 61 | namespace { |
| 62 | struct BFChar { |
| 63 | SkGlyphID fGlyphId; |
| 64 | SkUnichar fUnicode; |
| 65 | }; |
| 66 | |
| 67 | struct BFRange { |
| 68 | SkGlyphID fStart; |
| 69 | SkGlyphID fEnd; |
| 70 | SkUnichar fUnicode; |
| 71 | }; |
| 72 | } // namespace |
| 73 | |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 74 | static void write_glyph(SkDynamicMemoryWStream* cmap, |
| 75 | bool multiByte, |
| 76 | SkGlyphID gid) { |
| 77 | if (multiByte) { |
| 78 | SkPDFUtils::WriteUInt16BE(cmap, gid); |
| 79 | } else { |
| 80 | SkPDFUtils::WriteUInt8(cmap, SkToU8(gid)); |
| 81 | } |
| 82 | } |
| 83 | |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 84 | static void append_bfchar_section(const SkTDArray<BFChar>& bfchar, |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 85 | bool multiByte, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 86 | SkDynamicMemoryWStream* cmap) { |
| 87 | // PDF spec defines that every bf* list can have at most 100 entries. |
| 88 | for (int i = 0; i < bfchar.count(); i += 100) { |
| 89 | int count = bfchar.count() - i; |
| 90 | count = SkMin32(count, 100); |
| 91 | cmap->writeDecAsText(count); |
| 92 | cmap->writeText(" beginbfchar\n"); |
| 93 | for (int j = 0; j < count; ++j) { |
| 94 | cmap->writeText("<"); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 95 | write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 96 | cmap->writeText("> <"); |
halcanary | f59d18a | 2016-09-16 14:44:57 -0700 | [diff] [blame] | 97 | SkPDFUtils::WriteUTF16beHex(cmap, bfchar[i + j].fUnicode); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 98 | cmap->writeText(">\n"); |
| 99 | } |
| 100 | cmap->writeText("endbfchar\n"); |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | static void append_bfrange_section(const SkTDArray<BFRange>& bfrange, |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 105 | bool multiByte, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 106 | SkDynamicMemoryWStream* cmap) { |
| 107 | // PDF spec defines that every bf* list can have at most 100 entries. |
| 108 | for (int i = 0; i < bfrange.count(); i += 100) { |
| 109 | int count = bfrange.count() - i; |
| 110 | count = SkMin32(count, 100); |
| 111 | cmap->writeDecAsText(count); |
| 112 | cmap->writeText(" beginbfrange\n"); |
| 113 | for (int j = 0; j < count; ++j) { |
| 114 | cmap->writeText("<"); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 115 | write_glyph(cmap, multiByte, bfrange[i + j].fStart); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 116 | cmap->writeText("> <"); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 117 | write_glyph(cmap, multiByte, bfrange[i + j].fEnd); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 118 | cmap->writeText("> <"); |
halcanary | f59d18a | 2016-09-16 14:44:57 -0700 | [diff] [blame] | 119 | SkPDFUtils::WriteUTF16beHex(cmap, bfrange[i + j].fUnicode); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 120 | cmap->writeText(">\n"); |
| 121 | } |
| 122 | cmap->writeText("endbfrange\n"); |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | // Generate <bfchar> and <bfrange> table according to PDF spec 1.4 and Adobe |
| 127 | // Technote 5014. |
| 128 | // The function is not static so we can test it in unit tests. |
| 129 | // |
| 130 | // Current implementation guarantees bfchar and bfrange entries do not overlap. |
| 131 | // |
| 132 | // Current implementation does not attempt aggresive optimizations against |
| 133 | // following case because the specification is not clear. |
| 134 | // |
| 135 | // 4 beginbfchar 1 beginbfchar |
| 136 | // <0003> <0013> <0020> <0014> |
| 137 | // <0005> <0015> to endbfchar |
| 138 | // <0007> <0017> 1 beginbfrange |
| 139 | // <0020> <0014> <0003> <0007> <0013> |
| 140 | // endbfchar endbfrange |
| 141 | // |
| 142 | // Adobe Technote 5014 said: "Code mappings (unlike codespace ranges) may |
| 143 | // overlap, but succeeding maps supersede preceding maps." |
| 144 | // |
| 145 | // In case of searching text in PDF, bfrange will have higher precedence so |
| 146 | // typing char id 0x0014 in search box will get glyph id 0x0004 first. However, |
| 147 | // the spec does not mention how will this kind of conflict being resolved. |
| 148 | // |
| 149 | // For the worst case (having 65536 continuous unicode and we use every other |
| 150 | // one of them), the possible savings by aggressive optimization is 416KB |
| 151 | // pre-compressed and does not provide enough motivation for implementation. |
Hal Canary | 46cc3da | 2018-05-09 11:50:34 -0400 | [diff] [blame] | 152 | void SkPDFAppendCmapSections(const SkUnichar* glyphToUnicode, |
halcanary | 530032a | 2016-08-18 14:22:52 -0700 | [diff] [blame] | 153 | const SkBitSet* subset, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 154 | SkDynamicMemoryWStream* cmap, |
| 155 | bool multiByteGlyphs, |
| 156 | SkGlyphID firstGlyphID, |
| 157 | SkGlyphID lastGlyphID) { |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 158 | int glyphOffset = 0; |
| 159 | if (!multiByteGlyphs) { |
| 160 | glyphOffset = firstGlyphID - 1; |
| 161 | } |
| 162 | |
| 163 | SkTDArray<BFChar> bfcharEntries; |
| 164 | SkTDArray<BFRange> bfrangeEntries; |
| 165 | |
| 166 | BFRange currentRangeEntry = {0, 0, 0}; |
| 167 | bool rangeEmpty = true; |
Hal Canary | 46cc3da | 2018-05-09 11:50:34 -0400 | [diff] [blame] | 168 | const int limit = (int)lastGlyphID + 1 - glyphOffset; |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 169 | |
| 170 | for (int i = firstGlyphID - glyphOffset; i < limit + 1; ++i) { |
| 171 | bool inSubset = i < limit && |
| 172 | (subset == nullptr || subset->has(i + glyphOffset)); |
| 173 | if (!rangeEmpty) { |
| 174 | // PDF spec requires bfrange not changing the higher byte, |
| 175 | // e.g. <1035> <10FF> <2222> is ok, but |
| 176 | // <1035> <1100> <2222> is no good |
| 177 | bool inRange = |
| 178 | i == currentRangeEntry.fEnd + 1 && |
| 179 | i >> 8 == currentRangeEntry.fStart >> 8 && |
| 180 | i < limit && |
| 181 | glyphToUnicode[i + glyphOffset] == |
| 182 | currentRangeEntry.fUnicode + i - currentRangeEntry.fStart; |
| 183 | if (!inSubset || !inRange) { |
| 184 | if (currentRangeEntry.fEnd > currentRangeEntry.fStart) { |
| 185 | bfrangeEntries.push(currentRangeEntry); |
| 186 | } else { |
| 187 | BFChar* entry = bfcharEntries.append(); |
| 188 | entry->fGlyphId = currentRangeEntry.fStart; |
| 189 | entry->fUnicode = currentRangeEntry.fUnicode; |
| 190 | } |
| 191 | rangeEmpty = true; |
| 192 | } |
| 193 | } |
| 194 | if (inSubset) { |
| 195 | currentRangeEntry.fEnd = i; |
| 196 | if (rangeEmpty) { |
| 197 | currentRangeEntry.fStart = i; |
| 198 | currentRangeEntry.fUnicode = glyphToUnicode[i + glyphOffset]; |
| 199 | rangeEmpty = false; |
| 200 | } |
| 201 | } |
| 202 | } |
| 203 | |
| 204 | // The spec requires all bfchar entries for a font must come before bfrange |
| 205 | // entries. |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 206 | append_bfchar_section(bfcharEntries, multiByteGlyphs, cmap); |
| 207 | append_bfrange_section(bfrangeEntries, multiByteGlyphs, cmap); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 208 | } |
| 209 | |
| 210 | sk_sp<SkPDFStream> SkPDFMakeToUnicodeCmap( |
Hal Canary | 46cc3da | 2018-05-09 11:50:34 -0400 | [diff] [blame] | 211 | const SkUnichar* glyphToUnicode, |
halcanary | 530032a | 2016-08-18 14:22:52 -0700 | [diff] [blame] | 212 | const SkBitSet* subset, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 213 | bool multiByteGlyphs, |
| 214 | SkGlyphID firstGlyphID, |
| 215 | SkGlyphID lastGlyphID) { |
| 216 | SkDynamicMemoryWStream cmap; |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 217 | append_tounicode_header(&cmap, multiByteGlyphs); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 218 | SkPDFAppendCmapSections(glyphToUnicode, subset, &cmap, multiByteGlyphs, |
| 219 | firstGlyphID, lastGlyphID); |
| 220 | append_cmap_footer(&cmap); |
| 221 | return sk_make_sp<SkPDFStream>( |
| 222 | std::unique_ptr<SkStreamAsset>(cmap.detachAsStream())); |
| 223 | } |