halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2011 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #include "SkPDFMakeToUnicodeCmap.h" |
Hal Canary | c640d0d | 2018-06-13 09:59:02 -0400 | [diff] [blame] | 9 | |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 10 | #include "SkPDFUtils.h" |
Hal Canary | 8b68110 | 2018-09-05 22:32:41 -0400 | [diff] [blame] | 11 | #include "SkStreamPriv.h" |
Hal Canary | c640d0d | 2018-06-13 09:59:02 -0400 | [diff] [blame] | 12 | #include "SkTo.h" |
Hal Canary | ea60b95 | 2018-08-21 11:45:46 -0400 | [diff] [blame] | 13 | #include "SkUTF.h" |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 14 | |
| 15 | static void append_tounicode_header(SkDynamicMemoryWStream* cmap, |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 16 | bool multibyte) { |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 17 | // 12 dict begin: 12 is an Adobe-suggested value. Shall not change. |
| 18 | // It's there to prevent old version Adobe Readers from malfunctioning. |
| 19 | const char* kHeader = |
| 20 | "/CIDInit /ProcSet findresource begin\n" |
| 21 | "12 dict begin\n" |
| 22 | "begincmap\n"; |
| 23 | cmap->writeText(kHeader); |
| 24 | |
| 25 | // The /CIDSystemInfo must be consistent to the one in |
| 26 | // SkPDFFont::populateCIDFont(). |
| 27 | // We can not pass over the system info object here because the format is |
| 28 | // different. This is not a reference object. |
| 29 | const char* kSysInfo = |
| 30 | "/CIDSystemInfo\n" |
halcanary | 59be20c | 2016-09-01 14:10:00 -0700 | [diff] [blame] | 31 | "<< /Registry (Adobe)\n" |
| 32 | "/Ordering (UCS)\n" |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 33 | "/Supplement 0\n" |
| 34 | ">> def\n"; |
| 35 | cmap->writeText(kSysInfo); |
| 36 | |
| 37 | // The CMapName must be consistent to /CIDSystemInfo above. |
| 38 | // /CMapType 2 means ToUnicode. |
| 39 | // Codespace range just tells the PDF processor the valid range. |
| 40 | const char* kTypeInfoHeader = |
halcanary | 59be20c | 2016-09-01 14:10:00 -0700 | [diff] [blame] | 41 | "/CMapName /Adobe-Identity-UCS def\n" |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 42 | "/CMapType 2 def\n" |
| 43 | "1 begincodespacerange\n"; |
| 44 | cmap->writeText(kTypeInfoHeader); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 45 | if (multibyte) { |
| 46 | cmap->writeText("<0000> <FFFF>\n"); |
| 47 | } else { |
| 48 | cmap->writeText("<00> <FF>\n"); |
| 49 | } |
| 50 | cmap->writeText("endcodespacerange\n"); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 51 | } |
| 52 | |
| 53 | static void append_cmap_footer(SkDynamicMemoryWStream* cmap) { |
| 54 | const char kFooter[] = |
| 55 | "endcmap\n" |
| 56 | "CMapName currentdict /CMap defineresource pop\n" |
| 57 | "end\n" |
| 58 | "end"; |
| 59 | cmap->writeText(kFooter); |
| 60 | } |
| 61 | |
| 62 | namespace { |
| 63 | struct BFChar { |
| 64 | SkGlyphID fGlyphId; |
| 65 | SkUnichar fUnicode; |
| 66 | }; |
| 67 | |
| 68 | struct BFRange { |
| 69 | SkGlyphID fStart; |
| 70 | SkGlyphID fEnd; |
| 71 | SkUnichar fUnicode; |
| 72 | }; |
| 73 | } // namespace |
| 74 | |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 75 | static void write_glyph(SkDynamicMemoryWStream* cmap, |
| 76 | bool multiByte, |
| 77 | SkGlyphID gid) { |
| 78 | if (multiByte) { |
| 79 | SkPDFUtils::WriteUInt16BE(cmap, gid); |
| 80 | } else { |
| 81 | SkPDFUtils::WriteUInt8(cmap, SkToU8(gid)); |
| 82 | } |
| 83 | } |
| 84 | |
Hal Canary | 9e41c21 | 2018-09-03 12:00:23 -0400 | [diff] [blame] | 85 | static void append_bfchar_section(const std::vector<BFChar>& bfchar, |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 86 | bool multiByte, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 87 | SkDynamicMemoryWStream* cmap) { |
| 88 | // PDF spec defines that every bf* list can have at most 100 entries. |
Hal Canary | 9e41c21 | 2018-09-03 12:00:23 -0400 | [diff] [blame] | 89 | for (size_t i = 0; i < bfchar.size(); i += 100) { |
| 90 | int count = SkToInt(bfchar.size() - i); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 91 | count = SkMin32(count, 100); |
Hal Canary | 8b68110 | 2018-09-05 22:32:41 -0400 | [diff] [blame] | 92 | SkWStreamWriteDecAsText(cmap, count); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 93 | cmap->writeText(" beginbfchar\n"); |
| 94 | for (int j = 0; j < count; ++j) { |
| 95 | cmap->writeText("<"); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 96 | write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 97 | cmap->writeText("> <"); |
halcanary | f59d18a | 2016-09-16 14:44:57 -0700 | [diff] [blame] | 98 | SkPDFUtils::WriteUTF16beHex(cmap, bfchar[i + j].fUnicode); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 99 | cmap->writeText(">\n"); |
| 100 | } |
| 101 | cmap->writeText("endbfchar\n"); |
| 102 | } |
| 103 | } |
| 104 | |
Hal Canary | 9e41c21 | 2018-09-03 12:00:23 -0400 | [diff] [blame] | 105 | static void append_bfrange_section(const std::vector<BFRange>& bfrange, |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 106 | bool multiByte, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 107 | SkDynamicMemoryWStream* cmap) { |
| 108 | // PDF spec defines that every bf* list can have at most 100 entries. |
Hal Canary | 9e41c21 | 2018-09-03 12:00:23 -0400 | [diff] [blame] | 109 | for (size_t i = 0; i < bfrange.size(); i += 100) { |
| 110 | int count = SkToInt(bfrange.size() - i); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 111 | count = SkMin32(count, 100); |
Hal Canary | 8b68110 | 2018-09-05 22:32:41 -0400 | [diff] [blame] | 112 | SkWStreamWriteDecAsText(cmap, count); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 113 | cmap->writeText(" beginbfrange\n"); |
| 114 | for (int j = 0; j < count; ++j) { |
| 115 | cmap->writeText("<"); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 116 | write_glyph(cmap, multiByte, bfrange[i + j].fStart); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 117 | cmap->writeText("> <"); |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 118 | write_glyph(cmap, multiByte, bfrange[i + j].fEnd); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 119 | cmap->writeText("> <"); |
halcanary | f59d18a | 2016-09-16 14:44:57 -0700 | [diff] [blame] | 120 | SkPDFUtils::WriteUTF16beHex(cmap, bfrange[i + j].fUnicode); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 121 | cmap->writeText(">\n"); |
| 122 | } |
| 123 | cmap->writeText("endbfrange\n"); |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | // Generate <bfchar> and <bfrange> table according to PDF spec 1.4 and Adobe |
| 128 | // Technote 5014. |
| 129 | // The function is not static so we can test it in unit tests. |
| 130 | // |
| 131 | // Current implementation guarantees bfchar and bfrange entries do not overlap. |
| 132 | // |
| 133 | // Current implementation does not attempt aggresive optimizations against |
| 134 | // following case because the specification is not clear. |
| 135 | // |
| 136 | // 4 beginbfchar 1 beginbfchar |
| 137 | // <0003> <0013> <0020> <0014> |
| 138 | // <0005> <0015> to endbfchar |
| 139 | // <0007> <0017> 1 beginbfrange |
| 140 | // <0020> <0014> <0003> <0007> <0013> |
| 141 | // endbfchar endbfrange |
| 142 | // |
| 143 | // Adobe Technote 5014 said: "Code mappings (unlike codespace ranges) may |
| 144 | // overlap, but succeeding maps supersede preceding maps." |
| 145 | // |
| 146 | // In case of searching text in PDF, bfrange will have higher precedence so |
| 147 | // typing char id 0x0014 in search box will get glyph id 0x0004 first. However, |
| 148 | // the spec does not mention how will this kind of conflict being resolved. |
| 149 | // |
| 150 | // For the worst case (having 65536 continuous unicode and we use every other |
| 151 | // one of them), the possible savings by aggressive optimization is 416KB |
| 152 | // pre-compressed and does not provide enough motivation for implementation. |
Hal Canary | 46cc3da | 2018-05-09 11:50:34 -0400 | [diff] [blame] | 153 | void SkPDFAppendCmapSections(const SkUnichar* glyphToUnicode, |
halcanary | 530032a | 2016-08-18 14:22:52 -0700 | [diff] [blame] | 154 | const SkBitSet* subset, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 155 | SkDynamicMemoryWStream* cmap, |
| 156 | bool multiByteGlyphs, |
| 157 | SkGlyphID firstGlyphID, |
| 158 | SkGlyphID lastGlyphID) { |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 159 | int glyphOffset = 0; |
| 160 | if (!multiByteGlyphs) { |
| 161 | glyphOffset = firstGlyphID - 1; |
| 162 | } |
| 163 | |
Hal Canary | 9e41c21 | 2018-09-03 12:00:23 -0400 | [diff] [blame] | 164 | std::vector<BFChar> bfcharEntries; |
| 165 | std::vector<BFRange> bfrangeEntries; |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 166 | |
| 167 | BFRange currentRangeEntry = {0, 0, 0}; |
| 168 | bool rangeEmpty = true; |
Hal Canary | 46cc3da | 2018-05-09 11:50:34 -0400 | [diff] [blame] | 169 | const int limit = (int)lastGlyphID + 1 - glyphOffset; |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 170 | |
| 171 | for (int i = firstGlyphID - glyphOffset; i < limit + 1; ++i) { |
| 172 | bool inSubset = i < limit && |
| 173 | (subset == nullptr || subset->has(i + glyphOffset)); |
| 174 | if (!rangeEmpty) { |
| 175 | // PDF spec requires bfrange not changing the higher byte, |
| 176 | // e.g. <1035> <10FF> <2222> is ok, but |
| 177 | // <1035> <1100> <2222> is no good |
| 178 | bool inRange = |
| 179 | i == currentRangeEntry.fEnd + 1 && |
| 180 | i >> 8 == currentRangeEntry.fStart >> 8 && |
| 181 | i < limit && |
| 182 | glyphToUnicode[i + glyphOffset] == |
| 183 | currentRangeEntry.fUnicode + i - currentRangeEntry.fStart; |
| 184 | if (!inSubset || !inRange) { |
| 185 | if (currentRangeEntry.fEnd > currentRangeEntry.fStart) { |
Mike Reed | 5edcd31 | 2018-08-08 11:23:41 -0400 | [diff] [blame] | 186 | bfrangeEntries.push_back(currentRangeEntry); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 187 | } else { |
Hal Canary | 9e41c21 | 2018-09-03 12:00:23 -0400 | [diff] [blame] | 188 | bfcharEntries.push_back({currentRangeEntry.fStart, currentRangeEntry.fUnicode}); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 189 | } |
| 190 | rangeEmpty = true; |
| 191 | } |
| 192 | } |
| 193 | if (inSubset) { |
| 194 | currentRangeEntry.fEnd = i; |
| 195 | if (rangeEmpty) { |
| 196 | currentRangeEntry.fStart = i; |
| 197 | currentRangeEntry.fUnicode = glyphToUnicode[i + glyphOffset]; |
| 198 | rangeEmpty = false; |
| 199 | } |
| 200 | } |
| 201 | } |
| 202 | |
| 203 | // The spec requires all bfchar entries for a font must come before bfrange |
| 204 | // entries. |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 205 | append_bfchar_section(bfcharEntries, multiByteGlyphs, cmap); |
| 206 | append_bfrange_section(bfrangeEntries, multiByteGlyphs, cmap); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 207 | } |
| 208 | |
| 209 | sk_sp<SkPDFStream> SkPDFMakeToUnicodeCmap( |
Hal Canary | 46cc3da | 2018-05-09 11:50:34 -0400 | [diff] [blame] | 210 | const SkUnichar* glyphToUnicode, |
halcanary | 530032a | 2016-08-18 14:22:52 -0700 | [diff] [blame] | 211 | const SkBitSet* subset, |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 212 | bool multiByteGlyphs, |
| 213 | SkGlyphID firstGlyphID, |
| 214 | SkGlyphID lastGlyphID) { |
| 215 | SkDynamicMemoryWStream cmap; |
halcanary | 3d01c62 | 2016-08-31 12:52:35 -0700 | [diff] [blame] | 216 | append_tounicode_header(&cmap, multiByteGlyphs); |
halcanary | 8eccc30 | 2016-08-09 13:04:34 -0700 | [diff] [blame] | 217 | SkPDFAppendCmapSections(glyphToUnicode, subset, &cmap, multiByteGlyphs, |
| 218 | firstGlyphID, lastGlyphID); |
| 219 | append_cmap_footer(&cmap); |
| 220 | return sk_make_sp<SkPDFStream>( |
| 221 | std::unique_ptr<SkStreamAsset>(cmap.detachAsStream())); |
| 222 | } |