blob: 709f322b3a288dbe7156e57f203e6638b0b57d0e [file] [log] [blame]
halcanary8eccc302016-08-09 13:04:34 -07001/*
2 * Copyright 2011 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "SkPDFMakeToUnicodeCmap.h"
Hal Canaryc640d0d2018-06-13 09:59:02 -04009
halcanary8eccc302016-08-09 13:04:34 -070010#include "SkPDFUtils.h"
Hal Canaryc640d0d2018-06-13 09:59:02 -040011#include "SkTo.h"
Hal Canaryea60b952018-08-21 11:45:46 -040012#include "SkUTF.h"
halcanary8eccc302016-08-09 13:04:34 -070013
14static void append_tounicode_header(SkDynamicMemoryWStream* cmap,
halcanary3d01c622016-08-31 12:52:35 -070015 bool multibyte) {
halcanary8eccc302016-08-09 13:04:34 -070016 // 12 dict begin: 12 is an Adobe-suggested value. Shall not change.
17 // It's there to prevent old version Adobe Readers from malfunctioning.
18 const char* kHeader =
19 "/CIDInit /ProcSet findresource begin\n"
20 "12 dict begin\n"
21 "begincmap\n";
22 cmap->writeText(kHeader);
23
24 // The /CIDSystemInfo must be consistent to the one in
25 // SkPDFFont::populateCIDFont().
26 // We can not pass over the system info object here because the format is
27 // different. This is not a reference object.
28 const char* kSysInfo =
29 "/CIDSystemInfo\n"
halcanary59be20c2016-09-01 14:10:00 -070030 "<< /Registry (Adobe)\n"
31 "/Ordering (UCS)\n"
halcanary8eccc302016-08-09 13:04:34 -070032 "/Supplement 0\n"
33 ">> def\n";
34 cmap->writeText(kSysInfo);
35
36 // The CMapName must be consistent to /CIDSystemInfo above.
37 // /CMapType 2 means ToUnicode.
38 // Codespace range just tells the PDF processor the valid range.
39 const char* kTypeInfoHeader =
halcanary59be20c2016-09-01 14:10:00 -070040 "/CMapName /Adobe-Identity-UCS def\n"
halcanary8eccc302016-08-09 13:04:34 -070041 "/CMapType 2 def\n"
42 "1 begincodespacerange\n";
43 cmap->writeText(kTypeInfoHeader);
halcanary3d01c622016-08-31 12:52:35 -070044 if (multibyte) {
45 cmap->writeText("<0000> <FFFF>\n");
46 } else {
47 cmap->writeText("<00> <FF>\n");
48 }
49 cmap->writeText("endcodespacerange\n");
halcanary8eccc302016-08-09 13:04:34 -070050}
51
52static void append_cmap_footer(SkDynamicMemoryWStream* cmap) {
53 const char kFooter[] =
54 "endcmap\n"
55 "CMapName currentdict /CMap defineresource pop\n"
56 "end\n"
57 "end";
58 cmap->writeText(kFooter);
59}
60
61namespace {
62struct BFChar {
63 SkGlyphID fGlyphId;
64 SkUnichar fUnicode;
65};
66
67struct BFRange {
68 SkGlyphID fStart;
69 SkGlyphID fEnd;
70 SkUnichar fUnicode;
71};
72} // namespace
73
halcanary3d01c622016-08-31 12:52:35 -070074static void write_glyph(SkDynamicMemoryWStream* cmap,
75 bool multiByte,
76 SkGlyphID gid) {
77 if (multiByte) {
78 SkPDFUtils::WriteUInt16BE(cmap, gid);
79 } else {
80 SkPDFUtils::WriteUInt8(cmap, SkToU8(gid));
81 }
82}
83
Hal Canary9e41c212018-09-03 12:00:23 -040084static void append_bfchar_section(const std::vector<BFChar>& bfchar,
halcanary3d01c622016-08-31 12:52:35 -070085 bool multiByte,
halcanary8eccc302016-08-09 13:04:34 -070086 SkDynamicMemoryWStream* cmap) {
87 // PDF spec defines that every bf* list can have at most 100 entries.
Hal Canary9e41c212018-09-03 12:00:23 -040088 for (size_t i = 0; i < bfchar.size(); i += 100) {
89 int count = SkToInt(bfchar.size() - i);
halcanary8eccc302016-08-09 13:04:34 -070090 count = SkMin32(count, 100);
Cary Clark60ebf142018-09-06 12:22:33 +000091 cmap->writeDecAsText(count);
halcanary8eccc302016-08-09 13:04:34 -070092 cmap->writeText(" beginbfchar\n");
93 for (int j = 0; j < count; ++j) {
94 cmap->writeText("<");
halcanary3d01c622016-08-31 12:52:35 -070095 write_glyph(cmap, multiByte, bfchar[i + j].fGlyphId);
halcanary8eccc302016-08-09 13:04:34 -070096 cmap->writeText("> <");
halcanaryf59d18a2016-09-16 14:44:57 -070097 SkPDFUtils::WriteUTF16beHex(cmap, bfchar[i + j].fUnicode);
halcanary8eccc302016-08-09 13:04:34 -070098 cmap->writeText(">\n");
99 }
100 cmap->writeText("endbfchar\n");
101 }
102}
103
Hal Canary9e41c212018-09-03 12:00:23 -0400104static void append_bfrange_section(const std::vector<BFRange>& bfrange,
halcanary3d01c622016-08-31 12:52:35 -0700105 bool multiByte,
halcanary8eccc302016-08-09 13:04:34 -0700106 SkDynamicMemoryWStream* cmap) {
107 // PDF spec defines that every bf* list can have at most 100 entries.
Hal Canary9e41c212018-09-03 12:00:23 -0400108 for (size_t i = 0; i < bfrange.size(); i += 100) {
109 int count = SkToInt(bfrange.size() - i);
halcanary8eccc302016-08-09 13:04:34 -0700110 count = SkMin32(count, 100);
Cary Clark60ebf142018-09-06 12:22:33 +0000111 cmap->writeDecAsText(count);
halcanary8eccc302016-08-09 13:04:34 -0700112 cmap->writeText(" beginbfrange\n");
113 for (int j = 0; j < count; ++j) {
114 cmap->writeText("<");
halcanary3d01c622016-08-31 12:52:35 -0700115 write_glyph(cmap, multiByte, bfrange[i + j].fStart);
halcanary8eccc302016-08-09 13:04:34 -0700116 cmap->writeText("> <");
halcanary3d01c622016-08-31 12:52:35 -0700117 write_glyph(cmap, multiByte, bfrange[i + j].fEnd);
halcanary8eccc302016-08-09 13:04:34 -0700118 cmap->writeText("> <");
halcanaryf59d18a2016-09-16 14:44:57 -0700119 SkPDFUtils::WriteUTF16beHex(cmap, bfrange[i + j].fUnicode);
halcanary8eccc302016-08-09 13:04:34 -0700120 cmap->writeText(">\n");
121 }
122 cmap->writeText("endbfrange\n");
123 }
124}
125
// Generate <bfchar> and <bfrange> tables according to PDF spec 1.4 and Adobe
// Technote 5014.
// The function is not static so we can test it in unit tests.
//
// The current implementation guarantees that bfchar and bfrange entries do
// not overlap.
//
// The current implementation does not attempt aggressive optimizations for
// the following case because the specification is not clear.
//
//   4 beginbfchar          1 beginbfchar
//   <0003> <0013>          <0020> <0014>
//   <0005> <0015>    to    endbfchar
//   <0007> <0017>          1 beginbfrange
//   <0020> <0014>          <0003> <0007> <0013>
//   endbfchar              endbfrange
//
// Adobe Technote 5014 says: "Code mappings (unlike codespace ranges) may
// overlap, but succeeding maps supersede preceding maps."
//
// When searching text in a PDF, the bfrange will have higher precedence, so
// typing char id 0x0014 in the search box will get glyph id 0x0004 first.
// However, the spec does not say how this kind of conflict is resolved.
//
// In the worst case (65536 contiguous Unicode values of which we use every
// other one), the possible saving from aggressive optimization is 416KB
// pre-compression, which is not enough motivation to implement it.
Hal Canary46cc3da2018-05-09 11:50:34 -0400152void SkPDFAppendCmapSections(const SkUnichar* glyphToUnicode,
Hal Canary31355982018-10-19 12:21:41 -0400153 const SkPDFGlyphUse* subset,
halcanary8eccc302016-08-09 13:04:34 -0700154 SkDynamicMemoryWStream* cmap,
155 bool multiByteGlyphs,
156 SkGlyphID firstGlyphID,
157 SkGlyphID lastGlyphID) {
halcanary8eccc302016-08-09 13:04:34 -0700158 int glyphOffset = 0;
159 if (!multiByteGlyphs) {
160 glyphOffset = firstGlyphID - 1;
161 }
162
Hal Canary9e41c212018-09-03 12:00:23 -0400163 std::vector<BFChar> bfcharEntries;
164 std::vector<BFRange> bfrangeEntries;
halcanary8eccc302016-08-09 13:04:34 -0700165
166 BFRange currentRangeEntry = {0, 0, 0};
167 bool rangeEmpty = true;
Hal Canary46cc3da2018-05-09 11:50:34 -0400168 const int limit = (int)lastGlyphID + 1 - glyphOffset;
halcanary8eccc302016-08-09 13:04:34 -0700169
170 for (int i = firstGlyphID - glyphOffset; i < limit + 1; ++i) {
Hal Canary31355982018-10-19 12:21:41 -0400171 SkGlyphID gid = i + glyphOffset;
172 bool inSubset = i < limit && (subset == nullptr || subset->has(gid));
halcanary8eccc302016-08-09 13:04:34 -0700173 if (!rangeEmpty) {
174 // PDF spec requires bfrange not changing the higher byte,
175 // e.g. <1035> <10FF> <2222> is ok, but
176 // <1035> <1100> <2222> is no good
177 bool inRange =
178 i == currentRangeEntry.fEnd + 1 &&
179 i >> 8 == currentRangeEntry.fStart >> 8 &&
180 i < limit &&
Hal Canary31355982018-10-19 12:21:41 -0400181 glyphToUnicode[gid] ==
halcanary8eccc302016-08-09 13:04:34 -0700182 currentRangeEntry.fUnicode + i - currentRangeEntry.fStart;
183 if (!inSubset || !inRange) {
184 if (currentRangeEntry.fEnd > currentRangeEntry.fStart) {
Mike Reed5edcd312018-08-08 11:23:41 -0400185 bfrangeEntries.push_back(currentRangeEntry);
halcanary8eccc302016-08-09 13:04:34 -0700186 } else {
Hal Canary9e41c212018-09-03 12:00:23 -0400187 bfcharEntries.push_back({currentRangeEntry.fStart, currentRangeEntry.fUnicode});
halcanary8eccc302016-08-09 13:04:34 -0700188 }
189 rangeEmpty = true;
190 }
191 }
192 if (inSubset) {
193 currentRangeEntry.fEnd = i;
194 if (rangeEmpty) {
195 currentRangeEntry.fStart = i;
Hal Canary31355982018-10-19 12:21:41 -0400196 currentRangeEntry.fUnicode = glyphToUnicode[gid];
halcanary8eccc302016-08-09 13:04:34 -0700197 rangeEmpty = false;
198 }
199 }
200 }
201
202 // The spec requires all bfchar entries for a font must come before bfrange
203 // entries.
halcanary3d01c622016-08-31 12:52:35 -0700204 append_bfchar_section(bfcharEntries, multiByteGlyphs, cmap);
205 append_bfrange_section(bfrangeEntries, multiByteGlyphs, cmap);
halcanary8eccc302016-08-09 13:04:34 -0700206}
207
Hal Canary9a3f5542018-12-10 19:59:07 -0500208std::unique_ptr<SkStreamAsset> SkPDFMakeToUnicodeCmap(
Hal Canary46cc3da2018-05-09 11:50:34 -0400209 const SkUnichar* glyphToUnicode,
Hal Canary31355982018-10-19 12:21:41 -0400210 const SkPDFGlyphUse* subset,
halcanary8eccc302016-08-09 13:04:34 -0700211 bool multiByteGlyphs,
212 SkGlyphID firstGlyphID,
213 SkGlyphID lastGlyphID) {
214 SkDynamicMemoryWStream cmap;
halcanary3d01c622016-08-31 12:52:35 -0700215 append_tounicode_header(&cmap, multiByteGlyphs);
halcanary8eccc302016-08-09 13:04:34 -0700216 SkPDFAppendCmapSections(glyphToUnicode, subset, &cmap, multiByteGlyphs,
217 firstGlyphID, lastGlyphID);
218 append_cmap_footer(&cmap);
Hal Canary9a3f5542018-12-10 19:59:07 -0500219 return cmap.detachAsStream();
halcanary8eccc302016-08-09 13:04:34 -0700220}