blob: 5f16176376290c1f140c05bd9e7b78e522058a46 [file] [log] [blame]
edisonn@google.com3aac1f92013-07-02 22:42:53 +00001#include "SkNativeParsedPDF.h"
edisonn@google.com571c70b2013-07-10 17:09:50 +00002#include "SkPdfNativeTokenizer.h"
3#include "SkPdfBasics.h"
edisonn@google.com571c70b2013-07-10 17:09:50 +00004#include "SkPdfObject.h"
edisonn@google.com3aac1f92013-07-02 22:42:53 +00005
edisonn@google.com571c70b2013-07-10 17:09:50 +00006#include <stdio.h>
7#include <string.h>
8#include <sys/types.h>
9#include <sys/stat.h>
edisonn@google.com3aac1f92013-07-02 22:42:53 +000010
edisonn@google.com571c70b2013-07-10 17:09:50 +000011#include "SkPdfFileTrailerDictionary_autogen.h"
12#include "SkPdfCatalogDictionary_autogen.h"
13#include "SkPdfPageObjectDictionary_autogen.h"
14#include "SkPdfPageTreeNodeDictionary_autogen.h"
15#include "SkPdfMapper_autogen.h"
16
17
18
// Returns the size in bytes that stat() reports for |filename|, narrowed to long,
// or -1 when stat() fails (e.g. the path does not exist).
static long getFileSize(const char* filename)
{
    struct stat info;
    if (stat(filename, &info) != 0) {
        return -1;
    }
    return (long)info.st_size;
}
25
edisonn@google.coma3356fc2013-07-10 18:20:06 +000026static unsigned char* lineHome(unsigned char* start, unsigned char* current) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000027 while (current > start && !isPdfEOL(*(current - 1))) {
28 current--;
29 }
30 return current;
31}
32
edisonn@google.coma3356fc2013-07-10 18:20:06 +000033static unsigned char* previousLineHome(unsigned char* start, unsigned char* current) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000034 if (current > start && isPdfEOL(*(current - 1))) {
35 current--;
36 }
37
38 // allows CR+LF, LF+CR but not two CR+CR or LF+LF
39 if (current > start && isPdfEOL(*(current - 1)) && *current != *(current - 1)) {
40 current--;
41 }
42
43 while (current > start && !isPdfEOL(*(current - 1))) {
44 current--;
45 }
46
47 return current;
48}
49
edisonn@google.coma3356fc2013-07-10 18:20:06 +000050static unsigned char* ignoreLine(unsigned char* current, unsigned char* end) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000051 while (current < end && !isPdfEOL(*current)) {
52 current++;
53 }
54 current++;
55 if (current < end && isPdfEOL(*current) && *current != *(current - 1)) {
56 current++;
57 }
58 return current;
59}
60
edisonn@google.com222382b2013-07-10 22:33:10 +000061SkNativeParsedPDF* gDoc = NULL;
edisonn@google.com571c70b2013-07-10 17:09:50 +000062
63// TODO(edisonn): NYI
64// TODO(edisonn): 3 constructuctors from URL, from stream, from file ...
65// TODO(edisonn): write one that accepts errors in the file and ignores/fixis them
66// TODO(edisonn): testing:
67// 1) run on a lot of file
68// 2) recoverable corupt file: remove endobj, endsteam, remove other keywords, use other white spaces, insert comments randomly, ...
69// 3) irrecoverable corrupt file
70SkNativeParsedPDF::SkNativeParsedPDF(const char* path) : fAllocator(new SkPdfAllocator()) {
edisonn@google.com222382b2013-07-10 22:33:10 +000071 gDoc = this;
edisonn@google.com571c70b2013-07-10 17:09:50 +000072 FILE* file = fopen(path, "r");
73 fContentLength = getFileSize(path);
edisonn@google.com222382b2013-07-10 22:33:10 +000074 fFileContent = new unsigned char[fContentLength + 1];
edisonn@google.com571c70b2013-07-10 17:09:50 +000075 fread(fFileContent, fContentLength, 1, file);
edisonn@google.com222382b2013-07-10 22:33:10 +000076 fFileContent[fContentLength] = '\0';
edisonn@google.com571c70b2013-07-10 17:09:50 +000077 fclose(file);
78 file = NULL;
79
80 unsigned char* eofLine = lineHome(fFileContent, fFileContent + fContentLength - 1);
81 unsigned char* xrefByteOffsetLine = previousLineHome(fFileContent, eofLine);
82 unsigned char* xrefstartKeywordLine = previousLineHome(fFileContent, xrefByteOffsetLine);
83
84 if (strcmp((char*)xrefstartKeywordLine, "startxref") != 0) {
85 // TODO(edisonn): report/issue
86 }
87
88 long xrefByteOffset = atol((const char*)xrefByteOffsetLine);
89
90 bool storeCatalog = true;
91 while (xrefByteOffset >= 0) {
92 unsigned char* trailerStart = readCrossReferenceSection(fFileContent + xrefByteOffset, xrefstartKeywordLine);
93 xrefByteOffset = readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog);
94 storeCatalog = false;
95 }
96
97 // TODO(edisonn): warn/error expect fObjects[fRefCatalogId].fGeneration == fRefCatalogGeneration
98 // TODO(edisonn): security, verify that SkPdfCatalogDictionary is indeed using mapper
99 // load catalog
100 fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef);
101 SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
102
103 fillPages(tree);
104
105 // now actually read all objects if we want, or do it lazyly
106 // and resolve references?... or not ...
107}
108
109// TODO(edisonn): NYI
edisonn@google.com3aac1f92013-07-02 22:42:53 +0000110SkNativeParsedPDF::~SkNativeParsedPDF() {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000111 delete[] fFileContent;
112 delete fAllocator;
113}
114
115unsigned char* SkNativeParsedPDF::readCrossReferenceSection(unsigned char* xrefStart, unsigned char* trailerEnd) {
116 unsigned char* current = ignoreLine(xrefStart, trailerEnd); // TODO(edisonn): verify next keyord is "xref", use nextObject here
117
118 SkPdfObject token;
119 while (current < trailerEnd) {
120 token.reset();
121 unsigned char* previous = current;
122 current = nextObject(current, trailerEnd, &token, NULL);
123 if (!token.isInteger()) {
124 return previous;
125 }
126
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000127 int startId = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000128 token.reset();
129 current = nextObject(current, trailerEnd, &token, NULL);
130
131 if (!token.isInteger()) {
132 // TODO(edisonn): report/warning
133 return current;
134 }
135
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000136 int entries = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000137
138 for (int i = 0; i < entries; i++) {
139 token.reset();
140 current = nextObject(current, trailerEnd, &token, NULL);
141 if (!token.isInteger()) {
142 // TODO(edisonn): report/warning
143 return current;
144 }
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000145 int offset = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000146
147 token.reset();
148 current = nextObject(current, trailerEnd, &token, NULL);
149 if (!token.isInteger()) {
150 // TODO(edisonn): report/warning
151 return current;
152 }
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000153 int generation = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000154
155 token.reset();
156 current = nextObject(current, trailerEnd, &token, NULL);
157 if (!token.isKeyword() || token.len() != 1 || (*token.c_str() != 'f' && *token.c_str() != 'n')) {
158 // TODO(edisonn): report/warning
159 return current;
160 }
161
162 addCrossSectionInfo(startId + i, generation, offset, *token.c_str() == 'f');
163 }
164 }
165 // TODO(edisonn): it should never get here? there is no trailer?
166 return current;
167}
168
169long SkNativeParsedPDF::readTrailer(unsigned char* trailerStart, unsigned char* trailerEnd, bool storeCatalog) {
170 unsigned char* current = ignoreLine(trailerStart, trailerEnd); // TODO(edisonn): verify next keyord is "trailer" use nextObject here
171
172 SkPdfObject token;
173 current = nextObject(current, trailerEnd, &token, fAllocator);
174 SkPdfFileTrailerDictionary* trailer = (SkPdfFileTrailerDictionary*)&token;
175
176 if (storeCatalog) {
177 const SkPdfObject* ref = trailer->Root(NULL);
178 if (ref == NULL || !ref->isReference()) {
179 // TODO(edisonn): oops, we have to fix the corrup pdf file
180 return -1;
181 }
182 fRootCatalogRef = ref;
183 }
184
185 if (trailer->has_Prev()) {
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000186 return (long)trailer->Prev(NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000187 }
188
189 return -1;
190}
191
192void SkNativeParsedPDF::addCrossSectionInfo(int id, int generation, int offset, bool isFreed) {
193 // TODO(edisonn): security here
194 while (fObjects.count() < id + 1) {
195 reset(fObjects.append());
196 }
197
198 fObjects[id].fOffset = offset;
199 fObjects[id].fObj = NULL;
200}
201
202SkPdfObject* SkNativeParsedPDF::readObject(int id/*, int expectedGeneration*/) const {
203 long startOffset = fObjects[id].fOffset;
204 //long endOffset = fObjects[id].fOffsetEnd;
205 // TODO(edisonn): use hinted endOffset
206 // TODO(edisonn): current implementation will result in a lot of memory usage
207 // to decrease memory usage, we wither need to be smart and know where objects end, and we will
208 // alocate only the chancks needed, or the tokenizer will not make copies, but then it needs to
209 // cache the results so it does not go twice on the same buffer
210 unsigned char* current = fFileContent + startOffset;
211 unsigned char* end = fFileContent + fContentLength;
212
213 SkPdfNativeTokenizer tokenizer(current, end - current, fMapper, fAllocator);
214
215 SkPdfObject idObj;
216 SkPdfObject generationObj;
217 SkPdfObject objKeyword;
218 SkPdfObject* dict = fAllocator->allocObject();
219
220 current = nextObject(current, end, &idObj, NULL);
221 if (current >= end) {
222 // TODO(edisonn): report warning/error
223 return NULL;
224 }
225
226 current = nextObject(current, end, &generationObj, NULL);
227 if (current >= end) {
228 // TODO(edisonn): report warning/error
229 return NULL;
230 }
231
232 current = nextObject(current, end, &objKeyword, NULL);
233 if (current >= end) {
234 // TODO(edisonn): report warning/error
235 return NULL;
236 }
237
238 if (!idObj.isInteger() || !generationObj.isInteger() || id != idObj.intValue()/* || generation != generationObj.intValue()*/) {
239 // TODO(edisonn): report warning/error
240 }
241
242 if (!objKeyword.isKeyword() || strcmp(objKeyword.c_str(), "obj") != 0) {
243 // TODO(edisonn): report warning/error
244 }
245
246 current = nextObject(current, end, dict, fAllocator);
247
248 // TODO(edisonn): report warning/error - verify last token is endobj
249
250 return dict;
251}
252
253void SkNativeParsedPDF::fillPages(SkPdfPageTreeNodeDictionary* tree) {
254 const SkPdfArray* kids = tree->Kids(this);
255 if (kids == NULL) {
256 *fPages.append() = (SkPdfPageObjectDictionary*)tree;
257 return;
258 }
259
260 int cnt = kids->size();
261 for (int i = 0; i < cnt; i++) {
262 const SkPdfObject* obj = resolveReference(kids->objAtAIndex(i));
263 if (fMapper->mapPageObjectDictionary(obj) != kPageObjectDictionary_SkPdfObjectType) {
264 *fPages.append() = (SkPdfPageObjectDictionary*)obj;
265 } else {
266 // TODO(edisonn): verify that it is a page tree indeed
267 fillPages((SkPdfPageTreeNodeDictionary*)obj);
268 }
269 }
270}
271
272int SkNativeParsedPDF::pages() const {
273 return fPages.count();
274}
275
276SkPdfResourceDictionary* SkNativeParsedPDF::pageResources(int page) {
277 return fPages[page]->Resources(this);
278}
279
280// TODO(edisonn): Partial implemented. Move the logics directly in the code generator for inheritable and default value?
281SkRect SkNativeParsedPDF::MediaBox(int page) const {
282 SkPdfPageObjectDictionary* current = fPages[page];
283 while (!current->has_MediaBox() && current->has_Parent()) {
284 current = (SkPdfPageObjectDictionary*)current->Parent(this);
285 }
286 if (current) {
287 return current->MediaBox(this);
288 }
289 return SkRect::MakeEmpty();
290}
291
292// TODO(edisonn): stream or array ... ? for now only array
293SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfPage(int page) const {
294 if (fPages[page]->isContentsAStream(this)) {
295 return tokenizerOfStream(fPages[page]->getContentsAsStream(this));
296 } else {
297 // TODO(edisonn): NYI, we need to concatenate all streams in the array or make the tokenizer smart
298 // so we don't allocate new memory
299 return NULL;
300 }
301}
302
303SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfStream(SkPdfObject* stream) const {
304 if (stream == NULL) {
305 return NULL;
306 }
307
308 return new SkPdfNativeTokenizer(stream, fMapper, fAllocator);
309}
310
311// TODO(edisonn): NYI
312SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfBuffer(unsigned char* buffer, size_t len) const {
313 // warning does not track two calls in the same buffer! the buffer is updated!
314 // make a clean copy if needed!
315 return new SkPdfNativeTokenizer(buffer, len, fMapper, fAllocator);
316}
317
318size_t SkNativeParsedPDF::objects() const {
319 return fObjects.count();
320}
321
322SkPdfObject* SkNativeParsedPDF::object(int i) {
323 SkASSERT(!(i < 0 || i > fObjects.count()));
324
325 if (i < 0 || i > fObjects.count()) {
326 return NULL;
327 }
328
329 if (fObjects[i].fObj == NULL) {
330 // TODO(edisonn): when we read the cross reference sections, store the start of the next object
331 // and fill fOffsetEnd
332 fObjects[i].fObj = readObject(i);
333 }
334
335 return fObjects[i].fObj;
336}
337
338const SkPdfMapper* SkNativeParsedPDF::mapper() const {
339 return fMapper;
340}
341
342SkPdfReal* SkNativeParsedPDF::createReal(double value) const {
343 SkPdfObject* obj = fAllocator->allocObject();
344 SkPdfObject::makeReal(value, obj);
345 return (SkPdfReal*)obj;
346}
347
348SkPdfInteger* SkNativeParsedPDF::createInteger(int value) const {
349 SkPdfObject* obj = fAllocator->allocObject();
350 SkPdfObject::makeInteger(value, obj);
351 return (SkPdfInteger*)obj;
352}
353
354SkPdfString* SkNativeParsedPDF::createString(unsigned char* sz, size_t len) const {
355 SkPdfObject* obj = fAllocator->allocObject();
356 SkPdfObject::makeString(sz, len, obj);
357 return (SkPdfString*)obj;
358}
359
edisonn@google.com571c70b2013-07-10 17:09:50 +0000360SkPdfAllocator* SkNativeParsedPDF::allocator() const {
361 return fAllocator;
362}
363
364SkPdfObject* SkNativeParsedPDF::resolveReference(SkPdfObject* ref) const {
365 return (SkPdfObject*)resolveReference((const SkPdfObject*)ref);
366}
367
368// TODO(edisonn): fix infinite loop if ref to itself!
369// TODO(edisonn): perf, fix refs at load, and resolve will simply return fResolvedReference?
370SkPdfObject* SkNativeParsedPDF::resolveReference(const SkPdfObject* ref) const {
371 if (ref && ref->isReference()) {
372 int id = ref->referenceId();
373 // TODO(edisonn): generation/updates not supported now
374 //int gen = ref->referenceGeneration();
375
376 SkASSERT(!(id < 0 || id > fObjects.count()));
377
378 if (id < 0 || id > fObjects.count()) {
379 return NULL;
380 }
381
382 // TODO(edisonn): verify id and gen expected
383
384 if (fObjects[id].fResolvedReference != NULL) {
385 return fObjects[id].fResolvedReference;
386 }
387
388 if (fObjects[id].fObj == NULL) {
389 fObjects[id].fObj = readObject(id);
390 }
391
392 if (fObjects[id].fResolvedReference == NULL) {
393 if (!fObjects[id].fObj->isReference()) {
394 fObjects[id].fResolvedReference = fObjects[id].fObj;
395 } else {
396 fObjects[id].fResolvedReference = resolveReference(fObjects[id].fObj);
397 }
398 }
399
400 return fObjects[id].fResolvedReference;
401 }
402 // TODO(edisonn): fix the mess with const, probably we need to remove it pretty much everywhere
403 return (SkPdfObject*)ref;
edisonn@google.com3aac1f92013-07-02 22:42:53 +0000404}