blob: fe5d0ee830e986d1a3de6b210140b60db945ccb4 [file] [log] [blame]
edisonn@google.com3aac1f92013-07-02 22:42:53 +00001#include "SkNativeParsedPDF.h"
edisonn@google.com571c70b2013-07-10 17:09:50 +00002#include "SkPdfNativeTokenizer.h"
3#include "SkPdfBasics.h"
edisonn@google.com571c70b2013-07-10 17:09:50 +00004#include "SkPdfObject.h"
edisonn@google.com3aac1f92013-07-02 22:42:53 +00005
edisonn@google.com571c70b2013-07-10 17:09:50 +00006#include <stdio.h>
7#include <string.h>
8#include <sys/types.h>
9#include <sys/stat.h>
edisonn@google.com3aac1f92013-07-02 22:42:53 +000010
edisonn@google.com571c70b2013-07-10 17:09:50 +000011#include "SkPdfFileTrailerDictionary_autogen.h"
12#include "SkPdfCatalogDictionary_autogen.h"
13#include "SkPdfPageObjectDictionary_autogen.h"
14#include "SkPdfPageTreeNodeDictionary_autogen.h"
15#include "SkPdfMapper_autogen.h"
16
17
18
// Returns the size, in bytes, that stat() reports for |filename|, or -1 when stat() fails.
static long getFileSize(const char* filename)
{
    struct stat info;
    if (stat(filename, &info) != 0) {
        return -1;
    }
    return (long)info.st_size;
}
25
edisonn@google.coma3356fc2013-07-10 18:20:06 +000026static unsigned char* lineHome(unsigned char* start, unsigned char* current) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000027 while (current > start && !isPdfEOL(*(current - 1))) {
28 current--;
29 }
30 return current;
31}
32
edisonn@google.coma3356fc2013-07-10 18:20:06 +000033static unsigned char* previousLineHome(unsigned char* start, unsigned char* current) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000034 if (current > start && isPdfEOL(*(current - 1))) {
35 current--;
36 }
37
38 // allows CR+LF, LF+CR but not two CR+CR or LF+LF
39 if (current > start && isPdfEOL(*(current - 1)) && *current != *(current - 1)) {
40 current--;
41 }
42
43 while (current > start && !isPdfEOL(*(current - 1))) {
44 current--;
45 }
46
47 return current;
48}
49
edisonn@google.coma3356fc2013-07-10 18:20:06 +000050static unsigned char* ignoreLine(unsigned char* current, unsigned char* end) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000051 while (current < end && !isPdfEOL(*current)) {
52 current++;
53 }
54 current++;
55 if (current < end && isPdfEOL(*current) && *current != *(current - 1)) {
56 current++;
57 }
58 return current;
59}
60
// Global pointer to a parsed document. The SkNativeParsedPDF constructor in this file stores
// |this| here, so it refers to the most recently constructed document.
// NOTE(review): it must not be used once that document has been destroyed.
edisonn@google.com222382b2013-07-10 22:33:10 +000061SkNativeParsedPDF* gDoc = NULL;
edisonn@google.com571c70b2013-07-10 17:09:50 +000062
63// TODO(edisonn): NYI
64// TODO(edisonn): 3 constructuctors from URL, from stream, from file ...
65// TODO(edisonn): write one that accepts errors in the file and ignores/fixis them
66// TODO(edisonn): testing:
67// 1) run on a lot of file
68// 2) recoverable corupt file: remove endobj, endsteam, remove other keywords, use other white spaces, insert comments randomly, ...
69// 3) irrecoverable corrupt file
edisonn@google.com432640a2013-07-10 22:53:40 +000070SkNativeParsedPDF::SkNativeParsedPDF(const char* path)
71 : fAllocator(new SkPdfAllocator())
72 , fRootCatalogRef(NULL)
73 , fRootCatalog(NULL) {
edisonn@google.com222382b2013-07-10 22:33:10 +000074 gDoc = this;
edisonn@google.com571c70b2013-07-10 17:09:50 +000075 FILE* file = fopen(path, "r");
76 fContentLength = getFileSize(path);
edisonn@google.com222382b2013-07-10 22:33:10 +000077 fFileContent = new unsigned char[fContentLength + 1];
edisonn@google.com620edc52013-07-18 13:03:03 +000078 bool ok = (0 != fread(fFileContent, fContentLength, 1, file));
edisonn@google.com222382b2013-07-10 22:33:10 +000079 fFileContent[fContentLength] = '\0';
edisonn@google.com571c70b2013-07-10 17:09:50 +000080 fclose(file);
81 file = NULL;
82
edisonn@google.com620edc52013-07-18 13:03:03 +000083 if (!ok) {
84 // TODO(edisonn): report read error
85 return; // Doc will have 0 pages
86 }
87
edisonn@google.com571c70b2013-07-10 17:09:50 +000088 unsigned char* eofLine = lineHome(fFileContent, fFileContent + fContentLength - 1);
89 unsigned char* xrefByteOffsetLine = previousLineHome(fFileContent, eofLine);
90 unsigned char* xrefstartKeywordLine = previousLineHome(fFileContent, xrefByteOffsetLine);
91
92 if (strcmp((char*)xrefstartKeywordLine, "startxref") != 0) {
93 // TODO(edisonn): report/issue
94 }
95
96 long xrefByteOffset = atol((const char*)xrefByteOffsetLine);
97
98 bool storeCatalog = true;
99 while (xrefByteOffset >= 0) {
100 unsigned char* trailerStart = readCrossReferenceSection(fFileContent + xrefByteOffset, xrefstartKeywordLine);
101 xrefByteOffset = readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog);
102 storeCatalog = false;
103 }
104
105 // TODO(edisonn): warn/error expect fObjects[fRefCatalogId].fGeneration == fRefCatalogGeneration
106 // TODO(edisonn): security, verify that SkPdfCatalogDictionary is indeed using mapper
107 // load catalog
edisonn@google.com571c70b2013-07-10 17:09:50 +0000108
edisonn@google.com432640a2013-07-10 22:53:40 +0000109 if (fRootCatalogRef) {
110 fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef);
edisonn@google.com8bad7372013-07-10 23:36:56 +0000111 if (fRootCatalog->isDictionary() && fRootCatalog->valid()) {
112 SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
113 if (tree && tree->isDictionary() && tree->valid()) {
114 fillPages(tree);
115 }
116 }
edisonn@google.com432640a2013-07-10 22:53:40 +0000117 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000118
edisonn@google.com8bad7372013-07-10 23:36:56 +0000119 // TODO(edisonn): corrupted pdf, read it from beginning and rebuild (xref, trailer, or just reall all objects)
120 // 0 pages
121
edisonn@google.com571c70b2013-07-10 17:09:50 +0000122 // now actually read all objects if we want, or do it lazyly
123 // and resolve references?... or not ...
124}
125
126// TODO(edisonn): NYI
edisonn@google.com3aac1f92013-07-02 22:42:53 +0000127SkNativeParsedPDF::~SkNativeParsedPDF() {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000128 delete[] fFileContent;
129 delete fAllocator;
130}
131
132unsigned char* SkNativeParsedPDF::readCrossReferenceSection(unsigned char* xrefStart, unsigned char* trailerEnd) {
133 unsigned char* current = ignoreLine(xrefStart, trailerEnd); // TODO(edisonn): verify next keyord is "xref", use nextObject here
134
135 SkPdfObject token;
136 while (current < trailerEnd) {
137 token.reset();
138 unsigned char* previous = current;
edisonn@google.com951d6532013-07-10 23:17:31 +0000139 current = nextObject(current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000140 if (!token.isInteger()) {
141 return previous;
142 }
143
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000144 int startId = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000145 token.reset();
edisonn@google.com951d6532013-07-10 23:17:31 +0000146 current = nextObject(current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000147
148 if (!token.isInteger()) {
149 // TODO(edisonn): report/warning
150 return current;
151 }
152
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000153 int entries = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000154
155 for (int i = 0; i < entries; i++) {
156 token.reset();
edisonn@google.com951d6532013-07-10 23:17:31 +0000157 current = nextObject(current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000158 if (!token.isInteger()) {
159 // TODO(edisonn): report/warning
160 return current;
161 }
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000162 int offset = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000163
164 token.reset();
edisonn@google.com951d6532013-07-10 23:17:31 +0000165 current = nextObject(current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000166 if (!token.isInteger()) {
167 // TODO(edisonn): report/warning
168 return current;
169 }
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000170 int generation = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000171
172 token.reset();
edisonn@google.com951d6532013-07-10 23:17:31 +0000173 current = nextObject(current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000174 if (!token.isKeyword() || token.len() != 1 || (*token.c_str() != 'f' && *token.c_str() != 'n')) {
175 // TODO(edisonn): report/warning
176 return current;
177 }
178
179 addCrossSectionInfo(startId + i, generation, offset, *token.c_str() == 'f');
180 }
181 }
182 // TODO(edisonn): it should never get here? there is no trailer?
183 return current;
184}
185
186long SkNativeParsedPDF::readTrailer(unsigned char* trailerStart, unsigned char* trailerEnd, bool storeCatalog) {
187 unsigned char* current = ignoreLine(trailerStart, trailerEnd); // TODO(edisonn): verify next keyord is "trailer" use nextObject here
188
189 SkPdfObject token;
edisonn@google.com951d6532013-07-10 23:17:31 +0000190 current = nextObject(current, trailerEnd, &token, fAllocator, NULL);
edisonn@google.com432640a2013-07-10 22:53:40 +0000191 if (!token.isDictionary()) {
192 return -1;
193 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000194 SkPdfFileTrailerDictionary* trailer = (SkPdfFileTrailerDictionary*)&token;
edisonn@google.com432640a2013-07-10 22:53:40 +0000195 if (!trailer->valid()) {
196 return -1;
197 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000198
199 if (storeCatalog) {
200 const SkPdfObject* ref = trailer->Root(NULL);
201 if (ref == NULL || !ref->isReference()) {
202 // TODO(edisonn): oops, we have to fix the corrup pdf file
203 return -1;
204 }
205 fRootCatalogRef = ref;
206 }
207
208 if (trailer->has_Prev()) {
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000209 return (long)trailer->Prev(NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000210 }
211
212 return -1;
213}
214
215void SkNativeParsedPDF::addCrossSectionInfo(int id, int generation, int offset, bool isFreed) {
216 // TODO(edisonn): security here
217 while (fObjects.count() < id + 1) {
218 reset(fObjects.append());
219 }
220
221 fObjects[id].fOffset = offset;
222 fObjects[id].fObj = NULL;
223}
224
edisonn@google.com951d6532013-07-10 23:17:31 +0000225SkPdfObject* SkNativeParsedPDF::readObject(int id/*, int expectedGeneration*/) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000226 long startOffset = fObjects[id].fOffset;
227 //long endOffset = fObjects[id].fOffsetEnd;
228 // TODO(edisonn): use hinted endOffset
229 // TODO(edisonn): current implementation will result in a lot of memory usage
230 // to decrease memory usage, we wither need to be smart and know where objects end, and we will
231 // alocate only the chancks needed, or the tokenizer will not make copies, but then it needs to
232 // cache the results so it does not go twice on the same buffer
233 unsigned char* current = fFileContent + startOffset;
234 unsigned char* end = fFileContent + fContentLength;
235
edisonn@google.com951d6532013-07-10 23:17:31 +0000236 SkPdfNativeTokenizer tokenizer(current, end - current, fMapper, fAllocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000237
238 SkPdfObject idObj;
239 SkPdfObject generationObj;
240 SkPdfObject objKeyword;
241 SkPdfObject* dict = fAllocator->allocObject();
242
edisonn@google.com951d6532013-07-10 23:17:31 +0000243 current = nextObject(current, end, &idObj, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000244 if (current >= end) {
245 // TODO(edisonn): report warning/error
246 return NULL;
247 }
248
edisonn@google.com951d6532013-07-10 23:17:31 +0000249 current = nextObject(current, end, &generationObj, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000250 if (current >= end) {
251 // TODO(edisonn): report warning/error
252 return NULL;
253 }
254
edisonn@google.com951d6532013-07-10 23:17:31 +0000255 current = nextObject(current, end, &objKeyword, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000256 if (current >= end) {
257 // TODO(edisonn): report warning/error
258 return NULL;
259 }
260
261 if (!idObj.isInteger() || !generationObj.isInteger() || id != idObj.intValue()/* || generation != generationObj.intValue()*/) {
262 // TODO(edisonn): report warning/error
263 }
264
265 if (!objKeyword.isKeyword() || strcmp(objKeyword.c_str(), "obj") != 0) {
266 // TODO(edisonn): report warning/error
267 }
268
edisonn@google.com951d6532013-07-10 23:17:31 +0000269 current = nextObject(current, end, dict, fAllocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000270
271 // TODO(edisonn): report warning/error - verify last token is endobj
272
273 return dict;
274}
275
276void SkNativeParsedPDF::fillPages(SkPdfPageTreeNodeDictionary* tree) {
277 const SkPdfArray* kids = tree->Kids(this);
278 if (kids == NULL) {
279 *fPages.append() = (SkPdfPageObjectDictionary*)tree;
280 return;
281 }
282
283 int cnt = kids->size();
284 for (int i = 0; i < cnt; i++) {
285 const SkPdfObject* obj = resolveReference(kids->objAtAIndex(i));
286 if (fMapper->mapPageObjectDictionary(obj) != kPageObjectDictionary_SkPdfObjectType) {
287 *fPages.append() = (SkPdfPageObjectDictionary*)obj;
288 } else {
289 // TODO(edisonn): verify that it is a page tree indeed
290 fillPages((SkPdfPageTreeNodeDictionary*)obj);
291 }
292 }
293}
294
295int SkNativeParsedPDF::pages() const {
296 return fPages.count();
297}
298
299SkPdfResourceDictionary* SkNativeParsedPDF::pageResources(int page) {
300 return fPages[page]->Resources(this);
301}
302
303// TODO(edisonn): Partial implemented. Move the logics directly in the code generator for inheritable and default value?
edisonn@google.com951d6532013-07-10 23:17:31 +0000304SkRect SkNativeParsedPDF::MediaBox(int page) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000305 SkPdfPageObjectDictionary* current = fPages[page];
306 while (!current->has_MediaBox() && current->has_Parent()) {
307 current = (SkPdfPageObjectDictionary*)current->Parent(this);
308 }
309 if (current) {
310 return current->MediaBox(this);
311 }
312 return SkRect::MakeEmpty();
313}
314
315// TODO(edisonn): stream or array ... ? for now only array
edisonn@google.com951d6532013-07-10 23:17:31 +0000316SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfPage(int page) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000317 if (fPages[page]->isContentsAStream(this)) {
318 return tokenizerOfStream(fPages[page]->getContentsAsStream(this));
319 } else {
320 // TODO(edisonn): NYI, we need to concatenate all streams in the array or make the tokenizer smart
321 // so we don't allocate new memory
322 return NULL;
323 }
324}
325
edisonn@google.com951d6532013-07-10 23:17:31 +0000326SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfStream(SkPdfObject* stream) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000327 if (stream == NULL) {
328 return NULL;
329 }
330
edisonn@google.com951d6532013-07-10 23:17:31 +0000331 return new SkPdfNativeTokenizer(stream, fMapper, fAllocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000332}
333
334// TODO(edisonn): NYI
edisonn@google.com951d6532013-07-10 23:17:31 +0000335SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfBuffer(unsigned char* buffer, size_t len) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000336 // warning does not track two calls in the same buffer! the buffer is updated!
337 // make a clean copy if needed!
edisonn@google.com951d6532013-07-10 23:17:31 +0000338 return new SkPdfNativeTokenizer(buffer, len, fMapper, fAllocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000339}
340
341size_t SkNativeParsedPDF::objects() const {
342 return fObjects.count();
343}
344
345SkPdfObject* SkNativeParsedPDF::object(int i) {
346 SkASSERT(!(i < 0 || i > fObjects.count()));
347
348 if (i < 0 || i > fObjects.count()) {
349 return NULL;
350 }
351
352 if (fObjects[i].fObj == NULL) {
353 // TODO(edisonn): when we read the cross reference sections, store the start of the next object
354 // and fill fOffsetEnd
355 fObjects[i].fObj = readObject(i);
356 }
357
358 return fObjects[i].fObj;
359}
360
361const SkPdfMapper* SkNativeParsedPDF::mapper() const {
362 return fMapper;
363}
364
365SkPdfReal* SkNativeParsedPDF::createReal(double value) const {
366 SkPdfObject* obj = fAllocator->allocObject();
367 SkPdfObject::makeReal(value, obj);
368 return (SkPdfReal*)obj;
369}
370
371SkPdfInteger* SkNativeParsedPDF::createInteger(int value) const {
372 SkPdfObject* obj = fAllocator->allocObject();
373 SkPdfObject::makeInteger(value, obj);
374 return (SkPdfInteger*)obj;
375}
376
377SkPdfString* SkNativeParsedPDF::createString(unsigned char* sz, size_t len) const {
378 SkPdfObject* obj = fAllocator->allocObject();
379 SkPdfObject::makeString(sz, len, obj);
380 return (SkPdfString*)obj;
381}
382
edisonn@google.com571c70b2013-07-10 17:09:50 +0000383SkPdfAllocator* SkNativeParsedPDF::allocator() const {
384 return fAllocator;
385}
386
edisonn@google.com571c70b2013-07-10 17:09:50 +0000387// TODO(edisonn): fix infinite loop if ref to itself!
388// TODO(edisonn): perf, fix refs at load, and resolve will simply return fResolvedReference?
edisonn@google.com951d6532013-07-10 23:17:31 +0000389SkPdfObject* SkNativeParsedPDF::resolveReference(const SkPdfObject* ref) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000390 if (ref && ref->isReference()) {
391 int id = ref->referenceId();
392 // TODO(edisonn): generation/updates not supported now
393 //int gen = ref->referenceGeneration();
394
395 SkASSERT(!(id < 0 || id > fObjects.count()));
396
397 if (id < 0 || id > fObjects.count()) {
398 return NULL;
399 }
400
401 // TODO(edisonn): verify id and gen expected
402
403 if (fObjects[id].fResolvedReference != NULL) {
404 return fObjects[id].fResolvedReference;
405 }
406
407 if (fObjects[id].fObj == NULL) {
408 fObjects[id].fObj = readObject(id);
409 }
410
411 if (fObjects[id].fResolvedReference == NULL) {
412 if (!fObjects[id].fObj->isReference()) {
413 fObjects[id].fResolvedReference = fObjects[id].fObj;
414 } else {
415 fObjects[id].fResolvedReference = resolveReference(fObjects[id].fObj);
416 }
417 }
418
419 return fObjects[id].fResolvedReference;
420 }
421 // TODO(edisonn): fix the mess with const, probably we need to remove it pretty much everywhere
422 return (SkPdfObject*)ref;
edisonn@google.com3aac1f92013-07-02 22:42:53 +0000423}
edisonn@google.coma5aaa792013-07-11 12:27:21 +0000424
edisonn@google.com7b328fd2013-07-11 12:53:06 +0000425size_t SkNativeParsedPDF::bytesUsed() const {
edisonn@google.coma5aaa792013-07-11 12:27:21 +0000426 return fAllocator->bytesUsed() +
427 fContentLength +
428 fObjects.count() * sizeof(PublicObjectEntry) +
429 fPages.count() * sizeof(SkPdfPageObjectDictionary*) +
430 sizeof(*this);
431}