blob: ac543040b0fdd394b25bce25327216061268bcf9 [file] [log] [blame]
#include "SkNativeParsedPDF.h"
#include "SkPdfNativeTokenizer.h"
#include "SkPdfBasics.h"
#include "SkPdfObject.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>

#include "SkPdfFileTrailerDictionary_autogen.h"
#include "SkPdfCatalogDictionary_autogen.h"
#include "SkPdfPageObjectDictionary_autogen.h"
#include "SkPdfPageTreeNodeDictionary_autogen.h"
#include "SkPdfMapper_autogen.h"
16
17
18
// Returns the size in bytes of the file at |filename| as reported by stat(),
// or -1 when stat() fails (for example, the file does not exist).
static long getFileSize(const char* filename) {
    struct stat info;
    if (stat(filename, &info) != 0) {
        return -1;
    }
    return (long)info.st_size;
}
25
edisonn@google.com2ccc3af2013-07-23 17:43:18 +000026static const unsigned char* lineHome(const unsigned char* start, const unsigned char* current) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000027 while (current > start && !isPdfEOL(*(current - 1))) {
28 current--;
29 }
30 return current;
31}
32
edisonn@google.com2ccc3af2013-07-23 17:43:18 +000033static const unsigned char* previousLineHome(const unsigned char* start, const unsigned char* current) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000034 if (current > start && isPdfEOL(*(current - 1))) {
35 current--;
36 }
37
38 // allows CR+LF, LF+CR but not two CR+CR or LF+LF
39 if (current > start && isPdfEOL(*(current - 1)) && *current != *(current - 1)) {
40 current--;
41 }
42
43 while (current > start && !isPdfEOL(*(current - 1))) {
44 current--;
45 }
46
47 return current;
48}
49
edisonn@google.com2ccc3af2013-07-23 17:43:18 +000050static const unsigned char* ignoreLine(const unsigned char* current, const unsigned char* end) {
edisonn@google.com571c70b2013-07-10 17:09:50 +000051 while (current < end && !isPdfEOL(*current)) {
52 current++;
53 }
54 current++;
55 if (current < end && isPdfEOL(*current) && *current != *(current - 1)) {
56 current++;
57 }
58 return current;
59}
60
edisonn@google.com222382b2013-07-10 22:33:10 +000061SkNativeParsedPDF* gDoc = NULL;
edisonn@google.com571c70b2013-07-10 17:09:50 +000062
63// TODO(edisonn): NYI
64// TODO(edisonn): 3 constructuctors from URL, from stream, from file ...
65// TODO(edisonn): write one that accepts errors in the file and ignores/fixis them
66// TODO(edisonn): testing:
67// 1) run on a lot of file
68// 2) recoverable corupt file: remove endobj, endsteam, remove other keywords, use other white spaces, insert comments randomly, ...
69// 3) irrecoverable corrupt file
edisonn@google.com432640a2013-07-10 22:53:40 +000070SkNativeParsedPDF::SkNativeParsedPDF(const char* path)
71 : fAllocator(new SkPdfAllocator())
72 , fRootCatalogRef(NULL)
73 , fRootCatalog(NULL) {
edisonn@google.com222382b2013-07-10 22:33:10 +000074 gDoc = this;
edisonn@google.com571c70b2013-07-10 17:09:50 +000075 FILE* file = fopen(path, "r");
76 fContentLength = getFileSize(path);
edisonn@google.com2ccc3af2013-07-23 17:43:18 +000077 unsigned char* content = new unsigned char[fContentLength + 1];
78 bool ok = (0 != fread(content, fContentLength, 1, file));
79 content[fContentLength] = '\0';
80 fFileContent = content;
edisonn@google.com571c70b2013-07-10 17:09:50 +000081 fclose(file);
82 file = NULL;
83
edisonn@google.com620edc52013-07-18 13:03:03 +000084 if (!ok) {
85 // TODO(edisonn): report read error
86 return; // Doc will have 0 pages
87 }
88
edisonn@google.com2ccc3af2013-07-23 17:43:18 +000089 const unsigned char* eofLine = lineHome(fFileContent, fFileContent + fContentLength - 1);
90 const unsigned char* xrefByteOffsetLine = previousLineHome(fFileContent, eofLine);
91 const unsigned char* xrefstartKeywordLine = previousLineHome(fFileContent, xrefByteOffsetLine);
edisonn@google.com571c70b2013-07-10 17:09:50 +000092
93 if (strcmp((char*)xrefstartKeywordLine, "startxref") != 0) {
94 // TODO(edisonn): report/issue
95 }
96
97 long xrefByteOffset = atol((const char*)xrefByteOffsetLine);
98
99 bool storeCatalog = true;
100 while (xrefByteOffset >= 0) {
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000101 const unsigned char* trailerStart = readCrossReferenceSection(fFileContent + xrefByteOffset, xrefstartKeywordLine);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000102 xrefByteOffset = readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog);
103 storeCatalog = false;
104 }
105
106 // TODO(edisonn): warn/error expect fObjects[fRefCatalogId].fGeneration == fRefCatalogGeneration
107 // TODO(edisonn): security, verify that SkPdfCatalogDictionary is indeed using mapper
108 // load catalog
edisonn@google.com571c70b2013-07-10 17:09:50 +0000109
edisonn@google.com432640a2013-07-10 22:53:40 +0000110 if (fRootCatalogRef) {
111 fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef);
edisonn@google.com8bad7372013-07-10 23:36:56 +0000112 if (fRootCatalog->isDictionary() && fRootCatalog->valid()) {
113 SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
114 if (tree && tree->isDictionary() && tree->valid()) {
115 fillPages(tree);
116 }
117 }
edisonn@google.com432640a2013-07-10 22:53:40 +0000118 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000119
edisonn@google.com8bad7372013-07-10 23:36:56 +0000120 // TODO(edisonn): corrupted pdf, read it from beginning and rebuild (xref, trailer, or just reall all objects)
121 // 0 pages
122
edisonn@google.com571c70b2013-07-10 17:09:50 +0000123 // now actually read all objects if we want, or do it lazyly
124 // and resolve references?... or not ...
125}
126
127// TODO(edisonn): NYI
edisonn@google.com3aac1f92013-07-02 22:42:53 +0000128SkNativeParsedPDF::~SkNativeParsedPDF() {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000129 delete[] fFileContent;
130 delete fAllocator;
131}
132
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000133const unsigned char* SkNativeParsedPDF::readCrossReferenceSection(const unsigned char* xrefStart, const unsigned char* trailerEnd) {
134 const unsigned char* current = ignoreLine(xrefStart, trailerEnd); // TODO(edisonn): verify next keyord is "xref", use nextObject here
edisonn@google.com571c70b2013-07-10 17:09:50 +0000135
136 SkPdfObject token;
137 while (current < trailerEnd) {
138 token.reset();
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000139 const unsigned char* previous = current;
140 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000141 if (!token.isInteger()) {
142 return previous;
143 }
144
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000145 int startId = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000146 token.reset();
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000147 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000148
149 if (!token.isInteger()) {
150 // TODO(edisonn): report/warning
151 return current;
152 }
153
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000154 int entries = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000155
156 for (int i = 0; i < entries; i++) {
157 token.reset();
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000158 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000159 if (!token.isInteger()) {
160 // TODO(edisonn): report/warning
161 return current;
162 }
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000163 int offset = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000164
165 token.reset();
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000166 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000167 if (!token.isInteger()) {
168 // TODO(edisonn): report/warning
169 return current;
170 }
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000171 int generation = (int)token.intValue();
edisonn@google.com571c70b2013-07-10 17:09:50 +0000172
173 token.reset();
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000174 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000175 if (!token.isKeyword() || token.len() != 1 || (*token.c_str() != 'f' && *token.c_str() != 'n')) {
176 // TODO(edisonn): report/warning
177 return current;
178 }
179
180 addCrossSectionInfo(startId + i, generation, offset, *token.c_str() == 'f');
181 }
182 }
183 // TODO(edisonn): it should never get here? there is no trailer?
184 return current;
185}
186
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000187long SkNativeParsedPDF::readTrailer(const unsigned char* trailerStart, const unsigned char* trailerEnd, bool storeCatalog) {
188 SkPdfObject trailerKeyword;
189 // TODO(edisonn): use null allocator, and let it just fail if memory
190 // needs allocated (but no crash)!
191 const unsigned char* current =
192 nextObject(0, trailerStart, trailerEnd, &trailerKeyword, fAllocator, NULL);
193
194 if (strlen("trailer") != trailerKeyword.len() &&
195 strncmp(trailerKeyword.c_str(), "trailer", strlen("trailer")) != 0) {
196 // TODO(edisonn): report warning, rebuild trailer from objects.
197 return -1;
198 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000199
200 SkPdfObject token;
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000201 current = nextObject(0, current, trailerEnd, &token, fAllocator, NULL);
edisonn@google.com432640a2013-07-10 22:53:40 +0000202 if (!token.isDictionary()) {
203 return -1;
204 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000205 SkPdfFileTrailerDictionary* trailer = (SkPdfFileTrailerDictionary*)&token;
edisonn@google.com432640a2013-07-10 22:53:40 +0000206 if (!trailer->valid()) {
207 return -1;
208 }
edisonn@google.com571c70b2013-07-10 17:09:50 +0000209
210 if (storeCatalog) {
211 const SkPdfObject* ref = trailer->Root(NULL);
212 if (ref == NULL || !ref->isReference()) {
213 // TODO(edisonn): oops, we have to fix the corrup pdf file
214 return -1;
215 }
216 fRootCatalogRef = ref;
217 }
218
219 if (trailer->has_Prev()) {
edisonn@google.coma3356fc2013-07-10 18:20:06 +0000220 return (long)trailer->Prev(NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000221 }
222
223 return -1;
224}
225
226void SkNativeParsedPDF::addCrossSectionInfo(int id, int generation, int offset, bool isFreed) {
227 // TODO(edisonn): security here
228 while (fObjects.count() < id + 1) {
229 reset(fObjects.append());
230 }
231
232 fObjects[id].fOffset = offset;
233 fObjects[id].fObj = NULL;
234}
235
edisonn@google.com951d6532013-07-10 23:17:31 +0000236SkPdfObject* SkNativeParsedPDF::readObject(int id/*, int expectedGeneration*/) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000237 long startOffset = fObjects[id].fOffset;
238 //long endOffset = fObjects[id].fOffsetEnd;
239 // TODO(edisonn): use hinted endOffset
240 // TODO(edisonn): current implementation will result in a lot of memory usage
241 // to decrease memory usage, we wither need to be smart and know where objects end, and we will
242 // alocate only the chancks needed, or the tokenizer will not make copies, but then it needs to
243 // cache the results so it does not go twice on the same buffer
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000244 const unsigned char* current = fFileContent + startOffset;
245 const unsigned char* end = fFileContent + fContentLength;
edisonn@google.com571c70b2013-07-10 17:09:50 +0000246
edisonn@google.com951d6532013-07-10 23:17:31 +0000247 SkPdfNativeTokenizer tokenizer(current, end - current, fMapper, fAllocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000248
249 SkPdfObject idObj;
250 SkPdfObject generationObj;
251 SkPdfObject objKeyword;
252 SkPdfObject* dict = fAllocator->allocObject();
253
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000254 current = nextObject(0, current, end, &idObj, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000255 if (current >= end) {
256 // TODO(edisonn): report warning/error
257 return NULL;
258 }
259
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000260 current = nextObject(0, current, end, &generationObj, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000261 if (current >= end) {
262 // TODO(edisonn): report warning/error
263 return NULL;
264 }
265
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000266 current = nextObject(0, current, end, &objKeyword, NULL, NULL);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000267 if (current >= end) {
268 // TODO(edisonn): report warning/error
269 return NULL;
270 }
271
272 if (!idObj.isInteger() || !generationObj.isInteger() || id != idObj.intValue()/* || generation != generationObj.intValue()*/) {
273 // TODO(edisonn): report warning/error
274 }
275
276 if (!objKeyword.isKeyword() || strcmp(objKeyword.c_str(), "obj") != 0) {
277 // TODO(edisonn): report warning/error
278 }
279
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000280 current = nextObject(1, current, end, dict, fAllocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000281
282 // TODO(edisonn): report warning/error - verify last token is endobj
283
284 return dict;
285}
286
287void SkNativeParsedPDF::fillPages(SkPdfPageTreeNodeDictionary* tree) {
288 const SkPdfArray* kids = tree->Kids(this);
289 if (kids == NULL) {
290 *fPages.append() = (SkPdfPageObjectDictionary*)tree;
291 return;
292 }
293
294 int cnt = kids->size();
295 for (int i = 0; i < cnt; i++) {
296 const SkPdfObject* obj = resolveReference(kids->objAtAIndex(i));
297 if (fMapper->mapPageObjectDictionary(obj) != kPageObjectDictionary_SkPdfObjectType) {
298 *fPages.append() = (SkPdfPageObjectDictionary*)obj;
299 } else {
300 // TODO(edisonn): verify that it is a page tree indeed
301 fillPages((SkPdfPageTreeNodeDictionary*)obj);
302 }
303 }
304}
305
306int SkNativeParsedPDF::pages() const {
307 return fPages.count();
308}
309
310SkPdfResourceDictionary* SkNativeParsedPDF::pageResources(int page) {
311 return fPages[page]->Resources(this);
312}
313
314// TODO(edisonn): Partial implemented. Move the logics directly in the code generator for inheritable and default value?
edisonn@google.com951d6532013-07-10 23:17:31 +0000315SkRect SkNativeParsedPDF::MediaBox(int page) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000316 SkPdfPageObjectDictionary* current = fPages[page];
317 while (!current->has_MediaBox() && current->has_Parent()) {
318 current = (SkPdfPageObjectDictionary*)current->Parent(this);
319 }
320 if (current) {
321 return current->MediaBox(this);
322 }
323 return SkRect::MakeEmpty();
324}
325
326// TODO(edisonn): stream or array ... ? for now only array
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000327SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfPage(int page,
328 SkPdfAllocator* allocator) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000329 if (fPages[page]->isContentsAStream(this)) {
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000330 return tokenizerOfStream(fPages[page]->getContentsAsStream(this), allocator);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000331 } else {
332 // TODO(edisonn): NYI, we need to concatenate all streams in the array or make the tokenizer smart
333 // so we don't allocate new memory
334 return NULL;
335 }
336}
337
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000338SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfStream(SkPdfObject* stream,
339 SkPdfAllocator* allocator) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000340 if (stream == NULL) {
341 return NULL;
342 }
343
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000344 return new SkPdfNativeTokenizer(stream, fMapper, allocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000345}
346
347// TODO(edisonn): NYI
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000348SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfBuffer(const unsigned char* buffer, size_t len,
349 SkPdfAllocator* allocator) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000350 // warning does not track two calls in the same buffer! the buffer is updated!
351 // make a clean copy if needed!
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000352 return new SkPdfNativeTokenizer(buffer, len, fMapper, allocator, this);
edisonn@google.com571c70b2013-07-10 17:09:50 +0000353}
354
355size_t SkNativeParsedPDF::objects() const {
356 return fObjects.count();
357}
358
359SkPdfObject* SkNativeParsedPDF::object(int i) {
360 SkASSERT(!(i < 0 || i > fObjects.count()));
361
362 if (i < 0 || i > fObjects.count()) {
363 return NULL;
364 }
365
366 if (fObjects[i].fObj == NULL) {
367 // TODO(edisonn): when we read the cross reference sections, store the start of the next object
368 // and fill fOffsetEnd
369 fObjects[i].fObj = readObject(i);
370 }
371
372 return fObjects[i].fObj;
373}
374
375const SkPdfMapper* SkNativeParsedPDF::mapper() const {
376 return fMapper;
377}
378
379SkPdfReal* SkNativeParsedPDF::createReal(double value) const {
380 SkPdfObject* obj = fAllocator->allocObject();
381 SkPdfObject::makeReal(value, obj);
382 return (SkPdfReal*)obj;
383}
384
385SkPdfInteger* SkNativeParsedPDF::createInteger(int value) const {
386 SkPdfObject* obj = fAllocator->allocObject();
387 SkPdfObject::makeInteger(value, obj);
388 return (SkPdfInteger*)obj;
389}
390
edisonn@google.com2ccc3af2013-07-23 17:43:18 +0000391SkPdfString* SkNativeParsedPDF::createString(const unsigned char* sz, size_t len) const {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000392 SkPdfObject* obj = fAllocator->allocObject();
393 SkPdfObject::makeString(sz, len, obj);
394 return (SkPdfString*)obj;
395}
396
edisonn@google.com571c70b2013-07-10 17:09:50 +0000397SkPdfAllocator* SkNativeParsedPDF::allocator() const {
398 return fAllocator;
399}
400
edisonn@google.com571c70b2013-07-10 17:09:50 +0000401// TODO(edisonn): fix infinite loop if ref to itself!
402// TODO(edisonn): perf, fix refs at load, and resolve will simply return fResolvedReference?
edisonn@google.com951d6532013-07-10 23:17:31 +0000403SkPdfObject* SkNativeParsedPDF::resolveReference(const SkPdfObject* ref) {
edisonn@google.com571c70b2013-07-10 17:09:50 +0000404 if (ref && ref->isReference()) {
405 int id = ref->referenceId();
406 // TODO(edisonn): generation/updates not supported now
407 //int gen = ref->referenceGeneration();
408
409 SkASSERT(!(id < 0 || id > fObjects.count()));
410
411 if (id < 0 || id > fObjects.count()) {
412 return NULL;
413 }
414
415 // TODO(edisonn): verify id and gen expected
416
417 if (fObjects[id].fResolvedReference != NULL) {
418 return fObjects[id].fResolvedReference;
419 }
420
421 if (fObjects[id].fObj == NULL) {
422 fObjects[id].fObj = readObject(id);
423 }
424
425 if (fObjects[id].fResolvedReference == NULL) {
426 if (!fObjects[id].fObj->isReference()) {
427 fObjects[id].fResolvedReference = fObjects[id].fObj;
428 } else {
429 fObjects[id].fResolvedReference = resolveReference(fObjects[id].fObj);
430 }
431 }
432
433 return fObjects[id].fResolvedReference;
434 }
435 // TODO(edisonn): fix the mess with const, probably we need to remove it pretty much everywhere
436 return (SkPdfObject*)ref;
edisonn@google.com3aac1f92013-07-02 22:42:53 +0000437}
edisonn@google.coma5aaa792013-07-11 12:27:21 +0000438
edisonn@google.com7b328fd2013-07-11 12:53:06 +0000439size_t SkNativeParsedPDF::bytesUsed() const {
edisonn@google.coma5aaa792013-07-11 12:27:21 +0000440 return fAllocator->bytesUsed() +
441 fContentLength +
442 fObjects.count() * sizeof(PublicObjectEntry) +
443 fPages.count() * sizeof(SkPdfPageObjectDictionary*) +
444 sizeof(*this);
445}