pdfviewer: load files with a missing xref (we need this in order to help with testing, as most good PDFs in the wild are missing the xref). Add period as a valid character to start a real value. Review URL: https://codereview.chromium.org/21096006 git-svn-id: http://skia.googlecode.com/svn/trunk@10423 2bbb7eff-a529-9590-31e7-b0007b416f81

commit: 4ef4bed00efd247a0ea005b95b7239a9d4c14c68 [log] [tgz]
author: edisonn@google.com <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> Mon Jul 29 22:14:45 2013 +0000
committer: edisonn@google.com <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81> Mon Jul 29 22:14:45 2013 +0000
tree: 7a82f5984b24ecff66dbbd1ba05d78b768924a9c
parent: d49173afc862e0a33133190c392cd5a221a7e51f [diff]
diff --git a/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp b/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
index 8892ee2..e54ba82 100644
--- a/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp
+++ b/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp

@@ -123,7 +123,7 @@
     bool storeCatalog = true;
     while (xrefByteOffset >= 0) {
         const unsigned char* trailerStart = readCrossReferenceSection(fFileContent + xrefByteOffset, xrefstartKeywordLine);
-        xrefByteOffset = readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog);
+        readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog, &xrefByteOffset, false);
         storeCatalog = false;
     }
 
@@ -141,6 +141,12 @@
         }
     }
 
+    // TODO(edisonn): clean up this doc, or better, let the caller call again and build a new doc
+    // caller should be a static function.
+    if (pages() == 0) {
+        loadWithoutXRef();
+    }
+
     // TODO(edisonn): corrupted pdf, read it from beginning and rebuild (xref, trailer, or just reall all objects)
     // 0 pages
 
@@ -148,6 +154,67 @@
     // and resolve references?... or not ...
 }
 
+void SkNativeParsedPDF::loadWithoutXRef() {
+    const unsigned char* current = fFileContent;
+    const unsigned char* end = fFileContent + fContentLength;
+
+    // TODO(edisonn): read pdf version
+    current = ignoreLine(current, end);
+
+    current = skipPdfWhiteSpaces(0, current, end);
+    while (current < end) {
+        SkPdfObject token;
+        current = nextObject(0, current, end, &token, NULL, NULL);
+        if (token.isInteger()) {
+            int id = (int)token.intValue();
+
+            token.reset();
+            current = nextObject(0, current, end, &token, NULL, NULL);
+            // int generation = (int)token.intValue();  // TODO(edisonn): ignored for now
+
+            token.reset();
+            current = nextObject(0, current, end, &token, NULL, NULL);
+            // TODO(edisonn): must be obj, return error if not? ignore ?
+            if (!token.isKeyword("obj")) {
+                continue;
+            }
+
+            while (fObjects.count() < id + 1) {
+                reset(fObjects.append());
+            }
+
+            fObjects[id].fOffset = current - fFileContent;
+
+            SkPdfObject* obj = fAllocator->allocObject();
+            current = nextObject(0, current, end, obj, fAllocator, this);
+
+            fObjects[id].fResolvedReference = obj;
+            fObjects[id].fObj = obj;
+
+            // set objects
+        } else if (token.isKeyword("trailer")) {
+            long dummy;
+            current = readTrailer(current, end, true, &dummy, true);
+        } else if (token.isKeyword("startxref")) {
+            token.reset();
+            current = nextObject(0, current, end, &token, NULL, NULL);  // ignore
+        }
+
+        current = skipPdfWhiteSpaces(0, current, end);
+    }
+
+    if (fRootCatalogRef) {
+        fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef);
+        if (fRootCatalog->isDictionary() && fRootCatalog->valid()) {
+            SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
+            if (tree && tree->isDictionary() && tree->valid()) {
+                fillPages(tree);
+            }
+        }
+    }
+
+}
+
 // TODO(edisonn): NYI
 SkNativeParsedPDF::~SkNativeParsedPDF() {
     sk_free((void*)fFileContent);
@@ -208,43 +275,47 @@
     return current;
 }
 
-long SkNativeParsedPDF::readTrailer(const unsigned char* trailerStart, const unsigned char* trailerEnd, bool storeCatalog) {
-    SkPdfObject trailerKeyword;
-    // TODO(edisonn): use null allocator, and let it just fail if memory
-    // needs allocated (but no crash)!
-    const unsigned char* current =
-            nextObject(0, trailerStart, trailerEnd, &trailerKeyword, NULL, NULL);
+const unsigned char* SkNativeParsedPDF::readTrailer(const unsigned char* trailerStart, const unsigned char* trailerEnd, bool storeCatalog, long* prev, bool skipKeyword) {
+    *prev = -1;
 
-    if (!trailerKeyword.isKeyword() || strlen("trailer") != trailerKeyword.lenstr() ||
-        strncmp(trailerKeyword.c_str(), "trailer", strlen("trailer")) != 0) {
-        // TODO(edisonn): report warning, rebuild trailer from objects.
-        return -1;
+    const unsigned char* current = trailerStart;
+    if (!skipKeyword) {
+        SkPdfObject trailerKeyword;
+        // TODO(edisonn): use null allocator, and let it just fail if memory
+        // needs allocated (but no crash)!
+        current = nextObject(0, current, trailerEnd, &trailerKeyword, NULL, NULL);
+
+        if (!trailerKeyword.isKeyword() || strlen("trailer") != trailerKeyword.lenstr() ||
+            strncmp(trailerKeyword.c_str(), "trailer", strlen("trailer")) != 0) {
+            // TODO(edisonn): report warning, rebuild trailer from objects.
+            return current;
+        }
     }
 
     SkPdfObject token;
     current = nextObject(0, current, trailerEnd, &token, fAllocator, NULL);
     if (!token.isDictionary()) {
-        return -1;
+        return current;
     }
     SkPdfFileTrailerDictionary* trailer = (SkPdfFileTrailerDictionary*)&token;
     if (!trailer->valid()) {
-        return -1;
+        return current;
     }
 
     if (storeCatalog) {
         const SkPdfObject* ref = trailer->Root(NULL);
         if (ref == NULL || !ref->isReference()) {
             // TODO(edisonn): oops, we have to fix the corrup pdf file
-            return -1;
+            return current;
         }
         fRootCatalogRef = ref;
     }
 
     if (trailer->has_Prev()) {
-        return (long)trailer->Prev(NULL);
+        *prev = (long)trailer->Prev(NULL);
     }
 
-    return -1;
+    return current;
 }
 
 void SkNativeParsedPDF::addCrossSectionInfo(int id, int generation, int offset, bool isFreed) {
@@ -255,6 +326,7 @@
 
     fObjects[id].fOffset = offset;
     fObjects[id].fObj = NULL;
+    fObjects[id].fResolvedReference = NULL;
 }
 
 SkPdfObject* SkNativeParsedPDF::readObject(int id/*, int expectedGeneration*/) {

diff --git a/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.h b/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.h
index d55d808..77a98c7 100644
--- a/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.h
+++ b/experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.h

@@ -72,9 +72,10 @@
 
     // Takes ownership of bytes.
     void init(const void* bytes, size_t length);
+    void loadWithoutXRef();
 
     const unsigned char* readCrossReferenceSection(const unsigned char* xrefStart, const unsigned char* trailerEnd);
-    long readTrailer(const unsigned char* trailerStart, const unsigned char* trailerEnd, bool storeCatalog);
+    const unsigned char* readTrailer(const unsigned char* trailerStart, const unsigned char* trailerEnd, bool storeCatalog, long* prev, bool skipKeyword);
 
     // TODO(edisonn): updates not supported right now, generation ignored
     void addCrossSectionInfo(int id, int generation, int offset, bool isFreed);

diff --git a/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp b/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp
index 41bd92d..09b7a0b 100644
--- a/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp
+++ b/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.cpp

@@ -83,9 +83,9 @@
 #define TRACE_HEXSTRING(start,end)
 #endif
 
-static const unsigned char* skipPdfWhiteSpaces(int level, const unsigned char* start, const unsigned char* end) {
+const unsigned char* skipPdfWhiteSpaces(int level, const unsigned char* start, const unsigned char* end) {
     TRACE_INDENT(level, "White Space");
-    while (start < end && isPdfWhiteSpace(*start)) {
+    while (start < end && (isPdfWhiteSpace(*start) || *start == kComment_PdfDelimiter)) {
         TRACE_COMMENT(*start);
         if (*start == kComment_PdfDelimiter) {
             // skip the comment until end of line
@@ -103,7 +103,7 @@
 }
 
 // TODO(edisonn) '(' can be used, will it break the string a delimiter or space inside () ?
-static const unsigned char* endOfPdfToken(int level, const unsigned char* start, const unsigned char* end) {
+const unsigned char* endOfPdfToken(int level, const unsigned char* start, const unsigned char* end) {
     //int opened brackets
     //TODO(edisonn): what out for special chars, like \n, \032
     TRACE_INDENT(level, "Token");
@@ -636,6 +636,21 @@
     // TODO(edisonn): laod external streams
     // TODO(edisonn): look at the last filter, to determione how to deal with possible issue
 
+
+    if (length >= 0) {
+        const unsigned char* endstream = start + length;
+
+        if (endstream[0] == kCR_PdfWhiteSpace && endstream[1] == kLF_PdfWhiteSpace) {
+            endstream += 2;
+        } else if (endstream[0] == kLF_PdfWhiteSpace) {
+            endstream += 1;
+        }
+
+        if (strncmp((const char*)endstream, "endstream", strlen("endstream")) != 0) {
+            length = -1;
+        }
+    }
+
     if (length < 0) {
         // scan the buffer, until we find first endstream
         // TODO(edisonn): all buffers must have a 0 at the end now,

diff --git a/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.h b/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.h
index 2884937..134f7b3 100644
--- a/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.h
+++ b/experimental/PdfViewer/pdfparser/native/SkPdfNativeTokenizer.h

@@ -60,11 +60,10 @@
 #define isPdfWhiteSpaceOrPdfDelimiter(ch) (isPdfWhiteSpace(ch)||isPdfDelimiter(ch))
 
 #define isPdfDigit(ch) ((ch)>='0'&&(ch)<='9')
-#define isPdfNumeric(ch) (isPdfDigit(ch)||(ch)=='+'||(ch)=='-')
+#define isPdfNumeric(ch) (isPdfDigit(ch)||(ch)=='+'||(ch)=='-'||(ch)=='.')
 
-const unsigned char* skipPdfWhiteSpaces(int level, const unsigned char* buffer, size_t len);
-const unsigned char* endOfPdfToken(int level, const unsigned char* start, size_t len);
-const unsigned char* skipPdfComment(int level, const unsigned char* start, size_t len);
+const unsigned char* skipPdfWhiteSpaces(int level, const unsigned char* buffer, const unsigned char* end);
+const unsigned char* endOfPdfToken(int level, const unsigned char* start, const unsigned char* end);
 
 // TODO(edisonn): typedef read and integer tyepes? make less readable...
 //typedef double SkPdfReal;

diff --git a/experimental/PdfViewer/pdfparser/native/SkPdfObject.h b/experimental/PdfViewer/pdfparser/native/SkPdfObject.h
index 9df9a23..9ac9a12 100644
--- a/experimental/PdfViewer/pdfparser/native/SkPdfObject.h
+++ b/experimental/PdfViewer/pdfparser/native/SkPdfObject.h

@@ -527,6 +527,22 @@
         return fObjectType == kKeyword_PdfObjectType;
     }
 
+    bool isKeyword(const char* keyword) const {
+        if (!isKeyword()) {
+            return false;
+        }
+
+        if (strlen(keyword) != fStr.fBytes) {
+            return false;
+        }
+
+        if (strncmp(keyword, (const char*)fStr.fBuffer, fStr.fBytes) != 0) {
+            return false;
+        }
+
+        return true;
+    }
+
     bool isName() const {
         return fObjectType == kName_PdfObjectType;
     }
commit	4ef4bed00efd247a0ea005b95b7239a9d4c14c68	[log] [tgz]
author	edisonn@google.com <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>	Mon Jul 29 22:14:45 2013 +0000
committer	edisonn@google.com <edisonn@google.com@2bbb7eff-a529-9590-31e7-b0007b416f81>	Mon Jul 29 22:14:45 2013 +0000
tree	7a82f5984b24ecff66dbbd1ba05d78b768924a9c
parent	d49173afc862e0a33133190c392cd5a221a7e51f [diff]