Add initial support for generating tagged PDFs. Adds an interface for the document creator to pass in a tree of tags indicating the structure of the document, each with a type (from a predetermined enum of possible types) and a node ID. It also adds an SkPDF::SetNodeId function, which takes an SkCanvas, so that page content can be associated with a particular tag. If both the tag tree and marked content are present, Skia can now output a properly tagged PDF. An example program is included. When used properly, the PDF generated by this patch is valid and the tags are parsed properly by Adobe Acrobat. It handles many corner cases, such as content that spans more than one page, tags that don't correspond to any marked content, and marked content that doesn't correspond to any tags. However, it doesn't implement all of the features of PDF accessibility yet: some additional attributes that can be associated with certain tags still need to be supported in order to properly tag things like figures and tables. Bug: skia:8148 Change-Id: I2e448eca8ded8e1b29ba685663b557ae7ad7e23e Reviewed-on: https://skia-review.googlesource.com/141138 Reviewed-by: Hal Canary <halcanary@google.com>

commit: 656cefe65d620f9aa7f689e412fa7720fe01c447 [log] [tgz]
author: Dominic Mazzoni <dmazzoni@chromium.org> Tue Sep 25 20:29:15 2018 -0700
committer: Hal Canary <halcanary@google.com> Thu Sep 27 19:35:40 2018 +0000
tree: 7d7238eaa9b8f492884d95f4969a89ed33d5a8a5
parent: b400d4d7e0905dd2a5a0c16e648be49cf853981a [diff]
diff --git a/src/pdf/SkPDFDevice.cpp b/src/pdf/SkPDFDevice.cpp
index 0026897..3185037 100644
--- a/src/pdf/SkPDFDevice.cpp
+++ b/src/pdf/SkPDFDevice.cpp

@@ -527,6 +527,7 @@
     : INHERITED(SkImageInfo::MakeUnknown(pageSize.width(), pageSize.height()),
                 SkSurfaceProps(0, kUnknown_SkPixelGeometry))
     , fInitialTransform(transform)
+    , fNodeId(0)
     , fDocument(doc)
 {
     SkASSERT(!pageSize.isEmpty());
@@ -550,6 +551,13 @@
         return;
     }
     if (rect.isEmpty()) {
+        if (!strcmp(key, SkPDFGetNodeIdKey())) {
+            int nodeID;
+            if (value->size() != sizeof(nodeID)) { return; }
+            memcpy(&nodeID, value->data(), sizeof(nodeID));
+            fNodeId = nodeID;
+            return;
+        }
         if (!strcmp(SkAnnotationKeys::Define_Named_Dest_Key(), key)) {
             SkPoint transformedPoint;
             this->ctm().mapXY(rect.x(), rect.y(), &transformedPoint);
@@ -1176,6 +1184,19 @@
         SkDynamicMemoryWStream* out = content.stream();
 
         out->writeText("BT\n");
+
+        int markId = -1;
+        if (fNodeId) {
+            markId = fDocument->getMarkIdForNodeId(fNodeId);
+        }
+
+        if (markId != -1) {
+            out->writeText("/P <</MCID ");
+            out->writeDecAsText(markId);
+            out->writeText(" >>BDC\n");
+        }
+        SK_AT_SCOPE_EXIT(if (markId != -1) out->writeText("EMC\n"));
+
         SK_AT_SCOPE_EXIT(out->writeText("ET\n"));
 
         const SkGlyphID maxGlyphID = SkToU16(typeface->countGlyphs() - 1);

diff --git a/src/pdf/SkPDFDevice.h b/src/pdf/SkPDFDevice.h
index aabc001..666c36b 100644
--- a/src/pdf/SkPDFDevice.h
+++ b/src/pdf/SkPDFDevice.h

@@ -167,6 +167,7 @@
     std::vector<sk_sp<SkPDFObject>> fXObjectResources;
     std::vector<sk_sp<SkPDFObject>> fShaderResources;
     std::vector<sk_sp<SkPDFFont>> fFontResources;
+    int fNodeId;
 
     SkSinglyLinkedList<SkDynamicMemoryWStream> fContentEntries;
     struct GraphicStackState {

diff --git a/src/pdf/SkPDFDocument.cpp b/src/pdf/SkPDFDocument.cpp
index 4694577..9b7045a 100644
--- a/src/pdf/SkPDFDocument.cpp
+++ b/src/pdf/SkPDFDocument.cpp

@@ -11,12 +11,19 @@
 #include "SkMakeUnique.h"
 #include "SkPDFCanon.h"
 #include "SkPDFDevice.h"
+#include "SkPDFTag.h"
 #include "SkPDFUtils.h"
 #include "SkStream.h"
 #include "SkTo.h"
 
 #include <utility>
 
+// Returns the annotation key under which a node ID is passed through
+// SkCanvas::drawAnnotation. The returned string has static storage duration,
+// so callers may keep the pointer.
+const char* SkPDFGetNodeIdKey() {
+    return "PDF_Node_Key";
+}
+
 SkPDFObjectSerializer::SkPDFObjectSerializer() : fBaseOffset(0), fNextToBeSerialized(0) {}
 
 SkPDFObjectSerializer::~SkPDFObjectSerializer() {
@@ -192,6 +199,9 @@
         fInverseRasterScale = kDpiForRasterScaleOne / fMetadata.fRasterDPI;
         fRasterScale        = fMetadata.fRasterDPI / kDpiForRasterScaleOne;
     }
+    if (fMetadata.fStructureElementTreeRoot) {
+        fTagRoot = recursiveBuildTagTree(*fMetadata.fStructureElementTreeRoot, nullptr);
+    }
 }
 
 SkPDFDocument::~SkPDFDocument() {
@@ -265,6 +275,9 @@
     }
     this->serialize(contentObject);
     page->insertObjRef("Contents", std::move(contentObject));
+    // The StructParents unique identifier for each page is just its
+    // 0-based page index.
+    page->insertInt("StructParents", static_cast<int>(fPages.size()));
     fPages.emplace_back(std::move(page));
 }
 
@@ -423,6 +436,39 @@
     return intentArray;
 }
 
+// Returns the page dictionary stored at the given 0-based index in fPages.
+// The index must be in range; this is verified only by SkASSERT.
+sk_sp<SkPDFDict> SkPDFDocument::getPage(int pageIndex) const {
+    SkASSERT(0 <= pageIndex);
+    SkASSERT(pageIndex < static_cast<int>(fPages.size()));
+    return fPages[pageIndex];
+}
+
+// Allocates a new marked-content ID for content that belongs to the tag with
+// the given node ID, and records the association both on the tag and in
+// fMarksPerPage. Returns -1, without allocating anything, if no tag is
+// registered under that node ID.
+int SkPDFDocument::getMarkIdForNodeId(int nodeId) {
+    sk_sp<SkPDFTag>* found = fNodeIdToTag.find(nodeId);
+    if (!found) {
+        return -1;
+    }
+    sk_sp<SkPDFTag> tag = *found;
+
+    // The page index used is the number of pages stored in fPages so far.
+    // NOTE(review): presumably this is the index of the page currently being
+    // drawn, which has not been appended to fPages yet -- confirm.
+    const int currentPage = static_cast<int>(fPages.size());
+
+    // Grow the per-page table until it has an entry for this page.
+    while (fMarksPerPage.count() <= currentPage) {
+        fMarksPerPage.push_back();
+    }
+
+    // Mark IDs are handed out sequentially per page, starting at 0, so the
+    // next ID is the number of marks already recorded for the page.
+    SkTArray<sk_sp<SkPDFTag>>& marksOnPage = fMarksPerPage[currentPage];
+    const int markId = marksOnPage.count();
+    tag->addMarkedContent(currentPage, markId);
+    marksOnPage.push_back(std::move(tag));
+    return markId;
+}
+
+// Builds the SkPDFTag for `node` and, recursively, for all of its children,
+// registering every tag in fNodeIdToTag under its node ID. `parent` is the
+// already-built tag for the parent node, or null for the root.
+// Returns the tag created for `node`.
+sk_sp<SkPDFTag> SkPDFDocument::recursiveBuildTagTree(
+        const SkPDF::StructureElementNode& node, sk_sp<SkPDFTag> parent) {
+    auto self = sk_make_sp<SkPDFTag>(node.fNodeId, node.fType, parent);
+    fNodeIdToTag.set(node.fNodeId, self);
+
+    self->fChildren.reserve(node.fChildCount);
+    for (size_t childIndex = 0; childIndex < node.fChildCount; ++childIndex) {
+        const SkPDF::StructureElementNode& childNode = node.fChildren[childIndex];
+        self->appendChild(this->recursiveBuildTagTree(childNode, self));
+    }
+    return self;
+}
+
 void SkPDFDocument::onClose(SkWStream* stream) {
     SkASSERT(fCanvas.imageInfo().dimensions().isZero());
     if (fPages.empty()) {
@@ -437,14 +483,63 @@
         // no one has ever asked for this feature.
         docCatalog->insertObject("OutputIntents", make_srgb_output_intents());
     }
-    SkASSERT(!fPages.empty());
-    docCatalog->insertObjRef("Pages", generate_page_tree(&fPages));
-    SkASSERT(fPages.empty());
+
+    std::vector<sk_sp<SkPDFDict>> pagesCopy(fPages);
+    SkASSERT(!pagesCopy.empty());
+    docCatalog->insertObjRef("Pages", generate_page_tree(&pagesCopy));
+    SkASSERT(pagesCopy.empty());
 
     if (fDests->size() > 0) {
         docCatalog->insertObjRef("Dests", std::move(fDests));
     }
 
+    // Handle tagged PDFs.
+    if (fTagRoot) {
+        // In the document catalog, indicate that this PDF is tagged.
+        auto markInfo = sk_make_sp<SkPDFDict>("MarkInfo");
+        markInfo->insertBool("Marked", true);
+        docCatalog->insertObject("MarkInfo", markInfo);
+
+        // Prepare the tag tree, this automatically skips over any
+        // tags that weren't referenced from any marked content.
+        bool success = fTagRoot->prepareTagTreeToEmit(*this);
+        if (!success) {
+            SkDEBUGFAIL("PDF has tag tree but no marked content.");
+        }
+
+        // Build the StructTreeRoot.
+        auto structTreeRoot = sk_make_sp<SkPDFDict>("StructTreeRoot");
+        docCatalog->insertObjRef("StructTreeRoot", structTreeRoot);
+        structTreeRoot->insertObjRef("K", fTagRoot);
+        int pageCount = static_cast<int>(fPages.size());
+        structTreeRoot->insertInt("ParentTreeNextKey", pageCount);
+
+        // The parent of the tag root is the StructTreeRoot.
+        fTagRoot->insertObjRef("P", structTreeRoot);
+
+        // Build the parent tree, which is a mapping from the marked
+        // content IDs on each page to their corresponding tags.
+        auto parentTree = sk_make_sp<SkPDFDict>("ParentTree");
+        structTreeRoot->insertObjRef("ParentTree", parentTree);
+        structTreeRoot->insertInt("ParentTreeNextKey", pageCount);
+        auto parentTreeNums = sk_make_sp<SkPDFArray>();
+        parentTree->insertObject("Nums", parentTreeNums);
+        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
+            // Exit now if there are no more pages with marked content.
+            if (fMarksPerPage.count() <= pageIndex) {
+                break;
+            }
+
+            parentTreeNums->appendInt(pageIndex);
+            auto markToTagArray = sk_make_sp<SkPDFArray>();
+            parentTreeNums->appendObjRef(markToTagArray);
+
+            for (int i = 0; i < fMarksPerPage[pageIndex].count(); i++) {
+                markToTagArray->appendObjRef(fMarksPerPage[pageIndex][i]);
+            }
+        }
+    }
+
     // Build font subsetting info before calling addObjectRecursively().
     SkPDFCanon* canon = &fCanon;
     fFonts.foreach([canon](SkPDFFont* p){ p->getFontSubset(canon); });
@@ -456,6 +551,12 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+// Sends `nodeID` to the canvas as an annotation: the raw bytes of the int are
+// the payload, the key is SkPDFGetNodeIdKey(), and the rect is empty.
+void SkPDF::SetNodeId(SkCanvas* canvas, int nodeID) {
+    const SkRect emptyRect = {0, 0, 0, 0};
+    const sk_sp<SkData> encodedId = SkData::MakeWithCopy(&nodeID, sizeof(nodeID));
+    canvas->drawAnnotation(emptyRect, SkPDFGetNodeIdKey(), encodedId.get());
+}
+
 sk_sp<SkDocument> SkPDF::MakeDocument(SkWStream* stream, const SkPDF::Metadata& metadata) {
     SkPDF::Metadata meta = metadata;
     if (meta.fRasterDPI <= 0) {

diff --git a/src/pdf/SkPDFDocumentPriv.h b/src/pdf/SkPDFDocumentPriv.h
index 6c7b578..8d88951 100644
--- a/src/pdf/SkPDFDocumentPriv.h
+++ b/src/pdf/SkPDFDocumentPriv.h

@@ -14,6 +14,9 @@
 #include "SkPDFMetadata.h"
 
 class SkPDFDevice;
+class SkPDFTag;
+
+const char* SkPDFGetNodeIdKey();
 
 // Logically part of SkPDFDocument (like SkPDFCanon), but separate to
 // keep similar functionality together.
@@ -64,7 +67,14 @@
     void registerFont(SkPDFFont* f) { fFonts.add(f); }
     const SkPDF::Metadata& metadata() const { return fMetadata; }
 
+    sk_sp<SkPDFDict> getPage(int pageIndex) const;
+    // Returns -1 if no mark ID.
+    int getMarkIdForNodeId(int nodeId);
+
 private:
+    sk_sp<SkPDFTag> recursiveBuildTagTree(const SkPDF::StructureElementNode& node,
+                                          sk_sp<SkPDFTag> parent);
+
     SkPDFObjectSerializer fObjectSerializer;
     SkPDFCanon fCanon;
     SkCanvas fCanvas;
@@ -78,6 +88,15 @@
     SkScalar fRasterScale = 1;
     SkScalar fInverseRasterScale = 1;
 
+    // For tagged PDFs.
+
+    // The tag root, which owns its child tags and so on.
+    sk_sp<SkPDFTag> fTagRoot;
+    // Array of page -> array of marks mapping to tags.
+    SkTArray<SkTArray<sk_sp<SkPDFTag>>> fMarksPerPage;
+    // A mapping from node ID to tag for fast lookup.
+    SkTHashMap<int, sk_sp<SkPDFTag>> fNodeIdToTag;
+
     void reset();
 };
 

diff --git a/src/pdf/SkPDFTag.cpp b/src/pdf/SkPDFTag.cpp
new file mode 100644
index 0000000..3bc3401
--- /dev/null
+++ b/src/pdf/SkPDFTag.cpp

@@ -0,0 +1,206 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkPDFDocumentPriv.h"
+#include "SkPDFTag.h"
+
+namespace {
+
+// Returns the name string for a document structure type.
+// See Table 333 in PDF 32000-1:2008.
+// Aborts if `type` is not one of the enumerators handled below.
+const char* tagNameFromType(SkPDF::DocumentStructureType type) {
+    using Type = SkPDF::DocumentStructureType;
+    switch (type) {
+        case Type::kDocument:   return "Document";
+        case Type::kPart:       return "Part";
+        case Type::kArt:        return "Art";
+        case Type::kSect:       return "Sect";
+        case Type::kDiv:        return "Div";
+        case Type::kBlockQuote: return "BlockQuote";
+        case Type::kCaption:    return "Caption";
+        case Type::kTOC:        return "TOC";
+        case Type::kTOCI:       return "TOCI";
+        case Type::kIndex:      return "Index";
+        case Type::kNonStruct:  return "NonStruct";
+        case Type::kPrivate:    return "Private";
+        case Type::kH:          return "H";
+        case Type::kH1:         return "H1";
+        case Type::kH2:         return "H2";
+        case Type::kH3:         return "H3";
+        case Type::kH4:         return "H4";
+        case Type::kH5:         return "H5";
+        case Type::kH6:         return "H6";
+        case Type::kP:          return "P";
+        case Type::kL:          return "L";
+        case Type::kLI:         return "LI";
+        case Type::kLbl:        return "Lbl";
+        case Type::kLBody:      return "LBody";
+        case Type::kTable:      return "Table";
+        case Type::kTR:         return "TR";
+        case Type::kTH:         return "TH";
+        case Type::kTD:         return "TD";
+        case Type::kTHead:      return "THead";
+        case Type::kTBody:      return "TBody";
+        case Type::kTFoot:      return "TFoot";
+        case Type::kSpan:       return "Span";
+        case Type::kQuote:      return "Quote";
+        case Type::kNote:       return "Note";
+        case Type::kReference:  return "Reference";
+        case Type::kBibEntry:   return "BibEntry";
+        case Type::kCode:       return "Code";
+        case Type::kLink:       return "Link";
+        case Type::kAnnot:      return "Annot";
+        case Type::kRuby:       return "Ruby";
+        case Type::kWarichu:    return "Warichu";
+        case Type::kFigure:     return "Figure";
+        case Type::kFormula:    return "Formula";
+        case Type::kForm:       return "Form";
+    }
+
+    // Only reached when `type` holds a value that is not a named enumerator.
+    SK_ABORT("bad tag");
+    return "";
+}
+
+}  // namespace
+
+// Creates a "StructElem" dictionary for a tag: stores the node ID, writes the
+// structure type name under "S", and, if a parent tag is supplied, writes a
+// reference to it under "P". A null parent leaves "P" unset.
+SkPDFTag::SkPDFTag(int nodeId, SkPDF::DocumentStructureType type, sk_sp<SkPDFTag> parent)
+    : SkPDFDict("StructElem")
+    , fNodeId(nodeId) {
+    this->insertName("S", tagNameFromType(type));
+    if (!parent) {
+        return;
+    }
+    this->insertObjRef("P", std::move(parent));
+}
+
+// Nothing to release by hand; the members clean up after themselves.
+SkPDFTag::~SkPDFTag() = default;
+
+// Appends `child` to this tag's list of children. Only fChildren is updated;
+// the dictionary entries are not touched here.
+void SkPDFTag::appendChild(sk_sp<SkPDFTag> child) {
+    // `child` is already our own copy of the reference, so move it into the
+    // array rather than copying it (which would ref and then unref the tag).
+    fChildren.emplace_back(std::move(child));
+}
+
+// Releases this tag's references so the tag tree can be destroyed.
+void SkPDFTag::drop() {
+    // A child tag's dictionary references its parent under "P", while the
+    // parent holds its children in fChildren. Release the children first so
+    // that the two do not keep each other alive.
+    this->fChildren.reset();
+
+    this->SkPDFDict::drop();
+}
+
+// Records that the marked content with ID `markId` on page `pageIndex`
+// belongs to this tag.
+void SkPDFTag::addMarkedContent(int pageIndex, int markId) {
+    fMarkedContent.emplace_back(MarkedContentInfo{pageIndex, markId});
+}
+
+// Fills in this tag's "Pg" and "K" dictionary entries from the marked content
+// recorded on it and from its children, preparing the children first.
+// Children whose subtree has no marked content are left out of "K".
+// Returns true if this tag has at least one kid (a kept child or a piece of
+// marked content), and false otherwise.
+bool SkPDFTag::prepareTagTreeToEmit(const SkPDFDocument& document) {
+    const int markCount = fMarkedContent.count();
+
+    // Check whether every piece of marked content is on the same page as the
+    // first one. If so, the page is named once with "Pg" on the tag itself;
+    // if not, each mark gets its own MCR dictionary further down.
+    bool onePage = true;
+    for (int m = 1; onePage && m < markCount; ++m) {
+        onePage = (fMarkedContent[m].pageIndex == fMarkedContent[0].pageIndex);
+    }
+    if (markCount > 0 && onePage) {
+        this->insertObjRef("Pg", document.getPage(fMarkedContent[0].pageIndex));
+    }
+
+    // Prepare every child, keeping only those that report a usable subtree.
+    SkTArray<sk_sp<SkPDFTag>> keptChildren;
+    for (int c = 0; c < fChildren.count(); ++c) {
+        const sk_sp<SkPDFTag>& child = fChildren[c];
+        if (child->prepareTagTreeToEmit(document)) {
+            keptChildren.push_back(child);
+        }
+    }
+
+    // fChildren is not needed past this point.
+    fChildren.reset();
+
+    const int kidCount = keptChildren.count() + markCount;
+    if (kidCount == 0) {
+        // Neither marked content nor children with marked content: tell the
+        // caller to omit this subtree from the structure tree.
+        return false;
+    }
+
+    if (kidCount == 1) {
+        // A lone kid is written directly, without wrapping it in an array.
+        if (keptChildren.count() == 1) {
+            this->insertObjRef("K", keptChildren[0]);
+        } else {
+            this->insertInt("K", fMarkedContent[0].markId);
+        }
+        return true;
+    }
+
+    // Several kids: child tags first, then the marked content.
+    auto kids = sk_make_sp<SkPDFArray>();
+    for (int c = 0; c < keptChildren.count(); ++c) {
+        kids->appendObjRef(keptChildren[c]);
+    }
+    for (int m = 0; m < markCount; ++m) {
+        const MarkedContentInfo& mark = fMarkedContent[m];
+        if (onePage) {
+            // The page is already given by this tag's "Pg" entry.
+            kids->appendInt(mark.markId);
+            continue;
+        }
+        // Marks on different pages each carry their own page reference.
+        auto mcr = sk_make_sp<SkPDFDict>("MCR");
+        mcr->insertObjRef("Pg", document.getPage(mark.pageIndex));
+        mcr->insertInt("MCID", mark.markId);
+        kids->appendObject(mcr);
+    }
+    this->insertObject("K", kids);
+    return true;
+}

diff --git a/src/pdf/SkPDFTag.h b/src/pdf/SkPDFTag.h
new file mode 100644
index 0000000..e6bf5b3
--- /dev/null
+++ b/src/pdf/SkPDFTag.h

@@ -0,0 +1,67 @@
+/*
+ * Copyright 2018 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkPDFTag_DEFINED
+#define SkPDFTag_DEFINED
+
+#include "SkDocument.h"
+#include "SkPDFTypes.h"
+#include "SkRefCnt.h"
+
+class SkPDFDocument;
+
+/** \class SkPDFTag
+
+    A PDF Tag represents a semantic tag in the tag tree for an
+    accessible tagged PDF. Documents can create an accessible PDF by
+    creating a tree of SkPDFTags representing the semantic tree
+    structure of the overall document, and then calling
+    SkPDF::SetNodeId with the SkCanvas used to draw to the page and
+    the same corresponding node IDs to mark the content for each
+    page. It's allowed for the marked content for one tag to span
+    multiple pages.
+
+    Each SkPDFTag is itself the SkPDFDict, of type "StructElem", that
+    holds the tag's entries. The constructor stores the structure type
+    name under "S" and, when a non-null parent is given, a reference
+    to the parent under "P".
+
+    appendChild() only adds the child to fChildren. The "Pg" and "K"
+    entries are filled in later by prepareTagTreeToEmit(), which also
+    empties fChildren.
+
+    drop() is overridden to release fChildren before calling
+    SkPDFDict::drop(), because a child references its parent under "P"
+    while the parent holds the child in fChildren.
+
+    SkPDFDocument is a friend so that it can reach the private
+    members, including fNodeId, fChildren, addMarkedContent() and
+    prepareTagTreeToEmit().
+*/
+class SkPDFTag final : public SkPDFDict {
+public:
+    SkPDFTag(int nodeId, SkPDF::DocumentStructureType type, sk_sp<SkPDFTag> parent);
+    ~SkPDFTag() override;
+
+    void appendChild(sk_sp<SkPDFTag> child);
+
+private:
+    friend class SkPDFDocument;
+
+    void drop() override;
+
+    void addMarkedContent(int pageIndex, int markId);
+
+    // Should be called after all content has been emitted. Fills in
+    // all of the SkPDFDict fields in this tag and all descendants.
+    // Returns true if this tag is valid, and false if no tag in this
+    // subtree was referred to by any marked content.
+    bool prepareTagTreeToEmit(const SkPDFDocument& document);
+
+    struct MarkedContentInfo {
+        int pageIndex;
+        int markId;
+    };
+
+    // This tag's node ID, which must correspond to the node ID set
+    // on the SkCanvas when content inside this tag is drawn.
+    // The node IDs are arbitrary and are not output to the PDF.
+    int fNodeId;
+
+    // The children of this tag. Some tags like lists and tables require
+    // a particular hierarchical structure, similar to HTML.
+    SkTArray<sk_sp<SkPDFTag>> fChildren;
+
+    // An array consisting of a [page index, mark ID] pair for each piece
+    // of marked content associated with this tag.
+    SkTArray<MarkedContentInfo> fMarkedContent;
+};
+
+#endif
commit	656cefe65d620f9aa7f689e412fa7720fe01c447	[log] [tgz]
author	Dominic Mazzoni <dmazzoni@chromium.org>	Tue Sep 25 20:29:15 2018 -0700
committer	Hal Canary <halcanary@google.com>	Thu Sep 27 19:35:40 2018 +0000
tree	7d7238eaa9b8f492884d95f4969a89ed33d5a8a5
parent	b400d4d7e0905dd2a5a0c16e648be49cf853981a [diff]