Unify parsing of linearized header.

Change-Id: I3b55b1331ee97af254c248d4ac91b627c9603b59
Reviewed-on: https://pdfium-review.googlesource.com/13831
Commit-Queue: Art Snake <art-snake@yandex-team.ru>
Reviewed-by: dsinclair <dsinclair@chromium.org>
diff --git a/core/fpdfapi/parser/cpdf_data_avail.cpp b/core/fpdfapi/parser/cpdf_data_avail.cpp
index 93dd39b..2f79e56 100644
--- a/core/fpdfapi/parser/cpdf_data_avail.cpp
+++ b/core/fpdfapi/parser/cpdf_data_avail.cpp
@@ -108,6 +108,7 @@
   m_bCurPageDictLoadOK = false;
   m_bLinearedDataOK = false;
   m_bSupportHintTable = bSupportHintTable;
+  m_bHeaderAvail = false;
 }
 
 CPDF_DataAvail::~CPDF_DataAvail() {
@@ -413,25 +414,19 @@
 }
 
 bool CPDF_DataAvail::CheckHeader() {
-  ASSERT(m_dwFileLen >= 0);
-  const uint32_t kReqSize = std::min(static_cast<uint32_t>(m_dwFileLen), 1024U);
-  std::vector<uint8_t> buffer(kReqSize);
-  {
-    const CPDF_ReadValidator::Session read_session(GetValidator().Get());
-    m_pFileRead->ReadBlock(buffer.data(), 0, kReqSize);
-    if (GetValidator()->has_read_problems())
+  switch (CheckHeaderAndLinearized()) {
+    case DocAvailStatus::DataAvailable:
+      m_docStatus = m_pLinearized ? PDF_DATAAVAIL_FIRSTPAGE : PDF_DATAAVAIL_END;
+      return true;
+    case DocAvailStatus::DataNotAvailable:
+      return false;
+    case DocAvailStatus::DataError:
+      m_docStatus = PDF_DATAAVAIL_ERROR;
+      return true;
+    default:
+      NOTREACHED();
       return false;
   }
-
-  if (IsLinearizedFile(buffer.data(), kReqSize)) {
-    m_docStatus = PDF_DATAAVAIL_FIRSTPAGE;
-    return true;
-  }
-  if (m_docStatus == PDF_DATAAVAIL_ERROR)
-    return false;
-
-  m_docStatus = PDF_DATAAVAIL_END;
-  return true;
 }
 
 bool CPDF_DataAvail::CheckFirstPage() {
@@ -504,56 +499,41 @@
 }
 
 CPDF_DataAvail::DocLinearizationStatus CPDF_DataAvail::IsLinearizedPDF() {
-  const uint32_t kReqSize = 1024;
-  if (!m_pFileAvail->IsDataAvail(0, kReqSize))
-    return LinearizationUnknown;
-
-  FX_FILESIZE dwSize = m_pFileRead->GetSize();
-  if (dwSize < (FX_FILESIZE)kReqSize)
-    return LinearizationUnknown;
-
-  std::vector<uint8_t> buffer(kReqSize);
-  m_pFileRead->ReadBlock(buffer.data(), 0, kReqSize);
-  if (IsLinearizedFile(buffer.data(), kReqSize))
-    return Linearized;
-
-  return NotLinearized;
+  switch (CheckHeaderAndLinearized()) {
+    case DocAvailStatus::DataAvailable:
+      return m_pLinearized ? DocLinearizationStatus::Linearized
+                           : DocLinearizationStatus::NotLinearized;
+    case DocAvailStatus::DataNotAvailable:
+      return DocLinearizationStatus::LinearizationUnknown;
+    case DocAvailStatus::DataError:
+      return DocLinearizationStatus::NotLinearized;
+    default:
+      NOTREACHED();
+      return DocLinearizationStatus::LinearizationUnknown;
+  }
 }
 
-bool CPDF_DataAvail::IsLinearized() {
-  return !!m_pLinearized;
-}
+CPDF_DataAvail::DocAvailStatus CPDF_DataAvail::CheckHeaderAndLinearized() {
+  if (m_bHeaderAvail)
+    return DocAvailStatus::DataAvailable;
 
-bool CPDF_DataAvail::IsLinearizedFile(uint8_t* pData, uint32_t dwLen) {
-  if (m_pLinearized)
-    return true;
+  const CPDF_ReadValidator::Session read_session(GetValidator().Get());
+  const int32_t header_offset = GetHeaderOffset(GetValidator());
+  if (GetValidator()->has_read_problems())
+    return DocAvailStatus::DataNotAvailable;
 
-  auto file = pdfium::MakeRetain<CFX_MemoryStream>(
-      pData, static_cast<size_t>(dwLen), false);
-  int32_t offset = GetHeaderOffset(file);
-  if (offset == kInvalidHeaderOffset) {
-    m_docStatus = PDF_DATAAVAIL_ERROR;
-    return false;
-  }
+  if (header_offset == kInvalidHeaderOffset)
+    return DocAvailStatus::DataError;
 
-  m_dwHeaderOffset = offset;
-  m_syntaxParser.InitParser(file, offset);
-  m_syntaxParser.SetPos(m_syntaxParser.m_HeaderOffset + 9);
+  m_dwHeaderOffset = header_offset;
 
-  bool bNumber;
-  ByteString wordObjNum = m_syntaxParser.GetNextWord(&bNumber);
-  if (!bNumber)
-    return false;
+  m_syntaxParser.InitParserWithValidator(GetValidator(), header_offset);
+  m_pLinearized = CPDF_LinearizedHeader::Parse(&m_syntaxParser);
+  if (GetValidator()->has_read_problems())
+    return DocAvailStatus::DataNotAvailable;
 
-  uint32_t objnum = FXSYS_atoui(wordObjNum.c_str());
-  m_pLinearized = CPDF_LinearizedHeader::CreateForObject(
-      ParseIndirectObjectAt(m_syntaxParser.m_HeaderOffset + 9, objnum));
-  if (!m_pLinearized ||
-      m_pLinearized->GetFileSize() != m_pFileRead->GetSize()) {
-    m_pLinearized.reset();
-    return false;
-  }
-  return true;
+  m_bHeaderAvail = true;
+  return DocAvailStatus::DataAvailable;
 }
 
 bool CPDF_DataAvail::CheckEnd() {
diff --git a/core/fpdfapi/parser/cpdf_data_avail.h b/core/fpdfapi/parser/cpdf_data_avail.h
index 2d46be1..0481408 100644
--- a/core/fpdfapi/parser/cpdf_data_avail.h
+++ b/core/fpdfapi/parser/cpdf_data_avail.h
@@ -103,7 +103,6 @@
   DocAvailStatus IsPageAvail(uint32_t dwPage, DownloadHints* pHints);
   DocFormStatus IsFormAvail(DownloadHints* pHints);
   DocLinearizationStatus IsLinearizedPDF();
-  bool IsLinearized();
   RetainPtr<IFX_SeekableReadStream> GetFileRead() const;
   int GetPageCount() const;
   CPDF_Dictionary* GetPage(int index);
@@ -140,7 +139,7 @@
   DocFormStatus CheckAcroForm();
   bool CheckPageStatus();
 
-  bool IsLinearizedFile(uint8_t* pData, uint32_t dwLen);
+  DocAvailStatus CheckHeaderAndLinearized();
   void SetStartOffset(FX_FILESIZE dwOffset);
   bool GetNextToken(ByteString* token);
   bool GetNextChar(uint8_t& ch);
@@ -219,6 +218,7 @@
   std::map<uint32_t, std::unique_ptr<CPDF_PageObjectAvail>> m_PagesObjAvail;
   std::map<const CPDF_Object*, std::unique_ptr<CPDF_PageObjectAvail>>
       m_PagesResourcesAvail;
+  bool m_bHeaderAvail;
 };
 
 #endif  // CORE_FPDFAPI_PARSER_CPDF_DATA_AVAIL_H_
diff --git a/core/fpdfapi/parser/cpdf_linearized_header.cpp b/core/fpdfapi/parser/cpdf_linearized_header.cpp
index 98cdcc4..ce22c55 100644
--- a/core/fpdfapi/parser/cpdf_linearized_header.cpp
+++ b/core/fpdfapi/parser/cpdf_linearized_header.cpp
@@ -12,10 +12,13 @@
 #include "core/fpdfapi/parser/cpdf_array.h"
 #include "core/fpdfapi/parser/cpdf_dictionary.h"
 #include "core/fpdfapi/parser/cpdf_number.h"
+#include "core/fpdfapi/parser/cpdf_syntax_parser.h"
 #include "third_party/base/ptr_util.h"
 
 namespace {
 
+constexpr FX_FILESIZE kLinearizedHeaderOffset = 9;
+
 template <class T>
 bool IsValidNumericDictionaryValue(const CPDF_Dictionary* pDict,
                                    const char* key,
@@ -32,21 +35,48 @@
   return static_cast<T>(raw_value) >= min_value;
 }
 
+bool IsLinearizedHeaderValid(const CPDF_LinearizedHeader* header,
+                             FX_FILESIZE file_size) {
+  ASSERT(header);
+  return header->GetFileSize() == file_size &&
+         header->GetMainXRefTableFirstEntryOffset() < file_size &&
+         header->GetPageCount() > 0 &&
+         header->GetFirstPageEndOffset() < file_size &&
+         header->GetLastXRefOffset() < file_size &&
+         header->GetHintStart() < file_size;
+}
+
 }  // namespace
 
 // static
-std::unique_ptr<CPDF_LinearizedHeader> CPDF_LinearizedHeader::CreateForObject(
-    std::unique_ptr<CPDF_Object> pObj) {
-  auto pDict = ToDictionary(std::move(pObj));
+std::unique_ptr<CPDF_LinearizedHeader> CPDF_LinearizedHeader::Parse(
+    CPDF_SyntaxParser* parser) {
+  parser->SetPos(kLinearizedHeaderOffset);
+
+  const auto pDict = ToDictionary(
+      parser->GetIndirectObject(nullptr, CPDF_SyntaxParser::ParseType::kLoose));
+
   if (!pDict || !pDict->KeyExist("Linearized") ||
       !IsValidNumericDictionaryValue<FX_FILESIZE>(pDict.get(), "L", 1) ||
       !IsValidNumericDictionaryValue<uint32_t>(pDict.get(), "P", 0, false) ||
       !IsValidNumericDictionaryValue<FX_FILESIZE>(pDict.get(), "T", 1) ||
       !IsValidNumericDictionaryValue<uint32_t>(pDict.get(), "N", 0) ||
       !IsValidNumericDictionaryValue<FX_FILESIZE>(pDict.get(), "E", 1) ||
-      !IsValidNumericDictionaryValue<uint32_t>(pDict.get(), "O", 1))
+      !IsValidNumericDictionaryValue<uint32_t>(pDict.get(), "O", 1)) {
     return nullptr;
-  return pdfium::WrapUnique(new CPDF_LinearizedHeader(pDict.get()));
+  }
+  // Move parser to the start of the xref table for the document's first page
+  // (skipping the endobj keyword).
+  if (parser->GetNextWord(nullptr) != "endobj")
+    return nullptr;
+
+  auto result = pdfium::WrapUnique(new CPDF_LinearizedHeader(pDict.get()));
+  result->m_szLastXRefOffset = parser->GetPos();
+
+  return IsLinearizedHeaderValid(result.get(),
+                                 parser->GetFileAccess()->GetSize())
+             ? std::move(result)
+             : nullptr;
 }
 
 CPDF_LinearizedHeader::CPDF_LinearizedHeader(const CPDF_Dictionary* pDict) {
diff --git a/core/fpdfapi/parser/cpdf_linearized_header.h b/core/fpdfapi/parser/cpdf_linearized_header.h
index 98ae9c6..d732160 100644
--- a/core/fpdfapi/parser/cpdf_linearized_header.h
+++ b/core/fpdfapi/parser/cpdf_linearized_header.h
@@ -14,12 +14,13 @@
 
 class CPDF_Dictionary;
 class CPDF_Object;
+class CPDF_SyntaxParser;
 
 class CPDF_LinearizedHeader {
  public:
   ~CPDF_LinearizedHeader();
-  static std::unique_ptr<CPDF_LinearizedHeader> CreateForObject(
-      std::unique_ptr<CPDF_Object> pObj);
+  static std::unique_ptr<CPDF_LinearizedHeader> Parse(
+      CPDF_SyntaxParser* parser);
 
   // Will only return values > 0.
   FX_FILESIZE GetFileSize() const { return m_szFileSize; }
@@ -33,6 +34,8 @@
   FX_FILESIZE GetFirstPageEndOffset() const { return m_szFirstPageEndOffset; }
   // Will only return values > 0.
   uint32_t GetFirstPageObjNum() const { return m_FirstPageObjNum; }
+  // Will only return values > 0.
+  FX_FILESIZE GetLastXRefOffset() const { return m_szLastXRefOffset; }
 
   bool HasHintTable() const;
   // Will only return values > 0.
@@ -51,6 +54,7 @@
   uint32_t m_FirstPageObjNum = 0;
   FX_FILESIZE m_szHintStart = 0;
   uint32_t m_HintLength = 0;
+  FX_FILESIZE m_szLastXRefOffset = 0;
 };
 
 #endif  // CORE_FPDFAPI_PARSER_CPDF_LINEARIZED_HEADER_H_
diff --git a/core/fpdfapi/parser/cpdf_parser.cpp b/core/fpdfapi/parser/cpdf_parser.cpp
index 6957c84..7a8f4f9 100644
--- a/core/fpdfapi/parser/cpdf_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_parser.cpp
@@ -1280,34 +1280,8 @@
   return dwPermission;
 }
 
-bool CPDF_Parser::ParseLinearizedHeader() {
-  m_pSyntax->SetPos(m_pSyntax->m_HeaderOffset + 9);
-
-  FX_FILESIZE SavedPos = m_pSyntax->GetPos();
-  bool bIsNumber;
-  ByteString word = m_pSyntax->GetNextWord(&bIsNumber);
-  if (!bIsNumber)
-    return false;
-
-  word = m_pSyntax->GetNextWord(&bIsNumber);
-  if (!bIsNumber)
-    return false;
-
-  if (m_pSyntax->GetKeyword() != "obj") {
-    m_pSyntax->SetPos(SavedPos);
-    return false;
-  }
-
-  m_pLinearized =
-      CPDF_LinearizedHeader::CreateForObject(m_pSyntax->GetObjectBody(nullptr));
-  if (!m_pLinearized)
-    return false;
-
-  // Move parser onto first page xref table start.
-  m_pSyntax->GetNextWord(nullptr);
-
-  m_LastXRefOffset = m_pSyntax->GetPos();
-  return true;
+std::unique_ptr<CPDF_LinearizedHeader> CPDF_Parser::ParseLinearizedHeader() {
+  return CPDF_LinearizedHeader::Parse(m_pSyntax.get());
 }
 
 CPDF_Parser::Error CPDF_Parser::StartLinearizedParse(
@@ -1320,12 +1294,14 @@
   if (!InitSyntaxParser(pFileAccess))
     return FORMAT_ERROR;
 
-  if (!ParseLinearizedHeader())
+  m_pLinearized = ParseLinearizedHeader();
+  if (!m_pLinearized)
     return StartParseInternal(std::move(pDocument));
 
   m_bHasParsed = true;
   m_pDocument = pDocument;
 
+  m_LastXRefOffset = m_pLinearized->GetLastXRefOffset();
   FX_FILESIZE dwFirstXRefOffset = m_LastXRefOffset;
   bool bXRefRebuilt = false;
   bool bLoadV4 = LoadCrossRefV4(dwFirstXRefOffset, false);
diff --git a/core/fpdfapi/parser/cpdf_parser.h b/core/fpdfapi/parser/cpdf_parser.h
index a58838e..6c8cfbd 100644
--- a/core/fpdfapi/parser/cpdf_parser.h
+++ b/core/fpdfapi/parser/cpdf_parser.h
@@ -166,7 +166,7 @@
   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos);
   Error LoadLinearizedMainXRefTable();
   RetainPtr<CPDF_StreamAcc> GetObjectStream(uint32_t number);
-  bool ParseLinearizedHeader();
+  std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
   void SetEncryptDictionary(CPDF_Dictionary* pDict);
   void ShrinkObjectMap(uint32_t size);
   // A simple check whether the cross reference table matches with