AU: improve progress logging during updates

We currently emit a log message for operations at fixed index increments
of 1000. This is fairly useful in keeping the number of log messages
small, but otherwise quite arbitrary and useless in giving an actual
sense of progress, throughput, etc for updates of varying number of
operations in them and subject to network latencies.  Further, this
progress report does not take into the account the download progress,
which appears to be the completion metric used toward the UI.

This CL proposes a simple mechanism for computing an "overall update
progress" and an adaptive method for emitting a progress log message in
reasonable intervals. The raw metrics currently used include (i) the
number of bytes downloaded out of the expected total; (ii) the number of
operations successfully applied out of the expected total.  These are
weighed into a single percentage counter based on a prefixed set of
weights.  Logging occurs based on either (i) completion of update
"chunks" of some predetermined number; or (ii) having enough time
elapsed since the last log message.  This guarantees that we'll see
enough progress messages at regular intervals, but not too many of them.

We're also similarly logging the relative position in a resumed update
right before we start applying from the point we left it.

Note that the max number of regular log intervals, as well as the
timeout to force a progress log, can be easily tweaked by changing
compile-time constants.  They are currently set to 10 (or every 10%) and
30 (for 30 seconds since the last progress log), respectively, which
appear to be a reasonable choices.

Here's an example from an actual update log:

[0118/145759:INFO:delta_performer.cc(451)] Starting to apply update payload operations
[0118/145821:INFO:delta_performer.cc(150)] Completed 100/1040 operations (9%), 38363576/319659860 bytes downloaded (12%), overall progress 10%
[0118/145835:INFO:delta_performer.cc(150)] Completed 213/1040 operations (20%), 63939000/319659860 bytes downloaded (20%), overall progress 20%
[0118/145857:INFO:delta_performer.cc(150)] Completed 298/1040 operations (28%), 102293944/319659860 bytes downloaded (32%), overall progress 30%
[0118/145912:INFO:delta_performer.cc(150)] Completed 421/1040 operations (40%), 127869368/319659860 bytes downloaded (40%), overall progress 40%
[0118/145935:INFO:delta_performer.cc(150)] Completed 507/1040 operations (48%), 166224312/319659860 bytes downloaded (52%), overall progress 50%

(update interrupted and resumed)

[0118/150058:INFO:delta_performer.cc(150)] Resuming after 582/1040 operations (55%), 195577374/319659860 bytes downloaded (61%), overall progress 57%
[0118/150058:INFO:delta_performer.cc(451)] Starting to apply update payload operations
...
[0118/150102:INFO:delta_performer.cc(150)] Completed 604/1040 operations (58%), 200934046/319659860 bytes downloaded (62%), overall progress 60%
[0118/150116:INFO:delta_performer.cc(150)] Completed 749/1040 operations (72%), 221543046/319659860 bytes downloaded (69%), overall progress 70%
[0118/150134:INFO:delta_performer.cc(150)] Completed 840/1040 operations (80%), 255736454/319659860 bytes downloaded (80%), overall progress 80%
[0118/150154:INFO:delta_performer.cc(150)] Completed 936/1040 operations (90%), 289225350/319659860 bytes downloaded (90%), overall progress 90%
[0118/150212:INFO:delta_performer.cc(150)] Completed 1040/1040 operations (100%), 319659860/319659860 bytes downloaded (100%), overall progress 100%

BUG=chromium-os:37910
TEST=Passes unit tests
TEST=Update works, progress logged as expected on x86-alex

Change-Id: I2ec5511fc20df05a264e59651528c65133209132
Reviewed-on: https://gerrit.chromium.org/gerrit/41301
Tested-by: Gilad Arnold <garnold@chromium.org>
Reviewed-by: Jay Srinivasan <jaysri@chromium.org>
Commit-Queue: Gilad Arnold <garnold@chromium.org>
diff --git a/delta_performer.cc b/delta_performer.cc
index 3f549cd..c299d79 100644
--- a/delta_performer.cc
+++ b/delta_performer.cc
@@ -39,6 +39,10 @@
 const uint64_t DeltaPerformer::kDeltaManifestSizeSize = 8;
 const char DeltaPerformer::kUpdatePayloadPublicKeyPath[] =
     "/usr/share/update_engine/update-payload-key.pub.pem";
+const unsigned DeltaPerformer::kProgressLogMaxChunks = 10;
+const unsigned DeltaPerformer::kProgressLogTimeoutSeconds = 30;
+const unsigned DeltaPerformer::kProgressDownloadWeight = 50;
+const unsigned DeltaPerformer::kProgressOperationsWeight = 50;
 
 namespace {
 const int kUpdateStateOperationInvalid = -1;
@@ -114,6 +118,103 @@
 
 }  // namespace {}
 
+
+// Computes the ratio of |part| and |total|, scaled to |norm|, using integer
+// arithmetic.
+static uint64_t IntRatio(uint64_t part, uint64_t total, uint64_t norm) {
+  return part * norm / total;
+}
+
+void DeltaPerformer::LogProgress(const char* message_prefix) {
+  // Format operations total count and percentage.
+  string total_operations_str("?");
+  string completed_percentage_str("");
+  if (num_total_operations_) {
+    total_operations_str = StringPrintf("%zu", num_total_operations_);
+    // Upcasting to 64-bit to avoid overflow, back to size_t for formatting.
+    completed_percentage_str =
+        StringPrintf(" (%llu%%)",
+                     IntRatio(next_operation_num_, num_total_operations_,
+                              100));
+  }
+
+  // Format download total count and percentage.
+  size_t payload_size = install_plan_->payload_size;
+  string payload_size_str("?");
+  string downloaded_percentage_str("");
+  if (payload_size) {
+    payload_size_str = StringPrintf("%zu", payload_size);
+    // Upcasting to 64-bit to avoid overflow, back to size_t for formatting.
+    downloaded_percentage_str =
+        StringPrintf(" (%llu%%)",
+                     IntRatio(total_bytes_received_, payload_size, 100));
+  }
+
+  LOG(INFO) << (message_prefix ? message_prefix : "") << next_operation_num_
+            << "/" << total_operations_str << " operations"
+            << completed_percentage_str << ", " << total_bytes_received_
+            << "/" << payload_size_str << " bytes downloaded"
+            << downloaded_percentage_str << ", overall progress "
+            << overall_progress_ << "%";
+}
+
+void DeltaPerformer::UpdateOverallProgress(bool force_log,
+                                           const char* message_prefix) {
+  // Compute our download and overall progress.
+  unsigned new_overall_progress = 0;
+  COMPILE_ASSERT(kProgressDownloadWeight + kProgressOperationsWeight == 100,
+                 progress_weight_dont_add_up);
+  // Only consider download progress if its total size is known; otherwise
+  // adjust the operations weight to compensate for the absence of download
+  // progress. Also, make sure to cap the download portion at
+  // kProgressDownloadWeight, in case we end up downloading more than we
+  // initially expected (this indicates a problem, but could generally happen).
+  // TODO(garnold) the correction of operations weight when we do not have the
+  // total payload size, as well as the conditional guard below, should both be
+  // eliminated once we ensure that the payload_size in the install plan is
+  // always given and is non-zero. This currently isn't the case during unit
+  // tests (see chromium-os:37969).
+  size_t payload_size = install_plan_->payload_size;
+  unsigned actual_operations_weight = kProgressOperationsWeight;
+  if (payload_size)
+    new_overall_progress += min(
+        static_cast<unsigned>(IntRatio(total_bytes_received_, payload_size,
+                                       kProgressDownloadWeight)),
+        kProgressDownloadWeight);
+  else
+    actual_operations_weight += kProgressDownloadWeight;
+
+  // Only add completed operations if their total number is known; we definitely
+  // expect an update to have at least one operation, so the expectation is that
+  // this will eventually reach |actual_operations_weight|.
+  if (num_total_operations_)
+    new_overall_progress += IntRatio(next_operation_num_, num_total_operations_,
+                                     actual_operations_weight);
+
+  // Progress ratio cannot recede, unless our assumptions about the total
+  // payload size, total number of operations, or the monotonicity of progress
+  // is breached.
+  if (new_overall_progress < overall_progress_) {
+    LOG(WARNING) << "progress counter receded from " << overall_progress_
+                 << "% down to " << new_overall_progress << "%; this is a bug";
+    force_log = true;
+  }
+  overall_progress_ = new_overall_progress;
+
+  // Update chunk index, log as needed: if forced by called, or we completed a
+  // progress chunk, or a timeout has expired.
+  base::Time curr_time = base::Time::Now();
+  unsigned curr_progress_chunk =
+      overall_progress_ * kProgressLogMaxChunks / 100;
+  if (force_log || curr_progress_chunk > last_progress_chunk_ ||
+      curr_time > forced_progress_log_time_) {
+    forced_progress_log_time_ = curr_time + forced_progress_log_wait_;
+    LogProgress(message_prefix);
+  }
+  last_progress_chunk_ = curr_progress_chunk;
+}
+
+
 // Returns true if |op| is idempotent -- i.e., if we can interrupt it and repeat
 // it safely. Returns false otherwise.
 bool DeltaPerformer::IsIdempotentOperation(
@@ -316,6 +417,11 @@
   const char* c_bytes = reinterpret_cast<const char*>(bytes);
   buffer_.insert(buffer_.end(), c_bytes, c_bytes + count);
   system_state_->payload_state()->DownloadProgress(count);
+
+  // Update the total byte downloaded count and the progress logs.
+  total_bytes_received_ += count;
+  UpdateOverallProgress(false, "Completed ");
+
   if (!manifest_valid_) {
     MetadataParseResult result = ParsePayloadMetadata(buffer_,
                                                       &manifest_,
@@ -341,24 +447,29 @@
       LOG(ERROR) << "Unable to prime the update state.";
       return false;
     }
+
+    num_rootfs_operations_ = manifest_.install_operations_size();
+    num_total_operations_ =
+        num_rootfs_operations_ + manifest_.kernel_install_operations_size();
+    if (next_operation_num_ > 0)
+      UpdateOverallProgress(true, "Resuming after ");
+    LOG(INFO) << "Starting to apply update payload operations";
   }
-  ssize_t total_operations = manifest_.install_operations_size() +
-      manifest_.kernel_install_operations_size();
-  while (next_operation_num_ < total_operations) {
+
+  while (next_operation_num_ < num_total_operations_) {
+    const bool is_kernel_partition =
+        (next_operation_num_ >= num_rootfs_operations_);
     const DeltaArchiveManifest_InstallOperation &op =
-        next_operation_num_ < manifest_.install_operations_size() ?
-        manifest_.install_operations(next_operation_num_) :
+        is_kernel_partition ?
         manifest_.kernel_install_operations(
-            next_operation_num_ - manifest_.install_operations_size());
+            next_operation_num_ - num_rootfs_operations_) :
+        manifest_.install_operations(next_operation_num_);
     if (!CanPerformInstallOperation(op)) {
       // This means we don't have enough bytes received yet to carry out the
       // next operation.
       return true;
     }
 
-    bool should_log = (next_operation_num_ % 1000 == 0 ||
-                       next_operation_num_ == total_operations - 1);
-
     // Validate the operation only if the metadata signature is present.
     // Otherwise, keep the old behavior. This serves as a knob to disable
     // the validation logic in case we find some regression after rollout.
@@ -369,7 +480,7 @@
       // Note: Validate must be called only if CanPerformInstallOperation is
       // called. Otherwise, we might be failing operations before even if there
       // isn't sufficient data to compute the proper hash.
-      *error = ValidateOperationHash(op, should_log);
+      *error = ValidateOperationHash(op);
       if (*error != kActionCodeSuccess) {
         if (install_plan_->hash_checks_mandatory) {
           LOG(ERROR) << "Mandatory operation hash check failed";
@@ -387,12 +498,6 @@
     ScopedTerminatorExitUnblocker exit_unblocker =
         ScopedTerminatorExitUnblocker();  // Avoids a compiler unused var bug.
     // Log every thousandth operation, and also the first and last ones
-    if (should_log) {
-      LOG(INFO) << "Performing operation " << (next_operation_num_ + 1) << "/"
-                << total_operations;
-    }
-    bool is_kernel_partition =
-        (next_operation_num_ >= manifest_.install_operations_size());
     if (op.type() == DeltaArchiveManifest_InstallOperation_Type_REPLACE ||
         op.type() == DeltaArchiveManifest_InstallOperation_Type_REPLACE_BZ) {
       if (!PerformReplaceOperation(op, is_kernel_partition)) {
@@ -416,7 +521,9 @@
         return false;
       }
     }
+
     next_operation_num_++;
+    UpdateOverallProgress(false, "Completed ");
     CheckpointUpdateProgress();
   }
   return true;
@@ -754,8 +861,7 @@
 }
 
 ActionExitCode DeltaPerformer::ValidateOperationHash(
-    const DeltaArchiveManifest_InstallOperation& operation,
-    bool should_log) {
+    const DeltaArchiveManifest_InstallOperation& operation) {
 
   if (!operation.data_sha256_hash().size()) {
     if (!operation.data_length()) {
@@ -818,9 +924,6 @@
     return kActionCodeDownloadOperationHashMismatch;
   }
 
-  if (should_log)
-    LOG(INFO) << "Validated operation " << next_operation_num_ + 1;
-
   return kActionCodeSuccess;
 }
 
@@ -1107,6 +1210,10 @@
                         manifest_metadata_size > 0);
   manifest_metadata_size_ = manifest_metadata_size;
 
+  // Advance the download progress to reflect what doesn't need to be
+  // re-downloaded.
+  total_bytes_received_ += buffer_offset_;
+
   // Speculatively count the resume as a failure.
   int64_t resumed_update_failures;
   if (prefs_->GetInt64(kPrefsResumedUpdateFailures, &resumed_update_failures)) {