Add experiment for analyzing debug info

Analyze entropy of debug info to estimates a close upper bound
savings from huffman encoding debug infos.

Also measure how many debug info bytes are dedupable if you exclude
the line number and parameter names.

Sample output:
Debug info bytes 96101012(5.70%)
  DBG_END_SEQUENCE: 6401069(0.38%)
  DBG_ADVANCE_PC: 3709064(0.22%)
  DBG_ADVANCE_LINE: 8620724(0.51%)
  DBG_START_LOCAL: 5244232(0.31%)
  DBG_START_LOCAL_EXTENDED: 1763845(0.10%)
  DBG_END_LOCAL: 1216044(0.07%)
  DBG_RESTART_LOCAL: 565412(0.03%)
  DBG_SET_PROLOGUE bytes 5768714(0.34%)
  DBG_SET_FILE bytes 0(0.00%)
  special: 36310220(2.15%)
Debug info entropy 69199724(4.10%)
Debug info opcode bytes 55613021(3.30%)
Debug info opcode entropy 34792401(2.06%)
Debug info non header bytes 69599324(4.13%)
Debug info deduped non header bytes 52475493(3.11%)

Bug: 77721545
Test: test-art-host
Change-Id: I031322e3b79a1572fcbb9e513ded9708e3b48354
diff --git a/tools/dexanalyze/dexanalyze.cc b/tools/dexanalyze/dexanalyze.cc
index 38725d4..7d7e5f2 100644
--- a/tools/dexanalyze/dexanalyze.cc
+++ b/tools/dexanalyze/dexanalyze.cc
@@ -49,6 +49,8 @@
         << "Usage " << argv[0] << " [options] <dex files>\n"
         << "    [options] is a combination of the following\n"
         << "    -count_indices (Count dex indices accessed from code items)\n"
+        << "    -analyze-strings (Analyze string data)\n"
+        << "    -analyze-debug-info (Analyze debug info)\n"
         << "    -i (Ignore Dex checksum and verification failures)\n"
         << "    -a (Run all experiments)\n"
         << "    -d (Dump on per DEX basis)\n";
@@ -69,6 +71,8 @@
           exp_count_indices_ = true;
         } else if (arg == "-analyze-strings") {
           exp_analyze_strings_ = true;
+        } else if (arg == "-analyze-debug-info") {
+          exp_debug_info_ = true;
         } else if (arg == "-d") {
           dump_per_input_dex_ = true;
         } else if (!arg.empty() && arg[0] == '-') {
@@ -90,6 +94,7 @@
     bool exp_count_indices_ = false;
     bool exp_code_metrics_ = false;
     bool exp_analyze_strings_ = false;
+    bool exp_debug_info_ = false;
     bool run_all_experiments_ = false;
     std::vector<std::string> filenames_;
   };
@@ -106,6 +111,9 @@
       if (options->run_all_experiments_ || options->exp_code_metrics_) {
         experiments_.emplace_back(new CodeMetrics);
       }
+      if (options->run_all_experiments_ || options->exp_debug_info_) {
+        experiments_.emplace_back(new AnalyzeDebugInfo);
+      }
     }
 
     bool ProcessDexFile(const DexFile& dex_file) {
@@ -120,6 +128,7 @@
     void Dump(std::ostream& os) {
       for (std::unique_ptr<Experiment>& experiment : experiments_) {
         experiment->Dump(os, total_size_);
+        os << "\n";
       }
     }
 
diff --git a/tools/dexanalyze/dexanalyze_experiments.cc b/tools/dexanalyze/dexanalyze_experiments.cc
index 7006370..1a3b89c 100644
--- a/tools/dexanalyze/dexanalyze_experiments.cc
+++ b/tools/dexanalyze/dexanalyze_experiments.cc
@@ -75,6 +75,128 @@
   return len;
 }
 
+void AnalyzeDebugInfo::ProcessDexFile(const DexFile& dex_file) {
+  std::set<const uint8_t*> seen;
+  std::vector<size_t> counts(256, 0u);
+  std::vector<size_t> opcode_counts(256, 0u);
+  std::set<std::vector<uint8_t>> unique_non_header;
+  for (ClassAccessor accessor : dex_file.GetClasses()) {
+    for (const ClassAccessor::Method& method : accessor.GetMethods()) {
+      CodeItemDebugInfoAccessor code_item(dex_file, method.GetCodeItem(), method.GetIndex());
+      const uint8_t* debug_info = dex_file.GetDebugInfoStream(code_item.DebugInfoOffset());
+      if (debug_info != nullptr && seen.insert(debug_info).second) {
+        const uint8_t* stream = debug_info;
+        DecodeUnsignedLeb128(&stream);  // line_start
+        uint32_t parameters_size = DecodeUnsignedLeb128(&stream);
+        for (uint32_t i = 0; i < parameters_size; ++i) {
+          DecodeUnsignedLeb128P1(&stream);  // Parameter name.
+        }
+        bool done = false;
+        const uint8_t* after_header_start = stream;
+        while (!done) {
+          const uint8_t* const op_start = stream;
+          uint8_t opcode = *stream++;
+          ++opcode_counts[opcode];
+          ++total_opcode_bytes_;
+          switch (opcode) {
+            case DexFile::DBG_END_SEQUENCE:
+              ++total_end_seq_bytes_;
+              done = true;
+              break;
+            case DexFile::DBG_ADVANCE_PC:
+              DecodeUnsignedLeb128(&stream);  // addr_diff
+              total_advance_pc_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_ADVANCE_LINE:
+              DecodeSignedLeb128(&stream);  // line_diff
+              total_advance_line_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_START_LOCAL:
+              DecodeUnsignedLeb128(&stream);  // register_num
+              DecodeUnsignedLeb128P1(&stream);  // name_idx
+              DecodeUnsignedLeb128P1(&stream);  // type_idx
+              total_start_local_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_START_LOCAL_EXTENDED:
+              DecodeUnsignedLeb128(&stream);  // register_num
+              DecodeUnsignedLeb128P1(&stream);  // name_idx
+              DecodeUnsignedLeb128P1(&stream);  // type_idx
+              DecodeUnsignedLeb128P1(&stream);  // sig_idx
+              total_start_local_extended_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_END_LOCAL:
+              DecodeUnsignedLeb128(&stream);  // register_num
+              total_end_local_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_RESTART_LOCAL:
+              DecodeUnsignedLeb128(&stream);  // register_num
+              total_restart_local_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_SET_PROLOGUE_END:
+            case DexFile::DBG_SET_EPILOGUE_BEGIN:
+              total_epilogue_bytes_ += stream - op_start;
+              break;
+            case DexFile::DBG_SET_FILE: {
+              DecodeUnsignedLeb128P1(&stream);  // name_idx
+              total_set_file_bytes_ += stream - op_start;
+              break;
+            }
+            default: {
+              total_other_bytes_ += stream - op_start;
+              break;
+            }
+          }
+        }
+        const size_t bytes = stream - debug_info;
+        total_bytes_ += bytes;
+        total_non_header_bytes_ += stream - after_header_start;
+        if (unique_non_header.insert(std::vector<uint8_t>(after_header_start, stream)).second) {
+          total_unique_non_header_bytes_ += stream - after_header_start;
+        }
+        for (size_t i = 0; i < bytes; ++i) {
+          ++counts[debug_info[i]];
+        }
+      }
+    }
+  }
+  auto calc_entropy = [](std::vector<size_t> data) {
+    size_t total = std::accumulate(data.begin(), data.end(), 0u);
+    double avg_entropy = 0.0;
+    for (size_t c : data) {
+      if (c > 0) {
+        double ratio = static_cast<double>(c) / static_cast<double>(total);
+        avg_entropy -= ratio * log(ratio) / log(256.0);
+      }
+    }
+    return avg_entropy * total;
+  };
+  total_entropy_ += calc_entropy(counts);
+  total_opcode_entropy_ += calc_entropy(opcode_counts);
+}
+
+void AnalyzeDebugInfo::Dump(std::ostream& os, uint64_t total_size) const {
+  os << "Debug info bytes " << Percent(total_bytes_, total_size) << "\n";
+
+  os << "  DBG_END_SEQUENCE: " << Percent(total_end_seq_bytes_, total_size) << "\n";
+  os << "  DBG_ADVANCE_PC: " << Percent(total_advance_pc_bytes_, total_size) << "\n";
+  os << "  DBG_ADVANCE_LINE: " << Percent(total_advance_line_bytes_, total_size) << "\n";
+  os << "  DBG_START_LOCAL: " << Percent(total_start_local_bytes_, total_size) << "\n";
+  os << "  DBG_START_LOCAL_EXTENDED: "
+     << Percent(total_start_local_extended_bytes_, total_size) << "\n";
+  os << "  DBG_END_LOCAL: " << Percent(total_end_local_bytes_, total_size) << "\n";
+  os << "  DBG_RESTART_LOCAL: " << Percent(total_restart_local_bytes_, total_size) << "\n";
+  os << "  DBG_SET_PROLOGUE bytes " << Percent(total_epilogue_bytes_, total_size) << "\n";
+  os << "  DBG_SET_FILE bytes " << Percent(total_set_file_bytes_, total_size) << "\n";
+  os << "  special: "
+      << Percent(total_other_bytes_, total_size) << "\n";
+  os << "Debug info entropy " << Percent(total_entropy_, total_size) << "\n";
+  os << "Debug info opcode bytes " << Percent(total_opcode_bytes_, total_size) << "\n";
+  os << "Debug info opcode entropy " << Percent(total_opcode_entropy_, total_size) << "\n";
+  os << "Debug info non header bytes " << Percent(total_non_header_bytes_, total_size) << "\n";
+  os << "Debug info deduped non header bytes "
+     << Percent(total_unique_non_header_bytes_, total_size) << "\n";
+}
+
 void AnalyzeStrings::ProcessDexFile(const DexFile& dex_file) {
   std::vector<std::string> strings;
   for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
diff --git a/tools/dexanalyze/dexanalyze_experiments.h b/tools/dexanalyze/dexanalyze_experiments.h
index 7ba2a49..a2621c8 100644
--- a/tools/dexanalyze/dexanalyze_experiments.h
+++ b/tools/dexanalyze/dexanalyze_experiments.h
@@ -51,6 +51,32 @@
   int64_t total_num_prefixes_ = 0u;
 };
 
+// Analyze debug info sizes.
+class AnalyzeDebugInfo  : public Experiment {
+ public:
+  void ProcessDexFile(const DexFile& dex_file);
+  void Dump(std::ostream& os, uint64_t total_size) const;
+
+ private:
+  int64_t total_bytes_ = 0u;
+  int64_t total_entropy_ = 0u;
+  int64_t total_opcode_bytes_ = 0u;
+  int64_t total_opcode_entropy_ = 0u;
+  int64_t total_non_header_bytes_ = 0u;
+  int64_t total_unique_non_header_bytes_ = 0u;
+  // Opcode and related data.
+  int64_t total_end_seq_bytes_ = 0u;
+  int64_t total_advance_pc_bytes_ = 0u;
+  int64_t total_advance_line_bytes_ = 0u;
+  int64_t total_start_local_bytes_ = 0u;
+  int64_t total_start_local_extended_bytes_ = 0u;
+  int64_t total_end_local_bytes_ = 0u;
+  int64_t total_restart_local_bytes_ = 0u;
+  int64_t total_epilogue_bytes_ = 0u;
+  int64_t total_set_file_bytes_ = 0u;
+  int64_t total_other_bytes_ = 0u;
+};
+
 // Count numbers of dex indices.
 class CountDexIndices : public Experiment {
  public: