Fix a scalability problem observed whilst running Helgrind on a large
workload: when searching a size-segregated freelist for a block big
enough to satisfy an allocation request, don't scan all the way around
the list.  Instead give up after 100 blocks and try the next freelist
up.  The pathological case (as observed) is that the freelist contains
tens of thousands of blocks, all of which are too small for the
current request, so all of them are visited pointlessly.  Whenever the
new heuristic fires, the freelist's start point is moved along by one
block, so that future searches eventually inspect the entire freelist,
just very slowly.
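
In outline, the heuristic behaves like the following standalone C
sketch.  All the names here (FreeBlock, NUM_LISTS, SCAN_LIMIT,
list_for_size, find_block) are hypothetical stand-ins for the real
entities in m_mallocfree.c (Block, N_MALLOC_LISTS, pszB_to_listNo and
friends); the authoritative change is the diff below.

   #include <stdio.h>
   #include <stddef.h>

   #define NUM_LISTS   10   /* number of size-segregated freelists */
   #define SCAN_LIMIT 100   /* give up on a list after this many blocks */

   typedef struct FreeBlock {
      size_t            size;
      struct FreeBlock* next;   /* circular singly-linked freelist */
   } FreeBlock;

   static FreeBlock* freelist[NUM_LISTS];

   /* Toy stand-in for pszB_to_listNo: list n holds blocks smaller
      than 64 << n bytes. */
   static int list_for_size(size_t size)
   {
      for (int lno = 0; lno < NUM_LISTS-1; lno++)
         if (size < ((size_t)64 << lno))
            return lno;
      return NUM_LISTS-1;
   }

   static FreeBlock* find_block(size_t req_size)
   {
      for (int lno = list_for_size(req_size); lno < NUM_LISTS; lno++) {
         FreeBlock* b = freelist[lno];
         int visited = 0;
         if (b == NULL) continue;     /* empty list: try the next one */
         while (1) {
            visited++;
            if (visited >= SCAN_LIMIT && lno < NUM_LISTS-1) {
               /* Too many misses on this list.  Rotate its start
                  pointer by one block, so that later searches don't
                  endlessly revisit the same SCAN_LIMIT blocks, then
                  fall through to the next (larger) freelist. */
               freelist[lno] = freelist[lno]->next;
               break;
            }
            if (b->size >= req_size)
               return b;              /* big enough: success */
            b = b->next;
            if (b == freelist[lno])
               break;                 /* scanned the whole list */
         }
      }
      return NULL;   /* no freelist had a big-enough block */
   }

   int main(void)
   {
      /* Build a 3-element circular freelist of small blocks. */
      FreeBlock blocks[3] =
         { { 16, &blocks[1] }, { 24, &blocks[2] }, { 48, &blocks[0] } };
      freelist[0] = &blocks[0];
      FreeBlock* b = find_block(32);
      printf("found block of size %zu\n", b ? b->size : (size_t)0);
      return 0;
   }

Rotating the start pointer costs O(1) per abandoned search, so a full
traversal of a pathological list is amortised over many allocation
requests; that is the "eventually inspect the entire freelist, just
very slowly" behaviour described above.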

Also, improve stats gathering, and rename some existing stats fields
in struct Arena with a "stats__" prefix.



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11567 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/m_mallocfree.c b/coregrind/m_mallocfree.c
index dabc39a..4054c96 100644
--- a/coregrind/m_mallocfree.c
+++ b/coregrind/m_mallocfree.c
@@ -186,9 +186,14 @@
       SizeT        sblocks_used;
       Superblock*  sblocks_initial[SBLOCKS_SIZE_INITIAL];
       // Stats only.
-      SizeT        bytes_on_loan;
-      SizeT        bytes_mmaped;
-      SizeT        bytes_on_loan_max;
+      SizeT        stats__bytes_on_loan;
+      SizeT        stats__bytes_mmaped;
+      SizeT        stats__bytes_on_loan_max;
+      ULong        stats__tot_blocks; /* total # blocks alloc'd */
+      ULong        stats__tot_bytes; /* total # bytes alloc'd */
+      ULong        stats__nsearches; /* total # freelist checks */
+      // If profiling, at what point (in terms of
+      // stats__bytes_on_loan_max) should the next profile happen?
       SizeT        next_profile_at;
    }
    Arena;
@@ -477,13 +482,16 @@
    a->min_sblock_szB = min_sblock_szB;
    for (i = 0; i < N_MALLOC_LISTS; i++) a->freelist[i] = NULL;
 
-   a->sblocks           = & a->sblocks_initial[0];
-   a->sblocks_size      = SBLOCKS_SIZE_INITIAL;
-   a->sblocks_used      = 0;
-   a->bytes_on_loan     = 0;
-   a->bytes_mmaped      = 0;
-   a->bytes_on_loan_max = 0;
-   a->next_profile_at   = 25 * 1000 * 1000;
+   a->sblocks                  = & a->sblocks_initial[0];
+   a->sblocks_size             = SBLOCKS_SIZE_INITIAL;
+   a->sblocks_used             = 0;
+   a->stats__bytes_on_loan     = 0;
+   a->stats__bytes_mmaped      = 0;
+   a->stats__bytes_on_loan_max = 0;
+   a->stats__tot_blocks        = 0;
+   a->stats__tot_bytes         = 0;
+   a->stats__nsearches         = 0;
+   a->next_profile_at          = 25 * 1000 * 1000;
    vg_assert(sizeof(a->sblocks_initial) 
              == SBLOCKS_SIZE_INITIAL * sizeof(Superblock*));
 }
@@ -495,8 +503,14 @@
    for (i = 0; i < VG_N_ARENAS; i++) {
       Arena* a = arenaId_to_ArenaP(i);
       VG_(message)(Vg_DebugMsg,
-         "%8s: %8ld mmap'd, %8ld/%8ld max/curr\n",
-         a->name, a->bytes_mmaped, a->bytes_on_loan_max, a->bytes_on_loan 
+         "%8s: %8ld mmap'd,  %8ld/%8ld max/curr,  "
+                   "%10llu/%10llu totalloc-blocks/bytes,"
+                   "  %10llu searches\n",
+                   a->name, a->stats__bytes_mmaped,
+                   a->stats__bytes_on_loan_max,
+                   a->stats__bytes_on_loan,
+                   a->stats__tot_blocks, a->stats__tot_bytes,
+                   a->stats__nsearches
       );
    }
 }
@@ -695,7 +709,7 @@
    //zzVALGRIND_MAKE_MEM_UNDEFINED(sb, cszB);
    vg_assert(0 == (Addr)sb % VG_MIN_MALLOC_SZB);
    sb->n_payload_bytes = cszB - sizeof(Superblock);
-   a->bytes_mmaped += cszB;
+   a->stats__bytes_mmaped += cszB;
    VG_(debugLog)(1, "mallocfree",
                     "newSuperblock at %p (pszB %7ld) owner %s/%s\n", 
                     sb, sb->n_payload_bytes, 
@@ -993,7 +1007,7 @@
       }
    }
 
-   if (arena_bytes_on_loan != a->bytes_on_loan) {
+   if (arena_bytes_on_loan != a->stats__bytes_on_loan) {
 #     ifdef VERBOSE_MALLOC
       VG_(printf)( "sanity_check_malloc_arena: a->bytes_on_loan %ld, "
                    "arena_bytes_on_loan %ld: "
@@ -1051,7 +1065,7 @@
                    a->name,
                    superblockctr,
                    blockctr_sb, blockctr_sb_free, blockctr_li, 
-                   a->bytes_mmaped, a->bytes_on_loan);   
+                   a->stats__bytes_mmaped, a->stats__bytes_on_loan);   
 #  undef BOMB
 }
 
@@ -1092,7 +1106,8 @@
 
    VG_(printf)(
       "-------- Arena \"%s\": %ld mmap'd, %ld/%ld max/curr --------\n",
-      a->name, a->bytes_mmaped, a->bytes_on_loan_max, a->bytes_on_loan 
+      a->name, a->stats__bytes_mmaped,
+      a->stats__bytes_on_loan_max, a->stats__bytes_on_loan 
    );
 
    for (j = 0; j < a->sblocks_used; ++j) {
@@ -1269,6 +1284,7 @@
    Block*      b = NULL;
    Arena*      a;
    void*       v;
+   UWord       stats__nsearches = 0;
 
    ensure_mm_init(aid);
    a = arenaId_to_ArenaP(aid);
@@ -1301,9 +1317,25 @@
    // behaviour.
    //
    for (lno = pszB_to_listNo(req_pszB); lno < N_MALLOC_LISTS; lno++) {
+      UWord nsearches_this_level = 0;
       b = a->freelist[lno];
       if (NULL == b) continue;   // If this list is empty, try the next one.
       while (True) {
+         stats__nsearches++;
+         nsearches_this_level++;
+         if (UNLIKELY(nsearches_this_level >= 100) 
+             && lno < N_MALLOC_LISTS-1) {
+            /* Avoid excessive scanning on this freelist, and instead
+               try the next one up.  But first, move this freelist's
+               start pointer one element along, so as to ensure that
+               subsequent searches of this list don't endlessly
+               revisit only these 100 elements, but in fact slowly
+               progress through the entire list. */
+            b = a->freelist[lno];
+            vg_assert(b); // this list must be nonempty!
+            a->freelist[lno] = get_next_b(b); // step one along
+            break;
+         }
          b_bszB = get_bszB(b);
          if (b_bszB >= req_bszB) goto obtained_block;    // success!
          b = get_next_b(b);
@@ -1396,18 +1428,22 @@
    }
 
    // Update stats
-   a->bytes_on_loan += bszB_to_pszB(a, b_bszB);
-   if (a->bytes_on_loan > a->bytes_on_loan_max) {
-      a->bytes_on_loan_max = a->bytes_on_loan;
-      if (a->bytes_on_loan_max >= a->next_profile_at) {
+   SizeT loaned = bszB_to_pszB(a, b_bszB);
+   a->stats__bytes_on_loan += loaned;
+   if (a->stats__bytes_on_loan > a->stats__bytes_on_loan_max) {
+      a->stats__bytes_on_loan_max = a->stats__bytes_on_loan;
+      if (a->stats__bytes_on_loan_max >= a->next_profile_at) {
          /* next profile after 10% more growth */
          a->next_profile_at 
             = (SizeT)( 
-                 (((ULong)a->bytes_on_loan_max) * 110ULL) / 100ULL );
+                 (((ULong)a->stats__bytes_on_loan_max) * 110ULL) / 100ULL );
          if (VG_(clo_profile_heap))
             cc_analyse_alloc_arena(aid);
       }
    }
+   a->stats__tot_blocks += (ULong)1;
+   a->stats__tot_bytes  += (ULong)loaned;
+   a->stats__nsearches  += (ULong)stats__nsearches;
 
 #  ifdef DEBUG_MALLOC
    sanity_check_malloc_arena(aid);
@@ -1462,7 +1498,7 @@
    sb_start = &sb->payload_bytes[0];
    sb_end   = &sb->payload_bytes[sb->n_payload_bytes - 1];
 
-   a->bytes_on_loan -= b_pszB;
+   a->stats__bytes_on_loan -= b_pszB;
 
    /* If this is one of V's areas, fill it up with junk to enhance the
       chances of catching any later reads of it.  Note, 0xDD is
@@ -1610,9 +1646,9 @@
 
    /* Payload ptr for the block we are going to split.  Note this
       changes a->bytes_on_loan; we save and restore it ourselves. */
-   saved_bytes_on_loan = a->bytes_on_loan;
+   saved_bytes_on_loan = a->stats__bytes_on_loan;
    base_p = VG_(arena_malloc) ( aid, cc, base_pszB_req );
-   a->bytes_on_loan = saved_bytes_on_loan;
+   a->stats__bytes_on_loan = saved_bytes_on_loan;
 
    /* Give up if we couldn't allocate enough space */
    if (base_p == 0)
@@ -1655,9 +1691,13 @@
 
    vg_assert(req_pszB <= get_pszB(a, get_payload_block(a, align_p)));
 
-   a->bytes_on_loan += get_pszB(a, get_payload_block(a, align_p));
-   if (a->bytes_on_loan > a->bytes_on_loan_max)
-      a->bytes_on_loan_max = a->bytes_on_loan;
+   a->stats__bytes_on_loan += get_pszB(a, get_payload_block(a, align_p));
+   if (a->stats__bytes_on_loan > a->stats__bytes_on_loan_max) {
+      a->stats__bytes_on_loan_max = a->stats__bytes_on_loan;
+   }
+   /* a->stats__tot_blocks, a->stats__tot_bytes, a->stats__nsearches
+      are updated by the call to VG_(arena_malloc) just a few lines
+      above.  So we don't need to update them here. */
 
 #  ifdef DEBUG_MALLOC
    sanity_check_malloc_arena(aid);
@@ -1727,14 +1767,14 @@
 
    // We don't have fastbins so smblks & fsmblks are always 0. Also we don't
    // have a separate mmap allocator so set hblks & hblkhd to 0.
-   mi->arena    = a->bytes_mmaped;
+   mi->arena    = a->stats__bytes_mmaped;
    mi->ordblks  = free_blocks + VG_(free_queue_length);
    mi->smblks   = 0;
    mi->hblks    = 0;
    mi->hblkhd   = 0;
    mi->usmblks  = 0;
    mi->fsmblks  = 0;
-   mi->uordblks = a->bytes_on_loan - VG_(free_queue_volume);
+   mi->uordblks = a->stats__bytes_on_loan - VG_(free_queue_volume);
    mi->fordblks = free_blocks_size + VG_(free_queue_volume);
    mi->keepcost = 0; // may want some value in here
 }