Merge pull request #10613 from ctiller/framing_costs

Optimize framing a little
diff --git a/grpc.def b/grpc.def
index b602172..293f2d8 100644
--- a/grpc.def
+++ b/grpc.def
@@ -141,10 +141,12 @@
     grpc_server_credentials_set_auth_metadata_processor
     grpc_slice_ref
     grpc_slice_unref
+    grpc_slice_copy
     grpc_slice_new
     grpc_slice_new_with_user_data
     grpc_slice_new_with_len
     grpc_slice_malloc
+    grpc_slice_malloc_large
     grpc_slice_intern
     grpc_slice_from_copied_string
     grpc_slice_from_copied_buffer
@@ -153,6 +155,7 @@
     grpc_slice_sub
     grpc_slice_sub_no_ref
     grpc_slice_split_tail
+    grpc_slice_split_tail_maybe_ref
     grpc_slice_split_head
     grpc_empty_slice
     grpc_slice_default_hash_impl
@@ -181,6 +184,7 @@
     grpc_slice_buffer_move_into
     grpc_slice_buffer_trim_end
     grpc_slice_buffer_move_first
+    grpc_slice_buffer_move_first_no_ref
     grpc_slice_buffer_move_first_into_buffer
     grpc_slice_buffer_take_first
     grpc_slice_buffer_undo_take_first
diff --git a/include/grpc/slice.h b/include/grpc/slice.h
index 8c32a30..9c4b158 100644
--- a/include/grpc/slice.h
+++ b/include/grpc/slice.h
@@ -53,6 +53,9 @@
    where dest!=NULL , then (*dest)(start, len).  Requires s initialized.  */
 GPRAPI void grpc_slice_unref(grpc_slice s);
 
+/* Copy slice - create a new slice that contains the same data as s */
+GPRAPI grpc_slice grpc_slice_copy(grpc_slice s);
+
 /* Create a slice pointing at some data. Calls malloc to allocate a refcount
    for the object, and arranges that destroy will be called with the pointer
    passed in at destruction. */
@@ -75,6 +78,13 @@
    call.
    Aborts if malloc() fails. */
 GPRAPI grpc_slice grpc_slice_malloc(size_t length);
+GPRAPI grpc_slice grpc_slice_malloc_large(size_t length);
+
+#define GRPC_SLICE_MALLOC(len)                                    \
+  ((len) <= GRPC_SLICE_INLINED_SIZE                               \
+       ? (grpc_slice){.refcount = NULL,                           \
+                      .data.inlined = {.length = (uint8_t)(len)}} \
+       : grpc_slice_malloc_large((len)))
 
 /* Intern a slice:
 
@@ -117,6 +127,18 @@
    Requires s intialized, split <= s.length */
 GPRAPI grpc_slice grpc_slice_split_tail(grpc_slice *s, size_t split);
 
+typedef enum {
+  GRPC_SLICE_REF_TAIL = 1,
+  GRPC_SLICE_REF_HEAD = 2,
+  GRPC_SLICE_REF_BOTH = 1 + 2
+} grpc_slice_ref_whom;
+
+/* The same as grpc_slice_split_tail, but with an option to skip altering
+ * refcounts (grpc_slice_split_tail_maybe_ref(..., true) is equivalent to
+ * grpc_slice_split_tail(...)) */
+GPRAPI grpc_slice grpc_slice_split_tail_maybe_ref(grpc_slice *s, size_t split,
+                                                  grpc_slice_ref_whom ref_whom);
+
 /* Splits s into two: modifies s to be s[split:s.length], and returns a new
    slice, sharing a refcount with s, that contains s[0:split].
    Requires s intialized, split <= s.length */
diff --git a/include/grpc/slice_buffer.h b/include/grpc/slice_buffer.h
index 2ed8966..cdbd747 100644
--- a/include/grpc/slice_buffer.h
+++ b/include/grpc/slice_buffer.h
@@ -77,6 +77,10 @@
 /* move the first n bytes of src into dst */
 GPRAPI void grpc_slice_buffer_move_first(grpc_slice_buffer *src, size_t n,
                                          grpc_slice_buffer *dst);
+/* move the first n bytes of src into dst without adding references */
+GPRAPI void grpc_slice_buffer_move_first_no_ref(grpc_slice_buffer *src,
+                                                size_t n,
+                                                grpc_slice_buffer *dst);
 /* move the first n bytes of src into dst (copying them) */
 GPRAPI void grpc_slice_buffer_move_first_into_buffer(grpc_exec_ctx *exec_ctx,
                                                      grpc_slice_buffer *src,
diff --git a/src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c b/src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
index 10af252..87549b7 100644
--- a/src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
+++ b/src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
@@ -98,7 +98,7 @@
   pb_encode(&sizestream, grpc_lb_v1_LoadBalanceRequest_fields, request);
   encoded_length = sizestream.bytes_written;
 
-  slice = grpc_slice_malloc(encoded_length);
+  slice = GRPC_SLICE_MALLOC(encoded_length);
   outputstream =
       pb_ostream_from_buffer(GRPC_SLICE_START_PTR(slice), encoded_length);
   GPR_ASSERT(pb_encode(&outputstream, grpc_lb_v1_LoadBalanceRequest_fields,
diff --git a/src/core/ext/filters/http/client/http_client_filter.c b/src/core/ext/filters/http/client/http_client_filter.c
index bf5fbc7..d8ae080 100644
--- a/src/core/ext/filters/http/client/http_client_filter.c
+++ b/src/core/ext/filters/http/client/http_client_filter.c
@@ -323,7 +323,7 @@
         estimated_len += grpc_base64_estimate_encoded_size(
             op->payload->send_message.send_message->length, k_url_safe,
             k_multi_line);
-        grpc_slice path_with_query_slice = grpc_slice_malloc(estimated_len);
+        grpc_slice path_with_query_slice = GRPC_SLICE_MALLOC(estimated_len);
 
         /* memcopy individual pieces into this slice */
         uint8_t *write_ptr =
diff --git a/src/core/ext/transport/chttp2/transport/bin_decoder.c b/src/core/ext/transport/chttp2/transport/bin_decoder.c
index 8c87de1..21bed18 100644
--- a/src/core/ext/transport/chttp2/transport/bin_decoder.c
+++ b/src/core/ext/transport/chttp2/transport/bin_decoder.c
@@ -169,7 +169,7 @@
       }
     }
   }
-  output = grpc_slice_malloc(output_length);
+  output = GRPC_SLICE_MALLOC(output_length);
 
   ctx.input_cur = GRPC_SLICE_START_PTR(input);
   ctx.input_end = GRPC_SLICE_END_PTR(input);
@@ -193,7 +193,7 @@
                                                  grpc_slice input,
                                                  size_t output_length) {
   size_t input_length = GRPC_SLICE_LENGTH(input);
-  grpc_slice output = grpc_slice_malloc(output_length);
+  grpc_slice output = GRPC_SLICE_MALLOC(output_length);
   struct grpc_base64_decode_context ctx;
 
   // The length of a base64 string cannot be 4 * n + 1
diff --git a/src/core/ext/transport/chttp2/transport/bin_encoder.c b/src/core/ext/transport/chttp2/transport/bin_encoder.c
index e301c07..3b8ab1f 100644
--- a/src/core/ext/transport/chttp2/transport/bin_encoder.c
+++ b/src/core/ext/transport/chttp2/transport/bin_encoder.c
@@ -66,7 +66,7 @@
   size_t input_triplets = input_length / 3;
   size_t tail_case = input_length % 3;
   size_t output_length = input_triplets * 4 + tail_xtra[tail_case];
-  grpc_slice output = grpc_slice_malloc(output_length);
+  grpc_slice output = GRPC_SLICE_MALLOC(output_length);
   uint8_t *in = GRPC_SLICE_START_PTR(input);
   char *out = (char *)GRPC_SLICE_START_PTR(output);
   size_t i;
@@ -119,7 +119,7 @@
     nbits += grpc_chttp2_huffsyms[*in].length;
   }
 
-  output = grpc_slice_malloc(nbits / 8 + (nbits % 8 != 0));
+  output = GRPC_SLICE_MALLOC(nbits / 8 + (nbits % 8 != 0));
   out = GRPC_SLICE_START_PTR(output);
   for (in = GRPC_SLICE_START_PTR(input); in != GRPC_SLICE_END_PTR(input);
        ++in) {
@@ -184,7 +184,7 @@
   size_t output_syms = input_triplets * 4 + tail_xtra[tail_case];
   size_t max_output_bits = 11 * output_syms;
   size_t max_output_length = max_output_bits / 8 + (max_output_bits % 8 != 0);
-  grpc_slice output = grpc_slice_malloc(max_output_length);
+  grpc_slice output = GRPC_SLICE_MALLOC(max_output_length);
   uint8_t *in = GRPC_SLICE_START_PTR(input);
   uint8_t *start_out = GRPC_SLICE_START_PTR(output);
   huff_out out;
diff --git a/src/core/ext/transport/chttp2/transport/chttp2_transport.c b/src/core/ext/transport/chttp2/transport/chttp2_transport.c
index 5d74532..d6b79bd 100644
--- a/src/core/ext/transport/chttp2/transport/chttp2_transport.c
+++ b/src/core/ext/transport/chttp2/transport/chttp2_transport.c
@@ -1969,7 +1969,7 @@
      the time we got around to sending this, so instead we ignore HPACK
      compression and just write the uncompressed bytes onto the wire. */
   if (!s->sent_initial_metadata) {
-    http_status_hdr = grpc_slice_malloc(13);
+    http_status_hdr = GRPC_SLICE_MALLOC(13);
     p = GRPC_SLICE_START_PTR(http_status_hdr);
     *p++ = 0x00;
     *p++ = 7;
@@ -1987,7 +1987,7 @@
     GPR_ASSERT(p == GRPC_SLICE_END_PTR(http_status_hdr));
     len += (uint32_t)GRPC_SLICE_LENGTH(http_status_hdr);
 
-    content_type_hdr = grpc_slice_malloc(31);
+    content_type_hdr = GRPC_SLICE_MALLOC(31);
     p = GRPC_SLICE_START_PTR(content_type_hdr);
     *p++ = 0x00;
     *p++ = 12;
@@ -2024,7 +2024,7 @@
     len += (uint32_t)GRPC_SLICE_LENGTH(content_type_hdr);
   }
 
-  status_hdr = grpc_slice_malloc(15 + (grpc_status >= 10));
+  status_hdr = GRPC_SLICE_MALLOC(15 + (grpc_status >= 10));
   p = GRPC_SLICE_START_PTR(status_hdr);
   *p++ = 0x00; /* literal header, not indexed */
   *p++ = 11;   /* len(grpc-status) */
@@ -2053,7 +2053,7 @@
   size_t msg_len = GRPC_SLICE_LENGTH(slice);
   GPR_ASSERT(msg_len <= UINT32_MAX);
   uint32_t msg_len_len = GRPC_CHTTP2_VARINT_LENGTH((uint32_t)msg_len, 1);
-  message_pfx = grpc_slice_malloc(14 + msg_len_len);
+  message_pfx = GRPC_SLICE_MALLOC(14 + msg_len_len);
   p = GRPC_SLICE_START_PTR(message_pfx);
   *p++ = 0x00; /* literal header, not indexed */
   *p++ = 12;   /* len(grpc-message) */
@@ -2075,7 +2075,7 @@
   len += (uint32_t)GRPC_SLICE_LENGTH(message_pfx);
   len += (uint32_t)msg_len;
 
-  hdr = grpc_slice_malloc(9);
+  hdr = GRPC_SLICE_MALLOC(9);
   p = GRPC_SLICE_START_PTR(hdr);
   *p++ = (uint8_t)(len >> 16);
   *p++ = (uint8_t)(len >> 8);
diff --git a/src/core/ext/transport/chttp2/transport/frame_data.c b/src/core/ext/transport/chttp2/transport/frame_data.c
index 9aba3ea..8cb8489 100644
--- a/src/core/ext/transport/chttp2/transport/frame_data.c
+++ b/src/core/ext/transport/chttp2/transport/frame_data.c
@@ -92,7 +92,7 @@
   uint8_t *p;
   static const size_t header_size = 9;
 
-  hdr = grpc_slice_malloc(header_size);
+  hdr = GRPC_SLICE_MALLOC(header_size);
   p = GRPC_SLICE_START_PTR(hdr);
   GPR_ASSERT(write_bytes < (1 << 24));
   *p++ = (uint8_t)(write_bytes >> 16);
@@ -106,7 +106,7 @@
   *p++ = (uint8_t)(id);
   grpc_slice_buffer_add(outbuf, hdr);
 
-  grpc_slice_buffer_move_first(inbuf, write_bytes, outbuf);
+  grpc_slice_buffer_move_first_no_ref(inbuf, write_bytes, outbuf);
 
   stats->framing_bytes += header_size;
   stats->data_bytes += write_bytes;
diff --git a/src/core/ext/transport/chttp2/transport/frame_goaway.c b/src/core/ext/transport/chttp2/transport/frame_goaway.c
index 001271d..0f1c8b0 100644
--- a/src/core/ext/transport/chttp2/transport/frame_goaway.c
+++ b/src/core/ext/transport/chttp2/transport/frame_goaway.c
@@ -163,7 +163,7 @@
 void grpc_chttp2_goaway_append(uint32_t last_stream_id, uint32_t error_code,
                                grpc_slice debug_data,
                                grpc_slice_buffer *slice_buffer) {
-  grpc_slice header = grpc_slice_malloc(9 + 4 + 4);
+  grpc_slice header = GRPC_SLICE_MALLOC(9 + 4 + 4);
   uint8_t *p = GRPC_SLICE_START_PTR(header);
   uint32_t frame_length;
   GPR_ASSERT(GRPC_SLICE_LENGTH(debug_data) < UINT32_MAX - 4 - 4);
diff --git a/src/core/ext/transport/chttp2/transport/frame_ping.c b/src/core/ext/transport/chttp2/transport/frame_ping.c
index 6016e43..f09ca60 100644
--- a/src/core/ext/transport/chttp2/transport/frame_ping.c
+++ b/src/core/ext/transport/chttp2/transport/frame_ping.c
@@ -43,7 +43,7 @@
 static bool g_disable_ping_ack = false;
 
 grpc_slice grpc_chttp2_ping_create(uint8_t ack, uint64_t opaque_8bytes) {
-  grpc_slice slice = grpc_slice_malloc(9 + 8);
+  grpc_slice slice = GRPC_SLICE_MALLOC(9 + 8);
   uint8_t *p = GRPC_SLICE_START_PTR(slice);
 
   *p++ = 0;
diff --git a/src/core/ext/transport/chttp2/transport/frame_rst_stream.c b/src/core/ext/transport/chttp2/transport/frame_rst_stream.c
index 225f15c..e0caa50 100644
--- a/src/core/ext/transport/chttp2/transport/frame_rst_stream.c
+++ b/src/core/ext/transport/chttp2/transport/frame_rst_stream.c
@@ -44,7 +44,7 @@
 grpc_slice grpc_chttp2_rst_stream_create(uint32_t id, uint32_t code,
                                          grpc_transport_one_way_stats *stats) {
   static const size_t frame_size = 13;
-  grpc_slice slice = grpc_slice_malloc(frame_size);
+  grpc_slice slice = GRPC_SLICE_MALLOC(frame_size);
   stats->framing_bytes += frame_size;
   uint8_t *p = GRPC_SLICE_START_PTR(slice);
 
diff --git a/src/core/ext/transport/chttp2/transport/frame_settings.c b/src/core/ext/transport/chttp2/transport/frame_settings.c
index 4f2b827..e3cd70d 100644
--- a/src/core/ext/transport/chttp2/transport/frame_settings.c
+++ b/src/core/ext/transport/chttp2/transport/frame_settings.c
@@ -70,7 +70,7 @@
     n += (new[i] != old[i] || (force_mask & (1u << i)) != 0);
   }
 
-  output = grpc_slice_malloc(9 + 6 * n);
+  output = GRPC_SLICE_MALLOC(9 + 6 * n);
   p = fill_header(GRPC_SLICE_START_PTR(output), 6 * n, 0);
 
   for (i = 0; i < count; i++) {
@@ -91,7 +91,7 @@
 }
 
 grpc_slice grpc_chttp2_settings_ack_create(void) {
-  grpc_slice output = grpc_slice_malloc(9);
+  grpc_slice output = GRPC_SLICE_MALLOC(9);
   fill_header(GRPC_SLICE_START_PTR(output), 0, GRPC_CHTTP2_FLAG_ACK);
   return output;
 }
diff --git a/src/core/ext/transport/chttp2/transport/frame_window_update.c b/src/core/ext/transport/chttp2/transport/frame_window_update.c
index b76b6f6..8ed72dd 100644
--- a/src/core/ext/transport/chttp2/transport/frame_window_update.c
+++ b/src/core/ext/transport/chttp2/transport/frame_window_update.c
@@ -41,7 +41,7 @@
 grpc_slice grpc_chttp2_window_update_create(
     uint32_t id, uint32_t window_update, grpc_transport_one_way_stats *stats) {
   static const size_t frame_size = 13;
-  grpc_slice slice = grpc_slice_malloc(frame_size);
+  grpc_slice slice = GRPC_SLICE_MALLOC(frame_size);
   stats->header_bytes += frame_size;
   uint8_t *p = GRPC_SLICE_START_PTR(slice);
 
diff --git a/src/core/ext/transport/chttp2/transport/hpack_encoder.c b/src/core/ext/transport/chttp2/transport/hpack_encoder.c
index b1bc677..8fdd4ee 100644
--- a/src/core/ext/transport/chttp2/transport/hpack_encoder.c
+++ b/src/core/ext/transport/chttp2/transport/hpack_encoder.c
@@ -123,7 +123,7 @@
    output before beginning */
 static void begin_frame(framer_state *st) {
   st->header_idx =
-      grpc_slice_buffer_add_indexed(st->output, grpc_slice_malloc(9));
+      grpc_slice_buffer_add_indexed(st->output, GRPC_SLICE_MALLOC(9));
   st->output_length_at_start_of_frame = st->output->length;
 }
 
diff --git a/src/core/ext/transport/cronet/transport/cronet_transport.c b/src/core/ext/transport/cronet/transport/cronet_transport.c
index 7896c70..d4e89d6 100644
--- a/src/core/ext/transport/cronet/transport/cronet_transport.c
+++ b/src/core/ext/transport/cronet/transport/cronet_transport.c
@@ -1177,7 +1177,7 @@
     } else if (stream_state->rs.remaining_bytes == 0) {
       CRONET_LOG(GPR_DEBUG, "read operation complete");
       grpc_slice read_data_slice =
-          grpc_slice_malloc((uint32_t)stream_state->rs.length_field);
+          GRPC_SLICE_MALLOC((uint32_t)stream_state->rs.length_field);
       uint8_t *dst_p = GRPC_SLICE_START_PTR(read_data_slice);
       memcpy(dst_p, stream_state->rs.read_buffer,
              (size_t)stream_state->rs.length_field);
diff --git a/src/core/lib/compression/message_compress.c b/src/core/lib/compression/message_compress.c
index 49beb95..fd3d1e6 100644
--- a/src/core/lib/compression/message_compress.c
+++ b/src/core/lib/compression/message_compress.c
@@ -50,7 +50,7 @@
   int r;
   int flush;
   size_t i;
-  grpc_slice outbuf = grpc_slice_malloc(OUTPUT_BLOCK_SIZE);
+  grpc_slice outbuf = GRPC_SLICE_MALLOC(OUTPUT_BLOCK_SIZE);
   const uInt uint_max = ~(uInt)0;
 
   GPR_ASSERT(GRPC_SLICE_LENGTH(outbuf) <= uint_max);
@@ -65,7 +65,7 @@
     do {
       if (zs->avail_out == 0) {
         grpc_slice_buffer_add_indexed(output, outbuf);
-        outbuf = grpc_slice_malloc(OUTPUT_BLOCK_SIZE);
+        outbuf = GRPC_SLICE_MALLOC(OUTPUT_BLOCK_SIZE);
         GPR_ASSERT(GRPC_SLICE_LENGTH(outbuf) <= uint_max);
         zs->avail_out = (uInt)GRPC_SLICE_LENGTH(outbuf);
         zs->next_out = GRPC_SLICE_START_PTR(outbuf);
diff --git a/src/core/lib/iomgr/tcp_windows.c b/src/core/lib/iomgr/tcp_windows.c
index f74aa68..bdd4dd0 100644
--- a/src/core/lib/iomgr/tcp_windows.c
+++ b/src/core/lib/iomgr/tcp_windows.c
@@ -219,7 +219,7 @@
   tcp->read_slices = read_slices;
   grpc_slice_buffer_reset_and_unref_internal(exec_ctx, read_slices);
 
-  tcp->read_slice = grpc_slice_malloc(8192);
+  tcp->read_slice = GRPC_SLICE_MALLOC(8192);
 
   buffer.len = (ULONG)GRPC_SLICE_LENGTH(
       tcp->read_slice);  // we know slice size fits in 32bit.
diff --git a/src/core/lib/security/transport/secure_endpoint.c b/src/core/lib/security/transport/secure_endpoint.c
index 24da949..0d5c743 100644
--- a/src/core/lib/security/transport/secure_endpoint.c
+++ b/src/core/lib/security/transport/secure_endpoint.c
@@ -130,7 +130,7 @@
 static void flush_read_staging_buffer(secure_endpoint *ep, uint8_t **cur,
                                       uint8_t **end) {
   grpc_slice_buffer_add(ep->read_buffer, ep->read_staging_buffer);
-  ep->read_staging_buffer = grpc_slice_malloc(STAGING_BUFFER_SIZE);
+  ep->read_staging_buffer = GRPC_SLICE_MALLOC(STAGING_BUFFER_SIZE);
   *cur = GRPC_SLICE_START_PTR(ep->read_staging_buffer);
   *end = GRPC_SLICE_END_PTR(ep->read_staging_buffer);
 }
@@ -252,7 +252,7 @@
 static void flush_write_staging_buffer(secure_endpoint *ep, uint8_t **cur,
                                        uint8_t **end) {
   grpc_slice_buffer_add(&ep->output_buffer, ep->write_staging_buffer);
-  ep->write_staging_buffer = grpc_slice_malloc(STAGING_BUFFER_SIZE);
+  ep->write_staging_buffer = GRPC_SLICE_MALLOC(STAGING_BUFFER_SIZE);
   *cur = GRPC_SLICE_START_PTR(ep->write_staging_buffer);
   *end = GRPC_SLICE_END_PTR(ep->write_staging_buffer);
 }
@@ -415,8 +415,8 @@
     grpc_slice_buffer_add(&ep->leftover_bytes,
                           grpc_slice_ref_internal(leftover_slices[i]));
   }
-  ep->write_staging_buffer = grpc_slice_malloc(STAGING_BUFFER_SIZE);
-  ep->read_staging_buffer = grpc_slice_malloc(STAGING_BUFFER_SIZE);
+  ep->write_staging_buffer = GRPC_SLICE_MALLOC(STAGING_BUFFER_SIZE);
+  ep->read_staging_buffer = GRPC_SLICE_MALLOC(STAGING_BUFFER_SIZE);
   grpc_slice_buffer_init(&ep->output_buffer);
   grpc_slice_buffer_init(&ep->source_buffer);
   ep->read_buffer = NULL;
diff --git a/src/core/lib/slice/b64.c b/src/core/lib/slice/b64.c
index 2007cc4..d909164 100644
--- a/src/core/lib/slice/b64.c
+++ b/src/core/lib/slice/b64.c
@@ -202,7 +202,7 @@
 
 grpc_slice grpc_base64_decode_with_len(grpc_exec_ctx *exec_ctx, const char *b64,
                                        size_t b64_len, int url_safe) {
-  grpc_slice result = grpc_slice_malloc(b64_len);
+  grpc_slice result = GRPC_SLICE_MALLOC(b64_len);
   unsigned char *current = GRPC_SLICE_START_PTR(result);
   size_t result_size = 0;
   unsigned char codes[4];
diff --git a/src/core/lib/slice/percent_encoding.c b/src/core/lib/slice/percent_encoding.c
index c76c58d..a77c697 100644
--- a/src/core/lib/slice/percent_encoding.c
+++ b/src/core/lib/slice/percent_encoding.c
@@ -71,7 +71,7 @@
     return grpc_slice_ref_internal(slice);
   }
   // second pass: actually encode
-  grpc_slice out = grpc_slice_malloc(output_length);
+  grpc_slice out = GRPC_SLICE_MALLOC(output_length);
   uint8_t *q = GRPC_SLICE_START_PTR(out);
   for (p = slice_start; p < slice_end; p++) {
     if (is_unreserved_character(*p, unreserved_bytes)) {
@@ -125,7 +125,7 @@
     return true;
   }
   p = GRPC_SLICE_START_PTR(slice_in);
-  *slice_out = grpc_slice_malloc(out_length);
+  *slice_out = GRPC_SLICE_MALLOC(out_length);
   uint8_t *q = GRPC_SLICE_START_PTR(*slice_out);
   while (p != in_end) {
     if (*p == '%') {
@@ -163,7 +163,7 @@
     return grpc_slice_ref_internal(slice_in);
   }
   p = GRPC_SLICE_START_PTR(slice_in);
-  grpc_slice out = grpc_slice_malloc(out_length);
+  grpc_slice out = GRPC_SLICE_MALLOC(out_length);
   uint8_t *q = GRPC_SLICE_START_PTR(out);
   while (p != in_end) {
     if (*p == '%') {
diff --git a/src/core/lib/slice/slice.c b/src/core/lib/slice/slice.c
index a2fcbbd..b90738f 100644
--- a/src/core/lib/slice/slice.c
+++ b/src/core/lib/slice/slice.c
@@ -55,6 +55,13 @@
   return out;
 }
 
+grpc_slice grpc_slice_copy(grpc_slice s) {
+  grpc_slice out = GRPC_SLICE_MALLOC(GRPC_SLICE_LENGTH(s));
+  memcpy(GRPC_SLICE_START_PTR(out), GRPC_SLICE_START_PTR(s),
+         GRPC_SLICE_LENGTH(s));
+  return out;
+}
+
 grpc_slice grpc_slice_ref_internal(grpc_slice slice) {
   if (slice.refcount) {
     slice.refcount->vtable->ref(slice.refcount);
@@ -198,7 +205,7 @@
 
 grpc_slice grpc_slice_from_copied_buffer(const char *source, size_t length) {
   if (length == 0) return grpc_empty_slice();
-  grpc_slice slice = grpc_slice_malloc(length);
+  grpc_slice slice = GRPC_SLICE_MALLOC(length);
   memcpy(GRPC_SLICE_START_PTR(slice), source, length);
   return slice;
 }
@@ -228,35 +235,42 @@
     malloc_ref, malloc_unref, grpc_slice_default_eq_impl,
     grpc_slice_default_hash_impl};
 
+grpc_slice grpc_slice_malloc_large(size_t length) {
+  grpc_slice slice;
+
+  /* Memory layout used by the slice created here:
+
+     +-----------+----------------------------------------------------------+
+     | refcount  | bytes                                                    |
+     +-----------+----------------------------------------------------------+
+
+     refcount is a malloc_refcount
+     bytes is an array of bytes of the requested length
+     Both parts are placed in the same allocation returned from gpr_malloc */
+  malloc_refcount *rc = gpr_malloc(sizeof(malloc_refcount) + length);
+
+  /* Initial refcount on rc is 1 - and it's up to the caller to release
+     this reference. */
+  gpr_ref_init(&rc->refs, 1);
+
+  rc->base.vtable = &malloc_vtable;
+  rc->base.sub_refcount = &rc->base;
+
+  /* Build up the slice to be returned. */
+  /* The slices refcount points back to the allocated block. */
+  slice.refcount = &rc->base;
+  /* The data bytes are placed immediately after the refcount struct */
+  slice.data.refcounted.bytes = (uint8_t *)(rc + 1);
+  /* And the length of the block is set to the requested length */
+  slice.data.refcounted.length = length;
+  return slice;
+}
+
 grpc_slice grpc_slice_malloc(size_t length) {
   grpc_slice slice;
 
   if (length > sizeof(slice.data.inlined.bytes)) {
-    /* Memory layout used by the slice created here:
-
-       +-----------+----------------------------------------------------------+
-       | refcount  | bytes                                                    |
-       +-----------+----------------------------------------------------------+
-
-       refcount is a malloc_refcount
-       bytes is an array of bytes of the requested length
-       Both parts are placed in the same allocation returned from gpr_malloc */
-    malloc_refcount *rc = gpr_malloc(sizeof(malloc_refcount) + length);
-
-    /* Initial refcount on rc is 1 - and it's up to the caller to release
-       this reference. */
-    gpr_ref_init(&rc->refs, 1);
-
-    rc->base.vtable = &malloc_vtable;
-    rc->base.sub_refcount = &rc->base;
-
-    /* Build up the slice to be returned. */
-    /* The slices refcount points back to the allocated block. */
-    slice.refcount = &rc->base;
-    /* The data bytes are placed immediately after the refcount struct */
-    slice.data.refcounted.bytes = (uint8_t *)(rc + 1);
-    /* And the length of the block is set to the requested length */
-    slice.data.refcounted.length = length;
+    return grpc_slice_malloc_large(length);
   } else {
     /* small slice: just inline the data */
     slice.refcount = NULL;
@@ -306,7 +320,8 @@
   return subset;
 }
 
-grpc_slice grpc_slice_split_tail(grpc_slice *source, size_t split) {
+grpc_slice grpc_slice_split_tail_maybe_ref(grpc_slice *source, size_t split,
+                                           grpc_slice_ref_whom ref_whom) {
   grpc_slice tail;
 
   if (source->refcount == NULL) {
@@ -320,28 +335,46 @@
   } else {
     size_t tail_length = source->data.refcounted.length - split;
     GPR_ASSERT(source->data.refcounted.length >= split);
-    if (tail_length < sizeof(tail.data.inlined.bytes)) {
+    if (tail_length < sizeof(tail.data.inlined.bytes) &&
+        ref_whom != GRPC_SLICE_REF_TAIL) {
       /* Copy out the bytes - it'll be cheaper than refcounting */
       tail.refcount = NULL;
       tail.data.inlined.length = (uint8_t)tail_length;
       memcpy(tail.data.inlined.bytes, source->data.refcounted.bytes + split,
              tail_length);
+      source->refcount = source->refcount->sub_refcount;
     } else {
       /* Build the result */
-      tail.refcount = source->refcount->sub_refcount;
-      /* Bump the refcount */
-      tail.refcount->vtable->ref(tail.refcount);
+      switch (ref_whom) {
+        case GRPC_SLICE_REF_TAIL:
+          tail.refcount = source->refcount->sub_refcount;
+          source->refcount = &noop_refcount;
+          break;
+        case GRPC_SLICE_REF_HEAD:
+          tail.refcount = &noop_refcount;
+          source->refcount = source->refcount->sub_refcount;
+          break;
+        case GRPC_SLICE_REF_BOTH:
+          tail.refcount = source->refcount->sub_refcount;
+          source->refcount = source->refcount->sub_refcount;
+          /* Bump the refcount */
+          tail.refcount->vtable->ref(tail.refcount);
+          break;
+      }
       /* Point into the source array */
       tail.data.refcounted.bytes = source->data.refcounted.bytes + split;
       tail.data.refcounted.length = tail_length;
     }
-    source->refcount = source->refcount->sub_refcount;
     source->data.refcounted.length = split;
   }
 
   return tail;
 }
 
+grpc_slice grpc_slice_split_tail(grpc_slice *source, size_t split) {
+  return grpc_slice_split_tail_maybe_ref(source, split, GRPC_SLICE_REF_BOTH);
+}
+
 grpc_slice grpc_slice_split_head(grpc_slice *source, size_t split) {
   grpc_slice head;
 
@@ -459,7 +492,7 @@
 }
 
 grpc_slice grpc_slice_dup(grpc_slice a) {
-  grpc_slice copy = grpc_slice_malloc(GRPC_SLICE_LENGTH(a));
+  grpc_slice copy = GRPC_SLICE_MALLOC(GRPC_SLICE_LENGTH(a));
   memcpy(GRPC_SLICE_START_PTR(copy), GRPC_SLICE_START_PTR(a),
          GRPC_SLICE_LENGTH(a));
   return copy;
diff --git a/src/core/lib/slice/slice_buffer.c b/src/core/lib/slice/slice_buffer.c
index c96b9c3..e8d41ca 100644
--- a/src/core/lib/slice/slice_buffer.c
+++ b/src/core/lib/slice/slice_buffer.c
@@ -253,16 +253,18 @@
   src->length = 0;
 }
 
-void grpc_slice_buffer_move_first(grpc_slice_buffer *src, size_t n,
-                                  grpc_slice_buffer *dst) {
-  size_t output_len = dst->length + n;
-  size_t new_input_len = src->length - n;
+static void slice_buffer_move_first_maybe_ref(grpc_slice_buffer *src, size_t n,
+                                              grpc_slice_buffer *dst,
+                                              bool incref) {
   GPR_ASSERT(src->length >= n);
   if (src->length == n) {
     grpc_slice_buffer_move_into(src, dst);
     return;
   }
 
+  size_t output_len = dst->length + n;
+  size_t new_input_len = src->length - n;
+
   while (src->count > 0) {
     grpc_slice slice = grpc_slice_buffer_take_first(src);
     size_t slice_len = GRPC_SLICE_LENGTH(slice);
@@ -272,11 +274,18 @@
     } else if (n == slice_len) {
       grpc_slice_buffer_add(dst, slice);
       break;
-    } else { /* n < slice_len */
-      grpc_slice_buffer_undo_take_first(src, grpc_slice_split_tail(&slice, n));
+    } else if (incref) { /* n < slice_len */
+      grpc_slice_buffer_undo_take_first(
+          src, grpc_slice_split_tail_maybe_ref(&slice, n, GRPC_SLICE_REF_BOTH));
       GPR_ASSERT(GRPC_SLICE_LENGTH(slice) == n);
       grpc_slice_buffer_add(dst, slice);
       break;
+    } else { /* n < slice_len */
+      grpc_slice_buffer_undo_take_first(
+          src, grpc_slice_split_tail_maybe_ref(&slice, n, GRPC_SLICE_REF_TAIL));
+      GPR_ASSERT(GRPC_SLICE_LENGTH(slice) == n);
+      grpc_slice_buffer_add_indexed(dst, slice);
+      break;
     }
   }
   GPR_ASSERT(dst->length == output_len);
@@ -284,6 +293,16 @@
   GPR_ASSERT(src->count > 0);
 }
 
+void grpc_slice_buffer_move_first(grpc_slice_buffer *src, size_t n,
+                                  grpc_slice_buffer *dst) {
+  slice_buffer_move_first_maybe_ref(src, n, dst, true);
+}
+
+void grpc_slice_buffer_move_first_no_ref(grpc_slice_buffer *src, size_t n,
+                                         grpc_slice_buffer *dst) {
+  slice_buffer_move_first_maybe_ref(src, n, dst, false);
+}
+
 void grpc_slice_buffer_move_first_into_buffer(grpc_exec_ctx *exec_ctx,
                                               grpc_slice_buffer *src, size_t n,
                                               void *dst) {
diff --git a/src/core/lib/surface/byte_buffer_reader.c b/src/core/lib/surface/byte_buffer_reader.c
index 1a6ccda..539b014 100644
--- a/src/core/lib/surface/byte_buffer_reader.c
+++ b/src/core/lib/surface/byte_buffer_reader.c
@@ -124,7 +124,7 @@
   grpc_slice in_slice;
   size_t bytes_read = 0;
   const size_t input_size = grpc_byte_buffer_length(reader->buffer_out);
-  grpc_slice out_slice = grpc_slice_malloc(input_size);
+  grpc_slice out_slice = GRPC_SLICE_MALLOC(input_size);
   uint8_t *const outbuf = GRPC_SLICE_START_PTR(out_slice); /* just an alias */
 
   grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
diff --git a/src/ruby/ext/grpc/rb_grpc_imports.generated.c b/src/ruby/ext/grpc/rb_grpc_imports.generated.c
index a412277..221a1e1 100644
--- a/src/ruby/ext/grpc/rb_grpc_imports.generated.c
+++ b/src/ruby/ext/grpc/rb_grpc_imports.generated.c
@@ -179,10 +179,12 @@
 grpc_server_credentials_set_auth_metadata_processor_type grpc_server_credentials_set_auth_metadata_processor_import;
 grpc_slice_ref_type grpc_slice_ref_import;
 grpc_slice_unref_type grpc_slice_unref_import;
+grpc_slice_copy_type grpc_slice_copy_import;
 grpc_slice_new_type grpc_slice_new_import;
 grpc_slice_new_with_user_data_type grpc_slice_new_with_user_data_import;
 grpc_slice_new_with_len_type grpc_slice_new_with_len_import;
 grpc_slice_malloc_type grpc_slice_malloc_import;
+grpc_slice_malloc_large_type grpc_slice_malloc_large_import;
 grpc_slice_intern_type grpc_slice_intern_import;
 grpc_slice_from_copied_string_type grpc_slice_from_copied_string_import;
 grpc_slice_from_copied_buffer_type grpc_slice_from_copied_buffer_import;
@@ -191,6 +193,7 @@
 grpc_slice_sub_type grpc_slice_sub_import;
 grpc_slice_sub_no_ref_type grpc_slice_sub_no_ref_import;
 grpc_slice_split_tail_type grpc_slice_split_tail_import;
+grpc_slice_split_tail_maybe_ref_type grpc_slice_split_tail_maybe_ref_import;
 grpc_slice_split_head_type grpc_slice_split_head_import;
 grpc_empty_slice_type grpc_empty_slice_import;
 grpc_slice_default_hash_impl_type grpc_slice_default_hash_impl_import;
@@ -219,6 +222,7 @@
 grpc_slice_buffer_move_into_type grpc_slice_buffer_move_into_import;
 grpc_slice_buffer_trim_end_type grpc_slice_buffer_trim_end_import;
 grpc_slice_buffer_move_first_type grpc_slice_buffer_move_first_import;
+grpc_slice_buffer_move_first_no_ref_type grpc_slice_buffer_move_first_no_ref_import;
 grpc_slice_buffer_move_first_into_buffer_type grpc_slice_buffer_move_first_into_buffer_import;
 grpc_slice_buffer_take_first_type grpc_slice_buffer_take_first_import;
 grpc_slice_buffer_undo_take_first_type grpc_slice_buffer_undo_take_first_import;
@@ -476,10 +480,12 @@
   grpc_server_credentials_set_auth_metadata_processor_import = (grpc_server_credentials_set_auth_metadata_processor_type) GetProcAddress(library, "grpc_server_credentials_set_auth_metadata_processor");
   grpc_slice_ref_import = (grpc_slice_ref_type) GetProcAddress(library, "grpc_slice_ref");
   grpc_slice_unref_import = (grpc_slice_unref_type) GetProcAddress(library, "grpc_slice_unref");
+  grpc_slice_copy_import = (grpc_slice_copy_type) GetProcAddress(library, "grpc_slice_copy");
   grpc_slice_new_import = (grpc_slice_new_type) GetProcAddress(library, "grpc_slice_new");
   grpc_slice_new_with_user_data_import = (grpc_slice_new_with_user_data_type) GetProcAddress(library, "grpc_slice_new_with_user_data");
   grpc_slice_new_with_len_import = (grpc_slice_new_with_len_type) GetProcAddress(library, "grpc_slice_new_with_len");
   grpc_slice_malloc_import = (grpc_slice_malloc_type) GetProcAddress(library, "grpc_slice_malloc");
+  grpc_slice_malloc_large_import = (grpc_slice_malloc_large_type) GetProcAddress(library, "grpc_slice_malloc_large");
   grpc_slice_intern_import = (grpc_slice_intern_type) GetProcAddress(library, "grpc_slice_intern");
   grpc_slice_from_copied_string_import = (grpc_slice_from_copied_string_type) GetProcAddress(library, "grpc_slice_from_copied_string");
   grpc_slice_from_copied_buffer_import = (grpc_slice_from_copied_buffer_type) GetProcAddress(library, "grpc_slice_from_copied_buffer");
@@ -488,6 +494,7 @@
   grpc_slice_sub_import = (grpc_slice_sub_type) GetProcAddress(library, "grpc_slice_sub");
   grpc_slice_sub_no_ref_import = (grpc_slice_sub_no_ref_type) GetProcAddress(library, "grpc_slice_sub_no_ref");
   grpc_slice_split_tail_import = (grpc_slice_split_tail_type) GetProcAddress(library, "grpc_slice_split_tail");
+  grpc_slice_split_tail_maybe_ref_import = (grpc_slice_split_tail_maybe_ref_type) GetProcAddress(library, "grpc_slice_split_tail_maybe_ref");
   grpc_slice_split_head_import = (grpc_slice_split_head_type) GetProcAddress(library, "grpc_slice_split_head");
   grpc_empty_slice_import = (grpc_empty_slice_type) GetProcAddress(library, "grpc_empty_slice");
   grpc_slice_default_hash_impl_import = (grpc_slice_default_hash_impl_type) GetProcAddress(library, "grpc_slice_default_hash_impl");
@@ -516,6 +523,7 @@
   grpc_slice_buffer_move_into_import = (grpc_slice_buffer_move_into_type) GetProcAddress(library, "grpc_slice_buffer_move_into");
   grpc_slice_buffer_trim_end_import = (grpc_slice_buffer_trim_end_type) GetProcAddress(library, "grpc_slice_buffer_trim_end");
   grpc_slice_buffer_move_first_import = (grpc_slice_buffer_move_first_type) GetProcAddress(library, "grpc_slice_buffer_move_first");
+  grpc_slice_buffer_move_first_no_ref_import = (grpc_slice_buffer_move_first_no_ref_type) GetProcAddress(library, "grpc_slice_buffer_move_first_no_ref");
   grpc_slice_buffer_move_first_into_buffer_import = (grpc_slice_buffer_move_first_into_buffer_type) GetProcAddress(library, "grpc_slice_buffer_move_first_into_buffer");
   grpc_slice_buffer_take_first_import = (grpc_slice_buffer_take_first_type) GetProcAddress(library, "grpc_slice_buffer_take_first");
   grpc_slice_buffer_undo_take_first_import = (grpc_slice_buffer_undo_take_first_type) GetProcAddress(library, "grpc_slice_buffer_undo_take_first");
diff --git a/src/ruby/ext/grpc/rb_grpc_imports.generated.h b/src/ruby/ext/grpc/rb_grpc_imports.generated.h
index 7df8dd6..f62b31e 100644
--- a/src/ruby/ext/grpc/rb_grpc_imports.generated.h
+++ b/src/ruby/ext/grpc/rb_grpc_imports.generated.h
@@ -488,6 +488,9 @@
 typedef void(*grpc_slice_unref_type)(grpc_slice s);
 extern grpc_slice_unref_type grpc_slice_unref_import;
 #define grpc_slice_unref grpc_slice_unref_import
+typedef grpc_slice(*grpc_slice_copy_type)(grpc_slice s);
+extern grpc_slice_copy_type grpc_slice_copy_import;
+#define grpc_slice_copy grpc_slice_copy_import
 typedef grpc_slice(*grpc_slice_new_type)(void *p, size_t len, void (*destroy)(void *));
 extern grpc_slice_new_type grpc_slice_new_import;
 #define grpc_slice_new grpc_slice_new_import
@@ -500,6 +503,9 @@
 typedef grpc_slice(*grpc_slice_malloc_type)(size_t length);
 extern grpc_slice_malloc_type grpc_slice_malloc_import;
 #define grpc_slice_malloc grpc_slice_malloc_import
+typedef grpc_slice(*grpc_slice_malloc_large_type)(size_t length);
+extern grpc_slice_malloc_large_type grpc_slice_malloc_large_import;
+#define grpc_slice_malloc_large grpc_slice_malloc_large_import
 typedef grpc_slice(*grpc_slice_intern_type)(grpc_slice slice);
 extern grpc_slice_intern_type grpc_slice_intern_import;
 #define grpc_slice_intern grpc_slice_intern_import
@@ -524,6 +530,9 @@
 typedef grpc_slice(*grpc_slice_split_tail_type)(grpc_slice *s, size_t split);
 extern grpc_slice_split_tail_type grpc_slice_split_tail_import;
 #define grpc_slice_split_tail grpc_slice_split_tail_import
+typedef grpc_slice(*grpc_slice_split_tail_maybe_ref_type)(grpc_slice *s, size_t split, grpc_slice_ref_whom ref_whom);
+extern grpc_slice_split_tail_maybe_ref_type grpc_slice_split_tail_maybe_ref_import;
+#define grpc_slice_split_tail_maybe_ref grpc_slice_split_tail_maybe_ref_import
 typedef grpc_slice(*grpc_slice_split_head_type)(grpc_slice *s, size_t split);
 extern grpc_slice_split_head_type grpc_slice_split_head_import;
 #define grpc_slice_split_head grpc_slice_split_head_import
@@ -608,6 +617,9 @@
 typedef void(*grpc_slice_buffer_move_first_type)(grpc_slice_buffer *src, size_t n, grpc_slice_buffer *dst);
 extern grpc_slice_buffer_move_first_type grpc_slice_buffer_move_first_import;
 #define grpc_slice_buffer_move_first grpc_slice_buffer_move_first_import
+typedef void(*grpc_slice_buffer_move_first_no_ref_type)(grpc_slice_buffer *src, size_t n, grpc_slice_buffer *dst);
+extern grpc_slice_buffer_move_first_no_ref_type grpc_slice_buffer_move_first_no_ref_import;
+#define grpc_slice_buffer_move_first_no_ref grpc_slice_buffer_move_first_no_ref_import
 typedef void(*grpc_slice_buffer_move_first_into_buffer_type)(grpc_exec_ctx *exec_ctx, grpc_slice_buffer *src, size_t n, void *dst);
 extern grpc_slice_buffer_move_first_into_buffer_type grpc_slice_buffer_move_first_into_buffer_import;
 #define grpc_slice_buffer_move_first_into_buffer grpc_slice_buffer_move_first_into_buffer_import
diff --git a/test/core/util/passthru_endpoint.c b/test/core/util/passthru_endpoint.c
index 121567f..6400845 100644
--- a/test/core/util/passthru_endpoint.c
+++ b/test/core/util/passthru_endpoint.c
@@ -102,13 +102,14 @@
     error = GRPC_ERROR_CREATE_FROM_STATIC_STRING("Endpoint already shutdown");
   } else if (m->on_read != NULL) {
     for (size_t i = 0; i < slices->count; i++) {
-      grpc_slice_buffer_add(m->on_read_out, grpc_slice_ref(slices->slices[i]));
+      grpc_slice_buffer_add(m->on_read_out, grpc_slice_copy(slices->slices[i]));
     }
     grpc_closure_sched(exec_ctx, m->on_read, GRPC_ERROR_NONE);
     m->on_read = NULL;
   } else {
     for (size_t i = 0; i < slices->count; i++) {
-      grpc_slice_buffer_add(&m->read_buffer, grpc_slice_ref(slices->slices[i]));
+      grpc_slice_buffer_add(&m->read_buffer,
+                            grpc_slice_copy(slices->slices[i]));
     }
   }
   gpr_mu_unlock(&m->parent->mu);
diff --git a/test/core/util/trickle_endpoint.c b/test/core/util/trickle_endpoint.c
index 0848147..58ac597 100644
--- a/test/core/util/trickle_endpoint.c
+++ b/test/core/util/trickle_endpoint.c
@@ -66,14 +66,14 @@
 static void te_write(grpc_exec_ctx *exec_ctx, grpc_endpoint *ep,
                      grpc_slice_buffer *slices, grpc_closure *cb) {
   trickle_endpoint *te = (trickle_endpoint *)ep;
-  for (size_t i = 0; i < slices->count; i++) {
-    grpc_slice_ref_internal(slices->slices[i]);
-  }
   gpr_mu_lock(&te->mu);
   if (te->write_buffer.length == 0) {
     te->last_write = gpr_now(GPR_CLOCK_MONOTONIC);
   }
-  grpc_slice_buffer_addn(&te->write_buffer, slices->slices, slices->count);
+  for (size_t i = 0; i < slices->count; i++) {
+    grpc_slice_buffer_add(&te->write_buffer,
+                          grpc_slice_copy(slices->slices[i]));
+  }
   grpc_closure_sched(exec_ctx, cb, GRPC_ERROR_REF(te->error));
   gpr_mu_unlock(&te->mu);
 }
diff --git a/tools/run_tests/sanity/core_banned_functions.py b/tools/run_tests/sanity/core_banned_functions.py
index c3c3cbe..2387c5f 100755
--- a/tools/run_tests/sanity/core_banned_functions.py
+++ b/tools/run_tests/sanity/core_banned_functions.py
@@ -50,6 +50,7 @@
     'grpc_os_error(': ['src/core/lib/iomgr/error.c'],
     'grpc_wsa_error(': ['src/core/lib/iomgr/error.c'],
     'grpc_log_if_error(': ['src/core/lib/iomgr/error.c'],
+    'grpc_slice_malloc(': ['src/core/lib/slice/slice.c'],
 }
 
 errors = 0