Blame - current/sdk/common_os/include/external/zlib/contrib/optimizations/chunkcopy.h - platform/prebuilts/module_sdk/art

blob: f40546d54dbe770c9e4625881317efadceaf0b5b [file] [log] [blame]

Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	1	/* chunkcopy.h -- fast chunk copy and set operations
				2	* Copyright (C) 2017 ARM, Inc.
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	3	* Copyright 2017 The Chromium Authors
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the Chromium source repository LICENSE file.
				6	*/
				7
				8	#ifndef CHUNKCOPY_H
				9	#define CHUNKCOPY_H
				10
				11	#include <stdint.h>
				12	#include "zutil.h"
				13
				14	#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]
				15
				16	#if __STDC_VERSION__ >= 199901L
				17	#define Z_RESTRICT restrict
				18	#else
				19	#define Z_RESTRICT
				20	#endif
				21
				22	#if defined(__clang__) \|\| defined(__GNUC__) \|\| defined(__llvm__)
				23	#define Z_BUILTIN_MEMCPY __builtin_memcpy
				24	#else
				25	#define Z_BUILTIN_MEMCPY zmemcpy
				26	#endif
				27
				28	#if defined(INFLATE_CHUNK_SIMD_NEON)
				29	#include <arm_neon.h>
				30	typedef uint8x16_t z_vec128i_t;
				31	#elif defined(INFLATE_CHUNK_SIMD_SSE2)
				32	#include <emmintrin.h>
				33	typedef __m128i z_vec128i_t;
				34	#else
				35	#error chunkcopy.h inflate chunk SIMD is not defined for your build target
				36	#endif
				37
				38	/*
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	39	* Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
				40	*/
				41	#define Z_DISABLE_MSAN
				42	#if defined(__has_feature)
				43	#if __has_feature(memory_sanitizer)
				44	#undef Z_DISABLE_MSAN
				45	#define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
				46	#endif
				47	#endif
				48
				49	/*
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	50	* chunk copy type: the z_vec128i_t type size should be exactly 128-bits
				51	* and equal to CHUNKCOPY_CHUNK_SIZE.
				52	*/
				53	#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)
				54
				55	Z_STATIC_ASSERT(vector_128_bits_wide,
				56	CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);
				57
				58	/*
				59	* Ask the compiler to perform a wide, unaligned load with a machine
				60	* instruction appropriate for the z_vec128i_t type.
				61	*/
				62	static inline z_vec128i_t loadchunk(
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	63	const unsigned char FAR* s) Z_DISABLE_MSAN {
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	64	z_vec128i_t v;
				65	Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
				66	return v;
				67	}
				68
				69	/*
				70	* Ask the compiler to perform a wide, unaligned store with a machine
				71	* instruction appropriate for the z_vec128i_t type.
				72	*/
				73	static inline void storechunk(
				74	unsigned char FAR* d,
				75	const z_vec128i_t v) {
				76	Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
				77	}
				78
				79	/*
				80	* Perform a memcpy-like operation, assuming that length is non-zero and that
				81	* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
				82	* the length is shorter than this.
				83	*
				84	* It also guarantees that it will properly unroll the data if the distance
				85	* between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
				86	* in chunkcopy_relaxed().
				87	*
				88	* Aside from better memory bus utilisation, this means that short copies
				89	* (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
				90	* without iteration, which will hopefully make the branch prediction more
				91	* reliable.
				92	*/
				93	static inline unsigned char FAR* chunkcopy_core(
				94	unsigned char FAR* out,
				95	const unsigned char FAR* from,
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	96	unsigned len) Z_DISABLE_MSAN {
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	97	const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
				98	storechunk(out, loadchunk(from));
				99	out += bump;
				100	from += bump;
				101	len /= CHUNKCOPY_CHUNK_SIZE;
				102	while (len-- > 0) {
				103	storechunk(out, loadchunk(from));
				104	out += CHUNKCOPY_CHUNK_SIZE;
				105	from += CHUNKCOPY_CHUNK_SIZE;
				106	}
				107	return out;
				108	}
				109
				110	/*
				111	* Like chunkcopy_core(), but avoid writing beyond of legal output.
				112	*
				113	* Accepts an additional pointer to the end of safe output. A generic safe
				114	* copy would use (out + len), but it's normally the case that the end of the
				115	* output buffer is beyond the end of the current copy, and this can still be
				116	* exploited.
				117	*/
				118	static inline unsigned char FAR* chunkcopy_core_safe(
				119	unsigned char FAR* out,
				120	const unsigned char FAR* from,
				121	unsigned len,
				122	unsigned char FAR* limit) {
				123	Assert(out + len <= limit, "chunk copy exceeds safety limit");
				124	if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
				125	const unsigned char FAR* Z_RESTRICT rfrom = from;
				126	Assert((uintptr_t)out - (uintptr_t)from >= len,
				127	"invalid restrict in chunkcopy_core_safe");
				128	Assert((uintptr_t)from - (uintptr_t)out >= len,
				129	"invalid restrict in chunkcopy_core_safe");
				130	if (len & 8) {
				131	Z_BUILTIN_MEMCPY(out, rfrom, 8);
				132	out += 8;
				133	rfrom += 8;
				134	}
				135	if (len & 4) {
				136	Z_BUILTIN_MEMCPY(out, rfrom, 4);
				137	out += 4;
				138	rfrom += 4;
				139	}
				140	if (len & 2) {
				141	Z_BUILTIN_MEMCPY(out, rfrom, 2);
				142	out += 2;
				143	rfrom += 2;
				144	}
				145	if (len & 1) {
				146	out++ = rfrom++;
				147	}
				148	return out;
				149	}
				150	return chunkcopy_core(out, from, len);
				151	}
				152
				153	/*
				154	* Perform short copies until distance can be rewritten as being at least
				155	* CHUNKCOPY_CHUNK_SIZE.
				156	*
				157	* Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
				158	* bytes of output even if the copy is shorter than this. This assumption
				159	* holds within zlib inflate_fast(), which starts every iteration with at
				160	* least 258 bytes of output space available (258 being the maximum length
				161	* output from a single token; see inffast.c).
				162	*/
				163	static inline unsigned char FAR* chunkunroll_relaxed(
				164	unsigned char FAR* out,
				165	unsigned FAR* dist,
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	166	unsigned FAR* len) Z_DISABLE_MSAN {
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	167	const unsigned char FAR* from = out - *dist;
				168	while (dist < len && *dist < CHUNKCOPY_CHUNK_SIZE) {
				169	storechunk(out, loadchunk(from));
				170	out += *dist;
				171	len -= dist;
				172	dist += dist;
				173	}
				174	return out;
				175	}
				176
				177	#if defined(INFLATE_CHUNK_SIMD_NEON)
				178	/*
				179	* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
				180	* every 64-bit component of the 128-bit result (64-bit int splat).
				181	*/
				182	static inline z_vec128i_t v_load64_dup(const void* src) {
				183	return vcombine_u8(vld1_u8(src), vld1_u8(src));
				184	}
				185
				186	/*
				187	* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
				188	* every 32-bit component of the 128-bit result (32-bit int splat).
				189	*/
				190	static inline z_vec128i_t v_load32_dup(const void* src) {
				191	int32_t i32;
				192	Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
				193	return vreinterpretq_u8_s32(vdupq_n_s32(i32));
				194	}
				195
				196	/*
				197	* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
				198	* every 16-bit component of the 128-bit result (16-bit int splat).
				199	*/
				200	static inline z_vec128i_t v_load16_dup(const void* src) {
				201	int16_t i16;
				202	Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
				203	return vreinterpretq_u8_s16(vdupq_n_s16(i16));
				204	}
				205
				206	/*
				207	* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
				208	* component of the 128-bit result (8-bit int splat).
				209	*/
				210	static inline z_vec128i_t v_load8_dup(const void* src) {
				211	return vld1q_dup_u8((const uint8_t*)src);
				212	}
				213
				214	/*
				215	* v_store_128(): store the 128-bit vec in a memory destination (that might
				216	* not be 16-byte aligned) void* out.
				217	*/
				218	static inline void v_store_128(void* out, const z_vec128i_t vec) {
				219	vst1q_u8(out, vec);
				220	}
				221
				222	#elif defined(INFLATE_CHUNK_SIMD_SSE2)
				223	/*
				224	* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
				225	* every 64-bit component of the 128-bit result (64-bit int splat).
				226	*/
				227	static inline z_vec128i_t v_load64_dup(const void* src) {
				228	int64_t i64;
				229	Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
				230	return _mm_set1_epi64x(i64);
				231	}
				232
				233	/*
				234	* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
				235	* every 32-bit component of the 128-bit result (32-bit int splat).
				236	*/
				237	static inline z_vec128i_t v_load32_dup(const void* src) {
				238	int32_t i32;
				239	Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
				240	return _mm_set1_epi32(i32);
				241	}
				242
				243	/*
				244	* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
				245	* every 16-bit component of the 128-bit result (16-bit int splat).
				246	*/
				247	static inline z_vec128i_t v_load16_dup(const void* src) {
				248	int16_t i16;
				249	Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
				250	return _mm_set1_epi16(i16);
				251	}
				252
				253	/*
				254	* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
				255	* component of the 128-bit result (8-bit int splat).
				256	*/
				257	static inline z_vec128i_t v_load8_dup(const void* src) {
				258	return _mm_set1_epi8((const char)src);
				259	}
				260
				261	/*
				262	* v_store_128(): store the 128-bit vec in a memory destination (that might
				263	* not be 16-byte aligned) void* out.
				264	*/
				265	static inline void v_store_128(void* out, const z_vec128i_t vec) {
				266	_mm_storeu_si128((__m128i*)out, vec);
				267	}
				268	#endif
				269
				270	/*
				271	* Perform an overlapping copy which behaves as a memset() operation, but
				272	* supporting periods other than one, and assume that length is non-zero and
				273	* that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
				274	* even if the length is shorter than this.
				275	*/
				276	static inline unsigned char FAR* chunkset_core(
				277	unsigned char FAR* out,
				278	unsigned period,
				279	unsigned len) {
				280	z_vec128i_t v;
				281	const int bump = ((len - 1) % sizeof(v)) + 1;
				282
				283	switch (period) {
				284	case 1:
				285	v = v_load8_dup(out - 1);
				286	v_store_128(out, v);
				287	out += bump;
				288	len -= bump;
				289	while (len > 0) {
				290	v_store_128(out, v);
				291	out += sizeof(v);
				292	len -= sizeof(v);
				293	}
				294	return out;
				295	case 2:
				296	v = v_load16_dup(out - 2);
				297	v_store_128(out, v);
				298	out += bump;
				299	len -= bump;
				300	if (len > 0) {
				301	v = v_load16_dup(out - 2);
				302	do {
				303	v_store_128(out, v);
				304	out += sizeof(v);
				305	len -= sizeof(v);
				306	} while (len > 0);
				307	}
				308	return out;
				309	case 4:
				310	v = v_load32_dup(out - 4);
				311	v_store_128(out, v);
				312	out += bump;
				313	len -= bump;
				314	if (len > 0) {
				315	v = v_load32_dup(out - 4);
				316	do {
				317	v_store_128(out, v);
				318	out += sizeof(v);
				319	len -= sizeof(v);
				320	} while (len > 0);
				321	}
				322	return out;
				323	case 8:
				324	v = v_load64_dup(out - 8);
				325	v_store_128(out, v);
				326	out += bump;
				327	len -= bump;
				328	if (len > 0) {
				329	v = v_load64_dup(out - 8);
				330	do {
				331	v_store_128(out, v);
				332	out += sizeof(v);
				333	len -= sizeof(v);
				334	} while (len > 0);
				335	}
				336	return out;
				337	}
				338	out = chunkunroll_relaxed(out, &period, &len);
				339	return chunkcopy_core(out, out - period, len);
				340	}
				341
				342	/*
				343	* Perform a memcpy-like operation, but assume that length is non-zero and that
				344	* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
				345	* the length is shorter than this.
				346	*
				347	* Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
				348	* of overlapping buffers, regardless of the distance between the pointers.
				349	* This is reflected in the `restrict`-qualified pointers, allowing the
				350	* compiler to re-order loads and stores.
				351	*/
				352	static inline unsigned char FAR* chunkcopy_relaxed(
				353	unsigned char FAR* Z_RESTRICT out,
				354	const unsigned char FAR* Z_RESTRICT from,
				355	unsigned len) {
				356	Assert((uintptr_t)out - (uintptr_t)from >= len,
				357	"invalid restrict in chunkcopy_relaxed");
				358	Assert((uintptr_t)from - (uintptr_t)out >= len,
				359	"invalid restrict in chunkcopy_relaxed");
				360	return chunkcopy_core(out, from, len);
				361	}
				362
				363	/*
				364	* Like chunkcopy_relaxed(), but avoid writing beyond of legal output.
				365	*
				366	* Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
				367	* behaviour of overlapping buffers, regardless of the distance between the
				368	* pointers. This is reflected in the `restrict`-qualified pointers, allowing
				369	* the compiler to re-order loads and stores.
				370	*
				371	* Accepts an additional pointer to the end of safe output. A generic safe
				372	* copy would use (out + len), but it's normally the case that the end of the
				373	* output buffer is beyond the end of the current copy, and this can still be
				374	* exploited.
				375	*/
				376	static inline unsigned char FAR* chunkcopy_safe(
				377	unsigned char FAR* out,
				378	const unsigned char FAR* Z_RESTRICT from,
				379	unsigned len,
				380	unsigned char FAR* limit) {
				381	Assert(out + len <= limit, "chunk copy exceeds safety limit");
				382	Assert((uintptr_t)out - (uintptr_t)from >= len,
				383	"invalid restrict in chunkcopy_safe");
				384	Assert((uintptr_t)from - (uintptr_t)out >= len,
				385	"invalid restrict in chunkcopy_safe");
				386
				387	return chunkcopy_core_safe(out, from, len, limit);
				388	}
				389
				390	/*
				391	* Perform chunky copy within the same buffer, where the source and destination
				392	* may potentially overlap.
				393	*
				394	* Assumes that len > 0 on entry, and that it's safe to write at least
				395	* CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
				396	*/
				397	static inline unsigned char FAR* chunkcopy_lapped_relaxed(
				398	unsigned char FAR* out,
				399	unsigned dist,
				400	unsigned len) {
				401	if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
				402	return chunkset_core(out, dist, len);
				403	}
				404	return chunkcopy_core(out, out - dist, len);
				405	}
				406
				407	/*
				408	* Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond of legal
				409	* output.
				410	*
				411	* Accepts an additional pointer to the end of safe output. A generic safe
				412	* copy would use (out + len), but it's normally the case that the end of the
				413	* output buffer is beyond the end of the current copy, and this can still be
				414	* exploited.
				415	*/
				416	static inline unsigned char FAR* chunkcopy_lapped_safe(
				417	unsigned char FAR* out,
				418	unsigned dist,
				419	unsigned len,
				420	unsigned char FAR* limit) {
				421	Assert(out + len <= limit, "chunk copy exceeds safety limit");
				422	if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
				423	/* TODO(cavalcantii): try harder to optimise this */
				424	while (len-- > 0) {
				425	out = (out - dist);
				426	out++;
				427	}
				428	return out;
				429	}
				430	return chunkcopy_lapped_relaxed(out, dist, len);
				431	}
				432
				433	/* TODO(cavalcanti): see crbug.com/1110083. */
				434	static inline unsigned char FAR* chunkcopy_safe_ugly(unsigned char FAR* out,
				435	unsigned dist,
				436	unsigned len,
				437	unsigned char FAR* limit) {
				438	#if defined(__GNUC__) && !defined(__clang__)
				439	/* Speed is the same as using chunkcopy_safe
				440	w/ GCC on ARM (tested gcc 6.3 and 7.5) and avoids
				441	undefined behavior.
				442	*/
				443	return chunkcopy_core_safe(out, out - dist, len, limit);
				444	#elif defined(__clang__) && defined(ARMV8_OS_ANDROID) && !defined(__aarch64__)
				445	/* Seems to perform better on 32bit (i.e. Android). */
				446	return chunkcopy_core_safe(out, out - dist, len, limit);
				447	#else
				448	/* Seems to perform better on 64bit. */
				449	return chunkcopy_lapped_safe(out, dist, len, limit);
				450	#endif
				451	}
				452
				453	/*
				454	* The chunk-copy code above deals with writing the decoded DEFLATE data to
				455	* the output with SIMD methods to increase decode speed. Reading the input
				456	* to the DEFLATE decoder with a wide, SIMD method can also increase decode
				457	* speed. This option is supported on little endian machines, and reads the
				458	* input data in 64-bit (8 byte) chunks.
				459	*/
				460
				461	#ifdef INFLATE_CHUNK_READ_64LE
				462	/*
				463	* Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
				464	*/
				465	typedef uint64_t inflate_holder_t;
				466
				467	/*
				468	* Ask the compiler to perform a wide, unaligned load of a uint64_t using a
				469	* machine instruction appropriate for the uint64_t type.
				470	*/
				471	static inline inflate_holder_t read64le(const unsigned char FAR *in) {
				472	inflate_holder_t input;
				473	Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
				474	return input;
				475	}
				476	#else
				477	/*
				478	* Otherwise, buffer the input bits using zlib's default input buffer type.
				479	*/
				480	typedef unsigned long inflate_holder_t;
				481
				482	#endif /* INFLATE_CHUNK_READ_64LE */
				483
				484	#undef Z_STATIC_ASSERT
				485	#undef Z_RESTRICT
				486	#undef Z_BUILTIN_MEMCPY
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	487	#undef Z_DISABLE_MSAN
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	488
				489	#endif /* CHUNKCOPY_H */