Blame - crc32_simd.c - platform/external/zlib

blob: 2fef610e21a2e6350c7965b54b90c22c903c2b6b [file] [log] [blame]

Noel Gordon	8e904b3	2018-01-04 12:10:08 +1100	[diff] [blame]	1	/* crc32_simd.c
				2	*
				3	* Copyright 2017 The Chromium Authors. All rights reserved.
				4	* Use of this source code is governed by a BSD-style license that can be
				5	* found in the Chromium source repository LICENSE file.
				6	*/
				7
				8	#include "crc32_simd.h"
				9
				10	#if defined(CRC32_SIMD_SSE42_PCLMUL)
				11
				12	/*
				13	* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
				14	* length must be at least 64, and a multiple of 16. Based on:
				15	*
				16	* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
				17	* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
				18	*/
				19
				20	#include <emmintrin.h>
				21	#include <smmintrin.h>
				22	#include <wmmintrin.h>
				23
				24	uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
				25	const unsigned char *buf,
				26	z_size_t len,
				27	uint32_t crc)
				28	{
				29	/*
				30	* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
				31	* the CRC32+Barrett polynomials given at the end of the paper.
				32	*/
				33	static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
				34	static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
				35	static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
				36	static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
				37
				38	__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
				39
				40	/*
				41	* There's at least one block of 64.
				42	*/
				43	x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
				44	x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
				45	x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
				46	x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
				47
				48	x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
				49
				50	x0 = _mm_load_si128((__m128i *)k1k2);
				51
				52	buf += 64;
				53	len -= 64;
				54
				55	/*
				56	* Parallel fold blocks of 64, if any.
				57	*/
				58	while (len >= 64)
				59	{
				60	x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
				61	x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
				62	x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
				63	x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
				64
				65	x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
				66	x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
				67	x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
				68	x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
				69
				70	y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
				71	y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
				72	y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
				73	y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
				74
				75	x1 = _mm_xor_si128(x1, x5);
				76	x2 = _mm_xor_si128(x2, x6);
				77	x3 = _mm_xor_si128(x3, x7);
				78	x4 = _mm_xor_si128(x4, x8);
				79
				80	x1 = _mm_xor_si128(x1, y5);
				81	x2 = _mm_xor_si128(x2, y6);
				82	x3 = _mm_xor_si128(x3, y7);
				83	x4 = _mm_xor_si128(x4, y8);
				84
				85	buf += 64;
				86	len -= 64;
				87	}
				88
				89	/*
				90	* Fold into 128-bits.
				91	*/
				92	x0 = _mm_load_si128((__m128i *)k3k4);
				93
				94	x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
				95	x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
				96	x1 = _mm_xor_si128(x1, x2);
				97	x1 = _mm_xor_si128(x1, x5);
				98
				99	x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
				100	x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
				101	x1 = _mm_xor_si128(x1, x3);
				102	x1 = _mm_xor_si128(x1, x5);
				103
				104	x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
				105	x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
				106	x1 = _mm_xor_si128(x1, x4);
				107	x1 = _mm_xor_si128(x1, x5);
				108
				109	/*
				110	* Single fold blocks of 16, if any.
				111	*/
				112	while (len >= 16)
				113	{
				114	x2 = _mm_loadu_si128((__m128i *)buf);
				115
				116	x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
				117	x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
				118	x1 = _mm_xor_si128(x1, x2);
				119	x1 = _mm_xor_si128(x1, x5);
				120
				121	buf += 16;
				122	len -= 16;
				123	}
				124
				125	/*
				126	* Fold 128-bits to 64-bits.
				127	*/
				128	x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
Noel Gordon	bb1fbea	2018-02-05 20:37:08 +0000	[diff] [blame]	129	x3 = _mm_setr_epi32(~0, 0, ~0, 0);
Noel Gordon	8e904b3	2018-01-04 12:10:08 +1100	[diff] [blame]	130	x1 = _mm_srli_si128(x1, 8);
				131	x1 = _mm_xor_si128(x1, x2);
				132
				133	x0 = _mm_loadl_epi64((__m128i*)k5k0);
				134
				135	x2 = _mm_srli_si128(x1, 4);
				136	x1 = _mm_and_si128(x1, x3);
				137	x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
				138	x1 = _mm_xor_si128(x1, x2);
				139
				140	/*
				141	* Barret reduce to 32-bits.
				142	*/
				143	x0 = _mm_load_si128((__m128i*)poly);
				144
				145	x2 = _mm_and_si128(x1, x3);
				146	x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
				147	x2 = _mm_and_si128(x2, x3);
				148	x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
				149	x1 = _mm_xor_si128(x1, x2);
				150
				151	/*
				152	* Return the crc32.
				153	*/
				154	return _mm_extract_epi32(x1, 1);
				155	}
				156
Adenilson Cavalcanti	7235672	2018-02-16 03:41:14 +0000	[diff] [blame]	157	#elif defined(CRC32_ARMV8_CRC32)
				158
				159	/* CRC32 checksums using ARMv8-a crypto instructions.
				160	*
				161	* TODO: implement a version using the PMULL instruction.
				162	*/
Adenilson Cavalcanti	7235672	2018-02-16 03:41:14 +0000	[diff] [blame]	163
Jose Dapena Paz	bbacb13	2019-06-10 09:16:22 +0000	[diff] [blame^]	164	#if defined(__clang__)
George Burgess IV	de0fe05	2018-08-23 00:40:55 +0000	[diff] [blame]	165	/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
				166	* armv8 target, which is incompatible with ThinLTO optimizations on Android.
				167	* (Namely, mixing and matching different module-level targets makes ThinLTO
				168	* warn, and Android defaults to armv7-a. This restriction does not apply to
				169	* function-level `target`s, however.)
				170	*
				171	* Since we only need three crc intrinsics, and since clang's implementation of
				172	* those are just wrappers around compiler builtins, it's simplest to #define
				173	* those builtins directly. If this #define list grows too much (or we depend on
				174	* an intrinsic that isn't a trivial wrapper), we may have to find a better way
				175	* to go about this.
				176	*
				177	* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
				178	* feature for this target (ignoring feature)." This appears to be a harmless
				179	* bug in clang.
				180	*/
				181	#define __crc32b __builtin_arm_crc32b
				182	#define __crc32d __builtin_arm_crc32d
				183	#define __crc32w __builtin_arm_crc32w
				184
				185	#if defined(__aarch64__)
				186	#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
Jose Dapena Paz	bbacb13	2019-06-10 09:16:22 +0000	[diff] [blame^]	187	#else // !defined(__aarch64__)
George Burgess IV	de0fe05	2018-08-23 00:40:55 +0000	[diff] [blame]	188	#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
Jose Dapena Paz	bbacb13	2019-06-10 09:16:22 +0000	[diff] [blame^]	189	#endif // defined(__aarch64__)
				190
				191	#elif defined(__GNUC__)
				192	/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
				193	* allowed. We can just include arm_acle.h.
				194	*/
				195	#include <arm_acle.h>
				196	#define TARGET_ARMV8_WITH_CRC
				197	#else // !defined(__clang__) && !defined(__GNUC__)
				198	#error ARM CRC32 SIMD extensions only supported for Clang and GCC
George Burgess IV	de0fe05	2018-08-23 00:40:55 +0000	[diff] [blame]	199	#endif
				200
				201	TARGET_ARMV8_WITH_CRC
Adenilson Cavalcanti	7235672	2018-02-16 03:41:14 +0000	[diff] [blame]	202	uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
				203	const unsigned char *buf,
				204	z_size_t len)
				205	{
				206	uint32_t c = (uint32_t) ~crc;
				207
				208	while (len && ((uintptr_t)buf & 7)) {
				209	c = __crc32b(c, *buf++);
				210	--len;
				211	}
				212
				213	const uint64_t buf8 = (const uint64_t )buf;
				214
				215	while (len >= 64) {
				216	c = __crc32d(c, *buf8++);
				217	c = __crc32d(c, *buf8++);
				218	c = __crc32d(c, *buf8++);
				219	c = __crc32d(c, *buf8++);
				220
				221	c = __crc32d(c, *buf8++);
				222	c = __crc32d(c, *buf8++);
				223	c = __crc32d(c, *buf8++);
				224	c = __crc32d(c, *buf8++);
				225	len -= 64;
				226	}
				227
				228	while (len >= 8) {
				229	c = __crc32d(c, *buf8++);
				230	len -= 8;
				231	}
				232
				233	buf = (const unsigned char *)buf8;
				234
				235	while (len--) {
				236	c = __crc32b(c, *buf++);
				237	}
				238
				239	return ~c;
				240	}
				241
George Burgess IV	de0fe05	2018-08-23 00:40:55 +0000	[diff] [blame]	242	TARGET_ARMV8_WITH_CRC
Adenilson Cavalcanti	21cc38f	2018-08-15 01:06:05 +0000	[diff] [blame]	243	Pos ZLIB_INTERNAL insert_string_arm(deflate_state *const s, const Pos str)
				244	{
				245	Pos ret;
				246	unsigned *ip, val, h = 0;
				247
				248	ip = (unsigned *)&s->window[str];
				249	val = *ip;
				250
				251	if (s->level >= 6)
				252	val &= 0xFFFFFF;
				253
				254	h = __crc32w(h, val);
				255
				256	ret = s->head[h & s->hash_mask];
				257	s->head[h & s->hash_mask] = str;
				258	s->prev[str & s->w_mask] = ret;
				259	return ret;
				260	}
				261
Adenilson Cavalcanti	7235672	2018-02-16 03:41:14 +0000	[diff] [blame]	262	#endif