Skip to content

Commit

Permalink
Enable building for non-x86/arm targets.
Browse files Browse the repository at this point in the history
Generic paths for chunk ops and hash function.  Add a cmake option to
avoid hardware features so that the generic paths can be tested on x86.
  • Loading branch information
Simon Hosie authored and kornelski committed Nov 20, 2023
1 parent 0144282 commit 5666c2d
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 34 deletions.
64 changes: 39 additions & 25 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ endif()
# Build configuration toggles (boolean cache options; all default OFF).
# BUILD_GENERIC_CODE skips every architecture feature probe so the portable
# fallback paths can be built and tested even on x86/ARM hosts.
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(BUILD_EXAMPLES "Build examples" OFF)
option(SKIP_CPUID_CHECK "Assume CPU supports fast CRC" OFF)
option(BUILD_GENERIC_CODE "Avoid architecture-specific code paths" OFF)
option(FORCE_CHUNK_COPY "Force chunk-copy optimization" OFF)
option(FORCE_UNALIGNED_READ_64LE "Force unaligned 64-bit read optimization" OFF)

if(SKIP_CPUID_CHECK)
add_definitions(-DSKIP_CPUID_CHECK)
Expand Down Expand Up @@ -67,32 +70,36 @@ endif()
# Compiler dependent flags
include (CheckCCompilerFlag)
if(UNIX OR MINGW)
check_c_compiler_flag(-march=armv8-a+crc ARM_CRC)
if(ARM_CRC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc")
if(BUILD_GENERIC_CODE)
message(STATUS "Skipping target feature checks")
else()
check_c_compiler_flag(-msse2 HAS_SSE2)
if(HAS_SSE2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
add_definitions(-DHAS_SSE2)
endif()

check_c_compiler_flag(-mssse3 HAS_SSSE3)
if(HAS_SSSE3)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
add_definitions(-DHAS_SSSE3)
endif()

check_c_compiler_flag(-msse4.2 HAS_SSE42)
if(HAS_SSE42)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
add_definitions(-DHAS_SSE42)
endif()

check_c_compiler_flag(-mpclmul HAS_PCLMUL)
if(HAS_PCLMUL)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")
add_definitions(-DHAS_PCLMUL)
check_c_compiler_flag(-march=armv8-a+crc ARM_CRC)
if(ARM_CRC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc")
else()
check_c_compiler_flag(-msse2 HAS_SSE2)
if(HAS_SSE2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
add_definitions(-DHAS_SSE2)
endif()

check_c_compiler_flag(-mssse3 HAS_SSSE3)
if(HAS_SSSE3)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
add_definitions(-DHAS_SSSE3)
endif()

check_c_compiler_flag(-msse4.2 HAS_SSE42)
if(HAS_SSE42)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.2")
add_definitions(-DHAS_SSE42)
endif()

check_c_compiler_flag(-mpclmul HAS_PCLMUL)
if(HAS_PCLMUL)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")
add_definitions(-DHAS_PCLMUL)
endif()
endif()
endif()
elseif(MSVC)
Expand Down Expand Up @@ -144,6 +151,13 @@ set(ZLIB_SRCS
)

if(UNIX OR MINGW)
# Manual overrides for the portable inflate fast paths, so the generic
# (non-SIMD) implementations can be exercised on any target.
# NOTE(review): if ARM_CRC is also true, inffast_chunk.c is appended to
# ZLIB_SRCS a second time below — CMake tolerates duplicate sources, but a
# guard here would be tidier; confirm against the full file.
if(FORCE_CHUNK_COPY)
list(APPEND ZLIB_SRCS inffast_chunk.c)
add_definitions(-DINFLATE_CHUNK_GENERIC)
endif()
# Define only the macro: the 64-bit-little-endian read path lives in the
# chunk-copy sources already added above or by the ARM_CRC branch.
if(FORCE_UNALIGNED_READ_64LE)
add_definitions(-DINFLATE_CHUNK_READ_64LE)
endif()
# append "inffast_chunk.c" and "adler32_simd.c" for ARMv8 CPU
if(ARM_CRC)
list(APPEND ZLIB_SRCS inffast_chunk.c adler32_simd.c)
Expand Down
75 changes: 74 additions & 1 deletion chunkcopy.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,10 @@

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#define Z_BUILTIN_MEMSET __builtin_memset
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#define Z_BUILTIN_MEMSET zmemset
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
Expand All @@ -54,7 +56,7 @@ typedef uint8x16_t z_vec128i_t;
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
typedef struct { uint8_t x[16]; } z_vec128i_t;
#endif

/*
Expand Down Expand Up @@ -271,6 +273,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#else
/*
* Default implementations for chunk-copy functions rely on memcpy() being
* inlined by the compiler for best performance. This is most likely to work
* as expected when the length argument is constant (as is the case here) and
* the target supports unaligned loads and stores. Since that's not always a
* safe assumption, this may need extra compiler arguments such as
* `-mno-strict-align` or `-munaligned-access`, or the availability of
* extensions like SIMD.
*/

/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t in;
  /* memcpy with a constant size lets the compiler emit an unaligned load
   * instead of a (possibly trapping) misaligned direct dereference. */
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  /* size_t index: sizeof() is unsigned, so an int index would provoke
   * -Wsign-compare warnings; the loop fully unrolls either way. */
  for (size_t i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t in;
  /* Unaligned-safe 32-bit load via constant-size memcpy. */
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  /* size_t index avoids the signed/unsigned comparison with sizeof(out). */
  for (size_t i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t in;
  /* Unaligned-safe 16-bit load via constant-size memcpy. */
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  /* size_t index avoids the signed/unsigned comparison with sizeof(out). */
  for (size_t i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
  /* Keep the byte unsigned: converting values >= 128 to int8_t is
   * implementation-defined in C. memset uses only the low 8 bits of its
   * fill argument, so the resulting splat is identical for all inputs. */
  uint8_t in = *(const uint8_t*)src;
  z_vec128i_t out;
  Z_BUILTIN_MEMSET(&out, in, sizeof(out));
  return out;
}

/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  /* A constant-size memcpy compiles down to a single (possibly unaligned)
   * 16-byte store on targets that support it. */
  unsigned char* dst = (unsigned char*)out;
  Z_BUILTIN_MEMCPY(dst, &vec, sizeof(vec));
}
#endif

/*
Expand Down
13 changes: 10 additions & 3 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ static uint32_t hash_func(deflate_state *s, void* str) {
return __crc32cw(0, *(uint32_t*)str) & s->hash_mask;
}

#elif defined __x86_64__ || defined _M_AMD64
#elif defined HAS_SSE42

#include <immintrin.h>
static uint32_t hash_func(deflate_state *s, void* str) {
Expand All @@ -146,7 +146,14 @@ static uint32_t hash_func(deflate_state *s, void* str) {

#else

#error "Only 64-bit Intel and ARM architectures are supported"
/*
 * Generic fallback hash for the deflate string matcher: reads 4 bytes from
 * str (via zmemcpy, so unaligned input is safe) and mixes them before
 * masking down to the hash-table size.
 */
static uint32_t hash_func(deflate_state *s, void* str) {
    uint32_t key;
    zmemcpy(&key, str, sizeof(key));
    key *= 0x85ebca77u;   /* multiply step; constant borrowed from xxHash */
    key ^= key >> 19;     /* fold high bits into the low, masked bits */
    return key & s->hash_mask;
}

#endif

Expand Down Expand Up @@ -1329,7 +1336,7 @@ static void fill_window(deflate_state *s)
q+=8;
}

#elif defined __x86_64__ || defined _M_AMD64
#elif defined HAS_SSE2

__m128i W;
__m128i *q;
Expand Down
10 changes: 5 additions & 5 deletions inflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
#include "inftrees.h"
#include "inflate.h"

#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
#include "inffast_chunk.h"
#include "chunkcopy.h"
#else
Expand Down Expand Up @@ -390,7 +390,7 @@ static int updatewindow(z_streamp strm, const Bytef *end, unsigned copy) {

/* if it hasn't been done already, allocate space for the window */
if (state->window == Z_NULL) {
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
unsigned wsize = 1U << state->wbits;
state->window = (unsigned char FAR *)
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
Expand Down Expand Up @@ -1061,7 +1061,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
RESTORE();
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
inflate_fast_chunk_(strm, out);
#else
inflate_fast(strm, out);
Expand Down Expand Up @@ -1200,7 +1200,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
else
from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2)
#if defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC)
if (copy > left) copy = left;
put = chunkcopy_safe(put, from, copy, put + left);
}
Expand Down Expand Up @@ -1292,7 +1292,7 @@ int ZEXPORT inflate(z_streamp strm, int flush) {
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
#if defined(ZLIB_DEBUG) && (defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2))
#if defined(ZLIB_DEBUG) && (defined(INFLATE_CHUNK_SIMD_NEON) || defined(INFLATE_CHUNK_SIMD_SSE2) || defined(INFLATE_CHUNK_GENERIC))
/* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
* client code relying on undefined behavior when chunk_copy first landed.
*
Expand Down

0 comments on commit 5666c2d

Please sign in to comment.