deps: update zlib to 1.3.0.1-motley-7d77fb7 #52516

Merged: 1 commit, Apr 17, 2024
30 changes: 30 additions & 0 deletions deps/zlib/BUILD.gn
@@ -441,6 +441,36 @@ executable("zlib_bench") {
  configs += [ "//build/config/compiler:no_chromium_code" ]
}

executable("minigzip") {
include_dirs = [ "." ]

sources = [ "test/minigzip.c" ]
if (!is_debug) {
configs -= [ "//build/config/compiler:default_optimization" ]
configs += [ "//build/config/compiler:optimize_speed" ]
}

deps = [ ":zlib" ]

configs -= [ "//build/config/compiler:chromium_code" ]
configs += [ "//build/config/compiler:no_chromium_code" ]
}

executable("zpipe") {
include_dirs = [ "." ]

sources = [ "examples/zpipe.c" ]
if (!is_debug) {
configs -= [ "//build/config/compiler:default_optimization" ]
configs += [ "//build/config/compiler:optimize_speed" ]
}

deps = [ ":zlib" ]

configs -= [ "//build/config/compiler:chromium_code" ]
configs += [ "//build/config/compiler:no_chromium_code" ]
}

if (!is_win || target_os != "winuwp") {
  executable("minizip_bin") {
    include_dirs = [ "." ]
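Both new GN targets only wrap sources that already ship with zlib: test/minigzip.c is the classic gzip-like command-line tool and examples/zpipe.c is upstream zlib's canonical streaming example, which is why `deps = [ ":zlib" ]` is the only dependency either target needs. As a reviewer's aid, here is a minimal, hedged sketch of the kind of deflate loop a zpipe-style tool runs; it is illustrative only and not the contents of examples/zpipe.c (the real file also implements the inflate direction and fuller error handling).

```c
#include <stdio.h>
#include <string.h>
#include "zlib.h"

#define CHUNK 16384

/* Compress src to dst with a bounded-memory streaming loop (zpipe-style sketch). */
static int def_stream(FILE *src, FILE *dst, int level) {
  unsigned char in[CHUNK], out[CHUNK];
  z_stream strm;
  memset(&strm, 0, sizeof(strm));           /* zalloc/zfree/opaque left as Z_NULL */
  if (deflateInit(&strm, level) != Z_OK)
    return Z_ERRNO;

  int flush;
  do {
    strm.avail_in = (uInt)fread(in, 1, CHUNK, src);
    if (ferror(src)) { deflateEnd(&strm); return Z_ERRNO; }
    flush = feof(src) ? Z_FINISH : Z_NO_FLUSH;
    strm.next_in = in;
    do {                                    /* drain all pending output for this input chunk */
      strm.avail_out = CHUNK;
      strm.next_out = out;
      (void)deflate(&strm, flush);
      size_t have = CHUNK - strm.avail_out;
      if (fwrite(out, 1, have, dst) != have) { deflateEnd(&strm); return Z_ERRNO; }
    } while (strm.avail_out == 0);
  } while (flush != Z_FINISH);

  deflateEnd(&strm);
  return Z_OK;
}

int main(void) {
  return def_stream(stdin, stdout, Z_DEFAULT_COMPRESSION) == Z_OK ? 0 : 1;
}
```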
37 changes: 33 additions & 4 deletions deps/zlib/CMakeLists.txt
@@ -26,6 +26,8 @@ option(ENABLE_SIMD_AVX512 "Enable SIMD AXV512 optimizations" OFF)
option(USE_ZLIB_RABIN_KARP_HASH "Enable bitstream compatibility with canonical zlib" OFF)
option(BUILD_UNITTESTS "Enable standalone unit tests build" OFF)
option(BUILD_MINIZIP_BIN "Enable building minzip_bin tool" OFF)
option(BUILD_ZPIPE "Enable building zpipe tool" OFF)
option(BUILD_MINIGZIP "Enable building minigzip tool" OFF)

if (USE_ZLIB_RABIN_KARP_HASH)
  add_definitions(-DUSE_ZLIB_RABIN_KARP_ROLLING_HASH)
@@ -79,9 +81,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
    add_definitions(-DRISCV_RVV)
    add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
    add_definitions(-DADLER32_SIMD_RVV)
    #TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
    # Required by CPU features detection code.
    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")

    # TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
    # chunk_copy is required for READ64 and unconditional decode of literals.
    add_definitions(-DINFLATE_CHUNK_GENERIC)
    add_definitions(-DINFLATE_CHUNK_READ_64LE)

    # Tested with clang-17, unaligned loads are required by read64 & chunk_copy.
    # TODO(cavalcantii): replace internal clang flags for -munaligned-access
    # when we have a newer compiler available.
    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv -Xclang -target-feature -Xclang +unaligned-scalar-mem")
  endif()

endif()
@@ -192,9 +201,14 @@ set(ZLIB_SRCS
if (ENABLE_SIMD_OPTIMIZATIONS)
  if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
    message("RISCVV: Add optimizations.")
    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)

    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
  else()
    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
@@ -339,7 +353,7 @@ if (BUILD_UNITTESTS)
endif()

#============================================================================
# Minigzip tool
# Minizip tool
#============================================================================
# TODO(cavalcantii): get it working on Windows.
if (BUILD_MINIZIP_BIN)
@@ -349,3 +363,18 @@ if (BUILD_MINIZIP_BIN)
  )
  target_link_libraries(minizip_bin zlib)
endif()

#============================================================================
# zpipe tool
#============================================================================
if (BUILD_ZPIPE)
  add_executable(zpipe examples/zpipe.c)
  target_link_libraries(zpipe zlib)
endif()
#============================================================================
# MiniGzip tool
#============================================================================
if (BUILD_MINIGZIP)
  add_executable(minigzip_bin test/minigzip.c)
  target_link_libraries(minigzip_bin zlib)
endif()
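Both new options default to OFF, so existing embedders are unaffected and a consumer has to opt in explicitly. A hedged sketch of how an embedding project might enable the tools (the `deps/zlib` path and the CACHE/FORCE idiom are assumptions for illustration, not part of this PR):

```cmake
# Hypothetical consumer CMakeLists.txt fragment: turn the new tools on before
# adding the zlib subdirectory, mirroring how BUILD_MINIZIP_BIN is consumed.
set(BUILD_ZPIPE ON CACHE BOOL "Build the zpipe example" FORCE)
set(BUILD_MINIGZIP ON CACHE BOOL "Build the minigzip tool" FORCE)
add_subdirectory(deps/zlib)
```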
166 changes: 76 additions & 90 deletions deps/zlib/adler32_simd.c
@@ -41,9 +41,6 @@
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#include "adler32_simd.h"

@@ -368,103 +365,92 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */

#elif defined(ADLER32_SIMD_RVV)
#include <riscv_vector.h>
/* adler32_rvv.c - RVV version of Adler-32
* RVV 1.0 code contributed by Alex Chiang <[email protected]>
* on https://github.com/zlib-ng/zlib-ng/pull/1532
* Port from Simon Hosie's fork:
* https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1

/*
* Patch by Simon Hosie, from:
* https://github.com/cloudflare/zlib/pull/55
*/

uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;

size_t left = len;
size_t vl = __riscv_vsetvlmax_e8m1();
vl = vl > 256 ? 256 : vl;
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
vuint16m2_t v_buf16_accu;

/*
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
* accumulators to boost performance.
*
* The block_size is the largest multiple of vl that <= 256, because overflow would occur when
* vl > 256 (255 * 256 <= UINT16_MAX).
*
* We accumulate 8-bit data into a 16-bit accumulator and then
* move the data into the 32-bit accumulator at the last iteration.
size_t vl = __riscv_vsetvlmax_e8m2();
const vuint16m4_t zero16 = __riscv_vmv_v_x_u16m4(0, vl);
vuint16m4_t a_sum = zero16;
vuint32m8_t b_sum = __riscv_vmv_v_x_u32m8(0, vl);

/* Deal with the part which is not a multiple of vl first; because it's
* easier to zero-stuff the beginning of the checksum than it is to tweak the
* multipliers and sums for odd lengths afterwards.
*/
size_t head = len & (vl - 1);
if (head > 0) {
vuint8m2_t zero8 = __riscv_vmv_v_x_u8m2(0, vl);
vuint8m2_t in = __riscv_vle8_v_u8m2(buf, vl);
in = __riscv_vslideup(zero8, in, vl - head, vl);
vuint16m4_t in16 = __riscv_vwcvtu_x(in, vl);
a_sum = in16;
buf += head;
}

/* We have a 32-bit accumulator, and in each iteration we add 22-times a
* 16-bit value, plus another 16-bit value. We periodically subtract up to
* 65535 times BASE to avoid overflow. b_overflow estimates how often we
* need to do this subtraction.
*/
const int b_overflow = BASE / 23;
int fixup = b_overflow;
ssize_t iters = (len - head) / vl;
while (iters > 0) {
const vuint16m4_t a_overflow = __riscv_vrsub(a_sum, BASE, vl);
int batch = iters < 22 ? iters : 22;
iters -= batch;
b_sum = __riscv_vwmaccu(b_sum, batch, a_sum, vl);
vuint16m4_t a_batch = zero16, b_batch = zero16;

/* Do a short batch, where neither a_sum nor b_sum can overflow a 16-bit
* register. Then add them back into the main accumulators.
*/
size_t block_size = (256 / vl) * vl;
size_t nmax_limit = (NMAX / block_size);
size_t cnt = 0;
while (left >= block_size) {
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t subprob = block_size;
while (subprob > 0) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
buf += vl;
subprob -= vl;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
left -= block_size;
/* do modulo once each block of NMAX size */
if (++cnt >= nmax_limit) {
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
cnt = 0;
}
while (batch-- > 0) {
vuint8m2_t in8 = __riscv_vle8_v_u8m2(buf, vl);
buf += vl;
b_batch = __riscv_vadd(b_batch, a_batch, vl);
a_batch = __riscv_vwaddu_wv(a_batch, in8, vl);
}
/* the left len <= 256 now, we can use 16-bit accum safely */
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
size_t res = left;
while (left >= vl) {
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
buf += vl;
left -= vl;
vbool4_t ov = __riscv_vmsgeu(a_batch, a_overflow, vl);
a_sum = __riscv_vadd(a_sum, a_batch, vl);
a_sum = __riscv_vadd_mu(ov, a_sum, a_sum, 65536 - BASE, vl);
b_sum = __riscv_vwaddu_wv(b_sum, b_batch, vl);
if (--fixup <= 0) {
b_sum = __riscv_vnmsac(b_sum, BASE, __riscv_vsrl(b_sum, 16, vl), vl);
fixup = b_overflow;
}
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

sum2 += (sum2_sum + adler * (len - left));

vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

adler += adler_sum;

while (left--) {
adler += *buf++;
sum2 += adler;
}

sum2 %= BASE;
adler %= BASE;

return adler | (sum2 << 16);
}
/* Adjust per-lane sums to have appropriate offsets from the end of the
* buffer.
*/
const vuint16m4_t off = __riscv_vrsub(__riscv_vid_v_u16m4(vl), vl, vl);
vuint16m4_t bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);
b_sum = __riscv_vadd(__riscv_vwmulu(a_sum, off, vl),
__riscv_vwmulu(bsum16, vl, vl), vl);
bsum16 = __riscv_vncvt_x(__riscv_vremu(b_sum, BASE, vl), vl);

/* And finally, do a horizontal sum across the registers for the final
* result.
*/
uint32_t a = adler & 0xffff;
uint32_t b = ((adler >> 16) + a * (len % BASE)) % BASE;
vuint32m1_t sca = __riscv_vmv_v_x_u32m1(a, 1);
vuint32m1_t scb = __riscv_vmv_v_x_u32m1(b, 1);
sca = __riscv_vwredsumu(a_sum, sca, vl);
scb = __riscv_vwredsumu(bsum16, scb, vl);
a = __riscv_vmv_x(sca);
b = __riscv_vmv_x(scb);
a %= BASE;
b %= BASE;
return (b << 16) | a;
}

#endif /* ADLER32_SIMD_SSSE3 */
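For review purposes it helps to restate what the rewritten RVV kernel must compute: Adler-32 keeps a running byte sum `a` and a running sum-of-sums `b`, both reduced modulo BASE = 65521, and returns `(b << 16) | a`. The sketch below is a scalar reference, not the PR's code; the vector version above only batches these additions per lane and defers the modulo, which is what the `b_overflow = BASE / 23` bookkeeping in the new loop is bounding.

```c
#include <stddef.h>
#include <stdint.h>

#define ADLER_BASE 65521u  /* largest prime below 2^16, same value as zlib's BASE */

/* Scalar reference for what adler32_simd_() computes, one byte at a time. */
static uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
  uint32_t a = adler & 0xffff;          /* running byte sum (starts at 1 for a fresh checksum) */
  uint32_t b = (adler >> 16) & 0xffff;  /* running sum of the intermediate 'a' values */
  for (size_t i = 0; i < len; i++) {
    a = (a + buf[i]) % ADLER_BASE;
    b = (b + a) % ADLER_BASE;
  }
  return (b << 16) | a;
}
```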
75 changes: 75 additions & 0 deletions deps/zlib/contrib/optimizations/chunkcopy.h
@@ -21,8 +21,10 @@

#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#define Z_BUILTIN_MEMSET __builtin_memset
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#define Z_BUILTIN_MEMSET zmemset
#endif

#if defined(INFLATE_CHUNK_SIMD_NEON)
@@ -31,6 +33,8 @@ typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include <emmintrin.h>
typedef __m128i z_vec128i_t;
#elif defined(INFLATE_CHUNK_GENERIC)
typedef struct { uint8_t x[16]; } z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif
@@ -265,6 +269,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) {
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  _mm_storeu_si128((__m128i*)out, vec);
}
#elif defined(INFLATE_CHUNK_GENERIC)
/*
* Default implementations for chunk-copy functions rely on memcpy() being
* inlined by the compiler for best performance. This is most likely to work
* as expected when the length argument is constant (as is the case here) and
* the target supports unaligned loads and stores. Since that's not always a
* safe assumption, this may need extra compiler arguments such as
* `-mno-strict-align` or `-munaligned-access`, or the availability of
* extensions like SIMD.
*/

/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
  int64_t in;
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
  int32_t in;
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
  int16_t in;
  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
  z_vec128i_t out;
  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
  }
  return out;
}

/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
  int8_t in = *(const uint8_t*)src;
  z_vec128i_t out;
  Z_BUILTIN_MEMSET(&out, in, sizeof(out));
  return out;
}

/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
  Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
}
#endif
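The generic path relies entirely on these small fixed-size memcpy()/memset() calls being lowered to plain, possibly unaligned, loads and stores, which is why the RISC-V CMake change above pairs INFLATE_CHUNK_GENERIC with the unaligned-scalar-mem feature. A standalone sketch of the same splat-and-store idea outside the zlib type names (illustrative only; `splat64` and `pattern_fill16` are hypothetical helpers, not part of chunkcopy.h):

```c
#include <stdint.h>
#include <string.h>

/* Broadcast a 64-bit pattern into a 16-byte "vector" held in a plain struct,
 * then store it unaligned -- the same trick v_load64_dup()/v_store_128() use. */
typedef struct { uint8_t x[16]; } vec128;

static inline vec128 splat64(const void *src) {
  uint64_t pattern;
  memcpy(&pattern, src, sizeof(pattern));      /* unaligned 64-bit load */
  vec128 v;
  memcpy(v.x, &pattern, 8);                    /* low half  */
  memcpy(v.x + 8, &pattern, 8);                /* high half */
  return v;
}

/* Fill out[0..n) by repeating the 8-byte pattern at src, 16 bytes at a time.
 * Like chunkcopy, it may write up to 15 bytes past n, so the caller must
 * guarantee that much slack in the destination buffer. */
static inline void pattern_fill16(uint8_t *out, const void *src, size_t n) {
  vec128 v = splat64(src);
  for (size_t i = 0; i < n; i += sizeof(v))
    memcpy(out + i, v.x, sizeof(v));           /* unaligned 128-bit store */
}
```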
