From 802ea885efd73572fdd7afcdfd897a4460218cac Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 15 Nov 2021 17:25:24 -0800 Subject: [PATCH] Reduce function size in fast & dfast Take the same approach as in PR #2828 [0] to remove functions that force inline many function bodies and `switch`. Instead, create one function per "template" combination, and then switch between these functions. This allows the compiler to break the large function into many small functions, which generally helps codegen. Also, in the `extDict` modes when there is no ext-dict, call the top level function instead of the force inlined one, to save on code size. I'm specifically doing this because gcc on the parisc architecture doesn't handle the large function body well, and ends up using a lot of excess stack space. Outlining these functions fixes it. --- lib/compress/zstd_double_fast.c | 48 ++++++++++++++++++++++--------- lib/compress/zstd_fast.c | 50 ++++++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 27 deletions(-) diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c index 9d19468cbaf..b9393b6a4e0 100644 --- a/lib/compress/zstd_double_fast.c +++ b/lib/compress/zstd_double_fast.c @@ -468,6 +468,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( return (size_t)(iend - anchor); } +#define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -478,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize); } } @@ -498,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize); } } @@ -540,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( /* if extDict is invalidated due to maxDistance, switch to "regular" variant */ if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_doubleFast_noDict_generic(ms, seqStore, rep, src, srcSize, mls); + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize); /* Search Loop */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */ @@ -653,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( return (size_t)(iend - anchor); } +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -663,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); } } diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c index 2555e072044..87de17f81c1 100644 --- a/lib/compress/zstd_fast.c +++ b/lib/compress/zstd_fast.c @@ -90,7 +90,7 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, * This is also the work we do at the beginning to enter the loop initially. */ FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_generic( +ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, U32 const mls) @@ -310,6 +310,18 @@ ZSTD_compressBlock_fast_generic( goto _start; } +#define ZSTD_GEN_FAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_FAST_FN(noDict, 4) +ZSTD_GEN_FAST_FN(noDict, 5) +ZSTD_GEN_FAST_FN(noDict, 6) +ZSTD_GEN_FAST_FN(noDict, 7) size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -321,13 +333,13 @@ size_t ZSTD_compressBlock_fast( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_noDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_noDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_noDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_noDict_7(ms, seqStore, rep, src, srcSize); } } @@ -479,6 +491,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( return (size_t)(iend - anchor); } + +ZSTD_GEN_FAST_FN(dictMatchState, 4) +ZSTD_GEN_FAST_FN(dictMatchState, 5) +ZSTD_GEN_FAST_FN(dictMatchState, 6) +ZSTD_GEN_FAST_FN(dictMatchState, 7) + size_t ZSTD_compressBlock_fast_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -489,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_dictMatchState_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_dictMatchState_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_dictMatchState_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_dictMatchState_7(ms, seqStore, rep, src, srcSize); } } @@ -530,7 +548,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( /* switch to "regular" variant if extDict is invalidated due to maxDistance */ if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls); + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); /* Search Loop */ while (ip < ilimit) { /* < instead of <=, because (ip+1) */ @@ -603,6 +621,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( return (size_t)(iend - anchor); } +ZSTD_GEN_FAST_FN(extDict, 4) +ZSTD_GEN_FAST_FN(extDict, 5) +ZSTD_GEN_FAST_FN(extDict, 6) +ZSTD_GEN_FAST_FN(extDict, 7) size_t ZSTD_compressBlock_fast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -613,12 +635,12 @@ size_t ZSTD_compressBlock_fast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_extDict_4(ms, seqStore, rep, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_extDict_5(ms, seqStore, rep, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_extDict_6(ms, seqStore, rep, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_extDict_7(ms, seqStore, rep, src, srcSize); } }