From 672dbade910d9b2344af4edc1c3c2452259febe4 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 15 Nov 2021 20:41:53 -0800 Subject: [PATCH] lib: zstd: Remove large inline functions in zstd_lazy.c Backport of upstream PR #2828 [0]. Large functions with excessive force inlining can cause trouble for compilers, and can sometimes take excess stack space because the compiler isn't able to fully analyze the function. This commit splits functions that have multiple copies of the same body into multiple smaller functions, which can help the compiler. This commit isn't strictly necessary, as the reported problems [1] are in zstd_fast.c and zstd_double_fast.c. But, these functions are using the same pattern, so they could also be problematic. And, we already had the fix sitting in our dev branch for our next release, so I figured I'd add it in for consistency. Bloat-o-meter output summary on x86-64 shows we also save 1.5 KB of code size: ``` > ../scripts/bloat-o-meter vmlinux.old vmlinux add/remove: 50/5 grow/shrink: 10/6 up/down: 28810/-30369 (-1559) Total: Before=6418562, After=6417003, chg -0.02% ``` [0] https://github.com/facebook/zstd/pull/2828 [1] https://lkml.org/lkml/2021/11/15/710 Reported-by: Geert Uytterhoeven Signed-off-by: Nick Terrell --- lib/zstd/compress/zstd_lazy.c | 223 ++++++++++++++-------------------- 1 file changed, 89 insertions(+), 134 deletions(-) diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c index fb54d4e28a2bc..1db22db5b1e91 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -392,55 +392,6 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, } -static size_t -ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); - } -} - - -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); - } -} - - -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); - } -} - - - /* ********************************* * Hash Chain ***********************************/ @@ -595,7 +546,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( +size_t ZSTD_HcFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, @@ -783,76 +734,106 @@ size_t ZSTD_HcFindBestMatch_generic ( return ml; } +typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); - } -} +/* + * This struct contains the functions necessary for lazy to search. + * Currently, that is only searchMax. However, it is still valuable to have the + * VTable because this makes it easier to add more functions to the VTable later. + */ +typedef struct { + searchMax_f searchMax; +} ZSTD_LazyVTable; + +#define GEN_ZSTD_BT_VTABLE(dictMode, mls, ...) \ + static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \ + ZSTD_BtFindBestMatch_##dictMode##_##mls \ + }; +#define GEN_ZSTD_HC_VTABLE(dictMode, mls, ...) \ + static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \ + ZSTD_HcFindBestMatch_##dictMode##_##mls \ + }; -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate Binary Tree VTables for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE) +/* Generate Hash Chain VTables for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE) + +#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \ + { \ + &ZSTD_BtVTable_##dictMode##_4, \ + &ZSTD_BtVTable_##dictMode##_5, \ + &ZSTD_BtVTable_##dictMode##_6 \ } -} - -static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch); +#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \ + { \ + &ZSTD_HcVTable_##dictMode##_4, \ + &ZSTD_HcVTable_##dictMode##_5, \ + &ZSTD_HcVTable_##dictMode##_6 \ } -} - -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); +#define GEN_ZSTD_VTABLE_ARRAY(X) \ + { \ + X(noDict), \ + X(extDict), \ + X(dictMatchState), \ + X(dedicatedDictSearch) \ } -} - /* ******************************* * Common parser - lazy strategy *********************************/ typedef enum { search_hashChain, search_binaryTree } searchMethod_e; +static ZSTD_LazyVTable const* ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode) +{ + /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */ + ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY); + ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY); + /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */ + + U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch)); + switch (searchMethod) { + case search_hashChain: + return hcVTables[dictMode][mls - 4]; + case search_binaryTree: + return btVTables[dictMode][mls - 4]; + default: + return NULL; + } +} + FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, @@ -870,36 +851,13 @@ ZSTD_compressBlock_lazy_generic( const U32 prefixLowestIndex = ms->window.dictLimit; const BYTE* const prefixLowest = base + prefixLowestIndex; - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - /* * This table is indexed first by the four ZSTD_dictMode_e values, and then * by the two searchMethod_e values. NULLs are placed for configurations * that should never occur (extDict modes go to the other implementation * below and there is no DDSS for binary tree search yet). */ - const searchMax_f searchFuncs[4][2] = { - { - ZSTD_HcFindBestMatch_selectMLS, - ZSTD_BtFindBestMatch_selectMLS - }, - { - NULL, - NULL - }, - { - ZSTD_HcFindBestMatch_dictMatchState_selectMLS, - ZSTD_BtFindBestMatch_dictMatchState_selectMLS - }, - { - ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, - NULL - } - }; - - searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree]; + searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax; U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; const int isDMS = dictMode == ZSTD_dictMatchState; @@ -1221,10 +1179,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const dictStart = dictBase + ms->window.lowLimit; const U32 windowLog = ms->cParams.windowLog; - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; + searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax; U32 offset_1 = rep[0], offset_2 = rep[1];