Skip to content

Commit

Permalink
lib: zstd: Remove large inline functions in zstd_lazy.c
Browse files Browse the repository at this point in the history
Backport of upstream PR #2828 [0].

Large functions with excessive force inlining can cause trouble for
compilers, and can sometimes take excess stack space because the
compiler isn't able to fully analyze the function. This commit splits
functions that have multiple copies of the same body into multiple
smaller functions, which can help the compiler.

This commit isn't strictly necessary, as the reported problems [1] are
in zstd_fast.c and zstd_double_fast.c. However, the functions in
zstd_lazy.c use the same pattern, so they could also be problematic.
Since the fix was already sitting in our dev branch for our next
release, I've included it here for consistency.

Bloat-o-meter output summary on x86-64 shows we also save 1.5 KB
of code size:

```
> ../scripts/bloat-o-meter vmlinux.old vmlinux
add/remove: 50/5 grow/shrink: 10/6 up/down: 28810/-30369 (-1559)
Total: Before=6418562, After=6417003, chg -0.02%
```

[0] facebook/zstd#2828
[1] https://lkml.org/lkml/2021/11/15/710

Reported-by: Geert Uytterhoeven <[email protected]>
Signed-off-by: Nick Terrell <[email protected]>
  • Loading branch information
terrelln authored and Dark-Matter7232 committed Oct 2, 2022
1 parent d81ed27 commit 672dbad
Showing 1 changed file with 89 additions and 134 deletions.
223 changes: 89 additions & 134 deletions lib/zstd/compress/zstd_lazy.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,55 +392,6 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
}


/*
 * Binary-tree search entry point for the ZSTD_noDict mode.
 * Branches on ms->cParams.minMatch so that ZSTD_BtFindBestMatch() always
 * receives mls as a literal (4, 5 or 6):
 *   - 5            -> mls 5
 *   - 6 and 7      -> mls 6
 *   - 4, 3, others -> mls 4
 * The result of ZSTD_BtFindBestMatch() is returned unchanged.
 */
static size_t
ZSTD_BtFindBestMatch_selectMLS(ZSTD_matchState_t* ms,
                               const BYTE* ip, const BYTE* const iLimit,
                               size_t* offsetPtr)
{
    switch (ms->cParams.minMatch) {
    case 5:
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
    case 6:
    case 7:
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
    case 4:
    default:   /* includes case 3 */
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
    }
}


/*
 * Binary-tree search entry point for the ZSTD_dictMatchState mode.
 * Branches on ms->cParams.minMatch so that ZSTD_BtFindBestMatch() always
 * receives mls as a literal (4, 5 or 6):
 *   - 5            -> mls 5
 *   - 6 and 7      -> mls 6
 *   - 4, 3, others -> mls 4
 * The result of ZSTD_BtFindBestMatch() is returned unchanged.
 */
static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch (ms->cParams.minMatch) {
    case 5:
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
    case 6:
    case 7:
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
    case 4:
    default:   /* includes case 3 */
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
    }
}


/*
 * Binary-tree search entry point for the ZSTD_extDict mode.
 * Branches on ms->cParams.minMatch so that ZSTD_BtFindBestMatch() always
 * receives mls as a literal (4, 5 or 6):
 *   - 5            -> mls 5
 *   - 6 and 7      -> mls 6
 *   - 4, 3, others -> mls 4
 * The result of ZSTD_BtFindBestMatch() is returned unchanged.
 */
static size_t ZSTD_BtFindBestMatch_extDict_selectMLS(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch (ms->cParams.minMatch) {
    case 5:
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
    case 6:
    case 7:
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
    case 4:
    default:   /* includes case 3 */
        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
    }
}



/* *********************************
* Hash Chain
***********************************/
Expand Down Expand Up @@ -595,7 +546,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B

/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
size_t ZSTD_HcFindBestMatch_generic (
size_t ZSTD_HcFindBestMatch(
ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iLimit,
size_t* offsetPtr,
Expand Down Expand Up @@ -783,76 +734,106 @@ size_t ZSTD_HcFindBestMatch_generic (
return ml;
}

/*
 * Function-pointer type for a match-search entry point.
 * Arguments: the match state (ms), two const BYTE pointers (ip, iLimit) and
 * an output pointer (offsetPtr). Returns a size_t.
 * NOTE(review): ip/iLimit are presumably the current input position and the
 * end-of-search bound, and the return value the best match length -- confirm
 * against the *FindBestMatch implementations.
 */
typedef size_t (*searchMax_f)(
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);

/*
 * Hash-chain search entry point for the ZSTD_noDict mode.
 * Branches on ms->cParams.minMatch so that ZSTD_HcFindBestMatch_generic()
 * always receives mls as a literal (4, 5 or 6):
 *   - 5            -> mls 5
 *   - 6 and 7      -> mls 6
 *   - 4, 3, others -> mls 4
 * The result of ZSTD_HcFindBestMatch_generic() is returned unchanged.
 */
FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch (ms->cParams.minMatch) {
    case 5:
        return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
    case 6:
    case 7:
        return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
    case 4:
    default:   /* includes case 3 */
        return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
    }
}
/*
 * This struct contains the functions necessary for lazy to search.
 * Currently, that is only searchMax. However, it is still valuable to have the
 * VTable because this makes it easier to add more functions to the VTable later.
 *
 * Instances are generated by GEN_ZSTD_BT_VTABLE / GEN_ZSTD_HC_VTABLE and
 * selected at run time by ZSTD_selectLazyVTable().
 */
typedef struct {
searchMax_f searchMax; /* search entry point read by the lazy block compressors */
} ZSTD_LazyVTable;

/*
 * Generates, for one (dictMode, mls) pair:
 *  - a static wrapper ZSTD_BtFindBestMatch_<dictMode>_<mls>() that forwards
 *    its arguments to ZSTD_BtFindBestMatch(), passing mls and ZSTD_<dictMode>
 *    as the two trailing arguments;
 *  - a static const ZSTD_LazyVTable named ZSTD_BtVTable_<dictMode>_<mls>
 *    whose single member is initialized with that wrapper.
 * The assert verifies that the wrapper is only reached when
 * ms->cParams.minMatch, clamped to [4, 6], equals mls.
 * The variadic tail (...) is accepted but not referenced in the expansion.
 */
#define GEN_ZSTD_BT_VTABLE(dictMode, mls, ...) \
static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
ZSTD_matchState_t* ms, \
const BYTE* ip, const BYTE* const iLimit, \
size_t* offsetPtr) \
{ \
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
} \
static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
ZSTD_BtFindBestMatch_##dictMode##_##mls \
};

/*
 * Generates, for one (dictMode, mls) pair:
 *  - a static wrapper ZSTD_HcFindBestMatch_<dictMode>_<mls>() that forwards
 *    its arguments to ZSTD_HcFindBestMatch(), passing mls and ZSTD_<dictMode>
 *    as the two trailing arguments;
 *  - a static const ZSTD_LazyVTable named ZSTD_HcVTable_<dictMode>_<mls>
 *    whose single member is initialized with that wrapper.
 * The assert verifies that the wrapper is only reached when
 * ms->cParams.minMatch, clamped to [4, 6], equals mls.
 * The variadic tail (...) is accepted but not referenced in the expansion.
 */
#define GEN_ZSTD_HC_VTABLE(dictMode, mls, ...) \
static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
ZSTD_matchState_t* ms, \
const BYTE* ip, const BYTE* const iLimit, \
size_t* offsetPtr) \
{ \
assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
} \
static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
ZSTD_HcFindBestMatch_##dictMode##_##mls \
};

static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* const iLimit,
size_t* offsetPtr)
{
switch(ms->cParams.minMatch)
{
default : /* includes case 3 */
case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
case 7 :
case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
/* Expands X(dictMode, mls) three times, once for each mls value 4, 5 and 6. */
#define ZSTD_FOR_EACH_MLS(X, dictMode) \
X(dictMode, 4) \
X(dictMode, 5) \
X(dictMode, 6)

/*
 * Expands X(<forwarded arguments>, dictMode) once for each of the four dict
 * modes: noDict, extDict, dictMatchState and dedicatedDictSearch.
 * Used below with X = ZSTD_FOR_EACH_MLS and a generator macro as the
 * forwarded argument, which yields one generator expansion per
 * (dictMode, mls) pair.
 */
#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
X(__VA_ARGS__, noDict) \
X(__VA_ARGS__, extDict) \
X(__VA_ARGS__, dictMatchState) \
X(__VA_ARGS__, dedicatedDictSearch)

/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)

/*
 * Brace initializer holding the addresses of the three binary-tree vtables
 * for dictMode, ordered by mls = 4, 5, 6 (so element i corresponds to
 * mls == i + 4).
 */
#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
{ \
&ZSTD_BtVTable_##dictMode##_4, \
&ZSTD_BtVTable_##dictMode##_5, \
&ZSTD_BtVTable_##dictMode##_6 \
}
}


static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* const iLimit,
size_t* offsetPtr)
{
switch(ms->cParams.minMatch)
{
default : /* includes case 3 */
case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
case 7 :
case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
/*
 * Brace initializer holding the addresses of the three hash-chain vtables
 * for dictMode, ordered by mls = 4, 5, 6 (so element i corresponds to
 * mls == i + 4).
 */
#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
{ \
&ZSTD_HcVTable_##dictMode##_4, \
&ZSTD_HcVTable_##dictMode##_5, \
&ZSTD_HcVTable_##dictMode##_6 \
}
}


FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* const iLimit,
size_t* offsetPtr)
{
switch(ms->cParams.minMatch)
{
default : /* includes case 3 */
case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
case 7 :
case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
/*
 * Brace initializer that applies X to each dict mode, in the order noDict,
 * extDict, dictMatchState, dedicatedDictSearch. X is expected to expand to a
 * per-mode initializer (e.g. GEN_ZSTD_HC_VTABLE_ARRAY).
 * NOTE(review): the resulting array is indexed by a ZSTD_dictMode_e value, so
 * this order presumably mirrors the enum's numbering -- confirm against the
 * enum definition.
 */
#define GEN_ZSTD_VTABLE_ARRAY(X) \
{ \
X(noDict), \
X(extDict), \
X(dictMatchState), \
X(dedicatedDictSearch) \
}
}


/* *******************************
* Common parser - lazy strategy
*********************************/
typedef enum { search_hashChain, search_binaryTree } searchMethod_e;

/*
 * Picks the vtable matching (searchMethod, dictMode, minMatch).
 *
 * ms           : only ms->cParams.minMatch is read; it is clamped to [4, 6].
 * searchMethod : search_hashChain or search_binaryTree.
 * dictMode     : used directly as the first index into a 4-entry table.
 *
 * Returns the generated ZSTD_HcVTable_* / ZSTD_BtVTable_* entry for the
 * combination, or NULL when searchMethod is neither of the two values above.
 */
static ZSTD_LazyVTable const* ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
{
    /* One row per dictMode, one column per mls in {4, 5, 6}. */
    ZSTD_LazyVTable const* const hashChainTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
    ZSTD_LazyVTable const* const binaryTreeTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);

    U32 const clampedMls = MAX(4, MIN(6, ms->cParams.minMatch));
    U32 const column = clampedMls - 4;   /* mls 4..6 -> column 0..2 */

    if (searchMethod == search_hashChain) {
        return hashChainTables[dictMode][column];
    }
    if (searchMethod == search_binaryTree) {
        return binaryTreeTables[dictMode][column];
    }
    return NULL;
}

FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore,
Expand All @@ -870,36 +851,13 @@ ZSTD_compressBlock_lazy_generic(
const U32 prefixLowestIndex = ms->window.dictLimit;
const BYTE* const prefixLowest = base + prefixLowestIndex;

typedef size_t (*searchMax_f)(
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);

/*
* This table is indexed first by the four ZSTD_dictMode_e values, and then
* by the two searchMethod_e values. NULLs are placed for configurations
* that should never occur (extDict modes go to the other implementation
* below and there is no DDSS for binary tree search yet).
*/
const searchMax_f searchFuncs[4][2] = {
{
ZSTD_HcFindBestMatch_selectMLS,
ZSTD_BtFindBestMatch_selectMLS
},
{
NULL,
NULL
},
{
ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
ZSTD_BtFindBestMatch_dictMatchState_selectMLS
},
{
ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
NULL
}
};

searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

const int isDMS = dictMode == ZSTD_dictMatchState;
Expand Down Expand Up @@ -1221,10 +1179,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const BYTE* const dictStart = dictBase + ms->window.lowLimit;
const U32 windowLog = ms->cParams.windowLog;

typedef size_t (*searchMax_f)(
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;

U32 offset_1 = rep[0], offset_2 = rep[1];

Expand Down

0 comments on commit 672dbad

Please sign in to comment.