Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LoongArch SIMD. #3587

Merged
merged 3 commits into from
Dec 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions codec/common/loongarch/intra_pred_com_lasx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*!
**********************************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Lu Wang <[email protected]>
*
* \copy
* Copyright (c) 2009-2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* \file intra_pred_com_lasx.c
*
* \brief Loongson optimization
*
* \date 13/10/2022 Created
*
**********************************************************************************
*/

#include <stdint.h>
#include "loongson_intrinsics.h"

void WelsIChromaPredV_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
__m256i vec_kuiSrc64 = __lasx_xvldrepl_d (pRef - kiStride, 0);

__lasx_xvst(vec_kuiSrc64, pPred, 0);
__lasx_xvst(vec_kuiSrc64, pPred, 32);
}

void WelsIChromaPredH_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
__m256i vec_kuiSrc0, vec_kuiSrc1;
int32_t iStride_x2 = (kiStride << 1);
int32_t iStride_x3 = (kiStride << 1) + kiStride;
int32_t iStride_x4 = (kiStride << 2);

pRef -= 1;
vec_kuiSrc0 = __lasx_xvldrepl_b(pRef + kiStride, 0);
vec_kuiSrc1 = __lasx_xvldrepl_b(pRef, 0);
vec_kuiSrc0 = __lasx_xvilvl_d(vec_kuiSrc0, vec_kuiSrc1);
__lasx_xvst(vec_kuiSrc0, pPred, 0);

vec_kuiSrc0 = __lasx_xvldrepl_b(pRef + iStride_x3, 0);
vec_kuiSrc1 = __lasx_xvldrepl_b(pRef + iStride_x2, 0);
vec_kuiSrc0 = __lasx_xvilvl_d(vec_kuiSrc0, vec_kuiSrc1);
__lasx_xvst(vec_kuiSrc0, pPred, 16);

pRef += iStride_x4;
vec_kuiSrc0 = __lasx_xvldrepl_b(pRef + kiStride, 0);
vec_kuiSrc1 = __lasx_xvldrepl_b(pRef, 0);
vec_kuiSrc0 = __lasx_xvilvl_d(vec_kuiSrc0, vec_kuiSrc1);
__lasx_xvst(vec_kuiSrc0, pPred, 32);

vec_kuiSrc0 = __lasx_xvldrepl_b(pRef + iStride_x3, 0);
vec_kuiSrc1 = __lasx_xvldrepl_b(pRef + iStride_x2, 0);
vec_kuiSrc0 = __lasx_xvilvl_d(vec_kuiSrc0, vec_kuiSrc1);
__lasx_xvst(vec_kuiSrc0, pPred, 48);
}

void WelsIChromaPredDc_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride) {
const int32_t kuiL1 = kiStride - 1;
const int32_t kuiL2 = kuiL1 + kiStride;
const int32_t kuiL3 = kuiL2 + kiStride;
const int32_t kuiL4 = kuiL3 + kiStride;
const int32_t kuiL5 = kuiL4 + kiStride;
const int32_t kuiL6 = kuiL5 + kiStride;
const int32_t kuiL7 = kuiL6 + kiStride;
/*caculate the iMean value*/
const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1 - kiStride] + pRef[2 - kiStride] +
pRef[3 - kiStride] + pRef[-1] + pRef[kuiL1] + pRef[kuiL2] +
pRef[kuiL3] + 4) >> 3;
const uint32_t kuiSum2 = pRef[4 - kiStride] + pRef[5 - kiStride] + pRef[6 - kiStride]
+ pRef[7 - kiStride];
const uint32_t kuiSum3 = pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7];
const uint8_t kuiMean2 = (kuiSum2 + 2) >> 2;
const uint8_t kuiMean3 = (kuiSum3 + 2) >> 2;
const uint8_t kuiMean4 = (kuiSum2 + kuiSum3 + 4) >> 3;

const uint8_t kuiTopMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2,
kuiMean2, kuiMean2, kuiMean2};
const uint8_t kuiBottomMean[8] = {kuiMean3, kuiMean3, kuiMean3, kuiMean3, kuiMean4,
kuiMean4, kuiMean4, kuiMean4};

__m256i vec_kuiTopMean64 = __lasx_xvldrepl_d(kuiTopMean, 0);
__m256i vec_kuiBottomMean64 = __lasx_xvldrepl_d(kuiBottomMean, 0);

__lasx_xvst(vec_kuiTopMean64, pPred, 0);
__lasx_xvst(vec_kuiBottomMean64, pPred, 32);
}
1 change: 1 addition & 0 deletions codec/common/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ elif cpu_family in ['loongarch32', 'loongarch64']
'loongarch/copy_mb_lsx.c',
'loongarch/deblock_lsx.c',
'loongarch/intra_pred_com_lsx.c',
'loongarch/intra_pred_com_lasx.c',
'loongarch/mc_chroma_lsx.c',
'loongarch/mc_horver_lsx.c',
'loongarch/satd_sad_lasx.c',
Expand Down
1 change: 1 addition & 0 deletions codec/common/targets.mk
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ COMMON_OBJSLOONGARCH_LSX += $(COMMON_ASM_LOONGARCH_LSX_SRCS:.c=.$(OBJ))

COMMON_ASM_LOONGARCH_LASX_SRCS=\
$(COMMON_SRCDIR)/loongarch/satd_sad_lasx.c\
$(COMMON_SRCDIR)/loongarch/intra_pred_com_lasx.c\

COMMON_OBJSLOONGARCH_LASX += $(COMMON_ASM_LOONGARCH_LASX_SRCS:.c=.$(OBJ))
ifeq ($(ASM_ARCH), loongarch)
Expand Down
7 changes: 7 additions & 0 deletions codec/encoder/core/inc/get_intra_predictor.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,13 @@ void WelsIChromaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiSt
#if defined(HAVE_LSX)
void WelsI16x16LumaPredPlane_lsx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
#endif//HAVE_LSX

#if defined(HAVE_LASX)
void WelsIChromaPredV_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsIChromaPredH_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsIChromaPredDc_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
#endif//HAVE_LASX

#if defined(__cplusplus)
}
#endif//__cplusplus
Expand Down
2 changes: 2 additions & 0 deletions codec/encoder/core/inc/sample.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ int32_t WelsSampleSatd8x8_lasx (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd16x8_lasx (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd8x16_lasx (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSatd16x16_lasx (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsIntra8x8Combined3Sad_lasx (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*, uint8_t*,
uint8_t*);
#endif

#if defined(__cplusplus)
Expand Down
10 changes: 10 additions & 0 deletions codec/encoder/core/inc/svc_motion_estimate.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,16 @@ void SumOf16x16BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t ki
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
}
#endif

#ifdef HAVE_LSX
extern "C"
{
int32_t SumOf8x8SingleBlock_lsx (uint8_t* pRef, const int32_t kiRefStride);
void SumOf8x8BlockOfFrame_lsx (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
const int32_t kiRefStride, uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
}
#endif

int32_t RequestScreenBlockFeatureStorage (CMemoryAlign* pMa, const int32_t kiFrameWidth, const int32_t kiFrameHeight,
const int32_t iNeedFeatureStorage,
SScreenBlockFeatureStorage* pScreenBlockFeatureStorage);
Expand Down
90 changes: 90 additions & 0 deletions codec/encoder/core/loongarch/sample_lasx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*!
**********************************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Lu Wang <[email protected]>
*
* \copy
* Copyright (c) 2009-2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* \file sample_lasx.c
*
* \brief Loongson optimization
*
* \date 13/10/2022 Created
*
**********************************************************************************
*/

#include <stdint.h>
#include "sad_common.h"
#include "loongson_intrinsics.h"

void WelsIChromaPredV_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsIChromaPredH_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsIChromaPredDc_lasx (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);

int32_t WelsIntra8x8Combined3Sad_lasx (uint8_t* pDecCb, int32_t iDecStride,
uint8_t* pEncCb, int32_t iEncStride,
int32_t* pBestMode, int32_t iLambda,
uint8_t* pDstChroma, uint8_t* pDecCr,
uint8_t* pEncCr) {
int32_t iBestMode = -1;
int32_t iCurCost, iBestCost = INT_MAX;

WelsIChromaPredV_lasx (pDstChroma, pDecCb, iDecStride);
WelsIChromaPredV_lasx (pDstChroma + 64, pDecCr, iDecStride);
iCurCost = WelsSampleSad8x8_lasx(pDstChroma, 8, pEncCb, iEncStride);
iCurCost += WelsSampleSad8x8_lasx(pDstChroma + 64, 8, pEncCr, iEncStride) + iLambda * 2;

if (iCurCost < iBestCost) {
iBestMode = 2;
iBestCost = iCurCost;
}

WelsIChromaPredH_lasx(pDstChroma, pDecCb, iDecStride);
WelsIChromaPredH_lasx(pDstChroma + 64, pDecCr, iDecStride);
iCurCost = WelsSampleSad8x8_lasx(pDstChroma, 8, pEncCb, iEncStride);
iCurCost += WelsSampleSad8x8_lasx(pDstChroma + 64, 8, pEncCr, iEncStride) + iLambda * 2;
if (iCurCost < iBestCost) {
iBestMode = 1;
iBestCost = iCurCost;
}

WelsIChromaPredDc_lasx(pDstChroma, pDecCb, iDecStride);
WelsIChromaPredDc_lasx(pDstChroma + 64, pDecCr, iDecStride);
iCurCost = WelsSampleSad8x8_lasx(pDstChroma, 8, pEncCb, iEncStride);
iCurCost += WelsSampleSad8x8_lasx(pDstChroma + 64, 8, pEncCr, iEncStride);
if (iCurCost < iBestCost) {
iBestMode = 0;
iBestCost = iCurCost;
}
*pBestMode = iBestMode;

return iBestCost;
}
104 changes: 104 additions & 0 deletions codec/encoder/core/loongarch/svc_motion_estimate_lsx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*!
**********************************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Lu Wang <[email protected]>
*
* \copy
* Copyright (c) 2009-2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* \file svc_motion_estimate_lsx.c
*
* \brief Loongson optimization
*
* \date 13/10/2022 Created
*
**********************************************************************************
*/

#include <stdint.h>
#include "loongson_intrinsics.h"

int32_t SumOf8x8SingleBlock_lsx (uint8_t* pRef, const int32_t kiRefStride) {
__m128i vec_pRef0, vec_pRef1, vec_pRef2, vec_pRef3;
__m128i vec_pRef4, vec_pRef5, vec_pRef6, vec_pRef7;

int32_t iSum;
int32_t kiRefStride_x2 = kiRefStride << 1;
int32_t kiRefStride_x3 = kiRefStride_x2 + kiRefStride;
int32_t kiRefStride_x4 = kiRefStride << 2;

vec_pRef0 = __lsx_vld(pRef, 0);
vec_pRef1 = __lsx_vldx(pRef, kiRefStride);
vec_pRef2 = __lsx_vldx(pRef, kiRefStride_x2);
vec_pRef3 = __lsx_vldx(pRef, kiRefStride_x3);
pRef += kiRefStride_x4;
vec_pRef4 = __lsx_vld(pRef, 0);
vec_pRef5 = __lsx_vldx(pRef, kiRefStride);
vec_pRef6 = __lsx_vldx(pRef, kiRefStride_x2);
vec_pRef7 = __lsx_vldx(pRef, kiRefStride_x3);

vec_pRef0 = __lsx_vilvl_d(vec_pRef1, vec_pRef0);
vec_pRef2 = __lsx_vilvl_d(vec_pRef3, vec_pRef2);
vec_pRef4 = __lsx_vilvl_d(vec_pRef5, vec_pRef4);
vec_pRef6 = __lsx_vilvl_d(vec_pRef7, vec_pRef6);

vec_pRef0 = __lsx_vhaddw_hu_bu(vec_pRef0, vec_pRef0);
vec_pRef2 = __lsx_vhaddw_hu_bu(vec_pRef2, vec_pRef2);
vec_pRef4 = __lsx_vhaddw_hu_bu(vec_pRef4, vec_pRef4);
vec_pRef6 = __lsx_vhaddw_hu_bu(vec_pRef6, vec_pRef6);

vec_pRef0 = __lsx_vadd_h(vec_pRef0, vec_pRef2);
vec_pRef4 = __lsx_vadd_h(vec_pRef4, vec_pRef6);
vec_pRef0 = __lsx_vadd_h(vec_pRef0, vec_pRef4);
vec_pRef1 = __lsx_vhaddw_wu_hu(vec_pRef0, vec_pRef0);
vec_pRef2 = __lsx_vhaddw_du_wu(vec_pRef1, vec_pRef1);
vec_pRef0 = __lsx_vhaddw_qu_du(vec_pRef2, vec_pRef2);

iSum = __lsx_vpickve2gr_w(vec_pRef0, 0);
return iSum;
}

void SumOf8x8BlockOfFrame_lsx(uint8_t* pRefPicture, const int32_t kiWidth,
const int32_t kiHeight, const int32_t kiRefStride,
uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]) {
int32_t x, y;
uint8_t* pRef;
uint16_t* pBuffer;
int32_t iSum;
for (y = 0; y < kiHeight; y++) {
pRef = pRefPicture + kiRefStride * y;
pBuffer = pFeatureOfBlock + kiWidth * y;
for (x = 0; x < kiWidth; x++) {
iSum = SumOf8x8SingleBlock_lsx(pRef + x, kiRefStride);

pBuffer[x] = iSum;
pTimesOfFeatureValue[iSum]++;
}
}
}
8 changes: 8 additions & 0 deletions codec/encoder/core/src/get_intra_predictor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -742,5 +742,13 @@ void WelsInitIntraPredFuncs (SWelsFuncPtrList* pFuncList, const uint32_t kuiCpuF
pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_lsx;
}
#endif//HAVE_LSX

#if defined(HAVE_LASX)
if (kuiCpuFlag & WELS_CPU_LASX) {
pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_lasx;
pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_lasx;
pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_lasx;
}
#endif//HAVE_LASX
}
}
2 changes: 2 additions & 0 deletions codec/encoder/core/src/sample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,8 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_lasx;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_lasx;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_lasx;

pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad = WelsIntra8x8Combined3Sad_lasx;
}
#endif
}
Expand Down
Loading