diff --git a/.github/workflows/cmake-darwin.yml b/.github/workflows/cmake-darwin.yml
new file mode 100644
index 0000000..20a7c4d
--- /dev/null
+++ b/.github/workflows/cmake-darwin.yml
@@ -0,0 +1,24 @@
+name: macOS
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+env:
+  BUILD_TYPE: Release
+
+jobs:
+  build:
+    runs-on: macos-14
+    steps:
+    - uses: actions/checkout@v3
+    - name: Install openmp
+      run: brew install libomp
+    - name: Configure CMake
+      run: OpenMP_ROOT=$(brew --prefix)/opt/libomp cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+    - name: Build
+      run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
+    - name: Test
+      run: ctest --verbose -C ${{env.BUILD_TYPE}} --test-dir ${{github.workspace}}/build/src/libllm
diff --git a/.github/workflows/cmake-windows.yml b/.github/workflows/cmake-windows.yml
index 263e1f1..0bad8c7 100644
--- a/.github/workflows/cmake-windows.yml
+++ b/.github/workflows/cmake-windows.yml
@@ -19,4 +19,4 @@ jobs:
     - name: Build
       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
     - name: Test
-      run: ctest --verbose -C ${{env.BUILD_TYPE}} --test-dir ${{github.workspace}}/build/src/libllm
+      run: ${{github.workspace}}\build\src\libllm\${{env.BUILD_TYPE}}\unittest.exe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 046d931..b72012a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,12 +40,27 @@ set(MKL_PREFIX "/opt/intel/mkl" CACHE STRING "Prefix for MKL headers and librari
 #add_link_options(-fsanitize=address)
 
 if(WIN32)
-  add_definitions( "/D_CRT_SECURE_NO_WARNINGS /DCATCH_AMALGAMATED_CUSTOM_MAIN /DCATCH_CONFIG_PREFIX_ALL" )
+  add_compile_definitions(_CRT_SECURE_NO_WARNINGS CATCH_AMALGAMATED_CUSTOM_MAIN CATCH_CONFIG_PREFIX_ALL)
 endif(WIN32)
 if(UNIX)
-  add_definitions( "-DCATCH_AMALGAMATED_CUSTOM_MAIN -DCATCH_CONFIG_PREFIX_ALL" )
-  set(CMAKE_CXX_FLAGS "-O3 -g")
-  set(CMAKE_C_FLAGS "-O3 -g")
+  # zip_file.cc picks fseeko when _FILE_OFFSET_BITS is 64.
+  add_compile_definitions(CATCH_AMALGAMATED_CUSTOM_MAIN CATCH_CONFIG_PREFIX_ALL _FILE_OFFSET_BITS=64)
+  # Append so that flags provided by the user or a toolchain file are kept.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -g")
 endif(UNIX)
 
+# Pick the kernel set from the target CPU; CMAKE_SYSTEM_PROCESSOR equals the host
+# processor unless the build is a cross-compile.
+message(STATUS "CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}")
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
+  add_compile_definitions(LIBLLM_ARCH_X86_64)
+  set(LIBLLM_KERNEL_X86_64 ON)
+endif()
+# Windows spells 64-bit Arm as upper-case "ARM64".
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)")
+  add_compile_definitions(LIBLLM_ARCH_AARCH64)
+endif()
+
 add_subdirectory("src/libllm")
diff --git a/src/libllm/CMakeLists.txt b/src/libllm/CMakeLists.txt
index f594ee4..c5f0190 100644
--- a/src/libllm/CMakeLists.txt
+++ b/src/libllm/CMakeLists.txt
@@ -20,8 +20,6 @@ set(lut_SOURCES
     "lut/zip_file.cc")
 
 set(libllm_SOURCES
-    "cpu/kernel/kernel_avx2.cc"
-    "cpu/kernel/kernel_avx512.cc"
     "cpu/kernel/kernel_fallback.cc"
     "cpu/kernel/kernel.cc"
     "cpu/kernel/util.cc"
@@ -80,7 +78,7 @@ set(llm_SOURCES
     "dialog_manager.cc"
     "llm_main.cc")
 
-set(libllm_INCDIR ".." "../../third_party")
+set(libllm_INCDIR ".." "../../third_party" ${OpenMP_CXX_INCLUDE_DIRS})
 
 if (WITH_CUDA)
     set(libllm_INCDIR ${libllm_INCDIR} ${CUDAToolkit_INCLUDE_DIRS})
@@ -118,38 +116,60 @@ if (WITH_CUDA)
         "lut/internal/log.cc")
 endif()
 
+# OS specific code
 if(WIN32)
-    set_source_files_properties(
-        "cpu/kernel/kernel_avx512.cc"
-        PROPERTIES COMPILE_FLAGS /arch:AVX512)
-    set_source_files_properties(
-        "cpu/kernel/kernel_avx2.cc"
-        PROPERTIES COMPILE_FLAGS /arch:AVX2)
     set(libllm_SOURCES
         ${libllm_SOURCES}
         "lut/path_windows.cc"
         "lut/platform_windows.cc"
         "lut/shared_library_windows.cc")
-endif(WIN32)
-
+endif()
 if(UNIX)
-    set_source_files_properties(
-        "cpu/kernel/kernel_avx512.cc"
-        PROPERTIES COMPILE_FLAGS "-mavx512f")
-    set_source_files_properties(
-        "cpu/kernel/kernel_avx2.cc"
-        PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
     set(libllm_SOURCES
         ${libllm_SOURCES}
-        "lut/path_linux.cc"
         "lut/platform_linux.cc"
         "lut/shared_library_linux.cc")
-endif(UNIX)
+endif()
+if(UNIX AND APPLE)
+    set(libllm_SOURCES
+        ${libllm_SOURCES}
+        "lut/path_darwin.cc")
+endif()
+if(UNIX AND NOT APPLE)
+    set(libllm_SOURCES
+        ${libllm_SOURCES}
+        "lut/path_linux.cc")
+endif()
+
+# CPU specific code
+if(LIBLLM_KERNEL_X86_64)
+    set(libllm_SOURCES
+        ${libllm_SOURCES}
+        "cpu/kernel/kernel_avx2.cc"
+        "cpu/kernel/kernel_avx512.cc")
+    # The ISA flags are attached to these two files only, so no other source file is
+    # compiled with them.
+    if(WIN32)
+        set_source_files_properties(
+            "cpu/kernel/kernel_avx512.cc"
+            PROPERTIES COMPILE_FLAGS /arch:AVX512)
+        set_source_files_properties(
+            "cpu/kernel/kernel_avx2.cc"
+            PROPERTIES COMPILE_FLAGS /arch:AVX2)
+    endif()
+    if(UNIX)
+        set_source_files_properties(
+            "cpu/kernel/kernel_avx512.cc"
+            PROPERTIES COMPILE_FLAGS "-mavx512f")
+        set_source_files_properties(
+            "cpu/kernel/kernel_avx2.cc"
+            PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mf16c")
+    endif()
+endif()
 
 add_library(lut STATIC ${lut_SOURCES})
 set_target_properties(lut PROPERTIES CXX_VISIBILITY_PRESET hidden)
-target_include_directories(lut PRIVATE ".." "../../third_party/")
-
+target_include_directories(lut PRIVATE ${libllm_INCDIR})
 
 set(libllm_LIBADD
     lut
@@ -165,14 +185,14 @@ target_include_directories(libllm_static PRIVATE ${libllm_INCDIR})
 add_library(libllm SHARED $)
 target_link_libraries(libllm ${libllm_LIBADD} )
 set_property(TARGET libllm PROPERTY OUTPUT_NAME llm)
-if(UNIX)
+if(UNIX AND NOT APPLE)
     target_link_options(libllm PUBLIC "-Wl,--no-undefined")
-endif(UNIX)
+endif()
 
 add_library(catch2 STATIC "../../third_party/catch2/catch_amalgamated.cpp")
 add_executable(unittest ${unittest_SOURCES})
-target_include_directories(unittest PRIVATE .. "../../third_party/")
-target_link_libraries(unittest libllm_static lut catch2)
+target_include_directories(unittest PRIVATE ${libllm_INCDIR})
+target_link_libraries(unittest libllm_static lut catch2 OpenMP::OpenMP_CXX)
 
 add_executable(llm ${llm_SOURCES})
 target_include_directories(llm PRIVATE ..)
@@ -180,7 +200,7 @@ target_link_libraries(llm libllm lut)
 
 if (WITH_CUDA)
     add_library(llmextcublas SHARED ${llmextcublas_SOURCES})
-    target_include_directories(llmextcublas PRIVATE .. "../../third_party/")
+    target_include_directories(llmextcublas PRIVATE ${libllm_INCDIR})
     target_link_libraries(llmextcublas lut CUDA::cublas)
     if(UNIX)
         target_link_options(llmextcublas PUBLIC "-Wl,--no-undefined")
diff --git a/src/libllm/cpu/kernel/kernel.cc b/src/libllm/cpu/kernel/kernel.cc
index c60eeb3..6617948 100644
--- a/src/libllm/cpu/kernel/kernel.cc
+++ b/src/libllm/cpu/kernel/kernel.cc
@@ -40,7 +40,8 @@ namespace kernel {
 enum class CPUMathBackend {
   DEFAULT,
   AVX2,
-  AVX512
+  AVX512,
+  ASIMDHP
 };
 
 CPUMathBackend findBestCpuMathBackend() {
@@ -49,19 +50,30 @@ CPUMathBackend findBestCpuMathBackend() {
   bool isaAvx512f = ruapu_supports("avx512f") > 0;
   bool isaF16c = ruapu_supports("f16c") > 0;
 
+#ifdef LIBLLM_ARCH_X86_64
   LOG(INFO) << lut::sprintf(
       "ISA support: AVX2=%d F16C=%d AVX512F=%d", isaAvx2, isaF16c, isaAvx512f);
+#endif // LIBLLM_ARCH_X86_64
 
+#ifdef LIBLLM_ARCH_X86_64
   if (isaAvx512f && isaF16c) {
     LOG(INFO) << "Use Avx512 backend.";
     return CPUMathBackend::AVX512;
-  } else if (isaAvx2 && isaF16c) {
+  }
+
+  if (isaAvx2 && isaF16c) {
     LOG(INFO) << "Use Avx2 backend.";
     return CPUMathBackend::AVX2;
-  } else {
-    LOG(FATAL) << "CPU not supported (AVX2 and F16C is required).";
-    NOT_IMPL();
   }
+#endif // LIBLLM_ARCH_X86_64
+
+#ifdef LIBLLM_ARCH_AARCH64
+  LOG(INFO) << "Use default backend.";
+  return CPUMathBackend::DEFAULT;
+#endif // LIBLLM_ARCH_AARCH64
+
+  LOG(FATAL) << "CPU not supported.";
+  NOT_IMPL();
 }
 
 // instance of Api.
@@ -106,6 +118,7 @@ void Api::init() {
   _instance = new Api();
 
   switch (findBestCpuMathBackend()) {
+#ifdef LIBLLM_ARCH_X86_64
     case CPUMathBackend::AVX512:
       _instance->_sgemm = std::make_unique();
       _instance->_sgemmOmp = std::make_unique();
@@ -120,6 +133,7 @@ void Api::init() {
       _instance->_q4dequant = std::make_unique();
       _instance->_cvtHalfToFloat = std::make_unique();
       break;
+#endif // LIBLLM_ARCH_X86_64
     case CPUMathBackend::DEFAULT:
       _instance->_sgemm = std::make_unique();
       _instance->_sgemmOmp = std::make_unique();
diff --git a/src/libllm/cpu/kernel/kernel_fallback.cc b/src/libllm/cpu/kernel/kernel_fallback.cc
index 5ea1158..891c9aa 100644
--- a/src/libllm/cpu/kernel/kernel_fallback.cc
+++ b/src/libllm/cpu/kernel/kernel_fallback.cc
@@ -105,12 +105,33 @@ void SAxpyFallbackKernel::apply(int64_t n, float a, PCFp32 x, PFp32 y) {
   const float *px = x;
   float *py = y;
   for (int i = 0; i < n; ++i) {
-    *py = a * *px;
+    *py += a * *px;
     ++px;
     ++py;
   }
 }
 
+// Adds args.x[column] times the N floats of A that start at offset column * lda to y.
+void SAxpyFallbackKernel::applyColumn(const SGEMVArgs &args, int column, float *y) {
+  // Widen first so the offset is computed in 64 bits even if lda is a 32-bit int.
+  apply(args.N, args.x[column], args.A + static_cast<int64_t>(column) * args.lda, y);
+}
+
+// Returns the dot product of the first n elements of x and y.
+float SDotFallbackKernel::apply(int64_t n, const float *x, const float *y) {
+  float sum = 0;
+  for (int64_t i = 0; i < n; ++i) {
+    sum += x[i] * y[i];
+  }
+
+  return sum;
+}
+
+// Returns the dot product of args.x with the N floats of A that start at offset row * lda.
+float SDotFallbackKernel::applyRow(const SGEMVArgs &args, int row) {
+  return apply(args.N, args.A + static_cast<int64_t>(row) * args.lda, args.x);
+}
+
 void CvtHalfToFloatFallbackKernel::apply(int64_t n, PCFp16 x, PFp32 y) {
   for (int i = 0; i < n; ++i) {
     y[i] = lut::cvtsh_ss(x[i]);
diff --git a/src/libllm/cpu/kernel/sgemv.h b/src/libllm/cpu/kernel/sgemv.h
index b527923..86e5dd7 100644
--- a/src/libllm/cpu/kernel/sgemv.h
+++ b/src/libllm/cpu/kernel/sgemv.h
@@ -48,10 +48,10 @@ class SGEMVImpl : public SGEMV {
 
 typedef SGEMVImpl SGEMVImplAvx512;
 typedef SGEMVImpl SGEMVImplAvx2;
-typedef SGEMVImpl SGEMVImplDefault;
+typedef SGEMVImpl SGEMVImplDefault;
 typedef SGEMVImpl SGEMVImplAvx512OMP;
 typedef SGEMVImpl SGEMVImplAvx2OMP;
-typedef SGEMVImpl SGEMVImplDefaultOMP;
+typedef SGEMVImpl SGEMVImplDefaultOMP;
 
 } // namespace kernel
 } // namespace cpu
diff --git a/src/libllm/cpu/kernel/skernel.h b/src/libllm/cpu/kernel/skernel.h
index 3e9d7e8..b4964d0 100644
--- a/src/libllm/cpu/kernel/skernel.h
+++ b/src/libllm/cpu/kernel/skernel.h
@@ -83,6 +83,13 @@ struct SDotAvx2Kernel {
   static float applyRow(const SGEMVArgs &args, int row);
 };
 
+struct SDotFallbackKernel {
+  typedef float ValueType;
+
+  static float apply(int64_t n, const float *x, const float *y);
+  static float applyRow(const SGEMVArgs &args, int row);
+};
+
 } // namespace kernel
 } // namespace cpu
 } // namespace op
diff --git a/src/libllm/cpu/kernel/test.cc b/src/libllm/cpu/kernel/test.cc
index 1ce7b3d..68291c0 100644
--- a/src/libllm/cpu/kernel/test.cc
+++ b/src/libllm/cpu/kernel/test.cc
@@ -164,6 +164,9 @@ void refSgemm(
   }
 }
 
+
+#ifdef LIBLLM_ARCH_X86_64
+
 CATCH_TEST_CASE("test q4 dequantization", "[lymath][dequant][q4]") {
   constexpr int DIM = DequantMinElemPerThread * 2 + GroupSizeQ4;
 
@@ -269,6 +272,7 @@ CATCH_TEST_CASE("test q4 dot kernels apply row", "[lymath][dot][q4]") {
   float a = DotQ4Avx2Kernel::apply(NUM_COL * 2, x2.data(), {A.data(), scaleA.data(), zeroA.data()}, 0);
   CATCH_REQUIRE(isClose(a, a0 + a1));
 }
+#endif // LIBLLM_ARCH_X86_64
 
 CATCH_TEST_CASE("test lymath_q4gemm", "[lymath][api][q4]") {
   testGemmQ4(true, 1, 32, 128);
@@ -359,7 +363,7 @@ void testHalfToFloat(int n) {
   random.fill(lut::makeSpan(yr));
   std::transform(yr.begin(), yr.end(), x.begin(), lut::cvtss_sh);
 
-  CvtHalfToFloatAvx2OMP().apply(n, x.data(), y.data());
+  convertHalfToFloat(n, x.data(), y.data());
   CATCH_REQUIRE(isClose(yr, y, 1e-4, 1e-3));
 }
 
diff --git a/src/libllm/dtype.cc b/src/libllm/dtype.cc
index 38cf9de..ce1ad59 100644
--- a/src/libllm/dtype.cc
+++ b/src/libllm/dtype.cc
@@ -70,14 +70,6 @@ DType DType::getTypeImpl() {
 
 #endif
 
-template DType DType::getTypeImpl();
-template DType DType::getTypeImpl();
-template DType DType::getTypeImpl();
-template DType DType::getTypeImpl();
-template DType DType::getTypeImpl();
-template DType DType::getTypeImpl();
-
-
 int64_t DType::getTotalSize(int64_t numel) const {
   switch (_dtype) {
     case DType::kFloat:
diff --git a/src/libllm/lut/path_darwin.cc b/src/libllm/lut/path_darwin.cc
new file mode 100644
index 0000000..9d1b416
--- /dev/null
+++ b/src/libllm/lut/path_darwin.cc
@@ -0,0 +1,67 @@
+// The MIT License (MIT)
+//
+// Copyright (c) 2023 Xiaoyang Chen
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+// and associated documentation files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include "libllm/lut/path.h"
+
+#include <dlfcn.h>
+#include <limits.h>
+#include <mach-o/dyld.h>
+#include "libllm/lut/error.h"
+#include "libllm/lut/log.h"
+#include "libllm/lut/strings.h"
+
+namespace lut {
+
+// Returns the path that _NSGetExecutablePath() reports for the running executable.
+// Throws lut::AbortedError when the call returns non-zero.
+Path Path::currentExecutablePath() {
+  char path[PATH_MAX + 1];
+  uint32_t size = sizeof(path);
+  int ret = _NSGetExecutablePath(path, &size);
+  if (ret) {
+    throw lut::AbortedError("failed to call _NSGetExecutablePath()");
+  }
+
+  return Path(path);
+}
+
+// Returns the file name dladdr() reports for the image that contains this function.
+Path Path::currentModulePath() {
+  Dl_info info;
+  int success = dladdr(reinterpret_cast<const void *>(&currentModulePath), &info);
+  CHECK(success);
+
+  return Path(info.dli_fname);
+}
+
+// A path is absolute exactly when it starts with '/'.
+bool Path::isabs() const {
+  if (_path.size() == 0) return false;
+  if (_path[0] == '/') return true;
+
+  return false;
+}
+
+// No normalization is applied on this platform; the input is returned unchanged.
+std::string Path::normPath(const std::string &path) {
+  return path;
+}
+
+
+} // namespace lut
diff --git a/src/libllm/lut/platform.h b/src/libllm/lut/platform.h
index 3d3dfb8..c0a83f9 100644
--- a/src/libllm/lut/platform.h
+++ b/src/libllm/lut/platform.h
@@ -35,8 +35,6 @@
 
 namespace lut {
 
-bool isAvx512Available();
-bool isAvx2Available();
 void *alloc32ByteAlignedMem(int64_t nbytes);
 void free32ByteAlignedMem(void *);
 const char *getPathDelim();
diff --git a/src/libllm/lut/platform_linux.cc b/src/libllm/lut/platform_linux.cc
index a5fb667..a25072d 100644
--- a/src/libllm/lut/platform_linux.cc
+++ b/src/libllm/lut/platform_linux.cc
@@ -23,23 +23,6 @@
 
 namespace lut {
 
-void initCpuInfo() {
-#if !defined(__clang__) || __clang_major__ >= 6
-  __builtin_cpu_init();
-#endif
-}
-
-bool isAvx512Available() {
-  initCpuInfo();
-  return __builtin_cpu_supports("avx512f") != 0;
-}
-
-bool isAvx2Available() {
-  initCpuInfo();
-  return __builtin_cpu_supports("avx2") != 0;
-}
-
-
 void *alloc32ByteAlignedMem(int64_t size) {
   if (size % 32 != 0) {
     size += (32 - size % 32);
diff --git a/src/libllm/lut/zip_file.cc b/src/libllm/lut/zip_file.cc
index 0617aaa..e51a011 100644
--- a/src/libllm/lut/zip_file.cc
+++ b/src/libllm/lut/zip_file.cc
@@ -27,6 +27,10 @@
 
 #ifdef _MSC_VER
 #define FSEEK64 _fseeki64
+// fseeko64() does not exist on Darwin, where off_t is always 64-bit; elsewhere use fseeko()
+// only when the build requested a 64-bit off_t.
+#elif defined(__APPLE__) || (defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64)
+#define FSEEK64 fseeko
 #else
 #define FSEEK64 fseeko64
 #endif // _MSC_VER
diff --git a/src/libllm/state_map.cc b/src/libllm/state_map.cc
index 7ae3bb7..600e772 100644
--- a/src/libllm/state_map.cc
+++ b/src/libllm/state_map.cc
@@ -120,8 +120,5 @@ bool StateMap::hasValue(const std::string &name) const {
   return _intDict.find(name) != _intDict.end();
 }
 
-template int StateMap::getValue(const std::string &name) const;
-template void StateMap::putValue(const std::string &name, int value);
-template bool StateMap::hasValue(const std::string &name) const;
 
 }