Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate transform implementation with lightGBM, add separate header file support. #4734

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .ci/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,20 @@ else # Linux
mv $AMDAPPSDK_PATH/lib/x86_64/sdk/* $AMDAPPSDK_PATH/lib/x86_64/
echo libamdocl64.so > $OPENCL_VENDOR_PATH/amdocl64.icd
fi
# Dependencies for the transform build: Boost and build tools from apt,
# then LLVM 3.5.1 (configured with RTTI) and Microsoft Bond built from source.
if [[ $TASK == "transform" ]]; then
    # Both apt-get invocations need root; previously only `update` ran under sudo
    sudo apt-get update && sudo apt-get -y upgrade
    sudo DEBIAN_FRONTEND="noninteractive" apt-get install -y libboost-all-dev gcc g++ wget cmake git curl
    # Print tool versions into the CI log
    cmake --version
    gcc --version
    # Build and install LLVM 3.5.1 from the release tarball
    cd $BUILD_DIRECTORY
    wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-3.5.1.tar.gz && tar zxvf llvmorg-3.5.1.tar.gz
    cd ./llvm-project-llvmorg-3.5.1/llvm && mkdir build && cd build && cmake -DLLVM_REQUIRES_RTTI=1 .. && make -j4 && sudo make install || exit -1
    # Build and install Bond (gRPC support disabled)
    cd $BUILD_DIRECTORY
    git clone --recursive https://github.com/microsoft/bond.git
    sudo DEBIAN_FRONTEND="noninteractive" apt-get install -y clang zlib1g-dev
    # NOTE(review): sudo applies to curl only, not to the `sh` that runs the downloaded installer
    sudo curl -sSL https://get.haskellstack.org/ | sh
    cd ./bond && mkdir build && cd build && cmake -DBOND_ENABLE_GRPC=FALSE .. && make -j4 && sudo make install || exit -1
fi
ARCH=$(uname -m)
if [[ $TASK == "cuda" ]]; then
echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
Expand Down
14 changes: 14 additions & 0 deletions .ci/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,20 @@ elif [[ $TASK == "mpi" ]]; then
elif [[ $METHOD == "source" ]]; then
cmake -DUSE_MPI=ON -DUSE_DEBUG=ON ..
fi
elif [[ $TASK == "transform" ]]; then
# Transform-enabled build. The pip and wheel methods build the Python package,
# install it, run pytest and exit; the source method only configures CMake here.
# NOTE(review): any other METHOD value falls through this branch without running cmake - confirm intended.
if [[ $METHOD == "pip" ]]; then
cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--transform || exit -1
pytest $BUILD_DIRECTORY/tests || exit -1
exit 0
elif [[ $METHOD == "wheel" ]]; then
cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --transform || exit -1
pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1
pytest $BUILD_DIRECTORY/tests || exit -1
exit 0
elif [[ $METHOD == "source" ]]; then
cmake -DUSE_TRANSFORM=ON -DUSE_DEBUG=ON ..
fi
else
cmake ..
fi
Expand Down
10 changes: 9 additions & 1 deletion .github/workflows/python_package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ jobs:
task: mpi
method: wheel
python_version: 3.7
- os: ubuntu-latest
task: transform
method: wheel
python_version: 3.7
steps:
- name: Checkout repository
uses: actions/[email protected]
Expand All @@ -61,7 +65,11 @@ jobs:
export COMPILER="gcc"
export OS_NAME="macos"
elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
export COMPILER="clang"
if [[ "${{ matrix.task }}" == "transform" ]]; then
export COMPILER="gcc"
else
export COMPILER="clang"
fi
export OS_NAME="linux"
fi
export BUILD_DIRECTORY="$GITHUB_WORKSPACE"
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ tests/distributed/mlist.txt
tests/distributed/train*
tests/distributed/model*
tests/distributed/predict*

TestData

# Files from interactive R sessions
.Rproj.user
Expand Down
39 changes: 39 additions & 0 deletions .vsts-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,45 @@ jobs:
- bash: $(Build.SourcesDirectory)/.ci/test.sh
displayName: Test
###########################################
- job: Transform
###########################################
# CI job for the transform build: sets pipeline variables, installs sudo in the
# container, then runs .ci/setup.sh and .ci/test.sh with TASK=transform.
# NOTE(review): METHOD is 'sdist' here, but the transform branch of .ci/test.sh only
# handles pip / wheel / source - confirm which method this job is meant to exercise.
# NOTE(review): the AMDAPPSDK / OpenCL variables look copied from another job; verify they are needed.
variables:
COMPILER: gcc
DEBIAN_FRONTEND: 'noninteractive'
IN_UBUNTU_LATEST_CONTAINER: 'true'
OS_NAME: 'linux'
SETUP_CONDA: 'true'
pool:
vmImage: 'ubuntu-latest'
container: ubuntu-latest
strategy:
matrix:
transform:
TASK: transform
METHOD: sdist
PYTHON_VERSION: 3.7
steps:
- script: |
echo "##vso[task.setvariable variable=BUILD_DIRECTORY]$BUILD_SOURCESDIRECTORY"
echo "##vso[task.setvariable variable=LGB_VER]$(head -n 1 VERSION.txt)"
CONDA=$HOME/miniconda
echo "##vso[task.setvariable variable=CONDA]$CONDA"
echo "##vso[task.prependpath]$CONDA/bin"
AMDAPPSDK_PATH=$BUILD_SOURCESDIRECTORY/AMDAPPSDK
echo "##vso[task.setvariable variable=AMDAPPSDK_PATH]$AMDAPPSDK_PATH"
LD_LIBRARY_PATH=$AMDAPPSDK_PATH/lib/x86_64:$LD_LIBRARY_PATH
echo "##vso[task.setvariable variable=LD_LIBRARY_PATH]$LD_LIBRARY_PATH"
echo "##vso[task.setvariable variable=OPENCL_VENDOR_PATH]$AMDAPPSDK_PATH/etc/OpenCL/vendors"
displayName: 'Set variables'
- script: |
/tmp/docker exec -t -u 0 ci-container \
sh -c "apt-get update && apt-get -o Dpkg::Options::="--force-confold" -y install sudo"
displayName: 'Install sudo'
- bash: $(Build.SourcesDirectory)/.ci/setup.sh
displayName: Setup
- bash: $(Build.SourcesDirectory)/.ci/test.sh
displayName: Test
###########################################
- job: QEMU_multiarch
###########################################
variables:
Expand Down
35 changes: 35 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ OPTION(USE_OPENMP "Enable OpenMP" ON)
OPTION(USE_GPU "Enable GPU-accelerated training" OFF)
OPTION(USE_SWIG "Enable SWIG to generate Java API" OFF)
OPTION(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF)
OPTION(USE_TRANSFORM "Enable transform processor support" OFF)
OPTION(USE_TIMETAG "Set to ON to output time costs" OFF)
OPTION(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF)
OPTION(USE_DEBUG "Set to ON for Debug mode" OFF)
Expand Down Expand Up @@ -33,6 +34,28 @@ PROJECT(lightgbm LANGUAGES C CXX)

list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules")

#-- Transform Processor
if(USE_TRANSFORM)
  cmake_minimum_required(VERSION 3.15)
  # Reject the toolchains the transform processor cannot be built with.
  # Each FATAL_ERROR aborts configuration, so at most one message is ever shown.
  if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
    message(FATAL_ERROR "Transform processor temporarily do not support AppleClang.")
  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
    message(FATAL_ERROR "Transform processor temporarily do not support Clang.")
  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
    # GCC is accepted only from version 8 on, and not on Windows or macOS
    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.0.0")
      message(FATAL_ERROR "Transform processor needs GCC/G++ 8 or higher.")
    elseif(WIN32)
      message(FATAL_ERROR "Transform processor temporarily do not support Windows with GCC/G++.")
    elseif(APPLE)
      message(FATAL_ERROR "Transform processor temporarily do not support MacOS with GCC/G++.")
    endif()
  elseif(MSVC)
    message(FATAL_ERROR "Transform processor temporarily do not support MSVC.")
  endif()
endif()

#-- Sanitizer
if(USE_SANITIZER)
if(MSVC)
Expand Down Expand Up @@ -154,6 +177,10 @@ if(USE_GPU)
ADD_DEFINITIONS(-DUSE_GPU)
endif(USE_GPU)

# Pass the option through to the sources as the USE_TRANSFORM preprocessor macro
if(USE_TRANSFORM)
  add_definitions(-DUSE_TRANSFORM)
endif()

if(__INTEGRATE_OPENCL)
if(WIN32)
include(cmake/IntegratedOpenCL.cmake)
Expand Down Expand Up @@ -368,6 +395,14 @@ else()
add_library(_lightgbm SHARED ${SOURCES})
endif(BUILD_STATIC_LIB)

if(USE_TRANSFORM)
  # The _transform target is declared by the sub-project
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/transform)

  # Command-line executable
  add_dependencies(lightgbm _transform)
  target_link_libraries(lightgbm _transform)

  # Library target
  add_dependencies(_lightgbm _transform)
  target_link_libraries(_lightgbm _transform)
endif()

if(MSVC)
set_target_properties(_lightgbm PROPERTIES OUTPUT_NAME "lib_lightgbm")
endif(MSVC)
Expand Down
4 changes: 4 additions & 0 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,10 @@ class LIGHTGBM_EXPORT Boosting {
static Boosting* CreateBoosting(const std::string& type, const char* filename);

virtual bool IsLinear() const { return false; }

/*!
* \brief Pure virtual accessor returning a string; every concrete Boosting must override it
* NOTE(review): presumably the transform definition associated with the model - confirm against the implementations
*/
virtual std::string TransformStr() const = 0;

/*!
* \brief Pure virtual accessor returning a string; every concrete Boosting must override it
* NOTE(review): presumably the separate header content associated with the model - confirm against the implementations
*/
virtual std::string HeaderStr() const = 0;
};

class GBDTBase : public Boosting {
Expand Down
6 changes: 6 additions & 0 deletions include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,12 @@ struct Config {
// desc = **Note**: can be used only in CLI version
std::vector<std::string> valid;

// desc = path of transform file
std::string transform_file = "";

// desc = path of separate header file
std::string header_file = "";

// alias = num_iteration, n_iter, num_tree, num_trees, num_round, num_rounds, num_boost_round, n_estimators, max_iter
// check = >=0
// desc = number of boosting iterations
Expand Down
23 changes: 23 additions & 0 deletions include/LightGBM/dataset.h
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,21 @@ class Parser {
* \return Object of parser
*/
static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx, bool precise_float_parser);

#ifdef USE_TRANSFORM
/*!
* \brief Create an object of parser, automatically choosing the format depending on the file
*        (overload only declared when USE_TRANSFORM is defined)
* \param filename One Filename of data
* \param header NOTE(review): presumably true if the data file contains a header line - confirm
* \param num_features Pass num_features of this data file if you know, <=0 means don't know
* \param label_idx index of label column
* \param precise_float_parser using precise floating point number parsing if true
* \param transform_str transform file content
* \param header_str separate header file content
* \return Object of parser
*/
static Parser* CreateParser(const char* filename, bool header, int num_features, int label_idx, bool precise_float_parser,
const std::string& transform_str, const std::string& header_str);
#endif
};

/*! \brief The main class of data set,
Expand Down Expand Up @@ -605,6 +620,12 @@ class Dataset {
/*! \brief Get names of current data set */
inline const std::vector<std::string>& feature_names() const { return feature_names_; }

/*! \brief Get name of transform file */
inline const std::string transform_filename() const {return transform_filename_;}

/*! \brief Get name of header file */
inline const std::string header_filename() const { return header_filename_;}

inline void set_feature_names(const std::vector<std::string>& feature_names) {
if (feature_names.size() != static_cast<size_t>(num_total_features_)) {
Log::Fatal("Size of feature_names error, should equal with total number of features");
Expand Down Expand Up @@ -683,6 +704,8 @@ class Dataset {

private:
std::string data_filename_;
/*! \brief Name of transform file, empty by default */
std::string transform_filename_ = "";
/*! \brief Name of header file, empty by default */
std::string header_filename_ = "";
/*! \brief Store used features */
std::vector<std::unique_ptr<FeatureGroup>> feature_groups_;
/*! \brief Mapper from real feature index to used index*/
Expand Down
Empty file modified include/LightGBM/dataset_loader.h
100644 → 100755
Empty file.
16 changes: 16 additions & 0 deletions include/LightGBM/utils/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <unordered_map>
#include <utility>
#include <vector>
#include <fstream>

#if (!((defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))))
#define FMT_HEADER_ONLY
Expand Down Expand Up @@ -200,6 +201,21 @@ inline static std::vector<std::string> Split(const char* c_str, const char* deli
return ret;
}

/*!
* \brief Read at most row_num lines of a text file into one string
* \param filename Path of the file; nullptr or an empty string yields an empty result
* \param row_num Maximum number of lines to read
* \return The lines read, each followed by "\n"; empty if the file cannot be opened
*/
inline static std::string LoadStringFromFile(const char* filename, int row_num = INT_MAX) {
  if (filename == nullptr || *filename == '\0') {
    return "";
  }
  std::stringstream ss;
  Common::C_stringstream(ss);
  std::ifstream fin(filename);
  std::string line;
  // Check the limit before reading: no extra line is consumed once the limit is
  // reached, and the counter can never be incremented past row_num (no overflow).
  for (int i = 0; i < row_num && std::getline(fin, line); ++i) {
    ss << line << "\n";
  }
  return ss.str();
}

template<typename T>
inline static const char* Atoi(const char* p, T* out) {
int sign;
Expand Down
4 changes: 3 additions & 1 deletion python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ def _load_lib():
lib_path = find_lib_path()
if len(lib_path) == 0:
return None
lib = ctypes.cdll.LoadLibrary(lib_path[0])
# Load all c libraries, the last one is lib_lightgbm.so
for p in lib_path:
lib = ctypes.cdll.LoadLibrary(p)
lib.LGBM_GetLastError.restype = ctypes.c_char_p
callback = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
lib.callback = callback(_log_callback)
Expand Down
2 changes: 1 addition & 1 deletion python-package/lightgbm/libpath.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def find_lib_path() -> List[str]:
dll_path.append(curr_path.parents[1] / 'windows' / 'x64' / 'DLL')
dll_path = [p / 'lib_lightgbm.dll' for p in dll_path]
else:
dll_path = [p / 'lib_lightgbm.so' for p in dll_path]
dll_path = [p / lib_name for lib_name in ['lib_transform.so', 'lib_lightgbm.so'] for p in dll_path]
lib_path = [str(p) for p in dll_path if p.is_file()]
if not lib_path:
dll_path_joined = '\n'.join(map(str, dll_path))
Expand Down
18 changes: 13 additions & 5 deletions python-package/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
('hdfs', 'h', 'Compile HDFS version'),
('bit32', None, 'Compile 32-bit version'),
('precompile', 'p', 'Use precompiled library'),
('transform', None, 'Support transform'),
('boost-root=', None, 'Boost preferred installation prefix'),
('boost-dir=', None, 'Directory with Boost package configuration file'),
('boost-include-dir=', None, 'Directory containing Boost headers'),
Expand Down Expand Up @@ -114,7 +115,8 @@ def compile_cpp(
opencl_library: Optional[str] = None,
nomp: bool = False,
bit32: bool = False,
integrated_opencl: bool = False
integrated_opencl: bool = False,
transform: bool = False
) -> None:
build_dir = CURRENT_DIR / "build_cpp"
rmtree(build_dir, ignore_errors=True)
Expand Down Expand Up @@ -150,6 +152,8 @@ def compile_cpp(
cmake_cmd.append("-DUSE_OPENMP=OFF")
if use_hdfs:
cmake_cmd.append("-DUSE_HDFS=ON")
if transform:
cmake_cmd.append("-DUSE_TRANSFORM=ON")

if system() in {'Windows', 'Microsoft'}:
if use_mingw:
Expand Down Expand Up @@ -205,10 +209,11 @@ class CustomInstallLib(install_lib):

def install(self) -> List[str]:
outfiles = install_lib.install(self)
src = find_lib()[0]
src = find_lib()
dst = Path(self.install_dir) / 'lightgbm'
dst, _ = self.copy_file(src, str(dst))
outfiles.append(dst)
for src_lib in src:
dst_lib, _ = self.copy_file(src_lib, str(dst))
outfiles.append(dst_lib)
return outfiles


Expand All @@ -233,6 +238,7 @@ def initialize_options(self) -> None:
self.precompile = False
self.nomp = False
self.bit32 = False
self.transform = False

def run(self) -> None:
if (8 * struct.calcsize("P")) != 64:
Expand All @@ -249,7 +255,7 @@ def run(self) -> None:
use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir,
boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir,
opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library,
nomp=self.nomp, bit32=self.bit32, integrated_opencl=self.integrated_opencl)
nomp=self.nomp, bit32=self.bit32, integrated_opencl=self.integrated_opencl, transform=self.transform)
install.run(self)
if LOG_PATH.is_file():
LOG_PATH.unlink()
Expand All @@ -276,6 +282,7 @@ def initialize_options(self) -> None:
self.precompile = False
self.nomp = False
self.bit32 = False
self.transform = False

def finalize_options(self) -> None:
bdist_wheel.finalize_options(self)
Expand All @@ -297,6 +304,7 @@ def finalize_options(self) -> None:
install.precompile = self.precompile
install.nomp = self.nomp
install.bit32 = self.bit32
install.transform = self.transform


class CustomSdist(sdist):
Expand Down
2 changes: 1 addition & 1 deletion src/application/application.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ void Application::Predict() {

void Application::InitPredict() {
boosting_.reset(
Boosting::CreateBoosting("gbdt", config_.input_model.c_str()));
Boosting::CreateBoosting("gbdt", config_.input_model.c_str()));
Log::Info("Finished initializing prediction, total used %d iterations", boosting_->GetCurrentIteration());
}

Expand Down
Loading