Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support exporting cut values #9356

Merged
merged 11 commits into from
Jul 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle,
*/
XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);

/*!
/**
* \brief Get the predictors from DMatrix as CSR matrix for testing. If this is a
* quantized DMatrix, quantized values are returned instead.
*
Expand All @@ -819,8 +819,10 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
* XGBoost. This is to avoid allocating a huge memory buffer that can not be freed until
* exiting the thread.
*
* @since 1.7.0
*
* \param handle the handle to the DMatrix
* \param config Json configuration string. At the moment it should be an empty document,
* \param config JSON configuration string. At the moment it should be an empty document,
* preserved for future use.
* \param out_indptr indptr of output CSR matrix.
* \param out_indices Column index of output CSR matrix.
Expand All @@ -831,6 +833,24 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
bst_ulong *out_indptr, unsigned *out_indices, float *out_data);

/**
* @brief Export the quantile cuts used for training histogram-based models like `hist` and
* `approx`. Useful for model compression.
*
* @since 2.0.0
*
* @param handle the handle to the DMatrix
* @param config JSON configuration string. At the moment it should be an empty document,
* preserved for future use.
*
* @param out_indptr indptr of output CSC matrix represented by a JSON encoded
* __(cuda_)array_interface__.
* @param out_data Data value of CSC matrix represented by a JSON encoded
* __(cuda_)array_interface__.
*/
XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
char const **out_indptr, char const **out_data);

/** @} */ // End of DMatrix

/**
Expand Down
86 changes: 22 additions & 64 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ struct BatchParam {
BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
: max_bin{max_bin}, hess{hessian}, regen{regenerate} {}

bool ParamNotEqual(BatchParam const& other) const {
[[nodiscard]] bool ParamNotEqual(BatchParam const& other) const {
// Check non-floating parameters.
bool cond = max_bin != other.max_bin;
// Check sparse thresh.
Expand All @@ -293,11 +293,11 @@ struct BatchParam {

return cond;
}
bool Initialized() const { return max_bin != 0; }
[[nodiscard]] bool Initialized() const { return max_bin != 0; }
/**
* \brief Make a copy of self for DMatrix to describe how its existing index was generated.
*/
BatchParam MakeCache() const {
[[nodiscard]] BatchParam MakeCache() const {
auto p = *this;
// These parameters have nothing to do with how the gradient index was generated in the
// first place.
Expand All @@ -319,7 +319,7 @@ struct HostSparsePageView {
static_cast<Inst::index_type>(size)};
}

size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
[[nodiscard]] size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
};

/*!
Expand All @@ -337,7 +337,7 @@ class SparsePage {
/*! \brief an instance of sparse vector in the batch */
using Inst = common::Span<Entry const>;

HostSparsePageView GetView() const {
[[nodiscard]] HostSparsePageView GetView() const {
return {offset.ConstHostSpan(), data.ConstHostSpan()};
}

Expand All @@ -353,12 +353,12 @@ class SparsePage {
virtual ~SparsePage() = default;

/*! \return Number of instances in the page. */
inline size_t Size() const {
[[nodiscard]] size_t Size() const {
return offset.Size() == 0 ? 0 : offset.Size() - 1;
}

/*! \return estimation of memory cost of this page */
inline size_t MemCostBytes() const {
[[nodiscard]] size_t MemCostBytes() const {
return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry);
}

Expand All @@ -376,7 +376,7 @@ class SparsePage {
base_rowid = row_id;
}

SparsePage GetTranspose(int num_columns, int32_t n_threads) const;
[[nodiscard]] SparsePage GetTranspose(int num_columns, int32_t n_threads) const;

/**
* \brief Sort the column index.
Expand All @@ -385,7 +385,7 @@ class SparsePage {
/**
* \brief Check whether the column index is sorted.
*/
bool IsIndicesSorted(int32_t n_threads) const;
[[nodiscard]] bool IsIndicesSorted(int32_t n_threads) const;
/**
* \brief Reindex the column index with an offset.
*/
Expand Down Expand Up @@ -440,49 +440,7 @@ class SortedCSCPage : public SparsePage {
explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {}
};

class EllpackPageImpl;
/*!
* \brief A page stored in ELLPACK format.
*
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
* including CUDA-specific implementation details in the header.
*/
class EllpackPage {
public:
/*!
* \brief Default constructor.
*
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
* set later by the reader.
*/
EllpackPage();

/*!
* \brief Constructor from an existing DMatrix.
*
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
* in CSR format.
*/
explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);

/*! \brief Destructor. */
~EllpackPage();

EllpackPage(EllpackPage&& that);

/*! \return Number of instances in the page. */
size_t Size() const;

/*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id);

const EllpackPageImpl* Impl() const { return impl_.get(); }
EllpackPageImpl* Impl() { return impl_.get(); }

private:
std::unique_ptr<EllpackPageImpl> impl_;
};

class EllpackPage;
class GHistIndexMatrix;

template<typename T>
Expand All @@ -492,7 +450,7 @@ class BatchIteratorImpl {
virtual ~BatchIteratorImpl() = default;
virtual const T& operator*() const = 0;
virtual BatchIteratorImpl& operator++() = 0;
virtual bool AtEnd() const = 0;
[[nodiscard]] virtual bool AtEnd() const = 0;
virtual std::shared_ptr<T const> Page() const = 0;
};

Expand All @@ -519,12 +477,12 @@ class BatchIterator {
return !impl_->AtEnd();
}

bool AtEnd() const {
[[nodiscard]] bool AtEnd() const {
CHECK(impl_ != nullptr);
return impl_->AtEnd();
}

std::shared_ptr<T const> Page() const {
[[nodiscard]] std::shared_ptr<T const> Page() const {
return impl_->Page();
}

Expand Down Expand Up @@ -563,15 +521,15 @@ class DMatrix {
this->Info().SetInfo(ctx, key, StringView{interface_str});
}
/*! \brief meta information of the dataset */
virtual const MetaInfo& Info() const = 0;
[[nodiscard]] virtual const MetaInfo& Info() const = 0;

/*! \brief Get thread local memory for returning data from DMatrix. */
XGBAPIThreadLocalEntry& GetThreadLocal() const;
[[nodiscard]] XGBAPIThreadLocalEntry& GetThreadLocal() const;
/**
* \brief Get the context object of this DMatrix. The context is created during construction of
* DMatrix with user specified `nthread` parameter.
*/
virtual Context const* Ctx() const = 0;
[[nodiscard]] virtual Context const* Ctx() const = 0;

/**
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
Expand All @@ -583,16 +541,16 @@ class DMatrix {
template <typename T>
BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
template <typename T>
bool PageExists() const;
[[nodiscard]] bool PageExists() const;

// the following are column meta data, should be able to answer them fast.
/*! \return Whether the data columns single column block. */
virtual bool SingleColBlock() const = 0;
[[nodiscard]] virtual bool SingleColBlock() const = 0;
/*! \brief virtual destructor */
virtual ~DMatrix();

/*! \brief Whether the matrix is dense. */
bool IsDense() const {
[[nodiscard]] bool IsDense() const {
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
}

Expand Down Expand Up @@ -695,9 +653,9 @@ class DMatrix {
BatchParam const& param) = 0;
virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;

virtual bool EllpackExists() const = 0;
virtual bool GHistIndexExists() const = 0;
virtual bool SparsePageExists() const = 0;
[[nodiscard]] virtual bool EllpackExists() const = 0;
[[nodiscard]] virtual bool GHistIndexExists() const = 0;
[[nodiscard]] virtual bool SparsePageExists() const = 0;
};

template <>
Expand Down
75 changes: 75 additions & 0 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""Core XGBoost Library."""
import copy
import ctypes
import importlib.util
import json
import os
import re
Expand Down Expand Up @@ -381,6 +382,54 @@ def c_array(
return (ctype * len(values))(*values)


def from_array_interface(interface: dict) -> NumpyOrCupy:
    """Convert array interface to numpy or cupy array"""

    class Array:  # pylint: disable=too-few-public-methods
        """Wrapper type for communicating with numpy and cupy."""

        _interface: Optional[dict] = None

        @property
        def __array_interface__(self) -> Optional[dict]:
            return self._interface

        @__array_interface__.setter
        def __array_interface__(self, interface: dict) -> None:
            # numpy requires `shape`, `data` (and `strides`, when present) to be
            # tuples; a JSON-decoded interface carries them as lists.
            spec = copy.copy(interface)
            spec["shape"] = tuple(spec["shape"])
            spec["data"] = tuple(spec["data"])
            if spec.get("strides", None) is not None:
                spec["strides"] = tuple(spec["strides"])
            self._interface = spec

        @property
        def __cuda_array_interface__(self) -> Optional[dict]:
            return self.__array_interface__

        @__cuda_array_interface__.setter
        def __cuda_array_interface__(self, interface: dict) -> None:
            self.__array_interface__ = interface

    wrapper = Array()

    if "stream" in interface:
        # A CUDA stream is present, so this is a __cuda_array_interface__ and
        # the buffer lives on the device; cupy is needed to read it.
        if importlib.util.find_spec("cupy") is None:
            raise ImportError("`cupy` is required for handling CUDA buffer.")

        import cupy as cp  # pylint: disable=import-error

        wrapper.__cuda_array_interface__ = interface
        return cp.array(wrapper, copy=True)

    wrapper.__array_interface__ = interface
    return np.array(wrapper, copy=True)


def _prediction_output(
shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool
) -> NumpyOrCupy:
Expand Down Expand Up @@ -1060,6 +1109,32 @@ def get_data(self) -> scipy.sparse.csr_matrix:
)
return ret

def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
    """Get quantile cuts for quantization."""
    n_features = self.num_col()

    sindptr = ctypes.c_char_p()
    sdata = ctypes.c_char_p()
    _check_call(
        _LIB.XGDMatrixGetQuantileCut(
            self.handle, make_jcargs(), ctypes.byref(sindptr), ctypes.byref(sdata)
        )
    )
    assert sindptr.value is not None
    assert sdata.value is not None

    # Both outputs come back as JSON-encoded (cuda_)array_interface documents.
    indptr = from_array_interface(json.loads(sindptr.value))
    data = from_array_interface(json.loads(sdata.value))

    # Sanity-check the CSC layout: one indptr entry per feature plus one, and
    # the data buffer holds exactly indptr[-1] cut values.
    assert indptr.size == n_features + 1
    assert indptr.dtype == np.uint64
    assert data.size == indptr[-1]
    assert data.dtype == np.float32
    return indptr, data

def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
ret = c_bst_ulong()
Expand Down
8 changes: 8 additions & 0 deletions python-package/xgboost/testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,14 @@ def make_batches(
return X, y, w


def make_regression(
    n_samples: int, n_features: int, use_cupy: bool
) -> Tuple[ArrayLike, ArrayLike, ArrayLike]:
    """Make a simple regression dataset."""
    # Generate a single batch and unpack it into plain arrays.
    batches = make_batches(n_samples, n_features, 1, use_cupy)
    return batches[0][0], batches[1][0], batches[2][0]


def make_batches_sparse(
n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
Expand Down
Loading