Skip to content

Commit

Permalink
Move feature names and types of DMatrix from Python to C++. (#5858)
Browse files Browse the repository at this point in the history
* Add thread local return entry for DMatrix.
* Save feature name and feature type in binary file.

Co-authored-by: Philip Hyunsu Cho <[email protected]>
  • Loading branch information
trivialfis and hcho3 committed Jul 7, 2020
1 parent 4b0852e commit 93c44a9
Show file tree
Hide file tree
Showing 12 changed files with 451 additions and 84 deletions.
77 changes: 73 additions & 4 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
const unsigned *array,
bst_ulong len);

/*!
* \brief Set string encoded information of all features.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* NOTE(review): the Python wrapper resets a field by passing a null `features`
* pointer together with a `size` of 0; confirm against the implementation that
* this is the supported way to clear a field.
*
* \param handle An instance of data matrix
* \param field Field name
* \param features Pointer to array of strings.
* \param size Size of `features` pointer (number of strings passed in).
*
* \return 0 when success, -1 when failure happens
*
* \code
*
* char const* feat_names [] {"feat_0", "feat_1"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
*
* // i for integer, q for quantitative. Similarly "int" and "float" are also recognized.
* char const* feat_types [] {"i", "q"};
* XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
*
* \endcode
*/
XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field,
const char **features,
const bst_ulong size);

/*!
* \brief Get string encoded information of all features.
*
* Accepted fields are:
* - feature_name
* - feature_type
*
* Caller is responsible for copying out the data, before next call to any API function of
* XGBoost.
*
* \param handle An instance of data matrix
* \param field Field name
* \param size Size of output pointer `features` (number of strings returned).
* \param out_features Address of a pointer to array of strings. Result is stored in
* thread local memory.
*
* \return 0 when success, -1 when failure happens
*
* \code
*
* char const **c_out_features = NULL;
* bst_ulong out_size = 0;
*
* // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`.
* XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size,
* &c_out_features);
*
* for (bst_ulong i = 0; i < out_size; ++i) {
* // Here we are simply printing the string. Copy it out if the feature name is
* // useful after printing.
* printf("feature %lu: %s\n", i, c_out_features[i]);
* }
*
* \endcode
*/
XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
bst_ulong *size,
const char ***out_features);

/*!
* \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
* \param handle an instance of data matrix
Expand Down Expand Up @@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
*
* - Functions with the term "Model" handles saving/loading XGBoost model like trees or
* linear weights. Stripping out parameters configuration like training algorithms or
* CUDA device ID helps user to reuse the trained model for different tasks, examples
* are prediction, training continuation or interpretation.
* CUDA device ID. These functions are designed to let users reuse the trained model
* for different tasks, examples are prediction, training continuation or model
* interpretation.
*
* - Functions with the term "Config" handles save/loading configuration. It helps user
* to study the internal of XGBoost. Also user can use the load method for specifying
Expand All @@ -592,15 +661,15 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
/*!
* \brief Load model from existing file
* \param handle handle
* \param fname file name
* \param fname File URI or file name.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
const char *fname);
/*!
* \brief Save model into existing file
* \param handle handle
* \param fname file name
* \param fname File URI or file name.
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,
Expand Down
36 changes: 33 additions & 3 deletions include/xgboost/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ enum class DataType : uint8_t {
kFloat32 = 1,
kDouble = 2,
kUInt32 = 3,
kUInt64 = 4
kUInt64 = 4,
kStr = 5
};

/*!
 * \brief Type of a single feature (column), stored in one byte.
 *
 * Only the numerical type is enumerated at present.
 */
enum class FeatureType : uint8_t {
  kNumerical = 0
};

/*!
Expand All @@ -40,7 +45,7 @@ enum class DataType : uint8_t {
class MetaInfo {
public:
/*! \brief number of data fields in MetaInfo */
static constexpr uint64_t kNumField = 9;
static constexpr uint64_t kNumField = 11;

/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
Expand Down Expand Up @@ -72,6 +77,19 @@ class MetaInfo {
*/
HostDeviceVector<bst_float> labels_upper_bound_; // NOLINT

/*!
* \brief Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q"
*/
std::vector<std::string> feature_type_names;
/*!
* \brief Name for each feature.
*/
std::vector<std::string> feature_names;
/*
* \brief Type of each feature. Automatically set when feature_type_names is specified.
*/
HostDeviceVector<FeatureType> feature_types;

/*! \brief default constructor */
MetaInfo() = default;
MetaInfo(MetaInfo&& that) = default;
Expand Down Expand Up @@ -158,6 +176,12 @@ class MetaInfo {
*/
void SetInfo(const char* key, std::string const& interface_str);

void GetInfo(char const* key, bst_ulong* out_len, DataType dtype,
const void** out_dptr) const;

void SetFeatureInfo(const char *key, const char **info, const bst_ulong size);
void GetFeatureInfo(const char *field, std::vector<std::string>* out_str_vecs) const;

/*
* \brief Extend with other MetaInfo.
*
Expand Down Expand Up @@ -432,6 +456,8 @@ class BatchSet {
BatchIterator<T> begin_iter_;
};

struct XGBAPIThreadLocalEntry;

/*!
* \brief Internal data structured used by XGBoost during training.
*/
Expand All @@ -450,6 +476,10 @@ class DMatrix {
}
/*! \brief meta information of the dataset */
virtual const MetaInfo& Info() const = 0;

/*! \brief Get thread local memory for returning data from DMatrix. */
XGBAPIThreadLocalEntry& GetThreadLocal() const;

/**
* \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
*/
Expand All @@ -462,7 +492,7 @@ class DMatrix {
/*! \return Whether the data columns single column block. */
virtual bool SingleColBlock() const = 0;
/*! \brief virtual destructor */
virtual ~DMatrix() = default;
virtual ~DMatrix();

/*! \brief Whether the matrix is dense. */
bool IsDense() const {
Expand Down
113 changes: 71 additions & 42 deletions python-package/xgboost/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes
DMatrix is an internal data structure that is used by XGBoost,
which is optimized for both memory efficiency and training speed.
You can construct DMatrix from numpy.arrays
You can construct DMatrix from multiple different sources of data.
"""

_feature_names = None # for previous version's pickle
_feature_types = None

def __init__(self, data, label=None, weight=None, base_margin=None,
missing=None,
silent=False,
Expand Down Expand Up @@ -362,11 +359,6 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle = None

if feature_names is not None:
self._feature_names = feature_names
if feature_types is not None:
self._feature_types = feature_types
return

handler = self.get_data_handler(data)
Expand Down Expand Up @@ -666,14 +658,16 @@ def slice(self, rindex, allow_groups=False):
res : DMatrix
A new DMatrix containing only selected indices.
"""
res = DMatrix(None, feature_names=self.feature_names,
feature_types=self.feature_types)
res = DMatrix(None)
res.handle = ctypes.c_void_p()
_check_call(_LIB.XGDMatrixSliceDMatrixEx(self.handle,
c_array(ctypes.c_int, rindex),
c_bst_ulong(len(rindex)),
ctypes.byref(res.handle),
ctypes.c_int(1 if allow_groups else 0)))
_check_call(_LIB.XGDMatrixSliceDMatrixEx(
self.handle,
c_array(ctypes.c_int, rindex),
c_bst_ulong(len(rindex)),
ctypes.byref(res.handle),
ctypes.c_int(1 if allow_groups else 0)))
res.feature_names = self.feature_names
res.feature_types = self.feature_types
return res

@property
Expand All @@ -684,20 +678,17 @@ def feature_names(self):
-------
feature_names : list or None
"""
if self._feature_names is None:
self._feature_names = ['f{0}'.format(i)
for i in range(self.num_col())]
return self._feature_names

@property
def feature_types(self):
"""Get feature types (column types).
Returns
-------
feature_types : list or None
"""
return self._feature_types
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
c_str('feature_name'),
ctypes.byref(length),
ctypes.byref(sarr)))
feature_names = from_cstr_to_pystr(sarr, length)
if not feature_names:
feature_names = ['f{0}'.format(i)
for i in range(self.num_col())]
return feature_names

@feature_names.setter
def feature_names(self, feature_names):
Expand Down Expand Up @@ -728,10 +719,41 @@ def feature_names(self, feature_names):
not any(x in f for x in set(('[', ']', '<')))
for f in feature_names):
raise ValueError('feature_names must be string, and may not contain [, ] or <')
c_feature_names = [bytes(f, encoding='utf-8')
for f in feature_names]
c_feature_names = (ctypes.c_char_p *
len(c_feature_names))(*c_feature_names)
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str('feature_name'),
c_feature_names,
c_bst_ulong(len(feature_names))))
else:
# reset feature_types also
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str('feature_name'),
None,
c_bst_ulong(0)))
self.feature_types = None
self._feature_names = feature_names

@property
def feature_types(self):
"""Get feature types (column types).
Returns
-------
feature_types : list or None
"""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
c_str('feature_type'),
ctypes.byref(length),
ctypes.byref(sarr)))
res = from_cstr_to_pystr(sarr, length)
if not res:
return None
return res

@feature_types.setter
def feature_types(self, feature_types):
Expand All @@ -746,31 +768,38 @@ def feature_types(self, feature_types):
Labels for features. None will reset existing feature types
"""
if feature_types is not None:
if self._feature_names is None:
msg = 'Unable to set feature types before setting names'
raise ValueError(msg)

if not isinstance(feature_types, (list, str)):
raise TypeError(
'feature_types must be string or list of strings')
if isinstance(feature_types, STRING_TYPES):
# single string will be applied to all columns
feature_types = [feature_types] * self.num_col()

try:
if not isinstance(feature_types, str):
feature_types = list(feature_types)
else:
feature_types = [feature_types]
except TypeError:
feature_types = [feature_types]
c_feature_types = [bytes(f, encoding='utf-8')
for f in feature_types]
c_feature_types = (ctypes.c_char_p *
len(c_feature_types))(*c_feature_types)
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle, c_str('feature_type'),
c_feature_types,
c_bst_ulong(len(feature_types))))

if len(feature_types) != self.num_col():
msg = 'feature_types must have the same length as data'
raise ValueError(msg)

valid = ('int', 'float', 'i', 'q')
if not all(isinstance(f, STRING_TYPES) and f in valid
for f in feature_types):
raise ValueError('All feature_names must be {int, float, i, q}')
self._feature_types = feature_types
else:
# Reset.
_check_call(_LIB.XGDMatrixSetStrFeatureInfo(
self.handle,
c_str('feature_type'),
None,
c_bst_ulong(0)))


class DeviceQuantileDMatrix(DMatrix):
Expand Down
2 changes: 1 addition & 1 deletion python-package/xgboost/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ def _maybe_dt_data(self, data, feature_names, feature_types,
raise ValueError(
'DataTable has own feature types, cannot pass them in.')
feature_types = np.vectorize(self.dt_type_mapper2.get)(
data_types_names)
data_types_names).tolist()

return data, feature_names, feature_types

Expand Down
Loading

0 comments on commit 93c44a9

Please sign in to comment.