diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 557e000c6127..c117f62f3a77 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -415,6 +415,74 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const unsigned *array, bst_ulong len); +/*! + * \brief Set string encoded information of all features. + * + * Accepted fields are: + * - feature_name + * - feature_type + * + * \param handle An instance of data matrix + * \param field Field name + * \param features Pointer to array of strings. + * \param size Size of `features` pointer (number of strings passed in). + * + * \return 0 when success, -1 when failure happens + * + * \code + * + * char const* feat_names [] {"feat_0", "feat_1"}; + * XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2); + * + * // i for integer, q for quantitative. Similarly "int" and "float" are also recognized. + * char const* feat_types [] {"i", "q"}; + * XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2); + * + * \endcode + */ +XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field, + const char **features, + const bst_ulong size); + +/*! + * \brief Get string encoded information of all features. + * + * Accepted fields are: + * - feature_name + * - feature_type + * + * Caller is responsible for copying out the data, before next call to any API function of + * XGBoost. + * + * \param handle An instance of data matrix + * \param field Field name + * \param size Size of output pointer `features` (number of strings returned). + * \param out_features Address of a pointer to array of strings. Result is stored in + * thread local memory. + * + * \return 0 when success, -1 when failure happens + * + * \code + * + * char const **c_out_features = NULL; + * bst_ulong out_size = 0; + * + * // Assuming the feature names are already set by `XGDMatrixSetStrFeatureInfo`. 
+ * XGDMatrixGetStrFeatureInfo(handle, "feature_name", &out_size, + * &c_out_features); + * + * for (bst_ulong i = 0; i < out_size; ++i) { + * // Here we are simply printing the string. Copy it out if the feature name is + * // useful after printing. + * printf("feature %lu: %s\n", i, c_out_features[i]); + * } + * + * \endcode + */ +XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, + bst_ulong *size, + const char ***out_features); + /*! * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix * \param handle a instance of data matrix @@ -575,8 +643,9 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, * * - Functions with the term "Model" handles saving/loading XGBoost model like trees or * linear weights. Striping out parameters configuration like training algorithms or - * CUDA device ID helps user to reuse the trained model for different tasks, examples - * are prediction, training continuation or interpretation. + * CUDA device ID. These functions are designed to let users reuse the trained model + * for different tasks, examples are prediction, training continuation or model + * interpretation. * * - Functions with the term "Config" handles save/loading configuration. It helps user * to study the internal of XGBoost. Also user can use the load method for specifying @@ -592,7 +661,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, /*! * \brief Load model from existing file * \param handle handle - * \param fname file name + * \param fname File URI or file name. * \return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, @@ -600,7 +669,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, /*! * \brief Save model into existing file * \param handle handle - * \param fname file name + * \param fname File URI or file name. 
* \return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 1e9e429d5e17..e7350fffeeba 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -31,7 +31,12 @@ enum class DataType : uint8_t { kFloat32 = 1, kDouble = 2, kUInt32 = 3, - kUInt64 = 4 + kUInt64 = 4, + kStr = 5 +}; + +enum class FeatureType : uint8_t { + kNumerical }; /*! @@ -40,7 +45,7 @@ enum class DataType : uint8_t { class MetaInfo { public: /*! \brief number of data fields in MetaInfo */ - static constexpr uint64_t kNumField = 9; + static constexpr uint64_t kNumField = 11; /*! \brief number of rows in the data */ uint64_t num_row_{0}; // NOLINT @@ -72,6 +77,19 @@ class MetaInfo { */ HostDeviceVector labels_upper_bound_; // NOLINT + /*! + * \brief Name of type for each feature provided by users. Eg. "int"/"float"/"i"/"q" + */ + std::vector feature_type_names; + /*! + * \brief Name for each feature. + */ + std::vector feature_names; + /* + * \brief Type of each feature. Automatically set when feature_type_names is specified. + */ + HostDeviceVector feature_types; + /*! \brief default constructor */ MetaInfo() = default; MetaInfo(MetaInfo&& that) = default; @@ -158,6 +176,12 @@ class MetaInfo { */ void SetInfo(const char* key, std::string const& interface_str); + void GetInfo(char const* key, bst_ulong* out_len, DataType dtype, + const void** out_dptr) const; + + void SetFeatureInfo(const char *key, const char **info, const bst_ulong size); + void GetFeatureInfo(const char *field, std::vector* out_str_vecs) const; + /* * \brief Extend with other MetaInfo. * @@ -432,6 +456,8 @@ class BatchSet { BatchIterator begin_iter_; }; +struct XGBAPIThreadLocalEntry; + /*! * \brief Internal data structured used by XGBoost during training. */ @@ -450,6 +476,10 @@ class DMatrix { } /*! \brief meta information of the dataset */ virtual const MetaInfo& Info() const = 0; + + /*! 
\brief Get thread local memory for returning data from DMatrix. */ + XGBAPIThreadLocalEntry& GetThreadLocal() const; + /** * \brief Gets batches. Use range based for loop over BatchSet to access individual batches. */ @@ -462,7 +492,7 @@ class DMatrix { /*! \return Whether the data columns single column block. */ virtual bool SingleColBlock() const = 0; /*! \brief virtual destructor */ - virtual ~DMatrix() = default; + virtual ~DMatrix(); /*! \brief Whether the matrix is dense. */ bool IsDense() const { diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 079e916c3260..687e47b2c298 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -305,12 +305,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. - You can construct DMatrix from numpy.arrays + You can construct DMatrix from multiple different sources of data. """ - _feature_names = None # for previous version's pickle - _feature_types = None - def __init__(self, data, label=None, weight=None, base_margin=None, missing=None, silent=False, @@ -362,11 +359,6 @@ def __init__(self, data, label=None, weight=None, base_margin=None, # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None - - if feature_names is not None: - self._feature_names = feature_names - if feature_types is not None: - self._feature_types = feature_types return handler = self.get_data_handler(data) @@ -666,14 +658,16 @@ def slice(self, rindex, allow_groups=False): res : DMatrix A new DMatrix containing only selected indices. 
""" - res = DMatrix(None, feature_names=self.feature_names, - feature_types=self.feature_types) + res = DMatrix(None) res.handle = ctypes.c_void_p() - _check_call(_LIB.XGDMatrixSliceDMatrixEx(self.handle, - c_array(ctypes.c_int, rindex), - c_bst_ulong(len(rindex)), - ctypes.byref(res.handle), - ctypes.c_int(1 if allow_groups else 0))) + _check_call(_LIB.XGDMatrixSliceDMatrixEx( + self.handle, + c_array(ctypes.c_int, rindex), + c_bst_ulong(len(rindex)), + ctypes.byref(res.handle), + ctypes.c_int(1 if allow_groups else 0))) + res.feature_names = self.feature_names + res.feature_types = self.feature_types return res @property @@ -684,20 +678,17 @@ def feature_names(self): ------- feature_names : list or None """ - if self._feature_names is None: - self._feature_names = ['f{0}'.format(i) - for i in range(self.num_col())] - return self._feature_names - - @property - def feature_types(self): - """Get feature types (column types). - - Returns - ------- - feature_types : list or None - """ - return self._feature_types + length = c_bst_ulong() + sarr = ctypes.POINTER(ctypes.c_char_p)() + _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, + c_str('feature_name'), + ctypes.byref(length), + ctypes.byref(sarr))) + feature_names = from_cstr_to_pystr(sarr, length) + if not feature_names: + feature_names = ['f{0}'.format(i) + for i in range(self.num_col())] + return feature_names @feature_names.setter def feature_names(self, feature_names): @@ -728,10 +719,41 @@ def feature_names(self, feature_names): not any(x in f for x in set(('[', ']', '<'))) for f in feature_names): raise ValueError('feature_names must be string, and may not contain [, ] or <') + c_feature_names = [bytes(f, encoding='utf-8') + for f in feature_names] + c_feature_names = (ctypes.c_char_p * + len(c_feature_names))(*c_feature_names) + _check_call(_LIB.XGDMatrixSetStrFeatureInfo( + self.handle, c_str('feature_name'), + c_feature_names, + c_bst_ulong(len(feature_names)))) else: # reset feature_types also + 
_check_call(_LIB.XGDMatrixSetStrFeatureInfo( + self.handle, + c_str('feature_name'), + None, + c_bst_ulong(0))) self.feature_types = None - self._feature_names = feature_names + + @property + def feature_types(self): + """Get feature types (column types). + + Returns + ------- + feature_types : list or None + """ + length = c_bst_ulong() + sarr = ctypes.POINTER(ctypes.c_char_p)() + _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, + c_str('feature_type'), + ctypes.byref(length), + ctypes.byref(sarr))) + res = from_cstr_to_pystr(sarr, length) + if not res: + return None + return res @feature_types.setter def feature_types(self, feature_types): @@ -746,14 +768,12 @@ def feature_types(self, feature_types): Labels for features. None will reset existing feature names """ if feature_types is not None: - if self._feature_names is None: - msg = 'Unable to set feature types before setting names' - raise ValueError(msg) - + if not isinstance(feature_types, (list, str)): + raise TypeError( + 'feature_types must be string or list of strings') if isinstance(feature_types, STRING_TYPES): # single string will be applied to all columns feature_types = [feature_types] * self.num_col() - try: if not isinstance(feature_types, str): feature_types = list(feature_types) @@ -761,16 +781,25 @@ def feature_types(self, feature_types): feature_types = [feature_types] except TypeError: feature_types = [feature_types] + c_feature_types = [bytes(f, encoding='utf-8') + for f in feature_types] + c_feature_types = (ctypes.c_char_p * + len(c_feature_types))(*c_feature_types) + _check_call(_LIB.XGDMatrixSetStrFeatureInfo( + self.handle, c_str('feature_type'), + c_feature_types, + c_bst_ulong(len(feature_types)))) if len(feature_types) != self.num_col(): msg = 'feature_types must have the same length as data' raise ValueError(msg) - - valid = ('int', 'float', 'i', 'q') - if not all(isinstance(f, STRING_TYPES) and f in valid - for f in feature_types): - raise ValueError('All feature_names must 
be {int, float, i, q}') - self._feature_types = feature_types + else: + # Reset. + _check_call(_LIB.XGDMatrixSetStrFeatureInfo( + self.handle, + c_str('feature_type'), + None, + c_bst_ulong(0))) class DeviceQuantileDMatrix(DMatrix): diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index b90410e216aa..170ce81c0578 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -372,7 +372,7 @@ def _maybe_dt_data(self, data, feature_names, feature_types, raise ValueError( 'DataTable has own feature types, cannot pass them in.') feature_types = np.vectorize(self.dt_type_mapper2.get)( - data_types_names) + data_types_names).tolist() return data, feature_names, feature_types diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index adf51f780d42..824e46f7122a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -283,6 +283,38 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, API_END(); } +XGB_DLL int XGDMatrixSetStrFeatureInfo(DMatrixHandle handle, const char *field, + const char **c_info, + const xgboost::bst_ulong size) { + API_BEGIN(); + CHECK_HANDLE(); + auto &info = static_cast *>(handle)->get()->Info(); + info.SetFeatureInfo(field, c_info, size); + API_END(); +} + +XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, + xgboost::bst_ulong *len, + const char ***out_features) { + API_BEGIN(); + CHECK_HANDLE(); + auto m = *static_cast*>(handle); + auto &info = static_cast *>(handle)->get()->Info(); + + std::vector &charp_vecs = m->GetThreadLocal().ret_vec_charp; + std::vector &str_vecs = m->GetThreadLocal().ret_vec_str; + + info.GetFeatureInfo(field, &str_vecs); + + charp_vecs.resize(str_vecs.size()); + for (size_t i = 0; i < str_vecs.size(); ++i) { + charp_vecs[i] = str_vecs[i].c_str(); + } + *out_features = dmlc::BeginPtr(charp_vecs); + *len = static_cast(charp_vecs.size()); + API_END(); +} + XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, const unsigned* group, 
xgboost::bst_ulong len) { @@ -301,22 +333,7 @@ XGB_DLL int XGDMatrixGetFloatInfo(const DMatrixHandle handle, API_BEGIN(); CHECK_HANDLE(); const MetaInfo& info = static_cast*>(handle)->get()->Info(); - const std::vector* vec = nullptr; - if (!std::strcmp(field, "label")) { - vec = &info.labels_.HostVector(); - } else if (!std::strcmp(field, "weight")) { - vec = &info.weights_.HostVector(); - } else if (!std::strcmp(field, "base_margin")) { - vec = &info.base_margin_.HostVector(); - } else if (!std::strcmp(field, "label_lower_bound")) { - vec = &info.labels_lower_bound_.HostVector(); - } else if (!std::strcmp(field, "label_upper_bound")) { - vec = &info.labels_upper_bound_.HostVector(); - } else { - LOG(FATAL) << "Unknown float field name " << field; - } - *out_len = static_cast(vec->size()); // NOLINT - *out_dptr = dmlc::BeginPtr(*vec); + info.GetInfo(field, out_len, DataType::kFloat32, reinterpret_cast(out_dptr)); API_END(); } @@ -327,14 +344,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle, API_BEGIN(); CHECK_HANDLE(); const MetaInfo& info = static_cast*>(handle)->get()->Info(); - const std::vector* vec = nullptr; - if (!std::strcmp(field, "group_ptr")) { - vec = &info.group_ptr_; - } else { - LOG(FATAL) << "Unknown uint field name " << field; - } - *out_len = static_cast(vec->size()); - *out_dptr = dmlc::BeginPtr(*vec); + info.GetInfo(field, out_len, DataType::kUInt32, reinterpret_cast(out_dptr)); API_END(); } diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index a6ee30e1f346..f9974f8ecfaf 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -171,6 +171,8 @@ void HostDeviceVector::SetDevice(int device) const {} template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t +template class HostDeviceVector; +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t template class 
HostDeviceVector; // bst_feature_t diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 7950096ca756..dcc772baa2dd 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -398,6 +398,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t template class HostDeviceVector; +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t template class HostDeviceVector; // bst_feature_t diff --git a/src/data/data.cc b/src/data/data.cc index 8d36d82783fc..b3652c45b89f 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -11,6 +11,7 @@ #include "xgboost/host_device_vector.h" #include "xgboost/logging.h" #include "xgboost/version_config.h" +#include "xgboost/learner.h" #include "sparse_page_writer.h" #include "simple_dmatrix.h" @@ -148,8 +149,10 @@ void MetaInfo::Clear() { * | group_ptr | kUInt32 | False | ${size} | 1 | ${group_ptr_} | * | weights | kFloat32 | False | ${size} | 1 | ${weights_} | * | base_margin | kFloat32 | False | ${size} | 1 | ${base_margin_} | - * | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound__} | - * | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound__} | + * | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound_} | + * | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound_} | + * | feature_names | kStr | False | ${size} | 1 | ${feature_names} | + * | feature_types | kStr | False | ${size} | 1 | ${feature_types} | * * Note that the scalar fields (is_scalar=True) will have num_row and num_col missing. 
* Also notice the difference between the saved name and the name used in `SetInfo': @@ -177,9 +180,31 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const { SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32, {labels_upper_bound_.Size(), 1}, labels_upper_bound_); ++field_cnt; + SaveVectorField(fo, u8"feature_names", DataType::kStr, + {feature_names.size(), 1}, feature_names); ++field_cnt; + SaveVectorField(fo, u8"feature_types", DataType::kStr, + {feature_type_names.size(), 1}, feature_type_names); ++field_cnt; + CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields"; } +void LoadFeatureType(std::vectorconst& type_names, std::vector* types) { + types->clear(); + for (auto const &elem : type_names) { + if (elem == "int") { + types->emplace_back(FeatureType::kNumerical); + } else if (elem == "float") { + types->emplace_back(FeatureType::kNumerical); + } else if (elem == "i") { + types->emplace_back(FeatureType::kNumerical); + } else if (elem == "q") { + types->emplace_back(FeatureType::kNumerical); + } else { + LOG(FATAL) << "All feature_types must be {int, float, i, q}"; + } + } +} + void MetaInfo::LoadBinary(dmlc::Stream *fi) { auto version = Version::Load(fi); auto major = std::get<0>(version); @@ -193,11 +218,20 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) { const uint64_t expected_num_field = kNumField; uint64_t num_field { 0 }; CHECK(fi->Read(&num_field)) << "MetaInfo: invalid format"; - CHECK_GE(num_field, expected_num_field) - << "MetaInfo: insufficient number of fields (expected at least " << expected_num_field - << " fields, but the binary file only contains " << num_field << "fields.)"; + size_t expected = 0; + if (major == 1 && std::get<1>(version) < 2) { + // feature names and types are added in 1.2 + expected = expected_num_field - 2; + } else { + expected = expected_num_field; + } + CHECK_GE(num_field, expected) + << "MetaInfo: insufficient number of fields (expected at least " + << expected << " fields, but the binary file only 
contains " << num_field + << "fields.)"; if (num_field > expected_num_field) { - LOG(WARNING) << "MetaInfo: the given binary file contains extra fields which will be ignored."; + LOG(WARNING) << "MetaInfo: the given binary file contains extra fields " + "which will be ignored."; } LoadScalarField(fi, u8"num_row", DataType::kUInt64, &num_row_); @@ -209,6 +243,10 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) { LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_); LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_); LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_); + + LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names); + LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names); + LoadFeatureType(feature_type_names, &feature_types.HostVector()); } template @@ -344,6 +382,76 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t } } +void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype, + const void **out_dptr) const { + if (dtype == DataType::kFloat32) { + const std::vector* vec = nullptr; + if (!std::strcmp(key, "label")) { + vec = &this->labels_.HostVector(); + } else if (!std::strcmp(key, "weight")) { + vec = &this->weights_.HostVector(); + } else if (!std::strcmp(key, "base_margin")) { + vec = &this->base_margin_.HostVector(); + } else if (!std::strcmp(key, "label_lower_bound")) { + vec = &this->labels_lower_bound_.HostVector(); + } else if (!std::strcmp(key, "label_upper_bound")) { + vec = &this->labels_upper_bound_.HostVector(); + } else { + LOG(FATAL) << "Unknown float field name: " << key; + } + *out_len = static_cast(vec->size()); // NOLINT + *reinterpret_cast(out_dptr) = dmlc::BeginPtr(*vec); + } else if (dtype == DataType::kUInt32) { + const std::vector *vec = nullptr; + if (!std::strcmp(key, "group_ptr")) { + vec = &this->group_ptr_; + } else { + LOG(FATAL) << "Unknown uint32 
field name: " << key; + } + *out_len = static_cast(vec->size()); + *reinterpret_cast(out_dptr) = dmlc::BeginPtr(*vec); + } else { + LOG(FATAL) << "Unknown data type for getting meta info."; + } +} + +void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) { + if (size != 0) { + CHECK_EQ(size, this->num_col_) + << "Length of " << key << " must be equal to number of columns."; + } + if (!std::strcmp(key, "feature_type")) { + feature_type_names.clear(); + auto& h_feature_types = feature_types.HostVector(); + for (size_t i = 0; i < size; ++i) { + auto elem = info[i]; + feature_type_names.emplace_back(elem); + } + LoadFeatureType(feature_type_names, &h_feature_types); + } else if (!std::strcmp(key, "feature_name")) { + feature_names.clear(); + for (size_t i = 0; i < size; ++i) { + feature_names.emplace_back(info[i]); + } + } else { + LOG(FATAL) << "Unknown feature info name: " << key; + } +} + +void MetaInfo::GetFeatureInfo(const char *field, + std::vector *out_str_vecs) const { + auto &str_vecs = *out_str_vecs; + if (!std::strcmp(field, "feature_type")) { + str_vecs.resize(feature_type_names.size()); + std::copy(feature_type_names.cbegin(), feature_type_names.cend(), str_vecs.begin()); + } else if (!strcmp(field, "feature_name")) { + str_vecs.resize(feature_names.size()); + std::copy(feature_names.begin(), feature_names.end(), str_vecs.begin()); + } else { + LOG(FATAL) << "Unknown feature info: " << field; + } +} + void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) { if (accumulate_rows) { this->num_row_ += that.num_row_; @@ -441,6 +549,20 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { } #endif // !defined(XGBOOST_USE_CUDA) +using DMatrixThreadLocal = + dmlc::ThreadLocalStore>; + +XGBAPIThreadLocalEntry& DMatrix::GetThreadLocal() const { + return (*DMatrixThreadLocal::Get())[this]; +} + +DMatrix::~DMatrix() { + auto local_map = DMatrixThreadLocal::Get(); + if (local_map->find(this) != 
local_map->cend()) { + local_map->erase(this); + } +} + DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split, diff --git a/src/learner.cc b/src/learner.cc index 34649480c5ce..ebfdeccc36fb 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -202,7 +202,7 @@ void GenericParameter::ConfigureGpuId(bool require_gpu) { #endif // defined(XGBOOST_USE_CUDA) } -using XGBAPIThreadLocalStore = +using LearnerAPIThreadLocalStore = dmlc::ThreadLocalStore>; class LearnerConfiguration : public Learner { @@ -895,7 +895,7 @@ class LearnerImpl : public LearnerIO { explicit LearnerImpl(std::vector > cache) : LearnerIO{cache} {} ~LearnerImpl() override { - auto local_map = XGBAPIThreadLocalStore::Get(); + auto local_map = LearnerAPIThreadLocalStore::Get(); if (local_map->find(this) != local_map->cend()) { local_map->erase(this); } @@ -1023,7 +1023,7 @@ class LearnerImpl : public LearnerIO { } XGBAPIThreadLocalEntry& GetThreadLocal() const override { - return (*XGBAPIThreadLocalStore::Get())[this]; + return (*LearnerAPIThreadLocalStore::Get())[this]; } void InplacePredict(dmlc::any const &x, std::string const &type, diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 98046105dfe3..2ba36a16dd8c 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -10,7 +10,6 @@ #include "../helpers.h" #include "../../../src/common/io.h" - TEST(CAPI, XGDMatrixCreateFromMatDT) { std::vector col0 = {0, -1, 3}; std::vector col1 = {-4.0f, 2.0f, 0.0f}; @@ -148,4 +147,48 @@ TEST(CAPI, CatchDMLCError) { EXPECT_THROW({ dmlc::Stream::Create("foo", "r"); }, dmlc::Error); } +TEST(CAPI, DMatrixSetFeatureName) { + size_t constexpr kRows = 10; + bst_feature_t constexpr kCols = 2; + + DMatrixHandle handle; + std::vector data(kCols * kRows, 1.5); + + XGDMatrixCreateFromMat_omp(data.data(), kRows, kCols, + std::numeric_limits::quiet_NaN(), &handle, + 0); + std::vector feature_names; + for (bst_feature_t i = 0; i < kCols; ++i) { + 
feature_names.emplace_back(std::to_string(i)); + } + std::vector c_feature_names; + c_feature_names.resize(feature_names.size()); + std::transform(feature_names.cbegin(), feature_names.cend(), + c_feature_names.begin(), + [](auto const &str) { return str.c_str(); }); + XGDMatrixSetStrFeatureInfo(handle, u8"feature_name", c_feature_names.data(), + c_feature_names.size()); + bst_ulong out_len = 0; + char const **c_out_features; + XGDMatrixGetStrFeatureInfo(handle, u8"feature_name", &out_len, + &c_out_features); + + CHECK_EQ(out_len, kCols); + std::vector out_features; + for (bst_ulong i = 0; i < out_len; ++i) { + ASSERT_EQ(std::to_string(i), c_out_features[i]); + } + + char const* feat_types [] {"i", "q"}; + static_assert(sizeof(feat_types)/ sizeof(feat_types[0]) == kCols, ""); + XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, kCols); + char const **c_out_types; + XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len, + &c_out_types); + for (bst_ulong i = 0; i < out_len; ++i) { + ASSERT_STREQ(feat_types[i], c_out_types[i]); + } + + XGDMatrixFree(handle); +} } // namespace xgboost diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 74002b75aacc..eec37f3e50ce 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -39,6 +39,36 @@ TEST(MetaInfo, GetSet) { ASSERT_EQ(info.group_ptr_.size(), 0); } +TEST(MetaInfo, GetSetFeature) { + xgboost::MetaInfo info; + EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error); + EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error); + EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0)); + EXPECT_NO_THROW(info.SetFeatureInfo("feature_type", nullptr, 0)); + ASSERT_EQ(info.feature_type_names.size(), 0); + ASSERT_EQ(info.feature_types.Size(), 0); + ASSERT_EQ(info.feature_names.size(), 0); + + size_t constexpr kCols = 19; + std::vector types(kCols, u8"float"); + std::vector c_types(kCols); + std::transform(types.cbegin(), 
types.cend(), c_types.begin(), + [](auto const &str) { return str.c_str(); }); + // Info has 0 column + EXPECT_THROW( + info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()), + dmlc::Error); + info.num_col_ = kCols; + EXPECT_NO_THROW( + info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size())); + + // Test clear. + info.SetFeatureInfo("feature_type", nullptr, 0); + ASSERT_EQ(info.feature_type_names.size(), 0); + ASSERT_EQ(info.feature_types.Size(), 0); + // Other conditions are tested in `SaveLoadBinary`. +} + TEST(MetaInfo, SaveLoadBinary) { xgboost::MetaInfo info; uint64_t constexpr kRows { 64 }, kCols { 32 }; @@ -51,9 +81,22 @@ TEST(MetaInfo, SaveLoadBinary) { info.SetInfo("label", values.data(), xgboost::DataType::kFloat32, kRows); info.SetInfo("weight", values.data(), xgboost::DataType::kFloat32, kRows); info.SetInfo("base_margin", values.data(), xgboost::DataType::kFloat32, kRows); + info.num_row_ = kRows; info.num_col_ = kCols; + auto featname = u8"特征名"; + std::vector types(kCols, u8"float"); + std::vector c_types(kCols); + std::transform(types.cbegin(), types.cend(), c_types.begin(), + [](auto const &str) { return str.c_str(); }); + info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()); + std::vector names(kCols, featname); + std::vector c_names(kCols); + std::transform(names.cbegin(), names.cend(), c_names.begin(), + [](auto const &str) { return str.c_str(); }); + info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()); + dmlc::TemporaryDirectory tempdir; const std::string tmp_file = tempdir.path + "/metainfo.binary"; { @@ -80,6 +123,23 @@ TEST(MetaInfo, SaveLoadBinary) { EXPECT_EQ(inforead.group_ptr_, info.group_ptr_); EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector()); EXPECT_EQ(inforead.base_margin_.HostVector(), info.base_margin_.HostVector()); + + EXPECT_EQ(inforead.feature_type_names.size(), kCols); + EXPECT_EQ(inforead.feature_types.Size(), kCols); + 
EXPECT_TRUE(std::all_of(inforead.feature_type_names.cbegin(), + inforead.feature_type_names.cend(), + [](auto const &str) { return str == u8"float"; })); + auto h_ft = inforead.feature_types.HostSpan(); + EXPECT_TRUE(std::all_of(h_ft.cbegin(), h_ft.cend(), [](auto f) { + return f == xgboost::FeatureType::kNumerical; + })); + + EXPECT_EQ(inforead.feature_names.size(), kCols); + EXPECT_TRUE(std::all_of(inforead.feature_names.cbegin(), + inforead.feature_names.cend(), + [=](auto const& str) { + return str == featname; + })); } } diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index c1640d4e3b45..8daf7f3573cb 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -115,6 +115,7 @@ def test_feature_names_slice(self): dm.feature_names = list('abcde') assert dm.feature_names == list('abcde') + assert dm.slice([0, 1]).num_col() == dm.num_col() assert dm.slice([0, 1]).feature_names == dm.feature_names dm.feature_types = 'q'