Allow Users to Specify Data Type (is_multilabel) in Config File #341

Draft · wants to merge 5 commits into base: master
83 changes: 72 additions & 11 deletions libmultilabel/linear/linear.py
@@ -27,11 +27,13 @@ def __init__(
weights: np.matrix,
bias: float,
thresholds: float | np.ndarray,
is_multilabel: bool,
):
self.name = name
self.weights = weights
self.bias = bias
self.thresholds = thresholds
self.is_multilabel = is_multilabel

def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
"""Calculates the decision values associated with x.
@@ -67,12 +69,19 @@ def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
return (x * self.weights).A + self.thresholds


def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True) -> FlatModel:
"""Trains a linear model for multiabel data using a one-vs-rest strategy.
def train_1vsrest(
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear classification model with one-vs-rest strategy.

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -93,7 +102,13 @@ def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "",
yi = y[:, i].toarray().reshape(-1)
weights[:, i] = _do_train(2 * yi - 1, x, options).ravel()

return FlatModel(name="1vsrest", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="1vsrest",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
is_multilabel=is_multilabel,
)


def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
@@ -145,7 +160,11 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat


def train_thresholding(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multi-label data using a one-vs-rest strategy
and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
@@ -160,6 +179,7 @@
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -189,7 +209,13 @@
weights[:, i] = w.ravel()
thresholds[i] = t

return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
return FlatModel(
name="thresholding",
weights=np.asmatrix(weights),
bias=bias,
thresholds=thresholds,
is_multilabel=is_multilabel,
)


def _micromacro_one_label(
@@ -361,7 +387,11 @@ def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:


def train_cost_sensitive(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick an optimal asymmetric misclassification cost
@@ -373,6 +403,7 @@
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -394,7 +425,13 @@
w = _cost_sensitive_one_label(2 * yi - 1, x, options)
weights[:, i] = w.ravel()

return FlatModel(name="cost_sensitive", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="cost_sensitive",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
is_multilabel=is_multilabel,
)


def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.ndarray:
@@ -454,7 +491,11 @@ def _cross_validate(y: np.ndarray, x: sparse.csr_matrix, options: str, perm: np.


def train_cost_sensitive_micro(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick an optimal asymmetric misclassification cost
@@ -466,6 +507,7 @@
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

@@ -510,23 +552,36 @@ def train_cost_sensitive_micro(
w = _do_train(2 * yi - 1, x, final_options)
weights[:, i] = w.ravel()

return FlatModel(name="cost_sensitive_micro", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="cost_sensitive_micro",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
is_multilabel=is_multilabel,
)


def train_binary_and_multiclass(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
is_multilabel: bool,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for binary and multi-class data.

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
is_multilabel (bool): A flag indicating if the dataset is multilabel.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Returns:
A model which can be used in predict_values.
"""
if is_multilabel:
raise ValueError("binary_and_multiclass doesn't support multilabel data.")
x, options, bias = _prepare_options(x, options)
num_instances, num_labels = y.shape
nonzero_instance_ids, nonzero_label_ids = y.nonzero()
@@ -556,7 +611,13 @@ def train_binary_and_multiclass(
# For labels not appeared in training, assign thresholds to -inf so they won't be predicted.
thresholds = np.full(num_labels, -np.inf)
thresholds[train_labels] = 0
return FlatModel(name="binary_and_multiclass", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
return FlatModel(
name="binary_and_multiclass",
weights=np.asmatrix(weights),
bias=bias,
thresholds=thresholds,
is_multilabel=is_multilabel,
)


def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:
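
Note on the API change above: every train_* function in linear.py now takes is_multilabel as a third positional argument, and the returned FlatModel carries the flag. A minimal usage sketch, assuming this branch is installed; the toy matrices below are made up for illustration and are not part of the diff:

import numpy as np
import scipy.sparse as sparse
import libmultilabel.linear as linear

# Toy multilabel data: 4 instances, 3 features, 2 labels.
x_train = sparse.csr_matrix(np.array([[1.0, 0.0, 0.5],
                                      [0.0, 1.0, 0.2],
                                      [1.0, 1.0, 0.0],
                                      [0.0, 0.0, 1.0]]))
y_train = sparse.csr_matrix(np.array([[1, 0],
                                      [0, 1],
                                      [1, 1],
                                      [0, 1]]))

# is_multilabel is the new third argument; options and verbose keep their old meaning.
model = linear.train_1vsrest(y_train, x_train, True, options="", verbose=False)
decision_values = linear.predict_values(model, x_train)  # ndarray of shape (4, 2)
assert model.is_multilabel
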
6 changes: 4 additions & 2 deletions libmultilabel/linear/tree.py
@@ -48,6 +48,7 @@ def __init__(
self.root = root
self.flat_model = flat_model
self.weight_map = weight_map
self.is_multilabel = True

def predict_values(
self,
@@ -203,14 +204,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node:
node (Node): Node to be trained.
"""
if node.isLeaf():
node.model = linear.train_1vsrest(y[:, node.label_map], x, options, False)
node.model = linear.train_1vsrest(y[:, node.label_map], x, True, options, False)
else:
# meta_y[i, j] is 1 if the ith instance is relevant to the jth child.
# getnnz returns an ndarray of shape number of instances.
# This must be reshaped into number of instances * 1 to be interpreted as a column.
meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children]
meta_y = sparse.csr_matrix(np.hstack(meta_y))
node.model = linear.train_1vsrest(meta_y, x, options, False)
node.model = linear.train_1vsrest(meta_y, x, True, options, False)

node.model.weights = sparse.csc_matrix(node.model.weights)

@@ -250,6 +251,7 @@ def visit(node):
weights=sparse.hstack(weights, "csr"),
bias=bias,
thresholds=0,
is_multilabel=True,
)

# w.shape[1] is the number of labels/metalabels of each node
22 changes: 19 additions & 3 deletions linear_trainer.py
@@ -5,13 +5,15 @@
from tqdm import tqdm

import libmultilabel.linear as linear
from libmultilabel.common_utils import dump_log
from libmultilabel.common_utils import dump_log, is_multiclass_dataset
from libmultilabel.linear.utils import LINEAR_TECHNIQUES


def linear_test(config, model, datasets, label_mapping):
metrics = linear.get_metrics(
config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.name == "binary_and_multiclass"
config.monitor_metrics,
datasets["test"]["y"].shape[1],
multiclass=not model.is_multilabel,
)
num_instance = datasets["test"]["x"].shape[0]
k = config.save_k_predictions
@@ -37,7 +39,20 @@ def linear_test(config, model, datasets, label_mapping):


def linear_train(datasets, config):
# detect task type
is_multilabel = config.get("is_multilabel", "auto")
if is_multilabel == "auto":
is_multilabel = not is_multiclass_dataset(datasets["train"], "y")
elif not isinstance(is_multilabel, bool):
raise ValueError(
f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
)

# train
if config.linear_technique == "tree":
if not is_multilabel:
raise ValueError("Tree model should only be used with multilabel datasets.")

model = LINEAR_TECHNIQUES[config.linear_technique](
datasets["train"]["y"],
datasets["train"]["x"],
@@ -49,7 +64,8 @@ def linear_train(datasets, config):
model = LINEAR_TECHNIQUES[config.linear_technique](
datasets["train"]["y"],
datasets["train"]["x"],
config.liblinear_options,
is_multilabel=is_multilabel,
options=config.liblinear_options,
)
return model

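
For reviewers, a standalone sketch of the resolution logic added to linear_train above. The detection under "auto" is a simplified stand-in for libmultilabel.common_utils.is_multiclass_dataset, which is the function the PR actually calls:

import scipy.sparse as sparse


def resolve_is_multilabel(config_value, y_train: sparse.csr_matrix) -> bool:
    """Mirror of the branch added to linear_train: "auto" falls back to detection."""
    if config_value == "auto":
        # Simplified detection: multilabel iff some instance has more than one positive label.
        return bool((y_train.getnnz(axis=1) > 1).any())
    if not isinstance(config_value, bool):
        raise ValueError(
            f'"is_multilabel" is expected to be either "auto", True, or False. But got "{config_value}".'
        )
    return config_value

# As in linear_train, the tree technique is then rejected for non-multilabel data:
# if config.linear_technique == "tree" and not is_multilabel:
#     raise ValueError("Tree model should only be used with multilabel datasets.")
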
10 changes: 9 additions & 1 deletion torch_trainer.py
@@ -65,7 +65,15 @@ def __init__(
else:
self.datasets = datasets

self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
# detect task type
is_multilabel = self.config.get("is_multilabel", "auto")
if is_multilabel == "auto":
self.config.multiclass = is_multiclass_dataset(self.datasets["train"] + self.datasets.get("val", list()))
elif not isinstance(is_multilabel, bool):
raise ValueError(
f'"is_multilabel" is expected to be either "auto", "True", or "False". But got "{is_multilabel}".'
)

self._setup_model(
classes=classes,
word_dict=word_dict,
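
On the neural path the same three values are accepted, but the result is ultimately consumed as config.multiclass. The hunk above only sets config.multiclass in the "auto" branch and validates the type otherwise, so the explicit-boolean mapping in the sketch below (multiclass = not is_multilabel) is an assumption about the intended behavior; the dict-based config is likewise a placeholder for the trainer's real config object:

def resolve_multiclass(config: dict, detected_multiclass: bool) -> bool:
    """Sketch: derive the multiclass flag used by get_metrics from is_multilabel."""
    value = config.get("is_multilabel", "auto")
    if value == "auto":
        return detected_multiclass  # keep the existing auto-detection result
    if not isinstance(value, bool):
        raise ValueError(
            f'"is_multilabel" is expected to be either "auto", True, or False. But got "{value}".'
        )
    return not value  # assumed mapping: an explicit flag overrides detection


print(resolve_multiclass({"is_multilabel": "auto"}, detected_multiclass=True))  # True
print(resolve_multiclass({"is_multilabel": True}, detected_multiclass=True))    # False
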