ASUS-AICS · cjlin1 · Dec 7, 2023 · Nov 3, 2023 · Nov 29, 2023 · Nov 29, 2023
@@ -136,6 +136,18 @@ def is_multiclass_dataset(dataset, label="label"):
     else:
         label_sizes = dataset[label].sum(axis=1)
 
+    # TODO: separate logging message from the function
+    # detect unlabeled ratio
+    ratio = (label_sizes == 0).sum() / label_sizes.shape[0]
+    threshold = 0.1
+    if ratio >= threshold:
+        logging.warning(
+            f"""About {ratio * 100:.1f}% (>= {threshold * 100:.1f}%) instances in the dataset are unlabeled.
+            LibMultiLabel doesn't treat unlabeled data in a special way.
+            Thus, the metrics you see will not be accurate.
+            We suggest you either apply preprocessing to the data or modify the metric classes."""
+        )
+
     ratio = float((label_sizes == 1).sum()) / len(label_sizes)
     if ratio > 0.999 and ratio != 1.0:
         logging.info(

@@ -27,11 +27,13 @@ def __init__(
         weights: np.matrix,
         bias: float,
         thresholds: float | np.ndarray,
+        multiclass: bool,
     ):
         self.name = name
         self.weights = weights
         self.bias = bias
         self.thresholds = thresholds
+        self.multiclass = multiclass
 
     def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
         """Calculates the decision values associated with x.
@@ -67,12 +69,19 @@ def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
         return (x * self.weights).A + self.thresholds
 
 
-def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True) -> FlatModel:
+def train_1vsrest(
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    multiclass: bool = False,
+    options: str = "",
+    verbose: bool = True,
+) -> FlatModel:
     """Trains a linear model for multilabel data using a one-vs-rest strategy.
 
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
 
@@ -93,7 +102,13 @@ def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "",
         yi = y[:, i].toarray().reshape(-1)
         weights[:, i] = _do_train(2 * yi - 1, x, options).ravel()
 
-    return FlatModel(name="1vsrest", weights=np.asmatrix(weights), bias=bias, thresholds=0)
+    return FlatModel(
+        name="1vsrest",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=0,
+        multiclass=multiclass,
+    )
 
 
 def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
@@ -145,7 +160,11 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat
 
 
 def train_thresholding(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    multiclass: bool = False,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multi-label data using a one-vs-rest strategy
     and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
@@ -160,6 +179,7 @@ def train_thresholding(
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
 
@@ -189,7 +209,13 @@ def train_thresholding(
         weights[:, i] = w.ravel()
         thresholds[i] = t
 
-    return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
+    return FlatModel(
+        name="thresholding",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=thresholds,
+        multiclass=multiclass,
+    )
 
 
 def _micromacro_one_label(
@@ -361,7 +387,11 @@ def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:
 
 
 def train_cost_sensitive(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    multiclass: bool = False,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multilabel data using a one-vs-rest strategy
     and cross-validation to pick an optimal asymmetric misclassification cost
@@ -373,6 +403,7 @@ def train_cost_sensitive(
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
 
@@ -394,7 +425,13 @@ def train_cost_sensitive(
         w = _cost_sensitive_one_label(2 * yi - 1, x, options)
         weights[:, i] = w.ravel()
 
-    return FlatModel(name="cost_sensitive", weights=np.asmatrix(weights), bias=bias, thresholds=0)
+    return FlatModel(
+        name="cost_sensitive",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=0,
+        multiclass=multiclass,
+    )
 
 
 def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.ndarray:
@@ -454,7 +491,11 @@ def _cross_validate(y: np.ndarray, x: sparse.csr_matrix, options: str, perm: np.
 
 
 def train_cost_sensitive_micro(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    multiclass: bool = False,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for multilabel data using a one-vs-rest strategy
     and cross-validation to pick an optimal asymmetric misclassification cost
@@ -466,6 +507,7 @@ def train_cost_sensitive_micro(
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
 
@@ -510,17 +552,28 @@ def train_cost_sensitive_micro(
         w = _do_train(2 * yi - 1, x, final_options)
         weights[:, i] = w.ravel()
 
-    return FlatModel(name="cost_sensitive_micro", weights=np.asmatrix(weights), bias=bias, thresholds=0)
+    return FlatModel(
+        name="cost_sensitive_micro",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=0,
+        multiclass=multiclass,
+    )
 
 
 def train_binary_and_multiclass(
-    y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
+    y: sparse.csr_matrix,
+    x: sparse.csr_matrix,
+    multiclass: bool = True,
+    options: str = "",
+    verbose: bool = True,
 ) -> FlatModel:
     """Trains a linear model for binary and multi-class data.
 
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
         x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
+        multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to True.
         options (str, optional): The option string passed to liblinear. Defaults to ''.
         verbose (bool, optional): Output extra progress information. Defaults to True.
 
@@ -556,7 +609,13 @@ def train_binary_and_multiclass(
     # For labels that did not appear in training, set their thresholds to -inf so they won't be predicted.
     thresholds = np.full(num_labels, -np.inf)
     thresholds[train_labels] = 0
-    return FlatModel(name="binary_and_multiclass", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
+    return FlatModel(
+        name="binary_and_multiclass",
+        weights=np.asmatrix(weights),
+        bias=bias,
+        thresholds=thresholds,
+        multiclass=multiclass,
+    )
 
 
 def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:

@@ -48,6 +48,7 @@ def __init__(
         self.root = root
         self.flat_model = flat_model
         self.weight_map = weight_map
+        self.multiclass = False
 
     def predict_values(
         self,
@@ -203,14 +204,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node:
         node (Node): Node to be trained.
     """
     if node.isLeaf():
-        node.model = linear.train_1vsrest(y[:, node.label_map], x, options, False)
+        node.model = linear.train_1vsrest(y[:, node.label_map], x, False, options, False)
     else:
         # meta_y[i, j] is 1 if the ith instance is relevant to the jth child.
         # getnnz returns an ndarray of shape number of instances.
         # This must be reshaped into number of instances * 1 to be interpreted as a column.
         meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children]
         meta_y = sparse.csr_matrix(np.hstack(meta_y))
-        node.model = linear.train_1vsrest(meta_y, x, options, False)
+        node.model = linear.train_1vsrest(meta_y, x, False, options, False)
 
     node.model.weights = sparse.csc_matrix(node.model.weights)
 
@@ -250,6 +251,7 @@ def visit(node):
         weights=sparse.hstack(weights, "csr"),
         bias=bias,
         thresholds=0,
+        multiclass=False,
     )
 
     # w.shape[1] is the number of labels/metalabels of each node

@@ -5,14 +5,12 @@
 from tqdm import tqdm
 
 import libmultilabel.linear as linear
-from libmultilabel.common_utils import dump_log
+from libmultilabel.common_utils import dump_log, is_multiclass_dataset
 from libmultilabel.linear.utils import LINEAR_TECHNIQUES
 
 
 def linear_test(config, model, datasets, label_mapping):
-    metrics = linear.get_metrics(
-        config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.name == "binary_and_multiclass"
-    )
+    metrics = linear.get_metrics(config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.multiclass)
     num_instance = datasets["test"]["x"].shape[0]
     k = config.save_k_predictions
     if k > 0:
@@ -37,7 +35,14 @@ def linear_test(config, model, datasets, label_mapping):
 
 
 def linear_train(datasets, config):
+    # detect task type
+    multiclass = is_multiclass_dataset(datasets["train"], "y")
+
+    # train
     if config.linear_technique == "tree":
+        if multiclass:
+            raise ValueError("Tree model should only be used with multilabel datasets.")
+
         model = LINEAR_TECHNIQUES[config.linear_technique](
             datasets["train"]["y"],
             datasets["train"]["x"],
@@ -49,7 +54,8 @@ def linear_train(datasets, config):
         model = LINEAR_TECHNIQUES[config.linear_technique](
             datasets["train"]["y"],
             datasets["train"]["x"],
-            config.liblinear_options,
+            multiclass=multiclass,
+            options=config.liblinear_options,
         )
     return model