Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

detect task type #337

Merged
merged 8 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions libmultilabel/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,18 @@ def is_multiclass_dataset(dataset, label="label"):
else:
label_sizes = dataset[label].sum(axis=1)

# TODO: separate logging message from the function
# detect unlabeled ratio
ratio = (label_sizes == 0).sum() / label_sizes.shape[0]
threshold = 0.1
if ratio >= threshold:
logging.warning(
f"""About {ratio * 100:.1f}% (>= {threshold * 100:.1f}%) instances in the dataset are unlabeled.
LibMultiLabel doesn't treat unlabeled data in a special way.
Thus, the metrics you see will not be accurate.
We suggest you either apply preprocessing to the data or modify the metric classes."""
)

ratio = float((label_sizes == 1).sum()) / len(label_sizes)
if ratio > 0.999 and ratio != 1.0:
logging.info(
Expand Down
79 changes: 69 additions & 10 deletions libmultilabel/linear/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ def __init__(
weights: np.matrix,
bias: float,
thresholds: float | np.ndarray,
multiclass: bool,
):
self.name = name
self.weights = weights
self.bias = bias
self.thresholds = thresholds
self.multiclass = multiclass

def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
"""Calculates the decision values associated with x.
Expand Down Expand Up @@ -67,12 +69,19 @@ def predict_values(self, x: sparse.csr_matrix) -> np.ndarray:
return (x * self.weights).A + self.thresholds


def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True) -> FlatModel:
def train_1vsrest(
y: sparse.csr_matrix,
x: sparse.csr_matrix,
multiclass: bool = False,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy.

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Expand All @@ -93,7 +102,13 @@ def train_1vsrest(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "",
yi = y[:, i].toarray().reshape(-1)
weights[:, i] = _do_train(2 * yi - 1, x, options).ravel()

return FlatModel(name="1vsrest", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="1vsrest",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
multiclass=multiclass,
)


def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_matrix, str, float]:
Expand Down Expand Up @@ -145,7 +160,11 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat


def train_thresholding(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
multiclass: bool = False,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multi-label data using a one-vs-rest strategy
and cross-validation to pick decision thresholds optimizing the sum of Macro-F1 and Micro-F1.
Expand All @@ -160,6 +179,7 @@ def train_thresholding(
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Expand Down Expand Up @@ -189,7 +209,13 @@ def train_thresholding(
weights[:, i] = w.ravel()
thresholds[i] = t

return FlatModel(name="thresholding", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
return FlatModel(
name="thresholding",
weights=np.asmatrix(weights),
bias=bias,
thresholds=thresholds,
multiclass=multiclass,
)


def _micromacro_one_label(
Expand Down Expand Up @@ -361,7 +387,11 @@ def _fmeasure(y_true: np.ndarray, y_pred: np.ndarray) -> float:


def train_cost_sensitive(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
multiclass: bool = False,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick an optimal asymmetric misclassification cost
Expand All @@ -373,6 +403,7 @@ def train_cost_sensitive(
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Expand All @@ -394,7 +425,13 @@ def train_cost_sensitive(
w = _cost_sensitive_one_label(2 * yi - 1, x, options)
weights[:, i] = w.ravel()

return FlatModel(name="cost_sensitive", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="cost_sensitive",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
multiclass=multiclass,
)


def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str) -> np.ndarray:
Expand Down Expand Up @@ -454,7 +491,11 @@ def _cross_validate(y: np.ndarray, x: sparse.csr_matrix, options: str, perm: np.


def train_cost_sensitive_micro(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
multiclass: bool = False,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for multilabel data using a one-vs-rest strategy
and cross-validation to pick an optimal asymmetric misclassification cost
Expand All @@ -466,6 +507,7 @@ def train_cost_sensitive_micro(
Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to False.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Expand Down Expand Up @@ -510,17 +552,28 @@ def train_cost_sensitive_micro(
w = _do_train(2 * yi - 1, x, final_options)
weights[:, i] = w.ravel()

return FlatModel(name="cost_sensitive_micro", weights=np.asmatrix(weights), bias=bias, thresholds=0)
return FlatModel(
name="cost_sensitive_micro",
weights=np.asmatrix(weights),
bias=bias,
thresholds=0,
multiclass=multiclass,
)


def train_binary_and_multiclass(
y: sparse.csr_matrix, x: sparse.csr_matrix, options: str = "", verbose: bool = True
y: sparse.csr_matrix,
x: sparse.csr_matrix,
multiclass: bool = True,
options: str = "",
verbose: bool = True,
) -> FlatModel:
"""Trains a linear model for binary and multi-class data.

Args:
y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
x (sparse.csr_matrix): A matrix with dimensions number of instances * number of features.
multiclass (bool, optional): A flag indicating if the dataset is multiclass. Defaults to True.
options (str, optional): The option string passed to liblinear. Defaults to ''.
verbose (bool, optional): Output extra progress information. Defaults to True.

Expand Down Expand Up @@ -556,7 +609,13 @@ def train_binary_and_multiclass(
# For labels that did not appear in training, set their thresholds to -inf so they are never predicted.
thresholds = np.full(num_labels, -np.inf)
thresholds[train_labels] = 0
return FlatModel(name="binary_and_multiclass", weights=np.asmatrix(weights), bias=bias, thresholds=thresholds)
return FlatModel(
name="binary_and_multiclass",
weights=np.asmatrix(weights),
bias=bias,
thresholds=thresholds,
multiclass=multiclass,
)


def predict_values(model, x: sparse.csr_matrix) -> np.ndarray:
Expand Down
6 changes: 4 additions & 2 deletions libmultilabel/linear/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
self.root = root
self.flat_model = flat_model
self.weight_map = weight_map
self.multiclass = False

def predict_values(
self,
Expand Down Expand Up @@ -203,14 +204,14 @@ def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node:
node (Node): Node to be trained.
"""
if node.isLeaf():
node.model = linear.train_1vsrest(y[:, node.label_map], x, options, False)
node.model = linear.train_1vsrest(y[:, node.label_map], x, False, options, False)
else:
# meta_y[i, j] is 1 if the ith instance is relevant to the jth child.
# getnnz returns an ndarray of shape number of instances.
# This must be reshaped into number of instances * 1 to be interpreted as a column.
meta_y = [y[:, child.label_map].getnnz(axis=1)[:, np.newaxis] > 0 for child in node.children]
meta_y = sparse.csr_matrix(np.hstack(meta_y))
node.model = linear.train_1vsrest(meta_y, x, options, False)
node.model = linear.train_1vsrest(meta_y, x, False, options, False)

node.model.weights = sparse.csc_matrix(node.model.weights)

Expand Down Expand Up @@ -250,6 +251,7 @@ def visit(node):
weights=sparse.hstack(weights, "csr"),
bias=bias,
thresholds=0,
multiclass=False,
)

# w.shape[1] is the number of labels/metalabels of each node
Expand Down
16 changes: 11 additions & 5 deletions linear_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
from tqdm import tqdm

import libmultilabel.linear as linear
from libmultilabel.common_utils import dump_log
from libmultilabel.common_utils import dump_log, is_multiclass_dataset
from libmultilabel.linear.utils import LINEAR_TECHNIQUES


def linear_test(config, model, datasets, label_mapping):
metrics = linear.get_metrics(
config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.name == "binary_and_multiclass"
)
metrics = linear.get_metrics(config.monitor_metrics, datasets["test"]["y"].shape[1], multiclass=model.multiclass)
num_instance = datasets["test"]["x"].shape[0]
k = config.save_k_predictions
if k > 0:
Expand All @@ -37,7 +35,14 @@ def linear_test(config, model, datasets, label_mapping):


def linear_train(datasets, config):
# detect task type
multiclass = is_multiclass_dataset(datasets["train"], "y")

# train
if config.linear_technique == "tree":
if multiclass:
raise ValueError("Tree model should only be used with multilabel datasets.")

model = LINEAR_TECHNIQUES[config.linear_technique](
datasets["train"]["y"],
datasets["train"]["x"],
Expand All @@ -49,7 +54,8 @@ def linear_train(datasets, config):
model = LINEAR_TECHNIQUES[config.linear_technique](
datasets["train"]["y"],
datasets["train"]["x"],
config.liblinear_options,
multiclass=multiclass,
options=config.liblinear_options,
)
return model

Expand Down