diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
index b31719c11ea31..ea10bde5fa252 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MultilabelMetrics.scala
@@ -33,126 +33,125 @@ class MultilabelMetrics(predictionAndLabels: RDD[(Array[Double], Array[Double])]
     labels}.distinct().count()
 
   /**
-   * Returns strict Accuracy
+   * Returns subset accuracy
    * (for equal sets of labels)
    */
-  lazy val strictAccuracy: Double = predictionAndLabels.filter { case (predictions, labels) =>
-    predictions.deep == labels.deep }.count().toDouble / numDocs
+  lazy val subsetAccuracy: Double = predictionAndLabels.filter { case (predictions, labels) =>
+    predictions.deep == labels.deep
+  }.count().toDouble / numDocs
 
   /**
-   * Returns Accuracy
+   * Returns accuracy
    */
   lazy val accuracy: Double = predictionAndLabels.map { case (predictions, labels) =>
     labels.intersect(predictions).size.toDouble /
       (labels.size + predictions.size - labels.intersect(predictions).size)}.sum / numDocs
 
+
   /**
    * Returns Hamming-loss
    */
   lazy val hammingLoss: Double = predictionAndLabels.map { case (predictions, labels) =>
-    labels.diff(predictions).size + predictions.diff(labels).size}.
-    sum / (numDocs * numLabels)
+    labels.size + predictions.size - 2 * labels.intersect(predictions).size
+  }.sum / (numDocs * numLabels)
 
   /**
-   * Returns Document-based Precision averaged by the number of documents
+   * Returns document-based precision averaged by the number of documents
    */
-  lazy val macroPrecisionDoc: Double = predictionAndLabels.map { case (predictions, labels) =>
+  lazy val precision: Double = predictionAndLabels.map { case (predictions, labels) =>
     if (predictions.size > 0) {
       predictions.intersect(labels).size.toDouble / predictions.size
-    } else 0
+    } else {
+      0
+    }
   }.sum / numDocs
 
   /**
-   * Returns Document-based Recall averaged by the number of documents
-   */
-  lazy val macroRecallDoc: Double = predictionAndLabels.map { case (predictions, labels) =>
-    labels.intersect(predictions).size.toDouble / labels.size}.sum / numDocs
-
-  /**
-   * Returns Document-based F1-measure averaged by the number of documents
+   * Returns document-based recall averaged by the number of documents
    */
-  lazy val macroF1MeasureDoc: Double = predictionAndLabels.map { case (predictions, labels) =>
-    2.0 * predictions.intersect(labels).size / (predictions.size + labels.size)}.sum / numDocs
-
-  /**
-   * Returns micro-averaged document-based Precision
-   * (equals to label-based microPrecision)
-   */
-  lazy val microPrecisionDoc: Double = microPrecisionClass
-
-  /**
-   * Returns micro-averaged document-based Recall
-   * (equals to label-based microRecall)
-   */
-  lazy val microRecallDoc: Double = microRecallClass
+  lazy val recall: Double = predictionAndLabels.map { case (predictions, labels) =>
+    labels.intersect(predictions).size.toDouble / labels.size
+  }.sum / numDocs
 
   /**
-   * Returns micro-averaged document-based F1-measure
-   * (equals to label-based microF1measure)
+   * Returns document-based f1-measure averaged by the number of documents
    */
-  lazy val microF1MeasureDoc: Double = microF1MeasureClass
+  lazy val f1Measure: Double = predictionAndLabels.map { case (predictions, labels) =>
+    2.0 * predictions.intersect(labels).size / (predictions.size + labels.size)
+  }.sum / numDocs
 
   private lazy val tpPerClass = predictionAndLabels.flatMap { case (predictions, labels) =>
-    predictions.intersect(labels).map(category => (category, 1))}.reduceByKey(_ + _).collectAsMap()
+    predictions.intersect(labels)
+  }.countByValue()
 
-  private lazy val fpPerClass = predictionAndLabels.flatMap { case(predictions, labels) =>
-    predictions.diff(labels).map(category => (category, 1))}.reduceByKey(_ + _).collectAsMap()
+  private lazy val fpPerClass = predictionAndLabels.flatMap { case (predictions, labels) =>
+    predictions.diff(labels)
+  }.countByValue()
 
-  private lazy val fnPerClass = predictionAndLabels.flatMap{ case(predictions, labels) =>
-    labels.diff(predictions).map(category => (category, 1))}.reduceByKey(_ + _).collectAsMap()
+  private lazy val fnPerClass = predictionAndLabels.flatMap { case(predictions, labels) =>
+    labels.diff(predictions)
+  }.countByValue()
 
   /**
-   * Returns Precision for a given label (category)
+   * Returns precision for a given label (category)
    * @param label the label.
    */
-  def precisionClass(label: Double) = {
+  def precision(label: Double) = {
     val tp = tpPerClass(label)
-    val fp = fpPerClass.getOrElse(label, 0)
+    val fp = fpPerClass.getOrElse(label, 0L)
     if (tp + fp == 0) 0 else tp.toDouble / (tp + fp)
   }
 
   /**
-   * Returns Recall for a given label (category)
+   * Returns recall for a given label (category)
    * @param label the label.
    */
-  def recallClass(label: Double) = {
+  def recall(label: Double) = {
     val tp = tpPerClass(label)
-    val fn = fnPerClass.getOrElse(label, 0)
+    val fn = fnPerClass.getOrElse(label, 0L)
     if (tp + fn == 0) 0 else tp.toDouble / (tp + fn)
   }
 
   /**
-   * Returns F1-measure for a given label (category)
+   * Returns f1-measure for a given label (category)
    * @param label the label.
    */
-  def f1MeasureClass(label: Double) = {
-    val precision = precisionClass(label)
-    val recall = recallClass(label)
-    if((precision + recall) == 0) 0 else 2 * precision * recall / (precision + recall)
+  def f1Measure(label: Double) = {
+    val p = precision(label)
+    val r = recall(label)
+    if((p + r) == 0) 0 else 2 * p * r / (p + r)
   }
 
-  private lazy val sumTp = tpPerClass.foldLeft(0L){ case (sum, (_, tp)) => sum + tp}
-  private lazy val sumFpClass = fpPerClass.foldLeft(0L){ case (sum, (_, fp)) => sum + fp}
-  private lazy val sumFnClass = fnPerClass.foldLeft(0L){ case (sum, (_, fn)) => sum + fn}
+  private lazy val sumTp = tpPerClass.foldLeft(0L) { case (sum, (_, tp)) => sum + tp }
+  private lazy val sumFpClass = fpPerClass.foldLeft(0L) { case (sum, (_, fp)) => sum + fp }
+  private lazy val sumFnClass = fnPerClass.foldLeft(0L) { case (sum, (_, fn)) => sum + fn }
 
   /**
-   * Returns micro-averaged label-based Precision
+   * Returns micro-averaged label-based precision
+   * (equals to micro-averaged document-based precision)
    */
-  lazy val microPrecisionClass = {
+  lazy val microPrecision = {
     val sumFp = fpPerClass.foldLeft(0L){ case(cum, (_, fp)) => cum + fp}
     sumTp.toDouble / (sumTp + sumFp)
   }
 
   /**
-   * Returns micro-averaged label-based Recall
+   * Returns micro-averaged label-based recall
+   * (equals to micro-averaged document-based recall)
    */
-  lazy val microRecallClass = {
+  lazy val microRecall = {
     val sumFn = fnPerClass.foldLeft(0.0){ case(cum, (_, fn)) => cum + fn}
     sumTp.toDouble / (sumTp + sumFn)
   }
 
   /**
-   * Returns micro-averaged label-based F1-measure
+   * Returns micro-averaged label-based f1-measure
+   * (equals to micro-averaged document-based f1-measure)
+   */
+  lazy val microF1Measure = 2.0 * sumTp / (2 * sumTp + sumFnClass + sumFpClass)
+
+  /**
+   * Returns the sequence of labels in ascending order
    */
-  lazy val microF1MeasureClass = 2.0 * sumTp / (2 * sumTp + sumFnClass + sumFpClass)
+  lazy val labels: Array[Double] = tpPerClass.keys.toArray.sorted
 }
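
Reviewer note, not part of the patch: the hammingLoss rewrite above replaces the two set differences with an algebraically equal expression, since for duplicate-free label arrays |L \ P| + |P \ L| = |L| + |P| - 2 * |L ∩ P|. A minimal Scala sketch of that identity on one made-up document:

    val labels = Array(0.0, 1.0, 2.0)  // hypothetical true labels
    val predictions = Array(0.0, 2.0)  // hypothetical predicted labels
    // Old numerator: sizes of the two set differences.
    val oldNumerator = labels.diff(predictions).size + predictions.diff(labels).size
    // New numerator: |labels| + |predictions| - 2 * |intersection|.
    val newNumerator = labels.size + predictions.size - 2 * labels.intersect(predictions).size
    assert(oldNumerator == newNumerator)  // both count the symmetric difference: 1 here

Both forms count the symmetric difference of the two label sets; the new one needs a single intersect instead of two diffs.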
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
index 5ace9d9a59d6e..342baa0274e9c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/evaluation/MultilabelMetricsSuite.scala
@@ -80,23 +80,24 @@ class MultilabelMetricsSuite extends FunSuite with LocalSparkContext {
     val hammingLoss = (1.0 / (7 * 3)) * (2 + 2 + 1 + 0 + 0 + 1 + 1)
     val strictAccuracy = 2.0 / 7
     val accuracy = 1.0 / 7 * (1.0 / 3 + 1.0 /3 + 0 + 1.0 / 1 + 2.0 / 2 + 2.0 / 3 + 1.0 / 2)
-    assert(math.abs(metrics.precisionClass(0.0) - precision0) < delta)
-    assert(math.abs(metrics.precisionClass(1.0) - precision1) < delta)
-    assert(math.abs(metrics.precisionClass(2.0) - precision2) < delta)
-    assert(math.abs(metrics.recallClass(0.0) - recall0) < delta)
-    assert(math.abs(metrics.recallClass(1.0) - recall1) < delta)
-    assert(math.abs(metrics.recallClass(2.0) - recall2) < delta)
-    assert(math.abs(metrics.f1MeasureClass(0.0) - f1measure0) < delta)
-    assert(math.abs(metrics.f1MeasureClass(1.0) - f1measure1) < delta)
-    assert(math.abs(metrics.f1MeasureClass(2.0) - f1measure2) < delta)
-    assert(math.abs(metrics.microPrecisionClass - microPrecisionClass) < delta)
-    assert(math.abs(metrics.microRecallClass - microRecallClass) < delta)
-    assert(math.abs(metrics.microF1MeasureClass - microF1MeasureClass) < delta)
-    assert(math.abs(metrics.macroPrecisionDoc - macroPrecisionDoc) < delta)
-    assert(math.abs(metrics.macroRecallDoc - macroRecallDoc) < delta)
-    assert(math.abs(metrics.macroF1MeasureDoc - macroF1MeasureDoc) < delta)
+    assert(math.abs(metrics.precision(0.0) - precision0) < delta)
+    assert(math.abs(metrics.precision(1.0) - precision1) < delta)
+    assert(math.abs(metrics.precision(2.0) - precision2) < delta)
+    assert(math.abs(metrics.recall(0.0) - recall0) < delta)
+    assert(math.abs(metrics.recall(1.0) - recall1) < delta)
+    assert(math.abs(metrics.recall(2.0) - recall2) < delta)
+    assert(math.abs(metrics.f1Measure(0.0) - f1measure0) < delta)
+    assert(math.abs(metrics.f1Measure(1.0) - f1measure1) < delta)
+    assert(math.abs(metrics.f1Measure(2.0) - f1measure2) < delta)
+    assert(math.abs(metrics.microPrecision - microPrecisionClass) < delta)
+    assert(math.abs(metrics.microRecall - microRecallClass) < delta)
+    assert(math.abs(metrics.microF1Measure - microF1MeasureClass) < delta)
+    assert(math.abs(metrics.precision - macroPrecisionDoc) < delta)
+    assert(math.abs(metrics.recall - macroRecallDoc) < delta)
+    assert(math.abs(metrics.f1Measure - macroF1MeasureDoc) < delta)
     assert(math.abs(metrics.hammingLoss - hammingLoss) < delta)
-    assert(math.abs(metrics.strictAccuracy - strictAccuracy) < delta)
+    assert(math.abs(metrics.subsetAccuracy - strictAccuracy) < delta)
     assert(math.abs(metrics.accuracy - accuracy) < delta)
+    assert(metrics.labels.sameElements(Array(0.0, 1.0, 2.0)))
   }
 }
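
For reference, a minimal usage sketch of the renamed API (illustrative only; it assumes a spark-shell session where sc is the active SparkContext, and the prediction/label pairs are made-up data):

    import org.apache.spark.mllib.evaluation.MultilabelMetrics
    import org.apache.spark.rdd.RDD

    // Each element is (predicted labels, true labels) for one document.
    val predictionAndLabels: RDD[(Array[Double], Array[Double])] = sc.parallelize(Seq(
      (Array(0.0, 1.0), Array(0.0, 2.0)),
      (Array(0.0, 2.0), Array(0.0, 2.0))))

    val metrics = new MultilabelMetrics(predictionAndLabels)
    println(metrics.subsetAccuracy)         // was strictAccuracy
    println(metrics.precision)              // document-based average; was macroPrecisionDoc
    println(metrics.microF1Measure)         // was microF1MeasureClass
    println(metrics.precision(0.0))         // per-label; was precisionClass(0.0)
    println(metrics.labels.mkString(", "))  // new: distinct labels in ascending order

The per-document averages and the per-label methods now share the plain names precision/recall/f1Measure and are distinguished by arity.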