-
Notifications
You must be signed in to change notification settings - Fork 0
/
baseline_evaluation.py
124 lines (101 loc) · 4.54 KB
/
baseline_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from argparse import ArgumentParser
import sys
import os
import numpy as np
### read/write files ###
def read_synonyms(fn_synonyms, skip_first=False):
    """Read a synonyms file and return a list of integer sets.

    Each line of the file holds whitespace-separated integer ids that
    together form one synonym group.

    Parameters:
        fn_synonyms: path of the synonyms file to read.
        skip_first: when True, drop the first line (e.g. a header that only
            contains the number of items).

    Returns:
        list of set[int], one set per kept line.
    """
    # context manager guarantees the handle is closed even if parsing fails
    with open(fn_synonyms, "r") as file_synonyms:
        lines = file_synonyms.readlines()[1 if skip_first else 0:]
    return [set(int(token) for token in line.split()) for line in lines]
def write_file(fn_output, lines):
    """Write the given lines to a newly created file.

    Parameters:
        fn_output: destination path; mode "x" preserves the original
            exclusive-create behavior (raises FileExistsError if the file
            already exists, so results are never silently overwritten).
        lines: iterable of strings, each already newline-terminated as desired.
    """
    # the context manager flushes and closes the file, even on write errors;
    # the explicit flush()/close() pair of the original is no longer needed
    with open(fn_output, "x") as file_output:
        file_output.writelines(lines)
### precision, recall, f-scores ###
def filtered(synonyms, ground_truth):
    """Return the synonym groups that also occur in the ground truth."""
    return [group for group in synonyms if group in ground_truth]
def precision(synonyms, ground_truth):
    """Fraction of found synonym groups that are correct; 0.0 if none found."""
    if not synonyms:
        return 0.0
    hits = sum(1 for group in synonyms if group in ground_truth)
    return hits / len(synonyms)
def recall(synonyms, ground_truth):
    """Fraction of ground-truth groups that were found; 0.0 if truth is empty."""
    if not ground_truth:
        return 0.0
    hits = sum(1 for group in synonyms if group in ground_truth)
    return hits / len(ground_truth)
def pr(synonyms, ground_truth):
    """Return [precision, recall] for the given synonyms vs. the ground truth."""
    p = precision(synonyms, ground_truth)
    r = recall(synonyms, ground_truth)
    return [p, r]
def f_score(p_vals, r_vals, beta=1.0, func_mean=np.mean):
    """Compute the F-beta score from lists of precision and recall values.

    Parameters:
        p_vals: list of precision values.
        r_vals: list of recall values.
        beta: recall weight (beta > 1 favors recall, beta < 1 favors precision).
        func_mean: aggregation applied to p_vals and r_vals (default np.mean).

    Returns:
        float F-beta score; 0.0 when mean precision and mean recall are
        both zero (the unguarded 0/0 previously yielded NaN with numpy
        scalars).
    """
    beta2 = beta ** 2.0
    p = func_mean(p_vals)
    r = func_mean(r_vals)
    denominator = beta2 * p + r
    # guard against 0/0 when there are no correct predictions at all
    if denominator == 0:
        return 0.0
    return (1.0 + beta2) * (p * r / denominator)
def f1(p_vals, r_vals):
    """F1 score: precision and recall weighted equally."""
    return f_score(p_vals, r_vals)  # beta defaults to 1.0
def f05(p_vals, r_vals):
    """F0.5 score: weighs precision higher than recall."""
    return f_score(p_vals, r_vals, 0.5)
def f2(p_vals, r_vals):
    """F2 score: weighs recall higher than precision."""
    return f_score(p_vals, r_vals, 2.0)
### calculate p-r-values from list of synonyms ###
def calc_pr_vals(synonyms, ground_truth):
    """Build a precision-recall curve over top-k prefixes of the synonyms.

    For every prefix synonyms[:k] (k = 1..len(synonyms)) a
    [precision, recall] pair is computed; only the first pair seen for each
    recall value is kept, because for a fixed recall the earliest (shortest)
    prefix has the best precision.

    Parameters:
        synonyms: ranked list of found synonym groups (best first).
        ground_truth: list of correct synonym groups.

    Returns:
        Tuple ([f0.5, f1, f2], pr_vals) where pr_vals is the list of kept
        [precision, recall] pairs.

    Note: the original carried a commented-out branch (and a
    `last_precision` tracker used only by it) that deduplicated by
    precision; both were dead code and have been removed.
    """
    # number of candidate p-r-values equals the number of synonyms
    # (top-k, variable minConf) for a fixed minSup
    pr_vals = []
    last_recall = -1
    for k in range(1, len(synonyms) + 1):
        pr_val = pr(synonyms[:k], ground_truth)
        # keep only the best precision for each recall value: for one recall
        # value the first pr_val has the best precision (the last would have
        # the best recall for one precision)
        if pr_val[1] != last_recall:
            last_recall = pr_val[1]
            pr_vals.append(pr_val)
    # aggregate the curve into the different f-scores
    p_vals = [val[0] for val in pr_vals]
    r_vals = [val[1] for val in pr_vals]
    f05_score = f05(p_vals, r_vals)
    f1_score = f1(p_vals, r_vals)
    f2_score = f2(p_vals, r_vals)
    # return a tuple of two lists: f-scores list and p-r-values list
    return [f05_score, f1_score, f2_score], pr_vals
### main function ###
def main():
    """Evaluate baseline synonyms against ground truth and write results.

    Expects two file paths on the command line; computes f-scores plus a
    precision-recall curve and writes them next to the synonyms file as
    <basename>_fscores.txt and <basename>_evaluation.txt.
    """
    # parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("INPUT_SYNONYMS", type=str,
                        help="The synonyms file which was calculated by baseline.py.")
    parser.add_argument("INPUT_GROUND_TRUTH", type=str,
                        help="The ground-truth synonyms file e.g. which was generated by synonym_inject.py.")
    args = parser.parse_args()
    # validate both input paths before doing any work
    for arg_name in ("INPUT_SYNONYMS", "INPUT_GROUND_TRUTH"):
        if not os.path.exists(getattr(args, arg_name)):
            print(arg_name + " invalid.")
            sys.exit(1)
    # derive the output paths from the synonyms file location
    base = os.path.splitext(os.path.basename(args.INPUT_SYNONYMS))[0]
    out_dir = os.path.dirname(args.INPUT_SYNONYMS)
    fn_fscores = os.path.join(out_dir, base + "_fscores.txt")
    fn_evaluation = os.path.join(out_dir, base + "_evaluation.txt")
    # evaluate; only the ground-truth file carries a header line to skip
    synonyms = read_synonyms(args.INPUT_SYNONYMS, skip_first=False)
    ground_truth = read_synonyms(args.INPUT_GROUND_TRUTH, skip_first=True)
    fscores, pr_vals = calc_pr_vals(synonyms, ground_truth)
    # write one f-score per line, and tab-separated precision/recall pairs
    write_file(fn_fscores, ["%s\n" % score for score in fscores])
    write_file(fn_evaluation, ["%s\t%s\n" % (val[0], val[1]) for val in pr_vals])


if __name__ == "__main__":
    main()