def accuracy_by_options(question, prediction, reference):
    """Return 1 if the prediction spells out the text of the correct option.

    Some models answer a multiple-choice question with the content of an
    option instead of its letter. This helper extracts every
    ``<LETTER>. <text>`` option found in ``question`` and checks whether the
    first paragraph of ``prediction`` equals the text of the option whose
    letter is ``reference``.

    Args:
        question: Prompt text that lists the options, one per line, in the
            form ``A. some text``.
        prediction: Raw model output. Only the text before the first blank
            line (``"\\n\\n"``) is compared; surrounding whitespace is ignored.
        reference: Letter of the correct option, e.g. ``"B"``.

    Returns:
        1 when the predicted text matches the referenced option's text,
        otherwise 0.
    """
    # Strip before splitting: generations frequently start with blank lines or
    # a space, which would otherwise leave an empty or padded first paragraph.
    answer = prediction.strip().split("\n\n")[0].strip()
    if not answer:
        # An empty answer must never match an option consisting of whitespace.
        return 0

    # Each match yields (letter, text); the text runs to the end of the line.
    for choice, content in re.findall(r"([A-Z])\. ([^\n]+)", question):
        # Stripping the option text tolerates trailing spaces and "\r" from
        # CRLF-formatted prompts.
        if choice == reference and content.strip() == answer:
            return 1

    return 0