From 77bb66cdd97ba29a79238f08dc41b00019aeac35 Mon Sep 17 00:00:00 2001
From: Yao Qing
Date: Fri, 6 Sep 2024 12:57:12 +0800
Subject: [PATCH] Add README for codegen acc test. (#110)

Signed-off-by: Yao, Qing
---
 .../api_evaluator.py       |  3 +
 examples/CodeGen/README.md | 92 +++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 examples/CodeGen/README.md

diff --git a/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
index e6078764..b6faa5b1 100644
--- a/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
+++ b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
@@ -16,6 +16,9 @@ def generate_text(self, task_name, intermediate_generations=None):
         dataset = task.get_dataset()
         # if args.limit is None, use all samples
         # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
+
+        # TODO: Only running a task in its entirety is supported for now;
+        # using the limit or limit_start parameters will produce inaccurate results.
         n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
         print(n_tasks)
         # when args.limit is None
diff --git a/examples/CodeGen/README.md b/examples/CodeGen/README.md
new file mode 100644
index 00000000..5d118967
--- /dev/null
+++ b/examples/CodeGen/README.md
@@ -0,0 +1,92 @@
+# CodeGen Accuracy Evaluation
+
+## Evaluation Framework
+We evaluate accuracy with [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness), a framework for evaluating code generation models.
+
+## Evaluation FAQs
+
+### Launch the CodeGen microservice
+Please refer to [CodeGen Examples](https://github.com/opea-project/GenAIExamples/tree/main/CodeGen) and follow the guide there to deploy the CodeGen megaservice.
+
+Use a cURL command to test the CodeGen service and confirm that it has started properly:
+
+```bash
+export CODEGEN_ENDPOINT="http://${your_ip}:7778/v1/codegen"
+curl $CODEGEN_ENDPOINT \
+  -H "Content-Type: application/json" \
+  -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}'
+```
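+
+The same check can be done from Python. This is a minimal sketch, not part of the evaluation harness itself; it assumes only the endpoint and payload shown in the cURL example above, plus the `requests` package:
+
+```python
+import os
+
+import requests
+
+# Same endpoint as in the cURL example above; replace your_ip with the host
+# where the CodeGen megaservice is deployed.
+endpoint = os.environ.get("CODEGEN_ENDPOINT", "http://your_ip:7778/v1/codegen")
+payload = {
+    "messages": "Implement a high-level API for a TODO list application. "
+    "The API takes as input an operation request and updates the TODO list "
+    "in place. If the request is invalid, raise an exception."
+}
+
+response = requests.post(endpoint, json=payload, timeout=120)
+response.raise_for_status()
+# Print the raw body; the exact response format depends on the deployment
+# (the service may stream its reply in chunks).
+print(response.text)
+```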
+
+### Generation and Evaluation
+
+For evaluating models on coding tasks, and coding LLMs in particular, we follow the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide command-line and function-call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) are available, in both completion (left-to-right) and insertion (FIM) modes.
+
+#### Command-line usage
+
+```shell
+cd evals/evaluation/bigcode_evaluation_harness/examples
+python main.py --model Qwen/CodeQwen1.5-7B-Chat \
+  --tasks humaneval \
+  --codegen_url $CODEGEN_ENDPOINT \
+  --max_length_generation 2048 \
+  --batch_size 1 \
+  --save_generations \
+  --save_references \
+  --allow_code_execution
+```
+
+***Note:*** Currently, our framework only executes tasks in their entirety. To ensure accurate results, do not use the `limit` or `limit_start` parameters to restrict the number of test samples.
+
+### Accuracy Result
+
+Here is a tested result for reference:
+
+```json
+{
+  "humaneval": {
+    "pass@1": 0.7195121951219512
+  },
+  "config": {
+    "prefix": "",
+    "do_sample": true,
+    "temperature": 0.2,
+    "top_k": 0,
+    "top_p": 0.95,
+    "n_samples": 1,
+    "eos": "<|endoftext|>",
+    "seed": 0,
+    "model": "Qwen/CodeQwen1.5-7B-Chat",
+    "modeltype": "causal",
+    "peft_model": null,
+    "revision": null,
+    "use_auth_token": false,
+    "trust_remote_code": false,
+    "tasks": "humaneval",
+    "instruction_tokens": null,
+    "batch_size": 1,
+    "max_length_generation": 2048,
+    "precision": "fp32",
+    "load_in_8bit": false,
+    "load_in_4bit": false,
+    "left_padding": false,
+    "limit": null,
+    "limit_start": 0,
+    "save_every_k_tasks": -1,
+    "postprocess": true,
+    "allow_code_execution": true,
+    "generation_only": false,
+    "load_generations_path": null,
+    "load_data_path": null,
+    "metric_output_path": "evaluation_results.json",
+    "save_generations": true,
+    "load_generations_intermediate_paths": null,
+    "save_generations_path": "generations.json",
+    "save_references": true,
+    "save_references_path": "references.json",
+    "prompt": "prompt",
+    "max_memory_per_gpu": null,
+    "check_references": false,
+    "codegen_url": "http://192.168.123.104:31234/v1/codegen"
+  }
+}
+```
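+
+For context on reading the score: with `"n_samples": 1`, `pass@1` is simply the fraction of problems whose single generated sample passes all unit tests, and 0.7195121951219512 corresponds to 118 of HumanEval's 164 problems. For `n_samples > 1`, scores are computed with the standard unbiased pass@k estimator from the Codex paper (Chen et al., 2021); the sketch below restates that formula for reference and is not the harness's own code path:
+
+```python
+import numpy as np
+
+def pass_at_k(n: int, c: int, k: int) -> float:
+    """Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k).
+
+    n: total samples generated for a problem
+    c: samples that pass all unit tests
+    k: evaluation budget
+    """
+    if n - c < k:
+        return 1.0  # every size-k subset contains at least one passing sample
+    # Numerically stable product form of 1 - C(n - c, k) / C(n, k).
+    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
+
+print(pass_at_k(n=1, c=1, k=1))  # 1.0: the single sample passed
+print(round(118 / 164, 4))       # 0.7195, matching the result above
+```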