repoqa.html

<!doctype html>
<html>
  <link rel="preconnect" href="https://fonts.googleapis.com" />
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
  <link
    href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@100;400&display=swap"
    rel="stylesheet"
  />

  <head>
    <meta charset="UTF-8" />
    <title>RepoQA for Evaluating Long-Context Code Understanding</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/echarts@5.3.3/dist/echarts.min.js"></script>
    <link
      rel="icon"
      href="https://images.emojiterra.com/google/noto-emoji/unicode-15/color/1024px/1f9d1-1f4bb.png"
    />
    <link
      rel="stylesheet"
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0/dist/css/bootstrap.min.css"
    />

    <link href="https://cdn.jsdelivr.net/npm/prismjs@v1.x/themes/prism.css" rel="stylesheet" />
    <script src="https://cdn.jsdelivr.net/npm/prismjs@v1.x/components/prism-core.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/prismjs@v1.x/plugins/autoloader/prism-autoloader.min.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/prismjs-bibtex@2.1.0/prism-bibtex.min.js"></script>

    <style>
      .slidecontainer {
        width: 100%;
      }

      .slider {
        -webkit-appearance: none;
        appearance: none;
        width: 70%;
        height: 15px;
        border-radius: 5px;
        background: #d3d3d3;
        outline: none;
        opacity: 0.7;
        -webkit-transition: 0.2s;
        transition: opacity 0.2s;
      }

      .slider::-webkit-slider-thumb {
        -webkit-appearance: none;
        appearance: none;
        width: 25px;
        height: 25px;
        border-radius: 50%;
        background: #04aa6d;
        cursor: pointer;
      }

      .slider::-moz-range-thumb {
        width: 25px;
        height: 25px;
        border-radius: 50%;
        background: #04aa6d;
        cursor: pointer;
      }

      .slider-title-container {
        margin-bottom: 10px;
        color: #04aa6d;
      }

      #sliderValue {
        background-color: #dddddd; /* Just for visibility */
        padding: 0 10px; /* Optional styling */
        border-radius: 5px; /* Optional styling */
      }

      body {
        font-family: "JetBrains Mono", monospace;
        background-color: #ffffff;
        color: #000000;
      }

      #content {
        width: 60%;
      }

      th,
      td {
        text-align: left;
        width: fit-content;
        font-size: larger;
      }

      #notes {
        font-size: 1em;
      }

      #notes h3 {
        margin-top: 1em;
        font-size: 2em;
        text-align: center;
      }

      #notes li {
        font-size: 1.2em;
        font-weight: 300;
        margin: 1em;
      }

      .form-select {
        font-size: 1em;
      }

      .slider-title {
        font-size: 1em;
        margin: 0.5em;
        color: gray;
      }

      @media screen and (max-width: 1400px) {
        body {
          font-size: 1.6vw;
        }

        #content {
          width: 100%;
        }

        h1 {
          font-size: 2em;
        }

        h2 {
          font-size: 1.6em;
        }

        h3 {
          font-size: 1.2em;
        }

        table {
          font-size: small;
        }
      }
    </style>
  </head>

  <body>
    <div
      id="content"
      class="container-fluid d-flex flex-column align-items-center gap-3"
    >
      <h1 class="text-nowrap mt-5">
        <b>💬 RepoQA</b>
      </h1>
      <div id="warning">
        🚩The <i>First</i> Benchmark for Long-Context Code Understanding.🚩<br />
      </div>
      <div class="d-flex flex-row justify-content-center gap-3">
        <a href="https://arxiv.org/abs/2406.06025"
          ><img
            src="https://img.shields.io/badge/arXiv-2406.06025-b31b1b.svg?style=for-the-badge"
            alt="arxiv"
            class="img-fluid" /></a
        ><a href="https://github.com/evalplus/repoqa"
          ><img
            src="https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white"
            alt="github"
            class="img-fluid" /></a
        ><a href="https://pypi.org/project/repoqa"
          ><img
            alt="PyPI - Version"
            src="https://img.shields.io/pypi/v/repoqa?style=for-the-badge&labelColor=black"
            class="img-fluid"
          />
        </a>
      </div>
      <div class="container-fluid d-flex flex-row flex-nowrap">
        <div class="container-fluid d-flex flex-column align-items-center">
          <p>
            RepoQA aims to create a series of long-context code understanding tasks
            to challenge chat/instruction models for code:
          </p>
          <ul>
            <li>
              <b>Multi-Lingual:</b> RepoQA covers 50 high-quality respositories
              from 5 programming langauges.
            </li>
            <li>
              <b>Application-Driven:</b> While <i>"Needle in the Code"</i> by
              <a href="https://qwenlm.github.io/blog/codeqwen1.5/">CodeQwen</a>
              uses a synthetic task to examine the vulnerable parts over the
              LLM's long context, RepoQA focuses on tasks that can reflect
              real-world uses.
            </li>
            <li>
              <b>🔍 Searching Needle Function (<a href="#task-snf">🔗</a>):</b>
              Search a function given its description.
            </li>
            <li>
              🚧 RepoQA is still under development... More types of QA tasks are
              coming soon... Stay tuned!
            </li>
          </ul>
          <pre style="padding-top: 0; padding-bottom: 0;">
          <code class="language-bash">
# Using RepoQA is super easy
pip install "repoqa[vllm]"
# RepoQA supports 5 backends
repoqa.search_needle_function --backend openai    --model "gpt4-turbo"
repoqa.search_needle_function --backend anthropic --model "claude-3-haiku-20240307"
repoqa.search_needle_function --backend vllm      --model "Qwen/CodeQwen1.5-7B-Chat"
repoqa.search_needle_function --backend hf        --model "Qwen/CodeQwen1.5-7B-Chat"
repoqa.search_needle_function --backend google    --model "gemini-1.5-pro-latest"</code>
          </pre>
          <h2 id="task-snf" class="text-nowrap mt-5">
            <b>🔎 Searching Needle Function (SNF)</b>
          </h2>
          <p>
            <b>Overview:</b>
            This task ask the model to retrieve 10 needle functions from each of
            5 langauges x 10 repositories (500 sub-tasks/tests). Each time the
            model is given a long chunk of source code (following import
            dependency) and a precise function description, and we ask the model
            to find the function in the context that corresponds to the
            description. More details can be found at
            <a href="#task-snf-how">🔗How It Works</a>.
          </p>
          <h3 class="text-nowrap mt-5">🏆 Benchmark @ 16K Code Context</h3>
          <p>
            🛠️ <b>Config:</b> The code in the prompt is fixed to 16K tokens (by
            CodeLlama tokenizer). Yet, the required context is a bit larger than
            16K so we extend 8K and 16K models using either
            <a
              href="https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/"
              >Dynamic RoPE Scaling</a
            >
            or just no scaling -- whichever is better. For example, RoPE scaling
            makes Llama 3 models substaintially better and CodeLlama-13B slight
            better (Credit to @abacaj for the finding!).
          </p>
          <p>
            <b>📝 Note:</b> SNF is an elementary test focusing on testing LLMs'
            capabilities on long-context code understanding and retrieval. It
            does not lead to simple conclusions like
            <i>"model X is better than Model Y (on everything)"</i>. It's a
            start-point task, and we will include more challenging tasks in the
            future. :D
          </p>
          <div class="slider-title">
            Drag 🟢 to select the threshold of match similarity (larger ->
            closer to exact match)
          </div>
          <div
            class="slider-container d-flex flex-row justify-content-center align-items-center gap-3"
            style="width: 70%"
          >
            <input
              type="range"
              id="fixedPointSlider"
              min="0"
              max="10"
              step="1"
              value="8"
              ,
              class="slider"
            />
            <div>thresh =</div>
            <div id="sliderValue">0.8</div>
          </div>
          <br />
          <table
            id="16k"
            class="table table-responsive table-striped table-bordered flex-shrink-1 border border-3"
          ></table>
          <h3 id="task-snf-how" class="text-nowrap mt-5">How It Works</h3>
          <p>
            SNF includes 500 sub-tasks from 5 languages x 10 repositories x 10
            needles. The prompt and expected output are demonstrated in the
            following figure:
          </p>
          <img
            src="assets/RepoQA-CTX.svg"
            style="width: 100%; max-width: 1000px"
          />
          <p>
            The evaluator passes a test if the model generated function:
            <i>(i)</i> has the highest similarity to the ground-truth compared
            to all other functions; <i>(ii)</i> the similarity is above certain
            threshold (default is 0.8 but can be user defined). By default, we
            define the similarity using BLEU score (method 4).
          </p>
          <p>
            The curation of the dataset includes four steps: <i>(i)</i> select
            permissive repositories based on some quality-based metrics;
            <i>(ii)</i> collect source code content and analyze their file
            dependency; <i>(iii)</i> use tree-sitter to parse all functions and
            select a subset of them as needle functions; <i>(iv)</i> prompt
            GPT-4 Turbo to generate function description for the needle.
            Detailed information and scripts for dataset curation can be found
            at our
            <a
              href="https://github.com/evalplus/repoqa/blob/main/docs/curate_dataset.md"
              >GitHub repo</a
            >.
          </p>
          <h2 id="faq" class="text-nowrap mt-5">🙋🏻‍♀️ FAQ</h2>
          <h3 id="yet-another" class="text-nowrap mt-5">
            Just yet another needle test?
          </h3>
          <ul>
            <li>
              SNF is a variant of needle test and is part of RepoQA as the elementary test:
              <b
                >if a model can't pass SNF, don't expect it to pass more
                challenging tasks.</b
              >
            </li>
            <li>
              Unlike vanilla needle tests which use single test to perform fully
              synthetic retrieval, SNF is a multi-lingual, application-driven,
              and comprehensive test that require LLMs to understand NL
              description before retrieval, which aligns with the use of
              advanced code search.
            </li>
          </ul>
          <h3 id="non-determinism" class="text-nowrap mt-5">Non-determinism</h3>
          <p>
            In theory as we use greedy decoding, the results should be
            deterministic. In practice, the results may slightly vary: (i) for
            OpenAI/Anthropic models, they do not seem to be deterministic all
            the time (Thanks to @scottinallcaps); and (ii) for local inference,
            the library configuration such as tensor parallelism sizes may also
            slightly impact reproducibility.
          </p>
          <h3 id="limit" class="text-nowrap mt-5">Known limitations</h3>
          <ul>
            <li>
              The current description is made verbose to avoid one description
              being mapped to multiple functions. However, in real world,
              developers may naturally use short description for code search.
              (Thanks @chrisgorgo for the suggestion!)
            </li>
          </ul>
          <h2 id="sponsor" class="text-nowrap mt-5">🖊️ Citation</h2>
          <div class="col-md-12 overflow-auto">
            <pre style="padding-top: 0; padding-bottom: 0;">
            <code class="language-bibtex">
@inproceedings{repoqa,
  title={Repo{QA}: Evaluating Long Context Code Understanding},
  author={Jiawei Liu and Jia Le Tian and Vijay Daita and Yuxiang Wei and Yifeng Ding and Yuhan Katherine Wang and Jun Yang and LINGMING ZHANG},
  booktitle={First Workshop on Long-Context Foundation Models @ ICML 2024},
  year={2024},
  url={https://openreview.net/forum?id=hK9YSrFuGf}
}</code>
            </pre>
          </div>
          <h2 id="sponsor" class="text-nowrap mt-5">🤗 Acknowledgment</h2>
          <p>
            Part of the compute is generously provided by <a href="https://deepmind.google/">Google DeepMind</a>
            and <a href="https://wandb.ai/site">Weights & Biases</a>.
          </p>
        </div>
      </div>
    </div>

    <script>
      const contextTable = document.getElementById("16k");
      const linkMapping = new Map([]);
      const hfLinkPrefix = "https://huggingface.co/";
      const dataUrlPrefix = "results/repoqa";
      const correctColor = "rgba(72, 200, 120",
        incorrectColor = "rgba(200, 53, 50";

      // Load data
      var data = [];
      var dataUrl = dataUrlPrefix + "/COMBINED-RESULTS.json";
      var xhr = new XMLHttpRequest();
      xhr.open("GET", dataUrl, false); // false makes the request synchronous
      xhr.send();

      if (xhr.status === 200) {
        var scores = JSON.parse(xhr.responseText);
        var scoresMap = new Map(Object.entries(scores));
        scoresMap.forEach((value, modelId) => {
          var result = {
            Model: modelId.split("/").pop(),
            ...value,
          };
          data = data.concat(result);

          if (modelId.includes("/")) {
            modelId = modelId.split("/");
            modelOrg = modelId[0];
            modelId = modelId[1];
            url = hfLinkPrefix + modelOrg + "/" + modelId;
            linkMapping.set(modelId, url);
          } else if (modelId.startsWith("gpt-4-")) {
            linkMapping.set(
              modelId,
              "https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4",
            );
          } else if (modelId.startsWith("gpt-3.5-")) {
            linkMapping.set(
              modelId,
              "https://platform.openai.com/docs/models/gpt-3-5-turbo",
            );
          } else if (modelId.startsWith("claude-3-")) {
            linkMapping.set(
              modelId,
              "https://www.anthropic.com/news/claude-3-family",
            );
          } else if (modelId.startsWith("gemini-1.5-pro")) {
            linkMapping.set(
              modelId,
              "https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#sundar-note",
            );
          } else if (modelId.startsWith("gemini-1.5-flash")) {
            linkMapping.set(
              modelId,
              "https://deepmind.google/technologies/gemini/flash/",
            );
          } else if (modelId.startsWith("gpt-4o-")) {
            linkMapping.set(modelId, "https://openai.com/index/hello-gpt-4o/");
          }
        });
      } else {
        alert(
          "Failed to load data from " + dataUrl + ". Please try again later.",
        );
      }

      const globalData = data;
      var scoreThresh = 0.8;

      // each row represents a model
      const theaders = [
        "#", // rank
        "Model", // model name
        "Claimed Ctx", // Claimed ctx
        "Score (%)", // score of each language
      ];

      const lang_map = {
        python: ".py",
        cpp: ".cpp",
        rust: ".rs",
        java: ".java",
        typescript: ".ts",
        all: "🏆Avg.",
      };

      const displayTable = (table) => {
        var thead = document.createElement("thead");
        var headerRow = document.createElement("tr");
        var langs = Object.keys(lang_map);

        // headers
        theaders.forEach(function (header) {
          var th = document.createElement("th");
          th.classList.add("text-nowrap");
          if (header == "Score (%)") {
            for (var i = 0; i < langs.length; i++) {
              var th = document.createElement("th");
              th.style.textAlign = "right";
              th.textContent = lang_map[langs[i]] + " (%)";
              if (langs[i] == "all") {
                th.style.backgroundColor = "#EEFFEE";
              }
              headerRow.appendChild(th);
            }
          } else if (header == "Claimed Ctx") {
            th.style.textAlign = "right";
            th.textContent = header;
            headerRow.appendChild(th);
          } else {
            th.textContent = header;
            headerRow.appendChild(th);
          }
        });
        thead.appendChild(headerRow);
        table.appendChild(thead);

        var tbody = document.createElement("tbody");
        // add rank
        var rank = 0;
        var last_best = null;
        var n_last_best = 1;
        data.forEach((row) => {
          var dataRow = document.createElement("tr");
          // rank
          var rankCell = document.createElement("td");
          var cur_score = row["scores"]["all"][scoreThresh]["pass@1"];
          if (last_best != cur_score) {
            rank += n_last_best;
            last_best = cur_score;
            rankCell.textContent = rank;
            n_last_best = 1;
          } else {
            n_last_best += 1;
          }
          dataRow.appendChild(rankCell);

          // model name
          var modelCell = document.createElement("td");
          var modelLink = document.createElement("a");
          if (rank == 1) {
            modelLink.textContent = "🥇 " + row["Model"];
          } else if (rank == 2) {
            modelLink.textContent = "🥈 " + row["Model"];
          } else if (rank == 3) {
            modelLink.textContent = "🥉 " + row["Model"];
          } else {
            modelLink.textContent = row["Model"];
          }
          if (linkMapping.has(row["Model"])) {
            modelLink.href = linkMapping.get(row["Model"]);
          }
          modelLink.classList.add("link-underline-primary");
          modelLink.classList.add("text-nowrap");
          modelCell.appendChild(modelLink);
          dataRow.appendChild(modelCell);

          // Train Size
          var trainCell = document.createElement("td");
          trainCell.textContent = row["train_size"];
          trainCell.style.textAlign = "right";
          dataRow.appendChild(trainCell);

          // score
          var rowLangs = Object.keys(row["scores"]);

          var sortedScores = langs
            .map((lang) => ({
              lang: lang,
              score: rowLangs.includes(lang)
                ? 100 * row["scores"][lang][scoreThresh]["pass@1"]
                : null,
            }))
            .filter((item) => item.lang !== "all");

          sortedScores.sort((a, b) => b.score - a.score);

          for (var i = 0; i < langs.length; i++) {
            var lang = langs[i];
            var scoreCell = document.createElement("td");
            if (rowLangs.includes(lang)) {
              scoreCell.textContent = (
                100 * row["scores"][lang][scoreThresh]["pass@1"]
              ).toFixed(1);
              // remove suffix of .0
              if (lang != "all" && scoreCell.textContent.endsWith(".0")) {
                scoreCell.textContent = scoreCell.textContent.slice(0, -2);
              }
            } else {
              scoreCell.textContent = "N/A";
            }
            scoreCell.style.textAlign = "right";
            if (lang != "all") {
              scoreCell.style.color = "gray";
            } else {
              scoreCell.style.fontWeight = "bold";
              scoreCell.style.backgroundColor = "#EEFFEE";
            }
            const good_color = "#32CD32";
            if (lang == sortedScores[0]["lang"] && sortedScores[0]["score"]) {
              scoreCell.textContent = "👍" + scoreCell.textContent;
              scoreCell.style.color = good_color;
              scoreCell.style.fontWeight = "bold";
            } else if (
              lang == sortedScores[1]["lang"] &&
              sortedScores[0]["score"]
            ) {
              scoreCell.style.color = good_color;
              scoreCell.style.fontWeight = "bold";
            }
            dataRow.appendChild(scoreCell);
          }
          tbody.appendChild(dataRow);
        });
        table.appendChild(tbody);
      };

      const clearTable = () => {
        contextTable.innerHTML = "";
      };

      var chartOption = {
        grid: {
          top: 0,
          left: 8,
          right: 8,
          bottom: 20,
        },
        xAxis: {
          type: "value",
          boundaryGap: false,
          min: 0,
          max: 16384,
          interval: 2048,
          axisLabel: {
            show: true,
            margin: 5,
            formatter: function (value, index) {
              if (value == 0) {
                return "0";
              }
              return value / 1024 + "k";
            },
            fontSize: 10,
          },
          splitLine: {
            lineStyle: {
              color: "gray",
              type: "dashed",
            },
          },
        },
        yAxis: {
          type: "value",
          max: 1,
          show: false,
        },
        visualMap: {
          type: "piecewise",
          show: false,
          dimension: 0,
          pieces: [],
        },
        series: [
          {
            type: "line",
            symbol: "none",
            lineStyle: {
              color: correctColor + ", 0.3)",
              width: 0,
            },
            markLine: {
              symbol: ["none", "none"],
              label: { show: false },
            },
            areaStyle: {},

            data: [],
          },
          {
            type: "bar",
            symbol: "none",
            lineStyle: {
              color: incorrectColor + ", 0.3)",
              width: 0,
            },
            markLine: {
              symbol: ["none", "none"],
              label: { show: false },
            },
            areaStyle: {},

            data: [],
          },
        ],
      };

      const getChartMetaData = (results, color) => {
        if (results.length == 0) {
          return {
            chartPoints: [],
            visualMapPieces: [],
          };
        }

        // calculate the changes
        var changes = [];
        for (var i = 0; i < results.length; i++) {
          start = results[i]["position"]["token_start"];
          end = results[i]["position"]["token_end"];
          if (end >= changes.length) {
            for (var j = changes.length; j <= end; j++) {
              changes.push(0);
            }
          }
          changes[start] += 1;
          changes[end] -= 1;
        }

        // calculate the points
        var points = [];
        var y = 0,
          max_y = 0;
        for (var x = 0; x < changes.length; x++) {
          if (changes[x] != 0) {
            y += changes[x];
            points.push([x, y]);
            if (y > max_y) {
              max_y = y;
            }
          }
        }

        // calculate the chart points
        var y = 0;
        var chartPoints = [];
        var visualMapPieces = [];
        for (var i = 0; i < points.length; i++) {
          if (points[i][1] > 0 && y == 0) {
            chartPoints.push([points[i][0], y]);
            y = 1;
            chartPoints.push([points[i][0], y]);
          }
          if (points[i][1] == 0 && y > 0) {
            chartPoints.push([points[i][0], y]);
            y = 0;
            chartPoints.push([points[i][0], y]);
          }
          if (i > 0) {
            visualMapPieces.push({
              gt: points[i - 1][0],
              lt: points[i][0],
              // change the opacity based on the relative score
              color: color + ", " + (0.7 + (points[i][1] / max_y) * 0.3) + ")",
            });
          }
        }

        return {
          chartPoints: chartPoints,
          visualMapPieces: visualMapPieces,
        };
      };

      const displayChart = (chartId, results) => {
        chart = echarts.init(document.getElementById(chartId));
        function partition(array, isValid) {
          return array.reduce(
            ([pass, fail], elem) => {
              return isValid(elem)
                ? [[...pass, elem], fail]
                : [pass, [...fail, elem]];
            },
            [[], []],
          );
        }
        const [correctResults, incorrectResults] = partition(
          results,
          (result) =>
            result["is_best_similar"] &&
            result["best_similar_score"] >= scoreThresh,
        );

        chartMetaData = getChartMetaData(correctResults, correctColor);
        incorrectChartMetaData = getChartMetaData(
          incorrectResults,
          incorrectColor,
        );

        chartOption.series[0].data = chartMetaData.chartPoints;
        chartOption.series[1].data = incorrectChartMetaData.chartPoints;
        chartOption.visualMap.pieces = chartMetaData.visualMapPieces.concat(
          incorrectChartMetaData.visualMapPieces,
        );

        chart.setOption(chartOption);
      };

      const main = () => {
        data.sort((a, b) => {
          return (
            b["scores"]["all"][scoreThresh]["pass@1"] -
            a["scores"]["all"][scoreThresh]["pass@1"]
          );
        });
        clearTable();
        displayTable(contextTable);
      };

      main();
      document.addEventListener("DOMContentLoaded", function () {
        var slider = document.getElementById("fixedPointSlider");
        var output = document.getElementById("sliderValue");
        slider.addEventListener("change", function () {
          var sliderValue = this.value;
          output.textContent = (sliderValue / 10).toFixed(1);
          scoreThresh = (sliderValue / 10).toFixed(1);
          main();
        });
        slider.addEventListener("input", function () {
          var sliderValue = this.value;
          output.textContent = (sliderValue / 10).toFixed(1);
        });
      });
    </script>
  </body>
</html>