From cea50964251bca07caccc8e7c746446eb71709c8 Mon Sep 17 00:00:00 2001
From: Stan Girard <girard.stanislas@gmail.com>
Date: Mon, 3 Jun 2024 17:45:30 +0200
Subject: [PATCH] feat: Refactor UnstructuredParser and update benchmark results in README.md

---
 README.md                                     |  12 +-
 .../tests/output_tests/MegaFake_report.md     |  12 +-
 megaparse/unstructured.py                     | 118 ++++++++++--------
 notebooks/evaluate.ipynb                      |  61 +++++----
 notebooks/unstructured.ipynb                  |  71 +++++++++++
 5 files changed, 186 insertions(+), 88 deletions(-)
 create mode 100644 notebooks/unstructured.ipynb

diff --git a/README.md b/README.md
index 89555a7..22ac370 100644
--- a/README.md
+++ b/README.md
@@ -62,10 +62,14 @@ print(content)
 
 ## BenchMark
 
-**Diff megaparse unstructured:** 120
-**Diff llama parse:** 31
-**Diff megaparse llama:** 26
-
+<!---BENCHMARK-->
+| Parser | Diff |
+|---|---|
+| Megaparse with LLamaParse and GPTCleaner | 84 |
+| **Megaparse** | 100 |
+| Megaparse with LLamaParse | 104 |
+| LLama Parse | 108 |
+<!---END_BENCHMARK-->
 
 *Lower is better*
 
diff --git a/megaparse/tests/output_tests/MegaFake_report.md b/megaparse/tests/output_tests/MegaFake_report.md
index 3a696bb..64870ca 100644
--- a/megaparse/tests/output_tests/MegaFake_report.md
+++ b/megaparse/tests/output_tests/MegaFake_report.md
@@ -11,11 +11,11 @@ Mega Parse is a state-of-the-art document parser designed to convert various doc
 
 Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.
 
-**Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.
+Multiple Format Support: Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.
 
-**High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.
+High-Speed Processing: One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.
 
-**Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.
+Markdown Output: Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.
 
 Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements.
 
@@ -29,11 +29,11 @@ Error Handling: Advanced error handling capabilities ensure that any issues enco
 
 The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.
 
-**Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.
+Efficiency: By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.
 
-**Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.
+Versatility: Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.
 
-**Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but
+Enhanced Knowledge Management: Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but
 also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.
 
 Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.
diff --git a/megaparse/unstructured.py b/megaparse/unstructured.py
index e0a328a..a5cf8fb 100644
--- a/megaparse/unstructured.py
+++ b/megaparse/unstructured.py
@@ -12,69 +12,70 @@ def convert_to_markdown(self, elements):
         element_hierarchy = {}
 
         for el in elements:
-            element_type = el["type"]
-            text = el["text"]
-            metadata = el["metadata"]
-            parent_id = metadata.get("parent_id", None)
-            category_depth = metadata.get("category_depth", 0)
-
-            markdown_line = ""
-
-            if element_type == "Title":
-                if parent_id:
-                    markdown_line = f"## {text}\n\n"  # Adjusted to add sub headers if parent_id exists
-                else:
-                    markdown_line = f"# {text}\n\n"
-            elif element_type == "Subtitle":
-                markdown_line = f"## {text}\n\n"
-            elif element_type == "Header":
-                markdown_line = f"{'#' * (category_depth + 1)} {text}\n\n"
-            elif element_type == "Footer":
-                markdown_line = f"#### {text}\n\n"
-            elif element_type == "NarrativeText":
-                markdown_line = f"{text}\n\n"
-            elif element_type == "ListItem":
-                markdown_line = f"- {text}\n"
-            elif element_type == "Table":
-                markdown_line = el["metadata"]["text_as_html"]
-            elif element_type == "PageBreak":
-                markdown_line = f"---\n\n"
-            elif element_type == "Image":
-                markdown_line = f"![Image]({el['metadata'].get('image_path', '')})\n\n"
-            elif element_type == "Formula":
-                markdown_line = f"$$ {text} $$\n\n"
-            elif element_type == "FigureCaption":
-                markdown_line = f"**Figure:** {text}\n\n"
-            elif element_type == "Address":
-                markdown_line = f"**Address:** {text}\n\n"
-            elif element_type == "EmailAddress":
-                markdown_line = f"**Email:** {text}\n\n"
-            elif element_type == "CodeSnippet":
-                markdown_line = (
-                    f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n"
-                )
-            elif element_type == "PageNumber":
-                markdown_line = f"**Page {text}**\n\n"
-            else:
-                markdown_line = f"{text}\n\n"
-
-            markdown_content += markdown_line
+            markdown_content += self.get_markdown_line(el)
 
         return markdown_content
 
-    def convert(self, path):
-        # Partition the PDF
-        elements = partition_pdf(
+    def get_markdown_line(self, el):
+        element_type = el["type"]
+        text = el["text"]
+        metadata = el["metadata"]
+        parent_id = metadata.get("parent_id", None)
+        category_depth = metadata.get("category_depth", 0)
+        if "emphasized_text_contents" in metadata:
+            print(metadata["emphasized_text_contents"])
+
+        markdown_line = ""
+
+        if element_type == "Title":
+            if parent_id:
+                markdown_line = (
+                    f"## {text}\n\n"  # Adjusted to add sub headers if parent_id exists
+                )
+            else:
+                markdown_line = f"# {text}\n\n"
+        elif element_type == "Subtitle":
+            markdown_line = f"## {text}\n\n"
+        elif element_type == "Header":
+            markdown_line = f"{'#' * (category_depth + 1)} {text}\n\n"
+        elif element_type == "Footer":
+            markdown_line = f"#### {text}\n\n"
+        elif element_type == "NarrativeText":
+            markdown_line = f"{text}\n\n"
+        elif element_type == "ListItem":
+            markdown_line = f"- {text}\n"
+        elif element_type == "Table":
+            markdown_line = el["metadata"]["text_as_html"]
+        elif element_type == "PageBreak":
+            markdown_line = f"---\n\n"
+        elif element_type == "Image":
+            markdown_line = f"![Image]({el['metadata'].get('image_path', '')})\n\n"
+        elif element_type == "Formula":
+            markdown_line = f"$$ {text} $$\n\n"
+        elif element_type == "FigureCaption":
+            markdown_line = f"**Figure:** {text}\n\n"
+        elif element_type == "Address":
+            markdown_line = f"**Address:** {text}\n\n"
+        elif element_type == "EmailAddress":
+            markdown_line = f"**Email:** {text}\n\n"
+        elif element_type == "CodeSnippet":
+            markdown_line = f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n"
+        elif element_type == "PageNumber":
+            markdown_line = f"**Page {text}**\n\n"
+        else:
+            markdown_line = f"{text}\n\n"
+
+        return markdown_line
+
+    def partition_pdf_file(self, path):
+        return partition_pdf(
             filename=path, infer_table_structure=True, strategy="hi_res"
         )
 
-        # Convert elements to markdown
-        # Check if dict is a table, if so send it to openai using langchain for cleaning and improvements
-
+    def improve_table_elements(self, elements):
         llm = ChatOpenAI(model="gpt-4o")
 
         # Define the prompt
-
         messages = [
             (
                 "system",
@@ -93,6 +94,15 @@ def convert(self, path):
 
             improved_elements.append(el)
 
+        return improved_elements
+
+    def convert(self, path):
+        # Partition the PDF
+        elements = self.partition_pdf_file(path)
+
+        # Improve table elements
+        improved_elements = self.improve_table_elements(elements)
+
         elements_dict = [el.to_dict() for el in improved_elements]
         markdown_content = self.convert_to_markdown(elements_dict)
         return markdown_content
diff --git a/notebooks/evaluate.ipynb b/notebooks/evaluate.ipynb
index 1449a84..568918b 100644
--- a/notebooks/evaluate.ipynb
+++ b/notebooks/evaluate.ipynb
@@ -9,15 +9,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Started parsing the file under job_id 2216e572-99ce-4b37-bdb6-b2d8ff2e18d1\n",
-      "Started parsing the file under job_id 1c36b5f9-bc30-475f-901d-ea79f9425205\n"
+      "Started parsing the file under job_id e5e0367d-2f83-4e4d-84e5-4d5df7119516\n",
+      "Started parsing the file under job_id 0b5d66aa-bbab-454b-b256-82495d20f91f\n"
      ]
     },
     {
@@ -62,14 +62,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Started parsing the file under job_id bed037cd-cdb1-45d1-971b-dc99094650b2\n"
+      "Started parsing the file under job_id f78ee794-ffde-4e0a-938d-987f1b22cfcb\n"
      ]
     }
    ],
@@ -112,7 +112,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -124,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -141,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -184,23 +184,36 @@
    "cell_type": "code",
    "execution_count": 11,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Diff megaparse unstructured: 114\n",
-      "Diff megaparse llama: 26\n",
-      "Diff megaparse llama gptcleaner: 11\n",
-      "Diff llama parse: 31\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "print(f\"Diff megaparse unstructured: {diff_megaparse_unstructured}\")\n",
-    "print(f\"Diff megaparse llama: {diff_megaparse_llama}\")\n",
-    "print(f\"Diff megaparse llama gptcleaner: {diff_megaparse_llama_gptcleaner}\")\n",
-    "print(f\"Diff llama parse: {diff_llamaparse}\")\n"
+    "diff_results = {\n",
+    "    \"**Megaparse**\": diff_megaparse_unstructured,\n",
+    "    \"Megaparse with LLamaParse\": diff_megaparse_llama,\n",
+    "    \"Megaparse with LLamaParse and GPTCleaner\": diff_megaparse_llama_gptcleaner,\n",
+    "    \"LLama Parse\": diff_llamaparse\n",
+    "}\n",
+    "\n",
+    "# Sort the results\n",
+    "sorted_diff_results = sorted(diff_results.items(), key=lambda x: x[1])\n",
+    "\n",
+    "# Generate a table with the results\n",
+    "benchmark_results = \"| Parser | Diff |\\n|---|---|\\n\"\n",
+    "for parser, diff in sorted_diff_results:\n",
+    "    benchmark_results += f\"| {parser} | {diff} |\\n\"\n",
+    "\n",
+    "# Update README.md file\n",
+    "with open(\"../README.md\", \"r\") as readme_file:\n",
+    "    readme_content = readme_file.read()\n",
+    "\n",
+    "start_marker = \"<!---BENCHMARK-->\"\n",
+    "end_marker = \"<!---END_BENCHMARK-->\"\n",
+    "start_index = readme_content.find(start_marker) + len(start_marker)\n",
+    "end_index = readme_content.find(end_marker)\n",
+    "\n",
+    "updated_readme_content = readme_content[:start_index] + \"\\n\" + benchmark_results + readme_content[end_index:]\n",
+    "\n",
+    "with open(\"../README.md\", \"w\") as readme_file:\n",
+    "    readme_file.write(updated_readme_content)"
    ]
   }
  ],
diff --git a/notebooks/unstructured.ipynb b/notebooks/unstructured.ipynb
new file mode 100644
index 0000000..6174d33
--- /dev/null
+++ b/notebooks/unstructured.ipynb
@@ -0,0 +1,71 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Mega Parse"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n",
+      "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pathlib import Path\n",
+    "import sys\n",
+    "sys.path.append('..')\n",
+    "from megaparse.unstructured import UnstructuredParser\n",
+    "import os \n",
+    "\n",
+    "unstructured = UnstructuredParser()\n",
+    "file_partitioned = unstructured.partition_pdf_file('../megaparse/tests/input_tests/MegaFake_report.pdf')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unstructured2 = UnstructuredParser()\n",
+    "\n",
+    "\n",
+    "elements_dict = [el.to_dict() for el in file_partitioned]\n",
+    "markdown_content = unstructured2.convert_to_markdown(elements_dict)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "QuivrParse-DS8JDGq8",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}