lbrealdev · lbrealdev · Jun 20, 2024 · Jun 21, 2024 · Jun 21, 2024 · Jun 21, 2024
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,13 @@ venv/
 
 # Output PDF path directory
 _output/
+
+# Ignore the downloaded pandoc installer package
+*.deb
+
+# images
+*.jpg
+*.jpeg
+*.png
+*.xml
+*.shtml
diff --git a/README.md b/README.md
@@ -12,11 +12,25 @@ Setup python virtualenv using yen:
 yen create -p 3.12 venv
 ```
 
-Pandoc source links:
+### Pandoc source links
 
 - [Pandoc](https://pandoc.org/)
 - [Try Pandoc](https://pandoc.org/try/)
 - [Pandoc Repository](https://github.com/jgm/pandoc)
 - [Pandoc Images](https://github.com/pandoc/dockerfiles)
 - [Pandoc GitHub Action](https://github.com/pandoc/pandoc-action-example)
+- [Pandoc Exit Codes](https://pandoc.org/MANUAL.html#exit-codes)
 
+### Pandoc related tools
+
+- [typst](https://github.com/typst/typst)
+- [mdtopdf](https://github.com/mandolyte/mdtopdf)
+- [quarto](https://github.com/quarto-dev/quarto)
+- [mdp](https://github.com/visit1985/mdp)
+- [revealjs](https://github.com/hakimel/reveal.js)
+- [showman](https://github.com/ntjess/showman)
+- [mermaid-cli](https://github.com/mermaid-js/mermaid-cli)
+
+### Pandoc blogs
+
+- https://learnbyexample.github.io/customizing-pandoc/
diff --git a/converter.py b/converter.py
@@ -1,22 +1,126 @@
 #!/usr/bin/env python
 
+import os
 import sys
+import shutil
 from pathlib import Path
 import pypandoc
+import re
 
 
 if len(sys.argv) < 2:
     print("Usage: python converter.py <path-to-markdown>")
     sys.exit(1)
 
+
+def install_pandoc():
+    """Download and install pandoc when no ``pandoc`` executable is on PATH.
+
+    Returns immediately if ``shutil.which`` locates a pandoc binary.
+    Otherwise pypandoc's ``download_pandoc`` is called with ``usr/bin``
+    (under the filesystem root) as the target folder and ``tmp`` (under the
+    filesystem root) as the download folder.
+    """
+    if shutil.which("pandoc"):
+        return
+
+    # Imported lazily: the downloader is only needed when pandoc is missing.
+    from pypandoc.pandoc_download import download_pandoc
+
+    print("Pandoc binary was not found!")
+
+    # Point pypandoc at the install location, unless PYPANDOC_PANDOC was
+    # already set in the environment.
+    if "PYPANDOC_PANDOC" not in os.environ:
+        os.environ["PYPANDOC_PANDOC"] = "/usr/bin/pandoc"
+
+    # Installation and download directories for the pandoc binary.
+    install_dir = Path(os.sep) / "usr" / "bin"
+    download_dir = Path(os.sep) / "tmp"
+
+    print(f"Downloading pandoc to {download_dir} and installing it in {install_dir} ...\n")
+    download_pandoc(targetfolder=str(install_dir), download_folder=str(download_dir))
+
+
+def convert_pandoc(input, output, auth: bool = False):
+    """Convert one markdown file to PDF through pypandoc.
+
+    Args:
+        input: Path of the markdown file to convert (passed through ``str``).
+        output: Path of the PDF file to write (passed through ``str``).
+        auth: When true, add an ``Authorization: token ...`` request header
+            built from the module-level ``GITHUB_AUTH_TOKEN``.
+    """
+    # NOTE(review): pandoc presumably sends this header with every HTTP
+    # request it makes, not only requests to GitHub - confirm.
+    auth_args = (
+        [f"--request-header=Authorization: token {GITHUB_AUTH_TOKEN}"] if auth else []
+    )
+
+    pandoc_args = [
+        "--pdf-engine=pdflatex",
+        "--from=markdown-implicit_figures+rebase_relative_paths",
+        "--extract-media=.",
+        *auth_args,
+    ]
+
+    pypandoc.convert_file(
+        str(input),
+        "pdf",
+        outputfile=str(output),
+        extra_args=pandoc_args,
+    )
+
+
+def search_markdown(path):
+    """Comment out remote ``/asset/`` image references in markdown files.
+
+    Every ``*.md`` file under ``path`` is scanned. A line that starts with a
+    markdown image/link element whose ``https://`` URL ends in an image
+    extension and contains ``/asset/`` is replaced by a numbered markdown
+    comment of the form ``[commentNNN]: <> (<element>)``. Numbering restarts
+    at 001 in every file.
+
+    A file that already contains such a comment is left untouched, so
+    running the function a second time does not wrap comments again.
+    A status message is printed when files were edited and/or skipped.
+
+    Args:
+        path: Directory (``pathlib.Path``) that is searched recursively.
+    """
+    image_url_re = re.compile(
+        r"^[!]?\[.*?\]\((https:\/\/[^\)]+\.(?:png|jpg|jpeg|gif|bmp|svg))\)"
+    )
+    asset_re = re.compile(r"/asset/")
+    comment_re = re.compile(r"\[.*?\]: <> \(!\[.*?\]\(.*?/asset/.*?\)\)")
+
+    md_pattern_found = False
+    md_with_comment = False
+
+    for file in path.rglob("*.md"):
+        with file.open("r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        # Skip only this file (not the remaining ones) when it was already
+        # processed: the image pattern also matches the comment lines
+        # themselves and would wrap them a second time.
+        if any(comment_re.search(line) for line in lines):
+            md_with_comment = True
+            continue
+
+        new_lines = []
+        count = 1
+
+        for line in lines:
+            match = image_url_re.match(line)
+            if match is None or not asset_re.search(match.group(0)):
+                new_lines.append(line)
+                continue
+
+            # Build the comment directly; passing the matched text to
+            # re.sub as a replacement template would interpret backslashes.
+            new_lines.append(f"[comment{count:03}]: <> ({match.group(0)})\n")
+
+            # Keep any text that followed the element instead of dropping it.
+            rest = line[match.end():]
+            if rest.strip():
+                new_lines.append(rest.lstrip())
+
+            count += 1
+
+        # Write once per file, and only when something actually changed.
+        if count > 1:
+            md_pattern_found = True
+            with file.open("w", encoding="utf-8") as f:
+                f.writelines(new_lines)
+
+    if md_pattern_found:
+        print("Markdown image URL element found, editing file to convert!")
+        print()
+    if md_with_comment:
+        print("The markdown file already has comments in asset references, nothing to do!")
+        print()
+
+# Set the GITHUB_TOKEN environment variable to authenticate with GitHub.
+# It must be set if the markdown files you want to convert to PDF
+# reference images by URL that live in a private repository.
+GITHUB_AUTH_TOKEN = os.getenv("GITHUB_TOKEN")
+GITHUB_AUTH = bool(GITHUB_AUTH_TOKEN)
+
+# Path to the directory containing the markdown files, and the
+# '_output' subdirectory created inside it for the converted PDF files.
 SOURCE_PATH_TO_MD = Path(sys.argv[1])
 DESTINATION_PATH_TO_PDF = Path(SOURCE_PATH_TO_MD) / "_output"
 
+
 if not SOURCE_PATH_TO_MD.exists() or not SOURCE_PATH_TO_MD.is_dir():
     print("Invalid path to markdown directory!")
     sys.exit(1)
 
 markdown_files = list(SOURCE_PATH_TO_MD.rglob("*.md"))
+source_md_files = []
+output_pdf_files = []
 
 if not markdown_files:
     print(f"No markdown files found in {SOURCE_PATH_TO_MD}")
@@ -25,12 +129,14 @@
 if not DESTINATION_PATH_TO_PDF.exists():
     DESTINATION_PATH_TO_PDF.mkdir()
 
-source_md_files = []
-output_pdf_files = []
+install_pandoc()
+search_markdown(SOURCE_PATH_TO_MD)
 
-print("Converter from markdown to PDF\n")
-print(f"Input directory: {SOURCE_PATH_TO_MD.absolute()}")
-print(f"Output directory: {DESTINATION_PATH_TO_PDF.absolute()}\n")
+print(f"Markdown input directory: {SOURCE_PATH_TO_MD.absolute()}")
+print(f"PDF output directory: {DESTINATION_PATH_TO_PDF.absolute()}")
+print(f"Markdown files found: {len(markdown_files)}")
+print()
+print("Converting markdown files to PDF ...\n")
 for markdown_file in markdown_files:
     relative_path = markdown_file.relative_to(SOURCE_PATH_TO_MD)
     pdf_output_path = DESTINATION_PATH_TO_PDF / relative_path.with_suffix(".pdf")
@@ -41,17 +147,9 @@
     output_pdf_files.append(pdf_output_path)
 
     try:
-        pypandoc.convert_file(
-            str(markdown_file),
-            "pdf",
-            outputfile=str(pdf_output_path),
-            extra_args=[
-                "--pdf-engine=pdflatex",
-                "--from=markdown+rebase_relative_paths",
-            ],
-        )
+        convert_pandoc(input=markdown_file, output=pdf_output_path, auth=GITHUB_AUTH)
     except Exception as e:
-        print(f"Error converting {markdown_file}: {e}")
+        print(f"Error converting {markdown_file.absolute()}: {e}")
         sys.exit(1)
 
 print("Source markdown files:")

diff --git a/test/img/XenPanda.jpg b/test/img/XenPanda.jpg
diff --git a/test/second/README.md b/test/second/README.md
@@ -1 +1,32 @@
 # Test Python Pandoc
+
+
+### LOCAL IMAGE
+
+![image-from-local](../img/XenPanda.jpg)
+
+### IMAGE FROM PRIVATE REPOSITORY PNG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux.png)
+
+### IMAGE FROM PRIVATE REPOSITORY PNG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux-tux.png)
+
+### IMAGE FROM PRIVATE REPOSITORY JPG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux-tux-minimalism.jpg)
+
+### IMAGE FROM PRIVATE REPOSITORY JPEG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux-tux-cosmos.jpeg)
+
+
+### TEST IMAGE FROM ASSET
+
+[comment001]: <> (![linux-png-image](https://github.com/repo/images/main/asset/linux-tux-cosmos.jpeg))
+
+test comment
+
+[comment002]: <> (![linux-png-image](https://github.com/repo/images/main/asset/linux-tux-cosmos.jpeg))
+