diff --git a/.gitignore b/.gitignore
index 39b7120..eefb26c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,13 @@ venv/
 
 # Output PDF path directory
 _output/
+
+# Ignore pandoc download binary
+*.deb
+
+# images
+*.jpg
+*.jpeg
+*.png
+*.xml
+*.shtml
diff --git a/README.md b/README.md
index 25a4b9d..b934e9b 100644
--- a/README.md
+++ b/README.md
@@ -12,11 +12,25 @@ Setup python virtualenv using yen:
 yen create -p 3.12 venv
 ```
 
-Pandoc source links:
+### Pandoc source links
 
 - [Pandoc](https://pandoc.org/)
 - [Try Pandoc](https://pandoc.org/try/)
 - [Pandoc Repository](https://github.com/jgm/pandoc)
 - [Pandoc Images](https://github.com/pandoc/dockerfiles)
 - [Pandoc GitHub Action](https://github.com/pandoc/pandoc-action-example)
+- [Pandoc Exit Codes](https://pandoc.org/MANUAL.html#exit-codes)
 
+### Pandoc related tools
+
+- [typst](https://github.com/typst/typst)
+- [mdtopdf](https://github.com/mandolyte/mdtopdf)
+- [quarto](https://github.com/quarto-dev/quarto)
+- [mdp](https://github.com/visit1985/mdp)
+- [revealjs](https://github.com/hakimel/reveal.js)
+- [showman](https://github.com/ntjess/showman)
+- [mermaid-cli](https://github.com/mermaid-js/mermaid-cli)
+
+### Pandoc blogs
+
+- https://learnbyexample.github.io/customizing-pandoc/
diff --git a/converter.py b/converter.py
index c638c50..0085f2a 100644
--- a/converter.py
+++ b/converter.py
@@ -1,22 +1,144 @@
 #!/usr/bin/env python
 
+import os
 import sys
+import shutil
 from pathlib import Path
 import pypandoc
+import re
 
 if len(sys.argv) < 2:
     print("Usage: python converter.py ")
     sys.exit(1)
 
+
+# install_pandoc function
+def install_pandoc():
+    """Install pandoc when no ``pandoc`` executable is found on PATH.
+
+    Does nothing when ``shutil.which("pandoc")`` finds a binary. Otherwise
+    PYPANDOC_PANDOC defaults to ``/usr/bin/pandoc`` and ``download_pandoc``
+    is called with ``/tmp`` as download folder and ``/usr/bin`` as target.
+    """
+    pandoc_bin = shutil.which("pandoc")
+    if not pandoc_bin:
+        from pypandoc.pandoc_download import download_pandoc
+
+        print("Pandoc binary was not found!")
+
+        # Set the environment variable PYPANDOC_PANDOC
+        # to the only location where pandoc will be searched
+        os.environ.setdefault("PYPANDOC_PANDOC", "/usr/bin/pandoc")
+
+        # Pandoc binary installation and download directory.
+        bin_dir = Path(os.sep, "usr", "bin")
+        tmp_dir = Path(os.sep, "tmp")
+
+        print(f"Downloading pandoc to {tmp_dir} and installing it in {bin_dir} ...\n")
+        download_pandoc(targetfolder=str(bin_dir), download_folder=str(tmp_dir))
+
+
+# convert_pandoc function
+def convert_pandoc(input, output, auth: bool = False):
+    """Convert one markdown file to PDF through pandoc and pdflatex.
+
+    ``input`` is the markdown source and ``output`` the PDF destination;
+    both are passed to pandoc as ``str``. When ``auth`` is true, an
+    ``Authorization: token`` request header built from GITHUB_AUTH_TOKEN
+    is added to the pandoc arguments.
+    """
+    markdown = str(input)
+    pdf = str(output)
+
+    args = [
+        "--pdf-engine=pdflatex",
+        "--from=markdown-implicit_figures+rebase_relative_paths",
+        "--extract-media=.",
+    ]
+
+    if auth:
+        # NOTE(review): pandoc presumably sends this header with every HTTP
+        # request it makes, not only to GitHub -- confirm before converting
+        # documents that embed images from untrusted hosts.
+        args.append(f"--request-header=Authorization: token {GITHUB_AUTH_TOKEN}")
+
+    pypandoc.convert_file(markdown, "pdf", outputfile=pdf, extra_args=args)
+
+
+def search_markdown(path):
+    """Turn remote ``/asset/`` image lines of markdown files into comments.
+
+    Every ``*.md`` file below ``path`` is scanned. A line starting with a
+    link or image element whose ``https://`` URL ends in an image extension
+    and contains ``/asset/`` is replaced by ``[commentNNN]: <> (element)``,
+    numbered per file. A file that already contains such a comment is
+    skipped, and a file is rewritten only when a line was replaced.
+    """
+    md_element_image_url_pattern = re.compile(
+        r"^[!]?\[.*?\]\((https:\/\/[^\)]+\.(?:png|jpg|jpeg|gif|bmp|svg))\)"
+    )
+    md_url_asset_pattern = "/asset/"
+    md_comment_pattern = re.compile(r"\[.*?\]: <> \(!\[.*?\]\(.*?/asset/.*?\)\)")
+
+    md_pattern_found = False
+    md_with_comment = False
+
+    for file in path.rglob("*.md"):
+        with file.open("r", encoding="utf-8") as f:
+            lines = f.readlines()
+
+        # An already commented file is skipped on its own; the scan goes on
+        # with the remaining files instead of stopping at the first one.
+        if any(md_comment_pattern.search(line) for line in lines):
+            md_with_comment = True
+            continue
+
+        new_lines = []
+        count = 1
+
+        for line in lines:
+            match_pattern = md_element_image_url_pattern.match(line)
+            asset = match_pattern.group(0) if match_pattern else ""
+            if md_url_asset_pattern in asset:
+                # Build the comment directly: feeding the element through
+                # re.sub() as a replacement template fails on backslashes.
+                new_lines.append(f"[comment{count:03}]: <> ({asset})\n")
+                count += 1
+            else:
+                new_lines.append(line)
+
+        if count > 1:
+            md_pattern_found = True
+            with file.open("w", encoding="utf-8") as f:
+                f.writelines(new_lines)
+
+    if md_pattern_found:
+        print("Markdown image URL element found, editing file to convert!")
+        print()
+    elif md_with_comment:
+        print("The markdown file already has comments in asset references, nothing to do!")
+        print()
 
+
+# Set the environment variable GITHUB_TOKEN to authenticate against GitHub.
+# It must be set when the markdown files to convert reference images
+# stored in a private repository.
+GITHUB_AUTH_TOKEN = os.getenv("GITHUB_TOKEN")
+GITHUB_AUTH = bool(GITHUB_AUTH_TOKEN)
+
+# Directory holding the markdown sources, and the '_output' directory
+# created inside it for the converted PDF files.
 SOURCE_PATH_TO_MD = Path(sys.argv[1])
 DESTINATION_PATH_TO_PDF = Path(SOURCE_PATH_TO_MD) / "_output"
+
 
 if not SOURCE_PATH_TO_MD.exists() or not SOURCE_PATH_TO_MD.is_dir():
     print("Invalid path to markdown directory!")
     sys.exit(1)
 
 markdown_files = list(SOURCE_PATH_TO_MD.rglob("*.md"))
+source_md_files = []
+output_pdf_files = []
 
 if not markdown_files:
     print(f"No markdown files found in {SOURCE_PATH_TO_MD}")
@@ -25,12 +147,14 @@
 if not DESTINATION_PATH_TO_PDF.exists():
     DESTINATION_PATH_TO_PDF.mkdir()
 
-source_md_files = []
-output_pdf_files = []
+install_pandoc()
+search_markdown(SOURCE_PATH_TO_MD)
 
-print("Converter from markdown to PDF\n")
-print(f"Input directory: {SOURCE_PATH_TO_MD.absolute()}")
-print(f"Output directory: {DESTINATION_PATH_TO_PDF.absolute()}\n")
+print(f"Markdown input directory: {SOURCE_PATH_TO_MD.absolute()}")
+print(f"PDF output directory: {DESTINATION_PATH_TO_PDF.absolute()}")
+print(f"Markdown files found: {len(markdown_files)}")
+print()
+print("Converting markdown files to PDF ...\n")
 for markdown_file in markdown_files:
     relative_path = markdown_file.relative_to(SOURCE_PATH_TO_MD)
     pdf_output_path = DESTINATION_PATH_TO_PDF / relative_path.with_suffix(".pdf")
@@ -41,17 +165,9 @@
     output_pdf_files.append(pdf_output_path)
 
     try:
-        pypandoc.convert_file(
-            str(markdown_file),
-            "pdf",
-            outputfile=str(pdf_output_path),
-            extra_args=[
-                "--pdf-engine=pdflatex",
-                "--from=markdown+rebase_relative_paths",
-            ],
-        )
+        convert_pandoc(input=markdown_file, output=pdf_output_path, auth=GITHUB_AUTH)
     except Exception as e:
-        print(f"Error converting {markdown_file}: {e}")
+        print(f"Error converting {markdown_file.absolute()}: {e}")
         sys.exit(1)
 
 print("Source markdown files:")
diff --git a/test/img/XenPanda.jpg b/test/img/XenPanda.jpg
new file mode 100644
index 0000000..a2e5201
Binary files /dev/null and b/test/img/XenPanda.jpg differ
diff --git a/test/second/README.md b/test/second/README.md
index cc103d9..de72072 100644
--- a/test/second/README.md
+++ b/test/second/README.md
@@ -1 +1,32 @@
 # Test Python Pandoc
+
+
+### LOCAL IMAGE
+
+![image-from-local](../img/XenPanda.jpg)
+
+### IMAGE FROM PRIVATE REPOSITORY PNG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux.png)
+
+### IMAGE FROM PRIVATE REPOSITORY PNG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux-tux.png)
+
+### IMAGE FROM PRIVATE REPOSITORY JPG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux-tux-minimalism.jpg)
+
+### IMAGE FROM PRIVATE REPOSITORY JPEG
+
+![linux-png-image](https://raw.githubusercontent.com/lbrealdev/private-images/main/linux-tux-cosmos.jpeg)
+
+
+### TEST IMAGE FROM ASSET
+
+[comment001]: <> (![linux-png-image](https://github.com/repo/images/main/asset/linux-tux-cosmos.jpeg))
+
+test comment
+
+[comment002]: <> (![linux-png-image](https://github.com/repo/images/main/asset/linux-tux-cosmos.jpeg))
+