From a00df7d070ead3b3fa377deb2151e0cf042deef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Mit=C3=A1=C5=A1?= Date: Sun, 25 Feb 2024 15:12:25 +0100 Subject: [PATCH 1/3] Introduce MD_PARSER_v2. This version of MD_PARSER changes prototypes of callbacks enter_block(), leave_block(), enter_span(), leave_span() and text() so that their 1st argument is int instead of enumerations MD_BLOCKTYPE, MD_SPANTYPE and MD_TEXTTYPE respectively. The purpose of this is to enable introduction of new block, span and text types at runtime in upcoming patches. --- src/md4c.c | 26 ++++++++++++++++++-------- src/md4c.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/src/md4c.c b/src/md4c.c index 054c559e..c8b0000e 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -168,7 +168,7 @@ struct MD_CTX_tag { /* Immutable stuff (parameters of md_parse()). */ const CHAR* text; SZ size; - MD_PARSER parser; + MD_PARSER_v2 parser; void* userdata; /* When this is true, it allows some optimizations. */ @@ -6446,20 +6446,30 @@ int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata) { MD_CTX ctx; + size_t parser_size; int i; int ret; - if(parser->abi_version != 0) { - if(parser->debug_log != NULL) - parser->debug_log("Unsupported abi_version.", userdata); - return -1; - } - /* Setup context structure. 
*/ memset(&ctx, 0, sizeof(MD_CTX)); + switch(parser->abi_version) { + case 0: + case 1: + parser_size = sizeof(MD_PARSER_v1); + break; + case 2: + parser_size = sizeof(MD_PARSER_v2); + break; + default: + if(parser->debug_log != NULL) + parser->debug_log("Unsupported abi_version.", userdata); + return -1; + } + memcpy(&ctx.parser, parser, parser_size); + memset((uint8_t*)&ctx.parser + parser_size, 0, sizeof(ctx.parser) - parser_size); + ctx.text = text; ctx.size = size; - memcpy(&ctx.parser, parser, sizeof(MD_PARSER)); ctx.userdata = userdata; ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4; md_build_mark_char_map(&ctx); diff --git a/src/md4c.h b/src/md4c.h index 8d6be1cb..5abdb2d3 100644 --- a/src/md4c.h +++ b/src/md4c.h @@ -336,8 +336,8 @@ typedef struct MD_SPAN_WIKILINK { /* Parser structure. */ -typedef struct MD_PARSER { - /* Reserved. Set to zero. +typedef struct MD_PARSER_v1 { + /* Set to one (for compatibility, zero is also accepted). */ unsigned abi_version; @@ -380,11 +380,59 @@ typedef struct MD_PARSER { /* Reserved. Set to NULL. */ void (*syntax)(void); -} MD_PARSER; +} MD_PARSER_v1; + + +typedef struct MD_PARSER_v2 { + /* Set to 2. + */ + unsigned abi_version; + + /* Dialect options. Bitmask of MD_FLAG_xxxx values. + */ + unsigned flags; + + /* Caller-provided rendering callbacks. + * + * For some block/span types, more detailed information is provided in a + * type-specific structure pointed by the argument 'detail'. + * + * The last argument of all callbacks, 'userdata', is just propagated from + * md_parse() and is available for any use by the application. + * + * Note any strings provided to the callbacks as their arguments or as + * members of any detail structure are generally not zero-terminated. + * Application has to take the respective size information into account. + * + * Any rendering callback may abort further parsing of the document by + * returning non-zero. 
+ */ + int (*enter_block)(int /*type*/, void* /*detail*/, void* /*userdata*/); + int (*leave_block)(int /*type*/, void* /*detail*/, void* /*userdata*/); + + int (*enter_span)(int /*type*/, void* /*detail*/, void* /*userdata*/); + int (*leave_span)(int /*type*/, void* /*detail*/, void* /*userdata*/); + + int (*text)(int /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/); + + /* Debug callback. Optional (may be NULL). + * + * If provided and something goes wrong, this function gets called. + * This is intended for debugging and problem diagnosis for developers; + * it is not intended to provide any errors suitable for displaying to an + * end user. + */ + void (*debug_log)(const char* /*msg*/, void* /*userdata*/); + + /* Reserved. Set to NULL. + */ + void (*syntax)(void); +} MD_PARSER_v2; /* For backward compatibility. Do not use in new code. */ +typedef MD_PARSER_v1 MD_PARSER; typedef MD_PARSER MD_RENDERER; From ca4dd29e4f870e172f82bae9d9003af8d7926f62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Mit=C3=A1=C5=A1?= Date: Sun, 25 Feb 2024 15:27:53 +0100 Subject: [PATCH 2/3] Implement flag MD_FLAG_SKIPBOM on parser side. Also update md2html to enable this on the parser side rather than renderer side. Make MD_HTML_FLAG_SKIP_UTF8_BOM deprecated. md_html() converts the flag into parser_flags. --- md2html/md2html.c | 8 ++------ src/md4c-html.c | 11 +++-------- src/md4c-html.h | 2 +- src/md4c.c | 15 +++++++++++++++ src/md4c.h | 32 +++++++++++++++++--------------- 5 files changed, 38 insertions(+), 30 deletions(-) diff --git a/md2html/md2html.c b/md2html/md2html.c index de4ab18e..25d51a72 100644 --- a/md2html/md2html.c +++ b/md2html/md2html.c @@ -34,12 +34,8 @@ /* Global options. 
*/ -static unsigned parser_flags = 0; -#ifndef MD4C_USE_ASCII - static unsigned renderer_flags = MD_HTML_FLAG_DEBUG | MD_HTML_FLAG_SKIP_UTF8_BOM; -#else - static unsigned renderer_flags = MD_HTML_FLAG_DEBUG; -#endif +static unsigned parser_flags = MD_FLAG_SKIPBOM; +static unsigned renderer_flags = MD_HTML_FLAG_DEBUG; static int want_fullhtml = 0; static int want_xhtml = 0; static int want_stat = 0; diff --git a/src/md4c-html.c b/src/md4c-html.c index 5229de54..4dbba9aa 100644 --- a/src/md4c-html.c +++ b/src/md4c-html.c @@ -553,14 +553,9 @@ md_html(const MD_CHAR* input, MD_SIZE input_size, render.escape_map[i] |= NEED_URL_ESC_FLAG; } - /* Consider skipping UTF-8 byte order mark (BOM). */ - if(renderer_flags & MD_HTML_FLAG_SKIP_UTF8_BOM && sizeof(MD_CHAR) == 1) { - static const MD_CHAR bom[3] = { (char)0xef, (char)0xbb, (char)0xbf }; - if(input_size >= sizeof(bom) && memcmp(input, bom, sizeof(bom)) == 0) { - input += sizeof(bom); - input_size -= sizeof(bom); - } - } + /* For compatibility with old apps. */ + if(renderer_flags & MD_HTML_FLAG_SKIP_UTF8_BOM) + parser.flags |= MD_FLAG_SKIPBOM; return md_parse(input, input_size, &parser, (void*) &render); } diff --git a/src/md4c-html.h b/src/md4c-html.h index 324211da..15adcb1b 100644 --- a/src/md4c-html.h +++ b/src/md4c-html.h @@ -36,7 +36,7 @@ /* If set, debug output from md_parse() is sent to stderr. */ #define MD_HTML_FLAG_DEBUG 0x0001 #define MD_HTML_FLAG_VERBATIM_ENTITIES 0x0002 -#define MD_HTML_FLAG_SKIP_UTF8_BOM 0x0004 +#define MD_HTML_FLAG_SKIP_UTF8_BOM 0x0004 /* Deprecated; use MD_FLAG_SKIPBOM on the parser side in new code. 
*/ #define MD_HTML_FLAG_XHTML 0x0008 diff --git a/src/md4c.c b/src/md4c.c index c8b0000e..ce5117a6 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -6468,6 +6468,21 @@ md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userd memcpy(&ctx.parser, parser, parser_size); memset((uint8_t*)&ctx.parser + parser_size, 0, sizeof(ctx.parser) - parser_size); +#if defined MD4C_USE_UTF8 || defined MD4C_USE_UTF16 + if(parser->flags & MD_FLAG_SKIPBOM) { +#ifdef MD4C_USE_UTF8 + static const MD_CHAR bom[3] = { (char)0xef, (char)0xbb, (char)0xbf }; +#endif +#ifdef MD4C_USE_UTF16 + static const MD_CHAR bom[1] = { (WCHAR)0xfeff }; +#endif + if(size >= SIZEOF_ARRAY(bom) && memcmp(text, bom, sizeof(bom)) == 0) { + text += SIZEOF_ARRAY(bom); + size -= SIZEOF_ARRAY(bom); + } + } +#endif + ctx.text = text; ctx.size = size; ctx.userdata = userdata; diff --git a/src/md4c.h b/src/md4c.h index 5abdb2d3..263471cc 100644 --- a/src/md4c.h +++ b/src/md4c.h @@ -303,21 +303,23 @@ typedef struct MD_SPAN_WIKILINK { * By default (when MD_PARSER::flags == 0), we follow CommonMark specification. * The following flags may allow some extensions or deviations from it. */ -#define MD_FLAG_COLLAPSEWHITESPACE 0x0001 /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */ -#define MD_FLAG_PERMISSIVEATXHEADERS 0x0002 /* Do not require space in ATX headers ( ###header ) */ -#define MD_FLAG_PERMISSIVEURLAUTOLINKS 0x0004 /* Recognize URLs as autolinks even without '<', '>' */ -#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS 0x0008 /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */ -#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x0010 /* Disable indented code blocks. (Only fenced code works.) */ -#define MD_FLAG_NOHTMLBLOCKS 0x0020 /* Disable raw HTML blocks. */ -#define MD_FLAG_NOHTMLSPANS 0x0040 /* Disable raw HTML (inline). */ -#define MD_FLAG_TABLES 0x0100 /* Enable tables extension. */ -#define MD_FLAG_STRIKETHROUGH 0x0200 /* Enable strikethrough extension. 
*/ -#define MD_FLAG_PERMISSIVEWWWAUTOLINKS 0x0400 /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */ -#define MD_FLAG_TASKLISTS 0x0800 /* Enable task list extension. */ -#define MD_FLAG_LATEXMATHSPANS 0x1000 /* Enable $ and $$ containing LaTeX equations. */ -#define MD_FLAG_WIKILINKS 0x2000 /* Enable wiki links extension. */ -#define MD_FLAG_UNDERLINE 0x4000 /* Enable underline extension (and disables '_' for normal emphasis). */ -#define MD_FLAG_HARD_SOFT_BREAKS 0x8000 /* Force all soft breaks to act as hard breaks. */ +#define MD_FLAG_COLLAPSEWHITESPACE 0x00000001 /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */ +#define MD_FLAG_PERMISSIVEATXHEADERS 0x00000002 /* Do not require space in ATX headers ( ###header ) */ +#define MD_FLAG_PERMISSIVEURLAUTOLINKS 0x00000004 /* Recognize URLs as autolinks even without '<', '>' */ +#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS 0x00000008 /* Recognize e-mails as autolinks even without '<', '>' and 'mailto:' */ +#define MD_FLAG_NOINDENTEDCODEBLOCKS 0x00000010 /* Disable indented code blocks. (Only fenced code works.) */ +#define MD_FLAG_NOHTMLBLOCKS 0x00000020 /* Disable raw HTML blocks. */ +#define MD_FLAG_NOHTMLSPANS 0x00000040 /* Disable raw HTML (inline). */ +#define MD_FLAG_TABLES 0x00000100 /* Enable tables extension. */ +#define MD_FLAG_STRIKETHROUGH 0x00000200 /* Enable strikethrough extension. */ +#define MD_FLAG_PERMISSIVEWWWAUTOLINKS 0x00000400 /* Enable WWW autolinks (even without any scheme prefix, if they begin with 'www.') */ +#define MD_FLAG_TASKLISTS 0x00000800 /* Enable task list extension. */ +#define MD_FLAG_LATEXMATHSPANS 0x00001000 /* Enable $ and $$ containing LaTeX equations. */ +#define MD_FLAG_WIKILINKS 0x00002000 /* Enable wiki links extension. */ +#define MD_FLAG_UNDERLINE 0x00004000 /* Enable underline extension (and disables '_' for normal emphasis). 
*/ +#define MD_FLAG_HARD_SOFT_BREAKS 0x00008000 /* Force all soft breaks to act as hard breaks. */ +#define MD_FLAG_SKIPBOM 0x00010000 /* Skip Unicode BOM, if present. */ + #define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS) #define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS) From 3e93dd5e06fe47ce452df387c5948c0384fcbe43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Mit=C3=A1=C5=A1?= Date: Sun, 25 Feb 2024 15:39:28 +0100 Subject: [PATCH 3/3] Implement customizable HTML renderer. --- src/md4c-html.c | 297 ++++++++++++++++++++++++++---------------------- src/md4c-html.h | 79 ++++++++++++- 2 files changed, 241 insertions(+), 135 deletions(-) diff --git a/src/md4c-html.c b/src/md4c-html.c index 4dbba9aa..8c944a65 100644 --- a/src/md4c-html.c +++ b/src/md4c-html.c @@ -47,8 +47,8 @@ -typedef struct MD_HTML_tag MD_HTML; -struct MD_HTML_tag { +typedef struct MD_HTML MD_HTML; +struct MD_HTML { void (*process_output)(const MD_CHAR*, MD_SIZE, void*); void* userdata; unsigned flags; @@ -70,89 +70,11 @@ struct MD_HTML_tag { #define ISALNUM(ch) (ISLOWER(ch) || ISUPPER(ch) || ISDIGIT(ch)) -static inline void -render_verbatim(MD_HTML* r, const MD_CHAR* text, MD_SIZE size) -{ - r->process_output(text, size, r->userdata); -} - /* Keep this as a macro. Most compiler should then be smart enough to replace * the strlen() call with a compile-time constant if the string is a C literal. */ #define RENDER_VERBATIM(r, verbatim) \ - render_verbatim((r), (verbatim), (MD_SIZE) (strlen(verbatim))) - - -static void -render_html_escaped(MD_HTML* r, const MD_CHAR* data, MD_SIZE size) -{ - MD_OFFSET beg = 0; - MD_OFFSET off = 0; - - /* Some characters need to be escaped in normal HTML text. */ - #define NEED_HTML_ESC(ch) (r->escape_map[(unsigned char)(ch)] & NEED_HTML_ESC_FLAG) - - while(1) { - /* Optimization: Use some loop unrolling. 
*/ - while(off + 3 < size && !NEED_HTML_ESC(data[off+0]) && !NEED_HTML_ESC(data[off+1]) - && !NEED_HTML_ESC(data[off+2]) && !NEED_HTML_ESC(data[off+3])) - off += 4; - while(off < size && !NEED_HTML_ESC(data[off])) - off++; - - if(off > beg) - render_verbatim(r, data + beg, off - beg); - - if(off < size) { - switch(data[off]) { - case '&': RENDER_VERBATIM(r, "&"); break; - case '<': RENDER_VERBATIM(r, "<"); break; - case '>': RENDER_VERBATIM(r, ">"); break; - case '"': RENDER_VERBATIM(r, """); break; - } - off++; - } else { - break; - } - beg = off; - } -} - -static void -render_url_escaped(MD_HTML* r, const MD_CHAR* data, MD_SIZE size) -{ - static const MD_CHAR hex_chars[] = "0123456789ABCDEF"; - MD_OFFSET beg = 0; - MD_OFFSET off = 0; - - /* Some characters need to be escaped in URL attributes. */ - #define NEED_URL_ESC(ch) (r->escape_map[(unsigned char)(ch)] & NEED_URL_ESC_FLAG) - - while(1) { - while(off < size && !NEED_URL_ESC(data[off])) - off++; - if(off > beg) - render_verbatim(r, data + beg, off - beg); - - if(off < size) { - char hex[3]; - - switch(data[off]) { - case '&': RENDER_VERBATIM(r, "&"); break; - default: - hex[0] = '%'; - hex[1] = hex_chars[((unsigned)data[off] >> 4) & 0xf]; - hex[2] = hex_chars[((unsigned)data[off] >> 0) & 0xf]; - render_verbatim(r, hex, 3); - break; - } - off++; - } else { - break; - } + md_html_output_verbatim((r), (verbatim), (MD_SIZE) (strlen(verbatim))) - beg = off; - } -} static unsigned hex_val(char ch) @@ -207,7 +129,7 @@ render_entity(MD_HTML* r, const MD_CHAR* text, MD_SIZE size, void (*fn_append)(MD_HTML*, const MD_CHAR*, MD_SIZE)) { if(r->flags & MD_HTML_FLAG_VERBATIM_ENTITIES) { - render_verbatim(r, text, size); + md_html_output_verbatim(r, text, size); return; } @@ -258,7 +180,7 @@ render_attribute(MD_HTML* r, const MD_ATTRIBUTE* attr, const MD_CHAR* text = attr->text + off; switch(type) { - case MD_TEXT_NULLCHAR: render_utf8_codepoint(r, 0x0000, render_verbatim); break; + case MD_TEXT_NULLCHAR: 
render_utf8_codepoint(r, 0x0000, md_html_output_verbatim); break; case MD_TEXT_ENTITY: render_entity(r, text, size, fn_append); break; default: fn_append(r, text, size); break; } @@ -302,7 +224,7 @@ render_open_code_block(MD_HTML* r, const MD_BLOCK_CODE_DETAIL* det) /* If known, output the HTML 5 attribute class="language-LANGNAME". */ if(det->lang.text != NULL) { RENDER_VERBATIM(r, " class=\"language-"); - render_attribute(r, &det->lang, render_html_escaped); + render_attribute(r, &det->lang, md_html_output_escaped); RENDER_VERBATIM(r, "\""); } @@ -327,11 +249,11 @@ static void render_open_a_span(MD_HTML* r, const MD_SPAN_A_DETAIL* det) { RENDER_VERBATIM(r, "href, render_url_escaped); + render_attribute(r, &det->href, md_html_output_url_escaped); if(det->title.text != NULL) { RENDER_VERBATIM(r, "\" title=\""); - render_attribute(r, &det->title, render_html_escaped); + render_attribute(r, &det->title, md_html_output_escaped); } RENDER_VERBATIM(r, "\">"); @@ -341,7 +263,7 @@ static void render_open_img_span(MD_HTML* r, const MD_SPAN_IMG_DETAIL* det) { RENDER_VERBATIM(r, "src, render_url_escaped); + render_attribute(r, &det->src, md_html_output_url_escaped); RENDER_VERBATIM(r, "\" alt=\""); } @@ -351,7 +273,7 @@ render_close_img_span(MD_HTML* r, const MD_SPAN_IMG_DETAIL* det) { if(det->title.text != NULL) { RENDER_VERBATIM(r, "\" title=\""); - render_attribute(r, &det->title, render_html_escaped); + render_attribute(r, &det->title, md_html_output_escaped); } RENDER_VERBATIM(r, (r->flags & MD_HTML_FLAG_XHTML) ? 
"\" />" : "\">"); @@ -361,18 +283,86 @@ static void render_open_wikilink_span(MD_HTML* r, const MD_SPAN_WIKILINK_DETAIL* det) { RENDER_VERBATIM(r, "target, render_html_escaped); + render_attribute(r, &det->target, md_html_output_escaped); RENDER_VERBATIM(r, "\">"); } +static void +md_html_init(MD_HTML* mh, void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + void* userdata, unsigned renderer_flags) +{ + int i; + + memset(mh, 0, sizeof(MD_HTML)); + mh->process_output = process_output; + mh->userdata = userdata; + mh->flags = renderer_flags; + mh->image_nesting_level = 0; + + /* Build map of characters which need escaping. */ + for(i = 0; i < 256; i++) { + unsigned char ch = (unsigned char) i; + + if(strchr("\"&<>", ch) != NULL) + mh->escape_map[i] |= NEED_HTML_ESC_FLAG; + + if(!ISALNUM(ch) && strchr("~-_.+!*(),%#@?=;:/,+$", ch) == NULL) + mh->escape_map[i] |= NEED_URL_ESC_FLAG; + } +} + /************************************** *** HTML renderer implementation *** **************************************/ -static int -enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) +int +md_html(const MD_CHAR* input, MD_SIZE input_size, + void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + void* userdata, unsigned parser_flags, unsigned renderer_flags) +{ + MD_HTML mh; + MD_PARSER_v2 parser = { + 0, + parser_flags, + md_html_enter_block, + md_html_leave_block, + md_html_enter_span, + md_html_leave_span, + md_html_text, + md_html_debug_log, + NULL + }; + + md_html_init(&mh, process_output, userdata, renderer_flags); + + /* For compatibility with old apps. 
*/ + if(renderer_flags & MD_HTML_FLAG_SKIP_UTF8_BOM) + parser.flags |= MD_FLAG_SKIPBOM; + + return md_parse(input, input_size, (MD_PARSER*) &parser, (void*) &mh); +} + +MD_HTML* +md_html_create(void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + void* userdata, unsigned renderer_flags) +{ + MD_HTML* mh; + mh = (MD_HTML*) malloc(sizeof(MD_HTML)); + if(mh != NULL) + md_html_init(mh, process_output, userdata, renderer_flags); + return mh; +} + +void +md_html_destroy(MD_HTML* mh) +{ + free(mh); +} + +int +md_html_enter_block(int type, void* detail, void* userdata) { static const MD_CHAR* head[6] = { "

", "

", "

", "

", "

", "
" }; MD_HTML* r = (MD_HTML*) userdata; @@ -399,8 +389,8 @@ enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) return 0; } -static int -leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) +int +md_html_leave_block(int type, void* detail, void* userdata) { static const MD_CHAR* head[6] = { "
\n", "\n", "\n", "\n", "\n", "\n" }; MD_HTML* r = (MD_HTML*) userdata; @@ -427,8 +417,8 @@ leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata) return 0; } -static int -enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata) +int +md_html_enter_span(int type, void* detail, void* userdata) { MD_HTML* r = (MD_HTML*) userdata; int inside_img = (r->image_nesting_level > 0); @@ -468,8 +458,8 @@ enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata) return 0; } -static int -leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) +int +md_html_leave_span(int type, void* detail, void* userdata) { MD_HTML* r = (MD_HTML*) userdata; @@ -494,69 +484,108 @@ leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) return 0; } -static int -text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdata) +int +md_html_text(int type, const MD_CHAR* text, MD_SIZE size, void* userdata) { MD_HTML* r = (MD_HTML*) userdata; switch(type) { - case MD_TEXT_NULLCHAR: render_utf8_codepoint(r, 0x0000, render_verbatim); break; + case MD_TEXT_NULLCHAR: render_utf8_codepoint(r, 0x0000, md_html_output_verbatim); break; case MD_TEXT_BR: RENDER_VERBATIM(r, (r->image_nesting_level == 0 ? ((r->flags & MD_HTML_FLAG_XHTML) ? "
\n" : "
\n") : " ")); break; case MD_TEXT_SOFTBR: RENDER_VERBATIM(r, (r->image_nesting_level == 0 ? "\n" : " ")); break; - case MD_TEXT_HTML: render_verbatim(r, text, size); break; - case MD_TEXT_ENTITY: render_entity(r, text, size, render_html_escaped); break; - default: render_html_escaped(r, text, size); break; + case MD_TEXT_HTML: md_html_output_verbatim(r, text, size); break; + case MD_TEXT_ENTITY: render_entity(r, text, size, md_html_output_escaped); break; + default: md_html_output_escaped(r, text, size); break; } return 0; } -static void -debug_log_callback(const char* msg, void* userdata) +void +md_html_debug_log(const char* msg, void* userdata) { MD_HTML* r = (MD_HTML*) userdata; if(r->flags & MD_HTML_FLAG_DEBUG) fprintf(stderr, "MD4C: %s\n", msg); } -int -md_html(const MD_CHAR* input, MD_SIZE input_size, - void (*process_output)(const MD_CHAR*, MD_SIZE, void*), - void* userdata, unsigned parser_flags, unsigned renderer_flags) +void +md_html_output_verbatim(MD_HTML* r, const MD_CHAR* text, MD_SIZE size) { - MD_HTML render = { process_output, userdata, renderer_flags, 0, { 0 } }; - int i; + r->process_output(text, size, r->userdata); +} - MD_PARSER parser = { - 0, - parser_flags, - enter_block_callback, - leave_block_callback, - enter_span_callback, - leave_span_callback, - text_callback, - debug_log_callback, - NULL - }; +void +md_html_output_escaped(MD_HTML* r, const MD_CHAR* data, MD_SIZE size) +{ + MD_OFFSET beg = 0; + MD_OFFSET off = 0; - /* Build map of characters which need escaping. */ - for(i = 0; i < 256; i++) { - unsigned char ch = (unsigned char) i; + /* Some characters need to be escaped in normal HTML text. */ + #define NEED_HTML_ESC(ch) (r->escape_map[(unsigned char)(ch)] & NEED_HTML_ESC_FLAG) - if(strchr("\"&<>", ch) != NULL) - render.escape_map[i] |= NEED_HTML_ESC_FLAG; + while(1) { + /* Optimization: Use some loop unrolling. 
*/ + while(off + 3 < size && !NEED_HTML_ESC(data[off+0]) && !NEED_HTML_ESC(data[off+1]) + && !NEED_HTML_ESC(data[off+2]) && !NEED_HTML_ESC(data[off+3])) + off += 4; + while(off < size && !NEED_HTML_ESC(data[off])) + off++; - if(!ISALNUM(ch) && strchr("~-_.+!*(),%#@?=;:/,+$", ch) == NULL) - render.escape_map[i] |= NEED_URL_ESC_FLAG; + if(off > beg) + md_html_output_verbatim(r, data + beg, off - beg); + + if(off < size) { + switch(data[off]) { + case '&': RENDER_VERBATIM(r, "&"); break; + case '<': RENDER_VERBATIM(r, "<"); break; + case '>': RENDER_VERBATIM(r, ">"); break; + case '"': RENDER_VERBATIM(r, """); break; + } + off++; + } else { + break; + } + beg = off; } +} - /* For compatibility with old apps. */ - if(renderer_flags & MD_HTML_FLAG_SKIP_UTF8_BOM) - parser.flags |= MD_FLAG_SKIPBOM; +void +md_html_output_url_escaped(MD_HTML* r, const MD_CHAR* data, MD_SIZE size) +{ + static const MD_CHAR hex_chars[] = "0123456789ABCDEF"; + MD_OFFSET beg = 0; + MD_OFFSET off = 0; - return md_parse(input, input_size, &parser, (void*) &render); -} + /* Some characters need to be escaped in URL attributes. */ + #define NEED_URL_ESC(ch) (r->escape_map[(unsigned char)(ch)] & NEED_URL_ESC_FLAG) + while(1) { + while(off < size && !NEED_URL_ESC(data[off])) + off++; + if(off > beg) + md_html_output_verbatim(r, data + beg, off - beg); + + if(off < size) { + char hex[3]; + + switch(data[off]) { + case '&': RENDER_VERBATIM(r, "&"); break; + default: + hex[0] = '%'; + hex[1] = hex_chars[((unsigned)data[off] >> 4) & 0xf]; + hex[2] = hex_chars[((unsigned)data[off] >> 0) & 0xf]; + md_html_output_verbatim(r, hex, 3); + break; + } + off++; + } else { + break; + } + + beg = off; + } +} diff --git a/src/md4c-html.h b/src/md4c-html.h index 15adcb1b..84001d3d 100644 --- a/src/md4c-html.h +++ b/src/md4c-html.h @@ -40,11 +40,14 @@ #define MD_HTML_FLAG_XHTML 0x0008 -/* Render Markdown into HTML. +/* Simple do-it-all function for converting Markdown to HTML. 
* * Note only contents of <body> tag is generated. Caller must generate * HTML header/footer manually before/after calling md_html(). * + * For more control over the conversion (e.g. to customize the output), you may + * use the more fine-grained API below. + * * Params input and input_size specify the Markdown input. * Callback process_output() gets called with chunks of HTML output. * (Typical implementation may just output the bytes to a file or append to @@ -61,6 +64,80 @@ int md_html(const MD_CHAR* input, MD_SIZE input_size, void* userdata, unsigned parser_flags, unsigned renderer_flags); +/* The functions below provide finer-grained building blocks, which allow + * an application to e.g. customize how (some) Markdown syntax constructions are + * converted into HTML. + * + * The call to md_html() above is morally equivalent to this code: + * + * ``` C + * #include "md4c.h" + * #include "md4c-html.h" + * + * int + * md_html(const MD_CHAR* input, MD_SIZE input_size, + * void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + * void* userdata, unsigned parser_flags, unsigned renderer_flags) + * { + * MD_HTML* mh; + * MD_PARSER_v2 p; + * int ret; + * + * mh = md_html_create(process_output, userdata, renderer_flags); + * if(mh == NULL) + * return -1; + * + * memset(&p, 0, sizeof(p)); + * p.abi_version = 2; + * p.flags = parser_flags; + * p.enter_block = md_html_enter_block; + * p.leave_block = md_html_leave_block; + * p.enter_span = md_html_enter_span; + * p.leave_span = md_html_leave_span; + * p.text = md_html_text; + * p.debug_log = md_html_debug_log; + * + * ret = md_parse(input, input_size, (MD_PARSER*) &p, (void*) mh); + * + * md_html_destroy(mh); + * return ret; + * } + * ``` + * + * This allows the application to implement its own callbacks for md_parse() + * which may provide custom output e.g. for some block and/or span types, and + * call the original callback for block/span types it does not want to + * customize. 
+ */ + +/* An opaque structure representing the Markdown-to-HTML converter. */ +typedef struct MD_HTML MD_HTML; + +/* Create/destroy the Markdown-to-HTML converter structure. */ +MD_HTML* md_html_create(void (*process_output)(const MD_CHAR*, MD_SIZE, void*), + void* userdata, unsigned renderer_flags); +void md_html_destroy(MD_HTML* mh); + +/* Standard HTML callbacks for MD_PARSER_v2. + * + * (An application can use its own callbacks and use these functions as a "fallback" + * for stuff it does not want to customize. In such a case the application is + * responsible for propagating the MD_HTML* returned from md_html_create() as + * userdata to these standard callbacks.) + */ +int md_html_enter_block(int block_type, void* detail, void* userdata); +int md_html_leave_block(int block_type, void* detail, void* userdata); +int md_html_enter_span(int span_type, void* detail, void* userdata); +int md_html_leave_span(int span_type, void* detail, void* userdata); +int md_html_text(int text_type, const MD_CHAR* text, MD_SIZE size, void* userdata); +void md_html_debug_log(const char* msg, void* userdata); + +/* Functions to call from custom md_parse() callbacks to produce output. */ +void md_html_output_verbatim(MD_HTML* mh, const MD_CHAR* test, MD_SIZE size); +void md_html_output_escaped(MD_HTML* mh, const MD_CHAR* test, MD_SIZE size); +void md_html_output_url_escaped(MD_HTML* mh, const MD_CHAR* test, MD_SIZE size); + + #ifdef __cplusplus } /* extern "C" { */ #endif