From 071f71d76dd850d0d2e455b4ebb30026e41f2490 Mon Sep 17 00:00:00 2001 From: Martin Mitas Date: Tue, 16 Jan 2024 18:56:32 +0100 Subject: [PATCH] Rework permissive autolinks. * We have now dedicated run over the inline marks for them. * We check more throughly whether it really looks as an URL or e-mail address. The old implementation recognized even heavily broken ones. * This allows us to be much more careful in order not to cross already resolved marks. * Share substantial parts of the code between all three types of the permissive autolinks (URL, WWW, e-mail). * Merge their tests into one file, spec-permissive-autolinks.txt. * Add one pathological case which triggered quadratic behavior in the old implementation. --- scripts/run-tests.sh | 12 +- src/md4c.c | 288 +++++++++++++---------- test/pathological-tests.py | 6 +- test/spec-permissive-autolinks.txt | 248 +++++++++++++++++++ test/spec-permissive-email-autolinks.txt | 58 ----- test/spec-permissive-url-autolinks.txt | 76 ------ test/spec-permissive-www-autolinks.txt | 106 --------- 7 files changed, 423 insertions(+), 371 deletions(-) create mode 100644 test/spec-permissive-autolinks.txt delete mode 100644 test/spec-permissive-email-autolinks.txt delete mode 100644 test/spec-permissive-url-autolinks.txt delete mode 100644 test/spec-permissive-www-autolinks.txt diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index dab91b7b..8af7684e 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -31,16 +31,8 @@ echo "CommonMark specification:" $PYTHON "$TEST_DIR/run-testsuite.py" -s "$TEST_DIR/spec.txt" -p "$PROGRAM" echo -echo "Permissive e-mail autolinks extension:" -$PYTHON "$TEST_DIR/run-testsuite.py" -s "$TEST_DIR/spec-permissive-email-autolinks.txt" -p "$PROGRAM" - -echo -echo "Permissive URL autolinks extension:" -$PYTHON "$TEST_DIR/run-testsuite.py" -s "$TEST_DIR/spec-permissive-url-autolinks.txt" -p "$PROGRAM" - -echo -echo "WWW autolinks extension:" -$PYTHON "$TEST_DIR/run-testsuite.py" -s 
"$TEST_DIR/spec-permissive-www-autolinks.txt" -p "$PROGRAM" +echo "Permissive autolink extensions:" +$PYTHON "$TEST_DIR/run-testsuite.py" -s "$TEST_DIR/spec-permissive-autolinks.txt" -p "$PROGRAM" echo echo "Hard soft breaks extension:" diff --git a/src/md4c.c b/src/md4c.c index 79b91f5c..35f5ea09 100644 --- a/src/md4c.c +++ b/src/md4c.c @@ -3021,8 +3021,9 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) int codespan_scanned_till_paragraph_end = FALSE; for(line = lines; line < line_term; line++) { - OFF off = line->beg; + OFF line_beg = line->beg; OFF line_end = line->end; + OFF off = line_beg; while(TRUE) { CHAR ch; @@ -3245,7 +3246,7 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) { PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER); /* Push a dummy as a reserve for a closer. */ - PUSH_MARK('D', off, off, 0); + PUSH_MARK('D', line_beg, line_end, 0); } off++; @@ -3274,12 +3275,11 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) const SZ suffix_size = scheme_map[scheme_index].suffix_size; if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), scheme, scheme_size) && - (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) && off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), suffix, suffix_size)) { PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER); /* Push a dummy as a reserve for a closer. */ - PUSH_MARK('D', off, off, 0); + PUSH_MARK('D', line_beg, line_end, 0); off += 1 + suffix_size; break; } @@ -3291,13 +3291,10 @@ md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode) /* A potential permissive WWW autolink. 
*/ if(ch == _T('.')) { - if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3) && - (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) && - off + 1 < line_end) - { + if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), 3)) { PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER); /* Push a dummy as a reserve for a closer. */ - PUSH_MARK('D', off, off, 0); + PUSH_MARK('D', line_beg, line_end, 0); off++; continue; } @@ -3878,142 +3875,184 @@ md_analyze_dollar(MD_CTX* ctx, int mark_index) md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index); } -static void -md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index) +static MD_MARK* +md_scan_left_for_resolved_mark(MD_CTX* ctx, MD_MARK* mark_from, OFF off, MD_MARK** p_cursor) { - MD_MARK* opener = &ctx->marks[mark_index]; - int closer_index = mark_index + 1; - MD_MARK* closer = &ctx->marks[closer_index]; - MD_MARK* next_resolved_mark; - OFF off = opener->end; - int n_dots = FALSE; - int has_underscore_in_last_seg = FALSE; - int has_underscore_in_next_to_last_seg = FALSE; - int n_opened_parenthesis = 0; - int n_excess_parenthesis = 0; - - /* Check for domain. */ - while(off < ctx->size) { - if(ISALNUM(off) || CH(off) == _T('-')) { - off++; - } else if(CH(off) == _T('.')) { - /* We must see at least one period. */ - n_dots++; - has_underscore_in_next_to_last_seg = has_underscore_in_last_seg; - has_underscore_in_last_seg = FALSE; - off++; - } else if(CH(off) == _T('_')) { - /* No underscore may be present in the last two domain segments. 
*/ - has_underscore_in_last_seg = TRUE; - off++; - } else { - break; + MD_MARK* mark; + + for(mark = mark_from; mark >= ctx->marks; mark--) { + if(mark->ch == 'D' || mark->beg > off) + continue; + if(mark->beg <= off && off < mark->end && (mark->flags & MD_MARK_RESOLVED)) { + if(p_cursor != NULL) + *p_cursor = mark; + return mark; } + if(mark->end <= off) + break; } - if(off > opener->end && CH(off-1) == _T('.')) { - off--; - n_dots--; - } - if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg) - return; - /* Check for path. */ - next_resolved_mark = closer + 1; - while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED)) - next_resolved_mark++; - while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) { - /* Parenthesis must be balanced. */ - if(CH(off) == _T('(')) { - n_opened_parenthesis++; - } else if(CH(off) == _T(')')) { - if(n_opened_parenthesis > 0) - n_opened_parenthesis--; - else - n_excess_parenthesis++; - } + if(p_cursor != NULL) + *p_cursor = mark; + return NULL; +} - off++; - } +static MD_MARK* +md_scan_right_for_resolved_mark(MD_CTX* ctx, MD_MARK* mark_from, OFF off, MD_MARK** p_cursor) +{ + MD_MARK* mark; - /* Trim a trailing punctuation from the end. */ - while(TRUE) { - if(ISANYOF(off-1, _T("?!.,:*_~"))) { - off--; - } else if(CH(off-1) == ')' && n_excess_parenthesis > 0) { - /* Unmatched ')' can be in an interior of the path but not at the - * of it, so the auto-link may be safely nested in a parenthesis - * pair. */ - off--; - n_excess_parenthesis--; - } else { - break; + for(mark = mark_from; mark < ctx->marks + ctx->n_marks; mark++) { + if(mark->ch == 'D' || mark->end <= off) + continue; + if(mark->beg <= off && off < mark->end && (mark->flags & MD_MARK_RESOLVED)) { + if(p_cursor != NULL) + *p_cursor = mark; + return mark; } + if(mark->beg > off) + break; } - /* Ok. Lets call it an auto-link. 
Adapt opener and create closer to zero - * length so all the contents becomes the link text. */ - MD_ASSERT(closer->ch == 'D' || - ((ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS) && - (closer->ch == '.' || closer->ch == ':' || closer->ch == '@'))); - opener->end = opener->beg; - closer->ch = opener->ch; - closer->beg = off; - closer->end = off; - md_resolve_range(ctx, NULL, mark_index, closer_index); + if(p_cursor != NULL) + *p_cursor = mark; + return NULL; } -/* The permissive autolinks do not have to be enclosed in '<' '>' but we - * instead impose stricter rules what is understood as an e-mail address - * here. Actually any non-alphanumeric characters with exception of '.' - * are prohibited both in username and after '@'. */ static void -md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index) +md_analyze_permissive_autolink(MD_CTX* ctx, int mark_index) { + static const struct { + const MD_CHAR start_char; + const MD_CHAR delim_char; + const MD_CHAR* allowed_nonalnum_chars; + int min_components; + } URL_MAP[] = { + { _T('\0'), _T('.'), _T(".-_"), 2 }, /* host, mandatory */ + { _T('/'), _T('/'), _T("/.-_"), 0 }, /* path */ + { _T('?'), _T('&'), _T("&.-+_=()"), 1 }, /* query */ + { _T('#'), _T('\0'), _T(".-+_") , 1 } /* fragment */ + }; + MD_MARK* opener = &ctx->marks[mark_index]; - int closer_index; - MD_MARK* closer; + MD_MARK* closer = &ctx->marks[mark_index + 1]; /* The dummy. */ + OFF line_beg = closer->beg; /* md_collect_mark() set this for us */ + OFF line_end = closer->end; /* ditto */ OFF beg = opener->beg; OFF end = opener->end; - int dot_count = 0; + MD_MARK* left_cursor = opener; + int left_boundary_ok = FALSE; + MD_MARK* right_cursor = opener; + int right_boundary_ok = FALSE; + unsigned i; - MD_ASSERT(CH(beg) == _T('@')); + MD_ASSERT(closer->ch == 'D'); + + if(opener->ch == '@') { + MD_ASSERT(CH(opener->beg) == _T('@')); + + /* Scan backwards for the user name (before '@'). 
*/ + while(beg > line_beg) { + if(ISALNUM(beg-1)) + beg--; + else if(beg >= line_beg+2 && ISALNUM(beg-2) && + ISANYOF(beg-1, _T(".-_+")) && + md_scan_left_for_resolved_mark(ctx, left_cursor, beg-1, &left_cursor) == NULL && + ISALNUM(beg)) + beg--; + else + break; + } + if(beg == opener->beg) /* empty user name */ + return; + } + + for(i = 0; i < SIZEOF_ARRAY(URL_MAP); i++) { + int n_components = 0; + int n_open_brackets = 0; + + if(URL_MAP[i].start_char != _T('\0')) { + if(end + 1 >= line_end || CH(end) != URL_MAP[i].start_char || !ISALNUM(end+1)) + continue; + end++; + } - /* Scan for name before '@'. */ - while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+")))) - beg--; + while(end < line_end) { + if(ISALNUM(end)) { + if(n_components == 0) + n_components++; + end++; + } else if(end < line_end && + ISANYOF(end, URL_MAP[i].allowed_nonalnum_chars) && + md_scan_right_for_resolved_mark(ctx, right_cursor, end, &right_cursor) == NULL && + ((end > line_beg && (ISALNUM(end-1) || CH(end-1) == _T(')'))) || CH(end) == _T('(')) && + ((end+1 < line_end && (ISALNUM(end+1) || CH(end+1) == _T('('))) || CH(end) == _T(')'))) + { + if(CH(end) == URL_MAP[i].delim_char) + n_components++; + + /* brackets have to be balanced. */ + if(CH(end) == _T('(')) { + n_open_brackets++; + } else if(CH(end) == _T(')')) { + if(n_open_brackets <= 0) + break; + n_open_brackets--; + } - /* Scan for domain after '@'. */ - while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) { - if(CH(end) == _T('.')) - dot_count++; - end++; + end++; + } else { + break; + } + } + + if(n_components < URL_MAP[i].min_components || n_open_brackets != 0) + return; + + if(opener->ch == '@') /* E-mail autolinks wants only the host. */ + break; } - if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */ - dot_count--; - end--; + + /* Verify there's line boundary, whitespace or resolved emphasis mark just + * before and after the suspected autolink. 
*/ + if(beg == line_beg || ISUNICODEWHITESPACEBEFORE(beg) || ISANYOF(beg-1, _T("({["))) { + left_boundary_ok = TRUE; + } else if(ISANYOF(beg-1, _T("*_~"))) { + MD_MARK* left_mark; + + left_mark = md_scan_left_for_resolved_mark(ctx, left_cursor, beg-1, &left_cursor); + if(left_mark != NULL && (left_mark->flags & MD_MARK_OPENER)) + left_boundary_ok = TRUE; } - else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */ - return; - if(CH(end-1) == _T('@') || dot_count == 0) + if(!left_boundary_ok) return; - /* Ok. Lets call it auto-link. Adapt opener and create closer to zero - * length so all the contents becomes the link text. */ - closer_index = mark_index + 1; - closer = &ctx->marks[closer_index]; - MD_ASSERT(closer->ch == 'D'); + if(end == line_end || ISUNICODEWHITESPACE(end) || ISANYOF(end, _T(")}].!?,;"))) { + right_boundary_ok = TRUE; + } else { + MD_MARK* right_mark; + right_mark = md_scan_right_for_resolved_mark(ctx, right_cursor, end, &right_cursor); + if(right_mark != NULL && (right_mark->flags & MD_MARK_CLOSER)) + right_boundary_ok = TRUE; + } + if(!right_boundary_ok) + return; + + /* Success, we are an autolink. */ opener->beg = beg; opener->end = beg; - closer->ch = opener->ch; closer->beg = end; closer->end = end; - md_resolve_range(ctx, NULL, mark_index, closer_index); + closer->ch = opener->ch; + md_resolve_range(ctx, NULL, mark_index, mark_index + 1); } +#define MD_ANALYZE_NOSKIP_EMPH 0x01 + static inline void md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, - int mark_beg, int mark_end, const CHAR* mark_chars) + int mark_beg, int mark_end, const CHAR* mark_chars, unsigned flags) { int i = mark_beg; OFF last_end = lines[0].beg; @@ -4026,7 +4065,9 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, /* Skip resolved spans. 
*/ if(mark->flags & MD_MARK_RESOLVED) { - if(mark->flags & MD_MARK_OPENER) { + if((mark->flags & MD_MARK_OPENER) && + !((flags & MD_ANALYZE_NOSKIP_EMPH) && ISANYOF_(mark->ch, "*_~"))) + { MD_ASSERT(i < mark->next); i = mark->next + 1; } else { @@ -4059,8 +4100,8 @@ md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, case '~': md_analyze_tilde(ctx, i); break; case '$': md_analyze_dollar(ctx, i); break; case '.': /* Pass through. */ - case ':': md_analyze_permissive_url_autolink(ctx, i); break; - case '@': md_analyze_permissive_email_autolink(ctx, i); break; + case ':': /* Pass through. */ + case '@': md_analyze_permissive_autolink(ctx, i); break; } if(mark->flags & MD_MARK_RESOLVED) { @@ -4087,7 +4128,7 @@ md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mod MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode)); /* (1) Links. */ - md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!")); + md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"), 0); MD_CHECK(md_resolve_links(ctx, lines, n_lines)); BRACKET_OPENERS.head = -1; BRACKET_OPENERS.tail = -1; @@ -4098,7 +4139,7 @@ md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mod /* (2) Analyze table cell boundaries. 
*/ MD_ASSERT(n_lines == 1); ctx->n_table_cell_boundaries = 0; - md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|")); + md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"), 0); return ret; } @@ -4115,8 +4156,15 @@ md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines, { int i; - md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("&")); - md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:.")); + md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("&"), 0); + md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$"), 0); + + if((ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) != 0) { + /* These have to be processed last, as they may be greedy and expand + * from their original mark. Also their implementation must be careful + * not to cross any (previously) resolved marks when doing so. */ + md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("@:."), MD_ANALYZE_NOSKIP_EMPH); + } for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) { ctx->mark_chains[i].head = -1; diff --git a/test/pathological-tests.py b/test/pathological-tests.py index 78d77b90..fa6318be 100644 --- a/test/pathological-tests.py +++ b/test/pathological-tests.py @@ -91,7 +91,11 @@ re.compile("\r?\n){49999}")), "nested invalid link references": (("[" * 50000 + "]" * 50000 + "\n\n[a]: /b"), - re.compile("\[{50000}\]{50000}")) + re.compile("\[{50000}\]{50000}")), + "many broken permissive autolinks": + (("www._" * 50000 + "x"), + re.compile("

(www._){50000}x

"), + "--fpermissive-www-autolinks") } whitespace_re = re.compile('/s+/') diff --git a/test/spec-permissive-autolinks.txt b/test/spec-permissive-autolinks.txt new file mode 100644 index 00000000..f7d26719 --- /dev/null +++ b/test/spec-permissive-autolinks.txt @@ -0,0 +1,248 @@ + +# Permissive Autolinks + +Standard autolinks (as per CommonMark specification) have to be decorated with +`<` and `>` so for example: + +```````````````````````````````` example + + +. +

mailto:john.doe@gmail.com +https://example.com

+```````````````````````````````` + +With flags `MD_FLAG_PERMISSIVEURLAUTOLINKS`, `MD_FLAG_PERMISSIVEWWWAUTOLINKS` +and `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, MD4C is able also to recognize autolinks +without those marks. + +Example of permissive autolinks follows: + +```````````````````````````````` example +john.doe@gmail.com +https://www.example.com +www.example.com +. +

john.doe@gmail.com +https://www.example.com +www.example.com

+. +--fpermissive-email-autolinks +--fpermissive-url-autolinks +--fpermissive-www-autolinks +```````````````````````````````` + +However as this syntax also brings some more danger of false positives, more +strict rules apply to what characters may or may not form such autolinks. +When a need arises to use a link which does not satisfy these restrictions, +standard Markdown autolinks have to be used. + +First and foremost, these autolinks have to be delimited from surrounding text, +i.e. whitespace, beginning/end of line, or very limited punctuation must +precede and follow respectively. + +Therefore these are not autolinks because `:` precedes or follows: + +```````````````````````````````` example +:john.doe@gmail.com +:https://www.example.com +:www.example.com +. +

:john.doe@gmail.com +:https://www.example.com +:www.example.com

+. +--fpermissive-email-autolinks +--fpermissive-url-autolinks +--fpermissive-www-autolinks +```````````````````````````````` + +Allowed punctuation right before autolink includes only opening brackets `(`, +`{` or `[`: + +```````````````````````````````` example +[john.doe@gmail.com +(https://www.example.com +{www.example.com +. +

[john.doe@gmail.com +(https://www.example.com +{www.example.com

+. +--fpermissive-email-autolinks +--fpermissive-url-autolinks +--fpermissive-www-autolinks +```````````````````````````````` + +Correspondingly, the respective closing brackets may follow the autolinks. + +```````````````````````````````` example +john.doe@gmail.com] +https://www.example.com) +www.example.com} +. +

john.doe@gmail.com] +https://www.example.com) +www.example.com}

+. +--fpermissive-email-autolinks +--fpermissive-url-autolinks +--fpermissive-www-autolinks +```````````````````````````````` + +Some other punctuation characters are also allowed after the autolink so that +the autolinks may appear at the end of a sentence or clause (`.`, `!`, `?`, +`,`, `;`): + +```````````````````````````````` example +Have you ever visited http://zombo.com? +. +

Have you ever visited http://zombo.com?

+. +--fpermissive-url-autolinks +```````````````````````````````` + +Markdown emphasis mark can also precede (but only opening mark) or follow +(only closer mark): + +```````````````````````````````` example +You may contact me at **john.doe@example.com**. +. +

You may contact me at john.doe@example.com.

+. +--fpermissive-email-autolinks +```````````````````````````````` + +However the following is not, because in this example `*` is literal `*` and +such punctuation is not allowed before the autolink: + +```````````````````````````````` example +*john.doe@example.com + +john.doe@example.com* +. +

*john.doe@example.com

+

john.doe@example.com*

+. +--fpermissive-email-autolinks +```````````````````````````````` + +## Permissive URL Autolinks + +Permissive URL autolinks (`MD_FLAG_PERMISSIVEURLAUTOLINKS`) are formed +by mandatory URL scheme, mandatory host, optional path, optional query and +optional fragment. + +The permissive URL autolinks recognize only `http://`, `https://` and `ftp://` +as the scheme: + +```````````````````````````````` example +https://example.com +http://example.com +ftp://example.com + +ssh://example.com +. +

https://example.com +http://example.com +ftp://example.com

+

ssh://example.com

+. +--fpermissive-url-autolinks +```````````````````````````````` + +The host is a sequence made of alphanumerical characters, `.`, `-` and `_`. +It has to include at least two components delimited with `.`, last component +has to have at least two characters, and occurrence of `.`, `-` and `_` has to +be immediately preceded and followed with a letter or digit. + +The host specification may optionally be followed with path. Path begins with +character `/` and uses it also for delimiting path components. Every path +component is made of alphanumerical characters and `.`, `-`, `_`. Once again, +any occurrence of `.`, `-`, `_` has to be surrounded with alphanumerical +characters. + +```````````````````````````````` example +https://example.com/images/branding/logo_272x92.png +. +

https://example.com/images/branding/logo_272x92.png

+. +--fpermissive-url-autolinks +```````````````````````````````` + +Then optionally query may follow. The query begins with `?` and is then made of +alphanumerical characters, `&`, `.`, `-`, `+`, `_`, `=`, `(` and `)`. Once again any +of those non-alphanumerical characters has to be surrounded with alphanumerical +characters, and also brackets `(` have to be balanced `)`. + +```````````````````````````````` example +https://www.google.com/search?q=md4c+markdown +. +

https://www.google.com/search?q=md4c+markdown

+. +--fpermissive-url-autolinks +```````````````````````````````` + +And finally there may be an optional fragment. + +```````````````````````````````` example +https://example.com#fragment +. +

https://example.com#fragment

+. +--fpermissive-url-autolinks +```````````````````````````````` + +And finally one complex example: + +```````````````````````````````` example +http://commonmark.org + +(Visit https://encrypted.google.com/search?q=Markup+(business)) + +Anonymous FTP is available at ftp://foo.bar.baz. +. +

http://commonmark.org

+

(Visit https://encrypted.google.com/search?q=Markup+(business))

+

Anonymous FTP is available at ftp://foo.bar.baz.

+. +--fpermissive-url-autolinks +```````````````````````````````` + + +## Permissive WWW Autolinks + +Permissive WWW autolinks (`MD_FLAG_PERMISSIVEWWWAUTOLINKS`) are very similar +to the permissive URL autolinks. Actually the only difference is that instead +of providing an explicit scheme, they have to begin with `www.`. + +```````````````````````````````` example +www.google.com/search?q=Markdown +. +

www.google.com/search?q=Markdown

+. +--fpermissive-www-autolinks +```````````````````````````````` + + +## Permissive E-mail Autolinks + +Permissive E-mail autolinks (`MD_FLAG_PERMISSIVEEMAILAUTOLINKS`) impose the +following limitations to the e-mail addresses: + +1. The username (before the `@`) can only use alphanumerical characters and + characters `.`, `-`, `_` and `+`. However every such non-alphanumerical + character must be immediately preceded and followed by an alphanumerical + character. + + For example this is not an auto-link because of that double underscore `__`. + + ```````````````````````````````` example + john__doe@example.com + . +

john__doe@example.com

+ . + --fpermissive-email-autolinks + ```````````````````````````````` + +2. Same rules for domain as for URL and WWW autolinks apply. diff --git a/test/spec-permissive-email-autolinks.txt b/test/spec-permissive-email-autolinks.txt deleted file mode 100644 index f2e89979..00000000 --- a/test/spec-permissive-email-autolinks.txt +++ /dev/null @@ -1,58 +0,0 @@ - -# Permissive E-mail Autolinks - -With the flag `MD_FLAG_PERMISSIVEEMAILAUTOLINKS`, MD4C enables more permissive -recognition of e-mail addresses and transforms them to autolinks, even if they -do not exactly follow the syntax of autolink as specified in CommonMark -specification. - -This is standard CommonMark e-mail autolink: - -```````````````````````````````` example -E-mail: -. -

E-mail: mailto:john.doe@gmail.com

-. ---fpermissive-email-autolinks -```````````````````````````````` - -With the permissive autolinks enabled, this is sufficient: - -```````````````````````````````` example -E-mail: john.doe@gmail.com -. -

E-mail: john.doe@gmail.com

-. ---fpermissive-email-autolinks -```````````````````````````````` - -`+` can occur before the `@`, but not after. - -```````````````````````````````` example -hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is. -. -

hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.

-. ---fpermissive-email-autolinks -```````````````````````````````` - -`.`, `-`, and `_` can occur on both sides of the `@`, but only `.` may occur at -the end of the email address, in which case it will not be considered part of -the address: - -```````````````````````````````` example -a.b-c_d@a.b - -a.b-c_d@a.b. - -a.b-c_d@a.b- - -a.b-c_d@a.b_ -. -

a.b-c_d@a.b

-

a.b-c_d@a.b.

-

a.b-c_d@a.b-

-

a.b-c_d@a.b_

-. ---fpermissive-email-autolinks -```````````````````````````````` diff --git a/test/spec-permissive-url-autolinks.txt b/test/spec-permissive-url-autolinks.txt deleted file mode 100644 index d068499f..00000000 --- a/test/spec-permissive-url-autolinks.txt +++ /dev/null @@ -1,76 +0,0 @@ - -# Permissive URL Autolinks - -With the flag `MD_FLAG_PERMISSIVEURLAUTOLINKS`, MD4C enables more permissive recognition -of URLs and transform them to autolinks, even if they do not exactly follow the syntax -of autolink as specified in CommonMark specification. - -This is a standard CommonMark autolink: - -```````````````````````````````` example -Homepage: -. -

Homepage: https://github.com/mity/md4c

-. ---fpermissive-url-autolinks -```````````````````````````````` - -With the permissive autolinks enabled, this is sufficient: - -```````````````````````````````` example -Homepage: https://github.com/mity/md4c -. -

Homepage: https://github.com/mity/md4c

-. ---fpermissive-url-autolinks -```````````````````````````````` - -But this permissive autolink feature can work only for very widely used URL -schemes, in alphabetical order `ftp:`, `http:`, `https:`. - -That's why this is not a permissive autolink: - -```````````````````````````````` example -ssh://root@example.com -. -

ssh://root@example.com

-. ---fpermissive-url-autolinks -```````````````````````````````` - -The same rules for path validation as for permissivve WWW autolinks apply. -Therefore the final question mark here is not part of the autolink: - -```````````````````````````````` example -Have you ever visited http://www.zombo.com? -. -

Have you ever visited http://www.zombo.com?

-. ---fpermissive-url-autolinks -```````````````````````````````` - -But in contrast, in this example it is: - -```````````````````````````````` example -http://www.bing.com/search?q=md4c -. -

http://www.bing.com/search?q=md4c

-. ---fpermissive-url-autolinks -```````````````````````````````` - -And finally one complex example: - -```````````````````````````````` example -http://commonmark.org - -(Visit https://encrypted.google.com/search?q=Markup+(business)) - -Anonymous FTP is available at ftp://foo.bar.baz. -. -

http://commonmark.org

-

(Visit https://encrypted.google.com/search?q=Markup+(business))

-

Anonymous FTP is available at ftp://foo.bar.baz.

-. ---fpermissive-url-autolinks -```````````````````````````````` diff --git a/test/spec-permissive-www-autolinks.txt b/test/spec-permissive-www-autolinks.txt deleted file mode 100644 index 7a3a7725..00000000 --- a/test/spec-permissive-www-autolinks.txt +++ /dev/null @@ -1,106 +0,0 @@ - -# Permissive WWW Autolinks - -With the flag `MD_FLAG_PERMISSIVEWWWAUTOLINKS`, MD4C enables recognition of -autolinks starting with `www.`, even if they do not exactly follow the syntax -of autolink as specified in CommonMark specification. - -These do not have to be enclosed in `<` and `>`, and they even do not need -any preceding scheme specification. - -The WWW autolink will be recognized when the text `www.` is found followed by a -valid domain. A valid domain consists of segments of alphanumeric characters, -underscores (`_`) and hyphens (`-`) separated by periods (`.`). There must be -at least one period, and no underscores may be present in the last two segments -of the domain. - -The scheme `http` will be inserted automatically: - -```````````````````````````````` example -www.commonmark.org -. -

www.commonmark.org

-. ---fpermissive-www-autolinks -```````````````````````````````` - -After a valid domain, zero or more non-space non-`<` characters may follow: - -```````````````````````````````` example -Visit www.commonmark.org/help for more information. -. -

Visit www.commonmark.org/help for more information.

-. ---fpermissive-www-autolinks -```````````````````````````````` - -We then apply extended autolink path validation as follows: - -Trailing punctuation (specifically, `?`, `!`, `.`, `,`, `:`, `*`, `_`, and `~`) -will not be considered part of the autolink, though they may be included in the -interior of the link: - -```````````````````````````````` example -Visit www.commonmark.org. - -Visit www.commonmark.org/a.b. -. -

Visit www.commonmark.org.

-

Visit www.commonmark.org/a.b.

-. ---fpermissive-www-autolinks -```````````````````````````````` - -When an autolink ends in `)`, we scan the entire autolink for the total number -of parentheses. If there is a greater number of closing parentheses than -opening ones, we don't consider the last character part of the autolink, in -order to facilitate including an autolink inside a parenthesis: - -```````````````````````````````` example -www.google.com/search?q=Markup+(business) - -(www.google.com/search?q=Markup+(business)) -. -

www.google.com/search?q=Markup+(business)

-

(www.google.com/search?q=Markup+(business))

-. ---fpermissive-www-autolinks -```````````````````````````````` - -This check is only done when the link ends in a closing parentheses `)`, so if -the only parentheses are in the interior of the autolink, no special rules are -applied: - -```````````````````````````````` example -www.google.com/search?q=(business))+ok -. -

www.google.com/search?q=(business))+ok

-. ---fpermissive-www-autolinks -```````````````````````````````` - -If an autolink ends in a semicolon (`;`), we check to see if it appears to -resemble an [entity reference][entity references]; if the preceding text is `&` -followed by one or more alphanumeric characters. If so, it is excluded from -the autolink: - -```````````````````````````````` example -www.google.com/search?q=commonmark&hl=en - -www.google.com/search?q=commonmark&hl; -. -

www.google.com/search?q=commonmark&hl=en

-

www.google.com/search?q=commonmark&hl;

-. ---fpermissive-www-autolinks -```````````````````````````````` - -`<` immediately ends an autolink. - -```````````````````````````````` example -www.commonmark.org/hewww.commonmark.org/he<lp

-. ---fpermissive-www-autolinks -````````````````````````````````