From aa4381f9fad4050e59a1ddd9ce5e745fcab7734a Mon Sep 17 00:00:00 2001 From: Jonathan Coates Date: Wed, 27 Apr 2022 15:39:05 +0100 Subject: [PATCH 1/6] Update spec to 0.30 --- README.md | 2 +- tests/dune.inc | 36 +- tests/extract_tests.ml | 2 +- tests/spec.txt | 3234 ++++++++++++++++++++-------------------- 4 files changed, 1671 insertions(+), 1603 deletions(-) diff --git a/README.md b/README.md index e30297a5..d9cee68e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ formats. In addition to the library, a command-line tool `omd` is included to easily convert markdown into HTML. Omd aims at implementing the [Commonmark](https://commonmark.org/) standard. The -version currently targeted is [0.29](https://spec.commonmark.org/0.29/). +version currently targeted is [0.30](https://spec.commonmark.org/0.30/). Omd is developed on GitHub. If you need to report an issue, please do so at https://github.com/ocaml/omd/issues. diff --git a/tests/dune.inc b/tests/dune.inc index 6cca5892..5a9d1832 100644 --- a/tests/dune.inc +++ b/tests/dune.inc @@ -650,6 +650,9 @@ spec-647.md spec-647.html spec-648.md spec-648.html spec-649.md spec-649.html + spec-650.md spec-650.html + spec-651.md spec-651.html + spec-652.md spec-652.html attributes-001.md attributes-001.html attributes-002.md attributes-002.html attributes-003.md attributes-003.html @@ -4561,6 +4564,24 @@ (rule (alias spec-649) (action (diff spec-649.html spec-649.html.new))) +(rule + (action + (with-stdout-to spec-650.html.new (run ./omd.exe %{dep:spec-650.md})))) +(rule + (alias spec-650) + (action (diff spec-650.html spec-650.html.new))) +(rule + (action + (with-stdout-to spec-651.html.new (run ./omd.exe %{dep:spec-651.md})))) +(rule + (alias spec-651) + (action (diff spec-651.html spec-651.html.new))) +(rule + (action + (with-stdout-to spec-652.html.new (run ./omd.exe %{dep:spec-652.md})))) +(rule + (alias spec-652) + (action (diff spec-652.html spec-652.html.new))) (rule (action (with-stdout-to 
attributes-001.html.new @@ -4703,7 +4724,6 @@ (alias spec-025) (alias spec-026) (alias spec-027) - (alias spec-028) (alias spec-029) (alias spec-030) (alias spec-031) @@ -4846,10 +4866,10 @@ (alias spec-168) (alias spec-169) (alias spec-170) - (alias spec-171) (alias spec-172) (alias spec-173) (alias spec-174) + (alias spec-175) (alias spec-176) (alias spec-177) (alias spec-178) @@ -4858,6 +4878,8 @@ (alias spec-181) (alias spec-182) (alias spec-183) + (alias spec-184) + (alias spec-185) (alias spec-186) (alias spec-187) (alias spec-188) @@ -4878,7 +4900,6 @@ (alias spec-203) (alias spec-204) (alias spec-205) - (alias spec-206) (alias spec-207) (alias spec-208) (alias spec-209) @@ -4887,8 +4908,6 @@ (alias spec-212) (alias spec-213) (alias spec-214) - (alias spec-215) - (alias spec-216) (alias spec-217) (alias spec-218) (alias spec-219) @@ -5180,9 +5199,9 @@ (alias spec-513) (alias spec-514) (alias spec-515) + (alias spec-516) (alias spec-517) (alias spec-518) - (alias spec-519) (alias spec-520) (alias spec-521) (alias spec-522) @@ -5199,9 +5218,9 @@ (alias spec-533) (alias spec-534) (alias spec-535) + (alias spec-536) (alias spec-537) (alias spec-538) - (alias spec-539) (alias spec-540) (alias spec-541) (alias spec-542) @@ -5312,6 +5331,9 @@ (alias spec-647) (alias spec-648) (alias spec-649) + (alias spec-650) + (alias spec-651) + (alias spec-652) (alias attributes-001) (alias attributes-002) (alias attributes-003) diff --git a/tests/extract_tests.ml b/tests/extract_tests.ml index ae705717..f2f6136a 100644 --- a/tests/extract_tests.ml +++ b/tests/extract_tests.ml @@ -9,7 +9,7 @@ let protect ~finally f = r let disabled = - [ 175; 184; 185; 410; 411; 414; 415; 416; 428; 468; 469; 516; 536 ] + [ 028; 171; 206; 215; 216; 410; 411; 414; 415; 416; 428; 468; 469; 519; 539 ] let with_open_in fn f = let ic = open_in fn in diff --git a/tests/spec.txt b/tests/spec.txt index 3913de44..e6f31375 100644 --- a/tests/spec.txt +++ b/tests/spec.txt @@ -1,8 +1,8 @@ --- title: 
CommonMark Spec author: John MacFarlane -version: 0.29 -date: '2019-04-06' +version: 0.30 +date: '2021-06-19' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -270,6 +270,16 @@ of representing the structural distinctions we need to make, and the choice of HTML for the tests makes it possible to run the tests against an implementation without writing an abstract syntax tree renderer. +Note that not every feature of the HTML samples is mandated by +the spec. For example, the spec says what counts as a link +destination, but it doesn't mandate that non-ASCII characters in +the URL be percent-encoded. To use the automatic tests, +implementers will need to provide a renderer that conforms to +the expectations of the spec examples (percent-encoding +non-ASCII characters in URLs). But a conforming implementation +can use a different renderer and may choose not to +percent-encode non-ASCII characters in URLs. + This document is generated from a text file, `spec.txt`, written in Markdown with a small extension for the side-by-side tests. The script `tools/makespec.py` can be used to convert `spec.txt` into @@ -294,37 +304,31 @@ of [characters] rather than bytes. A conforming parser may be limited to a certain encoding. A [line](@) is a sequence of zero or more [characters] -other than newline (`U+000A`) or carriage return (`U+000D`), +other than line feed (`U+000A`) or carriage return (`U+000D`), followed by a [line ending] or by the end of file. -A [line ending](@) is a newline (`U+000A`), a carriage return -(`U+000D`) not followed by a newline, or a carriage return and a -following newline. +A [line ending](@) is a line feed (`U+000A`), a carriage return +(`U+000D`) not followed by a line feed, or a carriage return and a +following line feed. A line containing no characters, or a line containing only spaces (`U+0020`) or tabs (`U+0009`), is called a [blank line](@). 
The following definitions of character classes will be used in this spec: -A [whitespace character](@) is a space -(`U+0020`), tab (`U+0009`), newline (`U+000A`), line tabulation (`U+000B`), -form feed (`U+000C`), or carriage return (`U+000D`). - -[Whitespace](@) is a sequence of one or more [whitespace -characters]. - A [Unicode whitespace character](@) is any code point in the Unicode `Zs` general category, or a tab (`U+0009`), -carriage return (`U+000D`), newline (`U+000A`), or form feed -(`U+000C`). +line feed (`U+000A`), form feed (`U+000C`), or carriage return (`U+000D`). + +[Unicode whitespace](@) is a sequence of one or more +[Unicode whitespace characters]. -[Unicode whitespace](@) is a sequence of one -or more [Unicode whitespace characters]. +A [tab](@) is `U+0009`. A [space](@) is `U+0020`. -A [non-whitespace character](@) is any character -that is not a [whitespace character]. +An [ASCII control character](@) is a character between `U+0000–1F` (both +including) or `U+007F`. An [ASCII punctuation character](@) is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, @@ -333,14 +337,14 @@ is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, `[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), `{`, `|`, `}`, or `~` (U+007B–007E). -A [punctuation character](@) is an [ASCII +A [Unicode punctuation character](@) is an [ASCII punctuation character] or anything in the general Unicode categories `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. ## Tabs Tabs in lines are not expanded to [spaces]. However, -in contexts where whitespace helps to define block structure, +in contexts where spaces help to define block structure, tabs behave as if they were replaced by spaces with a tab stop of 4 characters. @@ -478,1575 +482,1916 @@ bar For security reasons, the Unicode character `U+0000` must be replaced with the REPLACEMENT CHARACTER (`U+FFFD`). 
-# Blocks and inlines - -We can think of a document as a sequence of -[blocks](@)---structural elements like paragraphs, block -quotations, lists, headings, rules, and code blocks. Some blocks (like -block quotes and list items) contain other blocks; others (like -headings and paragraphs) contain [inline](@) content---text, -links, emphasized text, images, code spans, and so on. -## Precedence +## Backslash escapes -Indicators of block structure always take precedence over indicators -of inline structure. So, for example, the following is a list with -two items, not a list with one item containing a code span: +Any ASCII punctuation character may be backslash-escaped: ```````````````````````````````` example -- `one -- two` +\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ . - +

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

```````````````````````````````` -This means that parsing can proceed in two steps: first, the block -structure of the document can be discerned; second, text lines inside -paragraphs, headings, and other block constructs can be parsed for inline -structure. The second step requires information about link reference -definitions that will be available only at the end of the first -step. Note that the first step requires processing lines in sequence, -but the second can be parallelized, since the inline parsing of -one block element does not affect the inline parsing of any other. - -## Container blocks and leaf blocks - -We can divide blocks into two types: -[container blocks](@), -which can contain other blocks, and [leaf blocks](@), -which cannot. - -# Leaf blocks +Backslashes before other characters are treated as literal +backslashes: -This section describes the different kinds of leaf block that make up a -Markdown document. +```````````````````````````````` example +\→\A\a\ \3\φ\« +. +

\→\A\a\ \3\φ\«

+```````````````````````````````` -## Thematic breaks -A line consisting of 0-3 spaces of indentation, followed by a sequence -of three or more matching `-`, `_`, or `*` characters, each followed -optionally by any number of spaces or tabs, forms a -[thematic break](@). +Escaped characters are treated as regular characters and do +not have their usual Markdown meanings: ```````````````````````````````` example -*** ---- -___ +\*not emphasized* +\
not a tag +\[not a link](/foo) +\`not code` +1\. not a list +\* not a list +\# not a heading +\[foo]: /url "not a reference" +\ö not a character entity . -
-
-
+

*not emphasized* +<br/> not a tag +[not a link](/foo) +`not code` +1. not a list +* not a list +# not a heading +[foo]: /url "not a reference" +&ouml; not a character entity

```````````````````````````````` -Wrong characters: +If a backslash is itself escaped, the following character is not: ```````````````````````````````` example -+++ +\\*emphasis* . -

+++

+

\emphasis

```````````````````````````````` +A backslash at the end of the line is a [hard line break]: + ```````````````````````````````` example -=== +foo\ +bar . -

===

+

foo
+bar

```````````````````````````````` -Not enough characters: +Backslash escapes do not work in code blocks, code spans, autolinks, or +raw HTML: ```````````````````````````````` example --- -** -__ +`` \[\` `` . -

-- -** -__

+

\[\`

```````````````````````````````` -One to three spaces indent are allowed: - ```````````````````````````````` example - *** - *** - *** + \[\] . -
-
-
+
\[\]
+
```````````````````````````````` -Four spaces is too many: - ```````````````````````````````` example - *** +~~~ +\[\] +~~~ . -
***
+
\[\]
 
```````````````````````````````` ```````````````````````````````` example -Foo - *** + . -

Foo -***

+

http://example.com?find=\*

```````````````````````````````` -More than three characters may be used: - ```````````````````````````````` example -_____________________________________ + . -
+
```````````````````````````````` -Spaces are allowed between the characters: +But they work in all other contexts, including URLs and link titles, +link references, and [info strings] in [fenced code blocks]: ```````````````````````````````` example - - - - +[foo](/bar\* "ti\*tle") . -
+

foo

```````````````````````````````` ```````````````````````````````` example - ** * ** * ** * ** +[foo] + +[foo]: /bar\* "ti\*tle" . -
+

foo

```````````````````````````````` ```````````````````````````````` example -- - - - +``` foo\+bar +foo +``` . -
+
foo
+
```````````````````````````````` -Spaces are allowed at the end: +## Entity and numeric character references -```````````````````````````````` example -- - - - -. -
-```````````````````````````````` +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: +- Entity and character references are not recognized in code + blocks and code spans. -However, no other characters may occur in the line: +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `*` can be used + in place of a literal `*` character, `*` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. -```````````````````````````````` example -_ _ _ _ a +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. -a------ +[Entity references](@) consist of `&` + any of the valid +HTML5 entity names + `;`. The +document +is used as an authoritative source for the valid entity +references and their corresponding code points. ----a--- +```````````````````````````````` example +  & © Æ Ď +¾ ℋ ⅆ +∲ ≧̸ . -

_ _ _ _ a

-

a------

-

---a---

+

  & © Æ Ď +¾ ℋ ⅆ +∲ ≧̸

```````````````````````````````` -It is required that all of the [non-whitespace characters] be the same. -So, this is not a thematic break: +[Decimal numeric character +references](@) +consist of `&#` + a string of 1--7 arabic digits + `;`. A +numeric character reference is parsed as the corresponding +Unicode character. Invalid Unicode code points will be replaced by +the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, +the code point `U+0000` will also be replaced by `U+FFFD`. ```````````````````````````````` example - *-* +# Ӓ Ϡ � . -

-

+

# Ӓ Ϡ �

```````````````````````````````` -Thematic breaks do not need blank lines before or after: +[Hexadecimal numeric character +references](@) consist of `&#` + +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. +They too are parsed as the corresponding Unicode character (this +time specified with a hexadecimal numeral instead of decimal). ```````````````````````````````` example -- foo -*** -- bar +" ആ ಫ . -
    -
  • foo
  • -
-
-
    -
  • bar
  • -
+

" ആ ಫ

```````````````````````````````` -Thematic breaks can interrupt a paragraph: +Here are some nonentities: ```````````````````````````````` example -Foo -*** -bar +  &x; &#; &#x; +� +&#abcdef0; +&ThisIsNotDefined; &hi?; . -

Foo

-
-

bar

+

&nbsp &x; &#; &#x; +&#87654321; +&#abcdef0; +&ThisIsNotDefined; &hi?;

```````````````````````````````` -If a line of dashes that meets the above conditions for being a -thematic break could also be interpreted as the underline of a [setext -heading], the interpretation as a -[setext heading] takes precedence. Thus, for example, -this is a setext heading, not a paragraph followed by a thematic break: +Although HTML5 does accept some entity references +without a trailing semicolon (such as `©`), these are not +recognized here, because it makes the grammar too ambiguous: ```````````````````````````````` example -Foo ---- -bar +© . -

Foo

-

bar

+

&copy

```````````````````````````````` -When both a thematic break and a list item are possible -interpretations of a line, the thematic break takes precedence: +Strings that are not on the list of HTML5 named entities are not +recognized as entity references either: ```````````````````````````````` example -* Foo -* * * -* Bar +&MadeUpEntity; . -
    -
  • Foo
  • -
-
-
    -
  • Bar
  • -
+

&MadeUpEntity;

```````````````````````````````` -If you want a thematic break in a list item, use a different bullet: +Entity and numeric character references are recognized in any +context besides code spans or code blocks, including +URLs, [link titles], and [fenced code block][] [info strings]: ```````````````````````````````` example -- Foo -- * * * + . -
    -
  • Foo
  • -
  • -
    -
  • -
+
```````````````````````````````` -## ATX headings - -An [ATX heading](@) -consists of a string of characters, parsed as inline content, between an -opening sequence of 1--6 unescaped `#` characters and an optional -closing sequence of any number of unescaped `#` characters. -The opening sequence of `#` characters must be followed by a -[space] or by the end of line. The optional closing sequence of `#`s must be -preceded by a [space] and may be followed by spaces only. The opening -`#` character may be indented 0-3 spaces. The raw contents of the -heading are stripped of leading and trailing spaces before being parsed -as inline content. The heading level is equal to the number of `#` -characters in the opening sequence. +```````````````````````````````` example +[foo](/föö "föö") +. +

foo

+```````````````````````````````` -Simple headings: ```````````````````````````````` example -# foo -## foo -### foo -#### foo -##### foo -###### foo +[foo] + +[foo]: /föö "föö" . -

foo

-

foo

-

foo

-

foo

-
foo
-
foo
+

foo

```````````````````````````````` -More than six `#` characters is not a heading: - ```````````````````````````````` example -####### foo +``` föö +foo +``` . -

####### foo

+
foo
+
```````````````````````````````` -At least one space is required between the `#` characters and the -heading's contents, unless the heading is empty. Note that many -implementations currently do not require the space. However, the -space was required by the -[original ATX implementation](http://www.aaronsw.com/2002/atx/atx.py), -and it helps prevent things like the following from being parsed as -headings: +Entity and numeric character references are treated as literal +text in code spans and code blocks: ```````````````````````````````` example -#5 bolt +`föö` +. +

f&ouml;&ouml;

+```````````````````````````````` -#hashtag + +```````````````````````````````` example + föfö . -

#5 bolt

-

#hashtag

+
f&ouml;f&ouml;
+
```````````````````````````````` -This is not a heading, because the first `#` is escaped: +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. ```````````````````````````````` example -\## foo +*foo* +*foo* . -

## foo

+

*foo* +foo

```````````````````````````````` +```````````````````````````````` example +* foo -Contents are parsed as inlines: +* foo +. +

* foo

+
    +
  • foo
  • +
+```````````````````````````````` ```````````````````````````````` example -# foo *bar* \*baz\* +foo bar . -

foo bar *baz*

+

foo + +bar

```````````````````````````````` +```````````````````````````````` example + foo +. +

→foo

+```````````````````````````````` -Leading and trailing [whitespace] is ignored in parsing inline content: ```````````````````````````````` example -# foo +[a](url "tit") . -

foo

+

[a](url "tit")

```````````````````````````````` -One to three spaces indentation are allowed: + +# Blocks and inlines + +We can think of a document as a sequence of +[blocks](@)---structural elements like paragraphs, block +quotations, lists, headings, rules, and code blocks. Some blocks (like +block quotes and list items) contain other blocks; others (like +headings and paragraphs) contain [inline](@) content---text, +links, emphasized text, images, code spans, and so on. + +## Precedence + +Indicators of block structure always take precedence over indicators +of inline structure. So, for example, the following is a list with +two items, not a list with one item containing a code span: ```````````````````````````````` example - ### foo - ## foo - # foo +- `one +- two` . -

foo

-

foo

-

foo

+
    +
  • `one
  • +
  • two`
  • +
```````````````````````````````` -Four spaces are too much: +This means that parsing can proceed in two steps: first, the block +structure of the document can be discerned; second, text lines inside +paragraphs, headings, and other block constructs can be parsed for inline +structure. The second step requires information about link reference +definitions that will be available only at the end of the first +step. Note that the first step requires processing lines in sequence, +but the second can be parallelized, since the inline parsing of +one block element does not affect the inline parsing of any other. + +## Container blocks and leaf blocks + +We can divide blocks into two types: +[container blocks](#container-blocks), +which can contain other blocks, and [leaf blocks](#leaf-blocks), +which cannot. + +# Leaf blocks + +This section describes the different kinds of leaf block that make up a +Markdown document. + +## Thematic breaks + +A line consisting of optionally up to three spaces of indentation, followed by a +sequence of three or more matching `-`, `_`, or `*` characters, each followed +optionally by any number of spaces or tabs, forms a +[thematic break](@). ```````````````````````````````` example - # foo +*** +--- +___ . -
# foo
-
+
+
+
```````````````````````````````` +Wrong characters: + ```````````````````````````````` example -foo - # bar ++++ . -

foo -# bar

+

+++

```````````````````````````````` -A closing sequence of `#` characters is optional: - ```````````````````````````````` example -## foo ## - ### bar ### +=== . -

foo

-

bar

+

===

```````````````````````````````` -It need not be the same length as the opening sequence: +Not enough characters: ```````````````````````````````` example -# foo ################################## -##### foo ## +-- +** +__ . -

foo

-
foo
+

-- +** +__

```````````````````````````````` -Spaces are allowed after the closing sequence: +Up to three spaces of indentation are allowed: ```````````````````````````````` example -### foo ### + *** + *** + *** . -

foo

+
+
+
```````````````````````````````` -A sequence of `#` characters with anything but [spaces] following it -is not a closing sequence, but counts as part of the contents of the -heading: +Four spaces of indentation is too many: ```````````````````````````````` example -### foo ### b + *** . -

foo ### b

+
***
+
```````````````````````````````` -The closing sequence must be preceded by a space: - ```````````````````````````````` example -# foo# +Foo + *** . -

foo#

+

Foo +***

```````````````````````````````` -Backslash-escaped `#` characters do not count as part -of the closing sequence: +More than three characters may be used: ```````````````````````````````` example -### foo \### -## foo #\## -# foo \# +_____________________________________ . -

foo ###

-

foo ###

-

foo #

+
```````````````````````````````` -ATX headings need not be separated from surrounding content by blank -lines, and they can interrupt paragraphs: +Spaces and tabs are allowed between the characters: ```````````````````````````````` example -**** -## foo -**** + - - - .
-

foo

-
```````````````````````````````` ```````````````````````````````` example -Foo bar -# baz -Bar foo + ** * ** * ** * ** . -

Foo bar

-

baz

-

Bar foo

+
```````````````````````````````` -ATX headings can be empty: - ```````````````````````````````` example -## -# -### ### +- - - - . -

-

-

+
```````````````````````````````` -## Setext headings - -A [setext heading](@) consists of one or more -lines of text, each containing at least one [non-whitespace -character], with no more than 3 spaces indentation, followed by -a [setext heading underline]. The lines of text must be such -that, were they not followed by the setext heading underline, -they would be interpreted as a paragraph: they cannot be -interpretable as a [code fence], [ATX heading][ATX headings], -[block quote][block quotes], [thematic break][thematic breaks], -[list item][list items], or [HTML block][HTML blocks]. - -A [setext heading underline](@) is a sequence of -`=` characters or a sequence of `-` characters, with no more than 3 -spaces indentation and any number of trailing spaces. If a line -containing a single `-` can be interpreted as an -empty [list items], it should be interpreted this way -and not as a [setext heading underline]. +Spaces and tabs are allowed at the end: -The heading is a level 1 heading if `=` characters are used in -the [setext heading underline], and a level 2 heading if `-` -characters are used. The contents of the heading are the result -of parsing the preceding lines of text as CommonMark inline -content. +```````````````````````````````` example +- - - - +. +
+```````````````````````````````` -In general, a setext heading need not be preceded or followed by a -blank line. However, it cannot interrupt a paragraph, so when a -setext heading comes after a paragraph, a blank line is needed between -them. -Simple examples: +However, no other characters may occur in the line: ```````````````````````````````` example -Foo *bar* -========= +_ _ _ _ a -Foo *bar* ---------- +a------ + +---a--- . -

Foo bar

-

Foo bar

+

_ _ _ _ a

+

a------

+

---a---

```````````````````````````````` -The content of the header may span more than one line: +It is required that all of the characters other than spaces or tabs be the same. +So, this is not a thematic break: ```````````````````````````````` example -Foo *bar -baz* -==== + *-* . -

Foo bar -baz

+

-

```````````````````````````````` -The contents are the result of parsing the headings's raw -content as inlines. The heading's raw content is formed by -concatenating the lines and removing initial and final -[whitespace]. + +Thematic breaks do not need blank lines before or after: ```````````````````````````````` example - Foo *bar -baz*→ -==== +- foo +*** +- bar . -

Foo bar -baz

+
    +
  • foo
  • +
+
+
    +
  • bar
  • +
```````````````````````````````` -The underlining can be any length: +Thematic breaks can interrupt a paragraph: ```````````````````````````````` example Foo -------------------------- - -Foo -= +*** +bar . -

Foo

-

Foo

+

Foo

+
+

bar

```````````````````````````````` -The heading content can be indented up to three spaces, and need -not line up with the underlining: +If a line of dashes that meets the above conditions for being a +thematic break could also be interpreted as the underline of a [setext +heading], the interpretation as a +[setext heading] takes precedence. Thus, for example, +this is a setext heading, not a paragraph followed by a thematic break: ```````````````````````````````` example - Foo +Foo --- - - Foo ------ - - Foo - === +bar .

Foo

-

Foo

-

Foo

+

bar

```````````````````````````````` -Four spaces indent is too much: +When both a thematic break and a list item are possible +interpretations of a line, the thematic break takes precedence: ```````````````````````````````` example - Foo - --- - - Foo ---- +* Foo +* * * +* Bar . -
Foo
----
+
    +
  • Foo
  • +
+
+
    +
  • Bar
  • +
+```````````````````````````````` -Foo -
+ +If you want a thematic break in a list item, use a different bullet: + +```````````````````````````````` example +- Foo +- * * * +. +
    +
  • Foo
  • +

  • +
  • +
```````````````````````````````` -The setext heading underline can be indented up to three spaces, and -may have trailing spaces: +## ATX headings + +An [ATX heading](@) +consists of a string of characters, parsed as inline content, between an +opening sequence of 1--6 unescaped `#` characters and an optional +closing sequence of any number of unescaped `#` characters. +The opening sequence of `#` characters must be followed by spaces or tabs, or +by the end of line. The optional closing sequence of `#`s must be preceded by +spaces or tabs and may be followed by spaces or tabs only. The opening +`#` character may be preceded by up to three spaces of indentation. The raw +contents of the heading are stripped of leading and trailing space or tabs +before being parsed as inline content. The heading level is equal to the number +of `#` characters in the opening sequence. + +Simple headings: ```````````````````````````````` example -Foo - ---- +# foo +## foo +### foo +#### foo +##### foo +###### foo . -

Foo

+

foo

+

foo

+

foo

+

foo

+
foo
+
foo
```````````````````````````````` -Four spaces is too much: +More than six `#` characters is not a heading: ```````````````````````````````` example -Foo - --- +####### foo . -

Foo ----

+

####### foo

```````````````````````````````` -The setext heading underline cannot contain internal spaces: +At least one space or tab is required between the `#` characters and the +heading's contents, unless the heading is empty. Note that many +implementations currently do not require the space. However, the +space was required by the +[original ATX implementation](http://www.aaronsw.com/2002/atx/atx.py), +and it helps prevent things like the following from being parsed as +headings: ```````````````````````````````` example -Foo -= = +#5 bolt -Foo ---- - +#hashtag . -

Foo -= =

-

Foo

-
+

#5 bolt

+

#hashtag

```````````````````````````````` -Trailing spaces in the content line do not cause a line break: +This is not a heading, because the first `#` is escaped: ```````````````````````````````` example -Foo ------ +\## foo . -

Foo

+

## foo

```````````````````````````````` -Nor does a backslash at the end: +Contents are parsed as inlines: ```````````````````````````````` example -Foo\ ----- +# foo *bar* \*baz\* . -

Foo\

+

foo bar *baz*

```````````````````````````````` -Since indicators of block structure take precedence over -indicators of inline structure, the following are setext headings: +Leading and trailing spaces or tabs are ignored in parsing inline content: ```````````````````````````````` example -`Foo ----- -` - - +# foo . -

`Foo

-

`

-

<a title="a lot

-

of dashes"/>

+

foo

```````````````````````````````` -The setext heading underline cannot be a [lazy continuation -line] in a list item or block quote: +Up to three spaces of indentation are allowed: ```````````````````````````````` example -> Foo ---- + ### foo + ## foo + # foo . -
-

Foo

-
-
+

foo

+

foo

+

foo

```````````````````````````````` +Four spaces of indentation is too many: + ```````````````````````````````` example -> foo -bar -=== + # foo . -
-

foo -bar -===

-
+
# foo
+
```````````````````````````````` ```````````````````````````````` example -- Foo ---- +foo + # bar . -
    -
  • Foo
  • -
-
+

foo +# bar

```````````````````````````````` -A blank line is needed between a paragraph and a following -setext heading, since otherwise the paragraph becomes part -of the heading's content: +A closing sequence of `#` characters is optional: ```````````````````````````````` example -Foo -Bar ---- +## foo ## + ### bar ### . -

Foo -Bar

+

foo

+

bar

```````````````````````````````` -But in general a blank line is not required before or after -setext headings: +It need not be the same length as the opening sequence: ```````````````````````````````` example ---- -Foo ---- -Bar ---- -Baz +# foo ################################## +##### foo ## . -
-

Foo

-

Bar

-

Baz

+

foo

+
foo
```````````````````````````````` -Setext headings cannot be empty: +Spaces or tabs are allowed after the closing sequence: ```````````````````````````````` example +### foo ### +. +

foo

+```````````````````````````````` -==== + +A sequence of `#` characters with anything but spaces or tabs following it +is not a closing sequence, but counts as part of the contents of the +heading: + +```````````````````````````````` example +### foo ### b . -

====

+

foo ### b

```````````````````````````````` -Setext heading text lines must not be interpretable as block -constructs other than paragraphs. So, the line of dashes -in these examples gets interpreted as a thematic break: +The closing sequence must be preceded by a space or tab: ```````````````````````````````` example ---- ---- +# foo# . -
-
+

foo#

```````````````````````````````` +Backslash-escaped `#` characters do not count as part +of the closing sequence: + ```````````````````````````````` example -- foo ------ +### foo \### +## foo #\## +# foo \# . -
    -
  • foo
  • -
-
+

foo ###

+

foo ###

+

foo #

```````````````````````````````` +ATX headings need not be separated from surrounding content by blank +lines, and they can interrupt paragraphs: + ```````````````````````````````` example - foo ---- +**** +## foo +**** . -
foo
-
+
+

foo


```````````````````````````````` ```````````````````````````````` example -> foo ------ +Foo bar +# baz +Bar foo . -
-

foo

-
-
+

Foo bar

+

baz

+

Bar foo

```````````````````````````````` -If you want a heading with `> foo` as its literal text, you can -use backslash escapes: +ATX headings can be empty: ```````````````````````````````` example -\> foo ------- +## +# +### ### . -

> foo

+

+

+

```````````````````````````````` -**Compatibility note:** Most existing Markdown implementations -do not allow the text of setext headings to span multiple lines. -But there is no consensus about how to interpret +## Setext headings -``` markdown -Foo -bar ---- -baz -``` +A [setext heading](@) consists of one or more +lines of text, not interrupted by a blank line, of which the first line does not +have more than 3 spaces of indentation, followed by +a [setext heading underline]. The lines of text must be such +that, were they not followed by the setext heading underline, +they would be interpreted as a paragraph: they cannot be +interpretable as a [code fence], [ATX heading][ATX headings], +[block quote][block quotes], [thematic break][thematic breaks], +[list item][list items], or [HTML block][HTML blocks]. -One can find four different interpretations: +A [setext heading underline](@) is a sequence of +`=` characters or a sequence of `-` characters, with no more than 3 +spaces of indentation and any number of trailing spaces or tabs. If a line +containing a single `-` can be interpreted as an +empty [list items], it should be interpreted this way +and not as a [setext heading underline]. -1. paragraph "Foo", heading "bar", paragraph "baz" -2. paragraph "Foo bar", thematic break, paragraph "baz" -3. paragraph "Foo bar --- baz" -4. heading "Foo bar", paragraph "baz" +The heading is a level 1 heading if `=` characters are used in +the [setext heading underline], and a level 2 heading if `-` +characters are used. The contents of the heading are the result +of parsing the preceding lines of text as CommonMark inline +content. -We find interpretation 4 most natural, and interpretation 4 -increases the expressive power of CommonMark, by allowing -multiline headings. Authors who want interpretation 1 can -put a blank line after the first paragraph: +In general, a setext heading need not be preceded or followed by a +blank line. 
However, it cannot interrupt a paragraph, so when a +setext heading comes after a paragraph, a blank line is needed between +them. + +Simple examples: ```````````````````````````````` example -Foo +Foo *bar* +========= -bar ---- -baz +Foo *bar* +--------- . -

Foo

-

bar

-

baz

+

Foo bar

+

Foo bar

```````````````````````````````` -Authors who want interpretation 2 can put blank lines around -the thematic break, +The content of the header may span more than one line: ```````````````````````````````` example -Foo -bar +Foo *bar +baz* +==== +. +

Foo bar +baz

+```````````````````````````````` ---- +The contents are the result of parsing the heading's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +spaces or tabs. -baz +```````````````````````````````` example + Foo *bar +baz*→ +==== . -

Foo -bar

-
-

baz

+

Foo bar +baz

```````````````````````````````` -or use a thematic break that cannot count as a [setext heading -underline], such as +The underlining can be any length: ```````````````````````````````` example Foo -bar -* * * -baz +------------------------- + +Foo += . -

Foo -bar

-
-

baz

+

Foo

+

Foo

```````````````````````````````` -Authors who want interpretation 3 can use backslash escapes: +The heading content can be preceded by up to three spaces of indentation, and +need not line up with the underlining: ```````````````````````````````` example -Foo -bar -\--- -baz -. -

Foo -bar + Foo --- -baz

-```````````````````````````````` + Foo +----- -## Indented code blocks + Foo + === +. +

Foo

+

Foo

+

Foo

+```````````````````````````````` -An [indented code block](@) is composed of one or more -[indented chunks] separated by blank lines. -An [indented chunk](@) is a sequence of non-blank lines, -each indented four or more spaces. The contents of the code block are -the literal contents of the lines, including trailing -[line endings], minus four spaces of indentation. -An indented code block has no [info string]. -An indented code block cannot interrupt a paragraph, so there must be -a blank line between a paragraph and a following indented code block. -(A blank line is not needed, however, between a code block and a following -paragraph.) +Four spaces of indentation is too many: ```````````````````````````````` example - a simple - indented code block + Foo + --- + + Foo +--- . -
a simple
-  indented code block
+
Foo
+---
+
+Foo
 
+
```````````````````````````````` -If there is any ambiguity between an interpretation of indentation -as a code block and as indicating that material belongs to a [list -item][list items], the list item interpretation takes precedence: +The setext heading underline can be preceded by up to three spaces of +indentation, and may have trailing spaces or tabs: ```````````````````````````````` example - - foo - - bar +Foo + ---- . -
    -
  • -

    foo

    -

    bar

    -
  • -
+

Foo

```````````````````````````````` -```````````````````````````````` example -1. foo +Four spaces of indentation is too many: - - bar +```````````````````````````````` example +Foo + --- . -
    -
  1. -

    foo

    -
      -
    • bar
    • -
    -
  2. -
+

Foo +---

```````````````````````````````` - -The contents of a code block are literal text, and do not get parsed -as Markdown: +The setext heading underline cannot contain internal spaces or tabs: ```````````````````````````````` example -
- *hi* +Foo += = - - one +Foo +--- - . -
<a/>
-*hi*
-
-- one
-
+

Foo += =

+

Foo

+
```````````````````````````````` -Here we have three chunks separated by blank lines: +Trailing spaces or tabs in the content line do not cause a hard line break: ```````````````````````````````` example - chunk1 - - chunk2 - - - - chunk3 +Foo +----- . -
chunk1
-
-chunk2
-
-
-
-chunk3
-
+

Foo

```````````````````````````````` -Any initial spaces beyond four will be included in the content, even -in interior blank lines: +Nor does a backslash at the end: ```````````````````````````````` example - chunk1 - - chunk2 +Foo\ +---- . -
chunk1
-  
-  chunk2
-
+

Foo\

```````````````````````````````` -An indented code block cannot interrupt a paragraph. (This -allows hanging indents and the like.) +Since indicators of block structure take precedence over +indicators of inline structure, the following are setext headings: ```````````````````````````````` example -Foo - bar +`Foo +---- +` +
. -

Foo -bar

+

`Foo

+

`

+

<a title="a lot

+

of dashes"/>

```````````````````````````````` -However, any non-blank line with fewer than four leading spaces ends -the code block immediately. So a paragraph may occur immediately -after indented code: +The setext heading underline cannot be a [lazy continuation +line] in a list item or block quote: ```````````````````````````````` example - foo -bar +> Foo +--- . -
foo
-
-

bar

+
+

Foo

+
+
```````````````````````````````` -And indented code can occur immediately before and after other kinds of -blocks: - ```````````````````````````````` example -# Heading - foo -Heading ------- - foo ----- +> foo +bar +=== . -

Heading

-
foo
-
-

Heading

-
foo
-
-
+
+

foo +bar +===

+
```````````````````````````````` -The first line can be indented more than four spaces: - ```````````````````````````````` example - foo - bar +- Foo +--- . -
    foo
-bar
-
+
    +
  • Foo
  • +
+
```````````````````````````````` -Blank lines preceding or following an indented code block -are not included in it: +A blank line is needed between a paragraph and a following +setext heading, since otherwise the paragraph becomes part +of the heading's content: ```````````````````````````````` example - - - foo - - +Foo +Bar +--- . -
foo
-
+

Foo +Bar

```````````````````````````````` -Trailing spaces are included in the code block's content: +But in general a blank line is not required before or after +setext headings: ```````````````````````````````` example - foo +--- +Foo +--- +Bar +--- +Baz . -
foo  
-
+
+

Foo

+

Bar

+

Baz

```````````````````````````````` +Setext headings cannot be empty: -## Fenced code blocks - -A [code fence](@) is a sequence -of at least three consecutive backtick characters (`` ` ``) or -tildes (`~`). (Tildes and backticks cannot be mixed.) -A [fenced code block](@) -begins with a code fence, indented no more than three spaces. - -The line with the opening code fence may optionally contain some text -following the code fence; this is trimmed of leading and trailing -whitespace and called the [info string](@). If the [info string] comes -after a backtick fence, it may not contain any backtick -characters. (The reason for this restriction is that otherwise -some inline code would be incorrectly interpreted as the -beginning of a fenced code block.) +```````````````````````````````` example -The content of the code block consists of all subsequent lines, until -a closing [code fence] of the same type as the code block -began with (backticks or tildes), and with at least as many backticks -or tildes as the opening code fence. If the leading code fence is -indented N spaces, then up to N spaces of indentation are removed from -each line of the content (if present). (If a content line is not -indented, it is preserved unchanged. If it is indented less than N -spaces, all of the indentation is removed.) +==== +. +

====

+```````````````````````````````` -The closing code fence may be indented up to three spaces, and may be -followed only by spaces, which are ignored. If the end of the -containing block (or document) is reached and no closing code fence -has been found, the code block contains all of the lines after the -opening code fence until the end of the containing block (or -document). (An alternative spec would require backtracking in the -event that a closing code fence is not found. But this makes parsing -much less efficient, and there seems to be no real down side to the -behavior described here.) -A fenced code block may interrupt a paragraph, and does not require -a blank line either before or after. +Setext heading text lines must not be interpretable as block +constructs other than paragraphs. So, the line of dashes +in these examples gets interpreted as a thematic break: -The content of a code fence is treated as literal text, not parsed -as inlines. The first word of the [info string] is typically used to -specify the language of the code sample, and rendered in the `class` -attribute of the `code` tag. However, this spec does not mandate any -particular treatment of the [info string]. +```````````````````````````````` example +--- +--- +. +
+
+```````````````````````````````` -Here is a simple example with backticks: ```````````````````````````````` example -``` -< - > -``` +- foo +----- . -
<
- >
-
+
    +
  • foo
  • +
+
```````````````````````````````` -With tildes: - ```````````````````````````````` example -~~~ -< - > -~~~ + foo +--- . -
<
- >
+
foo
 
+
```````````````````````````````` -Fewer than three backticks is not enough: ```````````````````````````````` example -`` -foo -`` +> foo +----- . -

foo

+
+

foo

+
+
```````````````````````````````` -The closing code fence must use the same character as the opening -fence: + +If you want a heading with `> foo` as its literal text, you can +use backslash escapes: ```````````````````````````````` example -``` -aaa -~~~ -``` +\> foo +------ . -
aaa
-~~~
-
+

> foo

```````````````````````````````` -```````````````````````````````` example -~~~ -aaa -``` -~~~ -. -
aaa
+**Compatibility note:**  Most existing Markdown implementations
+do not allow the text of setext headings to span multiple lines.
+But there is no consensus about how to interpret
+
+``` markdown
+Foo
+bar
+---
+baz
 ```
-
-```````````````````````````````` +One can find four different interpretations: -The closing code fence must be at least as long as the opening fence: +1. paragraph "Foo", heading "bar", paragraph "baz" +2. paragraph "Foo bar", thematic break, paragraph "baz" +3. paragraph "Foo bar --- baz" +4. heading "Foo bar", paragraph "baz" + +We find interpretation 4 most natural, and interpretation 4 +increases the expressive power of CommonMark, by allowing +multiline headings. Authors who want interpretation 1 can +put a blank line after the first paragraph: ```````````````````````````````` example -```` -aaa -``` -`````` +Foo + +bar +--- +baz . -
aaa
-```
-
+

Foo

+

bar

+

baz

```````````````````````````````` +Authors who want interpretation 2 can put blank lines around +the thematic break, + ```````````````````````````````` example -~~~~ -aaa -~~~ -~~~~ +Foo +bar + +--- + +baz . -
aaa
-~~~
-
+

Foo +bar

+
+

baz

```````````````````````````````` -Unclosed code blocks are closed by the end of the document -(or the enclosing [block quote][block quotes] or [list item][list items]): +or use a thematic break that cannot count as a [setext heading +underline], such as ```````````````````````````````` example -``` +Foo +bar +* * * +baz . -
+

Foo +bar

+
+

baz

```````````````````````````````` -```````````````````````````````` example -````` +Authors who want interpretation 3 can use backslash escapes: -``` -aaa +```````````````````````````````` example +Foo +bar +\--- +baz . -

-```
-aaa
-
+

Foo +bar +--- +baz

```````````````````````````````` -```````````````````````````````` example -> ``` -> aaa +## Indented code blocks -bbb +An [indented code block](@) is composed of one or more +[indented chunks] separated by blank lines. +An [indented chunk](@) is a sequence of non-blank lines, +each preceded by four or more spaces of indentation. The contents of the code +block are the literal contents of the lines, including trailing +[line endings], minus four spaces of indentation. +An indented code block has no [info string]. + +An indented code block cannot interrupt a paragraph, so there must be +a blank line between a paragraph and a following indented code block. +(A blank line is not needed, however, between a code block and a following +paragraph.) + +```````````````````````````````` example + a simple + indented code block . -
-
aaa
+
a simple
+  indented code block
 
-
-

bbb

```````````````````````````````` -A code block can have all empty lines as its content: +If there is any ambiguity between an interpretation of indentation +as a code block and as indicating that material belongs to a [list +item][list items], the list item interpretation takes precedence: ```````````````````````````````` example -``` + - foo - -``` + bar . -

-  
-
+
    +
  • +

    foo

    +

    bar

    +
  • +
```````````````````````````````` -A code block can be empty: - ```````````````````````````````` example -``` -``` +1. foo + + - bar . -
+
    +
  1. +

    foo

    +
      +
    • bar
    • +
    +
  2. +
```````````````````````````````` -Fences can be indented. If the opening fence is indented, -content lines will have equivalent opening indentation removed, -if present: + +The contents of a code block are literal text, and do not get parsed +as Markdown: ```````````````````````````````` example - ``` - aaa -aaa -``` +
+ *hi* + + - one . -
aaa
-aaa
+
<a/>
+*hi*
+
+- one
 
```````````````````````````````` +Here we have three chunks separated by blank lines: + ```````````````````````````````` example - ``` -aaa - aaa -aaa - ``` + chunk1 + + chunk2 + + + + chunk3 . -
aaa
-aaa
-aaa
+
chunk1
+
+chunk2
+
+
+
+chunk3
 
```````````````````````````````` +Any initial spaces or tabs beyond four spaces of indentation will be included in +the content, even in interior blank lines: + ```````````````````````````````` example - ``` - aaa - aaa - aaa - ``` + chunk1 + + chunk2 . -
aaa
- aaa
-aaa
+
chunk1
+  
+  chunk2
 
```````````````````````````````` -Four spaces indentation produces an indented code block: +An indented code block cannot interrupt a paragraph. (This +allows hanging indents and the like.) ```````````````````````````````` example - ``` - aaa - ``` +Foo + bar + . -
```
-aaa
-```
-
+

Foo +bar

```````````````````````````````` -Closing fences may be indented by 0-3 spaces, and their indentation -need not match that of the opening fence: +However, any non-blank line with fewer than four spaces of indentation ends +the code block immediately. So a paragraph may occur immediately +after indented code: ```````````````````````````````` example -``` -aaa - ``` + foo +bar . -
aaa
+
foo
 
+

bar

```````````````````````````````` +And indented code can occur immediately before and after other kinds of +blocks: + ```````````````````````````````` example - ``` -aaa - ``` +# Heading + foo +Heading +------ + foo +---- . -
aaa
+

Heading

+
foo
+
+

Heading

+
foo
 
+
```````````````````````````````` -This is not a closing fence, because it is indented 4 spaces: +The first line can be preceded by more than four spaces of indentation: ```````````````````````````````` example -``` -aaa - ``` + foo + bar . -
aaa
-    ```
+
    foo
+bar
 
```````````````````````````````` - -Code fences (opening and closing) cannot contain internal spaces: +Blank lines preceding or following an indented code block +are not included in it: ```````````````````````````````` example -``` ``` -aaa + + + foo + + . -

-aaa

+
foo
+
```````````````````````````````` +Trailing spaces or tabs are included in the code block's content: + ```````````````````````````````` example -~~~~~~ -aaa -~~~ ~~ + foo . -
aaa
-~~~ ~~
+
foo  
 
```````````````````````````````` -Fenced code blocks can interrupt paragraphs, and can be followed -directly by paragraphs, without a blank line between: + +## Fenced code blocks + +A [code fence](@) is a sequence +of at least three consecutive backtick characters (`` ` ``) or +tildes (`~`). (Tildes and backticks cannot be mixed.) +A [fenced code block](@) +begins with a code fence, preceded by up to three spaces of indentation. + +The line with the opening code fence may optionally contain some text +following the code fence; this is trimmed of leading and trailing +spaces or tabs and called the [info string](@). If the [info string] comes +after a backtick fence, it may not contain any backtick +characters. (The reason for this restriction is that otherwise +some inline code would be incorrectly interpreted as the +beginning of a fenced code block.) + +The content of the code block consists of all subsequent lines, until +a closing [code fence] of the same type as the code block +began with (backticks or tildes), and with at least as many backticks +or tildes as the opening code fence. If the leading code fence is +preceded by N spaces of indentation, then up to N spaces of indentation are +removed from each line of the content (if present). (If a content line is not +indented, it is preserved unchanged. If it is indented N spaces or less, all +of the indentation is removed.) + +The closing code fence may be preceded by up to three spaces of indentation, and +may be followed only by spaces or tabs, which are ignored. If the end of the +containing block (or document) is reached and no closing code fence +has been found, the code block contains all of the lines after the +opening code fence until the end of the containing block (or +document). (An alternative spec would require backtracking in the +event that a closing code fence is not found. But this makes parsing +much less efficient, and there seems to be no real down side to the +behavior described here.) 
+ +A fenced code block may interrupt a paragraph, and does not require +a blank line either before or after. + +The content of a code fence is treated as literal text, not parsed +as inlines. The first word of the [info string] is typically used to +specify the language of the code sample, and rendered in the `class` +attribute of the `code` tag. However, this spec does not mandate any +particular treatment of the [info string]. + +Here is a simple example with backticks: ```````````````````````````````` example -foo ``` -bar +< + > ``` -baz . -

foo

-
bar
+
<
+ >
 
-

baz

```````````````````````````````` -Other blocks can also occur before and after fenced code blocks -without an intervening blank line: +With tildes: ```````````````````````````````` example -foo ---- ~~~ -bar +< + > ~~~ -# baz . -

foo

-
bar
+
<
+ >
 
-

baz

```````````````````````````````` +Fewer than three backticks is not enough: + +```````````````````````````````` example +`` +foo +`` +. +

foo

+```````````````````````````````` -An [info string] can be provided after the opening code fence. -Although this spec doesn't mandate any particular treatment of -the info string, the first word is typically used to specify -the language of the code block. In HTML output, the language is -normally indicated by adding a class to the `code` element consisting -of `language-` followed by the language name. +The closing code fence must use the same character as the opening +fence: ```````````````````````````````` example -```ruby -def foo(x) - return 3 -end +``` +aaa +~~~ ``` . -
def foo(x)
-  return 3
-end
+
aaa
+~~~
 
```````````````````````````````` ```````````````````````````````` example -~~~~ ruby startline=3 $%@#$ -def foo(x) - return 3 -end -~~~~~~~ +~~~ +aaa +``` +~~~ . -
def foo(x)
-  return 3
-end
+
aaa
+```
 
```````````````````````````````` +The closing code fence must be at least as long as the opening fence: + ```````````````````````````````` example -````; ```` +aaa +``` +`````` . -
+
aaa
+```
+
```````````````````````````````` -[Info strings] for backtick code blocks cannot contain backticks: +```````````````````````````````` example +~~~~ +aaa +~~~ +~~~~ +. +
aaa
+~~~
+
+```````````````````````````````` + + +Unclosed code blocks are closed by the end of the document +(or the enclosing [block quote][block quotes] or [list item][list items]): ```````````````````````````````` example -``` aa ``` -foo +``` . -

aa -foo

+
```````````````````````````````` -[Info strings] for tilde code blocks can contain backticks and tildes: +```````````````````````````````` example +````` + +``` +aaa +. +

+```
+aaa
+
+```````````````````````````````` + ```````````````````````````````` example -~~~ aa ``` ~~~ -foo -~~~ +> ``` +> aaa + +bbb . -
foo
+
+
aaa
 
+
+

bbb

```````````````````````````````` -Closing code fences cannot have [info strings]: +A code block can have all empty lines as its content: ```````````````````````````````` example ``` -``` aaa + + ``` . -
``` aaa
+

+  
 
```````````````````````````````` +A code block can be empty: -## HTML blocks - -An [HTML block](@) is a group of lines that is treated -as raw HTML (and will not be escaped in HTML output). +```````````````````````````````` example +``` +``` +. +
+```````````````````````````````` -There are seven kinds of [HTML block], which can be defined by their -start and end conditions. The block begins with a line that meets a -[start condition](@) (after up to three spaces optional indentation). -It ends with the first subsequent line that meets a matching [end -condition](@), or the last line of the document, or the last line of -the [container block](#container-blocks) containing the current HTML -block, if no line is encountered that meets the [end condition]. If -the first line meets both the [start condition] and the [end -condition], the block will contain just that line. -1. **Start condition:** line begins with the string ``, or the end of the line.\ -**End condition:** line contains an end tag -``, `
`, or `` (case-insensitive; it -need not match the start tag). +Fences can be indented. If the opening fence is indented, +content lines will have equivalent opening indentation removed, +if present: -2. **Start condition:** line begins with the string ``. +```````````````````````````````` example + ``` + aaa +aaa +``` +. +
aaa
+aaa
+
+```````````````````````````````` -3. **Start condition:** line begins with the string ``. -4. **Start condition:** line begins with the string ``. +```````````````````````````````` example + ``` +aaa + aaa +aaa + ``` +. +
aaa
+aaa
+aaa
+
+```````````````````````````````` + + +```````````````````````````````` example + ``` + aaa + aaa + aaa + ``` +. +
aaa
+ aaa
+aaa
+
+```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example + ``` + aaa + ``` +. +
```
+aaa
+```
+
+```````````````````````````````` + + +Closing fences may be preceded by up to three spaces of indentation, and their +indentation need not match that of the opening fence: + +```````````````````````````````` example +``` +aaa + ``` +. +
aaa
+
+```````````````````````````````` + + +```````````````````````````````` example + ``` +aaa + ``` +. +
aaa
+
+```````````````````````````````` + + +This is not a closing fence, because it is indented 4 spaces: + +```````````````````````````````` example +``` +aaa + ``` +. +
aaa
+    ```
+
+```````````````````````````````` + + + +Code fences (opening and closing) cannot contain internal spaces or tabs: + +```````````````````````````````` example +``` ``` +aaa +. +

+aaa

+```````````````````````````````` + + +```````````````````````````````` example +~~~~~~ +aaa +~~~ ~~ +. +
aaa
+~~~ ~~
+
+```````````````````````````````` + + +Fenced code blocks can interrupt paragraphs, and can be followed +directly by paragraphs, without a blank line between: + +```````````````````````````````` example +foo +``` +bar +``` +baz +. +

foo

+
bar
+
+

baz

+```````````````````````````````` + + +Other blocks can also occur before and after fenced code blocks +without an intervening blank line: + +```````````````````````````````` example +foo +--- +~~~ +bar +~~~ +# baz +. +

foo

+
bar
+
+

baz

+```````````````````````````````` + + +An [info string] can be provided after the opening code fence. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. + +```````````````````````````````` example +```ruby +def foo(x) + return 3 +end +``` +. +
def foo(x)
+  return 3
+end
+
+```````````````````````````````` + + +```````````````````````````````` example +~~~~ ruby startline=3 $%@#$ +def foo(x) + return 3 +end +~~~~~~~ +. +
def foo(x)
+  return 3
+end
+
+```````````````````````````````` + + +```````````````````````````````` example +````; +```` +. +
+```````````````````````````````` + + +[Info strings] for backtick code blocks cannot contain backticks: + +```````````````````````````````` example +``` aa ``` +foo +. +

aa +foo

+```````````````````````````````` + + +[Info strings] for tilde code blocks can contain backticks and tildes: + +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ +. +
foo
+
+```````````````````````````````` + + +Closing code fences cannot have [info strings]: + +```````````````````````````````` example +``` +``` aaa +``` +. +
``` aaa
+
+```````````````````````````````` + + + +## HTML blocks + +An [HTML block](@) is a group of lines that is treated +as raw HTML (and will not be escaped in HTML output). + +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three optional spaces of indentation). +It ends with the first subsequent line that meets a matching +[end condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. + +1. **Start condition:** line begins with the string ``, or the end of the line.\ +**End condition:** line contains an end tag +`
`, ``, ``, or `` (case-insensitive; it +need not match the start tag). + +2. **Start condition:** line begins with the string ``. + +3. **Start condition:** line begins with the string ``. + +4. **Start condition:** line begins with the string ``. 5. **Start condition:** line begins with the string ``, or +by a space, a tab, the end of the line, the string `>`, or the string `/>`.\ **End condition:** line is followed by a [blank line]. 7. **Start condition:** line begins with a complete [open tag] -(with any [tag name] other than `script`, -`style`, or `pre`) or a complete [closing tag], -followed only by [whitespace] or the end of the line.\ +(with any [tag name] other than `pre`, `script`, +`style`, or `textarea`) or a complete [closing tag], +followed by zero or more spaces and tabs, followed by the end of the line.\ **End condition:** line is followed by a [blank line]. HTML blocks continue until they are closed by their appropriate @@ -2080,7 +2425,7 @@ block** that might otherwise be recognised as a start condition will be ignored by the parser and passed through as-is, without changing the parser's state. -For instance, `
` within a HTML block started by `` will not affect
+For instance, `
` within an HTML block started by `
` will not affect the parser state; as the HTML block was started in by start condition 6, it will end at any blank line. This can be surprising: @@ -2101,7 +2446,7 @@ _world_.
```````````````````````````````` -In this case, the HTML block is terminated by the newline — the `**Hello**` +In this case, the HTML block is terminated by the blank line — the `**Hello**` text remains verbatim — and regular parsing resumes, with a paragraph, emphasised `world` and inline and block HTML following. @@ -2379,7 +2724,7 @@ rather than an [HTML block].) HTML tags designed to contain literal content -(`script`, `style`, `pre`), comments, processing instructions, +(`pre`, `script`, `style`, `textarea`), comments, processing instructions, and declarations are treated somewhat differently. Instead of ending at the first blank line, these blocks end at the first line containing a corresponding end tag. @@ -2425,6 +2770,26 @@ document.getElementById("demo").innerHTML = "Hello JavaScript!"; ```````````````````````````````` +A textarea tag (type 1): + +```````````````````````````````` example + +. + +```````````````````````````````` + A style tag (type 1): ```````````````````````````````` example @@ -2603,7 +2968,8 @@ function matchwo(a,b) ```````````````````````````````` -The opening tag can be indented 1-3 spaces, but not 4: +The opening tag can be preceded by up to three spaces of indentation, but not +four: ```````````````````````````````` example @@ -2679,7 +3045,7 @@ specification, which says: > The only restrictions are that block-level HTML elements — > e.g. `
`, ``, `
`, `

`, etc. — must be separated from > surrounding content by blank lines, and the start and end tags of the -> block should not be indented with tabs or spaces. +> block should not be indented with spaces or tabs. In some ways Gruber's rule is more restrictive than the one given here: @@ -2797,14 +3163,15 @@ deleted. The exception is inside `

` tags, but as described
 ## Link reference definitions
 
 A [link reference definition](@)
-consists of a [link label], indented up to three spaces, followed
-by a colon (`:`), optional [whitespace] (including up to one
+consists of a [link label], optionally preceded by up to three spaces of
+indentation, followed
+by a colon (`:`), optional spaces or tabs (including up to one
 [line ending]), a [link destination],
-optional [whitespace] (including up to one
+optional spaces or tabs (including up to one
 [line ending]), and an optional [link
 title], which if it is present must be separated
-from the [link destination] by [whitespace].
-No further [non-whitespace characters] may occur on the line.
+from the [link destination] by spaces or tabs.
+No further character may occur.
 
 A [link reference definition]
 does not correspond to a structural element of a document.  Instead, it
@@ -2922,7 +3289,7 @@ The link destination may not be omitted:
 ````````````````````````````````
 
 The title must be separated from the link destination by
-whitespace:
+spaces or tabs:
 
 ```````````````````````````````` example
 [foo]: (baz)
@@ -2991,8 +3358,11 @@ case-insensitive (see [matches]).
 ````````````````````````````````
 
 
-Here is a link reference definition with no corresponding link.
-It contributes nothing to the document.
+Whether something is a [link reference definition] is
+independent of whether the link reference it defines is
+used in the document.  Thus, for example, the following
+document contains just a link reference definition, and
+no visible content:
 
 ```````````````````````````````` example
 [foo]: /url
@@ -3013,7 +3383,7 @@ bar
 
 
 This is not a link reference definition, because there are
-[non-whitespace characters] after the title:
+characters other than spaces or tabs after the title:
 
 ```````````````````````````````` example
 [foo]: /url "title" ok
@@ -3145,18 +3515,6 @@ are defined:
 ````````````````````````````````
 
 
-Whether something is a [link reference definition] is
-independent of whether the link reference it defines is
-used in the document.  Thus, for example, the following
-document contains just a link reference definition, and
-no visible content:
-
-```````````````````````````````` example
-[foo]: /url
-.
-````````````````````````````````
-
-
 ## Paragraphs
 
 A sequence of non-blank lines that cannot be interpreted as other
@@ -3164,7 +3522,7 @@ kinds of blocks forms a [paragraph](@).
 The contents of the paragraph are the result of parsing the
 paragraph's raw content as inlines.  The paragraph's raw content
 is formed by concatenating the lines and removing initial and final
-[whitespace].
+spaces or tabs.
 
 A simple example with two paragraphs:
 
@@ -3194,7 +3552,7 @@ ddd

```````````````````````````````` -Multiple blank lines between paragraph have no effect: +Multiple blank lines between paragraphs have no effect: ```````````````````````````````` example aaa @@ -3207,7 +3565,7 @@ bbb ```````````````````````````````` -Leading spaces are skipped: +Leading spaces or tabs are skipped: ```````````````````````````````` example aaa @@ -3232,8 +3590,8 @@ ccc

```````````````````````````````` -However, the first line may be indented at most three spaces, -or an indented code block will be triggered: +However, the first line may be preceded by up to three spaces of indentation. +Four spaces of indentation is too many: ```````````````````````````````` example aaa @@ -3254,7 +3612,7 @@ bbb ```````````````````````````````` -Final spaces are stripped before inline parsing, so a paragraph +Final spaces or tabs are stripped before inline parsing, so a paragraph that ends with two or more spaces will not end with a [hard line break]: @@ -3313,9 +3671,11 @@ these constructions. (A recipe is provided below in the section entitled ## Block quotes -A [block quote marker](@) -consists of 0-3 spaces of initial indent, plus (a) the character `>` together -with a following space, or (b) a single character `>` not followed by a space. +A [block quote marker](@), +optionally preceded by up to three spaces of indentation, +consists of (a) the character `>` together with a following space of +indentation, or (b) a single character `>` not followed by a space of +indentation. The following rules define [block quotes]: @@ -3327,8 +3687,8 @@ The following rules define [block quotes]: 2. **Laziness.** If a string of lines *Ls* constitute a [block quote](#block-quotes) with contents *Bs*, then the result of deleting the initial [block quote marker] from one or - more lines in which the next [non-whitespace character] after the [block - quote marker] is [paragraph continuation + more lines in which the next character other than a space or tab after the + [block quote marker] is [paragraph continuation text] is a block quote with *Bs* as its content. [Paragraph continuation text](@) is text that will be parsed as part of the content of a paragraph, but does @@ -3354,7 +3714,7 @@ baz

```````````````````````````````` -The spaces after the `>` characters can be omitted: +The space or tab after the `>` characters can be omitted: ```````````````````````````````` example ># Foo @@ -3369,7 +3729,7 @@ baz

```````````````````````````````` -The `>` characters can be indented 1-3 spaces: +The `>` characters can be preceded by up to three spaces of indentation: ```````````````````````````````` example > # Foo @@ -3384,7 +3744,7 @@ baz

```````````````````````````````` -Four spaces gives us a code block: +Four spaces of indentation is too many: ```````````````````````````````` example > # Foo @@ -3719,8 +4079,8 @@ baz

When including an indented code block in a block quote, remember that the [block quote marker] includes -both the `>` and a following space. So *five spaces* are needed after -the `>`: +both the `>` and a following space of indentation. So *five spaces* are needed +after the `>`: ```````````````````````````````` example > code @@ -3755,10 +4115,10 @@ in some browsers.) The following rules define [list items]: 1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of - blocks *Bs* starting with a [non-whitespace character], and *M* is a - list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result - of prepending *M* and the following spaces to the first line of - *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a + blocks *Bs* starting with a character other than a space or tab, and *M* is + a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation, + then the result of prepending *M* and the following spaces to the first line + of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a list item with *Bs* as its contents. The type of the list item (bullet or ordered) is determined by the type of its list marker. If the list item is ordered, then it is also assigned a start @@ -3823,8 +4183,8 @@ with two lines.

The most important thing to notice is that the position of the text after the list marker determines how much indentation is needed in subsequent blocks in the list item. If the list -marker takes up two spaces, and there are three spaces between -the list marker and the next [non-whitespace character], then blocks +marker takes up two spaces of indentation, and there are three spaces between +the list marker and the next character other than a space or tab, then blocks must be indented five spaces in order to fall under the list item. @@ -3885,10 +4245,10 @@ put under the list item: It is tempting to think of this in terms of columns: the continuation -blocks must be indented at least to the column of the first -[non-whitespace character] after the list marker. However, that is not quite right. -The spaces after the list marker determine how much relative indentation -is needed. Which column this indentation reaches will depend on +blocks must be indented at least to the column of the first character other than +a space or tab after the list marker. However, that is not quite right. +The spaces of indentation after the list marker determine how much relative +indentation is needed. Which column this indentation reaches will depend on how the list item is embedded in other constructions, as shown by this example: @@ -3935,7 +4295,7 @@ far enough past the blockquote marker: ```````````````````````````````` -Note that at least one space is needed between the list marker and +Note that at least one space or tab is needed between the list marker and any following content, so these are not list items: ```````````````````````````````` example @@ -4067,16 +4427,16 @@ A start number may not be negative: 2. 
**Item starting with indented code.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with an indented code block, and *M* is a list marker of width *W* followed by - one space, then the result of prepending *M* and the following - space to the first line of *Ls*, and indenting subsequent lines of - *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. + one space of indentation, then the result of prepending *M* and the + following space to the first line of *Ls*, and indenting subsequent lines + of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. If a line is empty, then it need not be indented. The type of the list item (bullet or ordered) is determined by the type of its list marker. If the list item is ordered, then it is also assigned a start number, based on the ordered list marker. -An indented code block will have to be indented four spaces beyond -the edge of the region where text will be included in the list item. +An indented code block will have to be preceded by four spaces of indentation +beyond the edge of the region where text will be included in the list item. 
In the following case that is 6 spaces: ```````````````````````````````` example @@ -4112,8 +4472,8 @@ And in this case it is 11 spaces: If the *first* block in the list item is an indented code block, -then by rule #2, the contents must be indented *one* space after the -list marker: +then by rule #2, the contents must be preceded by *one* space of indentation +after the list marker: ```````````````````````````````` example indented code @@ -4149,7 +4509,7 @@ paragraph ```````````````````````````````` -Note that an additional space indent is interpreted as space +Note that an additional space of indentation is interpreted as space inside the code block: ```````````````````````````````` example @@ -4173,10 +4533,10 @@ inside the code block: Note that rules #1 and #2 only apply to two cases: (a) cases in which the lines to be included in a list item begin with a -[non-whitespace character], and (b) cases in which +character other than a space or tab, and (b) cases in which they begin with an indented code block. In a case like the following, where the first block begins with -a three-space indent, the rules do not allow us to form a list item by +three spaces of indentation, the rules do not allow us to form a list item by indenting the whole thing and prepending a list marker: ```````````````````````````````` example @@ -4201,8 +4561,8 @@ bar ```````````````````````````````` -This is not a significant restriction, because when a block begins -with 1-3 spaces indent, the indentation can always be removed without +This is not a significant restriction, because when a block is preceded by up to +three spaces of indentation, the indentation can always be removed without a change in interpretation, allowing rule #1 to be applied. So, in the above case: @@ -4222,11 +4582,10 @@ the above case: 3. 
**Item starting with a blank line.** If a sequence of lines *Ls* starting with a single [blank line] constitute a (possibly empty) - sequence of blocks *Bs*, not separated from each other by more than - one blank line, and *M* is a list marker of width *W*, + sequence of blocks *Bs*, and *M* is a list marker of width *W*, then the result of prepending *M* to the first line of *Ls*, and - indenting subsequent lines of *Ls* by *W + 1* spaces, is a list - item with *Bs* as its contents. + preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a + list item with *Bs* as its contents. If a line is empty, then it need not be indented. The type of the list item (bullet or ordered) is determined by the type of its list marker. If the list item is ordered, then it is also assigned a @@ -4301,7 +4660,7 @@ Here is an empty bullet list item: ```````````````````````````````` -It does not matter whether there are spaces following the [list marker]: +It does not matter whether there are spaces or tabs following the [list marker]: ```````````````````````````````` example - foo @@ -4358,9 +4717,9 @@ foo 4. **Indentation.** If a sequence of lines *Ls* constitutes a list item - according to rule #1, #2, or #3, then the result of indenting each line - of *Ls* by 1-3 spaces (the same for each line) also constitutes a - list item with the same contents and attributes. If a line is + according to rule #1, #2, or #3, then the result of preceding each line + of *Ls* by up to three spaces of indentation (the same for each line) also + constitutes a list item with the same contents and attributes. If a line is empty, then it need not be indented. Indented one space: @@ -4459,7 +4818,7 @@ Four spaces indent gives a code block: 5. 
**Laziness.** If a string of lines *Ls* constitute a [list item](#list-items) with contents *Bs*, then the result of deleting some or all of the indentation from one or more lines in which the - next [non-whitespace character] after the indentation is + next character other than a space or tab after the indentation is [paragraph continuation text] is a list item with the same contents and attributes. The unindented lines are called @@ -4544,7 +4903,7 @@ continued here.

The rules for sublists follow from the general rules [above][List items]. A sublist must be indented the same number -of spaces a paragraph would need to be in order to be included +of spaces of indentation a paragraph would need to be in order to be included in the list item. So, in this case we need two spaces indent: @@ -4777,8 +5136,8 @@ The choice of four spaces is arbitrary. It can be learned, but it is not likely to be guessed, and it trips up beginners regularly. Would it help to adopt a two-space rule? The problem is that such -a rule, together with the rule allowing 1--3 spaces indentation of the -initial list marker, allows text that is indented *less than* the +a rule, together with the rule allowing up to three spaces of indentation for +the initial list marker, allows text that is indented *less than* the original list marker to be included in the list item. For example, `Markdown.pl` parses @@ -5170,8 +5529,8 @@ item: ```````````````````````````````` -Note, however, that list items may not be indented more than -three spaces. Here `- e` is treated as a paragraph continuation +Note, however, that list items may not be preceded by more than +three spaces of indentation. Here `- e` is treated as a paragraph continuation line, because it is indented more than three spaces: ```````````````````````````````` example @@ -5257,7 +5616,7 @@ So is this, with a empty second item: ```````````````````````````````` -These are loose lists, even though there is no space between the items, +These are loose lists, even though there are no blank lines between the items, because one of the items directly contains two block-level elements with a blank line between them: @@ -5265,585 +5624,246 @@ with a blank line between them: - a - b - c -- d -. -
    -
  • -

    a

    -
  • -
  • -

    b

    -

    c

    -
  • -
  • -

    d

    -
  • -
-```````````````````````````````` - - -```````````````````````````````` example -- a -- b - - [ref]: /url -- d -. -
    -
  • -

    a

    -
  • -
  • -

    b

    -
  • -
  • -

    d

    -
  • -
-```````````````````````````````` - - -This is a tight list, because the blank lines are in a code block: - -```````````````````````````````` example -- a -- ``` - b - - - ``` -- c -. -
    -
  • a
  • -
  • -
    b
    -
    -
    -
    -
  • -
  • c
  • -
-```````````````````````````````` - - -This is a tight list, because the blank line is between two -paragraphs of a sublist. So the sublist is loose while -the outer list is tight: - -```````````````````````````````` example -- a - - b - - c -- d -. -
    -
  • a -
      -
    • -

      b

      -

      c

      -
    • -
    -
  • -
  • d
  • -
-```````````````````````````````` - - -This is a tight list, because the blank line is inside the -block quote: - -```````````````````````````````` example -* a - > b - > -* c -. -
    -
  • a -
    -

    b

    -
    -
  • -
  • c
  • -
-```````````````````````````````` - - -This list is tight, because the consecutive block elements -are not separated by blank lines: - -```````````````````````````````` example -- a - > b - ``` - c - ``` -- d -. -
    -
  • a -
    -

    b

    -
    -
    c
    -
    -
  • -
  • d
  • -
-```````````````````````````````` - - -A single-paragraph list is tight: - -```````````````````````````````` example -- a -. -
    -
  • a
  • -
-```````````````````````````````` - - -```````````````````````````````` example -- a - - b -. -
    -
  • a -
      -
    • b
    • -
    -
  • -
-```````````````````````````````` - - -This list is loose, because of the blank line between the -two block elements in the list item: - -```````````````````````````````` example -1. ``` - foo - ``` - - bar -. -
    -
  1. -
    foo
    -
    -

    bar

    -
  2. -
-```````````````````````````````` - - -Here the outer list is loose, the inner list tight: - -```````````````````````````````` example -* foo - * bar - - baz -. -
    -
  • -

    foo

    -
      -
    • bar
    • -
    -

    baz

    -
  • -
-```````````````````````````````` - - -```````````````````````````````` example -- a - - b - - c - -- d - - e - - f -. -
    -
  • -

    a

    -
      -
    • b
    • -
    • c
    • -
    -
  • -
  • -

    d

    -
      -
    • e
    • -
    • f
    • -
    -
  • -
-```````````````````````````````` - - -# Inlines - -Inlines are parsed sequentially from the beginning of the character -stream to the end (left to right, in left-to-right languages). -Thus, for example, in - -```````````````````````````````` example -`hi`lo` -. -

hilo`

-```````````````````````````````` - -`hi` is parsed as code, leaving the backtick at the end as a literal -backtick. - - -## Backslash escapes - -Any ASCII punctuation character may be backslash-escaped: - -```````````````````````````````` example -\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ -. -

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

-```````````````````````````````` - - -Backslashes before other characters are treated as literal -backslashes: - -```````````````````````````````` example -\→\A\a\ \3\φ\« -. -

\→\A\a\ \3\φ\«

-```````````````````````````````` - - -Escaped characters are treated as regular characters and do -not have their usual Markdown meanings: - -```````````````````````````````` example -\*not emphasized* -\
not a tag -\[not a link](/foo) -\`not code` -1\. not a list -\* not a list -\# not a heading -\[foo]: /url "not a reference" -\ö not a character entity -. -

*not emphasized* -<br/> not a tag -[not a link](/foo) -`not code` -1. not a list -* not a list -# not a heading -[foo]: /url "not a reference" -&ouml; not a character entity

-```````````````````````````````` - - -If a backslash is itself escaped, the following character is not: - -```````````````````````````````` example -\\*emphasis* -. -

\emphasis

-```````````````````````````````` - - -A backslash at the end of the line is a [hard line break]: - -```````````````````````````````` example -foo\ -bar -. -

foo
-bar

-```````````````````````````````` - - -Backslash escapes do not work in code blocks, code spans, autolinks, or -raw HTML: - -```````````````````````````````` example -`` \[\` `` -. -

\[\`

-```````````````````````````````` - - -```````````````````````````````` example - \[\] -. -
\[\]
-
-```````````````````````````````` - - -```````````````````````````````` example -~~~ -\[\] -~~~ -. -
\[\]
-
-```````````````````````````````` - - -```````````````````````````````` example - -. -

http://example.com?find=\*

-```````````````````````````````` - - -```````````````````````````````` example - -. - -```````````````````````````````` - - -But they work in all other contexts, including URLs and link titles, -link references, and [info strings] in [fenced code blocks]: - -```````````````````````````````` example -[foo](/bar\* "ti\*tle") -. -

foo

-```````````````````````````````` - - -```````````````````````````````` example -[foo] - -[foo]: /bar\* "ti\*tle" -. -

foo

-```````````````````````````````` - - -```````````````````````````````` example -``` foo\+bar -foo -``` -. -
foo
-
-```````````````````````````````` - - - -## Entity and numeric character references - -Valid HTML entity references and numeric character references -can be used in place of the corresponding Unicode character, -with the following exceptions: - -- Entity and character references are not recognized in code - blocks and code spans. - -- Entity and character references cannot stand in place of - special characters that define structural elements in - CommonMark. For example, although `*` can be used - in place of a literal `*` character, `*` cannot replace - `*` in emphasis delimiters, bullet list markers, or thematic - breaks. - -Conforming CommonMark parsers need not store information about -whether a particular character was represented in the source -using a Unicode character or an entity reference. - -[Entity references](@) consist of `&` + any of the valid -HTML5 entity names + `;`. The -document -is used as an authoritative source for the valid entity -references and their corresponding code points. - -```````````````````````````````` example -  & © Æ Ď -¾ ℋ ⅆ -∲ ≧̸ -. -

  & © Æ Ď -¾ ℋ ⅆ -∲ ≧̸

-```````````````````````````````` - - -[Decimal numeric character -references](@) -consist of `&#` + a string of 1--7 arabic digits + `;`. A -numeric character reference is parsed as the corresponding -Unicode character. Invalid Unicode code points will be replaced by -the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, -the code point `U+0000` will also be replaced by `U+FFFD`. - -```````````````````````````````` example -# Ӓ Ϡ � -. -

# Ӓ Ϡ �

-```````````````````````````````` - - -[Hexadecimal numeric character -references](@) consist of `&#` + -either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. -They too are parsed as the corresponding Unicode character (this -time specified with a hexadecimal numeral instead of decimal). - -```````````````````````````````` example -" ആ ಫ + c +- d . -

" ആ ಫ

+
    +
  • +

    a

    +
  • +
  • +

    b

    +

    c

    +
  • +
  • +

    d

    +
  • +
```````````````````````````````` -Here are some nonentities: - ```````````````````````````````` example -  &x; &#; &#x; -� -&#abcdef0; -&ThisIsNotDefined; &hi?; +- a +- b + + [ref]: /url +- d . -

&nbsp &x; &#; &#x; -&#987654321; -&#abcdef0; -&ThisIsNotDefined; &hi?;

+
    +
  • +

    a

    +
  • +
  • +

    b

    +
  • +
  • +

    d

    +
  • +
```````````````````````````````` -Although HTML5 does accept some entity references -without a trailing semicolon (such as `©`), these are not -recognized here, because it makes the grammar too ambiguous: +This is a tight list, because the blank lines are in a code block: ```````````````````````````````` example -© +- a +- ``` + b + + + ``` +- c . -

&copy

+
    +
  • a
  • +
  • +
    b
    +
    +
    +
    +
  • +
  • c
  • +
```````````````````````````````` -Strings that are not on the list of HTML5 named entities are not -recognized as entity references either: +This is a tight list, because the blank line is between two +paragraphs of a sublist. So the sublist is loose while +the outer list is tight: ```````````````````````````````` example -&MadeUpEntity; +- a + - b + + c +- d . -

&MadeUpEntity;

+
    +
  • a +
      +
    • +

      b

      +

      c

      +
    • +
    +
  • +
  • d
  • +
```````````````````````````````` -Entity and numeric character references are recognized in any -context besides code spans or code blocks, including -URLs, [link titles], and [fenced code block][] [info strings]: +This is a tight list, because the blank line is inside the +block quote: ```````````````````````````````` example - +* a + > b + > +* c . - +
    +
  • a +
    +

    b

    +
    +
  • +
  • c
  • +
```````````````````````````````` +This list is tight, because the consecutive block elements +are not separated by blank lines: + ```````````````````````````````` example -[foo](/föö "föö") +- a + > b + ``` + c + ``` +- d . -

foo

+
    +
  • a +
    +

    b

    +
    +
    c
    +
    +
  • +
  • d
  • +
```````````````````````````````` -```````````````````````````````` example -[foo] +A single-paragraph list is tight: -[foo]: /föö "föö" +```````````````````````````````` example +- a . -

foo

+
    +
  • a
  • +
```````````````````````````````` ```````````````````````````````` example -``` föö -foo -``` +- a + - b . -
foo
-
+
    +
  • a +
      +
    • b
    • +
    +
  • +
```````````````````````````````` -Entity and numeric character references are treated as literal -text in code spans and code blocks: +This list is loose, because of the blank line between the +two block elements in the list item: ```````````````````````````````` example -`föö` -. -

f&ouml;&ouml;

-```````````````````````````````` - +1. ``` + foo + ``` -```````````````````````````````` example - föfö + bar . -
f&ouml;f&ouml;
+
    +
  1. +
    foo
     
    +

    bar

    +
  2. +
```````````````````````````````` -Entity and numeric character references cannot be used -in place of symbols indicating structure in CommonMark -documents. +Here the outer list is loose, the inner list tight: ```````````````````````````````` example -*foo* -*foo* +* foo + * bar + + baz . -

*foo* -foo

+
    +
  • +

    foo

    +
      +
    • bar
    • +
    +

    baz

    +
  • +
```````````````````````````````` + ```````````````````````````````` example -* foo +- a + - b + - c -* foo +- d + - e + - f . -

* foo

    -
  • foo
  • +
  • +

    a

    +
      +
    • b
    • +
    • c
    • +
    +
  • +
  • +

    d

    +
      +
    • e
    • +
    • f
    • +
    +
```````````````````````````````` -```````````````````````````````` example -foo bar -. -

foo -bar

-```````````````````````````````` +# Inlines + +Inlines are parsed sequentially from the beginning of the character +stream to the end (left to right, in left-to-right languages). +Thus, for example, in ```````````````````````````````` example - foo +`hi`lo` . -

→foo

+

hilo`

```````````````````````````````` +`hi` is parsed as code, leaving the backtick at the end as a literal +backtick. -```````````````````````````````` example -[a](url "tit") -. -

[a](url "tit")

-```````````````````````````````` ## Code spans @@ -5854,7 +5874,7 @@ preceded nor followed by a backtick. A [code span](@) begins with a backtick string and ends with a backtick string of equal length. The contents of the code span are -the characters between the two backtick strings, normalized in the +the characters between these two backtick strings, normalized in the following ways: - First, [line endings] are converted to [spaces]. @@ -6133,17 +6153,17 @@ a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is a [delimiter run] that is (1) not followed by [Unicode whitespace], -and either (2a) not followed by a [punctuation character], or -(2b) followed by a [punctuation character] and -preceded by [Unicode whitespace] or a [punctuation character]. +and either (2a) not followed by a [Unicode punctuation character], or +(2b) followed by a [Unicode punctuation character] and +preceded by [Unicode whitespace] or a [Unicode punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. A [right-flanking delimiter run](@) is a [delimiter run] that is (1) not preceded by [Unicode whitespace], -and either (2a) not preceded by a [punctuation character], or -(2b) preceded by a [punctuation character] and -followed by [Unicode whitespace] or a [punctuation character]. +and either (2a) not preceded by a [Unicode punctuation character], or +(2b) preceded by a [Unicode punctuation character] and +followed by [Unicode whitespace] or a [Unicode punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. @@ -6198,7 +6218,7 @@ The following rules define emphasis and strong emphasis: it is part of a [left-flanking delimiter run] and either (a) not part of a [right-flanking delimiter run] or (b) part of a [right-flanking delimiter run] - preceded by punctuation. + preceded by a [Unicode punctuation character]. 3. 
A single `*` character [can close emphasis](@) iff it is part of a [right-flanking delimiter run]. @@ -6207,7 +6227,7 @@ The following rules define emphasis and strong emphasis: it is part of a [right-flanking delimiter run] and either (a) not part of a [left-flanking delimiter run] or (b) part of a [left-flanking delimiter run] - followed by punctuation. + followed by a [Unicode punctuation character]. 5. A double `**` [can open strong emphasis](@) iff it is part of a [left-flanking delimiter run]. @@ -6216,7 +6236,7 @@ The following rules define emphasis and strong emphasis: it is part of a [left-flanking delimiter run] and either (a) not part of a [right-flanking delimiter run] or (b) part of a [right-flanking delimiter run] - preceded by punctuation. + preceded by a [Unicode punctuation character]. 7. A double `**` [can close strong emphasis](@) iff it is part of a [right-flanking delimiter run]. @@ -6225,7 +6245,7 @@ The following rules define emphasis and strong emphasis: it is part of a [right-flanking delimiter run] and either (a) not part of a [left-flanking delimiter run] or (b) part of a [left-flanking delimiter run] - followed by punctuation. + followed by a [Unicode punctuation character]. 9. 
Emphasis begins with a delimiter that [can open emphasis] and ends with a delimiter that [can close emphasis], and that uses the same @@ -6437,7 +6457,7 @@ whitespace: ```````````````````````````````` -A newline also counts as whitespace: +A line ending also counts as whitespace: ```````````````````````````````` example *foo bar @@ -6602,7 +6622,7 @@ __ foo bar__ ```````````````````````````````` -A newline counts as whitespace: +A line ending counts as whitespace: ```````````````````````````````` example __ foo bar__ @@ -6881,7 +6901,7 @@ emphasis sections in this example: The same condition ensures that the following cases are all strong emphasis nested inside -emphasis, even when the interior spaces are +emphasis, even when the interior whitespace is omitted: @@ -7458,13 +7478,14 @@ following rules apply: A [link destination](@) consists of either - a sequence of zero or more characters between an opening `<` and a - closing `>` that contains no line breaks or unescaped + closing `>` that contains no line endings or unescaped `<` or `>` characters, or -- a nonempty sequence of characters that does not start with - `<`, does not include ASCII space or control characters, and - includes parentheses only if (a) they are backslash-escaped or - (b) they are part of a balanced pair of unescaped parentheses. +- a nonempty sequence of characters that does not start with `<`, + does not include [ASCII control characters][ASCII control character] + or [space] characters, and includes parentheses only if (a) they are + backslash-escaped or (b) they are part of a balanced pair of + unescaped parentheses. (Implementations may impose limits on parentheses nesting to avoid performance issues, but at least three levels of nesting should be supported.) @@ -7487,10 +7508,14 @@ Although [link titles] may span multiple lines, they may not contain a [blank line]. 
An [inline link](@) consists of a [link text] followed immediately -by a left parenthesis `(`, optional [whitespace], an optional -[link destination], an optional [link title] separated from the link -destination by [whitespace], optional [whitespace], and a right -parenthesis `)`. The link's text consists of the inlines contained +by a left parenthesis `(`, an optional [link destination], an optional +[link title], and a right parenthesis `)`. +These four components may be separated by spaces, tabs, and up to one line +ending. +If both [link destination] and [link title] are present, they *must* be +separated by spaces, tabs, and up to one line ending. + +The link's text consists of the inlines contained in the [link text] (excluding the enclosing square brackets). The link's URI consists of the link destination, excluding enclosing `<...>` if present, with backslash-escapes in effect as described @@ -7507,7 +7532,8 @@ Here is a simple inline link: ```````````````````````````````` -The title may be omitted: +The title, the link text and even +the destination may be omitted: ```````````````````````````````` example [link](/uri) @@ -7515,8 +7541,12 @@ The title may be omitted:

link

```````````````````````````````` +```````````````````````````````` example +[](./target.md) +. +

+```````````````````````````````` -Both the title and the destination may be omitted: ```````````````````````````````` example [link]() @@ -7531,6 +7561,13 @@ Both the title and the destination may be omitted:

link

```````````````````````````````` + +```````````````````````````````` example +[]() +. +

+```````````````````````````````` + The destination can only contain spaces if it is enclosed in pointy brackets: @@ -7546,7 +7583,7 @@ enclosed in pointy brackets:

link

```````````````````````````````` -The destination cannot contain line breaks, +The destination cannot contain line endings, even if enclosed in pointy brackets: ```````````````````````````````` example @@ -7615,6 +7652,13 @@ balanced: However, if you have unbalanced parentheses, you need to escape or use the `<...>` form: +```````````````````````````````` example +[link](foo(and(bar)) +. +

[link](foo(and(bar))

+```````````````````````````````` + + ```````````````````````````````` example [link](foo\(and\(bar\)) . @@ -7714,7 +7758,8 @@ may be used in titles: ```````````````````````````````` -Titles must be separated from the link using a [whitespace]. +Titles must be separated from the link using spaces, tabs, and up to one line +ending. Other [Unicode whitespace] like non-breaking space doesn't work. ```````````````````````````````` example @@ -7757,7 +7802,8 @@ titles with no closing quotation mark, though 1.0.2b8 does not. It seems preferable to adopt a simple, rational rule that works the same way in inline links and link reference definitions.) -[Whitespace] is allowed around the destination and title: +Spaces, tabs, and up to one line ending is allowed around the destination and +title: ```````````````````````````````` example [link]( /uri @@ -7908,7 +7954,8 @@ that [matches] a [link reference definition] elsewhere in the document. A [link label](@) begins with a left bracket (`[`) and ends with the first right bracket (`]`) that is not backslash-escaped. -Between these brackets there must be at least one [non-whitespace character]. +Between these brackets there must be at least one character that is not a space, +tab, or line ending. Unescaped square bracket characters are not allowed inside the opening and closing square brackets of [link labels]. A link label can have at most 999 characters inside the square @@ -7918,14 +7965,13 @@ One label [matches](@) another just in case their normalized forms are equal. To normalize a label, strip off the opening and closing brackets, perform the *Unicode case fold*, strip leading and trailing -[whitespace] and collapse consecutive internal -[whitespace] to a single space. If there are multiple +spaces, tabs, and line endings, and collapse consecutive internal +spaces, tabs, and line endings to a single space. If there are multiple matching reference link definitions, the one that comes first in the document is used. 
(It is desirable in such cases to emit a warning.) -The contents of the first link label are parsed as inlines, which are -used as the link's text. The link's URI and title are provided by the -matching [link reference definition]. +The link's URI and title are provided by the matching [link +reference definition]. Here is a simple example: @@ -8018,11 +8064,11 @@ emphasis grouping: ```````````````````````````````` example -[foo *bar][ref] +[foo *bar][ref]* [ref]: /uri . -

foo *bar

+

foo *bar*

```````````````````````````````` @@ -8070,15 +8116,15 @@ Matching is case-insensitive: Unicode case fold is used: ```````````````````````````````` example -[Толпой][Толпой] is a Russian word. +[ẞ] -[ТОЛПОЙ]: /url +[SS]: /url . -

Толпой is a Russian word.

+

```````````````````````````````` -Consecutive internal [whitespace] is treated as one space for +Consecutive internal spaces, tabs, and line endings are treated as one space for purposes of determining matching: ```````````````````````````````` example @@ -8091,7 +8137,7 @@ purposes of determining matching: ```````````````````````````````` -No [whitespace] is allowed between the [link text] and the +No spaces, tabs, or line endings are allowed between the [link text] and the [link label]: ```````````````````````````````` example @@ -8221,7 +8267,8 @@ Note that in this example `]` is not backslash-escaped: ```````````````````````````````` -A [link label] must contain at least one [non-whitespace character]: +A [link label] must contain at least one character that is not a space, tab, or +line ending: ```````````````````````````````` example [] @@ -8286,7 +8333,7 @@ The link labels are case-insensitive: -As with full reference links, [whitespace] is not +As with full reference links, spaces, tabs, or line endings are not allowed between the two sets of brackets: ```````````````````````````````` example @@ -8614,7 +8661,7 @@ The labels are case-insensitive: ```````````````````````````````` -As with reference links, [whitespace] is not allowed +As with reference links, spaces, tabs, and line endings, are not allowed between the two sets of brackets: ```````````````````````````````` example @@ -8707,9 +8754,9 @@ a link to the URI, with the URI as the link's label. An [absolute URI](@), for these purposes, consists of a [scheme] followed by a colon (`:`) -followed by zero or more characters other than ASCII -[whitespace] and control characters, `<`, and `>`. If -the URI includes these characters, they must be percent-encoded +followed by zero or more characters other [ASCII control +characters][ASCII control character], [space], `<`, and `>`. +If the URI includes these characters, they must be percent-encoded (e.g. `%20` for a space). 
For purposes of this spec, a [scheme](@) is any sequence @@ -8895,7 +8942,7 @@ A [tag name](@) consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (`-`). -An [attribute](@) consists of [whitespace], +An [attribute](@) consists of spaces, tabs, and up to one line ending, an [attribute name], and an optional [attribute value specification]. @@ -8905,9 +8952,9 @@ letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML specification restricted to ASCII. HTML5 is laxer.) An [attribute value specification](@) -consists of optional [whitespace], -a `=` character, optional [whitespace], and an [attribute -value]. +consists of optional spaces, tabs, and up to one line ending, +a `=` character, optional spaces, tabs, and up to one line ending, +and an [attribute value]. An [attribute value](@) consists of an [unquoted attribute value], @@ -8915,7 +8962,7 @@ a [single-quoted attribute value], or a [double-quoted attribute value]. An [unquoted attribute value](@) is a nonempty string of characters not -including [whitespace], `"`, `'`, `=`, `<`, `>`, or `` ` ``. +including spaces, tabs, line endings, `"`, `'`, `=`, `<`, `>`, or `` ` ``. A [single-quoted attribute value](@) consists of `'`, zero or more @@ -8926,11 +8973,12 @@ consists of `"`, zero or more characters not including `"`, and a final `"`. An [open tag](@) consists of a `<` character, a [tag name], -zero or more [attributes], optional [whitespace], an optional `/` -character, and a `>` character. +zero or more [attributes], optional spaces, tabs, and up to one line ending, +an optional `/` character, and a `>` character. A [closing tag](@) consists of the string ``. +[tag name], optional spaces, tabs, and up to one line ending, and the character +`>`. An [HTML comment](@) consists of ``, where *text* does not start with `>` or `->`, does not end with `-`, @@ -8942,10 +8990,8 @@ consists of the string ``, and the string `?>`. 
-A [declaration](@) consists of the -string ``, and the character `>`. +A [declaration](@) consists of the string ``, and the character `>`. A [CDATA section](@) consists of the string `< @@ -9046,7 +9092,7 @@ bim!bop />

```````````````````````````````` -Missing [whitespace]: +Missing whitespace: ```````````````````````````````` example @@ -9158,7 +9204,7 @@ foo ## Hard line breaks -A line break (not in a code span or HTML tag) that is preceded +A line ending (not in a code span or HTML tag) that is preceded by two or more spaces and does not occur at the end of a block is parsed as a [hard line break](@) (rendered in HTML as a `
` tag): @@ -9173,7 +9219,7 @@ baz

For a more visible alternative, a backslash before the -[line ending] may be used instead of two spaces: +[line ending] may be used instead of two or more spaces: ```````````````````````````````` example foo\ @@ -9215,7 +9261,7 @@ bar

```````````````````````````````` -Line breaks can occur inside emphasis, links, and other constructs +Hard line breaks can occur inside emphasis, links, and other constructs that allow inline content: ```````````````````````````````` example @@ -9236,13 +9282,13 @@ bar

```````````````````````````````` -Line breaks do not occur inside code spans +Hard line breaks do not occur inside code spans ```````````````````````````````` example -`code +`code span` . -

code span

+

code span

```````````````````````````````` @@ -9308,9 +9354,9 @@ foo ## Soft line breaks -A regular line break (not in a code span or HTML tag) that is not +A regular line ending (not in a code span or HTML tag) that is not preceded by two or more spaces or a backslash is parsed as a -[softbreak](@). (A softbreak may be rendered in HTML either as a +[softbreak](@). (A soft line break may be rendered in HTML either as a [line ending] or as a space. The result will be the same in browsers. In the examples here, a [line ending] will be used.) @@ -9336,7 +9382,7 @@ baz

A conforming parser may render a soft line break in HTML either as a -line break or as a space. +line ending or as a space. A renderer may also provide an option to render soft line breaks as hard line breaks. @@ -9444,7 +9490,7 @@ blocks. But we cannot close unmatched blocks yet, because we may have a blocks, we look for new block starts (e.g. `>` for a block quote). If we encounter a new block start, we close any blocks unmatched in step 1 before creating the new block as a child of the last -matched block. +matched container block. 3. Finally, we look at the remainder of the line (after block markers like `>`, list markers, and indentation have been consumed). @@ -9660,8 +9706,9 @@ just above `stack_bottom` (or the first element if `stack_bottom` is NULL). We keep track of the `openers_bottom` for each delimiter -type (`*`, `_`) and each length of the closing delimiter run -(modulo 3). Initialize this to `stack_bottom`. +type (`*`, `_`), indexed to the length of the closing delimiter run +(modulo 3) and to whether the closing delimiter can also be an +opener. Initialize this to `stack_bottom`. Then we repeat the following until we run out of potential closers: @@ -9707,4 +9754,3 @@ closers: After we're done, we remove all delimiters above `stack_bottom` from the delimiter stack. - From a19627a8a13865aa0b0c85bfb1392fe35c483c4b Mon Sep 17 00:00:00 2001 From: Jonathan Coates Date: Wed, 27 Apr 2022 15:41:00 +0100 Subject: [PATCH 2/6] Fix handling of too-long entities - Remove duplicated logic, the parser combinator version now uses the stateful version under the hood. This also fixes the problem with invalid characters still being added to the buffer. - Update length limits of escapes to match the spec. 
--- src/parser.ml | 123 ++++++++--------------------------------- tests/dune.inc | 1 + tests/extract_tests.ml | 2 +- 3 files changed, 24 insertions(+), 102 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 0937224e..9aa6a269 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -4,7 +4,7 @@ open Compat module Sub : sig type t - val of_string : string -> t + val of_string : ?off:int -> string -> t val to_string : t -> string @@ -30,6 +30,8 @@ module Sub : sig val is_empty : t -> bool + val get_offset : t -> int + val length : t -> int val sub : len:int -> t -> t @@ -40,12 +42,14 @@ end = struct ; len : int } - let of_string base = { base; off = 0; len = String.length base } + let of_string ?(off=0) base = { base; off; len = String.length base - off } let to_string { base; off; len } = String.sub base off len let print ppf s = Format.fprintf ppf "%S" (to_string s) + let get_offset { off; _ } = off + let length { len; _ } = len let offset n { base; off; len } = @@ -103,15 +107,9 @@ end = struct in loop n s - let tails n s = + let tails n { base; off; len } = if n < 0 then invalid_arg "tails"; - let rec loop n s = - if n = 0 then - s - else - loop (pred n) (tail s) - in - loop n s + { base; off = off + n; len = len - n } let is_empty s = length s = 0 @@ -177,6 +175,8 @@ module P : sig val peek_after : char -> state -> char val pair : 'a t -> 'b t -> ('a * 'b) t + + val on_sub : (Sub.t -> ('a * Sub.t)) -> 'a t end = struct type state = { str : string @@ -290,6 +290,11 @@ end = struct let x = p st in let y = q st in (x, y) + + let on_sub fn st = + let result, s = fn (Sub.of_string ~off:st.pos st.str) in + st.pos <- Sub.get_offset s; + result end type html_kind = @@ -531,7 +536,7 @@ let entity s = match Sub.heads 2 s with | '#' :: ('x' | 'X') :: _ -> let rec loop m n s = - if m > 8 then raise Fail; + if m > 6 then raise Fail; match Sub.head s with | Some ('a' .. 
'f' as c) -> loop @@ -561,7 +566,7 @@ let entity s = loop 0 0 (Sub.tails 2 s) | '#' :: _ -> let rec loop m n s = - if m > 8 then raise Fail; + if m > 7 then raise Fail; match Sub.head s with | Some ('0' .. '9' as c) -> loop (succ m) ((n * 10) + Char.code c - Char.code '0') (Sub.tail s) @@ -1051,95 +1056,11 @@ let inline_attribute_string s = attr let entity buf st = - let p = pos st in - if next st <> '&' then raise Fail; - match peek st with - | Some '#' -> ( - junk st; - match peek st with - | Some ('x' | 'X') -> - junk st; - let rec aux n m = - if n > 8 then - Buffer.add_string buf (range st p (pos st - p)) - else - match peek st with - | Some ('0' .. '9' as c) -> - junk st; - aux (succ n) ((m * 16) + Char.code c - Char.code '0') - | Some ('a' .. 'f' as c) -> - junk st; - aux (succ n) ((m * 16) + Char.code c - Char.code 'a' + 10) - | Some ('A' .. 'F' as c) -> - junk st; - aux (succ n) ((m * 16) + Char.code c - Char.code 'A' + 10) - | Some ';' -> - junk st; - if n = 0 then - Buffer.add_string buf (range st p (pos st - p)) - else - let u = - if Uchar.is_valid m && m <> 0 then - Uchar.of_int m - else - Uchar.rep - in - Buffer.add_utf_8_uchar buf u - | Some _ - | None -> - Buffer.add_string buf (range st p (pos st - p)) - in - aux 0 0 - | Some '0' .. '9' -> - let rec aux n m = - if n > 8 then - Buffer.add_string buf (range st p (pos st - p)) - else - match peek st with - | Some ('0' .. '9' as c) -> - junk st; - aux (succ n) ((m * 10) + Char.code c - Char.code '0') - | Some ';' -> - junk st; - if n = 0 then - Buffer.add_string buf (range st p (pos st - p)) - else - let u = - if Uchar.is_valid m && m <> 0 then - Uchar.of_int m - else - Uchar.rep - in - Buffer.add_utf_8_uchar buf u - | Some _ - | None -> - Buffer.add_string buf (range st p (pos st - p)) - in - aux 0 0 - | Some _ - | None -> - Buffer.add_string buf (range st p (pos st - p))) - | Some ('0' .. '9' | 'a' .. 'z' | 'A' .. 'Z') -> - let q = pos st in - let rec aux () = - match peek st with - | Some ('0' .. 
'9' | 'a' .. 'z' | 'A' .. 'Z') -> - junk st; - aux () - | Some ';' -> ( - let name = range st q (pos st - q) in - junk st; - match Entities.f name with - | [] -> Buffer.add_string buf (range st p (pos st - p)) - | _ :: _ as cps -> List.iter (Buffer.add_utf_8_uchar buf) cps) - | Some _ - | None -> - Buffer.add_string buf (range st p (pos st - p)) - in - aux () - | Some _ - | None -> - Buffer.add_string buf (range st p (pos st - p)) + junk st; + match on_sub entity st with + | cs -> List.iter (Buffer.add_utf_8_uchar buf) cs + | exception Fail -> + Buffer.add_char buf '&' module Pre = struct type delim = diff --git a/tests/dune.inc b/tests/dune.inc index 5a9d1832..aae1beae 100644 --- a/tests/dune.inc +++ b/tests/dune.inc @@ -4724,6 +4724,7 @@ (alias spec-025) (alias spec-026) (alias spec-027) + (alias spec-028) (alias spec-029) (alias spec-030) (alias spec-031) diff --git a/tests/extract_tests.ml b/tests/extract_tests.ml index f2f6136a..a2e2dbab 100644 --- a/tests/extract_tests.ml +++ b/tests/extract_tests.ml @@ -9,7 +9,7 @@ let protect ~finally f = r let disabled = - [ 028; 171; 206; 215; 216; 410; 411; 414; 415; 416; 428; 468; 469; 519; 539 ] + [ 171; 206; 215; 216; 410; 411; 414; 415; 416; 428; 468; 469; 519; 539 ] let with_open_in fn f = let ic = open_in fn in From 79e3b56e5baa6a2af476e24f61941224ee1b8edc Mon Sep 17 00:00:00 2001 From: Jonathan Coates Date: Wed, 27 Apr 2022 15:41:48 +0100 Subject: [PATCH 3/6] Treat textarea as a special tag --- src/parser.ml | 2 +- tests/dune.inc | 1 + tests/extract_tests.ml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 9aa6a269..0e93fefb 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -810,7 +810,7 @@ let known_tags = ; "ul" ] -let special_tags = [ "script"; "pre"; "style" ] +let special_tags = [ "pre"; "script"; "style"; "textarea" ] let known_tag s = let s = String.lowercase_ascii s in diff --git a/tests/dune.inc b/tests/dune.inc index aae1beae..c9e4868d 100644 --- 
a/tests/dune.inc +++ b/tests/dune.inc @@ -4867,6 +4867,7 @@ (alias spec-168) (alias spec-169) (alias spec-170) + (alias spec-171) (alias spec-172) (alias spec-173) (alias spec-174) diff --git a/tests/extract_tests.ml b/tests/extract_tests.ml index a2e2dbab..a727a4e4 100644 --- a/tests/extract_tests.ml +++ b/tests/extract_tests.ml @@ -9,7 +9,7 @@ let protect ~finally f = r let disabled = - [ 171; 206; 215; 216; 410; 411; 414; 415; 416; 428; 468; 469; 519; 539 ] + [ 206; 215; 216; 410; 411; 414; 415; 416; 428; 468; 469; 519; 539 ] let with_open_in fn f = let ic = open_in fn in From f1d3b68a10cb1ef80e3d8232ab1e68d507a416e5 Mon Sep 17 00:00:00 2001 From: Jonathan Coates Date: Thu, 28 Apr 2022 13:47:52 +0100 Subject: [PATCH 4/6] Reformat code --- src/parser.ml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/parser.ml b/src/parser.ml index 0e93fefb..da5f66cd 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -42,7 +42,7 @@ end = struct ; len : int } - let of_string ?(off=0) base = { base; off; len = String.length base - off } + let of_string ?(off = 0) base = { base; off; len = String.length base - off } let to_string { base; off; len } = String.sub base off len @@ -176,7 +176,7 @@ module P : sig val pair : 'a t -> 'b t -> ('a * 'b) t - val on_sub : (Sub.t -> ('a * Sub.t)) -> 'a t + val on_sub : (Sub.t -> 'a * Sub.t) -> 'a t end = struct type state = { str : string @@ -1059,8 +1059,7 @@ let entity buf st = junk st; match on_sub entity st with | cs -> List.iter (Buffer.add_utf_8_uchar buf) cs - | exception Fail -> - Buffer.add_char buf '&' + | exception Fail -> Buffer.add_char buf '&' module Pre = struct type delim = From 01f7f216b7dccc6033ad0da44ff816cf3abbeb80 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 23 May 2022 17:46:30 -0400 Subject: [PATCH 5/6] Apply updated format fix --- .ocamlformat | 5 +- bin/main.ml | 9 +- src/ast.ml | 4 +- src/block.ml | 49 +-- src/compat.ml | 20 +- src/html.ml | 46 +-- src/html.mli | 1 - src/omd.ml | 5 - 
src/omd.mli | 3 - src/parser.ml | 666 ++++++++++------------------------------- src/sexp.ml | 15 +- src/toc.ml | 31 +- tests/extract_tests.ml | 11 +- tests/omd.ml | 1 - 14 files changed, 194 insertions(+), 672 deletions(-) diff --git a/.ocamlformat b/.ocamlformat index b50de730..f277a599 100644 --- a/.ocamlformat +++ b/.ocamlformat @@ -1,12 +1,9 @@ -version = 0.19.0 +version = 0.21.0 exp-grouping = preserve break-fun-sig = fit-or-vertical break-fun-decl = fit-or-vertical wrap-fun-args = false dock-collection-brackets = false -break-cases = all break-separators = before break-infix = fit-or-vertical -if-then-else = k-r -nested-match = align type-decl = sparse diff --git a/bin/main.ml b/bin/main.ml index b313a92c..00d3efb9 100644 --- a/bin/main.ml +++ b/bin/main.ml @@ -32,7 +32,6 @@ let print_version () = exit 0 let input = ref [] - let output = ref "" let spec = @@ -53,14 +52,10 @@ let main () = (fun s -> input := s :: !input) "omd [options] [inputfile1 .. inputfileN] [options]"; let with_output f = - if !output = "" then - f stdout - else - with_open_out !output f + if !output = "" then f stdout else with_open_out !output f in with_output @@ fun oc -> - if !input = [] then - process stdin oc + if !input = [] then process stdin oc else let f filename = with_open_in filename @@ fun ic -> process ic oc in List.(iter f (rev !input)) diff --git a/src/ast.ml b/src/ast.ml index 8495e14a..ca8349e3 100644 --- a/src/ast.ml +++ b/src/ast.ml @@ -96,7 +96,5 @@ module Mapper = MakeMapper (StringT) (InlineT) let same_block_list_kind k1 k2 = match (k1, k2) with - | Ordered (_, c1), Ordered (_, c2) - | Bullet c1, Bullet c2 -> - c1 = c2 + | Ordered (_, c1), Ordered (_, c2) | Bullet c1, Bullet c2 -> c1 = c2 | _ -> false diff --git a/src/block.ml b/src/block.ml index cda413bb..44cc5d6d 100644 --- a/src/block.ml +++ b/src/block.ml @@ -33,20 +33,11 @@ module Pre = struct let trim_left s = let rec loop i = - if i >= String.length s then - i - else - match s.[i] with - | ' ' - | '\t' -> - 
loop (succ i) - | _ -> i + if i >= String.length s then i + else match s.[i] with ' ' | '\t' -> loop (succ i) | _ -> i in let i = loop 0 in - if i > 0 then - String.sub s i (String.length s - i) - else - s + if i > 0 then String.sub s i (String.length s - i) else s let rec close link_defs { blocks; next } = let finish = finish link_defs in @@ -61,10 +52,7 @@ module Pre = struct in let s = String.sub s off (String.length s - off) |> String.trim in link_defs := defs @ !link_defs; - if s = "" then - blocks - else - Paragraph ([], s) :: blocks + if s = "" then blocks else Paragraph ([], s) :: blocks | Rfenced_code (_, _, _kind, (label, _other), [], attr) -> Code_block (attr, label, "") :: blocks | Rfenced_code (_, _, _kind, (label, _other), l, attr) -> @@ -78,10 +66,7 @@ module Pre = struct Definition_list ([], l @ [ { term; defs = List.rev defs } ]) :: blocks | Rindented_code l -> (* TODO: trim from the right *) - let rec loop = function - | "" :: l -> loop l - | _ as l -> l - in + let rec loop = function "" :: l -> loop l | _ as l -> l in Code_block ([], "", concat (loop l)) :: blocks | Rhtml (_, l) -> Html_block ([], concat l) :: blocks | Rempty -> blocks @@ -89,7 +74,6 @@ module Pre = struct and finish link_defs state = List.rev (close link_defs state) let empty = { blocks = []; next = Rempty } - let classify_line s = Parser.parse s let rec process link_defs { blocks; next } s = @@ -139,10 +123,7 @@ module Pre = struct | Rfenced_code (ind, num, q, info, lines, a), _ -> let s = let ind = min (Parser.indent s) ind in - if ind > 0 then - Sub.offset ind s - else - s + if ind > 0 then Sub.offset ind s else s in { blocks ; next = Rfenced_code (ind, num, q, info, Sub.to_string s :: lines, a) @@ -195,21 +176,13 @@ module Pre = struct true | _ -> false in - if prev_empty && new_block state.next then - Loose - else - style + if prev_empty && new_block state.next then Loose else style in { blocks; next = Rlist (kind, style, false, ind, items, state) } | ( Rlist (kind, style, 
prev_empty, _, items, state) , Llist_item (kind', ind, s) ) when same_block_list_kind kind kind' -> - let style = - if prev_empty then - Loose - else - style - in + let style = if prev_empty then Loose else style in { blocks ; next = Rlist @@ -230,8 +203,7 @@ module Pre = struct | None -> None) | Rparagraph (_ :: _ as lines) -> ( match classify_line s with - | Parser.Lparagraph - | Lindented_code _ + | Parser.Lparagraph | Lindented_code _ | Lsetext_heading (1, _) | Lhtml (false, _) -> Some (Rparagraph (Sub.to_string s :: lines)) @@ -258,8 +230,7 @@ module Pre = struct let read_line s off = let buf = Buffer.create 128 in let rec loop cr_read off = - if off >= String.length s then - (Buffer.contents buf, None) + if off >= String.length s then (Buffer.contents buf, None) else match s.[off] with | '\n' -> (Buffer.contents buf, Some (succ off)) diff --git a/src/compat.ml b/src/compat.ml index 6f3edfc2..7789f422 100644 --- a/src/compat.ml +++ b/src/compat.ml @@ -9,18 +9,11 @@ module List = struct let rec find_map f = function | [] -> None - | x :: xs -> - match f x with - | None -> find_map f xs - | y -> y + | x :: xs -> ( match f x with None -> find_map f xs | y -> y) let rec find_opt p = function | [] -> None - | x :: l -> - if p x then - Some x - else - find_opt p l + | x :: l -> if p x then Some x else find_opt p l end module Buffer = struct @@ -51,12 +44,9 @@ module String = struct let for_all p s = let n = length s in let rec loop i = - if i = n then - true - else if p (unsafe_get s i) then - loop (succ i) - else - false + if i = n then true + else if p (unsafe_get s i) then loop (succ i) + else false in loop 0 end diff --git a/src/html.ml b/src/html.ml index c5bd6212..12c4a31b 100644 --- a/src/html.ml +++ b/src/html.ml @@ -12,17 +12,11 @@ type t = | Concat of t * t let elt etype name attrs childs = Element (etype, name, attrs, childs) - let text s = Text s - let raw s = Raw s let concat t1 t2 = - match (t1, t2) with - | Null, t - | t, Null -> - t - | _ -> Concat 
(t1, t2) + match (t1, t2) with Null, t | t, Null -> t | _ -> Concat (t1, t2) let concat_map f l = List.fold_left (fun accu x -> concat accu (f x)) Null l @@ -30,8 +24,7 @@ let concat_map f l = List.fold_left (fun accu x -> concat accu (f x)) Null l let htmlentities s = let b = Buffer.create (String.length s) in let rec loop i = - if i >= String.length s then - Buffer.contents b + if i >= String.length s then Buffer.contents b else begin begin match s.[i] with @@ -91,17 +84,13 @@ let to_plain_text t = let buf = Buffer.create 1024 in let rec go : _ inline -> unit = function | Concat (_, l) -> List.iter go l - | Text (_, t) - | Code (_, t) -> - Buffer.add_string buf t + | Text (_, t) | Code (_, t) -> Buffer.add_string buf t | Emph (_, i) | Strong (_, i) | Link (_, { label = i; _ }) | Image (_, { label = i; _ }) -> go i - | Hard_break _ - | Soft_break _ -> - Buffer.add_char buf ' ' + | Hard_break _ | Soft_break _ -> Buffer.add_char buf ' ' | Html _ -> () in go t; @@ -111,18 +100,14 @@ let nl = Raw "\n" let rec url label destination title attrs = let attrs = - match title with - | None -> attrs - | Some title -> ("title", title) :: attrs + match title with None -> attrs | Some title -> ("title", title) :: attrs in let attrs = ("href", escape_uri destination) :: attrs in elt Inline "a" attrs (Some (inline label)) and img label destination title attrs = let attrs = - match title with - | None -> attrs - | Some title -> ("title", title) :: attrs + match title with None -> attrs | Some title -> ("title", title) :: attrs in let attrs = ("src", escape_uri destination) :: ("alt", to_plain_text label) :: attrs @@ -148,11 +133,7 @@ let rec block = function elt Block "blockquote" attr (Some (concat nl (concat_map block q))) | Paragraph (attr, md) -> elt Block "p" attr (Some (inline md)) | List (attr, ty, sp, bl) -> - let name = - match ty with - | Ordered _ -> "ol" - | Bullet _ -> "ul" - in + let name = match ty with Ordered _ -> "ol" | Bullet _ -> "ul" in let attr = match ty 
with | Ordered (n, _) when n <> 1 -> ("start", string_of_int n) :: attr @@ -164,21 +145,14 @@ let rec block = function | Paragraph (_, t), Tight -> concat (inline t) nl | _ -> block t in - let nl = - if sp = Tight then - Null - else - nl - in + let nl = if sp = Tight then Null else nl in elt Block "li" [] (Some (concat nl (concat_map block' t))) in elt Block name attr (Some (concat nl (concat_map li bl))) | Code_block (attr, label, code) -> let code_attr = - if String.trim label = "" then - [] - else - [ ("class", "language-" ^ label) ] + if String.trim label = "" then [] + else [ ("class", "language-" ^ label) ] in let c = text code in elt Block "pre" attr (Some (elt Inline "code" code_attr (Some c))) diff --git a/src/html.mli b/src/html.mli index 16ceba30..2b8449fe 100644 --- a/src/html.mli +++ b/src/html.mli @@ -12,5 +12,4 @@ type t = | Concat of t * t val of_doc : attributes block list -> t - val to_string : t -> string diff --git a/src/omd.ml b/src/omd.ml index a091b99c..6104ad75 100644 --- a/src/omd.ml +++ b/src/omd.ml @@ -15,13 +15,8 @@ let parse_inlines (md, defs) = List.map (Mapper.map (parse_inline defs)) md let of_channel ic = parse_inlines (Pre.of_channel ic) - let of_string s = parse_inlines (Pre.of_string s) - let to_html doc = Html.to_string (Html.of_doc doc) - let to_sexp ast = Format.asprintf "@[%a@]@." 
Sexp.print (Sexp.create ast) - let headers = Toc.headers - let toc = Toc.toc diff --git a/src/omd.mli b/src/omd.mli index f0f055bd..f0b3f6ee 100644 --- a/src/omd.mli +++ b/src/omd.mli @@ -47,11 +47,8 @@ type doc = attributes block list (** A markdown document *) val of_channel : in_channel -> doc - val of_string : string -> doc - val to_html : doc -> string - val to_sexp : doc -> string val headers : diff --git a/src/parser.ml b/src/parser.ml index da5f66cd..63429897 100644 --- a/src/parser.ml +++ b/src/parser.ml @@ -5,35 +5,20 @@ module Sub : sig type t val of_string : ?off:int -> string -> t - val to_string : t -> string - val offset : int -> t -> t - val lexbuf : t -> Lexing.lexbuf - val contains : string -> t -> bool - val print : Format.formatter -> t -> unit - val head : ?rev:unit -> t -> char option - val tail : ?rev:unit -> t -> t - val heads : int -> t -> char list - val tails : int -> t -> t - val for_all : (char -> bool) -> t -> bool - val exists : (char -> bool) -> t -> bool - val is_empty : t -> bool - val get_offset : t -> int - val length : t -> int - val sub : len:int -> t -> t end = struct type t = @@ -43,20 +28,15 @@ end = struct } let of_string ?(off = 0) base = { base; off; len = String.length base - off } - let to_string { base; off; len } = String.sub base off len - let print ppf s = Format.fprintf ppf "%S" (to_string s) - let get_offset { off; _ } = off - let length { len; _ } = len let offset n { base; off; len } = if n < 0 then invalid_arg "offset"; let rec loop n base off len = - if n = 0 || len = 0 then - { base; off; len } + if n = 0 || len = 0 then { base; off; len } else match base.[off] with | '\t' -> @@ -76,10 +56,8 @@ end = struct let contains s1 { base; off; len } = let rec loop off = - if off + String.length s1 > len then - false - else - s1 = String.sub base off (String.length s1) || loop (off + 1) + if off + String.length s1 > len then false + else s1 = String.sub base off (String.length s1) || loop (off + 1) in loop off @@ 
-98,12 +76,9 @@ end = struct let heads n s = if n < 0 then invalid_arg "heads"; let rec loop n s = - if n = 0 || length s = 0 then - [] + if n = 0 || length s = 0 then [] else - match head s with - | Some c -> c :: loop (pred n) (tail s) - | None -> [] + match head s with Some c -> c :: loop (pred n) (tail s) | None -> [] in loop n s @@ -115,12 +90,9 @@ end = struct let exists f s = let rec loop s i = - if i >= s.len then - false - else if f s.base.[s.off + i] then - true - else - loop s (succ i) + if i >= s.len then false + else if f s.base.[s.off + i] then true + else loop s (succ i) in loop s 0 @@ -135,47 +107,27 @@ exception Fail module P : sig type state - type 'a t = state -> 'a val of_string : string -> state - val peek : char option t - val peek_exn : char t - val pos : state -> int - val range : state -> int -> int -> string - val set_pos : state -> int -> unit - val junk : unit t - val char : char -> unit t - val next : char t - val ( ||| ) : 'a t -> 'a t -> 'a t - val ws : unit t - val sp : unit t - val ws1 : unit t - val ( >>> ) : unit t -> 'a t -> 'a t - val ( <<< ) : 'a t -> unit t -> 'a t - val protect : 'a t -> 'a t - val peek_before : char -> state -> char - val peek_after : char -> state -> char - val pair : 'a t -> 'b t -> ('a * 'b) t - val on_sub : (Sub.t -> 'a * Sub.t) -> 'a t end = struct type state = @@ -193,86 +145,60 @@ end = struct st.pos <- st.pos + 1 let next st = - if st.pos >= String.length st.str then - raise Fail + if st.pos >= String.length st.str then raise Fail else let c = st.str.[st.pos] in st.pos <- st.pos + 1; c let peek_exn st = - if st.pos >= String.length st.str then - raise Fail - else - st.str.[st.pos] + if st.pos >= String.length st.str then raise Fail else st.str.[st.pos] let peek st = - if st.pos >= String.length st.str then - None - else - Some st.str.[st.pos] + if st.pos >= String.length st.str then None else Some st.str.[st.pos] - let peek_before c st = - if st.pos = 0 then - c - else - st.str.[st.pos - 1] + let 
peek_before c st = if st.pos = 0 then c else st.str.[st.pos - 1] let peek_after c st = - if st.pos + 1 >= String.length st.str then - c - else - st.str.[st.pos + 1] + if st.pos + 1 >= String.length st.str then c else st.str.[st.pos + 1] let pos st = st.pos - let range st pos n = String.sub st.str pos n - let set_pos st pos = st.pos <- pos - let junk st = if st.pos < String.length st.str then st.pos <- st.pos + 1 let protect p st = let off = pos st in - try p st with - | e -> - set_pos st off; - raise e + try p st + with e -> + set_pos st off; + raise e - let ( ||| ) p1 p2 st = - try protect p1 st with - | Fail -> p2 st + let ( ||| ) p1 p2 st = try protect p1 st with Fail -> p2 st let ws st = let rec loop () = match peek_exn st with - | ' ' - | '\t' - | '\010' .. '\013' -> + | ' ' | '\t' | '\010' .. '\013' -> junk st; loop () | _ -> () in - try loop () with - | Fail -> () + try loop () with Fail -> () let sp st = let rec loop () = match peek_exn st with - | ' ' - | '\t' -> + | ' ' | '\t' -> junk st; loop () | _ -> () in - try loop () with - | Fail -> () + try loop () with Fail -> () let ws1 st = match peek_exn st with - | ' ' - | '\t' - | '\010' .. '\013' -> + | ' ' | '\t' | '\010' .. '\013' -> junk st; ws st | _ -> raise Fail @@ -327,26 +253,16 @@ let sp3 s = let s = Sub.tail s in match Sub.head s with | Some ' ' -> (3, Sub.tail s) - | Some _ - | None -> - (2, s)) - | Some _ - | None -> - (1, s)) - | Some _ - | None -> - (0, s) - -let ( ||| ) p1 p2 s = - try p1 s with - | Fail -> p2 s + | Some _ | None -> (2, s)) + | Some _ | None -> (1, s)) + | Some _ | None -> (0, s) + +let ( ||| ) p1 p2 s = try p1 s with Fail -> p2 s let rec ws ?rev s = match Sub.head ?rev s with | Some (' ' | '\t' | '\010' .. 
'\013') -> ws ?rev (Sub.tail ?rev s) - | None - | Some _ -> - s + | None | Some _ -> s let is_empty s = Sub.is_empty (ws s) @@ -363,9 +279,7 @@ let thematic_break s = Lthematic_break in loop 1 (Sub.tail s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let setext_heading s = match Sub.head s with @@ -373,56 +287,19 @@ let setext_heading s = let rec loop n s = match Sub.head s with | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) - | Some _ - | None -> + | Some _ | None -> if not (Sub.is_empty (ws s)) then raise Fail; if c = '-' && n = 1 then raise Fail; (* can be interpreted as an empty list item *) - Lsetext_heading - ( (if c = '-' then - 2 - else - 1) - , n ) + Lsetext_heading ((if c = '-' then 2 else 1), n) in loop 1 (Sub.tail s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let is_punct = function - | '!' - | '"' - | '#' - | '$' - | '%' - | '&' - | '\'' - | '(' - | ')' - | '*' - | '+' - | ',' - | '-' - | '.' - | '/' - | ':' - | ';' - | '<' - | '=' - | '>' - | '?' - | '@' - | '[' - | '\\' - | ']' - | '^' - | '_' - | '`' - | '{' - | '|' - | '}' - | '~' -> + | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' + | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' 
| '@' | '[' | '\\' | ']' | '^' + | '_' | '`' | '{' | '|' | '}' | '~' -> true | _ -> false @@ -431,8 +308,7 @@ let parse_attributes = function | Some s -> ( let attributes = String.split_on_char ' ' s in let f (id, classes, acc) s = - if s = "" then - (id, classes, acc) + if s = "" then (id, classes, acc) else match s.[0] with | '#' -> (Some (String.sub s 1 (String.length s - 1)), classes, acc) @@ -448,12 +324,9 @@ let parse_attributes = function let acc = if classes <> [] then ("class", String.concat " " (List.rev classes)) :: acc - else - acc + else acc in - match id with - | Some id -> ("id", id) :: acc - | None -> acc) + match id with Some id -> ("id", id) :: acc | None -> acc) let attribute_string s = let buf = Buffer.create 64 in @@ -466,8 +339,7 @@ let attribute_string s = | Some c when is_punct c -> Buffer.add_char buf c; loop (Sub.tail s) - | Some _ - | None -> + | Some _ | None -> Buffer.add_char buf c; loop s) | Some '{' -> @@ -521,9 +393,7 @@ let atx_heading s = let rec loop t = match Sub.head ~rev:() t with | Some '#' -> loop (Sub.tail ~rev:() t) - | Some (' ' | '\t' | '\010' .. '\013') - | None -> - ws ~rev:() t + | Some (' ' | '\t' | '\010' .. 
'\013') | None -> ws ~rev:() t | Some _ -> s in Latx_heading (n, Sub.to_string (ws (loop s)), a) @@ -553,15 +423,11 @@ let entity s = | Some ';' -> if m = 0 then raise Fail; let u = - if n = 0 || not (Uchar.is_valid n) then - Uchar.rep - else - Uchar.of_int n + if n = 0 || not (Uchar.is_valid n) then Uchar.rep + else Uchar.of_int n in ([ u ], Sub.tail s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail in loop 0 0 (Sub.tails 2 s) | '#' :: _ -> @@ -573,15 +439,11 @@ let entity s = | Some ';' -> if m = 0 then raise Fail; let u = - if n = 0 || not (Uchar.is_valid n) then - Uchar.rep - else - Uchar.of_int n + if n = 0 || not (Uchar.is_valid n) then Uchar.rep + else Uchar.of_int n in ([ u ], Sub.tail s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail in loop 0 0 (Sub.tail s) | ('a' .. 'z' | 'A' .. 'Z') :: _ -> @@ -594,9 +456,7 @@ let entity s = match Entities.f name with | [] -> raise Fail | cps -> (cps, Sub.tail t)) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail in loop 1 (Sub.tail s) | _ -> raise Fail @@ -604,23 +464,13 @@ let entity s = let info_string c s = let buf = Buffer.create 17 in let s, a = - match Sub.head ~rev:() s with - | Some '}' -> attribute_string s - | _ -> (s, []) + match Sub.head ~rev:() s with Some '}' -> attribute_string s | _ -> (s, []) in let s = ws ~rev:() (ws s) in let rec loop s = match Sub.head s with - | Some (' ' | '\t' | '\010' .. '\013') - | None -> - if - c = '`' - && Sub.exists - (function - | '`' -> true - | _ -> false) - s - then + | Some (' ' | '\t' | '\010' .. 
'\013') | None -> + if c = '`' && Sub.exists (function '`' -> true | _ -> false) s then raise Fail; ((Buffer.contents buf, Sub.to_string (ws s)), a) | Some '`' when c = '`' -> raise Fail @@ -630,8 +480,7 @@ let info_string c s = | Some c when is_punct c -> Buffer.add_char buf c; loop (Sub.tail s) - | Some _ - | None -> + | Some _ | None -> Buffer.add_char buf c; loop s) | Some ('&' as c) -> ( @@ -655,31 +504,21 @@ let fenced_code ind s = let rec loop n s = match Sub.head s with | Some c1 when c = c1 -> loop (succ n) (Sub.tail s) - | Some _ - | None -> + | Some _ | None -> if n < 3 then raise Fail; let s, a = info_string c s in - let c = - if c = '`' then - Backtick - else - Tilde - in + let c = if c = '`' then Backtick else Tilde in Lfenced_code (ind, n, c, s, a) in loop 1 (Sub.tail s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let indent s = let rec loop n s = match Sub.head s with | Some ' ' -> loop (n + 1) (Sub.tail s) | Some '\t' -> loop (n + 4) (Sub.tail s) - | Some _ - | None -> - n + | Some _ | None -> n in loop 0 s @@ -687,21 +526,13 @@ let unordered_list_item ind s = match Sub.head s with | Some (('+' | '-' | '*') as c) -> let s = Sub.tail s in - if is_empty s then - Llist_item (Bullet c, 2 + ind, s) + if is_empty s then Llist_item (Bullet c, 2 + ind, s) else let n = indent s in if n = 0 then raise Fail; - let n = - if n <= 4 then - n - else - 1 - in + let n = if n <= 4 then n else 1 in Llist_item (Bullet c, n + 1 + ind, Sub.offset n s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let ordered_list_item ind s = let rec loop n m s = @@ -711,21 +542,13 @@ let ordered_list_item ind s = loop (succ n) ((m * 10) + Char.code c - Char.code '0') (Sub.tail s) | Some (('.' 
| ')') as c) -> let s = Sub.tail s in - if is_empty s then - Llist_item (Ordered (m, c), n + 1 + ind, s) + if is_empty s then Llist_item (Ordered (m, c), n + 1 + ind, s) else let ind' = indent s in if ind' = 0 then raise Fail; - let ind' = - if ind' <= 4 then - ind' - else - 1 - in + let ind' = if ind' <= 4 then ind' else 1 in Llist_item (Ordered (m, c), n + ind + ind' + 1, Sub.offset ind' s) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail in loop 0 0 s @@ -736,14 +559,10 @@ let tag_name s0 = match Sub.head s with | Some ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-') -> loop (succ len) (Sub.tail s) - | Some _ - | None -> - (Sub.to_string (Sub.sub s0 ~len), s) + | Some _ | None -> (Sub.to_string (Sub.sub s0 ~len), s) in loop 1 (Sub.tail s0) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let known_tags = [ "address" @@ -826,34 +645,26 @@ let closing_tag s = | Some '>' -> if not (is_empty (Sub.tail s)) then raise Fail; Lhtml (false, Hblank) - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let special_tag tag s = if not (special_tag tag) then raise Fail; match Sub.head s with - | Some (' ' | '\t' | '\010' .. '\013' | '>') - | None -> + | Some (' ' | '\t' | '\010' .. '\013' | '>') | None -> Lhtml (true, Hcontains [ ""; "
"; "" ]) | Some _ -> raise Fail let known_tag tag s = if not (known_tag tag) then raise Fail; match Sub.heads 2 s with - | (' ' | '\t' | '\010' .. '\013') :: _ - | [] - | '>' :: _ - | '/' :: '>' :: _ -> + | (' ' | '\t' | '\010' .. '\013') :: _ | [] | '>' :: _ | '/' :: '>' :: _ -> Lhtml (true, Hblank) | _ -> raise Fail let ws1 s = match Sub.head s with | Some (' ' | '\t' | '\010' .. '\013') -> ws s - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let attribute_name s = match Sub.head s with @@ -862,14 +673,10 @@ let attribute_name s = match Sub.head s with | Some ('a' .. 'z' | 'A' .. 'Z' | '_' | '.' | ':' | '0' .. '9') -> loop (Sub.tail s) - | Some _ - | None -> - s + | Some _ | None -> s in loop s - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let attribute_value s = match Sub.head s with @@ -902,16 +709,10 @@ let attribute s = | Some '=' -> let s = ws (Sub.tail s) in attribute_value s - | Some _ - | None -> - s + | Some _ | None -> s let attributes s = - let rec loop s = - match attribute s with - | s -> loop s - | exception Fail -> s - in + let rec loop s = match attribute s with s -> loop s | exception Fail -> s in loop s let open_tag s = @@ -948,16 +749,12 @@ let blank s = let tag_string s = let buf = Buffer.create 17 in let s, a = - match Sub.head ~rev:() s with - | Some '}' -> attribute_string s - | _ -> (s, []) + match Sub.head ~rev:() s with Some '}' -> attribute_string s | _ -> (s, []) in let s = ws ~rev:() (ws s) in let rec loop s = match Sub.head s with - | Some (' ' | '\t' | '\010' .. '\013') - | None -> - (Buffer.contents buf, a) + | Some (' ' | '\t' | '\010' .. 
'\013') | None -> (Buffer.contents buf, a) | Some c -> Buffer.add_char buf c; loop (Sub.tail s) @@ -980,12 +777,7 @@ let parse s0 = match Sub.head s with | Some '>' -> let s = Sub.offset 1 s in - let s = - if indent s > 0 then - Sub.offset 1 s - else - s - in + let s = if indent s > 0 then Sub.offset 1 s else s in Lblockquote s | Some '=' -> setext_heading s | Some '-' -> @@ -1001,9 +793,7 @@ let parse s0 = | Some _ -> (blank ||| indented_code ind) s | None -> Lempty -let parse s = - try parse s with - | Fail -> Lparagraph +let parse s = try parse s with Fail -> Lparagraph open P @@ -1012,19 +802,15 @@ let is_empty st = try let rec loop () = match next st with - | ' ' - | '\t' - | '\010' .. '\013' -> - loop () + | ' ' | '\t' | '\010' .. '\013' -> loop () | _ -> set_pos st off; false in loop () - with - | Fail -> - set_pos st off; - true + with Fail -> + set_pos st off; + true let inline_attribute_string s = let ppos = pos s in @@ -1038,8 +824,7 @@ let inline_attribute_string s = | Some '}' -> junk s; Some (Buffer.contents buf) - | None - | Some '{' -> + | None | Some '{' -> set_pos s pos; None | Some c -> @@ -1081,20 +866,14 @@ module Pre = struct | Emph of delim * delim * emph_style * int | R of attributes inline - let concat = function - | [ x ] -> x - | l -> Concat ([], l) + let concat = function [ x ] -> x | l -> Concat ([], l) let left_flanking = function - | Emph (_, Other, _, _) - | Emph ((Ws | Punct), Punct, _, _) -> - true + | Emph (_, Other, _, _) | Emph ((Ws | Punct), Punct, _, _) -> true | _ -> false let right_flanking = function - | Emph (Other, _, _, _) - | Emph (Punct, (Ws | Punct), _, _) -> - true + | Emph (Other, _, _, _) | Emph (Punct, (Ws | Punct), _, _) -> true | _ -> false let is_opener = function @@ -1110,44 +889,11 @@ module Pre = struct | _ -> false let classify_delim = function - | '!' - | '"' - | '#' - | '$' - | '%' - | '&' - | '\'' - | '(' - | ')' - | '*' - | '+' - | ',' - | '-' - | '.' 
- | '/' - | ':' - | ';' - | '<' - | '=' - | '>' - | '?' - | '@' - | '[' - | '\\' - | ']' - | '^' - | '_' - | '`' - | '{' - | '|' - | '}' - | '~' -> + | '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' + | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | '\\' + | ']' | '^' | '_' | '`' | '{' | '|' | '}' | '~' -> Punct - | ' ' - | '\t' - | '\010' .. '\013' - | '\160' -> - Ws + | ' ' | '\t' | '\010' .. '\013' | '\160' -> Ws | _ -> Other let to_r = function @@ -1164,40 +910,25 @@ module Pre = struct | (Emph (_, post, q2, n2) as x) :: xs when is_closer x && q1 = q2 -> let xs = if n1 >= 2 && n2 >= 2 then - if n2 > 2 then - Emph (Punct, post, q2, n2 - 2) :: xs - else - xs - else if n2 > 1 then - Emph (Punct, post, q2, n2 - 1) :: xs - else - xs + if n2 > 2 then Emph (Punct, post, q2, n2 - 2) :: xs else xs + else if n2 > 1 then Emph (Punct, post, q2, n2 - 1) :: xs + else xs in let r = let il = concat (List.map to_r (parse_emph (List.rev acc))) in - if n1 >= 2 && n2 >= 2 then - R (Strong ([], il)) :: xs - else - R (Emph ([], il)) :: xs + if n1 >= 2 && n2 >= 2 then R (Strong ([], il)) :: xs + else R (Emph ([], il)) :: xs in let r = if n1 >= 2 && n2 >= 2 then - if n1 > 2 then - Emph (pre, Punct, q1, n1 - 2) :: r - else - r - else if n1 > 1 then - Emph (pre, Punct, q1, n1 - 1) :: r - else - r + if n1 > 2 then Emph (pre, Punct, q1, n1 - 2) :: r else r + else if n1 > 1 then Emph (pre, Punct, q1, n1 - 1) :: r + else r in parse_emph r | (Emph _ as x) :: xs1 as xs when is_opener x -> let xs' = parse_emph xs in - if xs' = xs then - loop (x :: acc) xs1 - else - loop acc xs' + if xs' = xs then loop (x :: acc) xs1 else loop acc xs' | x :: xs -> loop (x :: acc) xs | [] -> x :: List.rev acc in @@ -1214,9 +945,7 @@ let escape buf st = | Some c when is_punct c -> junk st; Buffer.add_char buf c - | Some _ - | None -> - Buffer.add_char buf '\\' + | Some _ | None -> Buffer.add_char buf '\\' let link_label allow_balanced_brackets st = if peek_exn st <> '[' 
then raise Fail; @@ -1263,14 +992,10 @@ let link_label allow_balanced_brackets st = let normalize s = let buf = Buffer.create (String.length s) in let rec loop start seen_ws i = - if i >= String.length s then - Buffer.contents buf + if i >= String.length s then Buffer.contents buf else match s.[i] with - | ' ' - | '\t' - | '\010' .. '\013' -> - loop start true (succ i) + | ' ' | '\t' | '\010' .. '\013' -> loop start true (succ i) | _ as c -> if (not start) && seen_ws then Buffer.add_char buf ' '; Buffer.add_char buf (Char.lowercase_ascii c); @@ -1280,17 +1005,14 @@ let normalize s = let tag_name st = match peek_exn st with - | 'a' .. 'z' - | 'A' .. 'Z' -> + | 'a' .. 'z' | 'A' .. 'Z' -> junk st; let rec loop () = match peek st with | Some ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-') -> junk st; loop () - | Some _ - | None -> - () + | Some _ | None -> () in loop () | _ -> raise Fail @@ -1302,9 +1024,7 @@ let ws_buf buf st = Buffer.add_char buf c; junk st; loop () - | Some _ - | None -> - () + | Some _ | None -> () in loop () @@ -1319,9 +1039,7 @@ let closing_tag st = let list p st = let rec loop () = - match protect p st with - | () -> loop () - | exception Fail -> () + match protect p st with () -> loop () | exception Fail -> () in loop () @@ -1354,15 +1072,7 @@ let double_quoted_attribute st = let unquoted_attribute st = let rec loop n = match peek_exn st with - | ' ' - | '\t' - | '\010' .. '\013' - | '"' - | '\'' - | '=' - | '<' - | '>' - | '`' -> + | ' ' | '\t' | '\010' .. '\013' | '"' | '\'' | '=' | '<' | '>' | '`' -> if n = 0 then raise Fail (* | '&' -> *) (* entity buf st; loop () *) @@ -1380,38 +1090,26 @@ let attribute_value st = let attribute_name st = match peek_exn st with - | 'a' .. 'z' - | 'A' .. 'Z' - | '_' - | ':' -> + | 'a' .. 'z' | 'A' .. 'Z' | '_' | ':' -> junk st; let rec loop () = match peek st with | Some ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' | '.' 
| ':' | '-') -> junk st; loop () - | Some _ - | None -> - () + | Some _ | None -> () in loop () | _ -> raise Fail -let option d p st = - match protect p st with - | r -> r - | exception Fail -> d - +let option d p st = match protect p st with r -> r | exception Fail -> d let some p st = Some (p st) - let attribute_value_specification = ws >>> char '=' >>> ws >>> attribute_value let ws1_buf buf st = match peek st with | Some (' ' | '\t' | '\010' .. '\013') -> ws_buf buf st - | Some _ - | None -> - raise Fail + | Some _ | None -> raise Fail let attribute st = ws1 st; @@ -1545,9 +1243,7 @@ let declaration st = junk st; Buffer.add_char buf c; loop () - | ' ' - | '\t' - | '\010' .. '\013' -> + | ' ' | '\t' | '\010' .. '\013' -> ws1_buf buf st; let rec loop () = match peek_exn st with @@ -1579,9 +1275,7 @@ let link_destination st = | '>' -> junk st; Buffer.contents buf - | '\010' .. '\013' - | '<' -> - raise Fail + | '\010' .. '\013' | '<' -> raise Fail | '\\' -> escape buf st; loop () @@ -1626,10 +1320,7 @@ let link_destination st = loop 0 let eol st = - match peek st with - | Some '\n' -> junk st - | Some _ -> raise Fail - | None -> () + match peek st with Some '\n' -> junk st | Some _ -> raise Fail | None -> () let link_title st = let buf = Buffer.create 17 in @@ -1674,34 +1365,26 @@ let link_title st = loop () | _ -> raise Fail -let space st = - match peek_exn st with - | ' ' -> junk st - | _ -> raise Fail +let space st = match peek_exn st with ' ' -> junk st | _ -> raise Fail let many p st = try while true do p st done - with - | Fail -> () + with Fail -> () let scheme st = match peek_exn st with - | 'a' .. 'z' - | 'A' .. 'Z' -> + | 'a' .. 'z' | 'A' .. 'Z' -> let rec loop n = if n < 32 then match peek st with | Some ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '+' | '.' | '-') -> junk st; loop (succ n) - | Some _ - | None -> - n - else - n + | Some _ | None -> n + else n in let n = loop 0 in if n < 2 then raise Fail @@ -1735,26 +1418,8 @@ let email_address st = | 'a' .. 
'z' | 'A' .. 'Z' | '0' .. '9' - | '.' - | '!' - | '#' - | '$' - | '%' - | '&' - | '\'' - | '*' - | '+' - | '/' - | '=' - | '?' - | '^' - | '_' - | '`' - | '{' - | '|' - | '}' - | '~' - | '-' -> + | '.' | '!' | '#' | '$' | '%' | '&' | '\'' | '*' | '+' | '/' | '=' | '?' + | '^' | '_' | '`' | '{' | '|' | '}' | '~' | '-' -> junk st; loop (succ n) | '@' -> @@ -1762,9 +1427,7 @@ let email_address st = let label st = let let_dig st = match peek_exn st with - | 'a' .. 'z' - | 'A' .. 'Z' - | '0' .. '9' -> + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> junk st; false | '-' -> @@ -1812,10 +1475,7 @@ let get_buf buf = s let text buf acc = - if Buffer.length buf = 0 then - acc - else - Pre.R (Text ([], get_buf buf)) :: acc + if Buffer.length buf = 0 then acc else Pre.R (Text ([], get_buf buf)) :: acc let inline_pre buf acc st = let pos = pos st in @@ -1830,16 +1490,13 @@ let inline_pre buf acc st = let finish () = let content = Buffer.contents bufcode in let content = - if String.for_all (fun c -> c = ' ') content then - content + if String.for_all (fun c -> c = ' ') content then content else if String.length content >= 2 && content.[0] = ' ' && content.[String.length content - 1] = ' ' - then - String.sub content 1 (String.length content - 2) - else - content + then String.sub content 1 (String.length content - 2) + else content in let attr = inline_attribute_string st in Pre.R (Code (attr, content)) :: acc @@ -1852,12 +1509,7 @@ let inline_pre buf acc st = | _ when m = n -> finish () | Some ((' ' | '\t' | '\010' .. 
'\013') as c) -> if m > 0 then Buffer.add_string bufcode (String.make m '`'); - Buffer.add_char - bufcode - (if c = '\010' then - ' ' - else - c); + Buffer.add_char bufcode (if c = '\010' then ' ' else c); junk st; gobble_body (start && m = 0) 0 | Some c -> @@ -1913,8 +1565,8 @@ let rec inline defs st = if peek_after '\000' st = ']' then ( junk st; junk st; - reflink lab - ) else + reflink lab) + else match protect (link_label false) st with | _ -> set_pos st off0; @@ -1928,9 +1580,7 @@ let rec inline defs st = junk st; loop (Left_bracket kind :: text acc) st | exception Fail -> reflink lab) - | Some _ - | None -> - reflink lab) + | Some _ | None -> reflink lab) | exception Fail -> junk st; loop (Left_bracket kind :: text acc) st @@ -1941,22 +1591,22 @@ let rec inline defs st = | def -> let attr = inline_attribute_string st in loop (Pre.R (Link (attr, def)) :: text acc) st - | exception Fail -> - match - protect - (closing_tag - ||| open_tag - ||| html_comment - ||| declaration - ||| cdata_section - ||| processing_instruction) - st - with - | tag -> loop (Pre.R (Html ([], tag)) :: text acc) st - | exception Fail -> - junk st; - Buffer.add_char buf c; - loop acc st) + | exception Fail -> ( + match + protect + (closing_tag + ||| open_tag + ||| html_comment + ||| declaration + ||| cdata_section + ||| processing_instruction) + st + with + | tag -> loop (Pre.R (Html ([], tag)) :: text acc) st + | exception Fail -> + junk st; + Buffer.add_char buf c; + loop acc st)) | '\n' -> junk st; sp st; @@ -1972,8 +1622,7 @@ let rec inline defs st = Buffer.add_string buf " "; loop acc st) | Some '\n' -> loop acc st - | Some _ - | None -> + | Some _ | None -> Buffer.add_char buf c; loop acc st) | '`' -> loop (inline_pre buf acc st) st @@ -1987,16 +1636,14 @@ let rec inline defs st = junk st; Buffer.add_char buf c; loop acc st - | Some _ - | None -> + | Some _ | None -> Buffer.add_char buf c; loop acc st) | '!' 
as c -> ( junk st; match peek st with | Some '[' -> reference_link Img (text acc) st - | Some _ - | None -> + | Some _ | None -> Buffer.add_char buf c; loop acc st) | '&' -> @@ -2062,8 +1709,7 @@ let rec inline defs st = Buffer.add_char buf ']'; set_pos st off1; loop acc st) - | Some _ - | None -> + | Some _ | None -> Buffer.add_char buf ']'; loop acc st) | (Pre.R (Link _) as x) :: acc' -> aux true (x :: xs) acc' @@ -2079,12 +1725,7 @@ let rec inline defs st = let f post n st = let pre = pre |> Pre.classify_delim in let post = post |> Pre.classify_delim in - let e = - if c = '*' then - Pre.Star - else - Pre.Underscore - in + let e = if c = '*' then Pre.Star else Pre.Underscore in loop (Pre.Emph (pre, post, e, n) :: text acc) st in let rec aux n = @@ -2132,18 +1773,13 @@ let link_reference_definition st : attributes Ast.link_def = | Some '\n' when not seen_nl -> junk st; loop true - | Some _ - | None -> - () + | Some _ | None -> () in loop false in let ws1 st = match next st with - | ' ' - | '\t' - | '\010' .. '\013' -> - ws st + | ' ' | '\t' | '\010' .. '\013' -> ws st | _ -> raise Fail in ignore (sp3 st); diff --git a/src/sexp.ml b/src/sexp.ml index 86caba9a..3ba00ccf 100644 --- a/src/sexp.ml +++ b/src/sexp.ml @@ -7,11 +7,7 @@ type t = let atom s = Atom s let rec link { label; destination; title; _ } = - let title = - match title with - | Some title -> [ Atom title ] - | None -> [] - in + let title = match title with Some title -> [ Atom title ] | None -> [] in List (Atom "link" :: inline label :: Atom destination :: title) and inline = function @@ -53,15 +49,10 @@ let create ast = List (List.map block ast) let needs_quotes s = let rec loop i = - if i >= String.length s then - false + if i >= String.length s then false else match s.[i] with - | ' ' - | '\t' - | '\x00' .. '\x1F' - | '\x7F' .. '\x9F' -> - true + | ' ' | '\t' | '\x00' .. '\x1F' | '\x7F' .. 
'\x9F' -> true | _ -> loop (succ i) in loop 0 diff --git a/src/toc.ml b/src/toc.ml index cecdf751..38e5a4b6 100644 --- a/src/toc.ml +++ b/src/toc.ml @@ -9,12 +9,7 @@ let rec remove_links inline = | Link (_, link) -> link.label | Image (attr, link) -> Image (attr, { link with label = remove_links link.label }) - | Hard_break _ - | Soft_break _ - | Html _ - | Code _ - | Text _ -> - inline + | Hard_break _ | Soft_break _ | Html _ | Code _ | Text _ -> inline let headers = let remove_links_f = remove_links in @@ -25,18 +20,12 @@ let headers = (function | Heading (attr, level, inline) -> let inline = - if remove_links then - remove_links_f inline - else - inline + if remove_links then remove_links_f inline else inline in headers := (attr, level, inline) :: !headers | Blockquote (_, blocks) -> loop blocks | List (_, _, _, block_lists) -> List.iter loop block_lists - | Paragraph _ - | Thematic_break _ - | Html_block _ - | Definition_list _ + | Paragraph _ | Thematic_break _ | Html_block _ | Definition_list _ | Code_block _ -> ()) blocks @@ -56,16 +45,14 @@ let rec find_start headers level number subsections = match subsections with | [] -> headers (* no subsection to find *) | n :: subsections -> find_start headers (level + 1) n subsections - else - find_start tl level number subsections + else find_start tl level number subsections | (_, header_level, _) :: tl when header_level = level -> (* At proper [level]. Have we reached the [number] one? 
*) if number <= 1 then match subsections with | [] -> tl (* no subsection to find *) | n :: subsections -> find_start tl (level + 1) n subsections - else - find_start tl level (number - 1) subsections + else find_start tl level (number - 1) subsections | _ -> (* Sought [level] has not been found in the current section *) [] @@ -74,9 +61,7 @@ let unordered_list items = List ([], Bullet '*', Tight, items) let find_id attributes = List.find_map - (function - | k, v when String.equal "id" k -> Some v - | _ -> None) + (function k, v when String.equal "id" k -> Some v | _ -> None) attributes let link attributes label = @@ -125,6 +110,4 @@ let toc ?(start = []) ?(depth = 2) doc = in let len = List.length start in let toc, _ = make_toc headers ~min_level:(len + 1) ~max_level:(len + depth) in - match toc with - | [] -> [] - | _ -> [ unordered_list toc ] + match toc with [] -> [] | _ -> [ unordered_list toc ] diff --git a/tests/extract_tests.ml b/tests/extract_tests.ml index a727a4e4..ce727c9e 100644 --- a/tests/extract_tests.ml +++ b/tests/extract_tests.ml @@ -23,9 +23,7 @@ let begins_with s s' = String.length s >= String.length s' && String.sub s 0 (String.length s') = s' let test_delim = "````````````````````````````````" - let tab_re = Str.regexp_string "→" - let insert_tabs s = Str.global_replace tab_re "\t" s type test = @@ -64,14 +62,15 @@ let parse_test_spec filename = end in get_html () - end else begin + end + else begin add_line buf line; get_test () end in go (get_test () :: tests) (succ example) - end else - go tests example + end + else go tests example in go [] 1 @@ -116,7 +115,6 @@ let write_dune_file test_specs tests = (fun ppf -> List.iter (pp ppf) tests) let li_begin_re = Str.regexp_string "
<li>\n" - let li_end_re = Str.regexp_string "\n</li>" let normalize_html s = @@ -147,7 +145,6 @@ let spec = ] let test_specs = ref [] - let add_to_list l x = l := x :: !l let () = diff --git a/tests/omd.ml b/tests/omd.ml index a436a623..d0d86eba 100644 --- a/tests/omd.ml +++ b/tests/omd.ml @@ -8,7 +8,6 @@ let protect ~finally f = r let li_begin_re = Str.regexp_string "
<li>\n" - let li_end_re = Str.regexp_string "\n</li>" let normalize_html s = From b9df6b22dc4b3ef58292fb67a4b30077bcd511c2 Mon Sep 17 00:00:00 2001 From: Shon Feder Date: Mon, 23 May 2022 21:25:00 -0400 Subject: [PATCH 6/6] Add changelog entry --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index bdfa59c5..164d3998 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,8 @@ unreleased ---------- +- Update parser to support CommonMark Spec 0.30 (#266, @SquidDev) + - Preserve the order of input files in the HTML output to stdout (#258, @patricoferris)