// parser.go

package mark

import (
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// parse holds the state of one parsing pass: the token source with its
// lookahead buffer, the nodes produced so far, and the tables shared with
// rendering. Nested content (block quotes, list items) is parsed by a child
// parse whose tr field points back at its parent.
type parse struct {
	Nodes     []Node                  // nodes produced by parse, in the order they were appended
	lex       Lexer                   // token source, read through next and peek
	options   *Options                // options handed to newParse
	tr        *parse                  // parent parser for nested parses; nil on the root (see root)
	output    string                  // rendered text accumulated by render
	peekCount int                     // number of buffered entries in token still to be returned by next
	token     [3]item                 // three-token lookahead buffer used by next, peek, backup and backup2
	links     map[string]*DefLinkNode // link definitions keyed by lowercased name (filled by parseDefLink); presumably consulted when resolving reference links
	renderFn  map[NodeType]RenderFn   // custom render functions keyed by node type; render prefers these over Node.Render
}

// newParse builds a root parser for input: it attaches a fresh lexer over the
// text, stores opts, and starts with empty link-definition and custom-render
// tables.
func newParse(input string, opts *Options) *parse {
	p := new(parse)
	p.lex = lex(input)
	p.options = opts
	p.links = map[string]*DefLinkNode{}
	p.renderFn = map[NodeType]RenderFn{}
	return p
}

// parse consumes block-level tokens until it sees an itemEOF or itemError
// token, builds a node for each block and appends it to p.Nodes.
func (p *parse) parse() {
Loop:
	for {
		var n Node
		switch t := p.peek(); t.typ {
		case itemEOF, itemError:
			break Loop
		case itemNewLine:
			// Newlines between blocks produce no node; just consume them.
			p.next()
		case itemHr:
			n = p.newHr(p.next().pos)
		case itemHTML:
			t = p.next()
			n = p.newHTML(t.pos, t.val)
		case itemDefLink:
			n = p.parseDefLink()
		case itemHeading, itemLHeading:
			n = p.parseHeading()
		case itemCodeBlock, itemGfmCodeBlock:
			n = p.parseCodeBlock()
		case itemList:
			n = p.parseList()
		case itemTable, itemLpTable:
			n = p.parseTable()
		case itemBlockQuote:
			n = p.parseBlockQuote()
		case itemIndent:
			space := p.next()
			// An indent that is not followed by itemText is simply dropped.
			if p.peek().typ != itemText {
				continue
			}
			// Push the indent back in front of the peeked text token so the
			// paragraph branch below reads it again via p.next(), and
			// scanLines then picks up the text that follows it.
			p.backup2(space)
			fallthrough
		// Every other token type (itemText in particular) starts a paragraph.
		default:
			// The paragraph's inline content is its first token's value plus
			// the continuation lines gathered by scanLines.
			tmp := p.newParagraph(t.pos)
			tmp.Nodes = p.parseText(p.next().val + p.scanLines())
			n = tmp
		}
		// Branches that only consume input leave n nil and append nothing.
		if n != nil {
			p.append(n)
		}
	}
}

// root returns the outermost parser: it follows the tr (parent) links upward
// until it reaches a parser that has none.
func (p *parse) root() *parse {
	top := p
	for top.tr != nil {
		top = top.tr
	}
	return top
}

// render renders every node in p.Nodes and appends the result to p.output.
//
// A node whose type has an entry in p.renderFn is rendered by that custom
// function; every other node is rendered with its own Render method. A newline
// is written after each non-empty rendering except the one for the last node.
// The text is assembled in a strings.Builder so that rendering many nodes does
// not repeatedly copy the output accumulated so far.
func (p *parse) render() {
	var b strings.Builder
	last := len(p.Nodes) - 1
	for i, node := range p.Nodes {
		var out string
		// If there's a custom render function, use it instead.
		if fn, ok := p.renderFn[node.Type()]; ok {
			out = fn(node)
		} else {
			out = node.Render()
		}
		b.WriteString(out)
		if out != "" && i != last {
			b.WriteByte('\n')
		}
	}
	p.output += b.String()
}

// append adds n to the end of the parser's node list.
func (p *parse) append(n Node) {
	p.Nodes = append(p.Nodes, n)
}

// next consumes and returns the next token. Tokens waiting in the lookahead
// buffer are handed out first; only when the buffer is empty is a new token
// read from the lexer (and remembered in slot 0 so backup can replay it).
func (p *parse) next() item {
	if p.peekCount > 0 {
		p.peekCount--
		return p.token[p.peekCount]
	}
	tok := p.lex.nextItem()
	p.token[0] = tok
	return tok
}

// peek returns the token that the following call to next will return, without
// consuming it. If nothing is buffered, one token is read from the lexer into
// the lookahead buffer first.
func (p *parse) peek() item {
	if p.peekCount <= 0 {
		p.peekCount = 1
		p.token[0] = p.lex.nextItem()
	}
	return p.token[p.peekCount-1]
}

// backup backs the input stream up one token, so that the token most recently
// returned by next is returned again.
func (p *parse) backup() {
	p.peekCount++
}

// backup2 backs the input stream up two tokens. The later of the two is
// expected to be sitting in slot 0 of the lookahead buffer already; t1 is the
// earlier one and is placed in slot 1 so that next returns it first.
func (p *parse) backup2(t1 item) {
	p.peekCount = 2
	p.token[1] = t1
}

// reTextPadding matches the runs of spaces parseText considers for trimming:
// spaces at the start of a line, and spaces at the end of a line together with
// the newline that ends it (if any). It is compiled once here rather than on
// every parseText call, because parseText recurses for nested inline content.
var reTextPadding = regexp.MustCompile(`(?m)^ +| +(\n|$)`)

// parseText lexes input as inline markup and returns one node per inline
// token, in token order.
//
// Before lexing, the spaces at the start and end of every line are removed,
// except for runs that reBr matches, which are left untouched so the inline
// lexer still sees them. Link text, reference-link text and emphasis content
// are parsed recursively. Token types without a dedicated case become plain
// text nodes.
func (p *parse) parseText(input string) (nodes []Node) {
	// Trim whitespace that is not a line break.
	input = reTextPadding.ReplaceAllStringFunc(input, func(s string) string {
		if reBr.MatchString(s) {
			return s
		}
		return strings.Replace(s, " ", "", -1)
	})
	l := lexInline(input)
	for token := range l.items {
		var node Node
		switch token.typ {
		case itemBr:
			node = p.newBr(token.pos)
		case itemStrong, itemItalic, itemStrike, itemCode:
			node = p.parseEmphasis(token.typ, token.pos, token.val)
		case itemLink, itemAutoLink, itemGfmLink:
			var title, href string
			var text []Node
			if token.typ == itemLink {
				// Inline link: capture 1 is the link text (itself inline
				// markup), captures 2 and 3 are the href and the title.
				match := reLink.FindStringSubmatch(token.val)
				text = p.parseText(match[1])
				href, title = match[2], match[3]
			} else {
				// Auto and GFM links use capture 1 both as the target and as
				// the visible text, and carry no title.
				var match []string
				if token.typ == itemGfmLink {
					match = reGfmLink.FindStringSubmatch(token.val)
				} else {
					match = reAutoLink.FindStringSubmatch(token.val)
				}
				href = match[1]
				text = append(text, p.newText(token.pos, match[1]))
			}
			node = p.newLink(token.pos, title, href, text...)
		case itemImage:
			// Captures are handed to newImage in the order 3, 2, 1
			// (presumably title, source, alt text; confirm against newImage).
			match := reImage.FindStringSubmatch(token.val)
			node = p.newImage(token.pos, match[3], match[2], match[1])
		case itemRefLink, itemRefImage:
			match := reRefLink.FindStringSubmatch(token.val)
			text, ref := match[1], match[2]
			// An empty reference means the text doubles as the reference.
			if ref == "" {
				ref = text
			}
			if token.typ == itemRefLink {
				node = p.newRefLink(token.typ, token.pos, token.val, ref, p.parseText(text))
			} else {
				node = p.newRefImage(token.typ, token.pos, token.val, ref, text)
			}
		case itemHTML:
			node = p.newHTML(token.pos, token.val)
		default:
			node = p.newText(token.pos, token.val)
		}
		nodes = append(nodes, node)
	}
	return nodes
}

// parseEmphasis builds the emphasis node for an inline strong, italic, strike
// or code token. The token's inner text is taken from the last capture group
// of the regexp belonging to typ, falling back to the first capture group when
// the last one is empty, and is then parsed as inline text.
func (p *parse) parseEmphasis(typ itemType, pos Pos, val string) *EmphasisNode {
	var pattern *regexp.Regexp
	switch typ {
	case itemStrong:
		pattern = reStrong
	case itemItalic:
		pattern = reItalic
	case itemStrike:
		pattern = reStrike
	case itemCode:
		pattern = reCode
	}
	node := p.newEmphasis(pos, typ)
	groups := pattern.FindStringSubmatch(val)
	inner := groups[len(groups)-1]
	if inner == "" {
		inner = groups[1]
	}
	node.Nodes = p.parseText(inner)
	return node
}

// parseHeading consumes a heading token and returns its node, with the
// heading text parsed as inline content.
//
// For itemHeading the level is the length of the first capture of reHeading
// and the text is the second capture. Any other token is treated as an
// underlined heading matched by reLHeading: equal signs give level 1, dashes
// give level 2.
func (p *parse) parseHeading() (node *HeadingNode) {
	token := p.next()
	var (
		level int
		text  string
	)
	switch token.typ {
	case itemHeading:
		m := reHeading.FindStringSubmatch(token.val)
		level, text = len(m[1]), m[2]
	default:
		m := reLHeading.FindStringSubmatch(token.val)
		text = m[1]
		level = 1
		if m[2] == "-" {
			level = 2
		}
	}
	node = p.newHeading(token.pos, level, text)
	node.Nodes = p.parseText(text)
	return node
}

// parseDefLink consumes a link-definition token and returns its node. The
// definition is also recorded in the root parser's links table under its
// lowercased name; when that name is already present the earlier definition
// is kept.
func (p *parse) parseDefLink() *DefLinkNode {
	token := p.next()
	parts := reDefLink.FindStringSubmatch(token.val)
	name, href, title := strings.ToLower(parts[1]), parts[2], parts[3]
	def := p.newDefLink(token.pos, name, href, title)
	defs := p.root().links
	if _, seen := defs[name]; seen {
		return def
	}
	defs[name] = def
	return def
}

// parseCodeBlock consumes a code-block token and returns its node.
//
// For a GFM code block the language is the third capture of reGfmCode, and the
// code is the token value with the bytes of the whole reGfmCode match cut off
// its front. Any other code block has no language; its text is the token value
// after reCodeBlock.trim.
func (p *parse) parseCodeBlock() *CodeNode {
	token := p.next()
	if token.typ != itemGfmCodeBlock {
		return p.newCode(token.pos, "", reCodeBlock.trim(token.val, ""))
	}
	opening := reGfmCode.FindStringSubmatch(token.val)
	return p.newCode(token.pos, opening[3], token.val[len(opening[0]):])
}

// reQuoteMarker matches the quote marker at the start of a line: optional
// leading spaces, a '>', and at most one following space. It is compiled once
// here instead of on every parseBlockQuote call.
var reQuoteMarker = regexp.MustCompile(`(?m)^ *> ?`)

// parseBlockQuote consumes a block-quote token and returns its node. The quote
// markers are stripped from every line and the remaining text is parsed by a
// child parser (whose parent is p); the nodes it produces become the quote's
// children.
func (p *parse) parseBlockQuote() (n *BlockQuoteNode) {
	token := p.next()
	raw := reQuoteMarker.ReplaceAllString(token.val, "")
	// TODO(a8m): doesn't work right now with defLink(inside the blockQuote)
	tr := &parse{lex: lex(raw), tr: p}
	tr.parse()
	n = p.newBlockQuote(token.pos)
	n.Nodes = tr.Nodes
	return n
}

// parseList consumes a list token and every list-item token (loose or not)
// that directly follows it, and returns the resulting list node. newList is
// told whether the list token's value starts with a digit (presumably marking
// the list as ordered; confirm against newList).
func (p *parse) parseList() *ListNode {
	start := p.next()
	list := p.newList(start.pos, isDigit(start.val))
	for {
		typ := p.peek().typ
		if typ != itemLooseItem && typ != itemListItem {
			return list
		}
		list.append(p.parseListItem())
	}
}

// parseListItem consumes one list-item token and returns its node.
//
// A task item (see isTaskItem) is handled by parseTaskItem. Otherwise the
// trimmed item text is parsed by a child parser. For an itemListItem token,
// paragraphs produced by the child are unwrapped and their inline nodes are
// added to the item directly; for other item tokens (loose items) the
// paragraph nodes are kept as they are.
func (p *parse) parseListItem() *ListItemNode {
	token := p.next()
	li := p.newListItem(token.pos)
	token.val = strings.TrimSpace(token.val)
	if p.isTaskItem(token.val) {
		li.Nodes = p.parseTaskItem(token)
		return li
	}
	sub := &parse{lex: lex(token.val), tr: p}
	sub.parse()
	unwrap := token.typ == itemListItem
	for _, child := range sub.Nodes {
		para, isPara := child.(*ParagraphNode)
		if isPara && unwrap {
			li.Nodes = append(li.Nodes, para.Nodes...)
			continue
		}
		li.append(child)
	}
	return li
}

// parseTaskItem turns a task list item into nodes: a checkbox, which is
// checked when the character between the brackets is 'x', followed by the
// inline nodes of the text after the three-byte "[?]" marker.
func (p *parse) parseTaskItem(token item) []Node {
	checked := token.val[1] == 'x'
	nodes := []Node{p.newCheckbox(token.pos, checked)}
	label := strings.TrimSpace(token.val[3:])
	return append(nodes, p.parseText(label)...)
}

// isTaskItem reports whether s looks like a task list item: at least five
// bytes long, starting with "[x]" or "[ ]", and with something other than
// whitespace after that marker.
func (p *parse) isTaskItem(s string) bool {
	if len(s) < 5 {
		return false
	}
	if !strings.HasPrefix(s, "[x]") && !strings.HasPrefix(s, "[ ]") {
		return false
	}
	return strings.TrimSpace(s[3:]) != ""
}

// parseTable consumes a table token together with the row and cell tokens
// that follow it and returns the table node.
//
// Rows are counted as their itemTableRow tokens arrive: cells of row 1 form
// the header, cells of row 2 are alignment specifiers (see parseAlign), and
// cells of every later row are body data. The first token that is neither a
// row nor a cell ends the table and is pushed back for the caller.
func (p *parse) parseTable() *TableNode {
	table := p.newTable(p.next().pos)
	var (
		aligns []AlignType
		header []item
		body   [][]item
	)
	rowNum := 0
Scan:
	for {
		tok := p.next()
		switch tok.typ {
		case itemTableRow:
			rowNum++
			// Rows after the header and the alignment row open a new body row.
			if rowNum > 2 {
				body = append(body, []item{})
			}
		case itemTableCell:
			switch rowNum {
			case 1:
				header = append(header, tok)
			case 2:
				aligns = append(aligns, parseAlign(tok.val))
			default:
				idx := rowNum - 3
				body[idx] = append(body[idx], tok)
			}
		default:
			p.backup()
			break Scan
		}
	}
	// Transform the collected tokens into row nodes: header first, then body.
	table.append(p.parseCells(Header, header, aligns))
	for _, cells := range body {
		table.append(p.parseCells(Data, cells, aligns))
	}
	return table
}

// parseCells builds a row node from the given cell tokens.
//
// kind is passed through to every cell (callers in this file use Header or
// Data). align supplies the alignment per column; a cell whose column has no
// entry in align gets the zero AlignType instead of causing an out-of-range
// panic, so a row with more cells than the alignment row is still parsed.
// Each cell's text is parsed as inline content.
//
// The row is positioned at its first cell. When items is empty no row is
// created and the returned pointer is nil.
func (p *parse) parseCells(kind int, items []item, align []AlignType) *RowNode {
	var row *RowNode
	for i, cellTok := range items {
		if i == 0 {
			row = p.newRow(cellTok.pos)
		}
		var cellAlign AlignType
		if i < len(align) {
			cellAlign = align[i]
		}
		cell := p.newCell(cellTok.pos, kind, cellAlign)
		cell.Nodes = p.parseText(cellTok.val)
		row.append(cell)
	}
	return row
}

// scanLines consumes the tokens that continue the current paragraph and
// returns their values concatenated.
//
// Text and indent tokens are always taken. A newline is taken only when the
// token after it is text or indent; otherwise the newline and the token after
// it are both pushed back and scanning stops. Any other token is pushed back
// and ends the scan. The result is assembled in a strings.Builder so long
// paragraphs do not repeatedly copy the text gathered so far.
func (p *parse) scanLines() (s string) {
	var b strings.Builder
	for {
		tkn := p.next()
		switch tkn.typ {
		case itemText, itemIndent:
			b.WriteString(tkn.val)
			continue
		case itemNewLine:
			if t := p.peek().typ; t == itemText || t == itemIndent {
				b.WriteString(tkn.val)
				continue
			}
			// The peeked token sits in slot 0; put the newline back in
			// front of it so the caller sees both again.
			p.backup2(tkn)
		default:
			p.backup()
		}
		return b.String()
	}
}

// parseAlign maps a table alignment specifier to its AlignType: a colon on
// both ends means Center, only a trailing colon means Right, only a leading
// colon means Left, and anything else yields the zero AlignType.
func parseAlign(s string) (typ AlignType) {
	leading := strings.HasPrefix(s, ":")
	trailing := strings.HasSuffix(s, ":")
	if leading && trailing {
		return Center
	}
	if trailing {
		return Right
	}
	if leading {
		return Left
	}
	return typ
}

// isDigit reports whether the first rune of s is a Unicode decimal digit.
// An empty string yields false.
func isDigit(s string) bool {
	if s == "" {
		return false
	}
	first, _ := utf8.DecodeRuneInString(s)
	return unicode.IsDigit(first)
}