// lexer.go

package mark

import (
	"regexp"
	"strings"
	"unicode/utf8"
)

// Pos is an offset or length within the input string. The lexer uses it for
// positions (item.pos is documented as being in bytes) and for rune widths.
type Pos int

// itemType identifies the type of lex items; its values are the item*
// constants declared in this file.
type itemType int

// item represents a token or text string returned from the scanner.
type item struct {
	typ itemType // The type of this item.
	pos Pos      // The starting position, in bytes, of this item in the input string.
	val string   // The value of this item.
}

// eof is the sentinel value returned by next once the input is exhausted.
const eof = -1

// Item types produced by the lexer. itemError comes first, so it is the zero
// value of itemType and therefore the typ of the zero item that a receive on
// a closed items channel yields.
const (
	itemError itemType = iota // Error occurred; value is text of error
	itemEOF
	itemNewLine
	itemHTML
	itemHeading
	itemLHeading
	itemBlockQuote
	itemList
	itemListItem
	itemLooseItem
	itemCodeBlock
	itemGfmCodeBlock
	itemHr
	itemTable
	itemLpTable
	itemTableRow
	itemTableCell
	itemStrong
	itemItalic
	itemStrike
	itemCode
	itemLink
	itemDefLink
	itemRefLink
	itemAutoLink
	itemGfmLink
	itemImage
	itemRefImage
	itemText
	itemBr
	itemPipe
	itemIndent
)

// stateFn represents the state of the scanner as a function that returns the
// next state. A nil stateFn ends the loop in run.
type stateFn func(*lexer) stateFn

// Lexer is the item-source interface; it exists so that a lexer can be
// composed inside the parser.
type Lexer interface {
	// nextItem returns the next scanned item.
	nextItem() item
}

// lexer holds the state of the scanner. Items are produced by a goroutine
// started in lex or lexInline and delivered to the consumer over the items
// channel.
type lexer struct {
	input   string    // the string being scanned
	state   stateFn   // the next lexing function to enter
	pos     Pos       // current position in the input
	start   Pos       // start position of this item
	width   Pos       // width of last rune read from input
	lastPos Pos       // position of most recent item returned by nextItem
	items   chan item // channel of scanned items
}

// lex builds a block-level lexer over input and starts its state machine
// (run) in a new goroutine. Items are delivered on the lexer's unbuffered
// items channel.
func lex(input string) *lexer {
	l := new(lexer)
	l.input = input
	l.items = make(chan item)
	go l.run()
	return l
}

// lexInline builds a lexer for single-pass inline lexing of input and starts
// the inline scanner (the lexInline method) in a new goroutine.
func lexInline(input string) *lexer {
	l := new(lexer)
	l.input = input
	l.items = make(chan item)
	go l.lexInline()
	return l
}

// run drives the state machine: starting from lexAny it repeatedly invokes
// the current state function and stores the state it returns, stopping when
// a state returns nil. The items channel is closed once the loop ends.
func (l *lexer) run() {
	l.state = lexAny
	for l.state != nil {
		l.state = l.state(l)
	}
	close(l.items)
}

// next decodes and consumes the rune at the current position, recording its
// byte width so that backup can undo the read. At end of input it sets the
// width to zero and returns eof without moving.
func (l *lexer) next() rune {
	if int(l.pos) < len(l.input) {
		r, size := utf8.DecodeRuneInString(l.input[l.pos:])
		l.width = Pos(size)
		l.pos += l.width
		return r
	}
	l.width = 0
	return eof
}

// lexAny is the block-level dispatcher. It peeks at the next rune and hands
// control to the scanner that may recognise a block starting with it.
// Leading spaces that open neither an indented nor a fenced code block are
// consumed and emitted as a single itemIndent. Anything not claimed by a
// dedicated scanner is tried as a table and otherwise lexed as text.
func lexAny(l *lexer) stateFn {
	switch l.peek() {
	case '#':
		return lexHeading
	case '>':
		return lexBlockQuote
	case '<':
		return lexHTML
	case '[':
		return lexDefLink
	case '*', '-', '_':
		return lexHr
	case '`', '~':
		return lexGfmCode
	case '+', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return lexList
	case ' ':
		rest := l.input[l.pos:]
		if reCodeBlock.MatchString(rest) {
			return lexCode
		}
		if reGfmCode.MatchString(rest) {
			return lexGfmCode
		}
		// Swallow the whole run of spaces so the indentation is one item.
		for l.peek() == ' ' {
			l.next()
		}
		l.emit(itemIndent)
		return lexAny
	case '|':
		if reTable.itemLp.MatchString(l.input[l.pos:]) {
			l.emit(itemLpTable)
			return lexTable
		}
	}
	// Reached for every other rune, and for a '|' that does not open a
	// leading-pipe table.
	if reTable.item.MatchString(l.input[l.pos:]) {
		l.emit(itemTable)
		return lexTable
	}
	return lexText
}

// lexHeading checks whether a heading (as defined by reHeading) starts at the
// current position. If so it consumes the match, emits itemHeading and
// returns to lexAny; otherwise the input is lexed as plain text.
func lexHeading(l *lexer) stateFn {
	heading := reHeading.FindString(l.input[l.pos:])
	if heading == "" {
		return lexText
	}
	l.pos += Pos(len(heading))
	l.emit(itemHeading)
	return lexAny
}

// lexHr checks whether a horizontal rule (as defined by reHr) starts at the
// current position. If so it consumes the match, emits itemHr and returns to
// lexAny; otherwise the input is handed to lexList, since the same leading
// runes are routed here by lexAny before list scanning is tried.
func lexHr(l *lexer) stateFn {
	rule := reHr.FindString(l.input[l.pos:])
	if rule == "" {
		return lexList
	}
	l.pos += Pos(len(rule))
	l.emit(itemHr)
	return lexAny
}

// lexGfmCode scans a GFM fenced code block starting at the current position.
//
// The opening line is matched with reGfmCode; its first group is the fence's
// indentation and its second the fence itself. A closing pattern is then
// generated from the fence character and length, and matched against the
// input that follows the opening line. On success the whole block is
// consumed and a single itemGfmCodeBlock is emitted whose value is the
// opening line followed by the block content.
//
// If either pattern fails to match, nothing is consumed and the input is
// lexed as plain text instead. Previously a failed closing match was indexed
// unchecked, which would panic inside the lexer goroutine.
func lexGfmCode(l *lexer) stateFn {
	rest := l.input[l.pos:]
	open := reGfmCode.FindStringSubmatch(rest)
	if len(open) < 3 || open[2] == "" {
		return lexText
	}
	indent, fence := open[1], open[2]
	// Generate Regexp based on fence type[`~] and length
	reGfmEnd := reGfmCode.endGen(fence[0:1], len(fence))
	body := reGfmEnd.FindStringSubmatch(rest[len(open[0]):])
	if len(body) < 2 {
		return lexText
	}
	// Only advance once both the opening line and the body are known.
	l.pos += Pos(len(open[0]) + len(body[0]))
	content := body[1]
	// When the opening fence was indented, strip whatever reSpaceGen
	// generates for that width from the content (presumably the matching
	// leading indentation of each line; confirm against reSpaceGen).
	if n := len(indent); n > 0 {
		content = reSpaceGen(n).ReplaceAllString(content, "")
	}
	l.emit(itemGfmCodeBlock, open[0]+content)
	return lexAny
}

// lexCode consumes whatever reCodeBlock matches at the current position and
// emits it as itemCodeBlock, then returns to lexAny. lexAny only routes here
// after confirming that reCodeBlock matches.
func lexCode(l *lexer) stateFn {
	l.pos += Pos(len(reCodeBlock.FindString(l.input[l.pos:])))
	l.emit(itemCodeBlock)
	return lexAny
}

// lexText scans plain text up to the end of the line (or of the input).
//
// Text accumulated so far is emitted as itemText before the terminating item:
// itemNewLine for '\n', itemLHeading when a Setext-style heading matches at
// the current position, or itemEOF at end of input. A newline followed by
// four spaces does not end the text once some text has been read, because an
// indented code block cannot interrupt a paragraph (CM 4.4).
//
// NOTE(review): lexAny is returned even after itemEOF has been emitted, so
// at end of input the state machine keeps emitting itemEOF rather than
// reaching a nil state; confirm that consumers rely on this before changing.
func lexText(l *lexer) stateFn {
	// flush emits any pending text, then consumes width bytes and emits typ.
	flush := func(typ itemType, width Pos) {
		if l.start < l.pos {
			l.emit(itemText)
		}
		l.pos += width
		l.emit(typ)
	}
	for {
		r := l.peek()
		if r == eof {
			flush(itemEOF, Pos(0))
			return lexAny
		}
		if r == '\n' {
			// CM 4.4: An indented code block cannot interrupt a paragraph.
			if l.pos > l.start && strings.HasPrefix(l.input[l.pos+1:], "    ") {
				l.next()
				continue
			}
			// l.width still holds the width of the newline just peeked.
			flush(itemNewLine, l.width)
			return lexAny
		}
		// Test for Setext-style headers
		if heading := reLHeading.FindString(l.input[l.pos:]); heading != "" {
			flush(itemLHeading, Pos(len(heading)))
			return lexAny
		}
		l.next()
	}
}

// backup undoes the most recent call to next by moving the position back by
// the recorded rune width. It is only valid once per call of next.
func (l *lexer) backup() {
	l.pos = l.pos - l.width
}

// peek reports the next rune in the input without consuming it: the rune is
// read with next and the position is then restored. As with next, the width
// of the rune that was read stays recorded in l.width.
func (l *lexer) peek() rune {
	saved := l.pos
	r := l.next()
	l.pos = saved
	return r
}

// emit sends an item of type t to the client and marks the current position
// as the start of the next item. The item's value is s[0] when a value is
// supplied, otherwise the input between the item's start and the current
// position. The item's pos is always the current start.
func (l *lexer) emit(t itemType, s ...string) {
	var val string
	if len(s) > 0 {
		val = s[0]
	} else {
		val = l.input[l.start:l.pos]
	}
	l.items <- item{typ: t, pos: l.start, val: val}
	l.start = l.pos
}

// nextItem returns the next item from the items channel; it is called by the
// parser. It records the returned item's starting position in lastPos.
//
// lastPos is taken from the item itself rather than from l.pos: l.pos is
// written concurrently by the lexing goroutine, so reading it here would be a
// data race and would not reliably describe the item being returned.
// Once the items channel has been closed, the zero item is returned.
func (l *lexer) nextItem() item {
	it := <-l.items
	l.lastPos = it.pos
	return it
}

// reInlineEscape matches a backslash followed by one escapable punctuation
// character at the start of the input; group 1 captures the escaped
// character. It is compiled once at package scope instead of on every
// inline-lexing pass.
var reInlineEscape = regexp.MustCompile("^\\\\([\\`*{}\\[\\]()#+\\-.!_>~|])")

// lexInline performs single-pass lexing of inline content and closes the
// items channel when the input is exhausted.
//
// It walks the input rune by rune. Whenever an inline construct (escape, line
// break, emphasis, strike, code span, link, image, auto-link, HTML or GFM
// link) matches at the current position, any plain text accumulated before it
// is emitted as itemText, followed by an item for the construct itself. Runes
// that start no construct simply extend the pending text.
func (l *lexer) lexInline() {
	// flushText emits the text accumulated since the last item, if any.
	flushText := func() {
		if l.pos > l.start {
			l.emit(itemText)
		}
	}
	// emit drains pending text, then consumes width bytes as an item of typ.
	emit := func(typ itemType, width int) {
		flushText()
		l.pos += Pos(width)
		l.emit(typ)
	}
Loop:
	for {
		switch r := l.peek(); r {
		case eof:
			flushText()
			break Loop
		// backslash escaping
		case '\\':
			if m := reInlineEscape.FindStringSubmatch(l.input[l.pos:]); len(m) != 0 {
				flushText()
				l.pos += Pos(len(m[0]))
				// Emit only the escaped character, dropping the backslash.
				l.emit(itemText, m[1])
				break
			}
			// Not an escape: a backslash may still start a line break.
			fallthrough
		case ' ':
			if m := reBr.FindString(l.input[l.pos:]); m != "" {
				// pos - length of new-line
				emit(itemBr, len(m))
				break
			}
			l.next()
		case '_', '*', '~', '`':
			input := l.input[l.pos:]
			// Strong
			if m := reStrong.FindString(input); m != "" {
				emit(itemStrong, len(m))
				break
			}
			// Italic
			if m := reItalic.FindString(input); m != "" {
				emit(itemItalic, len(m))
				break
			}
			// Strike
			if m := reStrike.FindString(input); m != "" {
				emit(itemStrike, len(m))
				break
			}
			// InlineCode
			if m := reCode.FindString(input); m != "" {
				emit(itemCode, len(m))
				break
			}
			l.next()
		// itemLink, itemImage, itemRefLink, itemRefImage
		case '[', '!':
			input := l.input[l.pos:]
			if m := reLink.FindString(input); m != "" {
				if r == '[' {
					emit(itemLink, len(m))
				} else {
					emit(itemImage, len(m))
				}
				break
			}
			if m := reRefLink.FindString(input); m != "" {
				if r == '[' {
					emit(itemRefLink, len(m))
				} else {
					emit(itemRefImage, len(m))
				}
				break
			}
			l.next()
		// itemAutoLink, htmlBlock
		case '<':
			if m := reAutoLink.FindString(l.input[l.pos:]); m != "" {
				emit(itemAutoLink, len(m))
				break
			}
			if match, res := l.matchHTML(l.input[l.pos:]); match {
				emit(itemHTML, len(res))
				break
			}
			l.next()
		default:
			if m := reGfmLink.FindString(l.input[l.pos:]); m != "" {
				emit(itemGfmLink, len(m))
				break
			}
			l.next()
		}
	}
	close(l.items)
}

// lexHTML checks whether an HTML block (as recognised by matchHTML) starts at
// the current position. If so it consumes the match, emits itemHTML and
// returns to lexAny; otherwise the input is lexed as plain text.
func lexHTML(l *lexer) stateFn {
	ok, html := l.matchHTML(l.input[l.pos:])
	if !ok {
		return lexText
	}
	l.pos += Pos(len(html))
	l.emit(itemHTML)
	return lexAny
}

// matchHTML reports whether input starts with an HTML block and, if so,
// returns the matched text.
//
// A comment match is returned as is. For an element match, the element is
// rejected when its name matches reHTML.span (span-level elements are treated
// as text), accepted on its own when it is self-closed and is not an
// auto-link, and otherwise accepted only if the generated end-tag pattern
// for its name matches the input. For a CDATA opener the CDATA closer is
// used as the name when generating the end-tag pattern.
func (l *lexer) matchHTML(input string) (bool, string) {
	if comment := reHTML.comment.FindString(input); comment != "" {
		return true, comment
	}
	sub := reHTML.item.FindStringSubmatch(input)
	if len(sub) == 0 {
		return false, ""
	}
	el, name := sub[0], sub[1]
	switch {
	case reHTML.span.MatchString(name):
		// Span-level element: leave it to the text/inline lexer.
		return false, ""
	case strings.HasSuffix(el, "/>") && !reAutoLink.MatchString(el):
		// Self-closed element that is not an auto-link.
		return true, el
	}
	if name == reHTML.CDATA_OPEN {
		name = reHTML.CDATA_CLOSE
	}
	if block := reHTML.endTagGen(name).FindString(input); block != "" {
		return true, block
	}
	return false, ""
}

// lexDefLink checks whether a link definition (as defined by reDefLink)
// starts at the current position. If so it consumes the match, emits
// itemDefLink and returns to lexAny; otherwise the input is lexed as text.
func lexDefLink(l *lexer) stateFn {
	def := reDefLink.FindString(l.input[l.pos:])
	if def == "" {
		return lexText
	}
	l.pos += Pos(len(def))
	l.emit(itemDefLink)
	return lexAny
}

// lexList scans ordered and unordered lists.
//
// The raw list entries come from matchList; when nothing matches, the input
// is lexed as text. An itemList carrying the first entry's marker (group 1
// of reList.marker) is emitted first. Each entry is then consumed, stripped
// of its marker, and emitted with its surrounding whitespace trimmed, as
// either itemListItem or itemLooseItem. An entry is loose when
// reList.loose finds a non-blank match in it, when it finds any match and
// the entry is not the last one, or when the previous raw entry ends with a
// blank line.
func lexList(l *lexer) stateFn {
	ok, entries := l.matchList(l.input[l.pos:])
	if !ok {
		return lexText
	}
	last := len(entries) - 1
	for idx, raw := range entries {
		// Emit itemList once, ahead of the first entry.
		if idx == 0 {
			l.emit(itemList, reList.marker.FindStringSubmatch(raw)[1])
		}
		l.pos += Pos(len(raw))
		body := reList.marker.ReplaceAllString(raw, "")
		// Continuation lines are indented: strip the pattern generated for
		// the width of the removed marker.
		if strings.Contains(body, "\n ") {
			markerWidth := len(raw) - len(body)
			body = reSpaceGen(markerWidth).ReplaceAllString(body, "")
		}
		kind := itemListItem
		// Loose because of this entry's own content...
		for _, gap := range reList.loose.FindAllString(body, -1) {
			if idx != last || len(strings.TrimSpace(gap)) > 0 {
				kind = itemLooseItem
				break
			}
		}
		// ...or because the previous entry ended with a blank line.
		if kind != itemLooseItem && idx > 0 && strings.HasSuffix(entries[idx-1], "\n\n") {
			kind = itemLooseItem
		}
		l.emit(kind, strings.TrimSpace(body))
	}
	return lexAny
}

// matchList reports whether input starts with a list and, if so, returns the
// raw text of each entry at the depth of the first one.
//
// Depth is the length of group 1 of the reList.item match. After the first
// entry the input is consumed piece by piece: newlines are appended to the
// current entry, and scanning stops at a run of two or more newline
// characters, at a single newline followed by something that is neither a
// list item nor space-indented, or at a link definition or horizontal rule.
// A list item at the same depth starts a new entry; anything else (deeper
// items included) is appended line by line to the current entry.
func (l *lexer) matchList(input string) (bool, []string) {
	var res []string
	reItem := reList.item
	first := reItem.FindStringSubmatch(input)
	if first == nil {
		return false, res
	}
	cur, depth := first[0], len(first[1])
	input = input[len(cur):]
	for input != "" {
		// Count new-lines('\n')
		if nl := reList.scanNewLine(input); nl != "" {
			cur += nl
			input = input[len(nl):]
			if len(nl) >= 2 || (!reItem.MatchString(input) && !strings.HasPrefix(input, " ")) {
				break
			}
		}
		// DefLink or hr
		if reDefLink.MatchString(input) || reHr.MatchString(input) {
			break
		}
		// A list item at the same depth closes the current entry.
		if sibling := reItem.FindStringSubmatch(input); len(sibling) > 0 && len(sibling[1]) == depth {
			if cur != "" {
				res = append(res, cur)
			}
			cur = sibling[0]
			input = input[len(cur):]
			continue
		}
		line := reList.scanLine(input)
		cur += line
		input = input[len(line):]
	}
	// Flush the entry still being accumulated.
	if cur != "" {
		res = append(res, cur)
	}
	return true, res
}

// matchBlockQuote reports whether input starts with a blockquote (as defined
// by reBlockQuote) and returns the matched text. If one of the matched lines
// is a link definition or a horizontal rule, the match is cut just before
// that line.
func (l *lexer) matchBlockQuote(input string) (bool, string) {
	quote := reBlockQuote.FindString(input)
	if quote == "" {
		return false, quote
	}
	lines := strings.Split(quote, "\n")
	for idx := range lines {
		if ln := lines[idx]; reDefLink.MatchString(ln) || reHr.MatchString(ln) {
			// Keep only the lines that precede the interrupting one.
			return true, strings.Join(lines[:idx], "\n")
		}
	}
	return true, quote
}

// lexBlockQuote checks whether a blockquote (as recognised by
// matchBlockQuote) starts at the current position. If so it consumes the
// match, emits itemBlockQuote and returns to lexAny; otherwise the input is
// lexed as plain text.
func lexBlockQuote(l *lexer) stateFn {
	ok, quote := l.matchBlockQuote(l.input[l.pos:])
	if !ok {
		return lexText
	}
	l.pos += Pos(len(quote))
	l.emit(itemBlockQuote)
	return lexAny
}

// lexTable scans a table whose presence lexAny has already detected.
//
// The whole table is consumed without being emitted as a single item.
// Instead, for every non-empty row (capture groups 1 and 2, then group 3
// split on newlines) an empty-valued itemTableRow is emitted, followed by
// one itemTableCell per cell produced by reTable.trim and reTable.split.
//
// When the input starts with '|' the leading-pipe pattern is tried first,
// falling back to the plain pattern if it does not match. lexAny reaches
// this function for a '|' input both when the leading-pipe pattern matches
// and when only the plain pattern does; previously the second case indexed
// a nil match and panicked. If neither pattern matches, the input is lexed
// as text.
func lexTable(l *lexer) stateFn {
	input := l.input[l.pos:]
	var table []string
	if l.peek() == '|' {
		table = reTable.itemLp.FindStringSubmatch(input)
	}
	if table == nil {
		table = reTable.item.FindStringSubmatch(input)
	}
	if len(table) < 4 {
		return lexText
	}
	l.pos += Pos(len(table[0]))
	// Drop the consumed text: rows and cells carry explicit values.
	l.start = l.pos
	// Ignore the first match, and flat all rows(by splitting \n). A fresh
	// slice is built so the submatch slice is not overwritten by append.
	body := strings.Split(table[3], "\n")
	rows := make([]string, 0, 2+len(body))
	rows = append(rows, table[1], table[2])
	rows = append(rows, body...)
	for _, row := range rows {
		if row == "" {
			continue
		}
		l.emit(itemTableRow)
		rawCells := reTable.trim(row, "")
		cells := reTable.split(rawCells, -1)
		// Emit cells in the current row
		for _, cell := range cells {
			l.emit(itemTableCell, cell)
		}
	}
	return lexAny
}