-
Notifications
You must be signed in to change notification settings - Fork 230
/
MiniSoupTokeniserState.java
98 lines (92 loc) · 2.76 KB
/
MiniSoupTokeniserState.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
package org.jsoup.parser;
/**
 * Lexer state machine: the states, and the transitions between them, that drive the Tokeniser.
 *
 * <p>Each constant implements {@link #read(Tokeniser, CharacterReader)}, which inspects the
 * reader and then emits output through the tokeniser and/or moves it to another state. This is
 * a reduced state set: it recognises plain start tags, end tags and character data only.
 * Every branch either consumes input, emits something, or changes state, so that repeated calls
 * to {@code read} cannot stall on the same character.
 */
enum MiniSoupTokeniserState implements ITokeniserState {
    /**
     * Top-level state, outside of any tag. For input such as {@code <div>test</div>} this is the
     * state before the opening {@code <} and again while reading the text {@code test}.
     * Gathers characters until a tag opener is found.
     */
    Data {
        @Override
        public void read(Tokeniser t, CharacterReader r) {
            switch (r.current()) {
                case '<':
                    t.advanceTransition(TagOpen);
                    break;
                case '&':
                    // This tokeniser has no character-reference state, so '&' is emitted
                    // literally. It has to be consumed here: the default branch asks
                    // consumeToAny to stop at '&' (presumably without consuming it, as in
                    // upstream jsoup), which would otherwise yield "" and make no progress.
                    t.emit(r.consume());
                    break;
                case nullChar:
                    // Same reasoning as '&': report it, then pass the character through.
                    t.error(this);
                    t.emit(r.consume());
                    break;
                case eof:
                    t.emit(new Token.EOF());
                    break;
                default:
                    // Current char is ordinary text; take the whole run up to the next delimiter.
                    t.emit(r.consumeToAny('&', '<', nullChar));
                    break;
            }
        }
    },
    /**
     * Entered from {@link #Data} on {@code <}; the reader is on the character after it
     * (the {@code d} or the {@code /} in {@code <div>test</div>}).
     */
    TagOpen {
        @Override
        public void read(Tokeniser t, CharacterReader r) {
            switch (r.current()) {
                case '/':
                    t.advanceTransition(EndTagOpen);
                    break;
                default:
                    if (r.matchesLetter()) {
                        // Start tag: the letter is left unconsumed for TagName to read.
                        t.createTagPending(true);
                        t.transition(TagName);
                    } else {
                        // Not a tag after all: report it and hand the '<' back as text.
                        t.error(this);
                        t.emit('<'); // char that got us here
                        t.transition(Data);
                    }
                    break;
            }
        }
    },
    /**
     * Entered from {@link #TagOpen} on {@code </}; the reader is on the first character of the
     * end tag's name (the second {@code d} in {@code <div>test</div>}).
     */
    EndTagOpen {
        @Override
        public void read(Tokeniser t, CharacterReader r) {
            if (r.isEmpty()) {
                // Input ended right after "</": emit it as text and let Data report EOF.
                t.eofError(this);
                t.emit("</");
                t.transition(Data);
            } else if (r.matchesLetter()) {
                // End tag: create the pending (non-start) tag and let TagName read its name.
                t.createTagPending(false);
                t.transition(TagName);
            } else if (r.matches('>')) {
                // "</>" is an error; advance and go back to Data.
                t.error(this);
                t.advanceTransition(Data);
            } else {
                // Anything else cannot start a tag name. Mirror TagOpen's fallback: report it,
                // give the "</" back as text and resume in Data without consuming.
                t.error(this);
                t.emit("</");
                t.transition(Data);
            }
        }
    },
    /**
     * Reads a tag name. Entered from {@link #TagOpen} or {@link #EndTagOpen} with a start or end
     * tag pending and the reader on the first letter of the name.
     */
    TagName {
        @Override
        public void read(Tokeniser t, CharacterReader r) {
            // The previous state did NOT consume, so the current char is the first letter.
            // Locale.ROOT keeps lower-casing independent of the JVM's default locale
            // (e.g. "TITLE" must not become "tıtle" under a Turkish locale).
            String tagName = r.consumeToAny('\t', '\n', '\r', '\f', ' ', '/', '>', nullChar)
                    .toLowerCase(java.util.Locale.ROOT);
            t.tagPending.appendTagName(tagName);
            switch (r.consume()) {
                case '>':
                    t.emitTagPending();
                    t.transition(Data);
                    break;
                case eof:
                    // Input ended inside the tag: report it and let Data emit the EOF token
                    // instead of staying in this state.
                    t.eofError(this);
                    t.transition(Data);
                    break;
                default:
                    // Whitespace, '/' or NUL: the delimiter has been consumed and is dropped,
                    // and the state is unchanged, so the next read keeps appending to the same
                    // pending tag name.
                    // NOTE(review): attributes and self-closing tags are not modelled here.
                    break;
            }
        }
    };

    /**
     * Performs one step of tokenisation for this state.
     *
     * @param t tokeniser that receives emitted output, errors and state transitions
     * @param r reader positioned at the next unconsumed input character
     */
    public abstract void read(Tokeniser t, CharacterReader r);

    /** The NUL character, treated as a delimiter by the states above. */
    private static final char nullChar = '\u0000';
    /** End-of-input sentinel, as defined by {@code CharacterReader.EOF}. */
    private static final char eof = CharacterReader.EOF;
}