imp(lexer): operate on u8 instead of char
This improved throughput from 230MB/s to 280MB/s, which is well
worth the while. In the case of JSON, only the values within strings are
potentially unicode; everything else is guaranteed to be ASCII.
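
To illustrate why this pays off, here is a minimal sketch (using the `src` literal from tests/lexer.rs): `.chars()` must decode UTF-8 on every step, while `.bytes()` hands out raw bytes unchanged, so the byte-based lexer does strictly less work per element.

fn main() {
    let src = r#"{ "face": "😂" }"#;
    // chars() decodes UTF-8 and yields 15 scalar values for this input...
    assert_eq!(src.chars().count(), 15);
    // ...while bytes() is a raw copy yielding 18 bytes (the emoji alone is 4).
    assert_eq!(src.bytes().count(), 18);
}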
Byron committed May 7, 2015
1 parent 50c9f81 commit d5a694d
Showing 5 changed files with 60 additions and 60 deletions.
4 changes: 2 additions & 2 deletions benches/usage.rs
@@ -115,7 +115,7 @@ const NULL_RIDDEN: &'static str = r##"
 #[bench]
 fn lexer_throughput_in_bytes(b: &mut test::Bencher) {
     b.iter(|| {
-        let it = Lexer::new(NULL_RIDDEN.chars());
+        let it = Lexer::new(NULL_RIDDEN.bytes());
         for t in it {
             test::black_box(t);
         }
@@ -128,7 +128,7 @@ fn lexer_throughput_in_bytes(b: &mut test::Bencher) {
 #[bench]
 fn filter_null_throughput_in_bytes(b: &mut test::Bencher) {
     b.iter(|| {
-        let f = FilterNull::new(Lexer::new(NULL_RIDDEN.chars()));
+        let f = FilterNull::new(Lexer::new(NULL_RIDDEN.bytes()));
         for t in f {
             test::black_box(t);
         }
4 changes: 2 additions & 2 deletions src/filter_null.rs
@@ -70,8 +70,8 @@ impl<I> Iterator for FilterNull<I> where I: Iterator<Item=Token>{
                     // WE HAVE A STR : STR triplete, and we forget it
                     // This works by just not putting it onto the ringbuffer
                     // See if there is a (optional) comma
-                    // If self.buf has anything, it must be commas !
-                    // Usually, it is only 0 or 1 !
+                    // If self.buf has anything, it must be a comma !
+                    // It is only 0 or 1 !
                     match self.next_token() {
                         Some(comma_candidate) => {
                             first_str_candidate =
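
For context, a usage sketch of the triplet-dropping behaviour described above, built from one of the cases in tests/filters.rs (the import path is assumed to match the one used in tests/lexer.rs):

use json_tools::{FilterNull, Lexer};

fn main() {
    let src = r#"{"s":null, "s":null, "s":true }"#;
    // The raw stream has 13 tokens; FilterNull forgets both `"s":null`
    // pairs together with their separating commas, leaving the 5 tokens
    // of `{ "s" : true }`.
    assert_eq!(Lexer::new(src.bytes()).count(), 13);
    assert_eq!(FilterNull::new(Lexer::new(src.bytes())).count(), 5);
}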
88 changes: 44 additions & 44 deletions src/lexer.rs
@@ -1,7 +1,7 @@
 /// A lexer for utf-8 encoded json data
-pub struct Lexer<I: Iterator<Item=char>> {
+pub struct Lexer<I: Iterator<Item=u8>> {
     chars: I,
-    next_char: Option<char>,
+    next_byte: Option<u8>,
     cursor: u64,
 }

@@ -34,7 +34,7 @@ pub enum TokenType {
 
     /// any json number, like `1.24123` or `123`
     // NOTE: We can't do numbers with our simplified lexer as it would require
-    // us to read a char just to see that it's not a number and thus the previous
+    // us to read a byte just to see that it's not a number and thus the previous
     // tokens are to be returned. But we cannot peek without drastically complicating
     // our so far quite speedy implementation.
     // Number,
@@ -67,24 +67,24 @@ pub struct Token {
     pub span: Span,
 }
 
-impl<I> Lexer<I> where I: Iterator<Item=char> {
+impl<I> Lexer<I> where I: Iterator<Item=u8> {
     /// Returns a new Lexer from a given character iterator.
     pub fn new(chars: I) -> Lexer<I> {
         Lexer {
             chars: chars,
-            next_char: None,
+            next_byte: None,
             cursor: 0,
         }
     }
 
-    fn put_back(&mut self, c: char) {
-        debug_assert!(self.next_char.is_none());
-        self.next_char = Some(c);
+    fn put_back(&mut self, c: u8) {
+        debug_assert!(self.next_byte.is_none());
+        self.next_byte = Some(c);
         self.cursor -= 1;
     }
 
-    fn next_char(&mut self) -> Option<char> {
-        match self.next_char.take() {
+    fn next_byte(&mut self) -> Option<u8> {
+        match self.next_byte.take() {
             Some(c) => {
                 self.cursor += 1;
                 Some(c)
@@ -108,18 +108,18 @@ enum Mode {
     // String parse mode: bool = ignore_next
     String(bool),
     // `null` parse mode: buf, buf-index
-    Null([char; 4], usize),
+    Null([u8; 4], usize),
     // `true` parse mode
-    True([char; 4], usize),
+    True([u8; 4], usize),
     // `false` parse mode
-    False([char; 5], usize),
+    False([u8; 5], usize),
     // `Number` parse mode
     Number,
     SlowPath,
 }
 
 impl<I> Iterator for Lexer<I>
-    where I: Iterator<Item=char> {
+    where I: Iterator<Item=u8> {
     type Item = Token;
 
     /// Lex the underlying character stream to generate tokens
@@ -130,23 +130,23 @@ impl<I> Iterator for Lexer<I>
         let mut state = Mode::SlowPath;
         let last_cursor = self.cursor;
 
-        while let Some(c) = self.next_char() {
+        while let Some(c) = self.next_byte() {
             let mut set_cursor = |cursor| {
                 first = cursor - 1;
             };
 
             match state {
                 Mode::String(ref mut ign_next) => {
-                    if *ign_next && (c == '"' || c == '\\') {
+                    if *ign_next && (c == b'"' || c == b'\\') {
                         *ign_next = false;
                         continue;
                     }
                     match c {
-                        '"' => {
+                        b'"' => {
                             t = TokenType::String;
                             break;
                         },
-                        '\\' => {
+                        b'\\' => {
                             *ign_next = true;
                             continue;
                         },
@@ -158,8 +158,8 @@ impl<I> Iterator for Lexer<I>
                 Mode::Null(ref mut b, ref mut i) => {
                     b[*i] = c;
                     if *i == 3 {
-                        // we know b[0] is 'n'
-                        if b[1] == 'u' && b[2] == 'l' && b[3] == 'l' {
+                        // we know b[0] is b'n'
+                        if b[1] == b'u' && b[2] == b'l' && b[3] == b'l' {
                             t = TokenType::Null;
                         }
                         break;
@@ -170,9 +170,9 @@ impl<I> Iterator for Lexer<I>
                 },
                 Mode::Number => {
                     match c {
-                        '0' ... '9'
-                        |'-'
-                        |'.' => {
+                        b'0' ... b'9'
+                        |b'-'
+                        |b'.' => {
                             continue;
                         },
                         _ => {
@@ -185,8 +185,8 @@ impl<I> Iterator for Lexer<I>
                 Mode::True(ref mut b, ref mut i) => {
                     b[*i] = c;
                     if *i == 3 {
-                        // we know b[0] is 't'
-                        if b[1] == 'r' && b[2] == 'u' && b[3] == 'e' {
+                        // we know b[0] is b't'
+                        if b[1] == b'r' && b[2] == b'u' && b[3] == b'e' {
                             t = TokenType::BooleanTrue;
                         }
                         break;
@@ -198,8 +198,8 @@ impl<I> Iterator for Lexer<I>
                 Mode::False(ref mut b, ref mut i) => {
                     b[*i] = c;
                     if *i == 4 {
-                        // we know b[0] is 'f'
-                        if b[1] == 'a' && b[2] == 'l' && b[3] == 's' && b[4] == 'e' {
+                        // we know b[0] is b'f'
+                        if b[1] == b'a' && b[2] == b'l' && b[3] == b's' && b[4] == b'e' {
                             t = TokenType::BooleanFalse;
                         }
                         break;
@@ -210,35 +210,35 @@ impl<I> Iterator for Lexer<I>
                 },
                 Mode::SlowPath => {
                     match c {
-                        '{' => { t = TokenType::CurlyOpen; set_cursor(self.cursor); break; },
-                        '}' => { t = TokenType::CurlyClose; set_cursor(self.cursor); break; },
-                        '"' => {
+                        b'{' => { t = TokenType::CurlyOpen; set_cursor(self.cursor); break; },
+                        b'}' => { t = TokenType::CurlyClose; set_cursor(self.cursor); break; },
+                        b'"' => {
                             state = Mode::String(false);
                             set_cursor(self.cursor);
                         },
-                        'n' => {
-                            state = Mode::Null([c, 'x', 'x', 'x'], 1);
+                        b'n' => {
+                            state = Mode::Null([c, b'x', b'x', b'x'], 1);
                             set_cursor(self.cursor);
                         },
-                        '0' ... '9'
-                        |'-'
-                        |'.'=> {
+                        b'0' ... b'9'
+                        |b'-'
+                        |b'.'=> {
                             state = Mode::Number;
                             set_cursor(self.cursor);
                         },
-                        't' => {
-                            state = Mode::True([c, 'x', 'x', 'x'], 1);
+                        b't' => {
+                            state = Mode::True([c, b'x', b'x', b'x'], 1);
                             set_cursor(self.cursor);
                         },
-                        'f' => {
-                            state = Mode::False([c, 'x', 'x', 'x', 'x'], 1);
+                        b'f' => {
+                            state = Mode::False([c, b'x', b'x', b'x', b'x'], 1);
                             set_cursor(self.cursor);
                         },
-                        '[' => { t = TokenType::BracketOpen; set_cursor(self.cursor); break; },
-                        ']' => { t = TokenType::BracketClose; set_cursor(self.cursor); break; },
-                        ':' => { t = TokenType::Colon; set_cursor(self.cursor); break; },
-                        ',' => { t = TokenType::Comma; set_cursor(self.cursor); break; },
-                        '\\' => {
+                        b'[' => { t = TokenType::BracketOpen; set_cursor(self.cursor); break; },
+                        b']' => { t = TokenType::BracketClose; set_cursor(self.cursor); break; },
+                        b':' => { t = TokenType::Colon; set_cursor(self.cursor); break; },
+                        b',' => { t = TokenType::Comma; set_cursor(self.cursor); break; },
+                        b'\\' => {
                             // invalid
                             debug_assert_eq!(t, TokenType::Invalid);
                             set_cursor(self.cursor);
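
Why the byte-for-byte string scan stays correct for multi-byte UTF-8: every byte of a multi-byte scalar has its high bit set, so it can never collide with the ASCII `b'"'` (0x22) or `b'\\'` (0x5C) that the `Mode::String` arm compares against. A standalone check of that property (a sketch, not part of this commit):

fn main() {
    // Every byte of a multi-byte UTF-8 sequence is >= 0x80, so the
    // string scanner can treat it as opaque payload: it will never
    // look like a closing quote or an escape character.
    for b in "😂".bytes() {
        assert!(b >= 0x80);
        assert_ne!(b, b'"');
        assert_ne!(b, b'\\');
    }
}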
4 changes: 2 additions & 2 deletions tests/filters.rs
@@ -12,7 +12,7 @@ fn filter_null_values() {
         (r#"{"s":null, "s":null, "s":true }"#, 13, 5),
         (r#"{"s":true, "s":null, "s":true }"#, 13, 9),
         (r#"{"s":true, "s":null "s":true }"#, 12, 8),] {
-        assert_eq!(Lexer::new(src.chars()).count(), count);
-        assert_eq!(FilterNull::new(Lexer::new(src.chars())).count(), fcount);
+        assert_eq!(Lexer::new(src.bytes()).count(), count);
+        assert_eq!(FilterNull::new(Lexer::new(src.bytes())).count(), fcount);
     }
 }
20 changes: 10 additions & 10 deletions tests/lexer.rs
@@ -5,7 +5,7 @@ use json_tools::{Lexer, Token, Span, TokenType};
 #[test]
 fn string_value() {
     let src = r#"{ "face": "😂" }"#;
-    let mut it = Lexer::new(src.chars());
+    let mut it = Lexer::new(src.bytes());
 
     assert_eq!(it.next(), Some(Token { kind: TokenType::CurlyOpen,
                                        span: Span { first: 0,
@@ -18,24 +18,24 @@ fn string_value() {
                                                     end: 9 } }));
     assert_eq!(it.next(), Some(Token { kind: TokenType::String,
                                        span: Span { first: 10,
-                                                    end: 13 } }));
+                                                    end: 16 } }));
     assert_eq!(it.next(), Some(Token { kind: TokenType::CurlyClose,
-                                       span: Span { first: 14,
-                                                    end: 15 } }));
+                                       span: Span { first: 17,
+                                                    end: 18 } }));
 }


 #[test]
 fn string_escaping() {
     let src = r#"{"s":"\"in\""}"#;
-    let it = Lexer::new(src.chars());
+    let it = Lexer::new(src.bytes());
     assert_eq!(it.skip(3).next(), Some(Token { kind: TokenType::String,
                                                span: Span { first: 5,
                                                             end: 13 } }));
 
     // '\"' makes us ignore the beginning of the string, and we never hit the end
     let src = r#"{"s":\"foo"}"#;
-    let mut it = Lexer::new(src.chars());
+    let mut it = Lexer::new(src.bytes());
     // this is the '\' character - only valid within a string
     assert_eq!(it.by_ref().skip(3).next(), Some(Token { kind: TokenType::Invalid,
                                                         span: Span { first: 5,
@@ -54,7 +54,7 @@ fn string_escaping() {
 fn unclosed_string_value() {
     // '\"' makes us ignore the beginning of the string, and we never hit the end
     let src = r#"{"s":"f}"#;
-    let mut it = Lexer::new(src.chars());
+    let mut it = Lexer::new(src.bytes());
 
     // unclosed strings are invalid
     assert_eq!(it.by_ref().skip(3).next(), Some(Token { kind: TokenType::Invalid,
@@ -65,14 +65,14 @@ fn unclosed_string_value() {
 #[test]
 fn backslash_escapes_backslash_in_string_value() {
     let src = r#"{"s":"f\\"}"#;
-    let mut it = Lexer::new(src.chars());
+    let mut it = Lexer::new(src.bytes());
 
     assert_eq!(it.by_ref().skip(3).next(), Some(Token { kind: TokenType::String,
                                                         span: Span { first: 5,
                                                                      end: 10 } }));
 
     let src = r#"{"s":"f\"}"#;
-    let mut it = Lexer::new(src.chars());
+    let mut it = Lexer::new(src.bytes());
 
     assert_eq!(it.by_ref().skip(3).next(), Some(Token { kind: TokenType::Invalid,
                                                         span: Span { first: 5,
@@ -94,7 +94,7 @@ fn special_values_closed_and_unclosed() {
         (r#"{"v":-1.23}"#, TokenType::Number, 5, 10),
         (r#"{"v":1.}"#, TokenType::Number, 5, 7),
         (r#"{"v":.}"#, TokenType::Number, 5, 6),] {
-        assert_eq!(Lexer::new(src.chars()).skip(3).next(),
+        assert_eq!(Lexer::new(src.bytes()).skip(3).next(),
                    Some(Token { kind: kind.clone(),
                                 span: Span { first: first,
                                              end: end } }));
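
The adjusted expectations in string_value() follow directly from spans now being byte offsets rather than character indices; a small sketch of the arithmetic:

fn main() {
    let src = r#"{ "face": "😂" }"#;
    // The emoji is 4 bytes (0xF0 0x9F 0x98 0x82), so the quoted string
    // starting at byte 10 now ends at byte 16 instead of the old
    // char-based 13, and the closing brace moves from 14..15 to 17..18.
    assert_eq!("😂".len(), 4);
    assert_eq!(&src[10..16], r#""😂""#);
}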
