Skip to content

Commit

Permalink
Add a decode() function that looks for a BOM.
Browse files Browse the repository at this point in the history
Only works with a single string entirely in memory.

This is part of lifthrasiir#19,
although a complete fix for lifthrasiir#19 would also support incremental decoding.
  • Loading branch information
SimonSapin committed Dec 11, 2013
1 parent e379c2c commit 0c1d3f6
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,5 +127,26 @@ mod tests {
// corresponding rust-encoding native API:
assert_eq!(all::WINDOWS_949.decode(broken, DecodeReplace), Ok(~"\uc6b0\uc640\ufffd\uc559"));
}


#[test]
fn test_decode() {
fn test_one(input: &[u8], expected_result: &str, expected_encoding: &str) {
let (result, used_encoding) = decode(
input, DecodeStrict, all::ISO_8859_1 as &'static Encoding);
let result = result.unwrap();
assert_eq!(used_encoding.name(), expected_encoding);
assert_eq!(result.as_slice(), expected_result);
}

test_one([0xEF, 0xBB, 0xBF, 0xC3, 0xA9], "é", "utf-8");
test_one([0xC3, 0xA9], "é", "iso-8859-1");

test_one([0xFE, 0xFF, 0x00, 0xE9], "é", "utf-16be");
test_one([0x00, 0xE9], "\x00é", "iso-8859-1");

test_one([0xFF, 0xFE, 0xE9, 0x00], "é", "utf-16le");
test_one([0xE9, 0x00], \x00", "iso-8859-1");
}
}

33 changes: 33 additions & 0 deletions src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,36 @@ impl EncoderTrap {
}
}


/// Decode a single in-memory byte string, choosing the encoding from a
/// leading Byte Order Mark (BOM) when one is present.
///
/// A UTF-8 (`EF BB BF`), UTF-16BE (`FE FF`) or UTF-16LE (`FF FE`) prefix
/// selects that encoding and is skipped before decoding. Without such a
/// prefix, the whole input is decoded with `fallback_encoding`.
///
/// Returns the decoding result together with the encoding that was used.
pub fn decode(input: &[u8], trap: DecoderTrap, fallback_encoding: &'static Encoding)
        -> (Result<~str,SendStr>, &'static Encoding) {
    use all::{UTF_8, UTF_16LE, UTF_16BE};

    // First settle which encoding applies and how many prefix bytes to skip,
    // so the actual decoding happens in exactly one place below.
    let (encoding, bom_len) = if input.starts_with([0xEF, 0xBB, 0xBF]) {
        (UTF_8 as &'static Encoding, 3)
    } else if input.starts_with([0xFE, 0xFF]) {
        (UTF_16BE as &'static Encoding, 2)
    } else if input.starts_with([0xFF, 0xFE]) {
        (UTF_16LE as &'static Encoding, 2)
    } else {
        // No recognised BOM: nothing to skip.
        (fallback_encoding, 0)
    };

    (encoding.decode(input.slice_from(bom_len), trap), encoding)
}


// XXX backported from Rust 0.9-pre:

/// Private extension trait that provides a `starts_with` prefix test
/// for vectors/slices whose elements can be compared with `Eq`.
trait VecStartsWith<T:Eq> {
/// Returns true if `needle` is a prefix of the vector.
fn starts_with(&self, needle: &[T]) -> bool;
}

impl<'self,T:Eq> VecStartsWith<T> for &'self [T] {
    /// Prefix test: true when the first `needle.len()` elements of this
    /// slice are equal to `needle`.
    #[inline]
    fn starts_with(&self, needle: &[T]) -> bool {
        let prefix_len = needle.len();
        // A slice shorter than the needle can never start with it; checking
        // this first also keeps the `slice_to` call below within bounds.
        if self.len() < prefix_len {
            return false;
        }
        needle == self.slice_to(prefix_len)
    }
}

0 comments on commit 0c1d3f6

Please sign in to comment.