|
|
|
@ -1,5 +1,4 @@
|
|
|
|
|
//! Module implementing the lexer cursor. This is used for managing the input byte stream.
|
|
|
|
|
|
|
|
|
|
use crate::{profiler::BoaProfiler, syntax::ast::Position}; |
|
|
|
|
use std::io::{self, Bytes, Error, ErrorKind, Read}; |
|
|
|
|
|
|
|
|
@ -57,22 +56,38 @@ where
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Peeks the next character.
|
|
|
|
|
/// Peeks the next byte.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek(&mut self) -> Result<Option<char>, Error> { |
|
|
|
|
pub(super) fn peek(&mut self) -> Result<Option<u8>, Error> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::peek()", "Lexing"); |
|
|
|
|
|
|
|
|
|
self.iter.peek_byte() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4).
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek_n(&mut self, n: u8) -> Result<u32, Error> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::peek_n()", "Lexing"); |
|
|
|
|
|
|
|
|
|
self.iter.peek_n_bytes(n) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Peeks the next UTF-8 character in u32 code point.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek_char(&mut self) -> Result<Option<u32>, Error> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::peek_char()", "Lexing"); |
|
|
|
|
|
|
|
|
|
self.iter.peek_char() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Compares the character passed in to the next character, if they match true is returned and the buffer is incremented
|
|
|
|
|
/// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn next_is(&mut self, peek: char) -> io::Result<bool> { |
|
|
|
|
pub(super) fn next_is(&mut self, byte: u8) -> io::Result<bool> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::next_is()", "Lexing"); |
|
|
|
|
|
|
|
|
|
Ok(match self.peek()? { |
|
|
|
|
Some(next) if next == peek => { |
|
|
|
|
let _ = self.iter.next_char(); |
|
|
|
|
Some(next) if next == byte => { |
|
|
|
|
let _ = self.next_byte()?; |
|
|
|
|
true |
|
|
|
|
} |
|
|
|
|
_ => false, |
|
|
|
@ -80,34 +95,57 @@ where
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Applies the predicate to the next character and returns the result.
|
|
|
|
|
/// Returns false if there is no next character.
|
|
|
|
|
/// Returns false if the next character is not a valid ascii or there is no next character.
|
|
|
|
|
/// Otherwise returns the result from the predicate on the ascii in char
|
|
|
|
|
///
|
|
|
|
|
/// The buffer is not incremented.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn next_is_pred<F>(&mut self, pred: &F) -> io::Result<bool> |
|
|
|
|
pub(super) fn next_is_ascii_pred<F>(&mut self, pred: &F) -> io::Result<bool> |
|
|
|
|
where |
|
|
|
|
F: Fn(char) -> bool, |
|
|
|
|
{ |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::next_is_pred()", "Lexing"); |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing"); |
|
|
|
|
|
|
|
|
|
Ok(match self.peek()? { |
|
|
|
|
Some(byte) => match byte { |
|
|
|
|
0..=0x7F => pred(char::from(byte)), |
|
|
|
|
_ => false, |
|
|
|
|
}, |
|
|
|
|
None => false, |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Applies the predicate to the next UTF-8 character and returns the result.
|
|
|
|
|
/// Returns false if there is no next character, otherwise returns the result from the
|
|
|
|
|
/// predicate on the ascii char
|
|
|
|
|
///
|
|
|
|
|
/// The buffer is not incremented.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool> |
|
|
|
|
where |
|
|
|
|
F: Fn(u32) -> bool, |
|
|
|
|
{ |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::next_is_char_pred()", "Lexing"); |
|
|
|
|
|
|
|
|
|
Ok(if let Some(peek) = self.peek()? { |
|
|
|
|
Ok(if let Some(peek) = self.peek_char()? { |
|
|
|
|
pred(peek) |
|
|
|
|
} else { |
|
|
|
|
false |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Fills the buffer with all characters until the stop character is found.
|
|
|
|
|
/// Fills the buffer with all bytes until the stop byte is found.
|
|
|
|
|
/// Returns error when reaching the end of the buffer.
|
|
|
|
|
///
|
|
|
|
|
/// Note: It will not add the stop character to the buffer.
|
|
|
|
|
pub(super) fn take_until(&mut self, stop: char, buf: &mut String) -> io::Result<()> { |
|
|
|
|
/// Note that all bytes up until the stop byte are added to the buffer, including the byte right before.
|
|
|
|
|
pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec<u8>) -> io::Result<()> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::take_until()", "Lexing"); |
|
|
|
|
|
|
|
|
|
loop { |
|
|
|
|
if self.next_is(stop)? { |
|
|
|
|
return Ok(()); |
|
|
|
|
} else if let Some(ch) = self.next_char()? { |
|
|
|
|
buf.push(ch); |
|
|
|
|
} else if let Some(byte) = self.next_byte()? { |
|
|
|
|
buf.push(byte); |
|
|
|
|
} else { |
|
|
|
|
return Err(io::Error::new( |
|
|
|
|
ErrorKind::UnexpectedEof, |
|
|
|
@ -117,21 +155,45 @@ where
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Fills the buffer with characters until the first character (x) for which the predicate (pred) is false
|
|
|
|
|
/// (or the next character is none).
|
|
|
|
|
/// Fills the buffer with characters until the first ascii character for which the predicate (pred) is false.
|
|
|
|
|
/// It also stops when the next character is not an ascii or there is no next character.
|
|
|
|
|
///
|
|
|
|
|
/// Note that all characters up until x are added to the buffer including the character right before.
|
|
|
|
|
pub(super) fn take_while_pred<F>(&mut self, buf: &mut String, pred: &F) -> io::Result<()> |
|
|
|
|
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
|
|
|
|
|
pub(super) fn take_while_ascii_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()> |
|
|
|
|
where |
|
|
|
|
F: Fn(char) -> bool, |
|
|
|
|
{ |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::take_while_pred()", "Lexing"); |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::take_while_ascii_pred()", "Lexing"); |
|
|
|
|
|
|
|
|
|
loop { |
|
|
|
|
if !self.next_is_ascii_pred(pred)? { |
|
|
|
|
return Ok(()); |
|
|
|
|
} else if let Some(byte) = self.next_byte()? { |
|
|
|
|
buf.push(byte); |
|
|
|
|
} else { |
|
|
|
|
// next_is_pred will return false if the next value is None so the None case should already be handled.
|
|
|
|
|
unreachable!(); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Fills the buffer with characters until the first character for which the predicate (pred) is false.
|
|
|
|
|
/// It also stops when there is no next character.
|
|
|
|
|
///
|
|
|
|
|
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
|
|
|
|
|
pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()> |
|
|
|
|
where |
|
|
|
|
F: Fn(u32) -> bool, |
|
|
|
|
{ |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::take_while_char_pred()", "Lexing"); |
|
|
|
|
|
|
|
|
|
loop { |
|
|
|
|
if !self.next_is_pred(pred)? { |
|
|
|
|
if !self.next_is_char_pred(pred)? { |
|
|
|
|
return Ok(()); |
|
|
|
|
} else if let Some(ch) = self.next_char()? { |
|
|
|
|
buf.push(ch); |
|
|
|
|
} else if let Some(ch) = self.peek_char()? { |
|
|
|
|
for _ in 0..utf8_len(ch) { |
|
|
|
|
buf.push(self.next_byte()?.unwrap()); |
|
|
|
|
} |
|
|
|
|
} else { |
|
|
|
|
// next_is_pred will return false if the next value is None so the None case should already be handled.
|
|
|
|
|
unreachable!(); |
|
|
|
@ -139,7 +201,7 @@ where
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// It will fill the buffer with checked ASCII bytes.
|
|
|
|
|
/// It will fill the buffer with bytes.
|
|
|
|
|
///
|
|
|
|
|
/// This expects for the buffer to be fully filled. If it's not, it will fail with an
|
|
|
|
|
/// `UnexpectedEof` I/O error.
|
|
|
|
@ -150,28 +212,63 @@ where
|
|
|
|
|
self.iter.fill_bytes(buf) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Retrieves the next byte.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(crate) fn next_byte(&mut self) -> Result<Option<u8>, Error> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::next_byte()", "Lexing"); |
|
|
|
|
|
|
|
|
|
let byte = self.iter.next_byte()?; |
|
|
|
|
|
|
|
|
|
match byte { |
|
|
|
|
Some(b'\r') => { |
|
|
|
|
// Try to take a newline if it's next, for windows "\r\n" newlines
|
|
|
|
|
// Otherwise, treat as a Mac OS9 bare '\r' newline
|
|
|
|
|
if self.peek()? == Some(b'\n') { |
|
|
|
|
let _ = self.iter.next_byte(); |
|
|
|
|
} |
|
|
|
|
self.next_line(); |
|
|
|
|
} |
|
|
|
|
Some(b'\n') => self.next_line(), |
|
|
|
|
Some(0xE2) => { |
|
|
|
|
// Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9)
|
|
|
|
|
let next_bytes = self.peek_n(2)?; |
|
|
|
|
if next_bytes == 0xA8_80 || next_bytes == 0xA9_80 { |
|
|
|
|
self.next_line(); |
|
|
|
|
} else { |
|
|
|
|
// 0xE2 is a utf8 first byte
|
|
|
|
|
self.next_column(); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
Some(b) if utf8_is_first_byte(b) => self.next_column(), |
|
|
|
|
_ => {} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Ok(byte) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Retrieves the next UTF-8 character.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(crate) fn next_char(&mut self) -> Result<Option<char>, Error> { |
|
|
|
|
pub(crate) fn next_char(&mut self) -> Result<Option<u32>, Error> { |
|
|
|
|
let _timer = BoaProfiler::global().start_event("cursor::next_char()", "Lexing"); |
|
|
|
|
|
|
|
|
|
let chr = self.iter.next_char()?; |
|
|
|
|
let ch = self.iter.next_char()?; |
|
|
|
|
|
|
|
|
|
match chr { |
|
|
|
|
Some('\r') => { |
|
|
|
|
match ch { |
|
|
|
|
Some(0xD) => { |
|
|
|
|
// Try to take a newline if it's next, for windows "\r\n" newlines
|
|
|
|
|
// Otherwise, treat as a Mac OS9 bare '\r' newline
|
|
|
|
|
if self.peek()? == Some('\n') { |
|
|
|
|
let _ = self.iter.next_char(); |
|
|
|
|
if self.peek()? == Some(0xA) { |
|
|
|
|
let _ = self.iter.next_byte(); |
|
|
|
|
} |
|
|
|
|
self.next_line(); |
|
|
|
|
} |
|
|
|
|
Some('\n') | Some('\u{2028}') | Some('\u{2029}') => self.next_line(), |
|
|
|
|
// '\n' | '\u{2028}' | '\u{2029}'
|
|
|
|
|
Some(0xA) | Some(0x2028) | Some(0x2029) => self.next_line(), |
|
|
|
|
Some(_) => self.next_column(), |
|
|
|
|
None => {} |
|
|
|
|
_ => {} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
Ok(chr) |
|
|
|
|
Ok(ch) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -179,7 +276,9 @@ where
|
|
|
|
|
#[derive(Debug)] |
|
|
|
|
struct InnerIter<R> { |
|
|
|
|
iter: Bytes<R>, |
|
|
|
|
peeked_char: Option<Option<char>>, |
|
|
|
|
num_peeked_bytes: u8, |
|
|
|
|
peeked_bytes: u32, |
|
|
|
|
peeked_char: Option<Option<u32>>, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl<R> InnerIter<R> { |
|
|
|
@ -188,6 +287,8 @@ impl<R> InnerIter<R> {
|
|
|
|
|
fn new(iter: Bytes<R>) -> Self { |
|
|
|
|
Self { |
|
|
|
|
iter, |
|
|
|
|
num_peeked_bytes: 0, |
|
|
|
|
peeked_bytes: 0, |
|
|
|
|
peeked_char: None, |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -197,14 +298,14 @@ impl<R> InnerIter<R>
|
|
|
|
|
where |
|
|
|
|
R: Read, |
|
|
|
|
{ |
|
|
|
|
/// It will fill the buffer with checked ASCII bytes.
|
|
|
|
|
/// It will fill the buffer with checked ascii bytes.
|
|
|
|
|
///
|
|
|
|
|
/// This expects for the buffer to be fully filled. If it's not, it will fail with an
|
|
|
|
|
/// `UnexpectedEof` I/O error.
|
|
|
|
|
#[inline] |
|
|
|
|
fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { |
|
|
|
|
for byte in buf.iter_mut() { |
|
|
|
|
*byte = self.next_ascii()?.ok_or_else(|| { |
|
|
|
|
*byte = self.next_byte()?.ok_or_else(|| { |
|
|
|
|
io::Error::new( |
|
|
|
|
io::ErrorKind::UnexpectedEof, |
|
|
|
|
"unexpected EOF when filling buffer", |
|
|
|
@ -214,90 +315,197 @@ where
|
|
|
|
|
Ok(()) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Peeks the next UTF-8 checked character.
|
|
|
|
|
/// Increments the iter by n bytes.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek_char(&mut self) -> Result<Option<char>, Error> { |
|
|
|
|
if let Some(v) = self.peeked_char { |
|
|
|
|
Ok(v) |
|
|
|
|
fn increment(&mut self, n: u32) -> Result<(), Error> { |
|
|
|
|
for _ in 0..n { |
|
|
|
|
if None == self.next_byte()? { |
|
|
|
|
break; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
Ok(()) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Peeks the next byte.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek_byte(&mut self) -> Result<Option<u8>, Error> { |
|
|
|
|
if self.num_peeked_bytes > 0 { |
|
|
|
|
let byte = self.peeked_bytes as u8; |
|
|
|
|
Ok(Some(byte)) |
|
|
|
|
} else { |
|
|
|
|
let chr = self.next_char()?; |
|
|
|
|
self.peeked_char = Some(chr); |
|
|
|
|
Ok(chr) |
|
|
|
|
match self.iter.next().transpose()? { |
|
|
|
|
Some(byte) => { |
|
|
|
|
self.num_peeked_bytes = 1; |
|
|
|
|
self.peeked_bytes = byte as u32; |
|
|
|
|
Ok(Some(byte)) |
|
|
|
|
} |
|
|
|
|
None => Ok(None), |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Retrieves the next UTF-8 checked character.
|
|
|
|
|
fn next_char(&mut self) -> io::Result<Option<char>> { |
|
|
|
|
if let Some(v) = self.peeked_char.take() { |
|
|
|
|
return Ok(v); |
|
|
|
|
/// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4).
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result<u32, Error> { |
|
|
|
|
while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 { |
|
|
|
|
match self.iter.next().transpose()? { |
|
|
|
|
Some(byte) => { |
|
|
|
|
self.peeked_bytes |= (byte as u32) << (self.num_peeked_bytes * 8); |
|
|
|
|
self.num_peeked_bytes += 1; |
|
|
|
|
} |
|
|
|
|
None => break, |
|
|
|
|
}; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
let first_byte = match self.iter.next().transpose()? { |
|
|
|
|
Some(b) => b, |
|
|
|
|
None => return Ok(None), |
|
|
|
|
}; |
|
|
|
|
match n { |
|
|
|
|
0 => Ok(0), |
|
|
|
|
1 => Ok(self.peeked_bytes & 0xFF), |
|
|
|
|
2 => Ok(self.peeked_bytes & 0xFFFF), |
|
|
|
|
3 => Ok(self.peeked_bytes & 0xFFFFFF), |
|
|
|
|
_ => Ok(self.peeked_bytes), |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
let chr: char = if first_byte < 0x80 { |
|
|
|
|
// 0b0xxx_xxxx
|
|
|
|
|
first_byte.into() |
|
|
|
|
/// Peeks the next unchecked character in u32 code point.
|
|
|
|
|
#[inline] |
|
|
|
|
pub(super) fn peek_char(&mut self) -> Result<Option<u32>, Error> { |
|
|
|
|
if let Some(ch) = self.peeked_char { |
|
|
|
|
Ok(ch) |
|
|
|
|
} else { |
|
|
|
|
let mut buf = [first_byte, 0u8, 0u8, 0u8]; |
|
|
|
|
let num_bytes = if first_byte < 0xE0 { |
|
|
|
|
// 0b110x_xxxx
|
|
|
|
|
2 |
|
|
|
|
} else if first_byte < 0xF0 { |
|
|
|
|
// 0b1110_xxxx
|
|
|
|
|
3 |
|
|
|
|
} else { |
|
|
|
|
// 0b1111_0xxx
|
|
|
|
|
4 |
|
|
|
|
// Decode UTF-8
|
|
|
|
|
let x = match self.peek_byte()? { |
|
|
|
|
Some(b) if b < 128 => { |
|
|
|
|
self.peeked_char = Some(Some(b as u32)); |
|
|
|
|
return Ok(Some(b as u32)); |
|
|
|
|
} |
|
|
|
|
Some(b) => b, |
|
|
|
|
None => { |
|
|
|
|
self.peeked_char = None; |
|
|
|
|
return Ok(None); |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
for b in buf.iter_mut().take(num_bytes).skip(1) { |
|
|
|
|
let next = match self.iter.next() { |
|
|
|
|
Some(Ok(b)) => b, |
|
|
|
|
Some(Err(e)) => return Err(e), |
|
|
|
|
None => { |
|
|
|
|
return Err(io::Error::new( |
|
|
|
|
io::ErrorKind::InvalidData, |
|
|
|
|
"stream did not contain valid UTF-8", |
|
|
|
|
)) |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
*b = next; |
|
|
|
|
// Multibyte case follows
|
|
|
|
|
// Decode from a byte combination out of: [[[x y] z] w]
|
|
|
|
|
// NOTE: Performance is sensitive to the exact formulation here
|
|
|
|
|
let init = utf8_first_byte(x, 2); |
|
|
|
|
let y = (self.peek_n_bytes(2)? >> 8) as u8; |
|
|
|
|
let mut ch = utf8_acc_cont_byte(init, y); |
|
|
|
|
if x >= 0xE0 { |
|
|
|
|
// [[x y z] w] case
|
|
|
|
|
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
|
|
|
|
|
let z = (self.peek_n_bytes(3)? >> 16) as u8; |
|
|
|
|
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); |
|
|
|
|
ch = init << 12 | y_z; |
|
|
|
|
if x >= 0xF0 { |
|
|
|
|
// [x y z w] case
|
|
|
|
|
// use only the lower 3 bits of `init`
|
|
|
|
|
let w = (self.peek_n_bytes(4)? >> 24) as u8; |
|
|
|
|
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
self.peeked_char = Some(Some(ch)); |
|
|
|
|
Ok(Some(ch)) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Retrieves the next byte
|
|
|
|
|
#[inline] |
|
|
|
|
fn next_byte(&mut self) -> io::Result<Option<u8>> { |
|
|
|
|
self.peeked_char = None; |
|
|
|
|
if self.num_peeked_bytes > 0 { |
|
|
|
|
let byte = (self.peeked_bytes & 0xFF) as u8; |
|
|
|
|
self.num_peeked_bytes -= 1; |
|
|
|
|
self.peeked_bytes >>= 8; |
|
|
|
|
Ok(Some(byte)) |
|
|
|
|
} else { |
|
|
|
|
self.iter.next().transpose() |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Retrieves the next unchecked char in u32 code point.
|
|
|
|
|
#[inline] |
|
|
|
|
fn next_char(&mut self) -> io::Result<Option<u32>> { |
|
|
|
|
if let Some(ch) = self.peeked_char.take() { |
|
|
|
|
if let Some(c) = ch { |
|
|
|
|
self.increment(utf8_len(c))?; |
|
|
|
|
} |
|
|
|
|
return Ok(ch); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if let Ok(s) = std::str::from_utf8(&buf) { |
|
|
|
|
if let Some(chr) = s.chars().next() { |
|
|
|
|
chr |
|
|
|
|
} else { |
|
|
|
|
return Err(io::Error::new( |
|
|
|
|
io::ErrorKind::InvalidData, |
|
|
|
|
"stream did not contain valid UTF-8", |
|
|
|
|
)); |
|
|
|
|
} |
|
|
|
|
} else { |
|
|
|
|
return Err(io::Error::new( |
|
|
|
|
io::ErrorKind::InvalidData, |
|
|
|
|
"stream did not contain valid UTF-8", |
|
|
|
|
)); |
|
|
|
|
// Decode UTF-8
|
|
|
|
|
let x = match self.next_byte()? { |
|
|
|
|
Some(b) if b < 128 => return Ok(Some(b as u32)), |
|
|
|
|
Some(b) => b, |
|
|
|
|
None => return Ok(None), |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
// Multibyte case follows
|
|
|
|
|
// Decode from a byte combination out of: [[[x y] z] w]
|
|
|
|
|
// NOTE: Performance is sensitive to the exact formulation here
|
|
|
|
|
let init = utf8_first_byte(x, 2); |
|
|
|
|
let y = unwrap_or_0(self.next_byte()?); |
|
|
|
|
let mut ch = utf8_acc_cont_byte(init, y); |
|
|
|
|
if x >= 0xE0 { |
|
|
|
|
// [[x y z] w] case
|
|
|
|
|
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
|
|
|
|
|
let z = unwrap_or_0(self.next_byte()?); |
|
|
|
|
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); |
|
|
|
|
ch = init << 12 | y_z; |
|
|
|
|
if x >= 0xF0 { |
|
|
|
|
// [x y z w] case
|
|
|
|
|
// use only the lower 3 bits of `init`
|
|
|
|
|
let w = unwrap_or_0(self.next_byte()?); |
|
|
|
|
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); |
|
|
|
|
} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
Ok(Some(chr)) |
|
|
|
|
Ok(Some(ch)) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Retrieves the next ASCII checked character.
|
|
|
|
|
#[inline] |
|
|
|
|
fn next_ascii(&mut self) -> io::Result<Option<u8>> { |
|
|
|
|
match self.next_char() { |
|
|
|
|
Ok(Some(chr)) if chr.is_ascii() => Ok(Some(chr as u8)), |
|
|
|
|
Ok(None) => Ok(None), |
|
|
|
|
_ => Err(io::Error::new( |
|
|
|
|
io::ErrorKind::InvalidData, |
|
|
|
|
"non-ASCII byte found", |
|
|
|
|
)), |
|
|
|
|
} |
|
|
|
|
/// Mask of the value bits of a continuation byte.
|
|
|
|
|
const CONT_MASK: u8 = 0b0011_1111; |
|
|
|
|
|
|
|
|
|
/// Returns the initial codepoint accumulator for the first byte.
|
|
|
|
|
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
|
|
|
|
|
/// for width 3, and 3 bits for width 4.
|
|
|
|
|
#[inline] |
|
|
|
|
fn utf8_first_byte(byte: u8, width: u32) -> u32 { |
|
|
|
|
(byte & (0x7F >> width)) as u32 |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Returns the value of `ch` updated with continuation byte `byte`.
|
|
|
|
|
#[inline] |
|
|
|
|
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { |
|
|
|
|
(ch << 6) | (byte & CONT_MASK) as u32 |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the
|
|
|
|
|
/// bits `11`).
|
|
|
|
|
#[inline] |
|
|
|
|
fn utf8_is_first_byte(byte: u8) -> bool { |
|
|
|
|
byte <= 0x7F || (byte >> 6) == 0x11 |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#[inline] |
|
|
|
|
fn unwrap_or_0(opt: Option<u8>) -> u8 { |
|
|
|
|
match opt { |
|
|
|
|
Some(byte) => byte, |
|
|
|
|
None => 0, |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#[inline] |
|
|
|
|
fn utf8_len(ch: u32) -> u32 { |
|
|
|
|
if ch <= 0x7F { |
|
|
|
|
1 |
|
|
|
|
} else if ch <= 0x7FF { |
|
|
|
|
2 |
|
|
|
|
} else if ch <= 0xFFFF { |
|
|
|
|
3 |
|
|
|
|
} else { |
|
|
|
|
4 |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|