Browse Source

Improve lexer by making the cursor iterate over bytes (#915)

pull/954/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
cc473855f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 10
      boa/src/syntax/lexer/comment.rs
  2. 416
      boa/src/syntax/lexer/cursor.rs
  3. 18
      boa/src/syntax/lexer/identifier.rs
  4. 179
      boa/src/syntax/lexer/mod.rs
  5. 137
      boa/src/syntax/lexer/number.rs
  6. 62
      boa/src/syntax/lexer/operator.rs
  7. 67
      boa/src/syntax/lexer/regex.rs
  8. 4
      boa/src/syntax/lexer/spread.rs
  9. 46
      boa/src/syntax/lexer/string.rs
  10. 24
      boa/src/syntax/lexer/template.rs
  11. 101
      boa/src/syntax/lexer/tests.rs

10
boa/src/syntax/lexer/comment.rs

@ -31,11 +31,11 @@ impl<R> Tokenizer<R> for SingleLineComment {
// Skip either to the end of the line or to the end of the input
while let Some(ch) = cursor.peek()? {
if ch == '\n' {
if ch == b'\n' {
break;
} else {
// Consume char.
cursor.next_char()?.expect("Comment character vansihed");
cursor.next_byte()?.expect("Comment character vansihed");
}
}
Ok(Token::new(
@ -66,10 +66,10 @@ impl<R> Tokenizer<R> for MultiLineComment {
let mut new_line = false;
loop {
if let Some(ch) = cursor.next_char()? {
if ch == '*' && cursor.next_is('/')? {
if let Some(ch) = cursor.next_byte()? {
if ch == b'*' && cursor.next_is(b'/')? {
break;
} else if ch == '\n' {
} else if ch == b'\n' {
new_line = true;
}
} else {

416
boa/src/syntax/lexer/cursor.rs

@ -1,5 +1,4 @@
//! Module implementing the lexer cursor. This is used for managing the input byte stream.
use crate::{profiler::BoaProfiler, syntax::ast::Position};
use std::io::{self, Bytes, Error, ErrorKind, Read};
@ -57,22 +56,38 @@ where
}
}
/// Peeks the next character.
/// Peeks the next byte.
#[inline]
pub(super) fn peek(&mut self) -> Result<Option<char>, Error> {
pub(super) fn peek(&mut self) -> Result<Option<u8>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::peek()", "Lexing");
self.iter.peek_byte()
}
/// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4).
#[inline]
pub(super) fn peek_n(&mut self, n: u8) -> Result<u32, Error> {
let _timer = BoaProfiler::global().start_event("cursor::peek_n()", "Lexing");
self.iter.peek_n_bytes(n)
}
/// Peeks the next UTF-8 character in u32 code point.
#[inline]
pub(super) fn peek_char(&mut self) -> Result<Option<u32>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::peek_char()", "Lexing");
self.iter.peek_char()
}
/// Compares the character passed in to the next character, if they match true is returned and the buffer is incremented
/// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented
#[inline]
pub(super) fn next_is(&mut self, peek: char) -> io::Result<bool> {
pub(super) fn next_is(&mut self, byte: u8) -> io::Result<bool> {
let _timer = BoaProfiler::global().start_event("cursor::next_is()", "Lexing");
Ok(match self.peek()? {
Some(next) if next == peek => {
let _ = self.iter.next_char();
Some(next) if next == byte => {
let _ = self.next_byte()?;
true
}
_ => false,
@ -80,34 +95,57 @@ where
}
/// Applies the predicate to the next character and returns the result.
/// Returns false if there is no next character.
/// Returns false if the next character is not a valid ascii or there is no next character.
/// Otherwise returns the result from the predicate on the ascii in char
///
/// The buffer is not incremented.
#[inline]
pub(super) fn next_is_pred<F>(&mut self, pred: &F) -> io::Result<bool>
pub(super) fn next_is_ascii_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
F: Fn(char) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::next_is_pred()", "Lexing");
let _timer = BoaProfiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing");
Ok(match self.peek()? {
Some(byte) => match byte {
0..=0x7F => pred(char::from(byte)),
_ => false,
},
None => false,
})
}
/// Applies the predicate to the next UTF-8 character and returns the result.
/// Returns false if there is no next character, otherwise returns the result from the
/// predicate on the ascii char
///
/// The buffer is not incremented.
#[inline]
pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
F: Fn(u32) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::next_is_char_pred()", "Lexing");
Ok(if let Some(peek) = self.peek()? {
Ok(if let Some(peek) = self.peek_char()? {
pred(peek)
} else {
false
})
}
/// Fills the buffer with all characters until the stop character is found.
/// Fills the buffer with all bytes until the stop byte is found.
/// Returns error when reaching the end of the buffer.
///
/// Note: It will not add the stop character to the buffer.
pub(super) fn take_until(&mut self, stop: char, buf: &mut String) -> io::Result<()> {
/// Note that all bytes up until the stop byte are added to the buffer, including the byte right before.
pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec<u8>) -> io::Result<()> {
let _timer = BoaProfiler::global().start_event("cursor::take_until()", "Lexing");
loop {
if self.next_is(stop)? {
return Ok(());
} else if let Some(ch) = self.next_char()? {
buf.push(ch);
} else if let Some(byte) = self.next_byte()? {
buf.push(byte);
} else {
return Err(io::Error::new(
ErrorKind::UnexpectedEof,
@ -117,21 +155,45 @@ where
}
}
/// Fills the buffer with characters until the first character (x) for which the predicate (pred) is false
/// (or the next character is none).
/// Fills the buffer with characters until the first ascii character for which the predicate (pred) is false.
/// It also stops when the next character is not an ascii or there is no next character.
///
/// Note that all characters up until x are added to the buffer including the character right before.
pub(super) fn take_while_pred<F>(&mut self, buf: &mut String, pred: &F) -> io::Result<()>
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
pub(super) fn take_while_ascii_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(char) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::take_while_pred()", "Lexing");
let _timer = BoaProfiler::global().start_event("cursor::take_while_ascii_pred()", "Lexing");
loop {
if !self.next_is_ascii_pred(pred)? {
return Ok(());
} else if let Some(byte) = self.next_byte()? {
buf.push(byte);
} else {
// next_is_ascii_pred will return false if the next value is None so the None case should already be handled.
unreachable!();
}
}
}
/// Fills the buffer with characters until the first character for which the predicate (pred) is false.
/// It also stops when there is no next character.
///
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(u32) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::take_while_char_pred()", "Lexing");
loop {
if !self.next_is_pred(pred)? {
if !self.next_is_char_pred(pred)? {
return Ok(());
} else if let Some(ch) = self.next_char()? {
buf.push(ch);
} else if let Some(ch) = self.peek_char()? {
for _ in 0..utf8_len(ch) {
buf.push(self.next_byte()?.unwrap());
}
} else {
// next_is_char_pred will return false if the next value is None so the None case should already be handled.
unreachable!();
@ -139,7 +201,7 @@ where
}
}
/// It will fill the buffer with checked ASCII bytes.
/// It will fill the buffer with bytes.
///
/// This expects for the buffer to be fully filled. If it's not, it will fail with an
/// `UnexpectedEof` I/O error.
@ -150,28 +212,63 @@ where
self.iter.fill_bytes(buf)
}
/// Retrieves the next byte.
#[inline]
pub(crate) fn next_byte(&mut self) -> Result<Option<u8>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::next_byte()", "Lexing");
let byte = self.iter.next_byte()?;
match byte {
Some(b'\r') => {
// Try to take a newline if it's next, for windows "\r\n" newlines
// Otherwise, treat as a Mac OS9 bare '\r' newline
if self.peek()? == Some(b'\n') {
let _ = self.iter.next_byte();
}
self.next_line();
}
Some(b'\n') => self.next_line(),
Some(0xE2) => {
// Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9)
let next_bytes = self.peek_n(2)?;
if next_bytes == 0xA8_80 || next_bytes == 0xA9_80 {
self.next_line();
} else {
// 0xE2 is a utf8 first byte
self.next_column();
}
}
Some(b) if utf8_is_first_byte(b) => self.next_column(),
_ => {}
}
Ok(byte)
}
/// Retrieves the next UTF-8 character.
#[inline]
pub(crate) fn next_char(&mut self) -> Result<Option<char>, Error> {
pub(crate) fn next_char(&mut self) -> Result<Option<u32>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::next_char()", "Lexing");
let chr = self.iter.next_char()?;
let ch = self.iter.next_char()?;
match chr {
Some('\r') => {
match ch {
Some(0xD) => {
// Try to take a newline if it's next, for windows "\r\n" newlines
// Otherwise, treat as a Mac OS9 bare '\r' newline
if self.peek()? == Some('\n') {
let _ = self.iter.next_char();
if self.peek()? == Some(0xA) {
let _ = self.iter.next_byte();
}
self.next_line();
}
Some('\n') | Some('\u{2028}') | Some('\u{2029}') => self.next_line(),
// '\n' | '\u{2028}' | '\u{2029}'
Some(0xA) | Some(0x2028) | Some(0x2029) => self.next_line(),
Some(_) => self.next_column(),
None => {}
_ => {}
}
Ok(chr)
Ok(ch)
}
}
@ -179,7 +276,9 @@ where
#[derive(Debug)]
struct InnerIter<R> {
iter: Bytes<R>,
peeked_char: Option<Option<char>>,
num_peeked_bytes: u8,
peeked_bytes: u32,
peeked_char: Option<Option<u32>>,
}
impl<R> InnerIter<R> {
@ -188,6 +287,8 @@ impl<R> InnerIter<R> {
fn new(iter: Bytes<R>) -> Self {
Self {
iter,
num_peeked_bytes: 0,
peeked_bytes: 0,
peeked_char: None,
}
}
@ -197,14 +298,14 @@ impl<R> InnerIter<R>
where
R: Read,
{
/// It will fill the buffer with checked ASCII bytes.
/// It will fill the buffer with checked ascii bytes.
///
/// This expects for the buffer to be fully filled. If it's not, it will fail with an
/// `UnexpectedEof` I/O error.
#[inline]
fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> {
for byte in buf.iter_mut() {
*byte = self.next_ascii()?.ok_or_else(|| {
*byte = self.next_byte()?.ok_or_else(|| {
io::Error::new(
io::ErrorKind::UnexpectedEof,
"unexpected EOF when filling buffer",
@ -214,90 +315,197 @@ where
Ok(())
}
/// Peeks the next UTF-8 checked character.
/// Increments the iter by n bytes.
#[inline]
pub(super) fn peek_char(&mut self) -> Result<Option<char>, Error> {
if let Some(v) = self.peeked_char {
Ok(v)
fn increment(&mut self, n: u32) -> Result<(), Error> {
for _ in 0..n {
if None == self.next_byte()? {
break;
}
}
Ok(())
}
/// Peeks the next byte.
#[inline]
pub(super) fn peek_byte(&mut self) -> Result<Option<u8>, Error> {
if self.num_peeked_bytes > 0 {
let byte = self.peeked_bytes as u8;
Ok(Some(byte))
} else {
let chr = self.next_char()?;
self.peeked_char = Some(chr);
Ok(chr)
match self.iter.next().transpose()? {
Some(byte) => {
self.num_peeked_bytes = 1;
self.peeked_bytes = byte as u32;
Ok(Some(byte))
}
None => Ok(None),
}
}
}
/// Retrieves the next UTF-8 checked character.
fn next_char(&mut self) -> io::Result<Option<char>> {
if let Some(v) = self.peeked_char.take() {
return Ok(v);
/// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4).
#[inline]
pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result<u32, Error> {
while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 {
match self.iter.next().transpose()? {
Some(byte) => {
self.peeked_bytes |= (byte as u32) << (self.num_peeked_bytes * 8);
self.num_peeked_bytes += 1;
}
None => break,
};
}
let first_byte = match self.iter.next().transpose()? {
Some(b) => b,
None => return Ok(None),
};
match n {
0 => Ok(0),
1 => Ok(self.peeked_bytes & 0xFF),
2 => Ok(self.peeked_bytes & 0xFFFF),
3 => Ok(self.peeked_bytes & 0xFFFFFF),
_ => Ok(self.peeked_bytes),
}
}
let chr: char = if first_byte < 0x80 {
// 0b0xxx_xxxx
first_byte.into()
/// Peeks the next unchecked character in u32 code point.
#[inline]
pub(super) fn peek_char(&mut self) -> Result<Option<u32>, Error> {
if let Some(ch) = self.peeked_char {
Ok(ch)
} else {
let mut buf = [first_byte, 0u8, 0u8, 0u8];
let num_bytes = if first_byte < 0xE0 {
// 0b110x_xxxx
2
} else if first_byte < 0xF0 {
// 0b1110_xxxx
3
} else {
// 0b1111_0xxx
4
// Decode UTF-8
let x = match self.peek_byte()? {
Some(b) if b < 128 => {
self.peeked_char = Some(Some(b as u32));
return Ok(Some(b as u32));
}
Some(b) => b,
None => {
self.peeked_char = None;
return Ok(None);
}
};
for b in buf.iter_mut().take(num_bytes).skip(1) {
let next = match self.iter.next() {
Some(Ok(b)) => b,
Some(Err(e)) => return Err(e),
None => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8",
))
}
};
*b = next;
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = (self.peek_n_bytes(2)? >> 8) as u8;
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = (self.peek_n_bytes(3)? >> 16) as u8;
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = (self.peek_n_bytes(4)? >> 24) as u8;
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
};
self.peeked_char = Some(Some(ch));
Ok(Some(ch))
}
}
/// Retrieves the next byte
#[inline]
fn next_byte(&mut self) -> io::Result<Option<u8>> {
self.peeked_char = None;
if self.num_peeked_bytes > 0 {
let byte = (self.peeked_bytes & 0xFF) as u8;
self.num_peeked_bytes -= 1;
self.peeked_bytes >>= 8;
Ok(Some(byte))
} else {
self.iter.next().transpose()
}
}
/// Retrieves the next unchecked char in u32 code point.
#[inline]
fn next_char(&mut self) -> io::Result<Option<u32>> {
if let Some(ch) = self.peeked_char.take() {
if let Some(c) = ch {
self.increment(utf8_len(c))?;
}
return Ok(ch);
}
if let Ok(s) = std::str::from_utf8(&buf) {
if let Some(chr) = s.chars().next() {
chr
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8",
));
}
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8",
));
// Decode UTF-8
let x = match self.next_byte()? {
Some(b) if b < 128 => return Ok(Some(b as u32)),
Some(b) => b,
None => return Ok(None),
};
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(self.next_byte()?);
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(self.next_byte()?);
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(self.next_byte()?);
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
};
Ok(Some(chr))
Ok(Some(ch))
}
}
/// Retrieves the next ASCII checked character.
#[inline]
fn next_ascii(&mut self) -> io::Result<Option<u8>> {
match self.next_char() {
Ok(Some(chr)) if chr.is_ascii() => Ok(Some(chr as u8)),
Ok(None) => Ok(None),
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
"non-ASCII byte found",
)),
}
/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Returns the initial codepoint accumulator for the first byte.
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
/// for width 3, and 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
(byte & (0x7F >> width)) as u32
}
/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
(ch << 6) | (byte & CONT_MASK) as u32
}
/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the
/// bits `11`).
#[inline]
fn utf8_is_first_byte(byte: u8) -> bool {
byte <= 0x7F || (byte >> 6) == 0x11
}
#[inline]
fn unwrap_or_0(opt: Option<u8>) -> u8 {
match opt {
Some(byte) => byte,
None => 0,
}
}
#[inline]
fn utf8_len(ch: u32) -> u32 {
if ch <= 0x7F {
1
} else if ch <= 0x7FF {
2
} else if ch <= 0xFFFF {
3
} else {
4
}
}

18
boa/src/syntax/lexer/identifier.rs

@ -8,7 +8,9 @@ use crate::{
lexer::{Token, TokenKind},
},
};
use core::convert::TryFrom;
use std::io::Read;
use std::str;
const STRICT_FORBIDDEN_IDENTIFIERS: [&str; 11] = [
"eval",
@ -51,13 +53,21 @@ impl<R> Tokenizer<R> for Identifier {
{
let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");
let mut buf = self.init.to_string();
let mut init_buf = [0u8; 4];
let mut buf = Vec::new();
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));
cursor.take_while_pred(&mut buf, &|c: char| {
c.is_alphabetic() || c.is_digit(10) || c == '_'
cursor.take_while_char_pred(&mut buf, &|c: u32| {
if let Ok(c) = char::try_from(c) {
c.is_alphabetic() || c.is_digit(10) || c == '_'
} else {
false
}
})?;
let tk = match buf.as_str() {
let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
"true" => TokenKind::BooleanLiteral(true),
"false" => TokenKind::BooleanLiteral(false),
"null" => TokenKind::NullLiteral,

179
boa/src/syntax/lexer/mod.rs

@ -42,6 +42,7 @@ use self::{
};
use crate::syntax::ast::{Punctuator, Span};
pub use crate::{profiler::BoaProfiler, syntax::ast::Position};
use core::convert::TryFrom;
pub use error::Error;
use std::io::Read;
pub use token::{Token, TokenKind};
@ -69,12 +70,12 @@ impl<R> Lexer<R> {
/// * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}`
///
/// [More information](https://tc39.es/ecma262/#table-32)
fn is_whitespace(ch: char) -> bool {
fn is_whitespace(ch: u32) -> bool {
matches!(
ch,
'\u{0020}' | '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{00A0}' | '\u{FEFF}' |
0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF |
// Unicode Space_Separator category (minus \u{0020} and \u{00A0} which are already stated above)
'\u{1680}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}'
0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000
)
}
@ -127,12 +128,12 @@ impl<R> Lexer<R> {
if let Some(c) = self.cursor.peek()? {
match c {
'/' => {
self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/'
b'/' => {
self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/'
SingleLineComment.lex(&mut self.cursor, start)
}
'*' => {
self.cursor.next_char()?.expect("* token vanished"); // Consume the '*'
b'*' => {
self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*'
MultiLineComment.lex(&mut self.cursor, start)
}
ch => {
@ -140,9 +141,9 @@ impl<R> Lexer<R> {
InputElement::Div | InputElement::TemplateTail => {
// Only div punctuator allowed, regex not.
if ch == '=' {
if ch == b'=' {
// Indicates this is an AssignDiv.
self.cursor.next_char()?.expect("= token vanished"); // Consume the '='
self.cursor.next_byte()?.expect("= token vanished"); // Consume the '='
Ok(Token::new(
Punctuator::AssignDiv.into(),
Span::new(start, self.cursor.pos()),
@ -178,90 +179,104 @@ impl<R> Lexer<R> {
{
let _timer = BoaProfiler::global().start_event("next()", "Lexing");
let (start, next_chr) = loop {
let (start, next_ch) = loop {
let start = self.cursor.pos();
if let Some(next_chr) = self.cursor.next_char()? {
if let Some(next_ch) = self.cursor.next_char()? {
// Ignore whitespace
if !Self::is_whitespace(next_chr) {
break (start, next_chr);
if !Self::is_whitespace(next_ch) {
break (start, next_ch);
}
} else {
return Ok(None);
}
};
let token = match next_chr {
'\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new(
TokenKind::LineTerminator,
Span::new(start, self.cursor.pos()),
)),
'"' | '\'' => StringLiteral::new(next_chr).lex(&mut self.cursor, start),
'`' => TemplateLiteral.lex(&mut self.cursor, start),
_ if next_chr.is_digit(10) => NumberLiteral::new(next_chr).lex(&mut self.cursor, start),
_ if next_chr.is_alphabetic() || next_chr == '$' || next_chr == '_' => {
Identifier::new(next_chr).lex(&mut self.cursor, start)
}
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
)),
':' => Ok(Token::new(
Punctuator::Colon.into(),
Span::new(start, self.cursor.pos()),
)),
'.' => SpreadLiteral::new().lex(&mut self.cursor, start),
'(' => Ok(Token::new(
Punctuator::OpenParen.into(),
Span::new(start, self.cursor.pos()),
)),
')' => Ok(Token::new(
Punctuator::CloseParen.into(),
Span::new(start, self.cursor.pos()),
)),
',' => Ok(Token::new(
Punctuator::Comma.into(),
Span::new(start, self.cursor.pos()),
)),
'{' => Ok(Token::new(
Punctuator::OpenBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'}' => Ok(Token::new(
Punctuator::CloseBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'[' => Ok(Token::new(
Punctuator::OpenBracket.into(),
Span::new(start, self.cursor.pos()),
)),
']' => Ok(Token::new(
Punctuator::CloseBracket.into(),
Span::new(start, self.cursor.pos()),
)),
'?' => Ok(Token::new(
Punctuator::Question.into(),
Span::new(start, self.cursor.pos()),
)),
'/' => self.lex_slash_token(start),
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => {
Operator::new(next_chr).lex(&mut self.cursor, start)
if let Ok(c) = char::try_from(next_ch) {
let token = match c {
'\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new(
TokenKind::LineTerminator,
Span::new(start, self.cursor.pos()),
)),
'"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start),
'`' => TemplateLiteral.lex(&mut self.cursor, start),
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_alphabetic() || c == '$' || c == '_' => {
Identifier::new(c).lex(&mut self.cursor, start)
}
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
)),
':' => Ok(Token::new(
Punctuator::Colon.into(),
Span::new(start, self.cursor.pos()),
)),
'.' => SpreadLiteral::new().lex(&mut self.cursor, start),
'(' => Ok(Token::new(
Punctuator::OpenParen.into(),
Span::new(start, self.cursor.pos()),
)),
')' => Ok(Token::new(
Punctuator::CloseParen.into(),
Span::new(start, self.cursor.pos()),
)),
',' => Ok(Token::new(
Punctuator::Comma.into(),
Span::new(start, self.cursor.pos()),
)),
'{' => Ok(Token::new(
Punctuator::OpenBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'}' => Ok(Token::new(
Punctuator::CloseBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'[' => Ok(Token::new(
Punctuator::OpenBracket.into(),
Span::new(start, self.cursor.pos()),
)),
']' => Ok(Token::new(
Punctuator::CloseBracket.into(),
Span::new(start, self.cursor.pos()),
)),
'?' => Ok(Token::new(
Punctuator::Question.into(),
Span::new(start, self.cursor.pos()),
)),
'/' => self.lex_slash_token(start),
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
c,
start.line_number(),
start.column_number()
);
Err(Error::syntax(details, start))
}
}?;
if token.kind() == &TokenKind::Comment {
// Skip comment
self.next()
} else {
Ok(Some(token))
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
next_chr,
} else {
Err(Error::syntax(
format!(
"unexpected utf-8 char '\\u{}' at line {}, column {}",
next_ch,
start.line_number(),
start.column_number()
);
Err(Error::syntax(details, start))
}
}?;
if token.kind() == &TokenKind::Comment {
// Skip comment
self.next()
} else {
Ok(Some(token))
),
start,
))
}
}
}

137
boa/src/syntax/lexer/number.rs

@ -9,6 +9,7 @@ use crate::{
lexer::{token::Numeric, Token},
},
};
use std::str;
use std::{io::Read, str::FromStr};
/// Number literal lexing.
@ -23,12 +24,12 @@ use std::{io::Read, str::FromStr};
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Number_type
#[derive(Debug, Clone, Copy)]
pub(super) struct NumberLiteral {
init: char,
init: u8,
}
impl NumberLiteral {
/// Creates a new string literal lexer.
pub(super) fn new(init: char) -> Self {
pub(super) fn new(init: u8) -> Self {
Self { init }
}
}
@ -63,8 +64,9 @@ impl NumericKind {
}
}
#[inline]
fn take_signed_integer<R>(
buf: &mut String,
buf: &mut Vec<u8>,
cursor: &mut Cursor<R>,
kind: &NumericKind,
) -> Result<(), Error>
@ -73,30 +75,31 @@ where
{
// The next part must be SignedInteger.
// This is optionally a '+' or '-' followed by 1 or more DecimalDigits.
match cursor.next_char()? {
Some('+') => {
buf.push('+');
if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? {
match cursor.next_byte()? {
Some(b'+') => {
buf.push(b'+');
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? {
// A digit must follow the + or - symbol.
return Err(Error::syntax("No digit found after + symbol", cursor.pos()));
}
}
Some('-') => {
buf.push('-');
if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? {
Some(b'-') => {
buf.push(b'-');
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? {
// A digit must follow the + or - symbol.
return Err(Error::syntax("No digit found after - symbol", cursor.pos()));
}
}
Some(c) if c.is_digit(kind.base()) => buf.push(c),
Some(c) => {
return Err(Error::syntax(
format!(
"When lexing exponential value found unexpected char: '{}'",
c
),
cursor.pos(),
));
Some(byte) => {
let ch = char::from(byte);
if ch.is_ascii() && ch.is_digit(kind.base()) {
buf.push(byte);
} else {
return Err(Error::syntax(
"When lexing exponential value found unexpected char",
cursor.pos(),
));
}
}
None => {
return Err(Error::syntax(
@ -107,7 +110,7 @@ where
}
// Consume the decimal digits.
cursor.take_while_pred(buf, &|c: char| c.is_digit(kind.base()))?;
cursor.take_while_ascii_pred(buf, &|ch| ch.is_digit(kind.base()))?;
Ok(())
}
@ -118,12 +121,12 @@ where
/// - [ECMAScript Specification][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-literals-numeric-literals
#[inline]
fn check_after_numeric_literal<R>(cursor: &mut Cursor<R>) -> Result<(), Error>
where
R: Read,
{
let pred = |ch: char| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_';
if cursor.next_is_pred(&pred)? {
if cursor.next_is_ascii_pred(&|ch| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_')? {
Err(Error::syntax(
"a numeric literal must not be followed by an alphanumeric, $ or _ characters",
cursor.pos(),
@ -140,17 +143,17 @@ impl<R> Tokenizer<R> for NumberLiteral {
{
let _timer = BoaProfiler::global().start_event("NumberLiteral", "Lexing");
let mut buf = self.init.to_string();
let mut buf = vec![self.init];
// Default assume the number is a base 10 integer.
let mut kind = NumericKind::Integer(10);
let c = cursor.peek();
if self.init == '0' {
if self.init == b'0' {
if let Some(ch) = c? {
match ch {
'x' | 'X' => {
b'x' | b'X' => {
// Remove the initial '0' from buffer.
cursor.next_char()?.expect("x or X character vanished");
buf.pop();
@ -159,16 +162,14 @@ impl<R> Tokenizer<R> for NumberLiteral {
kind = NumericKind::Integer(16);
// Checks if the next char after '0x' is a digit of that base. if not return an error.
if let Some(digit) = cursor.peek()? {
if !digit.is_digit(16) {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(16))? {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
}
'o' | 'O' => {
b'o' | b'O' => {
// Remove the initial '0' from buffer.
cursor.next_char()?.expect("o or O character vanished");
buf.pop();
@ -177,16 +178,14 @@ impl<R> Tokenizer<R> for NumberLiteral {
kind = NumericKind::Integer(8);
// Checks if the next char after '0o' is a digit of that base. if not return an error.
if let Some(digit) = cursor.peek()? {
if !digit.is_digit(8) {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(8))? {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
}
'b' | 'B' => {
b'b' | b'B' => {
// Remove the initial '0' from buffer.
cursor.next_char()?.expect("b or B character vanished");
buf.pop();
@ -195,16 +194,14 @@ impl<R> Tokenizer<R> for NumberLiteral {
kind = NumericKind::Integer(2);
// Checks if the next char after '0b' is a digit of that base. if not return an error.
if let Some(digit) = cursor.peek()? {
if !digit.is_digit(2) {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(2))? {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
}
'n' => {
b'n' => {
cursor.next_char()?.expect("n character vanished");
// DecimalBigIntegerLiteral '0n'
@ -213,7 +210,8 @@ impl<R> Tokenizer<R> for NumberLiteral {
Span::new(start_pos, cursor.pos()),
));
}
ch => {
byte => {
let ch = char::from(byte);
if ch.is_digit(8) {
// LegacyOctalIntegerLiteral
if cursor.strict_mode() {
@ -226,7 +224,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
// Remove the initial '0' from buffer.
buf.pop();
buf.push(cursor.next_char()?.expect("'0' character vanished"));
buf.push(cursor.next_byte()?.expect("'0' character vanished"));
kind = NumericKind::Integer(8);
}
@ -240,7 +238,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
start_pos,
));
} else {
buf.push(cursor.next_char()?.expect("Number digit vanished"));
buf.push(cursor.next_byte()?.expect("Number digit vanished"));
}
} // Else indicates that the symbol is a non-number.
}
@ -256,42 +254,42 @@ impl<R> Tokenizer<R> for NumberLiteral {
}
// Consume digits until a non-digit character is encountered or all the characters are consumed.
cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
// The non-digit character could be:
// 'n' To indicate a BigIntLiteralSuffix.
// '.' To indicate a decimal separator.
// 'e' | 'E' To indicate an ExponentPart.
match cursor.peek()? {
Some('n') => {
Some(b'n') => {
// DecimalBigIntegerLiteral
// Lexing finished.
// Consume the n
cursor.next_char()?.expect("n character vanished");
cursor.next_byte()?.expect("n character vanished");
kind = kind.to_bigint();
}
Some('.') => {
Some(b'.') => {
if kind.base() == 10 {
// Only base 10 numbers can have a decimal separator.
// Number literal lexing finished if a . is found for a number in a different base.
cursor.next_char()?.expect(". token vanished");
buf.push('.'); // Consume the .
cursor.next_byte()?.expect(". token vanished");
buf.push(b'.'); // Consume the .
kind = NumericKind::Rational;
// Consume digits until a non-digit character is encountered or all the characters are consumed.
cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
// The non-digit character at this point must be an 'e' or 'E' to indicate an Exponent Part.
// Another '.' or 'n' is not allowed.
match cursor.peek()? {
Some('e') | Some('E') => {
Some(b'e') | Some(b'E') => {
// Consume the ExponentIndicator.
cursor.next_char()?.expect("e or E token vanished");
cursor.next_byte()?.expect("e or E token vanished");
buf.push('E');
buf.push(b'E');
take_signed_integer(&mut buf, cursor, &kind)?;
}
@ -301,10 +299,10 @@ impl<R> Tokenizer<R> for NumberLiteral {
}
}
}
Some('e') | Some('E') => {
Some(b'e') | Some(b'E') => {
kind = NumericKind::Rational;
cursor.next_char()?.expect("e or E character vanished"); // Consume the ExponentIndicator.
buf.push('E');
cursor.next_byte()?.expect("e or E character vanished"); // Consume the ExponentIndicator.
buf.push(b'E');
take_signed_integer(&mut buf, cursor, &kind)?;
}
Some(_) | None => {
@ -314,14 +312,15 @@ impl<R> Tokenizer<R> for NumberLiteral {
check_after_numeric_literal(cursor)?;
let num_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let num = match kind {
NumericKind::BigInt(base) => {
Numeric::BigInt(
BigInt::from_string_radix(&buf, base).expect("Could not convert to BigInt")
BigInt::from_string_radix(num_str, base).expect("Could not convert to BigInt")
)
}
NumericKind::Rational /* base: 10 */ => {
let val = f64::from_str(&buf).expect("Failed to parse float after checks");
let val = f64::from_str(num_str).expect("Failed to parse float after checks");
let int_val = val as i32;
// The truncated float should be identically to the non-truncated float for the conversion to be loss-less,
@ -335,12 +334,12 @@ impl<R> Tokenizer<R> for NumberLiteral {
}
},
NumericKind::Integer(base) => {
if let Ok(num) = i32::from_str_radix(&buf, base) {
if let Ok(num) = i32::from_str_radix(num_str, base) {
Numeric::Integer(num)
} else {
let b = f64::from(base);
let mut result = 0.0_f64;
for c in buf.chars() {
for c in num_str.chars() {
let digit = f64::from(c.to_digit(base).expect("could not parse digit after already checking validity"));
result = result * b + digit;
}

62
boa/src/syntax/lexer/operator.rs

@ -17,8 +17,8 @@ macro_rules! vop {
($cursor:ident, $assign_op:expr, $op:expr) => ({
match $cursor.peek()? {
None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())),
Some('=') => {
$cursor.next_char()?.expect("= token vanished");
Some(b'=') => {
$cursor.next_byte()?.expect("= token vanished");
$cursor.next_column();
$assign_op
}
@ -28,13 +28,13 @@ macro_rules! vop {
($cursor:ident, $assign_op:expr, $op:expr, {$($case:pat => $block:expr), +}) => ({
match $cursor.peek()? {
None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())),
Some('=') => {
$cursor.next_char()?.expect("= token vanished");
Some(b'=') => {
$cursor.next_byte()?.expect("= token vanished");
$cursor.next_column();
$assign_op
},
$($case => {
$cursor.next_char()?.expect("Token vanished");
$cursor.next_byte()?.expect("Token vanished");
$cursor.next_column();
$block
})+,
@ -44,7 +44,7 @@ macro_rules! vop {
($cursor:ident, $op:expr, {$($case:pat => $block:expr),+}) => {
match $cursor.peek().ok_or_else(|| Error::syntax("could not preview next value", $cursor.pos()))? {
$($case => {
$cursor.next_char()?;
$cursor.next_byte()?;
$cursor.next_column();
$block
})+,
@ -72,7 +72,7 @@ macro_rules! op {
#[derive(Debug, Clone, Copy)]
pub(super) struct Operator {
init: char,
init: u8,
}
/// Operator lexing.
@ -87,7 +87,7 @@ pub(super) struct Operator {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators
impl Operator {
/// Creates a new operator lexer.
pub(super) fn new(init: char) -> Self {
pub(super) fn new(init: u8) -> Self {
Self { init }
}
}
@ -100,61 +100,63 @@ impl<R> Tokenizer<R> for Operator {
let _timer = BoaProfiler::global().start_event("Operator", "Lexing");
match self.init {
'*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), {
Some('*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp))
b'*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), {
Some(b'*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp))
}),
'+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), {
Some('+') => Ok(Punctuator::Inc)
b'+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), {
Some(b'+') => Ok(Punctuator::Inc)
}),
'-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), {
Some('-') => {
b'-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), {
Some(b'-') => {
Ok(Punctuator::Dec)
}
}),
'%' => op!(
b'%' => op!(
cursor,
start_pos,
Ok(Punctuator::AssignMod),
Ok(Punctuator::Mod)
),
'|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), {
Some('|') => Ok(Punctuator::BoolOr)
b'|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), {
Some(b'|') => Ok(Punctuator::BoolOr)
}),
'&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), {
Some('&') => Ok(Punctuator::BoolAnd)
b'&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), {
Some(b'&') => Ok(Punctuator::BoolAnd)
}),
'^' => op!(
b'^' => op!(
cursor,
start_pos,
Ok(Punctuator::AssignXor),
Ok(Punctuator::Xor)
),
'=' => op!(cursor, start_pos, if cursor.next_is('=')? {
b'=' => op!(cursor, start_pos, if cursor.next_is(b'=')? {
Ok(Punctuator::StrictEq)
} else {
Ok(Punctuator::Eq)
}, Ok(Punctuator::Assign), {
Some('>') => {
Some(b'>') => {
Ok(Punctuator::Arrow)
}
}),
'<' => op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), {
Some('<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh))
}),
'>' => {
b'<' => {
op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), {
Some(b'<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh))
})
}
b'>' => {
op!(cursor, start_pos, Ok(Punctuator::GreaterThanOrEq), Ok(Punctuator::GreaterThan), {
Some('>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), {
Some('>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh))
Some(b'>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), {
Some(b'>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh))
})
})
}
'!' => op!(
b'!' => op!(
cursor,
start_pos,
vop!(cursor, Ok(Punctuator::StrictNotEq), Ok(Punctuator::NotEq)),
Ok(Punctuator::Not)
),
'~' => Ok(Token::new(
b'~' => Ok(Token::new(
Punctuator::Neg.into(),
Span::new(start_pos, cursor.pos()),
)),

67
boa/src/syntax/lexer/regex.rs

@ -9,6 +9,8 @@ use crate::{
},
};
use bitflags::bitflags;
use std::io::{self, ErrorKind};
use std::str;
use std::{
fmt::{self, Display, Formatter},
io::Read,
@ -39,11 +41,11 @@ impl<R> Tokenizer<R> for RegexLiteral {
{
let _timer = BoaProfiler::global().start_event("RegexLiteral", "Lexing");
let mut body = String::new();
let mut body = Vec::new();
// Lex RegularExpressionBody.
loop {
match cursor.next_char()? {
match cursor.next_byte()? {
None => {
// Abrupt end.
return Err(Error::syntax(
@ -51,29 +53,45 @@ impl<R> Tokenizer<R> for RegexLiteral {
cursor.pos(),
));
}
Some(c) => {
match c {
'/' => break, // RegularExpressionBody finished.
'\n' | '\r' | '\u{2028}' | '\u{2029}' => {
Some(b) => {
match b {
b'/' => break, // RegularExpressionBody finished.
b'\n' | b'\r' => {
// Not allowed in Regex literal.
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
'\\' => {
0xE2 if (cursor.peek_n(2)? == 0xA8_80 || cursor.peek_n(2)? == 0xA9_80) => {
// '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b'\\' => {
// Escape sequence
body.push('\\');
if let Some(sc) = cursor.next_char()? {
body.push(b'\\');
if let Some(sc) = cursor.next_byte()? {
match sc {
'\n' | '\r' | '\u{2028}' | '\u{2029}' => {
b'\n' | b'\r' => {
// Not allowed in Regex literal.
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
ch => body.push(ch),
0xE2 if (cursor.peek_n(2)? == 0xA8_80
|| cursor.peek_n(2)? == 0xA9_80) =>
{
// '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b => body.push(b),
}
} else {
// Abrupt end of regex.
@ -83,20 +101,31 @@ impl<R> Tokenizer<R> for RegexLiteral {
));
}
}
_ => body.push(c),
_ => body.push(b),
}
}
}
}
let mut flags = String::new();
let mut flags = Vec::new();
let flags_start = cursor.pos();
cursor.take_while_pred(&mut flags, &char::is_alphabetic)?;
Ok(Token::new(
TokenKind::regular_expression_literal(body, parse_regex_flags(&flags, flags_start)?),
Span::new(start_pos, cursor.pos()),
))
cursor.take_while_ascii_pred(&mut flags, &|c: char| c.is_alphabetic())?;
let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) };
if let Ok(body_str) = str::from_utf8(body.as_slice()) {
Ok(Token::new(
TokenKind::regular_expression_literal(
body_str,
parse_regex_flags(flags_str, flags_start)?,
),
Span::new(start_pos, cursor.pos()),
))
} else {
Err(Error::from(io::Error::new(
ErrorKind::InvalidData,
"Invalid UTF-8 character in regular expressions",
)))
}
}
}

4
boa/src/syntax/lexer/spread.rs

@ -38,8 +38,8 @@ impl<R> Tokenizer<R> for SpreadLiteral {
let _timer = BoaProfiler::global().start_event("SpreadLiteral", "Lexing");
// . or ...
if cursor.next_is('.')? {
if cursor.next_is('.')? {
if cursor.next_is(b'.')? {
if cursor.next_is(b'.')? {
Ok(Token::new(
Punctuator::Spread.into(),
Span::new(start_pos, cursor.pos()),

46
boa/src/syntax/lexer/string.rs

@ -8,6 +8,7 @@ use crate::{
lexer::{Token, TokenKind},
},
};
use core::convert::TryFrom;
use std::{
io::{self, ErrorKind, Read},
str,
@ -58,12 +59,13 @@ impl<R> Tokenizer<R> for StringLiteral {
let mut buf: Vec<u16> = Vec::new();
loop {
let next_chr_start = cursor.pos();
let next_chr = cursor.next_char()?.ok_or_else(|| {
let next_chr = char::try_from(cursor.next_char()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
))
})?;
})?)
.unwrap();
match next_chr {
'\'' if self.terminator == StringTerminator::SingleQuote => {
@ -76,22 +78,22 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.next_char()?.ok_or_else(|| {
let escape = cursor.next_byte()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in string literal",
))
})?;
if escape != '\n' {
if escape != b'\n' {
match escape {
'n' => buf.push('\n' as u16),
'r' => buf.push('\r' as u16),
't' => buf.push('\t' as u16),
'b' => buf.push('\x08' as u16),
'f' => buf.push('\x0c' as u16),
'0' => buf.push('\0' as u16),
'x' => {
b'n' => buf.push('\n' as u16),
b'r' => buf.push('\r' as u16),
b't' => buf.push('\t' as u16),
b'b' => buf.push('\x08' as u16),
b'f' => buf.push('\x0c' as u16),
b'0' => buf.push('\0' as u16),
b'x' => {
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
@ -106,17 +108,20 @@ impl<R> Tokenizer<R> for StringLiteral {
buf.push(code_point);
}
'u' => {
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is('{')? {
cursor.next_char()?.expect("{ character vanished"); // Consume the '{'.
if cursor.next_is(b'{')? {
cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'.
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_str = String::with_capacity(6);
cursor.take_until('}', &mut code_point_str)?;
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;
cursor.next_char()?.expect("} character vanished"); // Consume the '}'.
cursor.next_byte()?.expect("} character vanished"); // Consume the '}'.
let code_point_str = unsafe {
str::from_utf8_unchecked(code_point_buf.as_slice())
};
// We know this is a single unicode codepoint, convert to u32
let code_point = u32::from_str_radix(&code_point_str, 16)
.map_err(|_| {
@ -156,13 +161,12 @@ impl<R> Tokenizer<R> for StringLiteral {
buf.push(code_point);
}
}
'\'' | '"' | '\\' => buf.push(escape as u16),
ch => {
b'\'' | b'"' | b'\\' => buf.push(escape as u16),
_ => {
let details = format!(
"invalid escape sequence `{}` at line {}, column {}",
"invalid escape sequence at line {}, column {}",
next_chr_start.line_number(),
next_chr_start.column_number(),
ch
);
return Err(Error::syntax(details, cursor.pos()));
}

24
boa/src/syntax/lexer/template.rs

@ -9,6 +9,7 @@ use crate::{
},
};
use std::io::{self, ErrorKind, Read};
use std::str;
/// Template literal lexing.
///
@ -30,23 +31,30 @@ impl<R> Tokenizer<R> for TemplateLiteral {
{
let _timer = BoaProfiler::global().start_event("TemplateLiteral", "Lexing");
let mut buf = String::new();
let mut buf = Vec::new();
loop {
match cursor.next_char()? {
match cursor.next_byte()? {
None => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"Unterminated template literal",
)));
}
Some('`') => break, // Template literal finished.
Some(next_ch) => buf.push(next_ch), // TODO when there is an expression inside the literal
Some(b'`') => break, // Template literal finished.
Some(next_byte) => buf.push(next_byte), // TODO when there is an expression inside the literal
}
}
Ok(Token::new(
TokenKind::template_literal(buf),
Span::new(start_pos, cursor.pos()),
))
if let Ok(s) = str::from_utf8(buf.as_slice()) {
Ok(Token::new(
TokenKind::template_literal(s),
Span::new(start_pos, cursor.pos()),
))
} else {
Err(Error::from(io::Error::new(
ErrorKind::InvalidData,
"Invalid UTF-8 character in template literal",
)))
}
}
}

101
boa/src/syntax/lexer/tests.rs

@ -6,6 +6,7 @@ use super::token::Numeric;
use super::*;
use super::{Error, Position};
use crate::syntax::ast::Keyword;
use std::str;
fn span(start: (u32, u32), end: (u32, u32)) -> Span {
Span::new(Position::new(start.0, start.1), Position::new(end.0, end.1))
@ -280,19 +281,19 @@ fn check_positions_codepoint() {
// String token starts on column 13
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 13), (1, 34))
span((1, 13), (1, 36))
);
// Close parenthesis token starts on column 34
// Close parenthesis token starts on column 36
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 34), (1, 35))
span((1, 36), (1, 37))
);
// Semi Colon token starts on column 35
// Semi Colon token starts on column 37
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 35), (1, 36))
span((1, 37), (1, 38))
);
}
@ -554,38 +555,102 @@ fn addition_no_spaces_e_number() {
}
#[test]
fn take_while_pred_simple() {
fn take_while_ascii_pred_simple() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: String = String::new();
let mut buf: Vec<u8> = Vec::new();
cur.take_while_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c')
cur.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c')
.unwrap();
assert_eq!(buf, "abc");
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc");
}
#[test]
fn take_while_pred_immediate_stop() {
fn take_while_ascii_pred_immediate_stop() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: String = String::new();
let mut buf: Vec<u8> = Vec::new();
cur.take_while_pred(&mut buf, &|c| c == 'd').unwrap();
cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap();
assert_eq!(buf, "");
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "");
}
#[test]
fn take_while_pred_entire_str() {
fn take_while_ascii_pred_entire_str() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: String = String::new();
let mut buf: Vec<u8> = Vec::new();
cur.take_while_pred(&mut buf, &|c| c.is_alphabetic())
.unwrap();
cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk");
}
#[test]
fn take_while_ascii_pred_non_ascii_stop() {
let mut cur = Cursor::new("abcde😀fghijk".as_bytes());
let mut buf: Vec<u8> = Vec::new();
cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde");
}
#[test]
fn take_while_char_pred_simple() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|c| {
c == 'a' as u32 || c == 'b' as u32 || c == 'c' as u32
})
.unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc");
}
#[test]
fn take_while_char_pred_immediate_stop() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|_| false).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "");
}
#[test]
fn take_while_char_pred_entire_str() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|_| true).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk");
}
#[test]
fn take_while_char_pred_utf8_char() {
let mut cur = Cursor::new("abc😀defghijk".as_bytes());
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|c| {
if let Ok(c) = char::try_from(c) {
c == 'a' || c == 'b' || c == 'c' || c == '😀'
} else {
false
}
})
.unwrap();
assert_eq!(buf, "abcdefghijk");
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc😀");
}
#[test]

Loading…
Cancel
Save