diff --git a/core/engine/src/builtins/eval/mod.rs b/core/engine/src/builtins/eval/mod.rs index 8ed4078eec..995772bc3f 100644 --- a/core/engine/src/builtins/eval/mod.rs +++ b/core/engine/src/builtins/eval/mod.rs @@ -99,7 +99,7 @@ impl Eval { // 2. If Type(x) is not String, return x. // TODO: rework parser to take an iterator of `u32` unicode codepoints - let Some(x) = x.as_string().map(JsString::to_std_string_escaped) else { + let Some(x) = x.as_string() else { return Ok(x.clone()); }; @@ -118,7 +118,7 @@ impl Eval { // b. If script is a List of errors, throw a SyntaxError exception. // c. If script Contains ScriptBody is false, return undefined. // d. Let body be the ScriptBody of script. - let mut parser = Parser::new(Source::from_bytes(&x)); + let mut parser = Parser::new(Source::from_utf16(x)); parser.set_identifier(context.next_parser_identifier()); if strict { parser.set_strict(); diff --git a/core/engine/src/context/mod.rs b/core/engine/src/context/mod.rs index c5a9cce945..3f5daa086a 100644 --- a/core/engine/src/context/mod.rs +++ b/core/engine/src/context/mod.rs @@ -5,6 +5,7 @@ mod hooks; pub(crate) mod icu; pub mod intrinsics; +use boa_parser::source::ReadChar; pub use hooks::{DefaultHooks, HostHooks}; #[cfg(feature = "intl")] @@ -14,7 +15,7 @@ use intrinsics::Intrinsics; #[cfg(not(feature = "intl"))] pub use std::marker::PhantomData; -use std::{cell::Cell, io::Read, path::Path, rc::Rc}; +use std::{cell::Cell, path::Path, rc::Rc}; use crate::{ builtins, @@ -185,7 +186,7 @@ impl Context { /// Note that this won't run any scheduled promise jobs; you need to call [`Context::run_jobs`] /// on the context or [`JobQueue::run_jobs`] on the provided queue to run them. #[allow(clippy::unit_arg, dropping_copy_types)] - pub fn eval(&mut self, src: Source<'_, R>) -> JsResult { + pub fn eval(&mut self, src: Source<'_, R>) -> JsResult { let main_timer = Profiler::global().start_event("Script evaluation", "Main"); let result = Script::parse(src, None, self)?.evaluate(self); diff --git a/core/engine/src/module/mod.rs b/core/engine/src/module/mod.rs index 6700760310..bcf07e99f2 100644 --- a/core/engine/src/module/mod.rs +++ b/core/engine/src/module/mod.rs @@ -25,6 +25,7 @@ mod loader; mod namespace; mod source; mod synthetic; +use boa_parser::source::ReadChar; pub use loader::*; pub use namespace::ModuleNamespace; use source::SourceTextModule; @@ -33,7 +34,6 @@ pub use synthetic::{SyntheticModule, SyntheticModuleInitializer}; use std::cell::{Cell, RefCell}; use std::collections::HashSet; use std::hash::Hash; -use std::io::Read; use std::rc::Rc; use rustc_hash::FxHashSet; @@ -141,7 +141,7 @@ impl Module { /// Parses the provided `src` as an ECMAScript module, returning an error if parsing fails. /// /// [spec]: https://tc39.es/ecma262/#sec-parsemodule - pub fn parse( + pub fn parse( src: Source<'_, R>, realm: Option, context: &mut Context, diff --git a/core/engine/src/script.rs b/core/engine/src/script.rs index dfb2b905a2..276fbebd3e 100644 --- a/core/engine/src/script.rs +++ b/core/engine/src/script.rs @@ -8,10 +8,8 @@ //! [spec]: https://tc39.es/ecma262/#sec-scripts //! [script]: https://tc39.es/ecma262/#sec-script-records -use std::io::Read; - use boa_gc::{Finalize, Gc, GcRefCell, Trace}; -use boa_parser::{Parser, Source}; +use boa_parser::{source::ReadChar, Parser, Source}; use boa_profiler::Profiler; use rustc_hash::FxHashMap; @@ -76,7 +74,7 @@ impl Script { /// Parses the provided `src` as an ECMAScript script, returning an error if parsing fails. /// /// [spec]: https://tc39.es/ecma262/#sec-parse-script - pub fn parse( + pub fn parse( src: Source<'_, R>, realm: Option, context: &mut Context, diff --git a/core/engine/src/tests/operators.rs b/core/engine/src/tests/operators.rs index 6d688e1041..553d8135e6 100644 --- a/core/engine/src/tests/operators.rs +++ b/core/engine/src/tests/operators.rs @@ -334,7 +334,7 @@ fn assignment_to_non_assignable_ctd() { TestAction::assert_native_error( src, JsNativeErrorKind::Syntax, - "Invalid left-hand side in assignment at line 1, col 13", + "Invalid left-hand side in assignment at line 1, col 12", ) }), ); @@ -362,7 +362,7 @@ fn multicharacter_assignment_to_non_assignable_ctd() { TestAction::assert_native_error( src, JsNativeErrorKind::Syntax, - "Invalid left-hand side in assignment at line 1, col 13", + "Invalid left-hand side in assignment at line 1, col 12", ) }), ); @@ -397,7 +397,7 @@ fn multicharacter_bitwise_assignment_to_non_assignable_ctd() { TestAction::assert_native_error( src, JsNativeErrorKind::Syntax, - "Invalid left-hand side in assignment at line 1, col 13", + "Invalid left-hand side in assignment at line 1, col 12", ) }), ); diff --git a/core/parser/src/lexer/comment.rs b/core/parser/src/lexer/comment.rs index 6b23e19ed7..aca752df1f 100644 --- a/core/parser/src/lexer/comment.rs +++ b/core/parser/src/lexer/comment.rs @@ -1,10 +1,10 @@ //! Boa's lexing for ECMAScript comments. use crate::lexer::{Cursor, Error, Token, TokenKind, Tokenizer}; +use crate::source::ReadChar; use boa_ast::{Position, Span}; use boa_interner::Interner; use boa_profiler::Profiler; -use std::io::Read; /// Lexes a single line comment. /// @@ -26,7 +26,7 @@ impl Tokenizer for SingleLineComment { _interner: &mut Interner, ) -> Result where - R: Read, + R: ReadChar, { let _timer = Profiler::global().start_event("SingleLineComment", "Lexing"); @@ -66,7 +66,7 @@ impl Tokenizer for MultiLineComment { _interner: &mut Interner, ) -> Result where - R: Read, + R: ReadChar, { let _timer = Profiler::global().start_event("MultiLineComment", "Lexing"); @@ -74,7 +74,7 @@ impl Tokenizer for MultiLineComment { while let Some(ch) = cursor.next_char()? { let tried_ch = char::try_from(ch); match tried_ch { - Ok(c) if c == '*' && cursor.next_is(b'/')? => { + Ok(c) if c == '*' && cursor.next_if(0x2F /* / */)? => { return Ok(Token::new( if new_line { TokenKind::LineTerminator @@ -115,7 +115,7 @@ impl Tokenizer for HashbangComment { _interner: &mut Interner, ) -> Result where - R: Read, + R: ReadChar, { let _timer = Profiler::global().start_event("Hashbang", "Lexing"); diff --git a/core/parser/src/lexer/cursor.rs b/core/parser/src/lexer/cursor.rs index b191781136..20efc3ab07 100644 --- a/core/parser/src/lexer/cursor.rs +++ b/core/parser/src/lexer/cursor.rs @@ -1,15 +1,18 @@ //! Boa's lexer cursor that manages the input byte stream. + +use crate::source::{ReadChar, UTF8Input}; use boa_ast::Position; use boa_profiler::Profiler; -use std::io::{self, Bytes, Error, ErrorKind, Read}; +use std::io::{self, Error, ErrorKind}; /// Cursor over the source code. #[derive(Debug)] pub(super) struct Cursor { - iter: InnerIter, + iter: R, pos: Position, module: bool, strict: bool, + peeked: [Option; 4], } impl Cursor { @@ -19,7 +22,7 @@ impl Cursor { } /// Advances the position to the next column. - pub(super) fn next_column(&mut self) { + fn next_column(&mut self) { let current_line = self.pos.line_number(); let next_column = self.pos.column_number() + 1; self.pos = Position::new(current_line, next_column); @@ -53,62 +56,66 @@ impl Cursor { } } -impl Cursor -where - R: Read, -{ +impl Cursor { /// Creates a new Lexer cursor. pub(super) fn new(inner: R) -> Self { Self { - iter: InnerIter::new(inner.bytes()), + iter: inner, pos: Position::new(1, 1), strict: false, module: false, + peeked: [None; 4], } } /// Creates a new Lexer cursor with an initial position. pub(super) fn with_position(inner: R, pos: Position) -> Self { Self { - iter: InnerIter::new(inner.bytes()), + iter: inner, pos, strict: false, module: false, + peeked: [None; 4], } } - /// Peeks the next byte. - pub(super) fn peek(&mut self) -> Result, Error> { - let _timer = Profiler::global().start_event("cursor::peek()", "Lexing"); - - self.iter.peek_byte() - } - /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). - pub(super) fn peek_n(&mut self, n: u8) -> Result<&[u8], Error> { + pub(super) fn peek_n(&mut self, n: u8) -> Result<&[Option; 4], Error> { let _timer = Profiler::global().start_event("cursor::peek_n()", "Lexing"); - self.iter.peek_n_bytes(n) + let peeked = self.peeked.iter().filter(|c| c.is_some()).count(); + let needs_peek = n as usize - peeked; + + for i in 0..needs_peek { + let next = self.iter.next_char()?; + self.peeked[i + peeked] = next; + } + + Ok(&self.peeked) } /// Peeks the next UTF-8 character in u32 code point. pub(super) fn peek_char(&mut self) -> Result, Error> { let _timer = Profiler::global().start_event("cursor::peek_char()", "Lexing"); - self.iter.peek_char() + if let Some(c) = self.peeked[0] { + return Ok(Some(c)); + } + + let next = self.iter.next_char()?; + self.peeked[0] = next; + Ok(next) } - /// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented. - pub(super) fn next_is(&mut self, byte: u8) -> io::Result { - let _timer = Profiler::global().start_event("cursor::next_is()", "Lexing"); + pub(super) fn next_if(&mut self, c: u32) -> io::Result { + let _timer = Profiler::global().start_event("cursor::next_if()", "Lexing"); - Ok(match self.peek()? { - Some(next) if next == byte => { - self.next_byte()?; - true - } - _ => false, - }) + if self.peek_char()? == Some(c) { + self.next_char()?; + Ok(true) + } else { + Ok(false) + } } /// Applies the predicate to the next character and returns the result. @@ -120,41 +127,30 @@ where where F: Fn(char) -> bool, { - let _timer = Profiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing"); + let _timer = Profiler::global().start_event("cursor::next_is_pred()", "Lexing"); - Ok(match self.peek()? { - Some(byte) if (0..=0x7F).contains(&byte) => pred(char::from(byte)), + Ok(match self.peek_char()? { + Some(byte) if (0..=0x7F).contains(&byte) => + { + #[allow(clippy::cast_possible_truncation)] + pred(char::from(byte as u8)) + } Some(_) | None => false, }) } - /// Applies the predicate to the next UTF-8 character and returns the result. - /// Returns false if there is no next character, otherwise returns the result from the - /// predicate on the ascii char - /// - /// The buffer is not incremented. - #[cfg(test)] - pub(super) fn next_is_char_pred(&mut self, pred: &F) -> io::Result - where - F: Fn(u32) -> bool, - { - let _timer = Profiler::global().start_event("cursor::next_is_char_pred()", "Lexing"); - - Ok(self.peek_char()?.map_or(false, pred)) - } - /// Fills the buffer with all bytes until the stop byte is found. /// Returns error when reaching the end of the buffer. /// /// Note that all bytes up until the stop byte are added to the buffer, including the byte right before. - pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec) -> io::Result<()> { + pub(super) fn take_until(&mut self, stop: u32, buf: &mut Vec) -> io::Result<()> { let _timer = Profiler::global().start_event("cursor::take_until()", "Lexing"); loop { - if self.next_is(stop)? { + if self.next_if(stop)? { return Ok(()); - } else if let Some(byte) = self.next_byte()? { - buf.push(byte); + } else if let Some(c) = self.next_char()? { + buf.push(c); } else { return Err(io::Error::new( ErrorKind::UnexpectedEof, @@ -177,8 +173,9 @@ where loop { if !self.next_is_ascii_pred(pred)? { return Ok(()); - } else if let Some(byte) = self.next_byte()? { - buf.push(byte); + } else if let Some(byte) = self.next_char()? { + #[allow(clippy::cast_possible_truncation)] + buf.push(byte as u8); } else { // next_is_pred will return false if the next value is None so the None case should already be handled. unreachable!(); @@ -186,89 +183,25 @@ where } } - /// Fills the buffer with characters until the first character for which the predicate (pred) is false. - /// It also stops when there is no next character. - /// - /// Note that all characters up until the stop character are added to the buffer, including the character right before. - #[cfg(test)] - pub(super) fn take_while_char_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> - where - F: Fn(u32) -> bool, - { - let _timer = Profiler::global().start_event("cursor::take_while_char_pred()", "Lexing"); - - loop { - if !self.next_is_char_pred(pred)? { - return Ok(()); - } else if let Some(ch) = self.peek_char()? { - for _ in 0..utf8_len(ch) { - buf.push( - self.next_byte()? - .expect("already checked that the next character exists"), - ); - } - } else { - // next_is_pred will return false if the next value is None so the None case should already be handled. - unreachable!(); - } - } - } - - /// It will fill the buffer with bytes. - /// - /// This expects for the buffer to be fully filled. If it's not, it will fail with an - /// `UnexpectedEof` I/O error. - pub(super) fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { - let _timer = Profiler::global().start_event("cursor::fill_bytes()", "Lexing"); - - self.iter.fill_bytes(buf) - } - - /// Retrieves the next byte. - pub(crate) fn next_byte(&mut self) -> Result, Error> { - let _timer = Profiler::global().start_event("cursor::next_byte()", "Lexing"); - - let byte = self.iter.next_byte()?; - - match byte { - Some(b'\r') => { - // Try to take a newline if it's next, for windows "\r\n" newlines - // Otherwise, treat as a Mac OS9 bare '\r' newline - if self.peek()? == Some(b'\n') { - let _next = self.iter.next_byte(); - } - self.next_line(); - } - Some(b'\n') => self.next_line(), - Some(0xE2) => { - // Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) - let next_bytes = self.peek_n(2)?; - if next_bytes == [0x80, 0xA8] || next_bytes == [0x80, 0xA9] { - self.next_line(); - } else { - // 0xE2 is a utf8 first byte - self.next_column(); - } - } - Some(b) if utf8_is_first_byte(b) => self.next_column(), - _ => {} - } - - Ok(byte) - } - /// Retrieves the next UTF-8 character. pub(crate) fn next_char(&mut self) -> Result, Error> { let _timer = Profiler::global().start_event("cursor::next_char()", "Lexing"); - let ch = self.iter.next_char()?; + let ch = if let Some(c) = self.peeked[0] { + self.peeked[0] = None; + self.peeked.rotate_left(1); + Some(c) + } else { + self.iter.next_char()? + }; match ch { Some(0xD) => { // Try to take a newline if it's next, for windows "\r\n" newlines // Otherwise, treat as a Mac OS9 bare '\r' newline - if self.peek()? == Some(0xA) { - let _next = self.iter.next_byte(); + if self.peek_char()? == Some(0xA) { + self.peeked[0] = None; + self.peeked.rotate_left(1); } self.next_line(); } @@ -282,224 +215,8 @@ where } } -/// Inner iterator for a cursor. -#[derive(Debug)] -struct InnerIter { - iter: Bytes, - num_peeked_bytes: u8, - peeked_bytes: [u8; 4], - #[allow(clippy::option_option)] - peeked_char: Option>, -} - -impl InnerIter { - /// Creates a new inner iterator. - const fn new(iter: Bytes) -> Self { - Self { - iter, - num_peeked_bytes: 0, - peeked_bytes: [0; 4], - peeked_char: None, - } - } -} - -impl InnerIter -where - R: Read, -{ - /// It will fill the buffer with checked ascii bytes. - /// - /// This expects for the buffer to be fully filled. If it's not, it will fail with an - /// `UnexpectedEof` I/O error. - fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { - for byte in &mut *buf { - *byte = self.next_byte()?.ok_or_else(|| { - io::Error::new( - io::ErrorKind::UnexpectedEof, - "unexpected EOF when filling buffer", - ) - })?; - } - Ok(()) - } - - /// Increments the iter by n bytes. - fn increment(&mut self, n: u32) -> Result<(), Error> { - for _ in 0..n { - if (self.next_byte()?).is_none() { - break; - } - } - Ok(()) - } - - /// Peeks the next byte. - pub(super) fn peek_byte(&mut self) -> Result, Error> { - if self.num_peeked_bytes > 0 { - let byte = self.peeked_bytes[0]; - Ok(Some(byte)) - } else { - match self.iter.next().transpose()? { - Some(byte) => { - self.num_peeked_bytes = 1; - self.peeked_bytes[0] = byte; - Ok(Some(byte)) - } - None => Ok(None), - } - } - } - - /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). - pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result<&[u8], Error> { - while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 { - match self.iter.next().transpose()? { - Some(byte) => { - self.peeked_bytes[usize::from(self.num_peeked_bytes)] = byte; - self.num_peeked_bytes += 1; - } - None => break, - }; - } - Ok(&self.peeked_bytes[..usize::from(u8::min(n, self.num_peeked_bytes))]) - } - - /// Peeks the next unchecked character in u32 code point. - pub(super) fn peek_char(&mut self) -> Result, Error> { - if let Some(ch) = self.peeked_char { - Ok(ch) - } else { - // Decode UTF-8 - let (x, y, z, w) = match self.peek_n_bytes(4)? { - [b, ..] if *b < 128 => { - let char = u32::from(*b); - self.peeked_char = Some(Some(char)); - return Ok(Some(char)); - } - [] => { - self.peeked_char = None; - return Ok(None); - } - bytes => ( - bytes[0], - bytes.get(1).copied(), - bytes.get(2).copied(), - bytes.get(3).copied(), - ), - }; - - // Multibyte case follows - // Decode from a byte combination out of: [[[x y] z] w] - // NOTE: Performance is sensitive to the exact formulation here - let init = utf8_first_byte(x, 2); - let y = y.unwrap_or_default(); - let mut ch = utf8_acc_cont_byte(init, y); - if x >= 0xE0 { - // [[x y z] w] case - // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid - let z = z.unwrap_or_default(); - let y_z = utf8_acc_cont_byte(u32::from(y & CONT_MASK), z); - ch = init << 12 | y_z; - if x >= 0xF0 { - // [x y z w] case - // use only the lower 3 bits of `init` - let w = w.unwrap_or_default(); - ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); - } - }; - - self.peeked_char = Some(Some(ch)); - Ok(Some(ch)) - } - } - - /// Retrieves the next byte - fn next_byte(&mut self) -> io::Result> { - self.peeked_char = None; - if self.num_peeked_bytes > 0 { - let byte = self.peeked_bytes[0]; - self.num_peeked_bytes -= 1; - self.peeked_bytes.rotate_left(1); - Ok(Some(byte)) - } else { - self.iter.next().transpose() - } - } - - /// Retrieves the next unchecked char in u32 code point. - fn next_char(&mut self) -> io::Result> { - if let Some(ch) = self.peeked_char.take() { - if let Some(c) = ch { - self.increment(utf8_len(c))?; - } - return Ok(ch); - } - - // Decode UTF-8 - let x = match self.next_byte()? { - Some(b) if b < 128 => return Ok(Some(u32::from(b))), - Some(b) => b, - None => return Ok(None), - }; - - // Multibyte case follows - // Decode from a byte combination out of: [[[x y] z] w] - // NOTE: Performance is sensitive to the exact formulation here - let init = utf8_first_byte(x, 2); - let y = unwrap_or_0(self.next_byte()?); - let mut ch = utf8_acc_cont_byte(init, y); - if x >= 0xE0 { - // [[x y z] w] case - // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid - let z = unwrap_or_0(self.next_byte()?); - let y_z = utf8_acc_cont_byte(u32::from(y & CONT_MASK), z); - ch = init << 12 | y_z; - if x >= 0xF0 { - // [x y z w] case - // use only the lower 3 bits of `init` - let w = unwrap_or_0(self.next_byte()?); - ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); - } - }; - - Ok(Some(ch)) - } -} - -/// Mask of the value bits of a continuation byte. -const CONT_MASK: u8 = 0b0011_1111; - -/// Returns the initial codepoint accumulator for the first byte. -/// The first byte is special, only want bottom 5 bits for width 2, 4 bits -/// for width 3, and 3 bits for width 4. -fn utf8_first_byte(byte: u8, width: u32) -> u32 { - u32::from(byte & (0x7F >> width)) -} - -/// Returns the value of `ch` updated with continuation byte `byte`. -fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { - (ch << 6) | u32::from(byte & CONT_MASK) -} - -/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the -/// bits `11`). -const fn utf8_is_first_byte(byte: u8) -> bool { - byte <= 0x7F || (byte >> 6) == 0x11 -} - -fn unwrap_or_0(opt: Option) -> u8 { - opt.unwrap_or(0) -} - -const fn utf8_len(ch: u32) -> u32 { - if ch <= 0x7F { - 1 - } else if ch <= 0x7FF { - 2 - } else if ch <= 0xFFFF { - 3 - } else { - 4 +impl<'a> From<&'a [u8]> for Cursor> { + fn from(input: &'a [u8]) -> Self { + Self::new(UTF8Input::new(input)) } } diff --git a/core/parser/src/lexer/identifier.rs b/core/parser/src/lexer/identifier.rs index a1ff167e90..6251a18b99 100644 --- a/core/parser/src/lexer/identifier.rs +++ b/core/parser/src/lexer/identifier.rs @@ -3,10 +3,10 @@ use crate::lexer::{ token::ContainsEscapeSequence, Cursor, Error, StringLiteral, Token, TokenKind, Tokenizer, }; +use crate::source::ReadChar; use boa_ast::{Keyword, Position, Span}; use boa_interner::Interner; use boa_profiler::Profiler; -use std::io::Read; /// Identifier lexing. /// @@ -60,7 +60,7 @@ impl Tokenizer for Identifier { interner: &mut Interner, ) -> Result where - R: Read, + R: ReadChar, { let _timer = Profiler::global().start_event("Identifier", "Lexing"); @@ -95,12 +95,12 @@ impl Identifier { init: char, ) -> Result<(String, bool), Error> where - R: Read, + R: ReadChar, { let _timer = Profiler::global().start_event("Identifier::take_identifier_name", "Lexing"); let mut contains_escaped_chars = false; - let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? { + let mut identifier_name = if init == '\\' && cursor.next_if(0x75 /* u */)? { let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?; if Self::is_identifier_start(ch) { @@ -119,10 +119,10 @@ impl Identifier { loop { let ch = match cursor.peek_char()? { - Some(0x005C /* \ */) if cursor.peek_n(2)?.get(1) == Some(&0x75) /* u */ => { + Some(0x005C /* \ */) if cursor.peek_n(2)?[1] == Some(0x75) /* u */ => { let pos = cursor.pos(); - let _next = cursor.next_byte(); - let _next = cursor.next_byte(); + let _next = cursor.next_char(); + let _next = cursor.next_char(); let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?; if Self::is_identifier_part(ch) { diff --git a/core/parser/src/lexer/mod.rs b/core/parser/src/lexer/mod.rs index 192c24d88b..01650f899b 100644 --- a/core/parser/src/lexer/mod.rs +++ b/core/parser/src/lexer/mod.rs @@ -41,10 +41,10 @@ use self::{ string::StringLiteral, template::TemplateLiteral, }; +use crate::source::{ReadChar, UTF8Input}; use boa_ast::{Position, Punctuator, Span}; use boa_interner::Interner; use boa_profiler::Profiler; -use std::io::Read; pub use self::{ error::Error, @@ -60,7 +60,7 @@ trait Tokenizer { interner: &mut Interner, ) -> Result where - R: Read; + R: ReadChar; } /// Lexer or tokenizer for the Boa JavaScript Engine. @@ -104,7 +104,7 @@ impl Lexer { /// Creates a new lexer. pub fn new(reader: R) -> Self where - R: Read, + R: ReadChar, { Self { cursor: Cursor::new(reader), @@ -125,18 +125,20 @@ impl Lexer { interner: &mut Interner, ) -> Result where - R: Read, + R: ReadChar, { let _timer = Profiler::global().start_event("lex_slash_token", "Lexing"); - if let Some(c) = self.cursor.peek()? { + if let Some(c) = self.cursor.peek_char()? { match c { - b'/' => { - self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/' + // / + 0x002F => { + self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/' SingleLineComment.lex(&mut self.cursor, start, interner) } - b'*' => { - self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*' + // * + 0x002A => { + self.cursor.next_char()?.expect("* token vanished"); // Consume the '*' MultiLineComment.lex(&mut self.cursor, start, interner) } ch => { @@ -144,9 +146,10 @@ impl Lexer { InputElement::Div | InputElement::TemplateTail => { // Only div punctuator allowed, regex not. - if ch == b'=' { + // = + if ch == 0x003D { // Indicates this is an AssignDiv. - self.cursor.next_byte()?.expect("= token vanished"); // Consume the '=' + self.cursor.next_char()?.expect("= token vanished"); // Consume the '=' Ok(Token::new( Punctuator::AssignDiv.into(), Span::new(start, self.cursor.pos()), @@ -176,7 +179,7 @@ impl Lexer { /// Skips an HTML close comment (`-->`) if the `annex-b` feature is enabled. pub(crate) fn skip_html_close(&mut self, interner: &mut Interner) -> Result<(), Error> where - R: Read, + R: ReadChar, { if cfg!(not(feature = "annex-b")) || self.module() { return Ok(()); @@ -186,10 +189,11 @@ impl Lexer { let _next = self.cursor.next_char(); } - if self.cursor.peek_n(3)? == [b'-', b'-', b'>'] { - let _next = self.cursor.next_byte(); - let _next = self.cursor.next_byte(); - let _next = self.cursor.next_byte(); + // --> + if self.cursor.peek_n(3)?[..3] == [Some(0x2D), Some(0x2D), Some(0x3E)] { + let _next = self.cursor.next_char(); + let _next = self.cursor.next_char(); + let _next = self.cursor.next_char(); let start = self.cursor.pos(); SingleLineComment.lex(&mut self.cursor, start, interner)?; @@ -206,7 +210,7 @@ impl Lexer { // We intentionally don't implement Iterator trait as Result