mirror of https://github.com/boa-dev/boa.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
308 lines
10 KiB
308 lines
10 KiB
//! A lexical analyzer for JavaScript source code.
//!
//! This module contains the Boa lexer or tokenizer implementation.
//!
//! The Lexer splits its input source code into a sequence of input elements called tokens,
//! represented by the [Token](../ast/token/struct.Token.html) structure. It also skips
//! whitespace and discards comments, so neither is handed to the parser.
//!
//! This is tightly coupled with the parser due to the JavaScript goal-symbol requirements
//! as documented by the spec.
//!
//! More information:
//!  - [ECMAScript reference][spec]
//!
//! [spec]: https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar
|
|
|
mod comment; |
|
mod cursor; |
|
pub mod error; |
|
mod identifier; |
|
mod number; |
|
mod operator; |
|
mod regex; |
|
mod spread; |
|
mod string; |
|
mod template; |
|
pub mod token; |
|
|
|
#[cfg(test)] |
|
mod tests; |
|
|
|
use self::{ |
|
comment::{MultiLineComment, SingleLineComment}, |
|
cursor::Cursor, |
|
identifier::Identifier, |
|
number::NumberLiteral, |
|
operator::Operator, |
|
regex::RegexLiteral, |
|
spread::SpreadLiteral, |
|
string::StringLiteral, |
|
template::TemplateLiteral, |
|
}; |
|
use crate::syntax::ast::{Punctuator, Span}; |
|
pub use crate::{profiler::BoaProfiler, syntax::ast::Position}; |
|
use core::convert::TryFrom; |
|
pub use error::Error; |
|
use std::io::Read; |
|
pub use token::{Token, TokenKind}; |
|
|
|
trait Tokenizer<R> { |
|
/// Lexes the next token. |
|
fn lex(&mut self, cursor: &mut Cursor<R>, start_pos: Position) -> Result<Token, Error> |
|
where |
|
R: Read; |
|
} |
|
|
|
/// Lexer or tokenizer for the Boa JavaScript Engine. |
|
#[derive(Debug)] |
|
pub struct Lexer<R> { |
|
cursor: Cursor<R>, |
|
goal_symbol: InputElement, |
|
} |
|
|
|
impl<R> Lexer<R> { |
|
/// Checks whether a code point is white space as defined by the ECMAScript standard.
///
/// Rust's `char::is_whitespace` and ECMAScript use different sets of characters:
///  * Rust uses `\p{White_Space}`,
///  * ECMAScript uses `\p{Space_Separator}` plus `\u{0009}`, `\u{000B}`, `\u{000C}`
///    and `\u{FEFF}`.
///
/// Line terminators such as `\n` are deliberately not part of this set.
///
/// [More information](https://tc39.es/ecma262/#sec-white-space)
fn is_whitespace(ch: u32) -> bool {
    matches!(
        ch,
        // Code points the standard lists explicitly: <TAB>, <VT>, <FF> and <ZWNBSP>.
        0x0009 | 0x000B | 0x000C | 0xFEFF |
        // Every member of the Unicode `Space_Separator` (Zs) category.
        0x0020 | 0x00A0 | 0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000
    )
}
|
|
|
/// Sets the goal symbol for the lexer. |
|
#[inline] |
|
pub(crate) fn set_goal(&mut self, elm: InputElement) { |
|
self.goal_symbol = elm; |
|
} |
|
|
|
/// Gets the goal symbol the lexer is currently using. |
|
#[inline] |
|
pub(crate) fn get_goal(&self) -> InputElement { |
|
self.goal_symbol |
|
} |
|
|
|
#[inline] |
|
pub(super) fn strict_mode(&self) -> bool { |
|
self.cursor.strict_mode() |
|
} |
|
|
|
#[inline] |
|
pub(super) fn set_strict_mode(&mut self, strict_mode: bool) { |
|
self.cursor.set_strict_mode(strict_mode) |
|
} |
|
|
|
/// Creates a new lexer. |
|
#[inline] |
|
pub fn new(reader: R) -> Self |
|
where |
|
R: Read, |
|
{ |
|
Self { |
|
cursor: Cursor::new(reader), |
|
goal_symbol: Default::default(), |
|
} |
|
} |
|
|
|
// Handles lexing of a token starting '/' with the '/' already being consumed. |
|
// This could be a divide symbol or the start of a regex. |
|
// |
|
// A '/' symbol can always be a comment but if as tested above it is not then |
|
// that means it could be multiple different tokens depending on the input token. |
|
// |
|
// As per https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar |
|
pub(crate) fn lex_slash_token(&mut self, start: Position) -> Result<Token, Error> |
|
where |
|
R: Read, |
|
{ |
|
let _timer = BoaProfiler::global().start_event("lex_slash_token", "Lexing"); |
|
|
|
if let Some(c) = self.cursor.peek()? { |
|
match c { |
|
b'/' => { |
|
self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/' |
|
SingleLineComment.lex(&mut self.cursor, start) |
|
} |
|
b'*' => { |
|
self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*' |
|
MultiLineComment.lex(&mut self.cursor, start) |
|
} |
|
ch => { |
|
match self.get_goal() { |
|
InputElement::Div | InputElement::TemplateTail => { |
|
// Only div punctuator allowed, regex not. |
|
|
|
if ch == b'=' { |
|
// Indicates this is an AssignDiv. |
|
self.cursor.next_byte()?.expect("= token vanished"); // Consume the '=' |
|
Ok(Token::new( |
|
Punctuator::AssignDiv.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)) |
|
} else { |
|
Ok(Token::new( |
|
Punctuator::Div.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)) |
|
} |
|
} |
|
InputElement::RegExp | InputElement::RegExpOrTemplateTail => { |
|
// Can be a regular expression. |
|
RegexLiteral.lex(&mut self.cursor, start) |
|
} |
|
} |
|
} |
|
} |
|
} else { |
|
Err(Error::syntax( |
|
"Abrupt end: Expecting Token /,*,= or regex", |
|
start, |
|
)) |
|
} |
|
} |
|
|
|
/// Retrieves the next token from the lexer. |
|
// We intentionally don't implement Iterator trait as Result<Option> is cleaner to handle. |
|
#[allow(clippy::should_implement_trait)] |
|
pub fn next(&mut self) -> Result<Option<Token>, Error> |
|
where |
|
R: Read, |
|
{ |
|
let _timer = BoaProfiler::global().start_event("next()", "Lexing"); |
|
|
|
let (start, next_ch) = loop { |
|
let start = self.cursor.pos(); |
|
if let Some(next_ch) = self.cursor.next_char()? { |
|
// Ignore whitespace |
|
if !Self::is_whitespace(next_ch) { |
|
break (start, next_ch); |
|
} |
|
} else { |
|
return Ok(None); |
|
} |
|
}; |
|
|
|
if let Ok(c) = char::try_from(next_ch) { |
|
let token = match c { |
|
'\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new( |
|
TokenKind::LineTerminator, |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
'"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start), |
|
'`' => TemplateLiteral.lex(&mut self.cursor, start), |
|
';' => Ok(Token::new( |
|
Punctuator::Semicolon.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
':' => Ok(Token::new( |
|
Punctuator::Colon.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
'.' => { |
|
if self.cursor.peek()?.map(|c| (b'0'..=b'9').contains(&c)) == Some(true) { |
|
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) |
|
} else { |
|
SpreadLiteral::new().lex(&mut self.cursor, start) |
|
} |
|
} |
|
'(' => Ok(Token::new( |
|
Punctuator::OpenParen.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
')' => Ok(Token::new( |
|
Punctuator::CloseParen.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
',' => Ok(Token::new( |
|
Punctuator::Comma.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
'{' => Ok(Token::new( |
|
Punctuator::OpenBlock.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
'}' => Ok(Token::new( |
|
Punctuator::CloseBlock.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
'[' => Ok(Token::new( |
|
Punctuator::OpenBracket.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
']' => Ok(Token::new( |
|
Punctuator::CloseBracket.into(), |
|
Span::new(start, self.cursor.pos()), |
|
)), |
|
'/' => self.lex_slash_token(start), |
|
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => { |
|
Operator::new(next_ch as u8).lex(&mut self.cursor, start) |
|
} |
|
_ if c.is_digit(10) => { |
|
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) |
|
} |
|
_ if Identifier::is_identifier_start(c as u32) => { |
|
Identifier::new(c).lex(&mut self.cursor, start) |
|
} |
|
_ => { |
|
let details = format!( |
|
"unexpected '{}' at line {}, column {}", |
|
c, |
|
start.line_number(), |
|
start.column_number() |
|
); |
|
Err(Error::syntax(details, start)) |
|
} |
|
}?; |
|
|
|
if token.kind() == &TokenKind::Comment { |
|
// Skip comment |
|
self.next() |
|
} else { |
|
Ok(Some(token)) |
|
} |
|
} else { |
|
Err(Error::syntax( |
|
format!( |
|
"unexpected utf-8 char '\\u{}' at line {}, column {}", |
|
next_ch, |
|
start.line_number(), |
|
start.column_number() |
|
), |
|
start, |
|
)) |
|
} |
|
} |
|
|
|
pub(crate) fn lex_template(&mut self, start: Position) -> Result<Token, Error> |
|
where |
|
R: Read, |
|
{ |
|
TemplateLiteral.lex(&mut self.cursor, start) |
|
} |
|
} |
|
|
|
/// ECMAScript goal symbols.
///
/// The active goal symbol tells the lexer how an ambiguous `/` has to be tokenized.
///
/// <https://tc39.es/ecma262/#sec-ecmascript-language-lexical-grammar>
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum InputElement {
    /// A `/` is a division punctuator (`/` or `/=`); a regular expression is not allowed.
    Div,
    /// A `/` may start a regular expression literal.
    RegExp,
    /// A `/` may start a regular expression literal.
    // NOTE(review): presumably also permits a template tail, as the name says — confirm
    // against the parser.
    RegExpOrTemplateTail,
    /// A `/` is a division punctuator (`/` or `/=`); a regular expression is not allowed.
    // NOTE(review): presumably also permits a template tail, as the name says — confirm
    // against the parser.
    TemplateTail,
}
|
|
|
impl Default for InputElement { |
|
fn default() -> Self { |
|
InputElement::RegExp |
|
} |
|
}
|
|
|