Browse Source

Fix unicode escape in identifiers (#1102)

pull/1278/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
08f232fe99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 36
      boa/src/syntax/ast/keyword.rs
  2. 2
      boa/src/syntax/lexer/cursor.rs
  3. 129
      boa/src/syntax/lexer/identifier.rs
  4. 7
      boa/src/syntax/lexer/mod.rs
  5. 5
      boa/src/syntax/lexer/tests.rs

36
boa/src/syntax/ast/keyword.rs

@ -199,6 +199,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Classes/extends
Extends,
/// The `false` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
False,
/// The `finally` keyword.
///
/// More information:
@ -301,6 +311,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/new
New,
/// The `null` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-NullLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/null
Null,
/// The `of` keyword.
///
/// More information:
@ -369,6 +389,16 @@ pub enum Keyword {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/throw
Throw,
/// The `true` keyword.
///
/// More information:
/// - [ECMAScript reference][spec]
/// - [MDN documentation][mdn]
///
/// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
True,
/// The `try` keyword.
///
/// More information:
@ -479,6 +509,7 @@ impl Keyword {
Self::Enum => "enum",
Self::Extends => "extends",
Self::Export => "export",
Self::False => "false",
Self::Finally => "finally",
Self::For => "for",
Self::Function => "function",
@ -488,12 +519,14 @@ impl Keyword {
Self::Import => "import",
Self::Let => "let",
Self::New => "new",
Self::Null => "null",
Self::Of => "of",
Self::Return => "return",
Self::Super => "super",
Self::Switch => "switch",
Self::This => "this",
Self::Throw => "throw",
Self::True => "true",
Self::Try => "try",
Self::TypeOf => "typeof",
Self::Var => "var",
@ -552,6 +585,7 @@ impl FromStr for Keyword {
"enum" => Ok(Self::Enum),
"extends" => Ok(Self::Extends),
"export" => Ok(Self::Export),
"false" => Ok(Self::False),
"finally" => Ok(Self::Finally),
"for" => Ok(Self::For),
"function" => Ok(Self::Function),
@ -561,12 +595,14 @@ impl FromStr for Keyword {
"import" => Ok(Self::Import),
"let" => Ok(Self::Let),
"new" => Ok(Self::New),
"null" => Ok(Self::Null),
"of" => Ok(Self::Of),
"return" => Ok(Self::Return),
"super" => Ok(Self::Super),
"switch" => Ok(Self::Switch),
"this" => Ok(Self::This),
"throw" => Ok(Self::Throw),
"true" => Ok(Self::True),
"try" => Ok(Self::Try),
"typeof" => Ok(Self::TypeOf),
"var" => Ok(Self::Var),

2
boa/src/syntax/lexer/cursor.rs

@ -130,6 +130,7 @@ where
/// predicate on the ascii char
///
/// The buffer is not incremented.
#[allow(dead_code)]
#[inline]
pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
@ -191,6 +192,7 @@ where
/// It also stops when there is no next character.
///
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
#[allow(dead_code)]
pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(u32) -> bool,

129
boa/src/syntax/lexer/identifier.rs

@ -5,7 +5,7 @@ use crate::{
profiler::BoaProfiler,
syntax::{
ast::{Keyword, Position, Span},
lexer::{Token, TokenKind},
lexer::{StringLiteral, Token, TokenKind},
},
};
use boa_unicode::UnicodeProperties;
@ -86,43 +86,100 @@ impl<R> Tokenizer<R> for Identifier {
{
let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");
let mut init_buf = [0u8; 4];
let mut buf = Vec::new();
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));
cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;
let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
"true" => TokenKind::BooleanLiteral(true),
"false" => TokenKind::BooleanLiteral(false),
"null" => TokenKind::NullLiteral,
slice => {
if let Ok(keyword) = slice.parse() {
if cursor.strict_mode() && keyword == Keyword::With {
return Err(Error::Syntax(
"using 'with' statement not allowed in strict mode".into(),
start_pos,
));
}
TokenKind::Keyword(keyword)
} else {
if cursor.strict_mode() && STRICT_FORBIDDEN_IDENTIFIERS.contains(&slice) {
return Err(Error::Syntax(
format!(
"using future reserved keyword '{}' not allowed in strict mode",
slice
)
.into(),
start_pos,
));
}
TokenKind::identifier(slice)
}
let (identifier_name, contains_escaped_chars) =
Self::take_identifier_name(cursor, start_pos, self.init)?;
let token_kind = if let Ok(keyword) = identifier_name.parse() {
if contains_escaped_chars {
return Err(Error::Syntax(
"unicode escaped characters are not allowed in keyword".into(),
start_pos,
));
}
if cursor.strict_mode() && keyword == Keyword::With {
return Err(Error::Syntax(
"using 'with' statement not allowed in strict mode".into(),
start_pos,
));
}
match keyword {
Keyword::True => TokenKind::BooleanLiteral(true),
Keyword::False => TokenKind::BooleanLiteral(false),
Keyword::Null => TokenKind::NullLiteral,
_ => TokenKind::Keyword(keyword),
}
} else {
if cursor.strict_mode()
&& STRICT_FORBIDDEN_IDENTIFIERS.contains(&identifier_name.as_str())
{
return Err(Error::Syntax(
format!(
"using future reserved keyword '{}' not allowed in strict mode",
identifier_name
)
.into(),
start_pos,
));
}
TokenKind::identifier(identifier_name.into_boxed_str())
};
Ok(Token::new(tk, Span::new(start_pos, cursor.pos())))
Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
}
}
impl Identifier {
#[inline]
fn take_identifier_name<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
init: char,
) -> Result<(String, bool), Error>
where
R: Read,
{
let mut contains_escaped_chars = false;
let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? {
let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;
if Self::is_identifier_start(ch) {
contains_escaped_chars = true;
String::from(char::try_from(ch).unwrap())
} else {
return Err(Error::Syntax("invalid identifier start".into(), start_pos));
}
} else {
// The caller guarantees that `init` is a valid identifier start
String::from(init)
};
loop {
let ch = match cursor.peek_char()? {
Some(0x005C /* \ */) if cursor.peek_n(2)? >> 8 == 0x0075 /* u */ => {
let pos = cursor.pos();
let _ = cursor.next_byte();
let _ = cursor.next_byte();
let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?;
if Self::is_identifier_part(ch) {
contains_escaped_chars = true;
ch
} else {
return Err(Error::Syntax("invalid identifier part".into(), pos));
}
}
Some(ch) if Self::is_identifier_part(ch) => {
let _ = cursor.next_char()?;
ch
},
_ => break,
};
identifier_name.push(char::try_from(ch).unwrap());
}
Ok((identifier_name, contains_escaped_chars))
}
}

7
boa/src/syntax/lexer/mod.rs

@ -246,12 +246,15 @@ impl<R> Lexer<R> {
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
'\\' if self.cursor.peek()? == Some(b'u') => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ if Identifier::is_identifier_start(c as u32) => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",

5
boa/src/syntax/lexer/tests.rs

@ -73,7 +73,7 @@ fn check_multi_line_comment() {
#[test]
fn check_identifier() {
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D} \\u0078 \\u0078\\u0078 \\u{0078}x\\u{0078}";
let mut lexer = Lexer::new(s.as_bytes());
let expected = [
@ -86,6 +86,9 @@ fn check_identifier() {
TokenKind::identifier("Ѐ"),
TokenKind::identifier("ЀЀ"),
TokenKind::identifier("x\u{200C}\u{200D}"),
TokenKind::identifier("x"),
TokenKind::identifier("xx"),
TokenKind::identifier("xxx"),
];
expect_tokens(&mut lexer, &expected);

Loading…
Cancel
Save