Fix unicode escape in identifiers (#1102)

4 years ago · 08f232fe99
5 changed files with 140 additions and 39 deletions
--- a/boa/src/syntax/ast/keyword.rs
+++ b/boa/src/syntax/ast/keyword.rs
@ -199,6 +199,16 @@ pub enum Keyword {
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Classes/extends
    Extends,

+    /// The `false` keyword.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///  - [MDN documentation][mdn]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
+    False,
+
    /// The `finally` keyword.
    ///
    /// More information:
@ -301,6 +311,16 @@ pub enum Keyword {
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/new
    New,

+    /// The `null` keyword.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///  - [MDN documentation][mdn]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-NullLiteral
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/null
+    Null,
+
    /// The `of` keyword.
    ///
    /// More information:
@ -369,6 +389,16 @@ pub enum Keyword {
    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/throw
    Throw,

+    /// The `true` keyword.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///  - [MDN documentation][mdn]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral
+    /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean
+    True,
+
    /// The `try` keyword.
    ///
    /// More information:
@ -479,6 +509,7 @@ impl Keyword {
            Self::Enum => "enum",
            Self::Extends => "extends",
            Self::Export => "export",
+            Self::False => "false",
            Self::Finally => "finally",
            Self::For => "for",
            Self::Function => "function",
@ -488,12 +519,14 @@ impl Keyword {
            Self::Import => "import",
            Self::Let => "let",
            Self::New => "new",
+            Self::Null => "null",
            Self::Of => "of",
            Self::Return => "return",
            Self::Super => "super",
            Self::Switch => "switch",
            Self::This => "this",
            Self::Throw => "throw",
+            Self::True => "true",
            Self::Try => "try",
            Self::TypeOf => "typeof",
            Self::Var => "var",
@ -552,6 +585,7 @@ impl FromStr for Keyword {
            "enum" => Ok(Self::Enum),
            "extends" => Ok(Self::Extends),
            "export" => Ok(Self::Export),
+            "false" => Ok(Self::False),
            "finally" => Ok(Self::Finally),
            "for" => Ok(Self::For),
            "function" => Ok(Self::Function),
@ -561,12 +595,14 @@ impl FromStr for Keyword {
            "import" => Ok(Self::Import),
            "let" => Ok(Self::Let),
            "new" => Ok(Self::New),
+            "null" => Ok(Self::Null),
            "of" => Ok(Self::Of),
            "return" => Ok(Self::Return),
            "super" => Ok(Self::Super),
            "switch" => Ok(Self::Switch),
            "this" => Ok(Self::This),
            "throw" => Ok(Self::Throw),
+            "true" => Ok(Self::True),
            "try" => Ok(Self::Try),
            "typeof" => Ok(Self::TypeOf),
            "var" => Ok(Self::Var),
--- a/boa/src/syntax/lexer/cursor.rs
+++ b/boa/src/syntax/lexer/cursor.rs
@ -130,6 +130,7 @@ where
    /// predicate on the ascii char
    ///
    /// The buffer is not incremented.
+    #[allow(dead_code)]
    #[inline]
    pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
    where
@ -191,6 +192,7 @@ where
    /// It also stops when there is no next character.
    ///
    /// Note that all characters up until the stop character are added to the buffer, including the character right before.
+    #[allow(dead_code)]
    pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
    where
        F: Fn(u32) -> bool,
--- a/boa/src/syntax/lexer/identifier.rs
+++ b/boa/src/syntax/lexer/identifier.rs
@ -5,7 +5,7 @@ use crate::{
    profiler::BoaProfiler,
    syntax::{
        ast::{Keyword, Position, Span},
-        lexer::{Token, TokenKind},
+        lexer::{StringLiteral, Token, TokenKind},
    },
 };
 use boa_unicode::UnicodeProperties;
@ -86,43 +86,100 @@ impl<R> Tokenizer<R> for Identifier {
    {
        let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");

-        let mut init_buf = [0u8; 4];
-        let mut buf = Vec::new();
-        self.init.encode_utf8(&mut init_buf);
-        buf.extend(init_buf.iter().take(self.init.len_utf8()));
-
-        cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;
-
-        let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
-        let tk = match token_str {
-            "true" => TokenKind::BooleanLiteral(true),
-            "false" => TokenKind::BooleanLiteral(false),
-            "null" => TokenKind::NullLiteral,
-            slice => {
-                if let Ok(keyword) = slice.parse() {
-                    if cursor.strict_mode() && keyword == Keyword::With {
-                        return Err(Error::Syntax(
-                            "using 'with' statement not allowed in strict mode".into(),
-                            start_pos,
-                        ));
-                    }
-                    TokenKind::Keyword(keyword)
-                } else {
-                    if cursor.strict_mode() && STRICT_FORBIDDEN_IDENTIFIERS.contains(&slice) {
-                        return Err(Error::Syntax(
-                            format!(
-                                "using future reserved keyword '{}' not allowed in strict mode",
-                                slice
-                            )
-                            .into(),
-                            start_pos,
-                        ));
-                    }
-                    TokenKind::identifier(slice)
-                }
+        let (identifier_name, contains_escaped_chars) =
+            Self::take_identifier_name(cursor, start_pos, self.init)?;
+
+        let token_kind = if let Ok(keyword) = identifier_name.parse() {
+            if contains_escaped_chars {
+                return Err(Error::Syntax(
+                    "unicode escaped characters are not allowed in keyword".into(),
+                    start_pos,
+                ));
+            }
+
+            if cursor.strict_mode() && keyword == Keyword::With {
+                return Err(Error::Syntax(
+                    "using 'with' statement not allowed in strict mode".into(),
+                    start_pos,
+                ));
+            }
+
+            match keyword {
+                Keyword::True => TokenKind::BooleanLiteral(true),
+                Keyword::False => TokenKind::BooleanLiteral(false),
+                Keyword::Null => TokenKind::NullLiteral,
+                _ => TokenKind::Keyword(keyword),
            }
+        } else {
+            if cursor.strict_mode()
+                && STRICT_FORBIDDEN_IDENTIFIERS.contains(&identifier_name.as_str())
+            {
+                return Err(Error::Syntax(
+                    format!(
+                        "using future reserved keyword '{}' not allowed in strict mode",
+                        identifier_name
+                    )
+                    .into(),
+                    start_pos,
+                ));
+            }
+            TokenKind::identifier(identifier_name.into_boxed_str())
        };

-        Ok(Token::new(tk, Span::new(start_pos, cursor.pos())))
+        Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos())))
+    }
+}
+
+impl Identifier {
+    /// Reads an identifier name from `cursor`, decoding `\u` escape sequences.
+    ///
+    /// `init` is the character the caller has already taken from the input. When it is a
+    /// backslash and `cursor.next_is(b'u')` succeeds, a unicode escape sequence is decoded
+    /// and must yield a valid identifier start; otherwise `init` itself becomes the first
+    /// character of the name. Following characters are appended for as long as they are
+    /// valid identifier parts, written either literally or as unicode escape sequences.
+    ///
+    /// Returns the decoded name together with a flag that is `true` when at least one
+    /// character of the name was written as an escape sequence.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::Syntax` when an escape sequence decodes to a code point that is not
+    /// a valid identifier start (first character) or identifier part (later characters).
+    /// Errors reported by the cursor or by `StringLiteral::take_unicode_escape_sequence`
+    /// are propagated.
+    #[inline]
+    fn take_identifier_name<R>(
+        cursor: &mut Cursor<R>,
+        start_pos: Position,
+        init: char,
+    ) -> Result<(String, bool), Error>
+    where
+        R: Read,
+    {
+        let mut contains_escaped_chars = false;
+        let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? {
+            let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?;
+
+            if Self::is_identifier_start(ch) {
+                contains_escaped_chars = true;
+                // NOTE(review): relies on `is_identifier_start` only accepting code points
+                // that convert to `char` - confirm against its definition.
+                String::from(char::try_from(ch).unwrap())
+            } else {
+                return Err(Error::Syntax("invalid identifier start".into(), start_pos));
+            }
+        } else {
+            // The caller guarantees that `init` is a valid identifier start
+            String::from(init)
+        };
+
+        loop {
+            let ch = match cursor.peek_char()? {
+                Some(0x005C /* \ */) if cursor.peek_n(2)? >> 8 == 0x0075 /* u */ => {
+                    let pos = cursor.pos();
+                    // Step over the `\` and `u` that were just peeked. Propagate cursor
+                    // errors instead of discarding them, like every other read here.
+                    let _ = cursor.next_byte()?;
+                    let _ = cursor.next_byte()?;
+                    let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?;
+
+                    if Self::is_identifier_part(ch) {
+                        contains_escaped_chars = true;
+                        ch
+                    } else {
+                        return Err(Error::Syntax("invalid identifier part".into(), pos));
+                    }
+                }
+                Some(ch) if Self::is_identifier_part(ch) => {
+                    let _ = cursor.next_char()?;
+                    ch
+                }
+                // End of input or a character that cannot continue the identifier.
+                _ => break,
+            };
+
+            // NOTE(review): `ch` passed `is_identifier_part` in both arms above; this
+            // assumes that check implies a valid `char` - confirm.
+            identifier_name.push(char::try_from(ch).unwrap());
+        }
+
+        Ok((identifier_name, contains_escaped_chars))
    }
 }
--- a/boa/src/syntax/lexer/mod.rs
+++ b/boa/src/syntax/lexer/mod.rs
@ -246,12 +246,15 @@ impl<R> Lexer<R> {
                '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
                    Operator::new(next_ch as u8).lex(&mut self.cursor, start)
                }
-                _ if c.is_digit(10) => {
-                    NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
+                '\\' if self.cursor.peek()? == Some(b'u') => {
+                    Identifier::new(c).lex(&mut self.cursor, start)
                }
                _ if Identifier::is_identifier_start(c as u32) => {
                    Identifier::new(c).lex(&mut self.cursor, start)
                }
+                _ if c.is_digit(10) => {
+                    NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
+                }
                _ => {
                    let details = format!(
                        "unexpected '{}' at line {}, column {}",
--- a/boa/src/syntax/lexer/tests.rs
+++ b/boa/src/syntax/lexer/tests.rs
@ -73,7 +73,7 @@ fn check_multi_line_comment() {

 #[test]
 fn check_identifier() {
-    let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
+    let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D} \\u0078 \\u0078\\u0078 \\u{0078}x\\u{0078}";
    let mut lexer = Lexer::new(s.as_bytes());

    let expected = [
@ -86,6 +86,9 @@ fn check_identifier() {
        TokenKind::identifier("Ѐ"),
        TokenKind::identifier("ЀЀ"),
        TokenKind::identifier("x\u{200C}\u{200D}"),
+        TokenKind::identifier("x"),
+        TokenKind::identifier("xx"),
+        TokenKind::identifier("xxx"),
    ];

    expect_tokens(&mut lexer, &expected);