From 6d866f8012df40903fa0bb664b71d899cc7e1be3 Mon Sep 17 00:00:00 2001 From: tofpie <75836434+tofpie@users.noreply.github.com> Date: Fri, 11 Dec 2020 16:18:12 +0000 Subject: [PATCH] Fix Unicode character escape sequence parsing (#959) Co-authored-by: tofpie --- boa/src/syntax/lexer/string.rs | 4 --- boa/src/syntax/lexer/tests.rs | 47 ++++++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index ad85c40e10..ab07cc9552 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -111,14 +111,10 @@ impl Tokenizer for StringLiteral { b'u' => { // Support \u{X..X} (Unicode Codepoint) if cursor.next_is(b'{')? { - cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'. - // TODO: use bytes for a bit better performance (using stack) let mut code_point_buf = Vec::with_capacity(6); cursor.take_until(b'}', &mut code_point_buf)?; - cursor.next_byte()?.expect("} character vanished"); // Consume the '}'. - let code_point_str = unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index 4dd22e194c..accc078c9d 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -259,7 +259,7 @@ fn check_positions() { #[test] fn check_positions_codepoint() { - let s = r#"console.log("hello world\u{{2764}}"); // Test"#; + let s = r#"console.log("hello world\u{2764}"); // Test"#; // --------123456789 let mut lexer = Lexer::new(s.as_bytes()); @@ -281,19 +281,19 @@ fn check_positions_codepoint() { // String token starts on column 13 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 13), (1, 36)) + span((1, 13), (1, 34)) ); - // Close parenthesis token starts on column 36 + // Close parenthesis token starts on column 34 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 36), (1, 37)) + span((1, 34), (1, 35)) ); - // Semi Colon token starts on column 37 + // Semi Colon token starts on column 35 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 37), (1, 38)) + span((1, 35), (1, 36)) ); } @@ -702,10 +702,10 @@ fn codepoint_with_no_braces() { fn illegal_code_point_following_numeric_literal() { // Checks as per https://tc39.es/ecma262/#sec-literals-numeric-literals that a NumericLiteral cannot // be immediately followed by an IdentifierStart where the IdentifierStart - let mut lexer = Lexer::new(&br#"17.4\u{{2764}}"#[..]); + let mut lexer = Lexer::new(&br#"17.4\u{2764}"#[..]); assert!( lexer.next().is_err(), - "IdentifierStart \\u{{2764}} following NumericLiteral not rejected as expected" + "IdentifierStart \\u{2764} following NumericLiteral not rejected as expected" ); } @@ -723,6 +723,37 @@ fn non_english_str() { expect_tokens(&mut lexer, &expected); } +#[test] +fn unicode_escape_with_braces() { + let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]); + + let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())]; + + expect_tokens(&mut lexer, &expected); + + lexer = Lexer::new(&br#"\u{{a0}"#[..]); + + if let Error::Syntax(_, pos) = lexer + .next() + .expect_err("Malformed Unicode character sequence expected") + { + assert_eq!(pos, Position::new(1, 1)); + } else { + panic!("invalid error type"); + } + + lexer = Lexer::new(&br#"\u{{a0}}"#[..]); + + if let Error::Syntax(_, pos) = lexer + .next() + .expect_err("Malformed Unicode character sequence expected") + { + assert_eq!(pos, Position::new(1, 1)); + } else { + panic!("invalid error type"); + } +} + mod carriage_return { use super::*;