diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 6f20599811..b4542a70d4 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -58,171 +58,273 @@ impl Tokenizer for StringLiteral { let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); let (lit, span) = - unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?; + Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?; Ok(Token::new(TokenKind::string_literal(lit), span)) } } -pub(super) fn unescape_string( - cursor: &mut Cursor, - start_pos: Position, - terminator: StringTerminator, - strict_mode: bool, -) -> Result<(String, Span), Error> -where - R: Read, -{ - let mut buf = Vec::new(); - loop { - let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap(); - - match next_chr { - Some('\'') if terminator == StringTerminator::SingleQuote => { - break; - } - Some('"') if terminator == StringTerminator::DoubleQuote => { - break; - } - Some('\\') => { - let _timer = - BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing"); +impl StringLiteral { + /// Checks if a character is LineTerminator as per ECMAScript standards. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// + /// [spec]: https://tc39.es/ecma262/#prod-LineTerminator + #[inline] + pub(super) fn is_line_terminator(ch: char) -> bool { + matches!( + ch, + '\u{000A}' /* */ | '\u{000D}' /* */ | '\u{2028}' /* */ | '\u{2029}' /* */ + ) + } - let escape = cursor.peek()?.ok_or_else(|| { - Error::from(io::Error::new( - ErrorKind::UnexpectedEof, - "unterminated escape sequence in literal", - )) - })?; + pub(super) fn take_string_characters( + cursor: &mut Cursor, + start_pos: Position, + terminator: StringTerminator, + strict_mode: bool, + ) -> Result<(String, Span), Error> + where + R: Read, + { + let mut buf = Vec::new(); + loop { + let ch_start_pos = cursor.pos(); + let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap(); + + match ch { + Some('\'') if terminator == StringTerminator::SingleQuote => { + break; + } + Some('"') if terminator == StringTerminator::DoubleQuote => { + break; + } + None if terminator == StringTerminator::End => { + break; + } + Some('\\') => { + let _timer = BoaProfiler::global() + .start_event("StringLiteral - escape sequence", "Lexing"); - if escape <= 0x7f { - let _ = cursor.next_byte()?; - match escape { - b'\n' => (), - b'n' => buf.push('\n' as u16), - b'r' => buf.push('\r' as u16), - b't' => buf.push('\t' as u16), - b'b' => buf.push('\x08' as u16), - b'f' => buf.push('\x0c' as u16), - b'0' => buf.push('\0' as u16), - b'x' => { - let mut code_point_utf8_bytes = [0u8; 2]; - cursor.fill_bytes(&mut code_point_utf8_bytes)?; - let code_point_str = str::from_utf8(&code_point_utf8_bytes) - .expect("malformed Hexadecimal character escape sequence"); - let code_point = - u16::from_str_radix(&code_point_str, 16).map_err(|_| { - Error::syntax( - "invalid Hexadecimal escape sequence", - cursor.pos(), - ) - })?; - - buf.push(code_point); + let escape_ch = cursor + .next_char()? + .and_then(|byte| char::try_from(byte).ok()) + .ok_or_else(|| { + Error::from(io::Error::new( + ErrorKind::UnexpectedEof, + "unterminated escape sequence in literal", + )) + })?; + + match escape_ch { + 'b' => buf.push(0x0008 /* */), + 't' => buf.push(0x0009 /* */), + 'n' => buf.push(0x000A /* */), + 'v' => buf.push(0x000B /* */), + 'f' => buf.push(0x000C /* */), + 'r' => buf.push(0x000D /* */), + '"' => buf.push(0x0022 /* " */), + '\'' => buf.push(0x0027 /* ' */), + '\\' => buf.push(0x005C /* \ */), + '0' if cursor + .peek()? + .filter(|next_byte| (b'0'..=b'9').contains(next_byte)) + .is_none() => + { + buf.push(0x0000 /* NULL */) } - b'u' => { - // Support \u{X..X} (Unicode Codepoint) - if cursor.next_is(b'{')? { - // TODO: use bytes for a bit better performance (using stack) - let mut code_point_buf = Vec::with_capacity(6); - cursor.take_until(b'}', &mut code_point_buf)?; - - let code_point_str = - unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; - // We know this is a single unicode codepoint, convert to u32 - let code_point = - u32::from_str_radix(&code_point_str, 16).map_err(|_| { - Error::syntax( - "malformed Unicode character escape sequence", - cursor.pos(), - ) - })?; - - // UTF16Encoding of a numeric code point value - if code_point > 0x10_FFFF { - return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos())); - } else if code_point <= 65535 { - buf.push(code_point as u16); - } else { - let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16; - let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16; - buf.push(cu1); - buf.push(cu2); - } - } else { - // Collect each character after \u e.g \uD83D will give "D83D" - let mut code_point_utf8_bytes = [0u8; 4]; - cursor.fill_bytes(&mut code_point_utf8_bytes)?; - - // Convert to u16 - let code_point_str = str::from_utf8(&code_point_utf8_bytes) - .expect("malformed Unicode character escape sequence"); - let code_point = - u16::from_str_radix(code_point_str, 16).map_err(|_| { - Error::syntax( - "invalid Unicode escape sequence", - cursor.pos(), - ) - })?; - - buf.push(code_point); - } + 'x' => { + Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?; + } + 'u' => { + Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?; } - n if char::is_digit(char::from(n), 8) => { + '8' | '9' => { + // Grammar: NonOctalDecimalEscapeSequence if strict_mode { return Err(Error::syntax( - "octal escape sequences are deprecated", - cursor.pos(), + "\\8 and \\9 are not allowed in strict mode", + ch_start_pos, )); + } else { + buf.push(escape_ch as u16); } - let mut o = char::from(n).to_digit(8).unwrap(); - - match cursor.peek()? { - Some(c) if char::is_digit(char::from(c), 8) => { - let _ = cursor.next_byte()?; - o = o * 8 + char::from(n).to_digit(8).unwrap(); - if n <= b'3' { - match cursor.peek()? { - Some(c) if char::is_digit(char::from(c), 8) => { - let _ = cursor.next_byte(); - o = o * 8 + char::from(n).to_digit(8).unwrap(); - } - _ => (), - } - } - } - _ => (), + } + _ if escape_ch.is_digit(8) => { + Self::take_legacy_octal_escape_sequence( + cursor, + ch_start_pos, + Some(&mut buf), + strict_mode, + escape_ch as u8, + )?; + } + _ if Self::is_line_terminator(escape_ch) => { + // Grammar: LineContinuation + // Grammar: \ LineTerminatorSequence + // LineContinuation is the empty String. Do nothing and continue lexing. + } + _ => { + if escape_ch.len_utf16() == 1 { + buf.push(escape_ch as u16); + } else { + buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter()); } - buf.push(o as u16); } - _ => buf.push(escape as u16), }; } + Some(ch) => { + if ch.len_utf16() == 1 { + buf.push(ch as u16); + } else { + buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter()); + } + } + None => { + return Err(Error::from(io::Error::new( + ErrorKind::UnexpectedEof, + "unterminated string literal", + ))); + } } - Some(next_ch) => { - if next_ch.len_utf16() == 1 { - buf.push(next_ch as u16); - } else { - let mut code_point_bytes_buf = [0u16; 2]; - let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf); + } + + Ok(( + String::from_utf16_lossy(buf.as_slice()), + Span::new(start_pos, cursor.pos()), + )) + } + + #[inline] + pub(super) fn take_unicode_escape_sequence( + cursor: &mut Cursor, + start_pos: Position, + code_units_buf: Option<&mut Vec>, + ) -> Result + where + R: Read, + { + // Support \u{X..X} (Unicode CodePoint) + if cursor.next_is(b'{')? { + // TODO: use bytes for a bit better performance (using stack) + let mut code_point_buf = Vec::with_capacity(6); + cursor.take_until(b'}', &mut code_point_buf)?; + + let code_point = str::from_utf8(code_point_buf.as_slice()) + .ok() + .and_then(|code_point_str| { + // The `code_point_str` should represent a single unicode codepoint, convert to u32 + u32::from_str_radix(&code_point_str, 16).ok() + }) + .ok_or_else(|| { + Error::syntax("malformed Unicode character escape sequence", start_pos) + })?; - buf.extend(code_point_bytes.iter()); + // UTF16Encoding of a numeric code point value + if code_point > 0x10_FFFF { + return Err(Error::syntax( + "Unicode codepoint must not be greater than 0x10FFFF in escape sequence", + start_pos, + )); + } else if let Some(code_units_buf) = code_units_buf { + if code_point <= 65535 { + code_units_buf.push(code_point as u16); + } else { + let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16; + let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16; + code_units_buf.push(cu1); + code_units_buf.push(cu2); } } - None if terminator != StringTerminator::End => { - return Err(Error::from(io::Error::new( - ErrorKind::UnexpectedEof, - "unterminated string literal", - ))); - } - None => { - break; + + Ok(code_point) + } else { + // Grammar: Hex4Digits + // Collect each character after \u e.g \uD83D will give "D83D" + let mut code_point_utf8_bytes = [0u8; 4]; + cursor.fill_bytes(&mut code_point_utf8_bytes)?; + + // Convert to u16 + let code_point = str::from_utf8(&code_point_utf8_bytes) + .ok() + .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok()) + .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?; + + if let Some(code_units_buf) = code_units_buf { + code_units_buf.push(code_point); } + + Ok(code_point as u32) + } + } + + #[inline] + fn take_hex_escape_sequence( + cursor: &mut Cursor, + start_pos: Position, + code_units_buf: Option<&mut Vec>, + ) -> Result + where + R: Read, + { + let mut code_point_utf8_bytes = [0u8; 2]; + cursor.fill_bytes(&mut code_point_utf8_bytes)?; + let code_point = str::from_utf8(&code_point_utf8_bytes) + .ok() + .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok()) + .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?; + + if let Some(code_units_buf) = code_units_buf { + code_units_buf.push(code_point); } + + Ok(code_point as u32) } - Ok(( - String::from_utf16_lossy(buf.as_slice()), - Span::new(start_pos, cursor.pos()), - )) + #[inline] + fn take_legacy_octal_escape_sequence( + cursor: &mut Cursor, + start_pos: Position, + code_units_buf: Option<&mut Vec>, + strict_mode: bool, + init_byte: u8, + ) -> Result + where + R: Read, + { + if strict_mode { + return Err(Error::syntax( + "octal escape sequences are not allowed in strict mode", + start_pos, + )); + } + // Grammar: OctalDigit + let mut code_point = (init_byte - b'0') as u32; + + // Grammar: ZeroToThree OctalDigit + // Grammar: FourToSeven OctalDigit + if let Some(byte) = cursor.peek()? { + if (b'0'..=b'7').contains(&byte) { + let _ = cursor.next_byte()?; + code_point = (code_point * 8) + (byte - b'0') as u32; + + if (b'0'..=b'3').contains(&init_byte) { + // Grammar: ZeroToThree OctalDigit OctalDigit + if let Some(byte) = cursor.peek()? { + if (b'0'..=b'7').contains(&byte) { + let _ = cursor.next_byte()?; + code_point = (code_point * 8) + (byte - b'0') as u32; + } + } + } + } + } + + if let Some(code_units_buf) = code_units_buf { + code_units_buf.push(code_point as u16); + } + + Ok(code_point) + } } diff --git a/boa/src/syntax/lexer/template.rs b/boa/src/syntax/lexer/template.rs index a34ba02523..ecec7a7387 100644 --- a/boa/src/syntax/lexer/template.rs +++ b/boa/src/syntax/lexer/template.rs @@ -3,7 +3,7 @@ use super::{Cursor, Error, Tokenizer}; use crate::{ profiler::BoaProfiler, - syntax::lexer::string::{unescape_string, StringTerminator}, + syntax::lexer::string::{StringLiteral, StringTerminator}, syntax::{ ast::{Position, Span}, lexer::{Token, TokenKind}, @@ -44,7 +44,7 @@ impl Tokenizer for TemplateLiteral { match next_chr { '`' => { let raw = String::from_utf16_lossy(buf.as_slice()); - let (cooked, _) = unescape_string( + let (cooked, _) = StringLiteral::take_string_characters( &mut Cursor::with_position(raw.as_bytes(), start_pos), start_pos, StringTerminator::End, @@ -58,7 +58,7 @@ impl Tokenizer for TemplateLiteral { '$' if cursor.peek()? == Some(b'{') => { let _ = cursor.next_byte()?; let raw = String::from_utf16_lossy(buf.as_slice()); - let (cooked, _) = unescape_string( + let (cooked, _) = StringLiteral::take_string_characters( &mut Cursor::with_position(raw.as_bytes(), start_pos), start_pos, StringTerminator::End, diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index f54b8f4b33..7ef4a34bc0 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -6,7 +6,7 @@ use super::token::Numeric; use super::*; use super::{Error, Position}; use crate::syntax::ast::Keyword; -use crate::syntax::lexer::string::{unescape_string, StringTerminator}; +use crate::syntax::lexer::string::{StringLiteral, StringTerminator}; use std::str; fn span(start: (u32, u32), end: (u32, u32)) -> Span { @@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() { } #[test] -fn codepoint_with_no_braces() { +fn string_codepoint_with_no_braces() { let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]); assert!(lexer.next().is_ok()); } @@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() { } #[test] -fn non_english_str() { +fn string_unicode() { let str = r#"'中文';"#; let mut lexer = Lexer::new(str.as_bytes()); @@ -828,7 +828,7 @@ fn non_english_str() { } #[test] -fn unicode_escape_with_braces() { +fn string_unicode_escape_with_braces() { let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]); let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())]; @@ -859,12 +859,12 @@ fn unicode_escape_with_braces() { } #[test] -fn unicode_escape_with_braces_() { +fn take_string_characters_unicode_escape_with_braces_2() { let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string(); let mut cursor = Cursor::new(s.as_bytes()); - if let Ok((s, _)) = unescape_string( + if let Ok((s, _)) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, @@ -877,10 +877,10 @@ fn unicode_escape_with_braces_() { } #[test] -fn unescape_string_with_single_escape() { +fn take_string_characters_with_single_escape() { let s = r#"\Б"#.to_string(); let mut cursor = Cursor::new(s.as_bytes()); - let (s, _) = unescape_string( + let (s, _) = StringLiteral::take_string_characters( &mut cursor, Position::new(1, 1), StringTerminator::End, @@ -890,6 +890,117 @@ fn unescape_string_with_single_escape() { assert_eq!(s, "Б"); } +#[test] +fn take_string_characters_legacy_octal_escape() { + let test_cases = [ + (r#"\3"#, "\u{3}"), + (r#"\03"#, "\u{3}"), + (r#"\003"#, "\u{3}"), + (r#"\0003"#, "\u{0}3"), + (r#"\43"#, "#"), + (r#"\043"#, "#"), + (r#"\101"#, "A"), + ]; + + for (s, expected) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, *expected); + } + + for (s, _) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + + if let Error::Syntax(_, pos) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + true, + ) + .expect_err("Octal-escape in strict mode not rejected as expected") + { + assert_eq!(pos, Position::new(1, 1)); + } else { + panic!("invalid error type"); + } + } +} + +#[test] +fn take_string_characters_zero_escape() { + let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")]; + + for (s, expected) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, *expected); + } +} + +#[test] +fn take_string_characters_non_octal_decimal_escape() { + let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")]; + + for (s, expected) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, *expected); + } + + for (s, _) in test_cases.iter() { + let mut cursor = Cursor::new(s.as_bytes()); + + if let Error::Syntax(_, pos) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + true, + ) + .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected") + { + assert_eq!(pos, Position::new(1, 1)); + } else { + panic!("invalid error type"); + } + } +} + +#[test] +fn take_string_characters_line_continuation() { + let s = "hello \\\nworld"; + let mut cursor = Cursor::new(s.as_bytes()); + let (s, _) = StringLiteral::take_string_characters( + &mut cursor, + Position::new(1, 1), + StringTerminator::End, + false, + ) + .unwrap(); + + assert_eq!(s, "hello world"); +} + mod carriage_return { use super::*;