From 6ddc2b47ff397e7a4bfad64b897576d5d503b566 Mon Sep 17 00:00:00 2001 From: raskad <32105367+raskad@users.noreply.github.com> Date: Thu, 1 Feb 2024 09:49:07 +0100 Subject: [PATCH] Fix line terminators in template strings (#3641) --- core/parser/src/lexer/cursor.rs | 11 -- core/parser/src/lexer/template.rs | 165 +++++++++++++----- core/parser/src/lexer/tests.rs | 2 +- core/parser/src/lexer/token.rs | 2 +- .../expression/left_hand_side/template.rs | 8 +- .../src/parser/expression/primary/mod.rs | 23 ++- .../parser/expression/primary/template/mod.rs | 20 ++- 7 files changed, 158 insertions(+), 73 deletions(-) diff --git a/core/parser/src/lexer/cursor.rs b/core/parser/src/lexer/cursor.rs index 20efc3ab07..e3f8eae0b4 100644 --- a/core/parser/src/lexer/cursor.rs +++ b/core/parser/src/lexer/cursor.rs @@ -68,17 +68,6 @@ impl Cursor { } } - /// Creates a new Lexer cursor with an initial position. - pub(super) fn with_position(inner: R, pos: Position) -> Self { - Self { - iter: inner, - pos, - strict: false, - module: false, - peeked: [None; 4], - } - } - /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). pub(super) fn peek_n(&mut self, n: u8) -> Result<&[Option; 4], Error> { let _timer = Profiler::global().start_event("cursor::peek_n()", "Lexing"); diff --git a/core/parser/src/lexer/template.rs b/core/parser/src/lexer/template.rs index 59b85790db..60123e43a0 100644 --- a/core/parser/src/lexer/template.rs +++ b/core/parser/src/lexer/template.rs @@ -1,12 +1,8 @@ //! Boa's lexing for ECMAScript template literals. -use crate::source::ReadChar; use crate::{ - lexer::{ - string::{StringLiteral, UTF16CodeUnitsBuffer}, - Cursor, Error, Token, TokenKind, Tokenizer, - }, - source::UTF8Input, + lexer::{string::UTF16CodeUnitsBuffer, Cursor, Error, Token, TokenKind, Tokenizer}, + source::ReadChar, }; use boa_ast::{Position, Span}; use boa_interner::{Interner, Sym}; @@ -16,17 +12,30 @@ use std::io::{self, ErrorKind}; #[cfg_attr(feature = "deser", derive(serde::Serialize, serde::Deserialize))] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct TemplateString { - /// The template string of template literal with argument `raw` true. + /// The raw template string. raw: Sym, - /// The start position of the template string. Used to make lexer error if `to_owned_cooked` - /// failed. - start_pos: Position, + + /// The cooked template string. + cooked: Option, } impl TemplateString { /// Creates a new `TemplateString` with the given raw template ans start position. - pub const fn new(raw: Sym, start_pos: Position) -> Self { - Self { raw, start_pos } + pub fn new(raw: Sym, interner: &mut Interner) -> Self { + Self { + raw: Self::as_raw(raw, interner), + cooked: Self::as_cooked(raw, interner), + } + } + + /// Returns the raw template string. + pub fn raw(self) -> Sym { + self.raw + } + + /// Returns the cooked template string if it exists. + pub fn cooked(self) -> Option { + self.cooked } /// Converts the raw template string into a mutable string slice. @@ -35,8 +44,34 @@ impl TemplateString { /// - [ECMAScript reference][spec] /// /// [spec]: https://tc39.es/ecma262/#sec-static-semantics-templatestrings - pub const fn as_raw(self) -> Sym { - self.raw + fn as_raw(raw: Sym, interner: &mut Interner) -> Sym { + let string = interner.resolve_expect(raw).utf16(); + let mut iter = string.iter().peekable(); + let mut buf: Vec = Vec::new(); + loop { + match iter.next() { + Some(0x5C /* \ */) => { + buf.push_code_point(0x5C); + match iter.next() { + Some(0x0D /* */) => { + buf.push_code_point(0x0A); + } + Some(ch) => { + buf.push_code_point(u32::from(*ch)); + } + None => break, + } + } + Some(0x0D /* */) => { + buf.push_code_point(0x0A); + } + Some(ch) => { + buf.push_code_point(u32::from(*ch)); + } + None => break, + } + } + interner.get_or_intern(buf.as_slice()) } /// Creates a new cooked template string. Returns a lexer error if it fails to cook the @@ -46,39 +81,91 @@ impl TemplateString { /// - [ECMAScript reference][spec] /// /// [spec]: https://tc39.es/ecma262/#sec-static-semantics-templatestrings - pub fn to_owned_cooked(self, interner: &mut Interner) -> Result { - let string = interner.resolve_expect(self.raw).to_string(); - let mut cursor = Cursor::with_position(UTF8Input::new(string.as_bytes()), self.start_pos); + fn as_cooked(raw: Sym, interner: &mut Interner) -> Option { + let string = interner.resolve_expect(raw).utf16(); + let mut iter = string.iter().peekable(); let mut buf: Vec = Vec::new(); loop { - let ch_start_pos = cursor.pos(); - let ch = cursor.next_char()?; - - match ch { - Some(0x005C /* \ */) => { - let escape_value = StringLiteral::take_escape_sequence_or_line_continuation( - &mut cursor, - ch_start_pos, - true, - true, - )?; - - if let (Some(escape_value), _) = escape_value { - buf.push_code_point(escape_value); - } + match iter.next() { + Some(0x5C /* \ */) => { + let escape_value = match iter.next() { + Some(0x62 /* b */) => 0x08 /* */, + Some(0x74 /* t */) => 0x09 /* */, + Some(0x6E /* n */) => 0x0A /* */, + Some(0x76 /* v */) => 0x0B /* */, + Some(0x66 /* f */) => 0x0C /* */, + Some(0x72 /* r */) => 0x0D /* */, + Some(0x22 /* " */) => 0x22 /* " */, + Some(0x27 /* ' */) => 0x27 /* ' */, + Some(0x5C /* \ */) => 0x5C /* \ */, + Some(0x30 /* 0 */) if iter + .peek() + .filter(|ch| (0x30..=0x39 /* 0..=9 */).contains(**ch)) + .is_none() => 0x00 /* NULL */, + // Hex Escape + Some(0x078 /* x */) => { + let mut s = String::with_capacity(2); + s.push(char::from_u32(u32::from(*iter.next()?))?); + s.push(char::from_u32(u32::from(*iter.next()?))?); + u16::from_str_radix(&s, 16).ok()?.into() + } + // Unicode Escape + Some(0x75 /* u */) => { + let next = *iter.next()?; + if next == 0x7B /* { */ { + let mut buffer = String::with_capacity(6); + loop { + let next = *iter.next()?; + if next == 0x7D /* } */ { + break; + } + buffer.push(char::from_u32(u32::from(next))?); + } + let cp = u32::from_str_radix(&buffer, 16).ok()?; + if cp > 0x10_FFFF { + return None; + } + cp + } else { + let mut s = String::with_capacity(4); + s.push(char::from_u32(u32::from(next))?); + s.push(char::from_u32(u32::from(*iter.next()?))?); + s.push(char::from_u32(u32::from(*iter.next()?))?); + s.push(char::from_u32(u32::from(*iter.next()?))?); + u16::from_str_radix(&s, 16).ok()?.into() + } + } + // NonOctalDecimalEscapeSequence + Some(0x38 /* 8 */ | 0x39 /* 9 */) => { + return None; + } + // LegacyOctalEscapeSequence + Some(ch) if (0x30..=0x37 /* '0'..='7' */).contains(ch) => { + return None; + } + // Line Terminator + Some(0x0A /* */ | 0x0D /* */ | 0x2028 /* */ | 0x2029 /* */) => { + continue; + } + Some(ch) => { + u32::from(*ch) + } + None => return None, + }; + buf.push_code_point(escape_value); + } + Some(0x0D /* */) => { + buf.push_code_point(0x0A); } Some(ch) => { - // The caller guarantees that sequences '`' and '${' never appear - // LineTerminatorSequence is consumed by `cursor.next_char()` and - // returns , which matches the TV of - buf.push_code_point(ch); + buf.push_code_point(u32::from(*ch)); } None => break, } } - Ok(interner.get_or_intern(&buf[..])) + Some(interner.get_or_intern(buf.as_slice())) } } @@ -120,7 +207,7 @@ impl Tokenizer for TemplateLiteral { // ` 0x0060 => { let raw_sym = interner.get_or_intern(&buf[..]); - let template_string = TemplateString::new(raw_sym, start_pos); + let template_string = TemplateString::new(raw_sym, interner); return Ok(Token::new( TokenKind::template_no_substitution(template_string), @@ -130,7 +217,7 @@ impl Tokenizer for TemplateLiteral { // $ 0x0024 if cursor.next_if(0x7B /* { */)? => { let raw_sym = interner.get_or_intern(&buf[..]); - let template_string = TemplateString::new(raw_sym, start_pos); + let template_string = TemplateString::new(raw_sym, interner); return Ok(Token::new( TokenKind::template_middle(template_string), diff --git a/core/parser/src/lexer/tests.rs b/core/parser/src/lexer/tests.rs index f775d21f2a..9cebcae7cf 100644 --- a/core/parser/src/lexer/tests.rs +++ b/core/parser/src/lexer/tests.rs @@ -169,7 +169,7 @@ fn check_template_literal_simple() { assert_eq!( lexer.next(interner).unwrap().unwrap().kind(), - &TokenKind::template_no_substitution(TemplateString::new(sym, Position::new(1, 1))) + &TokenKind::template_no_substitution(TemplateString::new(sym, interner)) ); } diff --git a/core/parser/src/lexer/token.rs b/core/parser/src/lexer/token.rs index 7f5002fc9c..ff51dce5e9 100644 --- a/core/parser/src/lexer/token.rs +++ b/core/parser/src/lexer/token.rs @@ -273,7 +273,7 @@ impl TokenKind { Self::Punctuator(punc) => punc.to_string(), Self::StringLiteral((lit, _)) => interner.resolve_expect(lit).to_string(), Self::TemplateNoSubstitution(ts) | Self::TemplateMiddle(ts) => { - interner.resolve_expect(ts.as_raw()).to_string() + interner.resolve_expect(ts.raw()).to_string() } Self::RegularExpressionLiteral(body, flags) => { format!( diff --git a/core/parser/src/parser/expression/left_hand_side/template.rs b/core/parser/src/parser/expression/left_hand_side/template.rs index 352cd20b5a..db160d14bc 100644 --- a/core/parser/src/parser/expression/left_hand_side/template.rs +++ b/core/parser/src/parser/expression/left_hand_side/template.rs @@ -64,8 +64,8 @@ where loop { match token.kind() { TokenKind::TemplateMiddle(template_string) => { - raws.push(template_string.as_raw()); - cookeds.push(template_string.to_owned_cooked(interner).ok()); + raws.push(template_string.raw()); + cookeds.push(template_string.cooked()); exprs.push( Expression::new(None, true, self.allow_yield, self.allow_await) .parse(cursor, interner)?, @@ -77,8 +77,8 @@ where )?; } TokenKind::TemplateNoSubstitution(template_string) => { - raws.push(template_string.as_raw()); - cookeds.push(template_string.to_owned_cooked(interner).ok()); + raws.push(template_string.raw()); + cookeds.push(template_string.cooked()); return Ok(TaggedTemplate::new( self.tag, raws.into_boxed_slice(), diff --git a/core/parser/src/parser/expression/primary/mod.rs b/core/parser/src/parser/expression/primary/mod.rs index e53c63ba04..241ffa9656 100644 --- a/core/parser/src/parser/expression/primary/mod.rs +++ b/core/parser/src/parser/expression/primary/mod.rs @@ -215,12 +215,13 @@ where Ok(node) } TokenKind::TemplateNoSubstitution(template_string) => { - let node = Literal::from( - template_string - .to_owned_cooked(interner) - .map_err(Error::lex)?, - ) - .into(); + let Some(cooked) = template_string.cooked() else { + return Err(Error::general( + "invalid escape in template literal", + tok.span().start(), + )); + }; + let node = Literal::from(cooked).into(); cursor.advance(interner); Ok(node) } @@ -261,13 +262,17 @@ where } } TokenKind::TemplateMiddle(template_string) => { + let Some(cooked) = template_string.cooked() else { + return Err(Error::general( + "invalid escape in template literal", + tok.span().start(), + )); + }; let parser = TemplateLiteral::new( self.allow_yield, self.allow_await, tok.span().start(), - template_string - .to_owned_cooked(interner) - .map_err(Error::lex)?, + cooked, ); cursor.advance(interner); parser.parse(cursor, interner).map(Into::into) diff --git a/core/parser/src/parser/expression/primary/template/mod.rs b/core/parser/src/parser/expression/primary/template/mod.rs index c71a5819d9..5a7b0671e8 100644 --- a/core/parser/src/parser/expression/primary/template/mod.rs +++ b/core/parser/src/parser/expression/primary/template/mod.rs @@ -77,10 +77,12 @@ where loop { match cursor.lex_template(self.start, interner)?.kind() { TokenKind::TemplateMiddle(template_string) => { - let cooked = template_string - .to_owned_cooked(interner) - .map_err(Error::lex)?; - + let Some(cooked) = template_string.cooked() else { + return Err(Error::general( + "invalid escape in template literal", + self.start, + )); + }; elements.push(TemplateElement::String(cooked)); elements.push(TemplateElement::Expr( Expression::new(None, true, self.allow_yield, self.allow_await) @@ -93,10 +95,12 @@ where )?; } TokenKind::TemplateNoSubstitution(template_string) => { - let cooked = template_string - .to_owned_cooked(interner) - .map_err(Error::lex)?; - + let Some(cooked) = template_string.cooked() else { + return Err(Error::general( + "invalid escape in template literal", + self.start, + )); + }; elements.push(TemplateElement::String(cooked)); return Ok(literal::TemplateLiteral::new(elements.into())); }