Browse Source

Fix lexing escapes in string literal and minor refactor (#1079)

* Refactor StringLiteral

* Fix octal escape in string literal

* Add tests

* Fix zero escape

* Fix zero escape lookahead

* Rename variables

* Rename helper functions

* Refactor match arms

* Fix escape line terminator sequence

* Fix single character escape

* Fix escape followed by unicode char

* Add NonOctalDecimalEscapeSequence

* Fix comment

* Refactor

* Modify error message

* Add tests

* Rename tests

* Add test for error

* Add comments for unsafe bytes to str

* Update boa/src/syntax/lexer/string.rs

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>

* Minor refactor

* Remove unsafe bytes to str

* Fix panic when reading invalid utf-8 chars

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
pull/1091/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
00fc5e22bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 386
      boa/src/syntax/lexer/string.rs
  2. 6
      boa/src/syntax/lexer/template.rs
  3. 127
      boa/src/syntax/lexer/tests.rs

386
boa/src/syntax/lexer/string.rs

@ -58,171 +58,273 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");
let (lit, span) = let (lit, span) =
unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?; Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;
Ok(Token::new(TokenKind::string_literal(lit), span)) Ok(Token::new(TokenKind::string_literal(lit), span))
} }
} }
pub(super) fn unescape_string<R>( impl StringLiteral {
cursor: &mut Cursor<R>, /// Checks if a character is LineTerminator as per ECMAScript standards.
start_pos: Position, ///
terminator: StringTerminator, /// More information:
strict_mode: bool, /// - [ECMAScript reference][spec]
) -> Result<(String, Span), Error> ///
where /// [spec]: https://tc39.es/ecma262/#prod-LineTerminator
R: Read, #[inline]
{ pub(super) fn is_line_terminator(ch: char) -> bool {
let mut buf = Vec::new(); matches!(
loop { ch,
let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap(); '\u{000A}' /* <LF> */ | '\u{000D}' /* <CR> */ | '\u{2028}' /* <LS> */ | '\u{2029}' /* <PS> */
)
match next_chr { }
Some('\'') if terminator == StringTerminator::SingleQuote => {
break;
}
Some('"') if terminator == StringTerminator::DoubleQuote => {
break;
}
Some('\\') => {
let _timer =
BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.peek()?.ok_or_else(|| { pub(super) fn take_string_characters<R>(
Error::from(io::Error::new( cursor: &mut Cursor<R>,
ErrorKind::UnexpectedEof, start_pos: Position,
"unterminated escape sequence in literal", terminator: StringTerminator,
)) strict_mode: bool,
})?; ) -> Result<(String, Span), Error>
where
R: Read,
{
let mut buf = Vec::new();
loop {
let ch_start_pos = cursor.pos();
let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap();
match ch {
Some('\'') if terminator == StringTerminator::SingleQuote => {
break;
}
Some('"') if terminator == StringTerminator::DoubleQuote => {
break;
}
None if terminator == StringTerminator::End => {
break;
}
Some('\\') => {
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");
if escape <= 0x7f { let escape_ch = cursor
let _ = cursor.next_byte()?; .next_char()?
match escape { .and_then(|byte| char::try_from(byte).ok())
b'\n' => (), .ok_or_else(|| {
b'n' => buf.push('\n' as u16), Error::from(io::Error::new(
b'r' => buf.push('\r' as u16), ErrorKind::UnexpectedEof,
b't' => buf.push('\t' as u16), "unterminated escape sequence in literal",
b'b' => buf.push('\x08' as u16), ))
b'f' => buf.push('\x0c' as u16), })?;
b'0' => buf.push('\0' as u16),
b'x' => { match escape_ch {
let mut code_point_utf8_bytes = [0u8; 2]; 'b' => buf.push(0x0008 /* <BS> */),
cursor.fill_bytes(&mut code_point_utf8_bytes)?; 't' => buf.push(0x0009 /* <HT> */),
let code_point_str = str::from_utf8(&code_point_utf8_bytes) 'n' => buf.push(0x000A /* <LF> */),
.expect("malformed Hexadecimal character escape sequence"); 'v' => buf.push(0x000B /* <VT> */),
let code_point = 'f' => buf.push(0x000C /* <FF> */),
u16::from_str_radix(&code_point_str, 16).map_err(|_| { 'r' => buf.push(0x000D /* <CR> */),
Error::syntax( '"' => buf.push(0x0022 /* " */),
"invalid Hexadecimal escape sequence", '\'' => buf.push(0x0027 /* ' */),
cursor.pos(), '\\' => buf.push(0x005C /* \ */),
) '0' if cursor
})?; .peek()?
.filter(|next_byte| (b'0'..=b'9').contains(next_byte))
buf.push(code_point); .is_none() =>
{
buf.push(0x0000 /* NULL */)
} }
b'u' => { 'x' => {
// Support \u{X..X} (Unicode Codepoint) Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
if cursor.next_is(b'{')? { }
// TODO: use bytes for a bit better performance (using stack) 'u' => {
let mut code_point_buf = Vec::with_capacity(6); Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
cursor.take_until(b'}', &mut code_point_buf)?;
let code_point_str =
unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
// We know this is a single unicode codepoint, convert to u32
let code_point =
u32::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"malformed Unicode character escape sequence",
cursor.pos(),
)
})?;
// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
} else if code_point <= 65535 {
buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1);
buf.push(cu2);
}
} else {
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
// Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence");
let code_point =
u16::from_str_radix(code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?;
buf.push(code_point);
}
} }
n if char::is_digit(char::from(n), 8) => { '8' | '9' => {
// Grammar: NonOctalDecimalEscapeSequence
if strict_mode { if strict_mode {
return Err(Error::syntax( return Err(Error::syntax(
"octal escape sequences are deprecated", "\\8 and \\9 are not allowed in strict mode",
cursor.pos(), ch_start_pos,
)); ));
} else {
buf.push(escape_ch as u16);
} }
let mut o = char::from(n).to_digit(8).unwrap(); }
_ if escape_ch.is_digit(8) => {
match cursor.peek()? { Self::take_legacy_octal_escape_sequence(
Some(c) if char::is_digit(char::from(c), 8) => { cursor,
let _ = cursor.next_byte()?; ch_start_pos,
o = o * 8 + char::from(n).to_digit(8).unwrap(); Some(&mut buf),
if n <= b'3' { strict_mode,
match cursor.peek()? { escape_ch as u8,
Some(c) if char::is_digit(char::from(c), 8) => { )?;
let _ = cursor.next_byte(); }
o = o * 8 + char::from(n).to_digit(8).unwrap(); _ if Self::is_line_terminator(escape_ch) => {
} // Grammar: LineContinuation
_ => (), // Grammar: \ LineTerminatorSequence
} // LineContinuation is the empty String. Do nothing and continue lexing.
} }
} _ => {
_ => (), if escape_ch.len_utf16() == 1 {
buf.push(escape_ch as u16);
} else {
buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter());
} }
buf.push(o as u16);
} }
_ => buf.push(escape as u16),
}; };
} }
Some(ch) => {
if ch.len_utf16() == 1 {
buf.push(ch as u16);
} else {
buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter());
}
}
None => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
} }
Some(next_ch) => { }
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16); Ok((
} else { String::from_utf16_lossy(buf.as_slice()),
let mut code_point_bytes_buf = [0u16; 2]; Span::new(start_pos, cursor.pos()),
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf); ))
}
#[inline]
pub(super) fn take_unicode_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
// Support \u{X..X} (Unicode CodePoint)
if cursor.next_is(b'{')? {
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;
let code_point = str::from_utf8(code_point_buf.as_slice())
.ok()
.and_then(|code_point_str| {
// The `code_point_str` should represent a single unicode codepoint, convert to u32
u32::from_str_radix(&code_point_str, 16).ok()
})
.ok_or_else(|| {
Error::syntax("malformed Unicode character escape sequence", start_pos)
})?;
buf.extend(code_point_bytes.iter()); // UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax(
"Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
start_pos,
));
} else if let Some(code_units_buf) = code_units_buf {
if code_point <= 65535 {
code_units_buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
code_units_buf.push(cu1);
code_units_buf.push(cu2);
} }
} }
None if terminator != StringTerminator::End => {
return Err(Error::from(io::Error::new( Ok(code_point)
ErrorKind::UnexpectedEof, } else {
"unterminated string literal", // Grammar: Hex4Digits
))); // Collect each character after \u e.g \uD83D will give "D83D"
} let mut code_point_utf8_bytes = [0u8; 4];
None => { cursor.fill_bytes(&mut code_point_utf8_bytes)?;
break;
// Convert to u16
let code_point = str::from_utf8(&code_point_utf8_bytes)
.ok()
.and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
.ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?;
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
} }
Ok(code_point as u32)
}
}
#[inline]
fn take_hex_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point = str::from_utf8(&code_point_utf8_bytes)
.ok()
.and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
.ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?;
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
} }
Ok(code_point as u32)
} }
Ok(( #[inline]
String::from_utf16_lossy(buf.as_slice()), fn take_legacy_octal_escape_sequence<R>(
Span::new(start_pos, cursor.pos()), cursor: &mut Cursor<R>,
)) start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
strict_mode: bool,
init_byte: u8,
) -> Result<u32, Error>
where
R: Read,
{
if strict_mode {
return Err(Error::syntax(
"octal escape sequences are not allowed in strict mode",
start_pos,
));
}
// Grammar: OctalDigit
let mut code_point = (init_byte - b'0') as u32;
// Grammar: ZeroToThree OctalDigit
// Grammar: FourToSeven OctalDigit
if let Some(byte) = cursor.peek()? {
if (b'0'..=b'7').contains(&byte) {
let _ = cursor.next_byte()?;
code_point = (code_point * 8) + (byte - b'0') as u32;
if (b'0'..=b'3').contains(&init_byte) {
// Grammar: ZeroToThree OctalDigit OctalDigit
if let Some(byte) = cursor.peek()? {
if (b'0'..=b'7').contains(&byte) {
let _ = cursor.next_byte()?;
code_point = (code_point * 8) + (byte - b'0') as u32;
}
}
}
}
}
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point as u16);
}
Ok(code_point)
}
} }

6
boa/src/syntax/lexer/template.rs

@ -3,7 +3,7 @@
use super::{Cursor, Error, Tokenizer}; use super::{Cursor, Error, Tokenizer};
use crate::{ use crate::{
profiler::BoaProfiler, profiler::BoaProfiler,
syntax::lexer::string::{unescape_string, StringTerminator}, syntax::lexer::string::{StringLiteral, StringTerminator},
syntax::{ syntax::{
ast::{Position, Span}, ast::{Position, Span},
lexer::{Token, TokenKind}, lexer::{Token, TokenKind},
@ -44,7 +44,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
match next_chr { match next_chr {
'`' => { '`' => {
let raw = String::from_utf16_lossy(buf.as_slice()); let raw = String::from_utf16_lossy(buf.as_slice());
let (cooked, _) = unescape_string( let (cooked, _) = StringLiteral::take_string_characters(
&mut Cursor::with_position(raw.as_bytes(), start_pos), &mut Cursor::with_position(raw.as_bytes(), start_pos),
start_pos, start_pos,
StringTerminator::End, StringTerminator::End,
@ -58,7 +58,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
'$' if cursor.peek()? == Some(b'{') => { '$' if cursor.peek()? == Some(b'{') => {
let _ = cursor.next_byte()?; let _ = cursor.next_byte()?;
let raw = String::from_utf16_lossy(buf.as_slice()); let raw = String::from_utf16_lossy(buf.as_slice());
let (cooked, _) = unescape_string( let (cooked, _) = StringLiteral::take_string_characters(
&mut Cursor::with_position(raw.as_bytes(), start_pos), &mut Cursor::with_position(raw.as_bytes(), start_pos),
start_pos, start_pos,
StringTerminator::End, StringTerminator::End,

127
boa/src/syntax/lexer/tests.rs

@ -6,7 +6,7 @@ use super::token::Numeric;
use super::*; use super::*;
use super::{Error, Position}; use super::{Error, Position};
use crate::syntax::ast::Keyword; use crate::syntax::ast::Keyword;
use crate::syntax::lexer::string::{unescape_string, StringTerminator}; use crate::syntax::lexer::string::{StringLiteral, StringTerminator};
use std::str; use std::str;
fn span(start: (u32, u32), end: (u32, u32)) -> Span { fn span(start: (u32, u32), end: (u32, u32)) -> Span {
@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() {
} }
#[test] #[test]
fn codepoint_with_no_braces() { fn string_codepoint_with_no_braces() {
let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]); let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]);
assert!(lexer.next().is_ok()); assert!(lexer.next().is_ok());
} }
@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() {
} }
#[test] #[test]
fn non_english_str() { fn string_unicode() {
let str = r#"'中文';"#; let str = r#"'中文';"#;
let mut lexer = Lexer::new(str.as_bytes()); let mut lexer = Lexer::new(str.as_bytes());
@ -828,7 +828,7 @@ fn non_english_str() {
} }
#[test] #[test]
fn unicode_escape_with_braces() { fn string_unicode_escape_with_braces() {
let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]); let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]);
let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())]; let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
@ -859,12 +859,12 @@ fn unicode_escape_with_braces() {
} }
#[test] #[test]
fn unicode_escape_with_braces_() { fn take_string_characters_unicode_escape_with_braces_2() {
let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string(); let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string();
let mut cursor = Cursor::new(s.as_bytes()); let mut cursor = Cursor::new(s.as_bytes());
if let Ok((s, _)) = unescape_string( if let Ok((s, _)) = StringLiteral::take_string_characters(
&mut cursor, &mut cursor,
Position::new(1, 1), Position::new(1, 1),
StringTerminator::End, StringTerminator::End,
@ -877,10 +877,10 @@ fn unicode_escape_with_braces_() {
} }
#[test] #[test]
fn unescape_string_with_single_escape() { fn take_string_characters_with_single_escape() {
let s = r#"\Б"#.to_string(); let s = r#"\Б"#.to_string();
let mut cursor = Cursor::new(s.as_bytes()); let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = unescape_string( let (s, _) = StringLiteral::take_string_characters(
&mut cursor, &mut cursor,
Position::new(1, 1), Position::new(1, 1),
StringTerminator::End, StringTerminator::End,
@ -890,6 +890,117 @@ fn unescape_string_with_single_escape() {
assert_eq!(s, "Б"); assert_eq!(s, "Б");
} }
/// Legacy octal escape sequences are cooked into the code unit they encode in
/// non-strict mode and rejected with a syntax error in strict mode.
///
/// Every failure message names the offending input so that a broken case in the
/// table can be identified without re-running the test under a debugger.
#[test]
fn take_string_characters_legacy_octal_escape() {
    // (raw source text, expected cooked string).
    // `\0003` checks that at most three octal digits are consumed: the cooked
    // value is NUL followed by a literal `3`.
    let test_cases = [
        (r#"\3"#, "\u{3}"),
        (r#"\03"#, "\u{3}"),
        (r#"\003"#, "\u{3}"),
        (r#"\0003"#, "\u{0}3"),
        (r#"\43"#, "#"),
        (r#"\043"#, "#"),
        (r#"\101"#, "A"),
    ];

    // Non-strict mode: every escape must lex successfully to the expected value.
    for &(source, expected) in test_cases.iter() {
        let mut cursor = Cursor::new(source.as_bytes());
        let (cooked, _) = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            false,
        )
        .unwrap_or_else(|err| {
            panic!(
                "lexing {:?} in non-strict mode failed: {:?}",
                source, err
            )
        });

        assert_eq!(
            cooked, expected,
            "unexpected cooked value for {:?} in non-strict mode",
            source
        );
    }

    // Strict mode: every escape must be rejected with a syntax error. Each input
    // starts with the backslash, and the test expects the error at (1, 1).
    for &(source, _) in test_cases.iter() {
        let mut cursor = Cursor::new(source.as_bytes());
        let err = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            true,
        )
        .expect_err("Octal-escape in strict mode not rejected as expected");

        match err {
            Error::Syntax(_, pos) => assert_eq!(
                pos,
                Position::new(1, 1),
                "unexpected error position for {:?} in strict mode",
                source
            ),
            other => panic!("invalid error type for {:?}: {:?}", source, other),
        }
    }
}
/// `\0` that is not followed by a decimal digit is cooked into a NUL code unit,
/// and lexing continues normally with whatever follows it.
///
/// Failure messages include the input so the failing table entry is obvious.
#[test]
fn take_string_characters_zero_escape() {
    // (raw source text, expected cooked string)
    let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")];

    for &(source, expected) in test_cases.iter() {
        let mut cursor = Cursor::new(source.as_bytes());
        let (cooked, _) = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            false,
        )
        .unwrap_or_else(|err| {
            panic!(
                "lexing {:?} in non-strict mode failed: {:?}",
                source, err
            )
        });

        assert_eq!(
            cooked, expected,
            "unexpected cooked value for {:?} in non-strict mode",
            source
        );
    }
}
/// `\8` and `\9` (NonOctalDecimalEscapeSequence) cook to the digit itself in
/// non-strict mode and are rejected with a syntax error in strict mode.
///
/// Failure messages include the input so the failing table entry is obvious.
#[test]
fn take_string_characters_non_octal_decimal_escape() {
    // (raw source text, expected cooked string)
    let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")];

    // Non-strict mode: the escape is accepted and yields the plain digit.
    for &(source, expected) in test_cases.iter() {
        let mut cursor = Cursor::new(source.as_bytes());
        let (cooked, _) = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            false,
        )
        .unwrap_or_else(|err| {
            panic!(
                "lexing {:?} in non-strict mode failed: {:?}",
                source, err
            )
        });

        assert_eq!(
            cooked, expected,
            "unexpected cooked value for {:?} in non-strict mode",
            source
        );
    }

    // Strict mode: the escape must be a syntax error. Each input starts with the
    // backslash, and the test expects the error at (1, 1).
    for &(source, _) in test_cases.iter() {
        let mut cursor = Cursor::new(source.as_bytes());
        let err = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            true,
        )
        .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected");

        match err {
            Error::Syntax(_, pos) => assert_eq!(
                pos,
                Position::new(1, 1),
                "unexpected error position for {:?} in strict mode",
                source
            ),
            other => panic!("invalid error type for {:?}: {:?}", source, other),
        }
    }
}
/// A backslash immediately followed by a line feed is a LineContinuation: it
/// contributes nothing to the cooked string, so the text on both sides is joined.
#[test]
fn take_string_characters_line_continuation() {
    // Rust-escaped: the input bytes are `hello `, `\`, <LF>, `world`.
    let source = "hello \\\nworld";
    let mut cursor = Cursor::new(source.as_bytes());

    let lexed = StringLiteral::take_string_characters(
        &mut cursor,
        Position::new(1, 1),
        StringTerminator::End,
        false,
    );
    let (cooked, _span) = lexed.unwrap();

    assert_eq!(cooked, "hello world");
}
mod carriage_return { mod carriage_return {
use super::*; use super::*;

Loading…
Cancel
Save