Browse Source

Fix lexing escapes in string literal and minor refactor (#1079)

* Refactor StringLiteral

* Fix octal escape in string literal

* Add tests

* Fix zero escape

* Fix zero escape lookahead

* Rename variables

* Rename helper functions

* Refactor match arms

* Fix escape line terminator sequence

* Fix single character escape

* Fix escape followed by unicode char

* Add NonOctalDecimalEscapeSequence

* Fix comment

* Refactor

* Modify error message

* Add tests

* Rename tests

* Add test for error

* Add comments for unsafe bytes to str

* Update boa/src/syntax/lexer/string.rs

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>

* Minor refactor

* Remove unsafe bytes to str

* Fix panic when reading invalid utf-8 chars

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
pull/1091/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
00fc5e22bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 300
      boa/src/syntax/lexer/string.rs
  2. 6
      boa/src/syntax/lexer/template.rs
  3. 127
      boa/src/syntax/lexer/tests.rs

300
boa/src/syntax/lexer/string.rs

@ -58,13 +58,28 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");
let (lit, span) = let (lit, span) =
unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?; Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;
Ok(Token::new(TokenKind::string_literal(lit), span)) Ok(Token::new(TokenKind::string_literal(lit), span))
} }
} }
pub(super) fn unescape_string<R>( impl StringLiteral {
/// Checks if a character is a LineTerminator as defined by the ECMAScript specification.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-LineTerminator
#[inline]
pub(super) fn is_line_terminator(ch: char) -> bool {
matches!(
ch,
'\u{000A}' /* <LF> */ | '\u{000D}' /* <CR> */ | '\u{2028}' /* <LS> */ | '\u{2029}' /* <PS> */
)
}
pub(super) fn take_string_characters<R>(
cursor: &mut Cursor<R>, cursor: &mut Cursor<R>,
start_pos: Position, start_pos: Position,
terminator: StringTerminator, terminator: StringTerminator,
@ -75,154 +90,241 @@ where
{ {
let mut buf = Vec::new(); let mut buf = Vec::new();
loop { loop {
let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap(); let ch_start_pos = cursor.pos();
let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap();
match next_chr { match ch {
Some('\'') if terminator == StringTerminator::SingleQuote => { Some('\'') if terminator == StringTerminator::SingleQuote => {
break; break;
} }
Some('"') if terminator == StringTerminator::DoubleQuote => { Some('"') if terminator == StringTerminator::DoubleQuote => {
break; break;
} }
None if terminator == StringTerminator::End => {
break;
}
Some('\\') => { Some('\\') => {
let _timer = let _timer = BoaProfiler::global()
BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing"); .start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.peek()?.ok_or_else(|| { let escape_ch = cursor
.next_char()?
.and_then(|byte| char::try_from(byte).ok())
.ok_or_else(|| {
Error::from(io::Error::new( Error::from(io::Error::new(
ErrorKind::UnexpectedEof, ErrorKind::UnexpectedEof,
"unterminated escape sequence in literal", "unterminated escape sequence in literal",
)) ))
})?; })?;
if escape <= 0x7f { match escape_ch {
let _ = cursor.next_byte()?; 'b' => buf.push(0x0008 /* <BS> */),
match escape { 't' => buf.push(0x0009 /* <HT> */),
b'\n' => (), 'n' => buf.push(0x000A /* <LF> */),
b'n' => buf.push('\n' as u16), 'v' => buf.push(0x000B /* <VT> */),
b'r' => buf.push('\r' as u16), 'f' => buf.push(0x000C /* <FF> */),
b't' => buf.push('\t' as u16), 'r' => buf.push(0x000D /* <CR> */),
b'b' => buf.push('\x08' as u16), '"' => buf.push(0x0022 /* " */),
b'f' => buf.push('\x0c' as u16), '\'' => buf.push(0x0027 /* ' */),
b'0' => buf.push('\0' as u16), '\\' => buf.push(0x005C /* \ */),
b'x' => { '0' if cursor
let mut code_point_utf8_bytes = [0u8; 2]; .peek()?
cursor.fill_bytes(&mut code_point_utf8_bytes)?; .filter(|next_byte| (b'0'..=b'9').contains(next_byte))
let code_point_str = str::from_utf8(&code_point_utf8_bytes) .is_none() =>
.expect("malformed Hexadecimal character escape sequence"); {
let code_point = buf.push(0x0000 /* NULL */)
u16::from_str_radix(&code_point_str, 16).map_err(|_| { }
Error::syntax( 'x' => {
"invalid Hexadecimal escape sequence", Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
cursor.pos(), }
) 'u' => {
})?; Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
}
'8' | '9' => {
// Grammar: NonOctalDecimalEscapeSequence
if strict_mode {
return Err(Error::syntax(
"\\8 and \\9 are not allowed in strict mode",
ch_start_pos,
));
} else {
buf.push(escape_ch as u16);
}
}
_ if escape_ch.is_digit(8) => {
Self::take_legacy_octal_escape_sequence(
cursor,
ch_start_pos,
Some(&mut buf),
strict_mode,
escape_ch as u8,
)?;
}
_ if Self::is_line_terminator(escape_ch) => {
// Grammar: LineContinuation
// Grammar: \ LineTerminatorSequence
// LineContinuation is the empty String. Do nothing and continue lexing.
}
_ => {
if escape_ch.len_utf16() == 1 {
buf.push(escape_ch as u16);
} else {
buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter());
}
}
};
}
Some(ch) => {
if ch.len_utf16() == 1 {
buf.push(ch as u16);
} else {
buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter());
}
}
None => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
}
}
buf.push(code_point); Ok((
String::from_utf16_lossy(buf.as_slice()),
Span::new(start_pos, cursor.pos()),
))
} }
b'u' => {
// Support \u{X..X} (Unicode Codepoint) #[inline]
pub(super) fn take_unicode_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
// Support \u{X..X} (Unicode CodePoint)
if cursor.next_is(b'{')? { if cursor.next_is(b'{')? {
// TODO: use bytes for a bit better performance (using stack) // TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6); let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?; cursor.take_until(b'}', &mut code_point_buf)?;
let code_point_str = let code_point = str::from_utf8(code_point_buf.as_slice())
unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) }; .ok()
// We know this is a single unicode codepoint, convert to u32 .and_then(|code_point_str| {
let code_point = // The `code_point_str` should represent a single unicode codepoint, convert to u32
u32::from_str_radix(&code_point_str, 16).map_err(|_| { u32::from_str_radix(&code_point_str, 16).ok()
Error::syntax( })
"malformed Unicode character escape sequence", .ok_or_else(|| {
cursor.pos(), Error::syntax("malformed Unicode character escape sequence", start_pos)
)
})?; })?;
// UTF16Encoding of a numeric code point value // UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF { if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos())); return Err(Error::syntax(
} else if code_point <= 65535 { "Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
buf.push(code_point as u16); start_pos,
));
} else if let Some(code_units_buf) = code_units_buf {
if code_point <= 65535 {
code_units_buf.push(code_point as u16);
} else { } else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16; let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16; let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1); code_units_buf.push(cu1);
buf.push(cu2); code_units_buf.push(cu2);
} }
}
Ok(code_point)
} else { } else {
// Grammar: Hex4Digits
// Collect each character after \u, e.g. \uD83D will give "D83D" // Collect each character after \u, e.g. \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4]; let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?; cursor.fill_bytes(&mut code_point_utf8_bytes)?;
// Convert to u16 // Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes) let code_point = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence"); .ok()
let code_point = .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
u16::from_str_radix(code_point_str, 16).map_err(|_| { .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?;
Error::syntax(
"invalid Unicode escape sequence", if let Some(code_units_buf) = code_units_buf {
cursor.pos(), code_units_buf.push(code_point);
) }
})?;
Ok(code_point as u32)
}
}
buf.push(code_point); #[inline]
fn take_hex_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point = str::from_utf8(&code_point_utf8_bytes)
.ok()
.and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
.ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?;
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
} }
Ok(code_point as u32)
} }
n if char::is_digit(char::from(n), 8) => {
#[inline]
fn take_legacy_octal_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
strict_mode: bool,
init_byte: u8,
) -> Result<u32, Error>
where
R: Read,
{
if strict_mode { if strict_mode {
return Err(Error::syntax( return Err(Error::syntax(
"octal escape sequences are deprecated", "octal escape sequences are not allowed in strict mode",
cursor.pos(), start_pos,
)); ));
} }
let mut o = char::from(n).to_digit(8).unwrap(); // Grammar: OctalDigit
let mut code_point = (init_byte - b'0') as u32;
match cursor.peek()? { // Grammar: ZeroToThree OctalDigit
Some(c) if char::is_digit(char::from(c), 8) => { // Grammar: FourToSeven OctalDigit
if let Some(byte) = cursor.peek()? {
if (b'0'..=b'7').contains(&byte) {
let _ = cursor.next_byte()?; let _ = cursor.next_byte()?;
o = o * 8 + char::from(n).to_digit(8).unwrap(); code_point = (code_point * 8) + (byte - b'0') as u32;
if n <= b'3' {
match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
let _ = cursor.next_byte();
o = o * 8 + char::from(n).to_digit(8).unwrap();
}
_ => (),
}
}
}
_ => (),
}
buf.push(o as u16);
}
_ => buf.push(escape as u16),
};
}
}
Some(next_ch) => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
buf.extend(code_point_bytes.iter()); if (b'0'..=b'3').contains(&init_byte) {
// Grammar: ZeroToThree OctalDigit OctalDigit
if let Some(byte) = cursor.peek()? {
if (b'0'..=b'7').contains(&byte) {
let _ = cursor.next_byte()?;
code_point = (code_point * 8) + (byte - b'0') as u32;
} }
} }
None if terminator != StringTerminator::End => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
} }
None => {
break;
} }
} }
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point as u16);
} }
Ok(( Ok(code_point)
String::from_utf16_lossy(buf.as_slice()), }
Span::new(start_pos, cursor.pos()),
))
} }

6
boa/src/syntax/lexer/template.rs

@ -3,7 +3,7 @@
use super::{Cursor, Error, Tokenizer}; use super::{Cursor, Error, Tokenizer};
use crate::{ use crate::{
profiler::BoaProfiler, profiler::BoaProfiler,
syntax::lexer::string::{unescape_string, StringTerminator}, syntax::lexer::string::{StringLiteral, StringTerminator},
syntax::{ syntax::{
ast::{Position, Span}, ast::{Position, Span},
lexer::{Token, TokenKind}, lexer::{Token, TokenKind},
@ -44,7 +44,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
match next_chr { match next_chr {
'`' => { '`' => {
let raw = String::from_utf16_lossy(buf.as_slice()); let raw = String::from_utf16_lossy(buf.as_slice());
let (cooked, _) = unescape_string( let (cooked, _) = StringLiteral::take_string_characters(
&mut Cursor::with_position(raw.as_bytes(), start_pos), &mut Cursor::with_position(raw.as_bytes(), start_pos),
start_pos, start_pos,
StringTerminator::End, StringTerminator::End,
@ -58,7 +58,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
'$' if cursor.peek()? == Some(b'{') => { '$' if cursor.peek()? == Some(b'{') => {
let _ = cursor.next_byte()?; let _ = cursor.next_byte()?;
let raw = String::from_utf16_lossy(buf.as_slice()); let raw = String::from_utf16_lossy(buf.as_slice());
let (cooked, _) = unescape_string( let (cooked, _) = StringLiteral::take_string_characters(
&mut Cursor::with_position(raw.as_bytes(), start_pos), &mut Cursor::with_position(raw.as_bytes(), start_pos),
start_pos, start_pos,
StringTerminator::End, StringTerminator::End,

127
boa/src/syntax/lexer/tests.rs

@ -6,7 +6,7 @@ use super::token::Numeric;
use super::*; use super::*;
use super::{Error, Position}; use super::{Error, Position};
use crate::syntax::ast::Keyword; use crate::syntax::ast::Keyword;
use crate::syntax::lexer::string::{unescape_string, StringTerminator}; use crate::syntax::lexer::string::{StringLiteral, StringTerminator};
use std::str; use std::str;
fn span(start: (u32, u32), end: (u32, u32)) -> Span { fn span(start: (u32, u32), end: (u32, u32)) -> Span {
@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() {
} }
#[test] #[test]
fn codepoint_with_no_braces() { fn string_codepoint_with_no_braces() {
let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]); let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]);
assert!(lexer.next().is_ok()); assert!(lexer.next().is_ok());
} }
@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() {
} }
#[test] #[test]
fn non_english_str() { fn string_unicode() {
let str = r#"'中文';"#; let str = r#"'中文';"#;
let mut lexer = Lexer::new(str.as_bytes()); let mut lexer = Lexer::new(str.as_bytes());
@ -828,7 +828,7 @@ fn non_english_str() {
} }
#[test] #[test]
fn unicode_escape_with_braces() { fn string_unicode_escape_with_braces() {
let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]); let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]);
let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())]; let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
@ -859,12 +859,12 @@ fn unicode_escape_with_braces() {
} }
#[test] #[test]
fn unicode_escape_with_braces_() { fn take_string_characters_unicode_escape_with_braces_2() {
let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string(); let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string();
let mut cursor = Cursor::new(s.as_bytes()); let mut cursor = Cursor::new(s.as_bytes());
if let Ok((s, _)) = unescape_string( if let Ok((s, _)) = StringLiteral::take_string_characters(
&mut cursor, &mut cursor,
Position::new(1, 1), Position::new(1, 1),
StringTerminator::End, StringTerminator::End,
@ -877,10 +877,10 @@ fn unicode_escape_with_braces_() {
} }
#[test] #[test]
fn unescape_string_with_single_escape() { fn take_string_characters_with_single_escape() {
let s = r#"\Б"#.to_string(); let s = r#"\Б"#.to_string();
let mut cursor = Cursor::new(s.as_bytes()); let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = unescape_string( let (s, _) = StringLiteral::take_string_characters(
&mut cursor, &mut cursor,
Position::new(1, 1), Position::new(1, 1),
StringTerminator::End, StringTerminator::End,
@ -890,6 +890,117 @@ fn unescape_string_with_single_escape() {
assert_eq!(s, "Б"); assert_eq!(s, "Б");
} }
#[test]
fn take_string_characters_legacy_octal_escape() {
let test_cases = [
(r#"\3"#, "\u{3}"),
(r#"\03"#, "\u{3}"),
(r#"\003"#, "\u{3}"),
(r#"\0003"#, "\u{0}3"),
(r#"\43"#, "#"),
(r#"\043"#, "#"),
(r#"\101"#, "A"),
];
for (s, expected) in test_cases.iter() {
let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
false,
)
.unwrap();
assert_eq!(s, *expected);
}
for (s, _) in test_cases.iter() {
let mut cursor = Cursor::new(s.as_bytes());
if let Error::Syntax(_, pos) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
true,
)
.expect_err("Octal-escape in strict mode not rejected as expected")
{
assert_eq!(pos, Position::new(1, 1));
} else {
panic!("invalid error type");
}
}
}
#[test]
fn take_string_characters_zero_escape() {
let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")];
for (s, expected) in test_cases.iter() {
let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
false,
)
.unwrap();
assert_eq!(s, *expected);
}
}
#[test]
fn take_string_characters_non_octal_decimal_escape() {
let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")];
for (s, expected) in test_cases.iter() {
let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
false,
)
.unwrap();
assert_eq!(s, *expected);
}
for (s, _) in test_cases.iter() {
let mut cursor = Cursor::new(s.as_bytes());
if let Error::Syntax(_, pos) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
true,
)
.expect_err("Non-octal-decimal-escape in strict mode not rejected as expected")
{
assert_eq!(pos, Position::new(1, 1));
} else {
panic!("invalid error type");
}
}
}
#[test]
fn take_string_characters_line_continuation() {
let s = "hello \\\nworld";
let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
false,
)
.unwrap();
assert_eq!(s, "hello world");
}
mod carriage_return { mod carriage_return {
use super::*; use super::*;

Loading…
Cancel
Save