Browse Source

Fix lexing escapes in string literal and minor refactor (#1079)

* Refactor StringLiteral

* Fix octal escape in string literal

* Add tests

* Fix zero escape

* Fix zero escape lookahead

* Rename variables

* Rename helper functions

* Refactor match arms

* Fix escape line terminator sequence

* Fix single character escape

* Fix escape followed by unicode char

* Add NonOctalDecimalEscapeSequence

* Fix comment

* Refactor

* Modify error message

* Add tests

* Rename tests

* Add test for error

* Add comments for unsafe bytes to str

* Update boa/src/syntax/lexer/string.rs

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>

* Minor refactor

* Remove unsafe bytes to str

* Fix panic when reading invalid utf-8 chars

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
pull/1091/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
00fc5e22bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 306
      boa/src/syntax/lexer/string.rs
  2. 6
      boa/src/syntax/lexer/template.rs
  3. 127
      boa/src/syntax/lexer/tests.rs

306
boa/src/syntax/lexer/string.rs

@ -58,171 +58,273 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");
let (lit, span) =
unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?;
Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;
Ok(Token::new(TokenKind::string_literal(lit), span))
}
}
pub(super) fn unescape_string<R>(
impl StringLiteral {
/// Reports whether `ch` is one of the four code points that ECMAScript treats as a
/// `LineTerminator`: `<LF>`, `<CR>`, `<LS>` or `<PS>`.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-LineTerminator
#[inline]
pub(super) fn is_line_terminator(ch: char) -> bool {
    const LINE_TERMINATORS: [char; 4] = [
        '\u{000A}', // <LF>
        '\u{000D}', // <CR>
        '\u{2028}', // <LS>
        '\u{2029}', // <PS>
    ];
    LINE_TERMINATORS.contains(&ch)
}
pub(super) fn take_string_characters<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
terminator: StringTerminator,
strict_mode: bool,
) -> Result<(String, Span), Error>
where
) -> Result<(String, Span), Error>
where
R: Read,
{
{
let mut buf = Vec::new();
loop {
let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap();
let ch_start_pos = cursor.pos();
let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap();
match next_chr {
match ch {
Some('\'') if terminator == StringTerminator::SingleQuote => {
break;
}
Some('"') if terminator == StringTerminator::DoubleQuote => {
break;
}
None if terminator == StringTerminator::End => {
break;
}
Some('\\') => {
let _timer =
BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing");
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.peek()?.ok_or_else(|| {
let escape_ch = cursor
.next_char()?
.and_then(|byte| char::try_from(byte).ok())
.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in literal",
))
})?;
if escape <= 0x7f {
let _ = cursor.next_byte()?;
match escape {
b'\n' => (),
b'n' => buf.push('\n' as u16),
b'r' => buf.push('\r' as u16),
b't' => buf.push('\t' as u16),
b'b' => buf.push('\x08' as u16),
b'f' => buf.push('\x0c' as u16),
b'0' => buf.push('\0' as u16),
b'x' => {
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Hexadecimal character escape sequence");
let code_point =
u16::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Hexadecimal escape sequence",
cursor.pos(),
)
})?;
match escape_ch {
'b' => buf.push(0x0008 /* <BS> */),
't' => buf.push(0x0009 /* <HT> */),
'n' => buf.push(0x000A /* <LF> */),
'v' => buf.push(0x000B /* <VT> */),
'f' => buf.push(0x000C /* <FF> */),
'r' => buf.push(0x000D /* <CR> */),
'"' => buf.push(0x0022 /* " */),
'\'' => buf.push(0x0027 /* ' */),
'\\' => buf.push(0x005C /* \ */),
'0' if cursor
.peek()?
.filter(|next_byte| (b'0'..=b'9').contains(next_byte))
.is_none() =>
{
buf.push(0x0000 /* NULL */)
}
'x' => {
Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
}
'u' => {
Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
}
'8' | '9' => {
// Grammar: NonOctalDecimalEscapeSequence
if strict_mode {
return Err(Error::syntax(
"\\8 and \\9 are not allowed in strict mode",
ch_start_pos,
));
} else {
buf.push(escape_ch as u16);
}
}
_ if escape_ch.is_digit(8) => {
Self::take_legacy_octal_escape_sequence(
cursor,
ch_start_pos,
Some(&mut buf),
strict_mode,
escape_ch as u8,
)?;
}
_ if Self::is_line_terminator(escape_ch) => {
// Grammar: LineContinuation
// Grammar: \ LineTerminatorSequence
// LineContinuation is the empty String. Do nothing and continue lexing.
}
_ => {
if escape_ch.len_utf16() == 1 {
buf.push(escape_ch as u16);
} else {
buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter());
}
}
};
}
Some(ch) => {
if ch.len_utf16() == 1 {
buf.push(ch as u16);
} else {
buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter());
}
}
None => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
}
}
buf.push(code_point);
Ok((
String::from_utf16_lossy(buf.as_slice()),
Span::new(start_pos, cursor.pos()),
))
}
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
#[inline]
pub(super) fn take_unicode_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
// Support \u{X..X} (Unicode CodePoint)
if cursor.next_is(b'{')? {
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;
let code_point_str =
unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
// We know this is a single unicode codepoint, convert to u32
let code_point =
u32::from_str_radix(&code_point_str, 16).map_err(|_| {
Error::syntax(
"malformed Unicode character escape sequence",
cursor.pos(),
)
let code_point = str::from_utf8(code_point_buf.as_slice())
.ok()
.and_then(|code_point_str| {
// The `code_point_str` should represent a single unicode codepoint, convert to u32
u32::from_str_radix(&code_point_str, 16).ok()
})
.ok_or_else(|| {
Error::syntax("malformed Unicode character escape sequence", start_pos)
})?;
// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
} else if code_point <= 65535 {
buf.push(code_point as u16);
return Err(Error::syntax(
"Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
start_pos,
));
} else if let Some(code_units_buf) = code_units_buf {
if code_point <= 65535 {
code_units_buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1);
buf.push(cu2);
code_units_buf.push(cu1);
code_units_buf.push(cu2);
}
}
Ok(code_point)
} else {
// Grammar: Hex4Digits
// Collect each character after \u e.g \uD83D will give "D83D"
let mut code_point_utf8_bytes = [0u8; 4];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
// Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Unicode character escape sequence");
let code_point =
u16::from_str_radix(code_point_str, 16).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?;
let code_point = str::from_utf8(&code_point_utf8_bytes)
.ok()
.and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
.ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?;
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
}
Ok(code_point as u32)
}
}
#[inline]
fn take_hex_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
) -> Result<u32, Error>
where
R: Read,
{
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point = str::from_utf8(&code_point_utf8_bytes)
.ok()
.and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
.ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?;
buf.push(code_point);
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point);
}
Ok(code_point as u32)
}
n if char::is_digit(char::from(n), 8) => {
#[inline]
fn take_legacy_octal_escape_sequence<R>(
cursor: &mut Cursor<R>,
start_pos: Position,
code_units_buf: Option<&mut Vec<u16>>,
strict_mode: bool,
init_byte: u8,
) -> Result<u32, Error>
where
R: Read,
{
if strict_mode {
return Err(Error::syntax(
"octal escape sequences are deprecated",
cursor.pos(),
"octal escape sequences are not allowed in strict mode",
start_pos,
));
}
let mut o = char::from(n).to_digit(8).unwrap();
// Grammar: OctalDigit
let mut code_point = (init_byte - b'0') as u32;
match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
// Grammar: ZeroToThree OctalDigit
// Grammar: FourToSeven OctalDigit
if let Some(byte) = cursor.peek()? {
if (b'0'..=b'7').contains(&byte) {
let _ = cursor.next_byte()?;
o = o * 8 + char::from(n).to_digit(8).unwrap();
if n <= b'3' {
match cursor.peek()? {
Some(c) if char::is_digit(char::from(c), 8) => {
let _ = cursor.next_byte();
o = o * 8 + char::from(n).to_digit(8).unwrap();
}
_ => (),
}
}
}
_ => (),
}
buf.push(o as u16);
}
_ => buf.push(escape as u16),
};
}
}
Some(next_ch) => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
code_point = (code_point * 8) + (byte - b'0') as u32;
buf.extend(code_point_bytes.iter());
if (b'0'..=b'3').contains(&init_byte) {
// Grammar: ZeroToThree OctalDigit OctalDigit
if let Some(byte) = cursor.peek()? {
if (b'0'..=b'7').contains(&byte) {
let _ = cursor.next_byte()?;
code_point = (code_point * 8) + (byte - b'0') as u32;
}
}
None if terminator != StringTerminator::End => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
)));
}
None => {
break;
}
}
if let Some(code_units_buf) = code_units_buf {
code_units_buf.push(code_point as u16);
}
Ok((
String::from_utf16_lossy(buf.as_slice()),
Span::new(start_pos, cursor.pos()),
))
Ok(code_point)
}
}

6
boa/src/syntax/lexer/template.rs

@ -3,7 +3,7 @@
use super::{Cursor, Error, Tokenizer};
use crate::{
profiler::BoaProfiler,
syntax::lexer::string::{unescape_string, StringTerminator},
syntax::lexer::string::{StringLiteral, StringTerminator},
syntax::{
ast::{Position, Span},
lexer::{Token, TokenKind},
@ -44,7 +44,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
match next_chr {
'`' => {
let raw = String::from_utf16_lossy(buf.as_slice());
let (cooked, _) = unescape_string(
let (cooked, _) = StringLiteral::take_string_characters(
&mut Cursor::with_position(raw.as_bytes(), start_pos),
start_pos,
StringTerminator::End,
@ -58,7 +58,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
'$' if cursor.peek()? == Some(b'{') => {
let _ = cursor.next_byte()?;
let raw = String::from_utf16_lossy(buf.as_slice());
let (cooked, _) = unescape_string(
let (cooked, _) = StringLiteral::take_string_characters(
&mut Cursor::with_position(raw.as_bytes(), start_pos),
start_pos,
StringTerminator::End,

127
boa/src/syntax/lexer/tests.rs

@ -6,7 +6,7 @@ use super::token::Numeric;
use super::*;
use super::{Error, Position};
use crate::syntax::ast::Keyword;
use crate::syntax::lexer::string::{unescape_string, StringTerminator};
use crate::syntax::lexer::string::{StringLiteral, StringTerminator};
use std::str;
fn span(start: (u32, u32), end: (u32, u32)) -> Span {
@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() {
}
#[test]
fn codepoint_with_no_braces() {
fn string_codepoint_with_no_braces() {
let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]);
assert!(lexer.next().is_ok());
}
@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() {
}
#[test]
fn non_english_str() {
fn string_unicode() {
let str = r#"'中文';"#;
let mut lexer = Lexer::new(str.as_bytes());
@ -828,7 +828,7 @@ fn non_english_str() {
}
#[test]
fn unicode_escape_with_braces() {
fn string_unicode_escape_with_braces() {
let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]);
let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
@ -859,12 +859,12 @@ fn unicode_escape_with_braces() {
}
#[test]
fn unicode_escape_with_braces_() {
fn take_string_characters_unicode_escape_with_braces_2() {
let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string();
let mut cursor = Cursor::new(s.as_bytes());
if let Ok((s, _)) = unescape_string(
if let Ok((s, _)) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
@ -877,10 +877,10 @@ fn unicode_escape_with_braces_() {
}
#[test]
fn unescape_string_with_single_escape() {
fn take_string_characters_with_single_escape() {
let s = r#"\Б"#.to_string();
let mut cursor = Cursor::new(s.as_bytes());
let (s, _) = unescape_string(
let (s, _) = StringLiteral::take_string_characters(
&mut cursor,
Position::new(1, 1),
StringTerminator::End,
@ -890,6 +890,117 @@ fn unescape_string_with_single_escape() {
assert_eq!(s, "Б");
}
/// Legacy octal escapes are cooked to their code point when strict mode is off and are
/// rejected with a syntax error positioned at (1, 1) when strict mode is on.
#[test]
fn take_string_characters_legacy_octal_escape() {
    // (source text, value expected when strict mode is off)
    let test_cases = [
        (r#"\3"#, "\u{3}"),
        (r#"\03"#, "\u{3}"),
        (r#"\003"#, "\u{3}"),
        (r#"\0003"#, "\u{0}3"),
        (r#"\43"#, "#"),
        (r#"\043"#, "#"),
        (r#"\101"#, "A"),
    ];

    // Strict mode off: every case must lex to the expected cooked string.
    for &(source, expected) in &test_cases {
        let mut cursor = Cursor::new(source.as_bytes());
        let lexed = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            false,
        );
        let (cooked, _) = lexed.unwrap();
        assert_eq!(cooked, expected);
    }

    // Strict mode on: every case must fail with a syntax error at the start position.
    for &(source, _) in &test_cases {
        let mut cursor = Cursor::new(source.as_bytes());
        let error = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            true,
        )
        .expect_err("Octal-escape in strict mode not rejected as expected");
        match error {
            Error::Syntax(_, pos) => assert_eq!(pos, Position::new(1, 1)),
            _ => panic!("invalid error type"),
        }
    }
}
/// `\0` not followed by a decimal digit is cooked to U+0000, and the character after it
/// is kept as-is.
#[test]
fn take_string_characters_zero_escape() {
    // (source text, expected cooked value)
    let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")];

    for &(source, expected) in &test_cases {
        let mut cursor = Cursor::new(source.as_bytes());
        let lexed = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            false,
        );
        let (cooked, _) = lexed.unwrap();
        assert_eq!(cooked, expected);
    }
}
/// `\8` and `\9` are cooked to the bare digit when strict mode is off and are rejected
/// with a syntax error positioned at (1, 1) when strict mode is on.
#[test]
fn take_string_characters_non_octal_decimal_escape() {
    // (source text, value expected when strict mode is off)
    let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")];

    // Strict mode off: the escape yields the digit itself.
    for &(source, expected) in &test_cases {
        let mut cursor = Cursor::new(source.as_bytes());
        let lexed = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            false,
        );
        let (cooked, _) = lexed.unwrap();
        assert_eq!(cooked, expected);
    }

    // Strict mode on: the escape must fail with a syntax error at the start position.
    for &(source, _) in &test_cases {
        let mut cursor = Cursor::new(source.as_bytes());
        let error = StringLiteral::take_string_characters(
            &mut cursor,
            Position::new(1, 1),
            StringTerminator::End,
            true,
        )
        .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected");
        match error {
            Error::Syntax(_, pos) => assert_eq!(pos, Position::new(1, 1)),
            _ => panic!("invalid error type"),
        }
    }
}
/// A backslash immediately followed by a line feed contributes nothing to the cooked
/// string.
#[test]
fn take_string_characters_line_continuation() {
    let source = "hello \\\nworld";
    let mut cursor = Cursor::new(source.as_bytes());

    let lexed = StringLiteral::take_string_characters(
        &mut cursor,
        Position::new(1, 1),
        StringTerminator::End,
        false,
    );

    let (cooked, _) = lexed.unwrap();
    assert_eq!(cooked, "hello world");
}
mod carriage_return {
use super::*;

Loading…
Cancel
Save