Fix lexing escapes in string literal and minor refactor (#1079)

* Refactor StringLiteral
* Fix octal escape in string literal
* Add tests
* Fix zero escape
* Fix zero escape lookahead
* Rename variables
* Rename helper functions
* Refactor match arms
* Fix escape line terminator sequence
* Fix single character escape
* Fix escape followed by unicode char
* Add NonOctalDecimalEscapeSequence
* Fix comment
* Refactor
* Modify error message
* Add tests
* Rename tests
* Add test for error
* Add comments for unsafe bytes to str
* Update boa/src/syntax/lexer/string.rs

  Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
* Minor refactor
* Remove unsafe bytes to str
* Fix panic when reading invalid utf-8 chars

Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
4 years ago · 00fc5e22bc
3 changed files with 366 additions and 153 deletions
--- a/boa/src/syntax/lexer/string.rs
+++ b/boa/src/syntax/lexer/string.rs
@ -58,171 +58,273 @@ impl<R> Tokenizer<R> for StringLiteral {
        let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");
        let (lit, span) =
-            unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?;
+            Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;
        Ok(Token::new(TokenKind::string_literal(lit), span))
    }
 }
-pub(super) fn unescape_string<R>(
+impl StringLiteral {
-    cursor: &mut Cursor<R>,
+    /// Checks if a character is LineTerminator as per ECMAScript standards.
-    start_pos: Position,
+    ///
-    terminator: StringTerminator,
+    /// More information:
-    strict_mode: bool,
+    ///  - [ECMAScript reference][spec]
-) -> Result<(String, Span), Error>
+    ///
-where
+    /// [spec]: https://tc39.es/ecma262/#prod-LineTerminator
-    R: Read,
+    #[inline]
-{
+    pub(super) fn is_line_terminator(ch: char) -> bool {
-    let mut buf = Vec::new();
+        matches!(
-    loop {
+            ch,
-        let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap();
+            '\u{000A}' /* <LF> */ | '\u{000D}' /* <CR> */ | '\u{2028}' /* <LS> */ | '\u{2029}' /* <PS> */
-
+        )
-        match next_chr {
+    }
            Some('\'') if terminator == StringTerminator::SingleQuote => {
                break;
            }
            Some('"') if terminator == StringTerminator::DoubleQuote => {
                break;
            }
            Some('\\') => {
                let _timer =
                    BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing");
-                let escape = cursor.peek()?.ok_or_else(|| {
+    pub(super) fn take_string_characters<R>(
-                    Error::from(io::Error::new(
+        cursor: &mut Cursor<R>,
-                        ErrorKind::UnexpectedEof,
+        start_pos: Position,
-                        "unterminated escape sequence in literal",
+        terminator: StringTerminator,
-                    ))
+        strict_mode: bool,
-                })?;
+    ) -> Result<(String, Span), Error>
    where
        R: Read,
    {
        let mut buf = Vec::new();
        loop {
            let ch_start_pos = cursor.pos();
            let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap();
            match ch {
                Some('\'') if terminator == StringTerminator::SingleQuote => {
                    break;
                }
                Some('"') if terminator == StringTerminator::DoubleQuote => {
                    break;
                }
                None if terminator == StringTerminator::End => {
                    break;
                }
                Some('\\') => {
                    let _timer = BoaProfiler::global()
                        .start_event("StringLiteral - escape sequence", "Lexing");
-                if escape <= 0x7f {
+                    let escape_ch = cursor
-                    let _ = cursor.next_byte()?;
+                        .next_char()?
-                    match escape {
+                        .and_then(|byte| char::try_from(byte).ok())
-                        b'\n' => (),
+                        .ok_or_else(|| {
-                        b'n' => buf.push('\n' as u16),
+                            Error::from(io::Error::new(
-                        b'r' => buf.push('\r' as u16),
+                                ErrorKind::UnexpectedEof,
-                        b't' => buf.push('\t' as u16),
+                                "unterminated escape sequence in literal",
-                        b'b' => buf.push('\x08' as u16),
+                            ))
-                        b'f' => buf.push('\x0c' as u16),
+                        })?;
-                        b'0' => buf.push('\0' as u16),
+
-                        b'x' => {
+                    match escape_ch {
-                            let mut code_point_utf8_bytes = [0u8; 2];
+                        'b' => buf.push(0x0008 /* <BS> */),
-                            cursor.fill_bytes(&mut code_point_utf8_bytes)?;
+                        't' => buf.push(0x0009 /* <HT> */),
-                            let code_point_str = str::from_utf8(&code_point_utf8_bytes)
+                        'n' => buf.push(0x000A /* <LF> */),
-                                .expect("malformed Hexadecimal character escape sequence");
+                        'v' => buf.push(0x000B /* <VT> */),
-                            let code_point =
+                        'f' => buf.push(0x000C /* <FF> */),
-                                u16::from_str_radix(&code_point_str, 16).map_err(|_| {
+                        'r' => buf.push(0x000D /* <CR> */),
-                                    Error::syntax(
+                        '"' => buf.push(0x0022 /* " */),
-                                        "invalid Hexadecimal escape sequence",
+                        '\'' => buf.push(0x0027 /* ' */),
-                                        cursor.pos(),
+                        '\\' => buf.push(0x005C /* \ */),
-                                    )
+                        '0' if cursor
-                                })?;
+                            .peek()?
-
+                            .filter(|next_byte| (b'0'..=b'9').contains(next_byte))
-                            buf.push(code_point);
+                            .is_none() =>
                        {
                            buf.push(0x0000 /* NULL */)
                        }
-                        b'u' => {
+                        'x' => {
-                            // Support \u{X..X} (Unicode Codepoint)
+                            Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
-                            if cursor.next_is(b'{')? {
+                        }
-                                // TODO: use bytes for a bit better performance (using stack)
+                        'u' => {
-                                let mut code_point_buf = Vec::with_capacity(6);
+                            Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
                                cursor.take_until(b'}', &mut code_point_buf)?;
                                let code_point_str =
                                    unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
                                // We know this is a single unicode codepoint, convert to u32
                                let code_point =
                                    u32::from_str_radix(&code_point_str, 16).map_err(|_| {
                                        Error::syntax(
                                            "malformed Unicode character escape sequence",
                                            cursor.pos(),
                                        )
                                    })?;
                                // UTF16Encoding of a numeric code point value
                                if code_point > 0x10_FFFF {
                                    return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
                                } else if code_point <= 65535 {
                                    buf.push(code_point as u16);
                                } else {
                                    let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
                                    let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
                                    buf.push(cu1);
                                    buf.push(cu2);
                                }
                            } else {
                                // Collect each character after \u e.g \uD83D will give "D83D"
                                let mut code_point_utf8_bytes = [0u8; 4];
                                cursor.fill_bytes(&mut code_point_utf8_bytes)?;
                                // Convert to u16
                                let code_point_str = str::from_utf8(&code_point_utf8_bytes)
                                    .expect("malformed Unicode character escape sequence");
                                let code_point =
                                    u16::from_str_radix(code_point_str, 16).map_err(|_| {
                                        Error::syntax(
                                            "invalid Unicode escape sequence",
                                            cursor.pos(),
                                        )
                                    })?;
                                buf.push(code_point);
                            }
                        }
-                        n if char::is_digit(char::from(n), 8) => {
+                        '8' | '9' => {
                            // Grammar: NonOctalDecimalEscapeSequence
                            if strict_mode {
                                return Err(Error::syntax(
-                                    "octal escape sequences are deprecated",
+                                    "\\8 and \\9 are not allowed in strict mode",
-                                    cursor.pos(),
+                                    ch_start_pos,
                                ));
                            } else {
                                buf.push(escape_ch as u16);
                            }
-                            let mut o = char::from(n).to_digit(8).unwrap();
+                        }
-
+                        _ if escape_ch.is_digit(8) => {
-                            match cursor.peek()? {
+                            Self::take_legacy_octal_escape_sequence(
-                                Some(c) if char::is_digit(char::from(c), 8) => {
+                                cursor,
-                                    let _ = cursor.next_byte()?;
+                                ch_start_pos,
-                                    o = o * 8 + char::from(n).to_digit(8).unwrap();
+                                Some(&mut buf),
-                                    if n <= b'3' {
+                                strict_mode,
-                                        match cursor.peek()? {
+                                escape_ch as u8,
-                                            Some(c) if char::is_digit(char::from(c), 8) => {
+                            )?;
-                                                let _ = cursor.next_byte();
+                        }
-                                                o = o * 8 + char::from(n).to_digit(8).unwrap();
+                        _ if Self::is_line_terminator(escape_ch) => {
-                                            }
+                            // Grammar: LineContinuation
-                                            _ => (),
+                            // Grammar: \ LineTerminatorSequence
-                                        }
+                            // LineContinuation is the empty String. Do nothing and continue lexing.
-                                    }
+                        }
-                                }
+                        _ => {
-                                _ => (),
+                            if escape_ch.len_utf16() == 1 {
                                buf.push(escape_ch as u16);
                            } else {
                                buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter());
                            }
                            buf.push(o as u16);
                        }
                        _ => buf.push(escape as u16),
                    };
                }
                Some(ch) => {
                    if ch.len_utf16() == 1 {
                        buf.push(ch as u16);
                    } else {
                        buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter());
                    }
                }
                None => {
                    return Err(Error::from(io::Error::new(
                        ErrorKind::UnexpectedEof,
                        "unterminated string literal",
                    )));
                }
            }
-            Some(next_ch) => {
+        }
-                if next_ch.len_utf16() == 1 {
+
-                    buf.push(next_ch as u16);
+        Ok((
-                } else {
+            String::from_utf16_lossy(buf.as_slice()),
-                    let mut code_point_bytes_buf = [0u16; 2];
+            Span::new(start_pos, cursor.pos()),
-                    let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
+        ))
    }
    /// Lexes the part of a Unicode escape sequence that follows `\u`.
    ///
    /// As called from `take_string_characters`, the leading `\u` has already been consumed.
    /// Two forms are handled:
    ///
    ///  - `{X..X}`: everything up to the closing `}` is parsed as a hexadecimal `u32` code
    ///    point, which must not exceed `0x10FFFF`.
    ///  - `XXXX`: exactly four bytes are read and parsed as a hexadecimal `u16` code unit.
    ///
    /// When `code_units_buf` is `Some`, the UTF-16 code unit(s) of the escape are appended to
    /// it; when it is `None`, the escape is only validated. The decoded value is returned as
    /// a `u32` in both cases.
    ///
    /// # Errors
    ///
    /// Returns a syntax error located at `start_pos` when the digits cannot be parsed or the
    /// code point is greater than `0x10FFFF`. Errors from the cursor are propagated with `?`.
    ///
    /// NOTE(review): `from_str_radix` also accepts a leading `+` sign, so input such as
    /// `\u{+41}` or `\u+041` looks like it would be accepted here — confirm and reject if so.
    #[inline]
    pub(super) fn take_unicode_escape_sequence<R>(
        cursor: &mut Cursor<R>,
        start_pos: Position,
        code_units_buf: Option<&mut Vec<u16>>,
    ) -> Result<u32, Error>
    where
        R: Read,
    {
        // Support \u{X..X} (Unicode CodePoint)
        if cursor.next_is(b'{')? {
            // TODO: use bytes for a bit better performance (using stack)
            let mut code_point_buf = Vec::with_capacity(6);
            cursor.take_until(b'}', &mut code_point_buf)?;
            // Both failure modes (bytes that are not valid UTF-8, text that is not a
            // hexadecimal number) collapse into the same syntax error below.
            let code_point = str::from_utf8(code_point_buf.as_slice())
                .ok()
                .and_then(|code_point_str| {
                    // The `code_point_str` should represent a single unicode codepoint, convert to u32
                    u32::from_str_radix(&code_point_str, 16).ok()
                })
                .ok_or_else(|| {
                    Error::syntax("malformed Unicode character escape sequence", start_pos)
                })?;
-                    buf.extend(code_point_bytes.iter());
+            // UTF16Encoding of a numeric code point value
            if code_point > 0x10_FFFF {
                return Err(Error::syntax(
                    "Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
                    start_pos,
                ));
            } else if let Some(code_units_buf) = code_units_buf {
                if code_point <= 65535 {
                    // Fits in a single UTF-16 code unit.
                    code_units_buf.push(code_point as u16);
                } else {
                    // Above 0xFFFF: split into a surrogate pair (0xD800-based unit first,
                    // 0xDC00-based unit second).
                    let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
                    let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
                    code_units_buf.push(cu1);
                    code_units_buf.push(cu2);
                }
            }
-            None if terminator != StringTerminator::End => {
+
-                return Err(Error::from(io::Error::new(
+            Ok(code_point)
-                    ErrorKind::UnexpectedEof,
+        } else {
-                    "unterminated string literal",
+            // Grammar: Hex4Digits
-                )));
+            // Collect each character after \u e.g \uD83D will give "D83D"
-            }
+            let mut code_point_utf8_bytes = [0u8; 4];
-            None => {
+            cursor.fill_bytes(&mut code_point_utf8_bytes)?;
-                break;
+
            // Convert to u16
            // The value is pushed as a single code unit without further checks, so a lone
            // surrogate such as `\uD83D` is stored as written.
            let code_point = str::from_utf8(&code_point_utf8_bytes)
                .ok()
                .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
                .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?;
            if let Some(code_units_buf) = code_units_buf {
                code_units_buf.push(code_point);
            }
            Ok(code_point as u32)
        }
    }
    #[inline]
    fn take_hex_escape_sequence<R>(
        cursor: &mut Cursor<R>,
        start_pos: Position,
        code_units_buf: Option<&mut Vec<u16>>,
    ) -> Result<u32, Error>
    where
        R: Read,
    {
        let mut code_point_utf8_bytes = [0u8; 2];
        cursor.fill_bytes(&mut code_point_utf8_bytes)?;
        let code_point = str::from_utf8(&code_point_utf8_bytes)
            .ok()
            .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
            .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?;
        if let Some(code_units_buf) = code_units_buf {
            code_units_buf.push(code_point);
        }
        Ok(code_point as u32)
    }
-    Ok((
+    #[inline]
-        String::from_utf16_lossy(buf.as_slice()),
+    fn take_legacy_octal_escape_sequence<R>(
-        Span::new(start_pos, cursor.pos()),
+        cursor: &mut Cursor<R>,
-    ))
+        start_pos: Position,
        code_units_buf: Option<&mut Vec<u16>>,
        strict_mode: bool,
        init_byte: u8,
    ) -> Result<u32, Error>
    where
        R: Read,
    {
        // Lexes the rest of a legacy octal escape sequence whose first digit (`init_byte`)
        // has already been consumed by the caller.
        //
        // Up to two further octal digits are consumed from `cursor`. The resulting value is
        // appended to `code_units_buf` (when one is supplied) as a single UTF-16 code unit
        // and returned. In strict mode the escape is rejected with a syntax error located
        // at `start_pos` before anything else is read.
        if strict_mode {
            return Err(Error::syntax(
                "octal escape sequences are not allowed in strict mode",
                start_pos,
            ));
        }
        // Grammar: OctalDigit
        // Assumes `init_byte` is an ASCII octal digit, as the visible caller guarantees via
        // `escape_ch.is_digit(8)`; any byte below b'0' would underflow this subtraction.
        let mut code_point = (init_byte - b'0') as u32;
        // Grammar: ZeroToThree OctalDigit
        // Grammar: FourToSeven OctalDigit
        // Peek first so that a non-octal byte is left in the cursor for the caller.
        if let Some(byte) = cursor.peek()? {
            if (b'0'..=b'7').contains(&byte) {
                let _ = cursor.next_byte()?;
                code_point = (code_point * 8) + (byte - b'0') as u32;
                // A third digit is only taken when the first one is 0..=3, which caps the
                // value at 0o377 (255).
                if (b'0'..=b'3').contains(&init_byte) {
                    // Grammar: ZeroToThree OctalDigit OctalDigit
                    if let Some(byte) = cursor.peek()? {
                        if (b'0'..=b'7').contains(&byte) {
                            let _ = cursor.next_byte()?;
                            code_point = (code_point * 8) + (byte - b'0') as u32;
                        }
                    }
                }
            }
        }
        if let Some(code_units_buf) = code_units_buf {
            // The value is at most 255 here, so the cast to `u16` cannot truncate.
            code_units_buf.push(code_point as u16);
        }
        Ok(code_point)
    }
 }
--- a/boa/src/syntax/lexer/template.rs
+++ b/boa/src/syntax/lexer/template.rs
@ -3,7 +3,7 @@
 use super::{Cursor, Error, Tokenizer};
 use crate::{
    profiler::BoaProfiler,
-    syntax::lexer::string::{unescape_string, StringTerminator},
+    syntax::lexer::string::{StringLiteral, StringTerminator},
    syntax::{
        ast::{Position, Span},
        lexer::{Token, TokenKind},
@ -44,7 +44,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
            match next_chr {
                '`' => {
                    let raw = String::from_utf16_lossy(buf.as_slice());
-                    let (cooked, _) = unescape_string(
+                    let (cooked, _) = StringLiteral::take_string_characters(
                        &mut Cursor::with_position(raw.as_bytes(), start_pos),
                        start_pos,
                        StringTerminator::End,
@ -58,7 +58,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
                '$' if cursor.peek()? == Some(b'{') => {
                    let _ = cursor.next_byte()?;
                    let raw = String::from_utf16_lossy(buf.as_slice());
-                    let (cooked, _) = unescape_string(
+                    let (cooked, _) = StringLiteral::take_string_characters(
                        &mut Cursor::with_position(raw.as_bytes(), start_pos),
                        start_pos,
                        StringTerminator::End,
--- a/boa/src/syntax/lexer/tests.rs
+++ b/boa/src/syntax/lexer/tests.rs
@ -6,7 +6,7 @@ use super::token::Numeric;
 use super::*;
 use super::{Error, Position};
 use crate::syntax::ast::Keyword;
-use crate::syntax::lexer::string::{unescape_string, StringTerminator};
+use crate::syntax::lexer::string::{StringLiteral, StringTerminator};
 use std::str;
 fn span(start: (u32, u32), end: (u32, u32)) -> Span {
@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() {
 }
 #[test]
-fn codepoint_with_no_braces() {
+fn string_codepoint_with_no_braces() {
    let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]);
    assert!(lexer.next().is_ok());
 }
@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() {
 }
 #[test]
-fn non_english_str() {
+fn string_unicode() {
    let str = r#"'中文';"#;
    let mut lexer = Lexer::new(str.as_bytes());
@ -828,7 +828,7 @@ fn non_english_str() {
 }
 #[test]
-fn unicode_escape_with_braces() {
+fn string_unicode_escape_with_braces() {
    let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]);
    let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
@ -859,12 +859,12 @@ fn unicode_escape_with_braces() {
 }
 #[test]
-fn unicode_escape_with_braces_() {
+fn take_string_characters_unicode_escape_with_braces_2() {
    let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string();
    let mut cursor = Cursor::new(s.as_bytes());
-    if let Ok((s, _)) = unescape_string(
+    if let Ok((s, _)) = StringLiteral::take_string_characters(
        &mut cursor,
        Position::new(1, 1),
        StringTerminator::End,
@ -877,10 +877,10 @@ fn unicode_escape_with_braces_() {
 }
 #[test]
-fn unescape_string_with_single_escape() {
+fn take_string_characters_with_single_escape() {
    let s = r#"\Б"#.to_string();
    let mut cursor = Cursor::new(s.as_bytes());
-    let (s, _) = unescape_string(
+    let (s, _) = StringLiteral::take_string_characters(
        &mut cursor,
        Position::new(1, 1),
        StringTerminator::End,
@ -890,6 +890,117 @@ fn unescape_string_with_single_escape() {
    assert_eq!(s, "Б");
 }
 #[test]
 fn take_string_characters_legacy_octal_escape() {
    // Each pair is (escaped source text, expected cooked string in non-strict mode).
    let test_cases = [
        (r#"\3"#, "\u{3}"),
        (r#"\03"#, "\u{3}"),
        (r#"\003"#, "\u{3}"),
        (r#"\0003"#, "\u{0}3"),
        (r#"\43"#, "#"),
        (r#"\043"#, "#"),
        (r#"\101"#, "A"),
    ];

    // Lexes `input` as an unterminated-by-quote string with the given strictness.
    let lex = |input: &str, strict_mode: bool| {
        StringLiteral::take_string_characters(
            &mut Cursor::new(input.as_bytes()),
            Position::new(1, 1),
            StringTerminator::End,
            strict_mode,
        )
    };

    // Non-strict mode: every escape is accepted and cooks to the expected value.
    for &(input, expected) in &test_cases {
        let (cooked, _span) = lex(input, false).unwrap();
        assert_eq!(cooked, expected);
    }

    // Strict mode: every escape is rejected with a syntax error at the start position.
    for &(input, _) in &test_cases {
        let error = lex(input, true)
            .expect_err("Octal-escape in strict mode not rejected as expected");
        match error {
            Error::Syntax(_, pos) => assert_eq!(pos, Position::new(1, 1)),
            _ => panic!("invalid error type"),
        }
    }
 }
 #[test]
 fn take_string_characters_zero_escape() {
    // `\0` that is not followed by a decimal digit cooks to U+0000; whatever comes after
    // it is kept as an ordinary character.
    for &(input, expected) in &[(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")] {
        let lexed = StringLiteral::take_string_characters(
            &mut Cursor::new(input.as_bytes()),
            Position::new(1, 1),
            StringTerminator::End,
            false,
        );
        let (cooked, _span) = lexed.unwrap();
        assert_eq!(cooked, expected);
    }
 }
 #[test]
 fn take_string_characters_non_octal_decimal_escape() {
    // Each pair is (escaped source text, expected cooked string in non-strict mode).
    let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")];

    // Lexes `input` as an unterminated-by-quote string with the given strictness.
    let lex = |input: &str, strict_mode: bool| {
        StringLiteral::take_string_characters(
            &mut Cursor::new(input.as_bytes()),
            Position::new(1, 1),
            StringTerminator::End,
            strict_mode,
        )
    };

    // Non-strict mode: `\8` and `\9` cook to the digit itself.
    for &(input, expected) in &test_cases {
        let (cooked, _span) = lex(input, false).unwrap();
        assert_eq!(cooked, expected);
    }

    // Strict mode: both escapes are rejected with a syntax error at the start position.
    for &(input, _) in &test_cases {
        let error = lex(input, true)
            .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected");
        match error {
            Error::Syntax(_, pos) => assert_eq!(pos, Position::new(1, 1)),
            _ => panic!("invalid error type"),
        }
    }
 }
 #[test]
 fn take_string_characters_line_continuation() {
    // A backslash directly followed by a line terminator is a LineContinuation and
    // contributes nothing to the cooked string.
    let source = "hello \\\nworld";
    let lexed = StringLiteral::take_string_characters(
        &mut Cursor::new(source.as_bytes()),
        Position::new(1, 1),
        StringTerminator::End,
        false,
    );

    let (cooked, _span) = lexed.unwrap();
    assert_eq!(cooked, "hello world");
 }
 mod carriage_return {
    use super::*;