Fix lexing escapes in string literal and minor refactor (#1079)

* Refactor StringLiteral
* Fix octal escape in string literal
* Add tests
* Fix zero escape
* Fix zero escape lookahead
* Rename variables
* Rename helper functions
* Refactor match arms
* Fix escape line terminator sequence
* Fix single character escape
* Fix escape followed by unicode char
* Add NonOctalDecimalEscapeSequence
* Fix comment
* Refactor
* Modify error message
* Add tests
* Rename tests
* Add test for error
* Add comments for unsafe bytes to str
* Update boa/src/syntax/lexer/string.rs
  Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
* Minor refactor
* Remove unsafe bytes to str
* Fix panic when reading invalid utf-8 chars
  Co-authored-by: tofpie <75836434+tofpie@users.noreply.github.com>
4 years ago · 00fc5e22bc
3 changed files with 366 additions and 153 deletions
--- a/boa/src/syntax/lexer/string.rs
+++ b/boa/src/syntax/lexer/string.rs
@@ -58,171 +58,273 @@ impl<R> Tokenizer<R> for StringLiteral {
        let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");

        let (lit, span) =
-            unescape_string(cursor, start_pos, self.terminator, cursor.strict_mode())?;
+            Self::take_string_characters(cursor, start_pos, self.terminator, cursor.strict_mode())?;

        Ok(Token::new(TokenKind::string_literal(lit), span))
    }
 }

-pub(super) fn unescape_string<R>(
-    cursor: &mut Cursor<R>,
-    start_pos: Position,
-    terminator: StringTerminator,
-    strict_mode: bool,
-) -> Result<(String, Span), Error>
-where
-    R: Read,
-{
-    let mut buf = Vec::new();
-    loop {
-        let next_chr = cursor.next_char()?.map(char::try_from).transpose().unwrap();
-
-        match next_chr {
-            Some('\'') if terminator == StringTerminator::SingleQuote => {
-                break;
-            }
-            Some('"') if terminator == StringTerminator::DoubleQuote => {
-                break;
-            }
-            Some('\\') => {
-                let _timer =
-                    BoaProfiler::global().start_event("StringLiteral - escape sequence", "Lexing");
+impl StringLiteral {
+    /// Checks if a character is LineTerminator as per ECMAScript standards.
+    ///
+    /// More information:
+    ///  - [ECMAScript reference][spec]
+    ///
+    /// [spec]: https://tc39.es/ecma262/#prod-LineTerminator
+    #[inline]
+    pub(super) fn is_line_terminator(ch: char) -> bool {
+        matches!(
+            ch,
+            '\u{000A}' /* <LF> */ | '\u{000D}' /* <CR> */ | '\u{2028}' /* <LS> */ | '\u{2029}' /* <PS> */
+        )
+    }

-                let escape = cursor.peek()?.ok_or_else(|| {
-                    Error::from(io::Error::new(
-                        ErrorKind::UnexpectedEof,
-                        "unterminated escape sequence in literal",
-                    ))
-                })?;
+    pub(super) fn take_string_characters<R>(
+        cursor: &mut Cursor<R>,
+        start_pos: Position,
+        terminator: StringTerminator,
+        strict_mode: bool,
+    ) -> Result<(String, Span), Error>
+    where
+        R: Read,
+    {
+        let mut buf = Vec::new();
+        loop {
+            let ch_start_pos = cursor.pos();
+            let ch = cursor.next_char()?.map(char::try_from).transpose().unwrap();
+
+            match ch {
+                Some('\'') if terminator == StringTerminator::SingleQuote => {
+                    break;
+                }
+                Some('"') if terminator == StringTerminator::DoubleQuote => {
+                    break;
+                }
+                None if terminator == StringTerminator::End => {
+                    break;
+                }
+                Some('\\') => {
+                    let _timer = BoaProfiler::global()
+                        .start_event("StringLiteral - escape sequence", "Lexing");

-                if escape <= 0x7f {
-                    let _ = cursor.next_byte()?;
-                    match escape {
-                        b'\n' => (),
-                        b'n' => buf.push('\n' as u16),
-                        b'r' => buf.push('\r' as u16),
-                        b't' => buf.push('\t' as u16),
-                        b'b' => buf.push('\x08' as u16),
-                        b'f' => buf.push('\x0c' as u16),
-                        b'0' => buf.push('\0' as u16),
-                        b'x' => {
-                            let mut code_point_utf8_bytes = [0u8; 2];
-                            cursor.fill_bytes(&mut code_point_utf8_bytes)?;
-                            let code_point_str = str::from_utf8(&code_point_utf8_bytes)
-                                .expect("malformed Hexadecimal character escape sequence");
-                            let code_point =
-                                u16::from_str_radix(&code_point_str, 16).map_err(|_| {
-                                    Error::syntax(
-                                        "invalid Hexadecimal escape sequence",
-                                        cursor.pos(),
-                                    )
-                                })?;
-
-                            buf.push(code_point);
+                    let escape_ch = cursor
+                        .next_char()?
+                        .and_then(|byte| char::try_from(byte).ok())
+                        .ok_or_else(|| {
+                            Error::from(io::Error::new(
+                                ErrorKind::UnexpectedEof,
+                                "unterminated escape sequence in literal",
+                            ))
+                        })?;
+
+                    match escape_ch {
+                        'b' => buf.push(0x0008 /* <BS> */),
+                        't' => buf.push(0x0009 /* <HT> */),
+                        'n' => buf.push(0x000A /* <LF> */),
+                        'v' => buf.push(0x000B /* <VT> */),
+                        'f' => buf.push(0x000C /* <FF> */),
+                        'r' => buf.push(0x000D /* <CR> */),
+                        '"' => buf.push(0x0022 /* " */),
+                        '\'' => buf.push(0x0027 /* ' */),
+                        '\\' => buf.push(0x005C /* \ */),
+                        '0' if cursor
+                            .peek()?
+                            .filter(|next_byte| (b'0'..=b'9').contains(next_byte))
+                            .is_none() =>
+                        {
+                            buf.push(0x0000 /* NULL */)
                        }
-                        b'u' => {
-                            // Support \u{X..X} (Unicode Codepoint)
-                            if cursor.next_is(b'{')? {
-                                // TODO: use bytes for a bit better performance (using stack)
-                                let mut code_point_buf = Vec::with_capacity(6);
-                                cursor.take_until(b'}', &mut code_point_buf)?;
-
-                                let code_point_str =
-                                    unsafe { str::from_utf8_unchecked(code_point_buf.as_slice()) };
-                                // We know this is a single unicode codepoint, convert to u32
-                                let code_point =
-                                    u32::from_str_radix(&code_point_str, 16).map_err(|_| {
-                                        Error::syntax(
-                                            "malformed Unicode character escape sequence",
-                                            cursor.pos(),
-                                        )
-                                    })?;
-
-                                // UTF16Encoding of a numeric code point value
-                                if code_point > 0x10_FFFF {
-                                    return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
-                                } else if code_point <= 65535 {
-                                    buf.push(code_point as u16);
-                                } else {
-                                    let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
-                                    let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
-                                    buf.push(cu1);
-                                    buf.push(cu2);
-                                }
-                            } else {
-                                // Collect each character after \u e.g \uD83D will give "D83D"
-                                let mut code_point_utf8_bytes = [0u8; 4];
-                                cursor.fill_bytes(&mut code_point_utf8_bytes)?;
-
-                                // Convert to u16
-                                let code_point_str = str::from_utf8(&code_point_utf8_bytes)
-                                    .expect("malformed Unicode character escape sequence");
-                                let code_point =
-                                    u16::from_str_radix(code_point_str, 16).map_err(|_| {
-                                        Error::syntax(
-                                            "invalid Unicode escape sequence",
-                                            cursor.pos(),
-                                        )
-                                    })?;
-
-                                buf.push(code_point);
-                            }
+                        'x' => {
+                            Self::take_hex_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
+                        }
+                        'u' => {
+                            Self::take_unicode_escape_sequence(cursor, ch_start_pos, Some(&mut buf))?;
                        }
-                        n if char::is_digit(char::from(n), 8) => {
+                        '8' | '9' => {
+                            // Grammar: NonOctalDecimalEscapeSequence
                            if strict_mode {
                                return Err(Error::syntax(
-                                    "octal escape sequences are deprecated",
-                                    cursor.pos(),
+                                    "\\8 and \\9 are not allowed in strict mode",
+                                    ch_start_pos,
                                ));
+                            } else {
+                                buf.push(escape_ch as u16);
                            }
-                            let mut o = char::from(n).to_digit(8).unwrap();
-
-                            match cursor.peek()? {
-                                Some(c) if char::is_digit(char::from(c), 8) => {
-                                    let _ = cursor.next_byte()?;
-                                    o = o * 8 + char::from(n).to_digit(8).unwrap();
-                                    if n <= b'3' {
-                                        match cursor.peek()? {
-                                            Some(c) if char::is_digit(char::from(c), 8) => {
-                                                let _ = cursor.next_byte();
-                                                o = o * 8 + char::from(n).to_digit(8).unwrap();
-                                            }
-                                            _ => (),
-                                        }
-                                    }
-                                }
-                                _ => (),
+                        }
+                        _ if escape_ch.is_digit(8) => {
+                            Self::take_legacy_octal_escape_sequence(
+                                cursor,
+                                ch_start_pos,
+                                Some(&mut buf),
+                                strict_mode,
+                                escape_ch as u8,
+                            )?;
+                        }
+                        _ if Self::is_line_terminator(escape_ch) => {
+                            // Grammar: LineContinuation
+                            // Grammar: \ LineTerminatorSequence
+                            // LineContinuation is the empty String. Do nothing and continue lexing.
+                        }
+                        _ => {
+                            if escape_ch.len_utf16() == 1 {
+                                buf.push(escape_ch as u16);
+                            } else {
+                                buf.extend(escape_ch.encode_utf16(&mut [0u16; 2]).iter());
                            }
-                            buf.push(o as u16);
                        }
-                        _ => buf.push(escape as u16),
                    };
                }
+                Some(ch) => {
+                    if ch.len_utf16() == 1 {
+                        buf.push(ch as u16);
+                    } else {
+                        buf.extend(ch.encode_utf16(&mut [0u16; 2]).iter());
+                    }
+                }
+                None => {
+                    return Err(Error::from(io::Error::new(
+                        ErrorKind::UnexpectedEof,
+                        "unterminated string literal",
+                    )));
+                }
            }
-            Some(next_ch) => {
-                if next_ch.len_utf16() == 1 {
-                    buf.push(next_ch as u16);
-                } else {
-                    let mut code_point_bytes_buf = [0u16; 2];
-                    let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
+        }
+
+        Ok((
+            String::from_utf16_lossy(buf.as_slice()),
+            Span::new(start_pos, cursor.pos()),
+        ))
+    }
+
+    #[inline]
+    pub(super) fn take_unicode_escape_sequence<R>(
+        cursor: &mut Cursor<R>,
+        start_pos: Position,
+        code_units_buf: Option<&mut Vec<u16>>,
+    ) -> Result<u32, Error>
+    where
+        R: Read,
+    {
+        // Support \u{X..X} (Unicode CodePoint)
+        if cursor.next_is(b'{')? {
+            // TODO: use bytes for a bit better performance (using stack)
+            let mut code_point_buf = Vec::with_capacity(6);
+            cursor.take_until(b'}', &mut code_point_buf)?;
+
+            let code_point = str::from_utf8(code_point_buf.as_slice())
+                .ok()
+                .and_then(|code_point_str| {
+                    // The `code_point_str` should represent a single unicode codepoint, convert to u32
+                    u32::from_str_radix(&code_point_str, 16).ok()
+                })
+                .ok_or_else(|| {
+                    Error::syntax("malformed Unicode character escape sequence", start_pos)
+                })?;

-                    buf.extend(code_point_bytes.iter());
+            // UTF16Encoding of a numeric code point value
+            if code_point > 0x10_FFFF {
+                return Err(Error::syntax(
+                    "Unicode codepoint must not be greater than 0x10FFFF in escape sequence",
+                    start_pos,
+                ));
+            } else if let Some(code_units_buf) = code_units_buf {
+                if code_point <= 65535 {
+                    code_units_buf.push(code_point as u16);
+                } else {
+                    let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
+                    let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
+                    code_units_buf.push(cu1);
+                    code_units_buf.push(cu2);
                }
            }
-            None if terminator != StringTerminator::End => {
-                return Err(Error::from(io::Error::new(
-                    ErrorKind::UnexpectedEof,
-                    "unterminated string literal",
-                )));
-            }
-            None => {
-                break;
+
+            Ok(code_point)
+        } else {
+            // Grammar: Hex4Digits
+            // Collect each character after \u e.g \uD83D will give "D83D"
+            let mut code_point_utf8_bytes = [0u8; 4];
+            cursor.fill_bytes(&mut code_point_utf8_bytes)?;
+
+            // Convert to u16
+            let code_point = str::from_utf8(&code_point_utf8_bytes)
+                .ok()
+                .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
+                .ok_or_else(|| Error::syntax("invalid Unicode escape sequence", start_pos))?;
+
+            if let Some(code_units_buf) = code_units_buf {
+                code_units_buf.push(code_point);
            }
+
+            Ok(code_point as u32)
+        }
+    }
+
+    #[inline]
+    fn take_hex_escape_sequence<R>(
+        cursor: &mut Cursor<R>,
+        start_pos: Position,
+        code_units_buf: Option<&mut Vec<u16>>,
+    ) -> Result<u32, Error>
+    where
+        R: Read,
+    {
+        let mut code_point_utf8_bytes = [0u8; 2];
+        cursor.fill_bytes(&mut code_point_utf8_bytes)?;
+        let code_point = str::from_utf8(&code_point_utf8_bytes)
+            .ok()
+            .and_then(|code_point_str| u16::from_str_radix(&code_point_str, 16).ok())
+            .ok_or_else(|| Error::syntax("invalid Hexadecimal escape sequence", start_pos))?;
+
+        if let Some(code_units_buf) = code_units_buf {
+            code_units_buf.push(code_point);
        }
+
+        Ok(code_point as u32)
    }

-    Ok((
-        String::from_utf16_lossy(buf.as_slice()),
-        Span::new(start_pos, cursor.pos()),
-    ))
+    #[inline]
+    fn take_legacy_octal_escape_sequence<R>(
+        cursor: &mut Cursor<R>,
+        start_pos: Position,
+        code_units_buf: Option<&mut Vec<u16>>,
+        strict_mode: bool,
+        init_byte: u8,
+    ) -> Result<u32, Error>
+    where
+        R: Read,
+    {
+        if strict_mode {
+            return Err(Error::syntax(
+                "octal escape sequences are not allowed in strict mode",
+                start_pos,
+            ));
+        }
+        // Grammar: OctalDigit
+        let mut code_point = (init_byte - b'0') as u32;
+
+        // Grammar: ZeroToThree OctalDigit
+        // Grammar: FourToSeven OctalDigit
+        if let Some(byte) = cursor.peek()? {
+            if (b'0'..=b'7').contains(&byte) {
+                let _ = cursor.next_byte()?;
+                code_point = (code_point * 8) + (byte - b'0') as u32;
+
+                if (b'0'..=b'3').contains(&init_byte) {
+                    // Grammar: ZeroToThree OctalDigit OctalDigit
+                    if let Some(byte) = cursor.peek()? {
+                        if (b'0'..=b'7').contains(&byte) {
+                            let _ = cursor.next_byte()?;
+                            code_point = (code_point * 8) + (byte - b'0') as u32;
+                        }
+                    }
+                }
+            }
+        }
+
+        if let Some(code_units_buf) = code_units_buf {
+            code_units_buf.push(code_point as u16);
+        }
+
+        Ok(code_point)
+    }
 }
--- a/boa/src/syntax/lexer/template.rs
+++ b/boa/src/syntax/lexer/template.rs
@@ -3,7 +3,7 @@
 use super::{Cursor, Error, Tokenizer};
 use crate::{
    profiler::BoaProfiler,
-    syntax::lexer::string::{unescape_string, StringTerminator},
+    syntax::lexer::string::{StringLiteral, StringTerminator},
    syntax::{
        ast::{Position, Span},
        lexer::{Token, TokenKind},
@@ -44,7 +44,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
            match next_chr {
                '`' => {
                    let raw = String::from_utf16_lossy(buf.as_slice());
-                    let (cooked, _) = unescape_string(
+                    let (cooked, _) = StringLiteral::take_string_characters(
                        &mut Cursor::with_position(raw.as_bytes(), start_pos),
                        start_pos,
                        StringTerminator::End,
@@ -58,7 +58,7 @@ impl<R> Tokenizer<R> for TemplateLiteral {
                '$' if cursor.peek()? == Some(b'{') => {
                    let _ = cursor.next_byte()?;
                    let raw = String::from_utf16_lossy(buf.as_slice());
-                    let (cooked, _) = unescape_string(
+                    let (cooked, _) = StringLiteral::take_string_characters(
                        &mut Cursor::with_position(raw.as_bytes(), start_pos),
                        start_pos,
                        StringTerminator::End,
--- a/boa/src/syntax/lexer/tests.rs
+++ b/boa/src/syntax/lexer/tests.rs
@@ -6,7 +6,7 @@ use super::token::Numeric;
 use super::*;
 use super::{Error, Position};
 use crate::syntax::ast::Keyword;
-use crate::syntax::lexer::string::{unescape_string, StringTerminator};
+use crate::syntax::lexer::string::{StringLiteral, StringTerminator};
 use std::str;

 fn span(start: (u32, u32), end: (u32, u32)) -> Span {
@@ -795,7 +795,7 @@ fn illegal_following_numeric_literal() {
 }

 #[test]
-fn codepoint_with_no_braces() {
+fn string_codepoint_with_no_braces() {
    let mut lexer = Lexer::new(&br#""test\uD38Dtest""#[..]);
    assert!(lexer.next().is_ok());
 }
@@ -814,7 +814,7 @@ fn illegal_code_point_following_numeric_literal() {
 }

 #[test]
-fn non_english_str() {
+fn string_unicode() {
    let str = r#"'中文';"#;

    let mut lexer = Lexer::new(str.as_bytes());
@@ -828,7 +828,7 @@ fn non_english_str() {
 }

 #[test]
-fn unicode_escape_with_braces() {
+fn string_unicode_escape_with_braces() {
    let mut lexer = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]);

    let expected = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
@@ -859,12 +859,12 @@ fn unicode_escape_with_braces() {
 }

 #[test]
-fn unicode_escape_with_braces_() {
+fn take_string_characters_unicode_escape_with_braces_2() {
    let s = r#"\u{20ac}\u{a0}\u{a0}"#.to_string();

    let mut cursor = Cursor::new(s.as_bytes());

-    if let Ok((s, _)) = unescape_string(
+    if let Ok((s, _)) = StringLiteral::take_string_characters(
        &mut cursor,
        Position::new(1, 1),
        StringTerminator::End,
@@ -877,10 +877,10 @@ fn unicode_escape_with_braces_() {
 }

 #[test]
-fn unescape_string_with_single_escape() {
+fn take_string_characters_with_single_escape() {
    let s = r#"\Б"#.to_string();
    let mut cursor = Cursor::new(s.as_bytes());
-    let (s, _) = unescape_string(
+    let (s, _) = StringLiteral::take_string_characters(
        &mut cursor,
        Position::new(1, 1),
        StringTerminator::End,
@@ -890,6 +890,117 @@ fn unescape_string_with_single_escape() {
    assert_eq!(s, "Б");
 }

+#[test]
+fn take_string_characters_legacy_octal_escape() {
+    let test_cases = [
+        (r#"\3"#, "\u{3}"),
+        (r#"\03"#, "\u{3}"),
+        (r#"\003"#, "\u{3}"),
+        (r#"\0003"#, "\u{0}3"),
+        (r#"\43"#, "#"),
+        (r#"\043"#, "#"),
+        (r#"\101"#, "A"),
+    ];
+
+    for (s, expected) in test_cases.iter() {
+        let mut cursor = Cursor::new(s.as_bytes());
+        let (s, _) = StringLiteral::take_string_characters(
+            &mut cursor,
+            Position::new(1, 1),
+            StringTerminator::End,
+            false,
+        )
+        .unwrap();
+
+        assert_eq!(s, *expected);
+    }
+
+    for (s, _) in test_cases.iter() {
+        let mut cursor = Cursor::new(s.as_bytes());
+
+        if let Error::Syntax(_, pos) = StringLiteral::take_string_characters(
+            &mut cursor,
+            Position::new(1, 1),
+            StringTerminator::End,
+            true,
+        )
+        .expect_err("Octal-escape in strict mode not rejected as expected")
+        {
+            assert_eq!(pos, Position::new(1, 1));
+        } else {
+            panic!("invalid error type");
+        }
+    }
+}
+
+#[test]
+fn take_string_characters_zero_escape() {
+    let test_cases = [(r#"\0"#, "\u{0}"), (r#"\0A"#, "\u{0}A")];
+
+    for (s, expected) in test_cases.iter() {
+        let mut cursor = Cursor::new(s.as_bytes());
+        let (s, _) = StringLiteral::take_string_characters(
+            &mut cursor,
+            Position::new(1, 1),
+            StringTerminator::End,
+            false,
+        )
+        .unwrap();
+
+        assert_eq!(s, *expected);
+    }
+}
+
+#[test]
+fn take_string_characters_non_octal_decimal_escape() {
+    let test_cases = [(r#"\8"#, "8"), (r#"\9"#, "9")];
+
+    for (s, expected) in test_cases.iter() {
+        let mut cursor = Cursor::new(s.as_bytes());
+        let (s, _) = StringLiteral::take_string_characters(
+            &mut cursor,
+            Position::new(1, 1),
+            StringTerminator::End,
+            false,
+        )
+        .unwrap();
+
+        assert_eq!(s, *expected);
+    }
+
+    for (s, _) in test_cases.iter() {
+        let mut cursor = Cursor::new(s.as_bytes());
+
+        if let Error::Syntax(_, pos) = StringLiteral::take_string_characters(
+            &mut cursor,
+            Position::new(1, 1),
+            StringTerminator::End,
+            true,
+        )
+        .expect_err("Non-octal-decimal-escape in strict mode not rejected as expected")
+        {
+            assert_eq!(pos, Position::new(1, 1));
+        } else {
+            panic!("invalid error type");
+        }
+    }
+}
+
+#[test]
+fn take_string_characters_line_continuation() {
+    let s = "hello \\\nworld";
+    let mut cursor = Cursor::new(s.as_bytes());
+    let (s, _) = StringLiteral::take_string_characters(
+        &mut cursor,
+        Position::new(1, 1),
+        StringTerminator::End,
+        false,
+    )
+    .unwrap();
+
+    assert_eq!(s, "hello world");
+}
+
 mod carriage_return {
    use super::*;