From de7202dee83eaf7eb32b0da6781a9cf00f76be6f Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Wed, 14 Oct 2020 09:23:02 -0700 Subject: [PATCH] Handle invalid Unicode code point in the string literals (#853) --- boa/src/syntax/lexer/cursor.rs | 3 +- boa/src/syntax/lexer/string.rs | 145 ++++++++++++++------------------- 2 files changed, 63 insertions(+), 85 deletions(-) diff --git a/boa/src/syntax/lexer/cursor.rs b/boa/src/syntax/lexer/cursor.rs index ed162d3bd2..539d7330ca 100644 --- a/boa/src/syntax/lexer/cursor.rs +++ b/boa/src/syntax/lexer/cursor.rs @@ -228,8 +228,7 @@ where /// Retrieves the next UTF-8 checked character. fn next_char(&mut self) -> io::Result> { - if let Some(v) = self.peeked_char { - let _ = self.peeked_char.take(); + if let Some(v) = self.peeked_char.take() { return Ok(v); } diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 87b999664c..4cbcbbae6e 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -9,8 +9,6 @@ use crate::{ }, }; use std::{ - char::{decode_utf16, from_u32}, - convert::TryFrom, io::{self, ErrorKind, Read}, str, }; @@ -57,7 +55,7 @@ impl Tokenizer for StringLiteral { { let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); - let mut buf = String::new(); + let mut buf: Vec = Vec::new(); loop { let next_chr_start = cursor.pos(); let next_chr = cursor.next_char()?.ok_or_else(|| { @@ -84,108 +82,81 @@ impl Tokenizer for StringLiteral { "unterminated escape sequence in string literal", )) })?; + if escape != '\n' { - let escaped_ch = match escape { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'b' => '\x08', - 'f' => '\x0c', - '0' => '\0', + match escape { + 'n' => buf.push('\n' as u16), + 'r' => buf.push('\r' as u16), + 't' => buf.push('\t' as u16), + 'b' => buf.push('\x08' as u16), + 'f' => buf.push('\x0c' as u16), + '0' => buf.push('\0' as u16), 'x' => { - let mut nums = [0u8; 2]; - cursor.fill_bytes(&mut nums)?; - let nums = str::from_utf8(&nums).expect("non-UTF-8 bytes found"); - - let as_num = match u64::from_str_radix(&nums, 16) { - Ok(v) => v, - Err(_) => 0, - }; - match from_u32(as_num as u32) { - Some(v) => v, - None => { - return Err(Error::syntax( - format!( - "{}: {} is not a valid Unicode scalar value", - cursor.pos(), - as_num - ), + let mut code_point_utf8_bytes = [0u8; 2]; + cursor.fill_bytes(&mut code_point_utf8_bytes)?; + let code_point_str = str::from_utf8(&code_point_utf8_bytes) + .expect("malformed Hexadecimal character escape sequence"); + let code_point = + u16::from_str_radix(&code_point_str, 16).map_err(|_| { + Error::syntax( + "invalid Hexadecimal escape sequence", cursor.pos(), - )) - } - } + ) + })?; + + buf.push(code_point); } 'u' => { - // There are 2 types of codepoints. Surragate codepoints and - // unicode codepoints. UTF-16 could be surrogate codepoints, - // "\uXXXX\uXXXX" which make up a single unicode codepoint. We will - // need to loop to make sure we catch all UTF-16 codepoints - // Support \u{X..X} (Unicode Codepoint) if cursor.next_is('{')? { cursor.next_char()?.expect("{ character vanished"); // Consume the '{'. - // The biggest code point is 0x10FFFF // TODO: use bytes for a bit better performance (using stack) - let mut code_point = String::with_capacity(6); - cursor.take_until('}', &mut code_point)?; + let mut code_point_str = String::with_capacity(6); + cursor.take_until('}', &mut code_point_str)?; cursor.next_char()?.expect("} character vanished"); // Consume the '}'. // We know this is a single unicode codepoint, convert to u32 - let as_num = - u32::from_str_radix(&code_point, 16).map_err(|_| { + let code_point = u32::from_str_radix(&code_point_str, 16) + .map_err(|_| { Error::syntax( "malformed Unicode character escape sequence", cursor.pos(), ) })?; - if as_num > 0x10_FFFF { + + // UTF16Encoding of a numeric code point value + if code_point > 0x10_FFFF { return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos())); + } else if code_point <= 65535 { + buf.push(code_point as u16); + } else { + let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16; + let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16; + buf.push(cu1); + buf.push(cu2); } - char::try_from(as_num).map_err(|_| { - Error::syntax( - "invalid Unicode escape sequence", - cursor.pos(), - ) - })? } else { - let mut codepoints: Vec = vec![]; - loop { - // Collect each character after \u e.g \uD83D will give "D83D" - let mut code_point = [0u8; 4]; - cursor.fill_bytes(&mut code_point)?; - - // Convert to u16 - let as_num = match u16::from_str_radix( - str::from_utf8(&code_point) - .expect("the cursor returned invalid UTF-8"), - 16, - ) { - Ok(v) => v, - Err(_) => 0, - }; - - codepoints.push(as_num); - - // Check for another UTF-16 codepoint - if cursor.next_is('\\')? && cursor.next_is('u')? { - continue; - } - break; - } + // Collect each character after \u e.g \uD83D will give "D83D" + let mut code_point_utf8_bytes = [0u8; 4]; + cursor.fill_bytes(&mut code_point_utf8_bytes)?; + + // Convert to u16 + let code_point_str = str::from_utf8(&code_point_utf8_bytes) + .expect("malformed Unicode character escape sequence"); + let code_point = u16::from_str_radix(code_point_str, 16) + .map_err(|_| { + Error::syntax( + "invalid Unicode escape sequence", + cursor.pos(), + ) + })?; - // codepoints length should either be 1 (unicode codepoint) or - // 2 (surrogate codepoint). Rust's decode_utf16 will deal with - // it regardless - // TODO: do not panic with invalid code points. - decode_utf16(codepoints.iter().copied()) - .next() - .expect("Could not get next codepoint") - .expect("Could not get next codepoint") + buf.push(code_point); } } - '\'' | '"' | '\\' => escape, + '\'' | '"' | '\\' => buf.push(escape as u16), ch => { let details = format!( "invalid escape sequence `{}` at line {}, column {}", @@ -196,15 +167,23 @@ impl Tokenizer for StringLiteral { return Err(Error::syntax(details, cursor.pos())); } }; - buf.push(escaped_ch); } } - next_ch => buf.push(next_ch), + next_ch => { + if next_ch.len_utf16() == 1 { + buf.push(next_ch as u16); + } else { + let mut code_point_bytes_buf = [0u16; 2]; + let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf); + + buf.extend(code_point_bytes.iter()); + } + } } } Ok(Token::new( - TokenKind::string_literal(buf), + TokenKind::string_literal(String::from_utf16_lossy(buf.as_slice())), Span::new(start_pos, cursor.pos()), )) }