Browse Source

Handle invalid Unicode code point in the string literals (#853)

pull/873/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
de7202dee8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 3
      boa/src/syntax/lexer/cursor.rs
  2. 145
      boa/src/syntax/lexer/string.rs

3
boa/src/syntax/lexer/cursor.rs

@ -228,8 +228,7 @@ where
/// Retrieves the next UTF-8 checked character. /// Retrieves the next UTF-8 checked character.
fn next_char(&mut self) -> io::Result<Option<char>> { fn next_char(&mut self) -> io::Result<Option<char>> {
if let Some(v) = self.peeked_char { if let Some(v) = self.peeked_char.take() {
let _ = self.peeked_char.take();
return Ok(v); return Ok(v);
} }

145
boa/src/syntax/lexer/string.rs

@ -9,8 +9,6 @@ use crate::{
}, },
}; };
use std::{ use std::{
char::{decode_utf16, from_u32},
convert::TryFrom,
io::{self, ErrorKind, Read}, io::{self, ErrorKind, Read},
str, str,
}; };
@ -57,7 +55,7 @@ impl<R> Tokenizer<R> for StringLiteral {
{ {
let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing"); let _timer = BoaProfiler::global().start_event("StringLiteral", "Lexing");
let mut buf = String::new(); let mut buf: Vec<u16> = Vec::new();
loop { loop {
let next_chr_start = cursor.pos(); let next_chr_start = cursor.pos();
let next_chr = cursor.next_char()?.ok_or_else(|| { let next_chr = cursor.next_char()?.ok_or_else(|| {
@ -84,108 +82,81 @@ impl<R> Tokenizer<R> for StringLiteral {
"unterminated escape sequence in string literal", "unterminated escape sequence in string literal",
)) ))
})?; })?;
if escape != '\n' { if escape != '\n' {
let escaped_ch = match escape { match escape {
'n' => '\n', 'n' => buf.push('\n' as u16),
'r' => '\r', 'r' => buf.push('\r' as u16),
't' => '\t', 't' => buf.push('\t' as u16),
'b' => '\x08', 'b' => buf.push('\x08' as u16),
'f' => '\x0c', 'f' => buf.push('\x0c' as u16),
'0' => '\0', '0' => buf.push('\0' as u16),
'x' => { 'x' => {
let mut nums = [0u8; 2]; let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut nums)?; cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let nums = str::from_utf8(&nums).expect("non-UTF-8 bytes found"); let code_point_str = str::from_utf8(&code_point_utf8_bytes)
.expect("malformed Hexadecimal character escape sequence");
let as_num = match u64::from_str_radix(&nums, 16) { let code_point =
Ok(v) => v, u16::from_str_radix(&code_point_str, 16).map_err(|_| {
Err(_) => 0, Error::syntax(
}; "invalid Hexadecimal escape sequence",
match from_u32(as_num as u32) {
Some(v) => v,
None => {
return Err(Error::syntax(
format!(
"{}: {} is not a valid Unicode scalar value",
cursor.pos(),
as_num
),
cursor.pos(), cursor.pos(),
)) )
} })?;
}
buf.push(code_point);
} }
'u' => { 'u' => {
// There are 2 types of codepoints. Surrogate codepoints and
// unicode codepoints. UTF-16 could be surrogate codepoints,
// "\uXXXX\uXXXX" which make up a single unicode codepoint. We will
// need to loop to make sure we catch all UTF-16 codepoints
// Support \u{X..X} (Unicode Codepoint) // Support \u{X..X} (Unicode Codepoint)
if cursor.next_is('{')? { if cursor.next_is('{')? {
cursor.next_char()?.expect("{ character vanished"); // Consume the '{'. cursor.next_char()?.expect("{ character vanished"); // Consume the '{'.
// The biggest code point is 0x10FFFF
// TODO: use bytes for a bit better performance (using stack) // TODO: use bytes for a bit better performance (using stack)
let mut code_point = String::with_capacity(6); let mut code_point_str = String::with_capacity(6);
cursor.take_until('}', &mut code_point)?; cursor.take_until('}', &mut code_point_str)?;
cursor.next_char()?.expect("} character vanished"); // Consume the '}'. cursor.next_char()?.expect("} character vanished"); // Consume the '}'.
// We know this is a single unicode codepoint, convert to u32 // We know this is a single unicode codepoint, convert to u32
let as_num = let code_point = u32::from_str_radix(&code_point_str, 16)
u32::from_str_radix(&code_point, 16).map_err(|_| { .map_err(|_| {
Error::syntax( Error::syntax(
"malformed Unicode character escape sequence", "malformed Unicode character escape sequence",
cursor.pos(), cursor.pos(),
) )
})?; })?;
if as_num > 0x10_FFFF {
// UTF16Encoding of a numeric code point value
if code_point > 0x10_FFFF {
return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos())); return Err(Error::syntax("Unicode codepoint must not be greater than 0x10FFFF in escape sequence", cursor.pos()));
} else if code_point <= 65535 {
buf.push(code_point as u16);
} else {
let cu1 = ((code_point - 65536) / 1024 + 0xD800) as u16;
let cu2 = ((code_point - 65536) % 1024 + 0xDC00) as u16;
buf.push(cu1);
buf.push(cu2);
} }
char::try_from(as_num).map_err(|_| {
Error::syntax(
"invalid Unicode escape sequence",
cursor.pos(),
)
})?
} else { } else {
let mut codepoints: Vec<u16> = vec![]; // Collect each character after \u e.g \uD83D will give "D83D"
loop { let mut code_point_utf8_bytes = [0u8; 4];
// Collect each character after \u e.g \uD83D will give "D83D" cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let mut code_point = [0u8; 4];
cursor.fill_bytes(&mut code_point)?; // Convert to u16
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
// Convert to u16 .expect("malformed Unicode character escape sequence");
let as_num = match u16::from_str_radix( let code_point = u16::from_str_radix(code_point_str, 16)
str::from_utf8(&code_point) .map_err(|_| {
.expect("the cursor returned invalid UTF-8"), Error::syntax(
16, "invalid Unicode escape sequence",
) { cursor.pos(),
Ok(v) => v, )
Err(_) => 0, })?;
};
codepoints.push(as_num);
// Check for another UTF-16 codepoint
if cursor.next_is('\\')? && cursor.next_is('u')? {
continue;
}
break;
}
// codepoints length should either be 1 (unicode codepoint) or buf.push(code_point);
// 2 (surrogate codepoint). Rust's decode_utf16 will deal with
// it regardless
// TODO: do not panic with invalid code points.
decode_utf16(codepoints.iter().copied())
.next()
.expect("Could not get next codepoint")
.expect("Could not get next codepoint")
} }
} }
'\'' | '"' | '\\' => escape, '\'' | '"' | '\\' => buf.push(escape as u16),
ch => { ch => {
let details = format!( let details = format!(
"invalid escape sequence `{}` at line {}, column {}", "invalid escape sequence `{}` at line {}, column {}",
@ -196,15 +167,23 @@ impl<R> Tokenizer<R> for StringLiteral {
return Err(Error::syntax(details, cursor.pos())); return Err(Error::syntax(details, cursor.pos()));
} }
}; };
buf.push(escaped_ch);
} }
} }
next_ch => buf.push(next_ch), next_ch => {
if next_ch.len_utf16() == 1 {
buf.push(next_ch as u16);
} else {
let mut code_point_bytes_buf = [0u16; 2];
let code_point_bytes = next_ch.encode_utf16(&mut code_point_bytes_buf);
buf.extend(code_point_bytes.iter());
}
}
} }
} }
Ok(Token::new( Ok(Token::new(
TokenKind::string_literal(buf), TokenKind::string_literal(String::from_utf16_lossy(buf.as_slice())),
Span::new(start_pos, cursor.pos()), Span::new(start_pos, cursor.pos()),
)) ))
} }

Loading…
Cancel
Save