From 6f3641d593ac2e5a5f682bd81ce80ad18f8bad0b Mon Sep 17 00:00:00 2001 From: tofpie <75836434+tofpie@users.noreply.github.com> Date: Mon, 28 Dec 2020 13:02:05 +0100 Subject: [PATCH] Add numeric separator lexing (#995) * Add numeric separator handling * Implement suggestions from PR review Co-authored-by: tofpie --- boa/src/syntax/lexer/mod.rs | 8 ++- boa/src/syntax/lexer/number.rs | 93 ++++++++++++++++++++++++++++------ boa/src/syntax/lexer/tests.rs | 41 ++++++++++++++- 3 files changed, 123 insertions(+), 19 deletions(-) diff --git a/boa/src/syntax/lexer/mod.rs b/boa/src/syntax/lexer/mod.rs index a779453f65..8728837675 100644 --- a/boa/src/syntax/lexer/mod.rs +++ b/boa/src/syntax/lexer/mod.rs @@ -213,7 +213,13 @@ impl Lexer { Punctuator::Colon.into(), Span::new(start, self.cursor.pos()), )), - '.' => SpreadLiteral::new().lex(&mut self.cursor, start), + '.' => { + if self.cursor.peek()?.map(|c| c >= b'0' && c <= b'9') == Some(true) { + NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) + } else { + SpreadLiteral::new().lex(&mut self.cursor, start) + } + } '(' => Ok(Token::new( Punctuator::OpenParen.into(), Span::new(start, self.cursor.pos()), diff --git a/boa/src/syntax/lexer/number.rs b/boa/src/syntax/lexer/number.rs index 4e5c97d341..46ba2b65c0 100644 --- a/boa/src/syntax/lexer/number.rs +++ b/boa/src/syntax/lexer/number.rs @@ -110,11 +110,52 @@ where } // Consume the decimal digits. - cursor.take_while_ascii_pred(buf, &|ch| ch.is_digit(kind.base()))?; + take_integer(buf, cursor, kind, true)?; Ok(()) } +fn take_integer( + buf: &mut Vec, + cursor: &mut Cursor, + kind: &NumericKind, + separator_allowed: bool, +) -> Result<(), Error> +where + R: Read, +{ + let mut prev_is_underscore = false; + let mut pos = cursor.pos(); + while cursor.next_is_ascii_pred(&|c| c.is_digit(kind.base()) || c == '_')? { + pos = cursor.pos(); + match cursor.next_byte()? { + Some(c) if char::from(c).is_digit(kind.base()) => { + prev_is_underscore = false; + buf.push(c); + } + Some(b'_') if separator_allowed => { + if prev_is_underscore { + return Err(Error::syntax( + "only one underscore is allowed as numeric separator", + cursor.pos(), + )); + } + prev_is_underscore = true; + } + Some(b'_') if !separator_allowed => { + return Err(Error::syntax("separator is not allowed", pos)); + } + _ => (), + } + } + if prev_is_underscore { + return Err(Error::syntax( + "underscores are not allowed at the end of numeric literals", + pos, + )); + } + Ok(()) +} /// Utility function for checking the NumericLiteral is not followed by an `IdentifierStart` or `DecimalDigit` character. /// /// More information: @@ -149,6 +190,7 @@ impl Tokenizer for NumberLiteral { let mut kind = NumericKind::Integer(10); let c = cursor.peek(); + let mut legacy_octal = false; if self.init == b'0' { if let Some(ch) = c? { @@ -180,7 +222,7 @@ impl Tokenizer for NumberLiteral { // Checks if the next char after '0o' is a digit of that base. if not return an error. if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(8))? { return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", + "expected octal digit after number base prefix", cursor.pos(), )); } @@ -196,7 +238,7 @@ impl Tokenizer for NumberLiteral { // Checks if the next char after '0b' is a digit of that base. if not return an error. if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(2))? { return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", + "expected binary digit after number base prefix", cursor.pos(), )); } @@ -211,6 +253,7 @@ impl Tokenizer for NumberLiteral { )); } byte => { + legacy_octal = true; let ch = char::from(byte); if ch.is_digit(8) { // LegacyOctalIntegerLiteral @@ -237,8 +280,6 @@ impl Tokenizer for NumberLiteral { "leading 0's are not allowed in strict mode", start_pos, )); - } else { - buf.push(cursor.next_byte()?.expect("Number digit vanished")); } } // Else indicates that the symbol is a non-number. } @@ -253,34 +294,54 @@ impl Tokenizer for NumberLiteral { } } - // Consume digits until a non-digit character is encountered or all the characters are consumed. - cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; + let next = if self.init == b'.' { + Some(b'.') + } else { + // Consume digits and separators until a non-digit non-separator + // character is encountered or all the characters are consumed. + take_integer(&mut buf, cursor, &kind, !legacy_octal)?; + cursor.peek()? + }; // The non-digit character could be: // 'n' To indicate a BigIntLiteralSuffix. - // '.' To indicate a decimal seperator. + // '.' To indicate a decimal separator. // 'e' | 'E' To indicate an ExponentPart. - match cursor.peek()? { + match next { Some(b'n') => { // DecimalBigIntegerLiteral // Lexing finished. - // Consume the n + if legacy_octal { + return Err(Error::syntax( + "'n' suffix not allowed in octal representation", + cursor.pos(), + )); + } cursor.next_byte()?.expect("n character vanished"); kind = kind.to_bigint(); } Some(b'.') => { if kind.base() == 10 { - // Only base 10 numbers can have a decimal seperator. + // Only base 10 numbers can have a decimal separator. // Number literal lexing finished if a . is found for a number in a different base. - - cursor.next_byte()?.expect(". token vanished"); - buf.push(b'.'); // Consume the . + if self.init != b'.' { + cursor.next_byte()?.expect("'.' token vanished"); + buf.push(b'.'); // Consume the . + } kind = NumericKind::Rational; - // Consume digits until a non-digit character is encountered or all the characters are consumed. - cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; + if cursor.peek()? == Some(b'_') { + return Err(Error::syntax( + "numeric separator not allowed after '.'", + cursor.pos(), + )); + } + + // Consume digits and separators until a non-digit non-separator + // character is encountered or all the characters are consumed. + take_integer(&mut buf, cursor, &kind, true)?; // The non-digit character at this point must be an 'e' or 'E' to indicate an Exponent Part. // Another '.' or 'n' is not allowed. diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index b72147dd23..d8454afa7e 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -389,6 +389,44 @@ fn numbers() { expect_tokens(&mut lexer, &expected); } +#[test] +fn numbers_with_separators() { + let mut lexer = Lexer::new( + "1_0 2_0 0x3_4 056 7.8_9 4_2. 5_0e2 5_0e+2 5_0e-4 0b1_0 1_0.0_0e2 1.0E-0_1 -3_2".as_bytes(), + ); + + let expected = [ + TokenKind::numeric_literal(10), + TokenKind::numeric_literal(20), + TokenKind::numeric_literal(52), + TokenKind::numeric_literal(46), + TokenKind::numeric_literal(7.89), + TokenKind::numeric_literal(42), + TokenKind::numeric_literal(5000), + TokenKind::numeric_literal(5000), + TokenKind::numeric_literal(0.005), + TokenKind::numeric_literal(2), + TokenKind::numeric_literal(1000), + TokenKind::numeric_literal(0.1), + TokenKind::Punctuator(Punctuator::Sub), + TokenKind::numeric_literal(32), + ]; + + expect_tokens(&mut lexer, &expected); +} + +#[test] +fn numbers_with_bad_separators() { + let numbers = [ + "0b_10", "0x_10", "10_", "1._10", "1e+_10", "1E_10", "10__00", + ]; + + for n in numbers.iter() { + let mut lexer = Lexer::new(n.as_bytes()); + assert!(lexer.next().is_err()); + } +} + #[test] fn big_exp_numbers() { let mut lexer = Lexer::new(&b"1.0e25 1.0e36 9.0e50"[..]); @@ -418,8 +456,7 @@ fn implicit_octal_edge_case() { let expected = [ TokenKind::numeric_literal(36), - TokenKind::Punctuator(Punctuator::Dot), - TokenKind::numeric_literal(5), + TokenKind::numeric_literal(0.5), TokenKind::numeric_literal(94.5), ];