From 6f3641d593ac2e5a5f682bd81ce80ad18f8bad0b Mon Sep 17 00:00:00 2001
From: tofpie <75836434+tofpie@users.noreply.github.com>
Date: Mon, 28 Dec 2020 13:02:05 +0100
Subject: [PATCH] Add numeric separator lexing (#995)

* Add numeric separator handling

* Implement suggestions from PR review

Co-authored-by: tofpie <tofpie@users.noreply.github.com>
---
 boa/src/syntax/lexer/mod.rs    |  8 ++-
 boa/src/syntax/lexer/number.rs | 93 ++++++++++++++++++++++++++++------
 boa/src/syntax/lexer/tests.rs  | 41 ++++++++++++++-
 3 files changed, 123 insertions(+), 19 deletions(-)
diff --git a/boa/src/syntax/lexer/mod.rs b/boa/src/syntax/lexer/mod.rs
index a779453f65..8728837675 100644
--- a/boa/src/syntax/lexer/mod.rs
+++ b/boa/src/syntax/lexer/mod.rs
@@ -213,7 +213,13 @@ impl<R> Lexer<R> {
                     Punctuator::Colon.into(),
                     Span::new(start, self.cursor.pos()),
                 )),
-                '.' => SpreadLiteral::new().lex(&mut self.cursor, start),
+                '.' => {
+                    if self.cursor.peek()?.map(|c| c >= b'0' && c <= b'9') == Some(true) {
+                        NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
+                    } else {
+                        SpreadLiteral::new().lex(&mut self.cursor, start)
+                    }
+                }
                 '(' => Ok(Token::new(
                     Punctuator::OpenParen.into(),
                     Span::new(start, self.cursor.pos()),
diff --git a/boa/src/syntax/lexer/number.rs b/boa/src/syntax/lexer/number.rs
index 4e5c97d341..46ba2b65c0 100644
--- a/boa/src/syntax/lexer/number.rs
+++ b/boa/src/syntax/lexer/number.rs
@@ -110,11 +110,52 @@ where
     }
 
     // Consume the decimal digits.
-    cursor.take_while_ascii_pred(buf, &|ch| ch.is_digit(kind.base()))?;
+    take_integer(buf, cursor, kind, true)?;
 
     Ok(())
 }
 
+/// Consumes digits of `kind`'s base into `buf`, skipping single `_` separators when allowed;
+/// a doubled, trailing or disallowed `_` is a syntax error located at that underscore.
+fn take_integer<R>(
+    buf: &mut Vec<u8>,
+    cursor: &mut Cursor<R>,
+    kind: &NumericKind,
+    separator_allowed: bool,
+) -> Result<(), Error>
+where
+    R: Read,
+{
+    let mut prev_is_underscore = false;
+    let mut pos = cursor.pos();
+    while cursor.next_is_ascii_pred(&|c| c.is_digit(kind.base()) || c == '_')? {
+        pos = cursor.pos(); // taken before the byte below is consumed; used in error reports
+        match cursor.next_byte()? {
+            Some(c) if char::from(c).is_digit(kind.base()) => {
+                prev_is_underscore = false;
+                buf.push(c);
+            }
+            Some(b'_') if !separator_allowed => {
+                return Err(Error::syntax("separator is not allowed", pos));
+            }
+            Some(b'_') if prev_is_underscore => {
+                return Err(Error::syntax(
+                    "only one underscore is allowed as numeric separator",
+                    pos,
+                ));
+            }
+            Some(b'_') => prev_is_underscore = true, // separators are never pushed to `buf`
+            _ => (),
+        }
+    }
+    if prev_is_underscore {
+        return Err(Error::syntax(
+            "underscores are not allowed at the end of numeric literals",
+            pos,
+        ));
+    }
+    Ok(())
+}
 /// Utility function for checking the NumericLiteral is not followed by an `IdentifierStart` or `DecimalDigit` character.
 ///
 /// More information:
@@ -149,6 +190,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
         let mut kind = NumericKind::Integer(10);
 
         let c = cursor.peek();
+        let mut legacy_octal = false;
 
         if self.init == b'0' {
             if let Some(ch) = c? {
@@ -180,7 +222,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
                         // Checks if the next char after '0o' is a digit of that base. if not return an error.
                         if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(8))? {
                             return Err(Error::syntax(
-                                "expected hexadecimal digit after number base prefix",
+                                "expected octal digit after number base prefix",
                                 cursor.pos(),
                             ));
                         }
@@ -196,7 +238,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
                         // Checks if the next char after '0b' is a digit of that base. if not return an error.
                         if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(2))? {
                             return Err(Error::syntax(
-                                "expected hexadecimal digit after number base prefix",
+                                "expected binary digit after number base prefix",
                                 cursor.pos(),
                             ));
                         }
@@ -211,6 +253,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
                         ));
                     }
                     byte => {
+                        legacy_octal = true;
                         let ch = char::from(byte);
                         if ch.is_digit(8) {
                             // LegacyOctalIntegerLiteral
@@ -237,8 +280,6 @@ impl<R> Tokenizer<R> for NumberLiteral {
                                     "leading 0's are not allowed in strict mode",
                                     start_pos,
                                 ));
-                            } else {
-                                buf.push(cursor.next_byte()?.expect("Number digit vanished"));
                             }
                         } // Else indicates that the symbol is a non-number.
                     }
@@ -253,34 +294,54 @@ impl<R> Tokenizer<R> for NumberLiteral {
             }
         }
 
-        // Consume digits until a non-digit character is encountered or all the characters are consumed.
-        cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
+        let next = if self.init == b'.' {
+            Some(b'.')
+        } else {
+            // Consume digits and separators until a non-digit non-separator
+            // character is encountered or all the characters are consumed.
+            take_integer(&mut buf, cursor, &kind, !legacy_octal)?;
+            cursor.peek()?
+        };
 
         // The non-digit character could be:
         // 'n' To indicate a BigIntLiteralSuffix.
-        // '.' To indicate a decimal seperator.
+        // '.' To indicate a decimal separator.
         // 'e' | 'E' To indicate an ExponentPart.
-        match cursor.peek()? {
+        match next {
             Some(b'n') => {
                 // DecimalBigIntegerLiteral
                 // Lexing finished.
-
                 // Consume the n
+                if legacy_octal {
+                    return Err(Error::syntax(
+                        "'n' suffix not allowed in octal representation",
+                        cursor.pos(),
+                    ));
+                }
                 cursor.next_byte()?.expect("n character vanished");
 
                 kind = kind.to_bigint();
             }
             Some(b'.') => {
                 if kind.base() == 10 {
-                    // Only base 10 numbers can have a decimal seperator.
+                    // Only base 10 numbers can have a decimal separator.
                     // Number literal lexing finished if a . is found for a number in a different base.
-
-                    cursor.next_byte()?.expect(". token vanished");
-                    buf.push(b'.'); // Consume the .
+                    if self.init != b'.' {
+                        cursor.next_byte()?.expect("'.' token vanished");
+                        buf.push(b'.'); // Consume the .
+                    }
                     kind = NumericKind::Rational;
 
-                    // Consume digits until a non-digit character is encountered or all the characters are consumed.
-                    cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
+                    if cursor.peek()? == Some(b'_') {
+                        return Err(Error::syntax(
+                            "numeric separator not allowed after '.'",
+                            cursor.pos(),
+                        ));
+                    }
+
+                    // Consume digits and separators until a non-digit non-separator
+                    // character is encountered or all the characters are consumed.
+                    take_integer(&mut buf, cursor, &kind, true)?;
 
                     // The non-digit character at this point must be an 'e' or 'E' to indicate an Exponent Part.
                     // Another '.' or 'n' is not allowed.
diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs
index b72147dd23..d8454afa7e 100644
--- a/boa/src/syntax/lexer/tests.rs
+++ b/boa/src/syntax/lexer/tests.rs
@@ -389,6 +389,44 @@ fn numbers() {
     expect_tokens(&mut lexer, &expected);
 }
 
+#[test]
+fn numbers_with_separators() {
+    let mut lexer = Lexer::new(
+        "1_0 2_0 0x3_4 056 7.8_9 4_2. 5_0e2 5_0e+2 5_0e-4 0b1_0 1_0.0_0e2 1.0E-0_1 -3_2".as_bytes(),
+    );
+
+    let expected = [
+        TokenKind::numeric_literal(10), // 1_0
+        TokenKind::numeric_literal(20), // 2_0
+        TokenKind::numeric_literal(52), // 0x3_4 (hexadecimal)
+        TokenKind::numeric_literal(46), // 056 (legacy octal, no separator)
+        TokenKind::numeric_literal(7.89), // 7.8_9
+        TokenKind::numeric_literal(42), // 4_2. (trailing decimal point)
+        TokenKind::numeric_literal(5000), // 5_0e2
+        TokenKind::numeric_literal(5000), // 5_0e+2
+        TokenKind::numeric_literal(0.005), // 5_0e-4
+        TokenKind::numeric_literal(2), // 0b1_0 (binary)
+        TokenKind::numeric_literal(1000), // 1_0.0_0e2
+        TokenKind::numeric_literal(0.1), // 1.0E-0_1
+        TokenKind::Punctuator(Punctuator::Sub), // the `-` of -3_2 is a separate token
+        TokenKind::numeric_literal(32), // 3_2
+    ];
+
+    expect_tokens(&mut lexer, &expected);
+}
+
+#[test]
+fn numbers_with_bad_separators() {
+    let numbers = [
+        "0b_10", "0x_10", "10_", "1._10", "1e+_10", "1E_10", "10__00",
+    ];
+    // `_` misplaced: after a base prefix, `.` or exponent marker/sign, at the end, or doubled.
+    for n in numbers.iter() {
+        let mut lexer = Lexer::new(n.as_bytes());
+        assert!(lexer.next().is_err(), "`{}` should be rejected", n);
+    }
+}
+
 #[test]
 fn big_exp_numbers() {
     let mut lexer = Lexer::new(&b"1.0e25 1.0e36 9.0e50"[..]);
@@ -418,8 +456,7 @@ fn implicit_octal_edge_case() {
 
     let expected = [
         TokenKind::numeric_literal(36),
-        TokenKind::Punctuator(Punctuator::Dot),
-        TokenKind::numeric_literal(5),
+        TokenKind::numeric_literal(0.5),
         TokenKind::numeric_literal(94.5),
     ];