boa/boa/src/syntax/lexer/mod.rs

//! A lexical analyzer for JavaScript source code.
//!
//! The Lexer splits its input source code into a sequence of input elements called tokens, represented by the [Token](../ast/token/struct.Token.html) structure.
//! It also removes whitespace and comments and attaches them to the next token.

#[cfg(test)]
mod tests;

use crate::syntax::ast::bigint::BigInt;
use crate::syntax::ast::{
    punc::Punctuator,
    token::{NumericLiteral, Token, TokenKind},
};
use std::{
    char::{decode_utf16, from_u32},
    error, fmt,
    iter::Peekable,
    str::{Chars, FromStr},
};

/// `vop` tests the next token to see if we're on an assign operation of just a plain binary operation.
///
/// If the next value is not an assignment operation it will pattern match  the provided values and return the corresponding token.
macro_rules! vop {
    ($this:ident, $assign_op:expr, $op:expr) => ({
        let preview = $this.preview_next().ok_or_else(|| LexerError::new("Could not preview next value"))?;
        match preview {
            '=' => {
                $this.next();
                $this.column_number += 1;
                $assign_op
            }
            _ => $op,
        }
    });
    ($this:ident, $assign_op:expr, $op:expr, {$($case:pat => $block:expr), +}) => ({
        let preview = $this.preview_next().ok_or_else(|| LexerError::new("Could not preview next value"))?;
        match preview {
            '=' => {
                $this.next();
                $this.column_number += 1;
                $assign_op
            },
            $($case => {
                $this.next();
                $this.column_number += 1;
                $block
            })+,
            _ => $op
        }
    });
    ($this:ident, $op:expr, {$($case:pat => $block:expr),+}) => {
        let preview = $this.preview_next().ok_or_else(|| LexerError::new("Could not preview next value"))?;
        match preview {
            $($case => {
                $this.next()?;
                $this.column_number += 1;
                $block
            })+,
            _ => $op
        }
    }
}

/// The `op` macro handles binary operations or assignment operations and converts them into tokens.
macro_rules! op {
    ($this:ident, $assign_op:expr, $op:expr) => ({
        let punc = vop!($this, $assign_op, $op);
        $this.push_punc(punc);
    });
    ($this:ident, $assign_op:expr, $op:expr, {$($case:pat => $block:expr),+}) => ({
        let punc = vop!($this, $assign_op, $op, {$($case => $block),+});
        $this.push_punc(punc);
    });
    ($this:ident, $op:expr, {$($case:pat => $block:expr),+}) => ({
        let punc = vop!($this, $op, {$($case => $block),+});
        $this.push_punc();
    });
}

/// An error that occurred during lexing or compiling of the source input.
///
/// [LexerError] implements [fmt::Display] so you just display this value as an error
#[derive(Debug, Clone)]
pub struct LexerError {
    /// details will be displayed when a LexerError occurs.
    details: String,
}

impl LexerError {
    /// Create a new LexerError struct
    ///
    /// * `msg` - The message to show when LexerError is displayed
    fn new(msg: &str) -> Self {
        Self {
            details: msg.to_string(),
        }
    }
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.details)
    }
}

impl error::Error for LexerError {
    fn description(&self) -> &str {
        &self.details
    }

    fn cause(&self) -> Option<&dyn error::Error> {
        // Generic error, underlying cause isn't tracked.
        None
    }
}

/// A lexical analyzer for JavaScript source code.
#[derive(Debug)]
pub struct Lexer<'a> {
    /// The list of tokens generated so far.
    ///
    /// This field is public so you can use them once lexing has finished.
    pub tokens: Vec<Token>,
    /// The current line number in the script
    line_number: u64,
    /// the current column number in the script
    column_number: u64,
    /// The full Peekable buffer, an array of [Char]s
    buffer: Peekable<Chars<'a>>,
}

impl<'a> Lexer<'a> {
    /// Returns a Lexer with a buffer inside
    ///
    /// The buffer needs to have a lifetime as long as the Lexer instance itself
    pub fn new(buffer: &'a str) -> Lexer<'a> {
        Lexer {
            tokens: Vec::new(),
            line_number: 1,
            column_number: 0,
            buffer: buffer.chars().peekable(),
        }
    }

    /// Push a token onto the token queue.
    fn push_token(&mut self, tk: TokenKind) {
        self.tokens
            .push(Token::new(tk, self.line_number, self.column_number))
    }

    /// Push a punctuation token
    fn push_punc(&mut self, punc: Punctuator) {
        self.push_token(TokenKind::Punctuator(punc));
    }

    /// next fetches the next token and return it, or a LexerError if there are no more.
    fn next(&mut self) -> char {
        self.buffer.next().expect(
            "No more more characters to consume from input stream, \
             use preview_next() first to check before calling next()",
        )
    }

    /// Preview the next character but don't actually increment
    fn preview_next(&mut self) -> Option<char> {
        self.buffer.peek().copied()
    }

    /// Preview a char x indexes further in buf, without incrementing
    fn preview_multiple_next(&mut self, nb_next: usize) -> Option<char> {
        let mut next_peek = None;

        for (i, x) in self.buffer.clone().enumerate() {
            if i >= nb_next {
                break;
            }

            next_peek = Some(x);
        }

        next_peek
    }

    /// Utility Function, while ``f(char)`` is true, read chars and move curser.
    /// All chars are returned as a string
    fn take_char_while<F>(&mut self, mut f: F) -> Result<String, LexerError>
    where
        F: FnMut(char) -> bool,
    {
        let mut s = String::new();
        while self.buffer.peek().is_some()
            && f(self.preview_next().expect("Could not preview next value"))
        {
            s.push(self.next());
        }

        Ok(s)
    }

    /// Compares the character passed in to the next character, if they match true is returned and the buffer is incremented
    fn next_is(&mut self, peek: char) -> bool {
        let result = self.preview_next() == Some(peek);
        if result {
            self.buffer.next();
        }
        result
    }

    /// Utility function for checkint the NumericLiteral is not followed by an `IdentifierStart` or `DecimalDigit` character.
    ///
    /// More information:
    ///  - [ECMAScript Specification][spec]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-literals-numeric-literals
    fn check_after_numeric_literal(&mut self) -> Result<(), LexerError> {
        match self.preview_next() {
            Some(ch)
                if ch.is_ascii_alphabetic() || ch == '$' || ch == '_' || ch.is_ascii_digit() =>
            {
                Err(LexerError::new("NumericLiteral token must not be followed by IdentifierStart nor DecimalDigit characters"))
            }
            Some(_) => Ok(()),
            None => Ok(())
        }
    }

    /// Lexes a numerical literal.
    ///
    /// More information:
    ///  - [ECMAScript Specification][spec]
    ///
    /// [spec]: https://tc39.es/ecma262/#sec-literals-numeric-literals
    fn reed_numerical_literal(&mut self, ch: char) -> Result<(), LexerError> {
        /// This is a helper structure
        ///
        /// This structure helps with identifying what numerical type it is and what base is it.
        enum NumericKind {
            Rational,
            Integer(u32),
            BigInt(u32),
        }

        impl NumericKind {
            /// Get the base of the number kind.
            fn base(&self) -> u32 {
                match self {
                    Self::Rational => 10,
                    Self::Integer(ref base) => *base,
                    Self::BigInt(ref base) => *base,
                }
            }

            /// Converts `self` to BigInt kind.
            fn convert_to_bigint(&mut self) {
                *self = match *self {
                    Self::Rational => unreachable!("can not convert rational number to BigInt"),
                    Self::Integer(base) => Self::BigInt(base),
                    Self::BigInt(base) => Self::BigInt(base),
                };
            }
        }

        // TODO: Setup strict mode.
        let strict_mode = false;

        let mut buf = ch.to_string();
        let mut position_offset = 0;
        let mut kind = NumericKind::Integer(10);
        if ch == '0' {
            match self.preview_next() {
                None => {
                    self.push_token(TokenKind::NumericLiteral(NumericLiteral::Integer(0)));
                    self.column_number += 1;
                    return Ok(());
                }
                Some('x') | Some('X') => {
                    self.next();
                    position_offset += 1;
                    kind = NumericKind::Integer(16);
                }
                Some('o') | Some('O') => {
                    self.next();
                    position_offset += 1;
                    kind = NumericKind::Integer(8);
                }
                Some('b') | Some('B') => {
                    self.next();
                    position_offset += 1;
                    kind = NumericKind::Integer(2);
                }
                Some(ch) if ch.is_ascii_digit() => {
                    let mut is_implicit_octal = true;
                    while let Some(ch) = self.preview_next() {
                        if !ch.is_ascii_digit() {
                            break;
                        } else if !ch.is_digit(8) {
                            is_implicit_octal = false;
                        }
                        buf.push(self.next());
                    }
                    if !strict_mode {
                        if is_implicit_octal {
                            kind = NumericKind::Integer(8);
                        }
                    } else {
                        return Err(if is_implicit_octal {
                            LexerError::new(
                                "Implicit octal literals are not allowed in strict mode.",
                            )
                        } else {
                            LexerError::new(
                                "Decimals with leading zeros are not allowed in strict mode.",
                            )
                        });
                    }
                }
                Some(_) => {}
            }
        }

        while let Some(ch) = self.preview_next() {
            if !ch.is_digit(kind.base()) {
                break;
            }
            buf.push(self.next());
        }

        if self.next_is('n') {
            kind.convert_to_bigint()
        }

        if let NumericKind::Integer(10) = kind {
            'digitloop: while let Some(ch) = self.preview_next() {
                match ch {
                    '.' => loop {
                        kind = NumericKind::Rational;
                        buf.push(self.next());

                        let c = match self.preview_next() {
                            Some(ch) => ch,
                            None => break,
                        };

                        match c {
                            'e' | 'E' => {
                                match self
                                    .preview_multiple_next(2)
                                    .unwrap_or_default()
                                    .to_digit(10)
                                {
                                    Some(0..=9) | None => {
                                        buf.push(self.next());
                                    }
                                    _ => {
                                        break 'digitloop;
                                    }
                                }
                            }
                            _ => {
                                if !c.is_digit(10) {
                                    break 'digitloop;
                                }
                            }
                        }
                    },
                    'e' | 'E' => {
                        kind = NumericKind::Rational;
                        match self
                            .preview_multiple_next(2)
                            .unwrap_or_default()
                            .to_digit(10)
                        {
                            Some(0..=9) | None => {
                                buf.push(self.next());
                            }
                            _ => {
                                break;
                            }
                        }
                        buf.push(self.next());
                    }
                    '+' | '-' => {
                        break;
                    }
                    _ if ch.is_digit(10) => {
                        buf.push(self.next());
                    }
                    _ => break,
                }
            }
        }

        if let Err(e) = self.check_after_numeric_literal() {
            return Err(e);
        };

        let num = match kind {
                NumericKind::BigInt(base) => {
                    NumericLiteral::BigInt(
                        BigInt::from_str_radix(&buf, base).expect("Could not conver to BigInt")
                        )
                }
                NumericKind::Rational /* base: 10 */ => {
                    NumericLiteral::Rational(
                        f64::from_str(&buf)
                            .map_err(|_| LexerError::new("Could not convert value to f64"))?,
                    )
                }
                NumericKind::Integer(base) => {
                    if let Ok(num) = i32::from_str_radix(&buf, base) {
                        NumericLiteral::Integer(
                            num
                        )
                    } else {
                        let b = f64::from(base);
                        let mut result = 0.0_f64;
                        for c in buf.chars() {
                            let digit = f64::from(c.to_digit(base).unwrap());
                            result = result * b + digit;
                        }

                        NumericLiteral::Rational(result)
                    }

                }
            };

        self.push_token(TokenKind::NumericLiteral(num));
        self.column_number += (buf.len() as u64) + position_offset - 1;

        Ok(())
    }

    /// Runs the lexer until completion, returning a [LexerError] if there's a syntax issue, or an empty unit result
    ///
    /// # Example
    ///
    /// ```
    /// # use boa::syntax::lexer::{LexerError, Lexer};
    /// fn main() -> Result<(), LexerError> {
    ///     let buffer = String::from("Hello World");
    ///     let mut lexer = Lexer::new(&buffer);
    ///     lexer.lex()
    /// }
    /// ```
    pub fn lex(&mut self) -> Result<(), LexerError> {
        loop {
            // Check if we've reached the end
            if self.preview_next().is_none() {
                return Ok(());
            }
            self.column_number += 1;
            let ch = self.next();
            match ch {
                '"' | '\'' => {
                    let mut buf = String::new();
                    loop {
                        if self.preview_next().is_none() {
                            return Err(LexerError::new("Unterminated String"));
                        }
                        match self.next() {
                            '\'' if ch == '\'' => {
                                break;
                            }
                            '"' if ch == '"' => {
                                break;
                            }
                            '\\' => {
                                if self.preview_next().is_none() {
                                    return Err(LexerError::new("Unterminated String"));
                                }
                                let escape = self.next();
                                if escape != '\n' {
                                    let escaped_ch = match escape {
                                        'n' => '\n',
                                        'r' => '\r',
                                        't' => '\t',
                                        'b' => '\x08',
                                        'f' => '\x0c',
                                        '0' => '\0',
                                        'x' => {
                                            let mut nums = String::with_capacity(2);
                                            for _ in 0_u8..2 {
                                                if self.preview_next().is_none() {
                                                    return Err(LexerError::new("Unterminated String"));
                                                }
                                                nums.push(self.next());
                                            }
                                            self.column_number += 2;
                                            let as_num = match u64::from_str_radix(&nums, 16) {
                                                Ok(v) => v,
                                                Err(_) => 0,
                                            };
                                            match from_u32(as_num as u32) {
                                                Some(v) => v,
                                                None => panic!(
                                                    "{}:{}: {} is not a valid unicode scalar value",
                                                    self.line_number, self.column_number, as_num
                                                ),
                                            }
                                        }
                                        'u' => {
                                            // There are 2 types of codepoints. Surragate codepoints and unicode codepoints.
                                            // UTF-16 could be surrogate codepoints, "\uXXXX\uXXXX" which make up a single unicode codepoint.
                                            // We will need to loop to make sure we catch all UTF-16 codepoints
                                            // Example Test: https://github.com/tc39/test262/blob/ee3715ee56744ccc8aeb22a921f442e98090b3c1/implementation-contributed/v8/mjsunit/es6/unicode-escapes.js#L39-L44

                                            // Support \u{X..X} (Unicode Codepoint)
                                            if self.next_is('{') {
                                                let s = self
                                                    .take_char_while(char::is_alphanumeric)
                                                    .expect("Could not read chars");

                                                // We know this is a single unicode codepoint, convert to u32
                                                let as_num = match u32::from_str_radix(&s, 16) {
                                                    Ok(v) => v,
                                                    Err(_) => 0,
                                                };
                                                let c = from_u32(as_num).ok_or_else(|| LexerError::new("Invalid Unicode escape sequence"))?;

                                                if self.preview_next().is_none() {
                                                    return Err(LexerError::new("Unterminated String"));
                                                }
                                                self.next(); // '}'
                                                self.column_number +=
                                                    (s.len() as u64).wrapping_add(3);
                                                c
                                            } else {
                                                let mut codepoints: Vec<u16> = vec![];
                                                loop {
                                                    // Collect each character after \u e.g \uD83D will give "D83D"
                                                    let s = self
                                                        .take_char_while(char::is_alphanumeric)
                                                        .expect("Could not read chars");

                                                    // Convert to u16
                                                    let as_num = match u16::from_str_radix(&s, 16) {
                                                        Ok(v) => v,
                                                        Err(_) => 0,
                                                    };

                                                    codepoints.push(as_num);
                                                    self.column_number +=
                                                        (s.len() as u64).wrapping_add(2);

                                                    // Check for another UTF-16 codepoint
                                                    if self.next_is('\\') && self.next_is('u') {
                                                        continue;
                                                    }
                                                    break;
                                                }

                                                // codepoints length should either be 1 (unicode codepoint) or 2 (surrogate codepoint).
                                                // Rust's decode_utf16 will deal with it regardless
                                                decode_utf16(codepoints.iter().cloned())
                                                    .next()
                                                    .expect("Could not get next codepoint")
                                                    .expect("Could not get next codepoint")
                                            }
                                        }
                                        '\'' | '"' | '\\' => escape,
                                        ch => {
                                            let details = format!("{}:{}: Invalid escape `{}`", self.line_number, self.column_number, ch);
                                            return Err(LexerError { details });
                                        }
                                    };
                                    buf.push(escaped_ch);
                                }
                            }
                            next_ch => buf.push(next_ch),
                        }
                    }
                    let str_length = buf.len() as u64;
                    self.push_token(TokenKind::StringLiteral(buf));
                    // Why +1? Quotation marks are not included,
                    // So technically it would be +2, (for both " ") but we want to be 1 less
                    // to compensate for the incrementing at the top
                    self.column_number += str_length.wrapping_add(1);
                }
                _ if ch.is_digit(10) => self.reed_numerical_literal(ch)?,
                _ if ch.is_alphabetic() || ch == '$' || ch == '_' => {
                    let mut buf = ch.to_string();
                    while let Some(ch) = self.preview_next() {
                        if ch.is_alphabetic() || ch.is_digit(10) || ch == '_' {
                            buf.push(self.next());
                        } else {
                            break;
                        }
                    }

                    self.push_token(match buf.as_str() {
                        "true" => TokenKind::BooleanLiteral(true),
                        "false" => TokenKind::BooleanLiteral(false),
                        "null" => TokenKind::NullLiteral,
                        "NaN" => TokenKind::NumericLiteral(NumericLiteral::Rational(f64::NAN)),
                        slice => {
                            if let Ok(keyword) = FromStr::from_str(slice) {
                                TokenKind::Keyword(keyword)
                            } else {
                                TokenKind::identifier(slice)
                            }
                        }
                    });
                    // Move position forward the length of keyword
                    self.column_number += (buf.len().wrapping_sub(1)) as u64;
                }
                ';' => self.push_punc(Punctuator::Semicolon),
                ':' => self.push_punc(Punctuator::Colon),
                '.' => {
                    // . or ...
                    if self.next_is('.') {
                        if self.next_is('.') {
                            self.push_punc(Punctuator::Spread);
                            self.column_number += 2;
                        } else {
                            return Err(LexerError::new("Expecting Token ."));
                        }
                    } else {
                        self.push_punc(Punctuator::Dot);
                    };
                }
                '(' => self.push_punc(Punctuator::OpenParen),
                ')' => self.push_punc(Punctuator::CloseParen),
                ',' => self.push_punc(Punctuator::Comma),
                '{' => self.push_punc(Punctuator::OpenBlock),
                '}' => self.push_punc(Punctuator::CloseBlock),
                '[' => self.push_punc(Punctuator::OpenBracket),
                ']' => self.push_punc(Punctuator::CloseBracket),
                '?' => self.push_punc(Punctuator::Question),
                // Comments
                '/' => {
                    if let Some(ch) = self.preview_next() {
                        match ch {
                            // line comment
                            '/' => {
                                while self.preview_next().is_some() {
                                    if self.next() == '\n' {
                                        break;
                                    }
                                }
                                self.line_number += 1;
                                self.column_number = 0;
                            }
                            // block comment
                            '*' => {
                                let mut lines = 0;
                                loop {
                                    if self.preview_next().is_none() {
                                        return Err(LexerError::new("Unterminated Multiline Comment"));
                                    }
                                    match self.next() {
                                        '*' => {
                                            if self.next_is('/') {
                                                break;
                                            }
                                        }
                                        next_ch => {
                                            if next_ch == '\n' {
                                                lines += 1;
                                            }
                                        },
                                    }
                                }
                                self.line_number += lines;
                                self.column_number = 0;
                            }
                            // division, assigndiv or regex literal
                            _ => {
                                // if we fail to parse a regex literal, store a copy of the current
                                // buffer to restore later on
                                let original_buffer = self.buffer.clone();
                                // first, try to parse a regex literal
                                let mut body = String::new();
                                let mut regex = false;
                                loop {
                                    self.column_number +=1;
                                    match self.buffer.next() {
                                        // end of body
                                        Some('/') => {
                                            regex = true;
                                            break;
                                        }
                                        // newline/eof not allowed in regex literal
                                        n @ Some('\n') | n @ Some('\r') | n @ Some('\u{2028}')
                                        | n @ Some('\u{2029}') => {
                                            self.column_number = 0;
                                            if n != Some('\r') {
                                                self.line_number += 1;
                                            }
                                            break
                                        },
                                        None => {
                                            self.column_number -= 1;
                                            break
                                        }
                                        // escape sequence
                                        Some('\\') => {
                                            body.push('\\');
                                            if self.preview_next().is_none() {
                                                break;
                                            }
                                            match self.next() {
                                                // newline not allowed in regex literal
                                                '\n' | '\r' | '\u{2028}' | '\u{2029}' => break,
                                                ch => body.push(ch),
                                            }
                                        }
                                        Some(ch) => body.push(ch),
                                    }
                                }
                                if regex {
                                    // body was parsed, now look for flags
                                    let flags = self.take_char_while(char::is_alphabetic)?;
                                    self.push_token(TokenKind::RegularExpressionLiteral(
                                        body, flags,
                                    ));
                                } else {
                                    // failed to parse regex, restore original buffer position and
                                    // parse either div or assigndiv
                                    self.buffer = original_buffer;
                                    if self.next_is('=') {
                                        self.push_token(TokenKind::Punctuator(
                                            Punctuator::AssignDiv,
                                        ));
                                    } else {
                                        self.push_token(TokenKind::Punctuator(Punctuator::Div));
                                    }
                                }
                            }
                        }
                    } else {
                        return Err(LexerError::new("Expecting Token /,*,= or regex"));
                    }
                }
                '*' => op!(self, Punctuator::AssignMul, Punctuator::Mul, {
                    '*' => vop!(self, Punctuator::AssignPow, Punctuator::Exp)
                }),
                '+' => op!(self, Punctuator::AssignAdd, Punctuator::Add, {
                    '+' => Punctuator::Inc
                }),
                '-' => op!(self, Punctuator::AssignSub, Punctuator::Sub, {
                    '-' => {
                        Punctuator::Dec
                    }
                }),
                '%' => op!(self, Punctuator::AssignMod, Punctuator::Mod),
                '|' => op!(self, Punctuator::AssignOr, Punctuator::Or, {
                    '|' => Punctuator::BoolOr
                }),
                '&' => op!(self, Punctuator::AssignAnd, Punctuator::And, {
                    '&' => Punctuator::BoolAnd
                }),
                '^' => op!(self, Punctuator::AssignXor, Punctuator::Xor),
                '=' => op!(self, if self.next_is('=') {
                    Punctuator::StrictEq
                } else {
                    Punctuator::Eq
                }, Punctuator::Assign, {
                    '>' => {
                        Punctuator::Arrow
                    }
                }),
                '<' => op!(self, Punctuator::LessThanOrEq, Punctuator::LessThan, {
                    '<' => vop!(self, Punctuator::AssignLeftSh, Punctuator::LeftSh)
                }),
                '>' => op!(self, Punctuator::GreaterThanOrEq, Punctuator::GreaterThan, {
                    '>' => vop!(self, Punctuator::AssignRightSh, Punctuator::RightSh, {
                        '>' => vop!(self, Punctuator::AssignURightSh, Punctuator::URightSh)
                    })
                }),
                '!' => op!(
                    self,
                    vop!(self, Punctuator::StrictNotEq, Punctuator::NotEq),
                    Punctuator::Not
                ),
                '~' => self.push_punc(Punctuator::Neg),
                '\n' | '\u{2028}' | '\u{2029}' => {
                    self.push_token(TokenKind::LineTerminator);
                    self.line_number += 1;
                    self.column_number = 0;
                }
                '\r' => {
                    self.column_number = 0;
                }
                // The rust char::is_whitespace function and the ecma standard use different sets
                // of characters as whitespaces:
                //  * Rust uses \p{White_Space},
                //  * ecma standard uses \{Space_Separator} + \u{0009}, \u{000B}, \u{000C}, \u{FEFF}
                //
                // Explicit whitespace: see https://tc39.es/ecma262/#table-32
                '\u{0020}' | '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{00A0}' | '\u{FEFF}' |
                // Unicode Space_Seperator category (minus \u{0020} and \u{00A0} which are allready stated above)
                '\u{1680}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => (),
                _ => {
                    let details = format!("{}:{}: Unexpected '{}'", self.line_number, self.column_number, ch);
                    return Err(LexerError { details });
                },
            }
        }
    }
}