Browse Source

Improve lexer by making the cursor iterate over bytes (#915)

pull/954/head
Jevan Chan 4 years ago committed by GitHub
parent
commit
cc473855f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 10
      boa/src/syntax/lexer/comment.rs
  2. 416
      boa/src/syntax/lexer/cursor.rs
  3. 18
      boa/src/syntax/lexer/identifier.rs
  4. 179
      boa/src/syntax/lexer/mod.rs
  5. 137
      boa/src/syntax/lexer/number.rs
  6. 62
      boa/src/syntax/lexer/operator.rs
  7. 67
      boa/src/syntax/lexer/regex.rs
  8. 4
      boa/src/syntax/lexer/spread.rs
  9. 46
      boa/src/syntax/lexer/string.rs
  10. 24
      boa/src/syntax/lexer/template.rs
  11. 101
      boa/src/syntax/lexer/tests.rs

10
boa/src/syntax/lexer/comment.rs

@ -31,11 +31,11 @@ impl<R> Tokenizer<R> for SingleLineComment {
// Skip either to the end of the line or to the end of the input
while let Some(ch) = cursor.peek()? {
if ch == '\n' {
if ch == b'\n' {
break;
} else {
// Consume char.
cursor.next_char()?.expect("Comment character vansihed");
cursor.next_byte()?.expect("Comment character vansihed");
}
}
Ok(Token::new(
@ -66,10 +66,10 @@ impl<R> Tokenizer<R> for MultiLineComment {
let mut new_line = false;
loop {
if let Some(ch) = cursor.next_char()? {
if ch == '*' && cursor.next_is('/')? {
if let Some(ch) = cursor.next_byte()? {
if ch == b'*' && cursor.next_is(b'/')? {
break;
} else if ch == '\n' {
} else if ch == b'\n' {
new_line = true;
}
} else {

416
boa/src/syntax/lexer/cursor.rs

@ -1,5 +1,4 @@
//! Module implementing the lexer cursor. This is used for managing the input byte stream.
use crate::{profiler::BoaProfiler, syntax::ast::Position};
use std::io::{self, Bytes, Error, ErrorKind, Read};
@ -57,22 +56,38 @@ where
}
}
/// Peeks the next character.
/// Peeks the next byte.
#[inline]
pub(super) fn peek(&mut self) -> Result<Option<char>, Error> {
pub(super) fn peek(&mut self) -> Result<Option<u8>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::peek()", "Lexing");
self.iter.peek_byte()
}
/// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4).
#[inline]
pub(super) fn peek_n(&mut self, n: u8) -> Result<u32, Error> {
let _timer = BoaProfiler::global().start_event("cursor::peek_n()", "Lexing");
self.iter.peek_n_bytes(n)
}
/// Peeks the next UTF-8 character in u32 code point.
#[inline]
pub(super) fn peek_char(&mut self) -> Result<Option<u32>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::peek_char()", "Lexing");
self.iter.peek_char()
}
/// Compares the character passed in to the next character, if they match true is returned and the buffer is incremented
/// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented
#[inline]
pub(super) fn next_is(&mut self, peek: char) -> io::Result<bool> {
pub(super) fn next_is(&mut self, byte: u8) -> io::Result<bool> {
let _timer = BoaProfiler::global().start_event("cursor::next_is()", "Lexing");
Ok(match self.peek()? {
Some(next) if next == peek => {
let _ = self.iter.next_char();
Some(next) if next == byte => {
let _ = self.next_byte()?;
true
}
_ => false,
@ -80,34 +95,57 @@ where
}
/// Applies the predicate to the next character and returns the result.
/// Returns false if there is no next character.
/// Returns false if the next character is not a valid ascii or there is no next character.
/// Otherwise returns the result from the predicate on the ascii in char
///
/// The buffer is not incremented.
#[inline]
pub(super) fn next_is_pred<F>(&mut self, pred: &F) -> io::Result<bool>
pub(super) fn next_is_ascii_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
F: Fn(char) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::next_is_pred()", "Lexing");
let _timer = BoaProfiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing");
Ok(match self.peek()? {
Some(byte) => match byte {
0..=0x7F => pred(char::from(byte)),
_ => false,
},
None => false,
})
}
/// Applies the predicate to the next UTF-8 character and returns the result.
/// Returns false if there is no next character, otherwise returns the result from the
/// predicate on the ascii char
///
/// The buffer is not incremented.
#[inline]
pub(super) fn next_is_char_pred<F>(&mut self, pred: &F) -> io::Result<bool>
where
F: Fn(u32) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::next_is_char_pred()", "Lexing");
Ok(if let Some(peek) = self.peek()? {
Ok(if let Some(peek) = self.peek_char()? {
pred(peek)
} else {
false
})
}
/// Fills the buffer with all characters until the stop character is found.
/// Fills the buffer with all bytes until the stop byte is found.
/// Returns error when reaching the end of the buffer.
///
/// Note: It will not add the stop character to the buffer.
pub(super) fn take_until(&mut self, stop: char, buf: &mut String) -> io::Result<()> {
/// Note that all bytes up until the stop byte are added to the buffer, including the byte right before.
pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec<u8>) -> io::Result<()> {
let _timer = BoaProfiler::global().start_event("cursor::take_until()", "Lexing");
loop {
if self.next_is(stop)? {
return Ok(());
} else if let Some(ch) = self.next_char()? {
buf.push(ch);
} else if let Some(byte) = self.next_byte()? {
buf.push(byte);
} else {
return Err(io::Error::new(
ErrorKind::UnexpectedEof,
@ -117,21 +155,45 @@ where
}
}
/// Fills the buffer with characters until the first character (x) for which the predicate (pred) is false
/// (or the next character is none).
/// Fills the buffer with characters until the first ascii character for which the predicate (pred) is false.
/// It also stops when the next character is not an ascii or there is no next character.
///
/// Note that all characters up until x are added to the buffer including the character right before.
pub(super) fn take_while_pred<F>(&mut self, buf: &mut String, pred: &F) -> io::Result<()>
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
pub(super) fn take_while_ascii_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(char) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::take_while_pred()", "Lexing");
let _timer = BoaProfiler::global().start_event("cursor::take_while_ascii_pred()", "Lexing");
loop {
if !self.next_is_ascii_pred(pred)? {
return Ok(());
} else if let Some(byte) = self.next_byte()? {
buf.push(byte);
} else {
// next_is_ascii_pred will return false if the next value is None so the None case should already be handled.
unreachable!();
}
}
}
/// Fills the buffer with characters until the first character for which the predicate (pred) is false.
/// It also stops when there is no next character.
///
/// Note that all characters up until the stop character are added to the buffer, including the character right before.
pub(super) fn take_while_char_pred<F>(&mut self, buf: &mut Vec<u8>, pred: &F) -> io::Result<()>
where
F: Fn(u32) -> bool,
{
let _timer = BoaProfiler::global().start_event("cursor::take_while_char_pred()", "Lexing");
loop {
if !self.next_is_pred(pred)? {
if !self.next_is_char_pred(pred)? {
return Ok(());
} else if let Some(ch) = self.next_char()? {
buf.push(ch);
} else if let Some(ch) = self.peek_char()? {
for _ in 0..utf8_len(ch) {
buf.push(self.next_byte()?.unwrap());
}
} else {
// next_is_char_pred will return false if the next value is None so the None case should already be handled.
unreachable!();
@ -139,7 +201,7 @@ where
}
}
/// It will fill the buffer with checked ASCII bytes.
/// It will fill the buffer with bytes.
///
/// This expects for the buffer to be fully filled. If it's not, it will fail with an
/// `UnexpectedEof` I/O error.
@ -150,28 +212,63 @@ where
self.iter.fill_bytes(buf)
}
/// Retrieves the next byte.
#[inline]
pub(crate) fn next_byte(&mut self) -> Result<Option<u8>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::next_byte()", "Lexing");
let byte = self.iter.next_byte()?;
match byte {
Some(b'\r') => {
// Try to take a newline if it's next, for windows "\r\n" newlines
// Otherwise, treat as a Mac OS9 bare '\r' newline
if self.peek()? == Some(b'\n') {
let _ = self.iter.next_byte();
}
self.next_line();
}
Some(b'\n') => self.next_line(),
Some(0xE2) => {
// Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9)
let next_bytes = self.peek_n(2)?;
if next_bytes == 0xA8_80 || next_bytes == 0xA9_80 {
self.next_line();
} else {
// 0xE2 is a utf8 first byte
self.next_column();
}
}
Some(b) if utf8_is_first_byte(b) => self.next_column(),
_ => {}
}
Ok(byte)
}
/// Retrieves the next UTF-8 character.
#[inline]
pub(crate) fn next_char(&mut self) -> Result<Option<char>, Error> {
pub(crate) fn next_char(&mut self) -> Result<Option<u32>, Error> {
let _timer = BoaProfiler::global().start_event("cursor::next_char()", "Lexing");
let chr = self.iter.next_char()?;
let ch = self.iter.next_char()?;
match chr {
Some('\r') => {
match ch {
Some(0xD) => {
// Try to take a newline if it's next, for windows "\r\n" newlines
// Otherwise, treat as a Mac OS9 bare '\r' newline
if self.peek()? == Some('\n') {
let _ = self.iter.next_char();
if self.peek()? == Some(0xA) {
let _ = self.iter.next_byte();
}
self.next_line();
}
Some('\n') | Some('\u{2028}') | Some('\u{2029}') => self.next_line(),
// '\n' | '\u{2028}' | '\u{2029}'
Some(0xA) | Some(0x2028) | Some(0x2029) => self.next_line(),
Some(_) => self.next_column(),
None => {}
_ => {}
}
Ok(chr)
Ok(ch)
}
}
@ -179,7 +276,9 @@ where
#[derive(Debug)]
struct InnerIter<R> {
iter: Bytes<R>,
peeked_char: Option<Option<char>>,
num_peeked_bytes: u8,
peeked_bytes: u32,
peeked_char: Option<Option<u32>>,
}
impl<R> InnerIter<R> {
@ -188,6 +287,8 @@ impl<R> InnerIter<R> {
fn new(iter: Bytes<R>) -> Self {
Self {
iter,
num_peeked_bytes: 0,
peeked_bytes: 0,
peeked_char: None,
}
}
@ -197,14 +298,14 @@ impl<R> InnerIter<R>
where
R: Read,
{
/// It will fill the buffer with checked ASCII bytes.
/// It will fill the buffer with checked ascii bytes.
///
/// This expects for the buffer to be fully filled. If it's not, it will fail with an
/// `UnexpectedEof` I/O error.
#[inline]
fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> {
for byte in buf.iter_mut() {
*byte = self.next_ascii()?.ok_or_else(|| {
*byte = self.next_byte()?.ok_or_else(|| {
io::Error::new(
io::ErrorKind::UnexpectedEof,
"unexpected EOF when filling buffer",
@ -214,90 +315,197 @@ where
Ok(())
}
/// Peeks the next UTF-8 checked character.
/// Increments the iter by n bytes.
#[inline]
pub(super) fn peek_char(&mut self) -> Result<Option<char>, Error> {
if let Some(v) = self.peeked_char {
Ok(v)
fn increment(&mut self, n: u32) -> Result<(), Error> {
for _ in 0..n {
if None == self.next_byte()? {
break;
}
}
Ok(())
}
/// Peeks the next byte.
#[inline]
pub(super) fn peek_byte(&mut self) -> Result<Option<u8>, Error> {
if self.num_peeked_bytes > 0 {
let byte = self.peeked_bytes as u8;
Ok(Some(byte))
} else {
let chr = self.next_char()?;
self.peeked_char = Some(chr);
Ok(chr)
match self.iter.next().transpose()? {
Some(byte) => {
self.num_peeked_bytes = 1;
self.peeked_bytes = byte as u32;
Ok(Some(byte))
}
None => Ok(None),
}
}
}
/// Retrieves the next UTF-8 checked character.
fn next_char(&mut self) -> io::Result<Option<char>> {
if let Some(v) = self.peeked_char.take() {
return Ok(v);
/// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4).
#[inline]
pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result<u32, Error> {
while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 {
match self.iter.next().transpose()? {
Some(byte) => {
self.peeked_bytes |= (byte as u32) << (self.num_peeked_bytes * 8);
self.num_peeked_bytes += 1;
}
None => break,
};
}
let first_byte = match self.iter.next().transpose()? {
Some(b) => b,
None => return Ok(None),
};
match n {
0 => Ok(0),
1 => Ok(self.peeked_bytes & 0xFF),
2 => Ok(self.peeked_bytes & 0xFFFF),
3 => Ok(self.peeked_bytes & 0xFFFFFF),
_ => Ok(self.peeked_bytes),
}
}
let chr: char = if first_byte < 0x80 {
// 0b0xxx_xxxx
first_byte.into()
/// Peeks the next unchecked character in u32 code point.
#[inline]
pub(super) fn peek_char(&mut self) -> Result<Option<u32>, Error> {
if let Some(ch) = self.peeked_char {
Ok(ch)
} else {
let mut buf = [first_byte, 0u8, 0u8, 0u8];
let num_bytes = if first_byte < 0xE0 {
// 0b110x_xxxx
2
} else if first_byte < 0xF0 {
// 0b1110_xxxx
3
} else {
// 0b1111_0xxx
4
// Decode UTF-8
let x = match self.peek_byte()? {
Some(b) if b < 128 => {
self.peeked_char = Some(Some(b as u32));
return Ok(Some(b as u32));
}
Some(b) => b,
None => {
self.peeked_char = None;
return Ok(None);
}
};
for b in buf.iter_mut().take(num_bytes).skip(1) {
let next = match self.iter.next() {
Some(Ok(b)) => b,
Some(Err(e)) => return Err(e),
None => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8",
))
}
};
*b = next;
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = (self.peek_n_bytes(2)? >> 8) as u8;
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = (self.peek_n_bytes(3)? >> 16) as u8;
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = (self.peek_n_bytes(4)? >> 24) as u8;
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
};
self.peeked_char = Some(Some(ch));
Ok(Some(ch))
}
}
/// Retrieves the next byte
#[inline]
fn next_byte(&mut self) -> io::Result<Option<u8>> {
self.peeked_char = None;
if self.num_peeked_bytes > 0 {
let byte = (self.peeked_bytes & 0xFF) as u8;
self.num_peeked_bytes -= 1;
self.peeked_bytes >>= 8;
Ok(Some(byte))
} else {
self.iter.next().transpose()
}
}
/// Retrieves the next unchecked char in u32 code point.
#[inline]
fn next_char(&mut self) -> io::Result<Option<u32>> {
if let Some(ch) = self.peeked_char.take() {
if let Some(c) = ch {
self.increment(utf8_len(c))?;
}
return Ok(ch);
}
if let Ok(s) = std::str::from_utf8(&buf) {
if let Some(chr) = s.chars().next() {
chr
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8",
));
}
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8",
));
// Decode UTF-8
let x = match self.next_byte()? {
Some(b) if b < 128 => return Ok(Some(b as u32)),
Some(b) => b,
None => return Ok(None),
};
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(self.next_byte()?);
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(self.next_byte()?);
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(self.next_byte()?);
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
};
Ok(Some(chr))
Ok(Some(ch))
}
}
/// Retrieves the next ASCII checked character.
#[inline]
fn next_ascii(&mut self) -> io::Result<Option<u8>> {
match self.next_char() {
Ok(Some(chr)) if chr.is_ascii() => Ok(Some(chr as u8)),
Ok(None) => Ok(None),
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
"non-ASCII byte found",
)),
}
/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
/// Returns the initial codepoint accumulator for the first byte.
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
/// for width 3, and 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
(byte & (0x7F >> width)) as u32
}
/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
(ch << 6) | (byte & CONT_MASK) as u32
}
/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the
/// bits `11`).
#[inline]
fn utf8_is_first_byte(byte: u8) -> bool {
byte <= 0x7F || (byte >> 6) == 0x11
}
#[inline]
fn unwrap_or_0(opt: Option<u8>) -> u8 {
match opt {
Some(byte) => byte,
None => 0,
}
}
#[inline]
fn utf8_len(ch: u32) -> u32 {
if ch <= 0x7F {
1
} else if ch <= 0x7FF {
2
} else if ch <= 0xFFFF {
3
} else {
4
}
}

18
boa/src/syntax/lexer/identifier.rs

@ -8,7 +8,9 @@ use crate::{
lexer::{Token, TokenKind},
},
};
use core::convert::TryFrom;
use std::io::Read;
use std::str;
const STRICT_FORBIDDEN_IDENTIFIERS: [&str; 11] = [
"eval",
@ -51,13 +53,21 @@ impl<R> Tokenizer<R> for Identifier {
{
let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");
let mut buf = self.init.to_string();
let mut init_buf = [0u8; 4];
let mut buf = Vec::new();
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));
cursor.take_while_pred(&mut buf, &|c: char| {
c.is_alphabetic() || c.is_digit(10) || c == '_'
cursor.take_while_char_pred(&mut buf, &|c: u32| {
if let Ok(c) = char::try_from(c) {
c.is_alphabetic() || c.is_digit(10) || c == '_'
} else {
false
}
})?;
let tk = match buf.as_str() {
let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
"true" => TokenKind::BooleanLiteral(true),
"false" => TokenKind::BooleanLiteral(false),
"null" => TokenKind::NullLiteral,

179
boa/src/syntax/lexer/mod.rs

@ -42,6 +42,7 @@ use self::{
};
use crate::syntax::ast::{Punctuator, Span};
pub use crate::{profiler::BoaProfiler, syntax::ast::Position};
use core::convert::TryFrom;
pub use error::Error;
use std::io::Read;
pub use token::{Token, TokenKind};
@ -69,12 +70,12 @@ impl<R> Lexer<R> {
/// * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}`
///
/// [More information](https://tc39.es/ecma262/#table-32)
fn is_whitespace(ch: char) -> bool {
fn is_whitespace(ch: u32) -> bool {
matches!(
ch,
'\u{0020}' | '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{00A0}' | '\u{FEFF}' |
0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF |
// Unicode Space_Separator category (minus \u{0020} and \u{00A0} which are already stated above)
'\u{1680}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}'
0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000
)
}
@ -127,12 +128,12 @@ impl<R> Lexer<R> {
if let Some(c) = self.cursor.peek()? {
match c {
'/' => {
self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/'
b'/' => {
self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/'
SingleLineComment.lex(&mut self.cursor, start)
}
'*' => {
self.cursor.next_char()?.expect("* token vanished"); // Consume the '*'
b'*' => {
self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*'
MultiLineComment.lex(&mut self.cursor, start)
}
ch => {
@ -140,9 +141,9 @@ impl<R> Lexer<R> {
InputElement::Div | InputElement::TemplateTail => {
// Only div punctuator allowed, regex not.
if ch == '=' {
if ch == b'=' {
// Indicates this is an AssignDiv.
self.cursor.next_char()?.expect("= token vanished"); // Consume the '='
self.cursor.next_byte()?.expect("= token vanished"); // Consume the '='
Ok(Token::new(
Punctuator::AssignDiv.into(),
Span::new(start, self.cursor.pos()),
@ -178,90 +179,104 @@ impl<R> Lexer<R> {
{
let _timer = BoaProfiler::global().start_event("next()", "Lexing");
let (start, next_chr) = loop {
let (start, next_ch) = loop {
let start = self.cursor.pos();
if let Some(next_chr) = self.cursor.next_char()? {
if let Some(next_ch) = self.cursor.next_char()? {
// Ignore whitespace
if !Self::is_whitespace(next_chr) {
break (start, next_chr);
if !Self::is_whitespace(next_ch) {
break (start, next_ch);
}
} else {
return Ok(None);
}
};
let token = match next_chr {
'\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new(
TokenKind::LineTerminator,
Span::new(start, self.cursor.pos()),
)),
'"' | '\'' => StringLiteral::new(next_chr).lex(&mut self.cursor, start),
'`' => TemplateLiteral.lex(&mut self.cursor, start),
_ if next_chr.is_digit(10) => NumberLiteral::new(next_chr).lex(&mut self.cursor, start),
_ if next_chr.is_alphabetic() || next_chr == '$' || next_chr == '_' => {
Identifier::new(next_chr).lex(&mut self.cursor, start)
}
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
)),
':' => Ok(Token::new(
Punctuator::Colon.into(),
Span::new(start, self.cursor.pos()),
)),
'.' => SpreadLiteral::new().lex(&mut self.cursor, start),
'(' => Ok(Token::new(
Punctuator::OpenParen.into(),
Span::new(start, self.cursor.pos()),
)),
')' => Ok(Token::new(
Punctuator::CloseParen.into(),
Span::new(start, self.cursor.pos()),
)),
',' => Ok(Token::new(
Punctuator::Comma.into(),
Span::new(start, self.cursor.pos()),
)),
'{' => Ok(Token::new(
Punctuator::OpenBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'}' => Ok(Token::new(
Punctuator::CloseBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'[' => Ok(Token::new(
Punctuator::OpenBracket.into(),
Span::new(start, self.cursor.pos()),
)),
']' => Ok(Token::new(
Punctuator::CloseBracket.into(),
Span::new(start, self.cursor.pos()),
)),
'?' => Ok(Token::new(
Punctuator::Question.into(),
Span::new(start, self.cursor.pos()),
)),
'/' => self.lex_slash_token(start),
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => {
Operator::new(next_chr).lex(&mut self.cursor, start)
if let Ok(c) = char::try_from(next_ch) {
let token = match c {
'\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new(
TokenKind::LineTerminator,
Span::new(start, self.cursor.pos()),
)),
'"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start),
'`' => TemplateLiteral.lex(&mut self.cursor, start),
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_alphabetic() || c == '$' || c == '_' => {
Identifier::new(c).lex(&mut self.cursor, start)
}
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
)),
':' => Ok(Token::new(
Punctuator::Colon.into(),
Span::new(start, self.cursor.pos()),
)),
'.' => SpreadLiteral::new().lex(&mut self.cursor, start),
'(' => Ok(Token::new(
Punctuator::OpenParen.into(),
Span::new(start, self.cursor.pos()),
)),
')' => Ok(Token::new(
Punctuator::CloseParen.into(),
Span::new(start, self.cursor.pos()),
)),
',' => Ok(Token::new(
Punctuator::Comma.into(),
Span::new(start, self.cursor.pos()),
)),
'{' => Ok(Token::new(
Punctuator::OpenBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'}' => Ok(Token::new(
Punctuator::CloseBlock.into(),
Span::new(start, self.cursor.pos()),
)),
'[' => Ok(Token::new(
Punctuator::OpenBracket.into(),
Span::new(start, self.cursor.pos()),
)),
']' => Ok(Token::new(
Punctuator::CloseBracket.into(),
Span::new(start, self.cursor.pos()),
)),
'?' => Ok(Token::new(
Punctuator::Question.into(),
Span::new(start, self.cursor.pos()),
)),
'/' => self.lex_slash_token(start),
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
c,
start.line_number(),
start.column_number()
);
Err(Error::syntax(details, start))
}
}?;
if token.kind() == &TokenKind::Comment {
// Skip comment
self.next()
} else {
Ok(Some(token))
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
next_chr,
} else {
Err(Error::syntax(
format!(
"unexpected utf-8 char '\\u{}' at line {}, column {}",
next_ch,
start.line_number(),
start.column_number()
);
Err(Error::syntax(details, start))
}
}?;
if token.kind() == &TokenKind::Comment {
// Skip comment
self.next()
} else {
Ok(Some(token))
),
start,
))
}
}
}

137
boa/src/syntax/lexer/number.rs

@ -9,6 +9,7 @@ use crate::{
lexer::{token::Numeric, Token},
},
};
use std::str;
use std::{io::Read, str::FromStr};
/// Number literal lexing.
@ -23,12 +24,12 @@ use std::{io::Read, str::FromStr};
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Number_type
#[derive(Debug, Clone, Copy)]
pub(super) struct NumberLiteral {
init: char,
init: u8,
}
impl NumberLiteral {
/// Creates a new string literal lexer.
pub(super) fn new(init: char) -> Self {
pub(super) fn new(init: u8) -> Self {
Self { init }
}
}
@ -63,8 +64,9 @@ impl NumericKind {
}
}
#[inline]
fn take_signed_integer<R>(
buf: &mut String,
buf: &mut Vec<u8>,
cursor: &mut Cursor<R>,
kind: &NumericKind,
) -> Result<(), Error>
@ -73,30 +75,31 @@ where
{
// The next part must be SignedInteger.
// This is optionally a '+' or '-' followed by 1 or more DecimalDigits.
match cursor.next_char()? {
Some('+') => {
buf.push('+');
if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? {
match cursor.next_byte()? {
Some(b'+') => {
buf.push(b'+');
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? {
// A digit must follow the + or - symbol.
return Err(Error::syntax("No digit found after + symbol", cursor.pos()));
}
}
Some('-') => {
buf.push('-');
if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? {
Some(b'-') => {
buf.push(b'-');
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? {
// A digit must follow the + or - symbol.
return Err(Error::syntax("No digit found after - symbol", cursor.pos()));
}
}
Some(c) if c.is_digit(kind.base()) => buf.push(c),
Some(c) => {
return Err(Error::syntax(
format!(
"When lexing exponential value found unexpected char: '{}'",
c
),
cursor.pos(),
));
Some(byte) => {
let ch = char::from(byte);
if ch.is_ascii() && ch.is_digit(kind.base()) {
buf.push(byte);
} else {
return Err(Error::syntax(
"When lexing exponential value found unexpected char",
cursor.pos(),
));
}
}
None => {
return Err(Error::syntax(
@ -107,7 +110,7 @@ where
}
// Consume the decimal digits.
cursor.take_while_pred(buf, &|c: char| c.is_digit(kind.base()))?;
cursor.take_while_ascii_pred(buf, &|ch| ch.is_digit(kind.base()))?;
Ok(())
}
@ -118,12 +121,12 @@ where
/// - [ECMAScript Specification][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-literals-numeric-literals
#[inline]
fn check_after_numeric_literal<R>(cursor: &mut Cursor<R>) -> Result<(), Error>
where
R: Read,
{
let pred = |ch: char| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_';
if cursor.next_is_pred(&pred)? {
if cursor.next_is_ascii_pred(&|ch| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_')? {
Err(Error::syntax(
"a numeric literal must not be followed by an alphanumeric, $ or _ characters",
cursor.pos(),
@ -140,17 +143,17 @@ impl<R> Tokenizer<R> for NumberLiteral {
{
let _timer = BoaProfiler::global().start_event("NumberLiteral", "Lexing");
let mut buf = self.init.to_string();
let mut buf = vec![self.init];
// Default assume the number is a base 10 integer.
let mut kind = NumericKind::Integer(10);
let c = cursor.peek();
if self.init == '0' {
if self.init == b'0' {
if let Some(ch) = c? {
match ch {
'x' | 'X' => {
b'x' | b'X' => {
// Remove the initial '0' from buffer.
cursor.next_char()?.expect("x or X character vanished");
buf.pop();
@ -159,16 +162,14 @@ impl<R> Tokenizer<R> for NumberLiteral {
kind = NumericKind::Integer(16);
// Checks if the next char after '0x' is a digit of that base. if not return an error.
if let Some(digit) = cursor.peek()? {
if !digit.is_digit(16) {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(16))? {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
}
'o' | 'O' => {
b'o' | b'O' => {
// Remove the initial '0' from buffer.
cursor.next_char()?.expect("o or O character vanished");
buf.pop();
@ -177,16 +178,14 @@ impl<R> Tokenizer<R> for NumberLiteral {
kind = NumericKind::Integer(8);
// Checks if the next char after '0o' is a digit of that base. if not return an error.
if let Some(digit) = cursor.peek()? {
if !digit.is_digit(8) {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(8))? {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
}
'b' | 'B' => {
b'b' | b'B' => {
// Remove the initial '0' from buffer.
cursor.next_char()?.expect("b or B character vanished");
buf.pop();
@ -195,16 +194,14 @@ impl<R> Tokenizer<R> for NumberLiteral {
kind = NumericKind::Integer(2);
// Checks if the next char after '0b' is a digit of that base. if not return an error.
if let Some(digit) = cursor.peek()? {
if !digit.is_digit(2) {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(2))? {
return Err(Error::syntax(
"expected hexadecimal digit after number base prefix",
cursor.pos(),
));
}
}
'n' => {
b'n' => {
cursor.next_char()?.expect("n character vanished");
// DecimalBigIntegerLiteral '0n'
@ -213,7 +210,8 @@ impl<R> Tokenizer<R> for NumberLiteral {
Span::new(start_pos, cursor.pos()),
));
}
ch => {
byte => {
let ch = char::from(byte);
if ch.is_digit(8) {
// LegacyOctalIntegerLiteral
if cursor.strict_mode() {
@ -226,7 +224,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
// Remove the initial '0' from buffer.
buf.pop();
buf.push(cursor.next_char()?.expect("'0' character vanished"));
buf.push(cursor.next_byte()?.expect("'0' character vanished"));
kind = NumericKind::Integer(8);
}
@ -240,7 +238,7 @@ impl<R> Tokenizer<R> for NumberLiteral {
start_pos,
));
} else {
buf.push(cursor.next_char()?.expect("Number digit vanished"));
buf.push(cursor.next_byte()?.expect("Number digit vanished"));
}
} // Else indicates that the symbol is a non-number.
}
@ -256,42 +254,42 @@ impl<R> Tokenizer<R> for NumberLiteral {
}
// Consume digits until a non-digit character is encountered or all the characters are consumed.
cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
// The non-digit character could be:
// 'n' To indicate a BigIntLiteralSuffix.
// '.' To indicate a decimal separator.
// 'e' | 'E' To indicate an ExponentPart.
match cursor.peek()? {
Some('n') => {
Some(b'n') => {
// DecimalBigIntegerLiteral
// Lexing finished.
// Consume the n
cursor.next_char()?.expect("n character vanished");
cursor.next_byte()?.expect("n character vanished");
kind = kind.to_bigint();
}
Some('.') => {
Some(b'.') => {
if kind.base() == 10 {
// Only base 10 numbers can have a decimal separator.
// Number literal lexing finished if a . is found for a number in a different base.
cursor.next_char()?.expect(". token vanished");
buf.push('.'); // Consume the .
cursor.next_byte()?.expect(". token vanished");
buf.push(b'.'); // Consume the .
kind = NumericKind::Rational;
// Consume digits until a non-digit character is encountered or all the characters are consumed.
cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?;
// The non-digit character at this point must be an 'e' or 'E' to indicate an Exponent Part.
// Another '.' or 'n' is not allowed.
match cursor.peek()? {
Some('e') | Some('E') => {
Some(b'e') | Some(b'E') => {
// Consume the ExponentIndicator.
cursor.next_char()?.expect("e or E token vanished");
cursor.next_byte()?.expect("e or E token vanished");
buf.push('E');
buf.push(b'E');
take_signed_integer(&mut buf, cursor, &kind)?;
}
@ -301,10 +299,10 @@ impl<R> Tokenizer<R> for NumberLiteral {
}
}
}
Some('e') | Some('E') => {
Some(b'e') | Some(b'E') => {
kind = NumericKind::Rational;
cursor.next_char()?.expect("e or E character vanished"); // Consume the ExponentIndicator.
buf.push('E');
cursor.next_byte()?.expect("e or E character vanished"); // Consume the ExponentIndicator.
buf.push(b'E');
take_signed_integer(&mut buf, cursor, &kind)?;
}
Some(_) | None => {
@ -314,14 +312,15 @@ impl<R> Tokenizer<R> for NumberLiteral {
check_after_numeric_literal(cursor)?;
let num_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let num = match kind {
NumericKind::BigInt(base) => {
Numeric::BigInt(
BigInt::from_string_radix(&buf, base).expect("Could not convert to BigInt")
BigInt::from_string_radix(num_str, base).expect("Could not convert to BigInt")
)
}
NumericKind::Rational /* base: 10 */ => {
let val = f64::from_str(&buf).expect("Failed to parse float after checks");
let val = f64::from_str(num_str).expect("Failed to parse float after checks");
let int_val = val as i32;
// The truncated float should be identically to the non-truncated float for the conversion to be loss-less,
@ -335,12 +334,12 @@ impl<R> Tokenizer<R> for NumberLiteral {
}
},
NumericKind::Integer(base) => {
if let Ok(num) = i32::from_str_radix(&buf, base) {
if let Ok(num) = i32::from_str_radix(num_str, base) {
Numeric::Integer(num)
} else {
let b = f64::from(base);
let mut result = 0.0_f64;
for c in buf.chars() {
for c in num_str.chars() {
let digit = f64::from(c.to_digit(base).expect("could not parse digit after already checking validity"));
result = result * b + digit;
}

62
boa/src/syntax/lexer/operator.rs

@ -17,8 +17,8 @@ macro_rules! vop {
($cursor:ident, $assign_op:expr, $op:expr) => ({
match $cursor.peek()? {
None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())),
Some('=') => {
$cursor.next_char()?.expect("= token vanished");
Some(b'=') => {
$cursor.next_byte()?.expect("= token vanished");
$cursor.next_column();
$assign_op
}
@ -28,13 +28,13 @@ macro_rules! vop {
($cursor:ident, $assign_op:expr, $op:expr, {$($case:pat => $block:expr), +}) => ({
match $cursor.peek()? {
None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())),
Some('=') => {
$cursor.next_char()?.expect("= token vanished");
Some(b'=') => {
$cursor.next_byte()?.expect("= token vanished");
$cursor.next_column();
$assign_op
},
$($case => {
$cursor.next_char()?.expect("Token vanished");
$cursor.next_byte()?.expect("Token vanished");
$cursor.next_column();
$block
})+,
@ -44,7 +44,7 @@ macro_rules! vop {
($cursor:ident, $op:expr, {$($case:pat => $block:expr),+}) => {
match $cursor.peek().ok_or_else(|| Error::syntax("could not preview next value", $cursor.pos()))? {
$($case => {
$cursor.next_char()?;
$cursor.next_byte()?;
$cursor.next_column();
$block
})+,
@ -72,7 +72,7 @@ macro_rules! op {
#[derive(Debug, Clone, Copy)]
pub(super) struct Operator {
init: char,
init: u8,
}
/// Operator lexing.
@ -87,7 +87,7 @@ pub(super) struct Operator {
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators
impl Operator {
/// Creates a new operator lexer.
pub(super) fn new(init: char) -> Self {
pub(super) fn new(init: u8) -> Self {
Self { init }
}
}
@ -100,61 +100,63 @@ impl<R> Tokenizer<R> for Operator {
let _timer = BoaProfiler::global().start_event("Operator", "Lexing");
match self.init {
'*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), {
Some('*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp))
b'*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), {
Some(b'*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp))
}),
'+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), {
Some('+') => Ok(Punctuator::Inc)
b'+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), {
Some(b'+') => Ok(Punctuator::Inc)
}),
'-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), {
Some('-') => {
b'-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), {
Some(b'-') => {
Ok(Punctuator::Dec)
}
}),
'%' => op!(
b'%' => op!(
cursor,
start_pos,
Ok(Punctuator::AssignMod),
Ok(Punctuator::Mod)
),
'|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), {
Some('|') => Ok(Punctuator::BoolOr)
b'|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), {
Some(b'|') => Ok(Punctuator::BoolOr)
}),
'&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), {
Some('&') => Ok(Punctuator::BoolAnd)
b'&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), {
Some(b'&') => Ok(Punctuator::BoolAnd)
}),
'^' => op!(
b'^' => op!(
cursor,
start_pos,
Ok(Punctuator::AssignXor),
Ok(Punctuator::Xor)
),
'=' => op!(cursor, start_pos, if cursor.next_is('=')? {
b'=' => op!(cursor, start_pos, if cursor.next_is(b'=')? {
Ok(Punctuator::StrictEq)
} else {
Ok(Punctuator::Eq)
}, Ok(Punctuator::Assign), {
Some('>') => {
Some(b'>') => {
Ok(Punctuator::Arrow)
}
}),
'<' => op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), {
Some('<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh))
}),
'>' => {
b'<' => {
op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), {
Some(b'<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh))
})
}
b'>' => {
op!(cursor, start_pos, Ok(Punctuator::GreaterThanOrEq), Ok(Punctuator::GreaterThan), {
Some('>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), {
Some('>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh))
Some(b'>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), {
Some(b'>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh))
})
})
}
'!' => op!(
b'!' => op!(
cursor,
start_pos,
vop!(cursor, Ok(Punctuator::StrictNotEq), Ok(Punctuator::NotEq)),
Ok(Punctuator::Not)
),
'~' => Ok(Token::new(
b'~' => Ok(Token::new(
Punctuator::Neg.into(),
Span::new(start_pos, cursor.pos()),
)),

67
boa/src/syntax/lexer/regex.rs

@ -9,6 +9,8 @@ use crate::{
},
};
use bitflags::bitflags;
use std::io::{self, ErrorKind};
use std::str;
use std::{
fmt::{self, Display, Formatter},
io::Read,
@ -39,11 +41,11 @@ impl<R> Tokenizer<R> for RegexLiteral {
{
let _timer = BoaProfiler::global().start_event("RegexLiteral", "Lexing");
let mut body = String::new();
let mut body = Vec::new();
// Lex RegularExpressionBody.
loop {
match cursor.next_char()? {
match cursor.next_byte()? {
None => {
// Abrupt end.
return Err(Error::syntax(
@ -51,29 +53,45 @@ impl<R> Tokenizer<R> for RegexLiteral {
cursor.pos(),
));
}
Some(c) => {
match c {
'/' => break, // RegularExpressionBody finished.
'\n' | '\r' | '\u{2028}' | '\u{2029}' => {
Some(b) => {
match b {
b'/' => break, // RegularExpressionBody finished.
b'\n' | b'\r' => {
// Not allowed in Regex literal.
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
'\\' => {
0xE2 if (cursor.peek_n(2)? == 0xA8_80 || cursor.peek_n(2)? == 0xA9_80) => {
// '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b'\\' => {
// Escape sequence
body.push('\\');
if let Some(sc) = cursor.next_char()? {
body.push(b'\\');
if let Some(sc) = cursor.next_byte()? {
match sc {
'\n' | '\r' | '\u{2028}' | '\u{2029}' => {
b'\n' | b'\r' => {
// Not allowed in Regex literal.
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
ch => body.push(ch),
0xE2 if (cursor.peek_n(2)? == 0xA8_80
|| cursor.peek_n(2)? == 0xA9_80) =>
{
// '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed
return Err(Error::syntax(
"new lines are not allowed in regular expressions",
cursor.pos(),
));
}
b => body.push(b),
}
} else {
// Abrupt end of regex.
@ -83,20 +101,31 @@ impl<R> Tokenizer<R> for RegexLiteral {
));
}
}
_ => body.push(c),
_ => body.push(b),
}
}
}
}
let mut flags = String::new();
let mut flags = Vec::new();
let flags_start = cursor.pos();
cursor.take_while_pred(&mut flags, &char::is_alphabetic)?;
Ok(Token::new(
TokenKind::regular_expression_literal(body, parse_regex_flags(&flags, flags_start)?),
Span::new(start_pos, cursor.pos()),
))
cursor.take_while_ascii_pred(&mut flags, &|c: char| c.is_alphabetic())?;
let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) };
if let Ok(body_str) = str::from_utf8(body.as_slice()) {
Ok(Token::new(
TokenKind::regular_expression_literal(
body_str,
parse_regex_flags(flags_str, flags_start)?,
),
Span::new(start_pos, cursor.pos()),
))
} else {
Err(Error::from(io::Error::new(
ErrorKind::InvalidData,
"Invalid UTF-8 character in regular expressions",
)))
}
}
}

4
boa/src/syntax/lexer/spread.rs

@ -38,8 +38,8 @@ impl<R> Tokenizer<R> for SpreadLiteral {
let _timer = BoaProfiler::global().start_event("SpreadLiteral", "Lexing");
// . or ...
if cursor.next_is('.')? {
if cursor.next_is('.')? {
if cursor.next_is(b'.')? {
if cursor.next_is(b'.')? {
Ok(Token::new(
Punctuator::Spread.into(),
Span::new(start_pos, cursor.pos()),

46
boa/src/syntax/lexer/string.rs

@ -8,6 +8,7 @@ use crate::{
lexer::{Token, TokenKind},
},
};
use core::convert::TryFrom;
use std::{
io::{self, ErrorKind, Read},
str,
@ -58,12 +59,13 @@ impl<R> Tokenizer<R> for StringLiteral {
let mut buf: Vec<u16> = Vec::new();
loop {
let next_chr_start = cursor.pos();
let next_chr = cursor.next_char()?.ok_or_else(|| {
let next_chr = char::try_from(cursor.next_char()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated string literal",
))
})?;
})?)
.unwrap();
match next_chr {
'\'' if self.terminator == StringTerminator::SingleQuote => {
@ -76,22 +78,22 @@ impl<R> Tokenizer<R> for StringLiteral {
let _timer = BoaProfiler::global()
.start_event("StringLiteral - escape sequence", "Lexing");
let escape = cursor.next_char()?.ok_or_else(|| {
let escape = cursor.next_byte()?.ok_or_else(|| {
Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"unterminated escape sequence in string literal",
))
})?;
if escape != '\n' {
if escape != b'\n' {
match escape {
'n' => buf.push('\n' as u16),
'r' => buf.push('\r' as u16),
't' => buf.push('\t' as u16),
'b' => buf.push('\x08' as u16),
'f' => buf.push('\x0c' as u16),
'0' => buf.push('\0' as u16),
'x' => {
b'n' => buf.push('\n' as u16),
b'r' => buf.push('\r' as u16),
b't' => buf.push('\t' as u16),
b'b' => buf.push('\x08' as u16),
b'f' => buf.push('\x0c' as u16),
b'0' => buf.push('\0' as u16),
b'x' => {
let mut code_point_utf8_bytes = [0u8; 2];
cursor.fill_bytes(&mut code_point_utf8_bytes)?;
let code_point_str = str::from_utf8(&code_point_utf8_bytes)
@ -106,17 +108,20 @@ impl<R> Tokenizer<R> for StringLiteral {
buf.push(code_point);
}
'u' => {
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is('{')? {
cursor.next_char()?.expect("{ character vanished"); // Consume the '{'.
if cursor.next_is(b'{')? {
cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'.
// TODO: use bytes for a bit better performance (using stack)
let mut code_point_str = String::with_capacity(6);
cursor.take_until('}', &mut code_point_str)?;
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;
cursor.next_char()?.expect("} character vanished"); // Consume the '}'.
cursor.next_byte()?.expect("} character vanished"); // Consume the '}'.
let code_point_str = unsafe {
str::from_utf8_unchecked(code_point_buf.as_slice())
};
// We know this is a single unicode codepoint, convert to u32
let code_point = u32::from_str_radix(&code_point_str, 16)
.map_err(|_| {
@ -156,13 +161,12 @@ impl<R> Tokenizer<R> for StringLiteral {
buf.push(code_point);
}
}
'\'' | '"' | '\\' => buf.push(escape as u16),
ch => {
b'\'' | b'"' | b'\\' => buf.push(escape as u16),
_ => {
let details = format!(
"invalid escape sequence `{}` at line {}, column {}",
"invalid escape sequence at line {}, column {}",
next_chr_start.line_number(),
next_chr_start.column_number(),
ch
);
return Err(Error::syntax(details, cursor.pos()));
}

24
boa/src/syntax/lexer/template.rs

@ -9,6 +9,7 @@ use crate::{
},
};
use std::io::{self, ErrorKind, Read};
use std::str;
/// Template literal lexing.
///
@ -30,23 +31,30 @@ impl<R> Tokenizer<R> for TemplateLiteral {
{
let _timer = BoaProfiler::global().start_event("TemplateLiteral", "Lexing");
let mut buf = String::new();
let mut buf = Vec::new();
loop {
match cursor.next_char()? {
match cursor.next_byte()? {
None => {
return Err(Error::from(io::Error::new(
ErrorKind::UnexpectedEof,
"Unterminated template literal",
)));
}
Some('`') => break, // Template literal finished.
Some(next_ch) => buf.push(next_ch), // TODO when there is an expression inside the literal
Some(b'`') => break, // Template literal finished.
Some(next_byte) => buf.push(next_byte), // TODO when there is an expression inside the literal
}
}
Ok(Token::new(
TokenKind::template_literal(buf),
Span::new(start_pos, cursor.pos()),
))
if let Ok(s) = str::from_utf8(buf.as_slice()) {
Ok(Token::new(
TokenKind::template_literal(s),
Span::new(start_pos, cursor.pos()),
))
} else {
Err(Error::from(io::Error::new(
ErrorKind::InvalidData,
"Invalid UTF-8 character in template literal",
)))
}
}
}

101
boa/src/syntax/lexer/tests.rs

@ -6,6 +6,7 @@ use super::token::Numeric;
use super::*;
use super::{Error, Position};
use crate::syntax::ast::Keyword;
use std::str;
fn span(start: (u32, u32), end: (u32, u32)) -> Span {
Span::new(Position::new(start.0, start.1), Position::new(end.0, end.1))
@ -280,19 +281,19 @@ fn check_positions_codepoint() {
// String token starts on column 13
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 13), (1, 34))
span((1, 13), (1, 36))
);
// Close parenthesis token starts on column 34
// Close parenthesis token starts on column 36
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 34), (1, 35))
span((1, 36), (1, 37))
);
// Semi Colon token starts on column 35
// Semi Colon token starts on column 37
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 35), (1, 36))
span((1, 37), (1, 38))
);
}
@ -554,38 +555,102 @@ fn addition_no_spaces_e_number() {
}
#[test]
fn take_while_pred_simple() {
fn take_while_ascii_pred_simple() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: String = String::new();
let mut buf: Vec<u8> = Vec::new();
cur.take_while_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c')
cur.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c')
.unwrap();
assert_eq!(buf, "abc");
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc");
}
#[test]
fn take_while_pred_immediate_stop() {
fn take_while_ascii_pred_immediate_stop() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: String = String::new();
let mut buf: Vec<u8> = Vec::new();
cur.take_while_pred(&mut buf, &|c| c == 'd').unwrap();
cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap();
assert_eq!(buf, "");
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "");
}
#[test]
fn take_while_pred_entire_str() {
fn take_while_ascii_pred_entire_str() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: String = String::new();
let mut buf: Vec<u8> = Vec::new();
cur.take_while_pred(&mut buf, &|c| c.is_alphabetic())
.unwrap();
cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk");
}
#[test]
fn take_while_ascii_pred_non_ascii_stop() {
let mut cur = Cursor::new("abcde😀fghijk".as_bytes());
let mut buf: Vec<u8> = Vec::new();
cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde");
}
#[test]
fn take_while_char_pred_simple() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|c| {
c == 'a' as u32 || c == 'b' as u32 || c == 'c' as u32
})
.unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc");
}
#[test]
fn take_while_char_pred_immediate_stop() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|_| false).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "");
}
#[test]
fn take_while_char_pred_entire_str() {
let mut cur = Cursor::new(&b"abcdefghijk"[..]);
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|_| true).unwrap();
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk");
}
#[test]
fn take_while_char_pred_utf8_char() {
let mut cur = Cursor::new("abc😀defghijk".as_bytes());
let mut buf: Vec<u8> = Vec::new();
cur.take_while_char_pred(&mut buf, &|c| {
if let Ok(c) = char::try_from(c) {
c == 'a' || c == 'b' || c == 'c' || c == '😀'
} else {
false
}
})
.unwrap();
assert_eq!(buf, "abcdefghijk");
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc😀");
}
#[test]

Loading…
Cancel
Save