From 5534ec2773fbc93864285f11a71f94963a73a964 Mon Sep 17 00:00:00 2001 From: Hans Larsen Date: Mon, 9 Sep 2024 07:25:07 -0700 Subject: [PATCH] Add a display type for JsString to allow formatting without allocations (#3951) Co-authored-by: Haled Odat <8566042+HalidOdat@users.noreply.github.com> --- core/string/src/display.rs | 44 ++++++++++++++++++++++++++++++++++++++ core/string/src/lib.rs | 40 ++++++++++++++++++++++------------ core/string/src/str.rs | 12 ++++++++++- core/string/src/tests.rs | 22 ++++++++++++++++++- 4 files changed, 102 insertions(+), 16 deletions(-) create mode 100644 core/string/src/display.rs diff --git a/core/string/src/display.rs b/core/string/src/display.rs new file mode 100644 index 0000000000..9124af24e1 --- /dev/null +++ b/core/string/src/display.rs @@ -0,0 +1,44 @@ +//! Display implementations for [`crate::JsString`]. +use crate::{CodePoint, JsStr, JsStrVariant}; +use std::fmt; +use std::fmt::Write; + +/// Display implementation for [`crate::JsString`] that escapes unicode characters. +#[derive(Debug)] +pub struct JsStrDisplayEscaped<'a> { + inner: JsStr<'a>, +} + +impl fmt::Display for JsStrDisplayEscaped<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.inner.variant() { + // SAFETY: `JsStrVariant::Latin1` does not contain any unpaired surrogates, so need to check. + JsStrVariant::Latin1(v) => v + .iter() + .copied() + .map(char::from) + .try_for_each(|c| f.write_char(c)), + JsStrVariant::Utf16(_) => self.inner.code_points().try_for_each(|r| match r { + CodePoint::Unicode(c) => f.write_char(c), + CodePoint::UnpairedSurrogate(c) => { + write!(f, "\\u{c:04X}") + } + }), + } + } +} + +impl<'a> From> for JsStrDisplayEscaped<'a> { + fn from(inner: JsStr<'a>) -> Self { + Self { inner } + } +} + +#[test] +fn latin1() { + // 0xE9 is `é` in ISO-8859-1 (see https://www.ascii-code.com/ISO-8859-1). + let s = JsStr::latin1(b"Hello \xE9 world!"); + + let rust_str = format!("{}", JsStrDisplayEscaped { inner: s }); + assert_eq!(rust_str, "Hello é world!"); +} diff --git a/core/string/src/lib.rs b/core/string/src/lib.rs index c45963add1..ecacaa1d56 100644 --- a/core/string/src/lib.rs +++ b/core/string/src/lib.rs @@ -17,6 +17,7 @@ #![allow(clippy::module_name_repetitions)] mod common; +mod display; mod iter; mod str; mod tagged; @@ -25,6 +26,7 @@ mod tagged; mod tests; use self::{iter::Windows, str::JsSliceIndex}; +use crate::display::JsStrDisplayEscaped; use crate::tagged::{Tagged, UnwrappedTagged}; #[doc(inline)] pub use crate::{ @@ -32,6 +34,7 @@ pub use crate::{ iter::Iter, str::{JsStr, JsStrVariant}, }; +use std::fmt::Write; use std::{ alloc::{alloc, dealloc, Layout}, cell::Cell, @@ -150,6 +153,17 @@ impl CodePoint { } } +impl std::fmt::Display for CodePoint { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CodePoint::Unicode(c) => f.write_char(*c), + CodePoint::UnpairedSurrogate(c) => { + write!(f, "\\u{c:04X}") + } + } + } +} + /// A `usize` contains a flag and the length of Latin1/UTF-16 . /// ```text /// ┌────────────────────────────────────┐ @@ -528,10 +542,7 @@ impl JsString { /// Gets an iterator of all the Unicode codepoints of a [`JsString`]. #[inline] pub fn code_points(&self) -> impl Iterator + Clone + '_ { - char::decode_utf16(self.iter()).map(|res| match res { - Ok(c) => CodePoint::Unicode(c), - Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), - }) + self.as_str().code_points() } /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )` @@ -935,6 +946,15 @@ impl JsString { UnwrappedTagged::Tag(_inner) => None, } } + + /// Gets a displayable escaped string. This may be faster and has less + /// allocations than `format!("{}", str.to_string_escaped())` when + /// displaying. + #[inline] + #[must_use] + pub fn display_escaped(&self) -> JsStrDisplayEscaped<'_> { + JsStrDisplayEscaped::from(self.as_str()) + } } impl Clone for JsString { @@ -1036,10 +1056,7 @@ impl Drop for JsString { impl ToStringEscaped for JsString { #[inline] fn to_string_escaped(&self) -> String { - match self.as_str().variant() { - JsStrVariant::Latin1(v) => v.iter().copied().map(char::from).collect(), - JsStrVariant::Utf16(v) => v.to_string_escaped(), - } + format!("{}", self.display_escaped()) } } @@ -1232,11 +1249,6 @@ pub(crate) trait ToStringEscaped { impl ToStringEscaped for [u16] { #[inline] fn to_string_escaped(&self) -> String { - char::decode_utf16(self.iter().copied()) - .map(|r| match r { - Ok(c) => String::from(c), - Err(e) => format!("\\u{:04X}", e.unpaired_surrogate()), - }) - .collect() + JsString::from(self).to_string_escaped() } } diff --git a/core/string/src/str.rs b/core/string/src/str.rs index c42821558f..ffbee00034 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -1,4 +1,4 @@ -use crate::{is_trimmable_whitespace, is_trimmable_whitespace_latin1, Iter}; +use crate::{is_trimmable_whitespace, is_trimmable_whitespace_latin1, CodePoint, Iter}; use std::{ hash::{Hash, Hasher}, slice::SliceIndex, @@ -234,6 +234,16 @@ impl<'a> JsStr<'a> { let (m, n) = (self.len(), needle.len()); m >= n && needle == self.get(m - n..).expect("already checked size") } + + /// Gets an iterator of all the Unicode codepoints of a [`JsStr`]. + /// This is not optimized for Latin1 strings. + #[inline] + pub(crate) fn code_points(self) -> impl Iterator + Clone + 'a { + char::decode_utf16(self.iter()).map(|res| match res { + Ok(c) => CodePoint::Unicode(c), + Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), + }) + } } impl Hash for JsStr<'_> { diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index c9cf68eca7..abf10c0c80 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -2,7 +2,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash}; -use crate::{JsStr, JsString, StaticJsString, StaticJsStrings}; +use crate::{JsStr, JsString, StaticJsString, StaticJsStrings, ToStringEscaped}; use rustc_hash::FxHasher; @@ -174,6 +174,26 @@ fn conversion_to_known_static_js_string() { assert!(string.unwrap().as_str().is_latin1()); } +#[test] +fn to_string_escaped() { + assert_eq!( + JsString::from("Hello, \u{1D49E} world!").to_string_escaped(), + "Hello, \u{1D49E} world!" + ); + + assert_eq!( + JsString::from("Hello, world!").to_string_escaped(), + "Hello, world!" + ); + + // 15 should not be escaped. + let unpaired_surrogates: [u16; 3] = [0xDC58, 0xD83C, 0x0015]; + assert_eq!( + JsString::from(&unpaired_surrogates).to_string_escaped(), + "\\uDC58\\uD83C\u{15}" + ); +} + #[test] fn from_static_js_string() { static STATIC_HELLO_WORLD: StaticJsString =