mirror of https://github.com/boa-dev/boa.git
Browse Source
This Pull Request closes #894.
It changes the following:
- Adds the `encodeURI()`, `decodeURI()`, `encodeURIComponent()` and `decodeURIComponent()` functions
- Passes all the tests except for those depending on #1987 or on the comment below.
Things to discuss:
- I'm unable to find in the spec information regarding the only failing tests, which relate to [this](f1870753fa/test/built-ins/encodeURI/S15.1.3.3_A1.1_T2.js
):
> If string.charAt(k) in [0xDC00 - 0xDFFF], throw URIError
Let me know your thoughts :)
Co-authored-by: raskad <32105367+raskad@users.noreply.github.com>
pull/2287/head
Iban Eguia
2 years ago
7 changed files with 779 additions and 24 deletions
@ -0,0 +1,110 @@ |
|||||||
|
//! URI handling function constants
|
||||||
|
//!
|
||||||
|
//! This module contains a few constants used to handle decoding and encoding for URI handling
|
||||||
|
//! functions. They make it easier and more performant to compare different ranges and code points.
|
||||||
|
|
||||||
|
use std::ops::RangeInclusive; |
||||||
|
|
||||||
|
/// A range containing all the lowercase `uriAlpha` code points.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha
|
||||||
|
const URI_ALPHA_LOWER: RangeInclusive<u16> = b'a' as u16..=b'z' as u16; |
||||||
|
|
||||||
|
/// A range containing all the uppercase `uriAlpha` code points.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha
|
||||||
|
const URI_ALPHA_UPPER: RangeInclusive<u16> = b'A' as u16..=b'Z' as u16; |
||||||
|
|
||||||
|
/// A range containing all the `DecimalDigit` code points.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-DecimalDigit
|
||||||
|
const DECIMAL_DIGIT: RangeInclusive<u16> = b'0' as u16..=b'9' as u16; |
||||||
|
|
||||||
|
/// An array containing all the `uriMark` code points.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-uriMark
|
||||||
|
const URI_MARK: [u16; 9] = [ |
||||||
|
b'-' as u16, |
||||||
|
b'_' as u16, |
||||||
|
b'.' as u16, |
||||||
|
b'!' as u16, |
||||||
|
b'~' as u16, |
||||||
|
b'*' as u16, |
||||||
|
b'\'' as u16, |
||||||
|
b'(' as u16, |
||||||
|
b')' as u16, |
||||||
|
]; |
||||||
|
|
||||||
|
/// An array containing all the `uriReserved` code points.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-uriReserved
|
||||||
|
const URI_RESERVED: [u16; 10] = [ |
||||||
|
b';' as u16, |
||||||
|
b'/' as u16, |
||||||
|
b'?' as u16, |
||||||
|
b':' as u16, |
||||||
|
b'@' as u16, |
||||||
|
b'&' as u16, |
||||||
|
b'=' as u16, |
||||||
|
b'+' as u16, |
||||||
|
b'$' as u16, |
||||||
|
b',' as u16, |
||||||
|
]; |
||||||
|
|
||||||
|
/// The number sign (`#`) symbol as a UTF-16 code potint.
|
||||||
|
const NUMBER_SIGN: u16 = b'#' as u16; |
||||||
|
|
||||||
|
/// Constant with all the unescaped URI characters.
|
||||||
|
///
|
||||||
|
/// Contains `uriAlpha`, `DecimalDigit` and `uriMark`.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped
|
||||||
|
#[inline] |
||||||
|
pub(super) fn is_uri_unescaped(code_point: u16) -> bool { |
||||||
|
URI_ALPHA_LOWER.contains(&code_point) |
||||||
|
|| URI_ALPHA_UPPER.contains(&code_point) |
||||||
|
|| DECIMAL_DIGIT.contains(&code_point) |
||||||
|
|| URI_MARK.contains(&code_point) |
||||||
|
} |
||||||
|
|
||||||
|
/// Constant with all the reserved URI characters, plus the number sign symbol (`#`).
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#prod-uriReserved
|
||||||
|
#[inline] |
||||||
|
pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool { |
||||||
|
code_point == NUMBER_SIGN || URI_RESERVED.contains(&code_point) |
||||||
|
} |
||||||
|
|
||||||
|
/// Constant with all the reserved and unescaped URI characters, plus the number sign symbol (`#`).
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [`uriReserved` in ECMAScript spec][uri_reserved]
|
||||||
|
/// - [`uriUnescaped` in ECMAScript spec][uri_unescaped]
|
||||||
|
///
|
||||||
|
/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved
|
||||||
|
/// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped
|
||||||
|
#[inline] |
||||||
|
pub(super) fn is_uri_reserved_or_uri_unescaped_or_number_sign(code_point: u16) -> bool { |
||||||
|
code_point == NUMBER_SIGN || is_uri_unescaped(code_point) || URI_RESERVED.contains(&code_point) |
||||||
|
} |
@ -0,0 +1,550 @@ |
|||||||
|
//! URI Handling Functions
|
||||||
|
//!
|
||||||
|
//! Uniform Resource Identifiers, or URIs, are Strings that identify resources (e.g. web pages or
|
||||||
|
//! files) and transport protocols by which to access them (e.g. HTTP or FTP) on the Internet. The
|
||||||
|
//! ECMAScript language itself does not provide any support for using URIs except for functions
|
||||||
|
//! that encode and decode URIs as described in 19.2.6.2, 19.2.6.3, 19.2.6.4 and 19.2.6.5
|
||||||
|
//!
|
||||||
|
//! More information:
|
||||||
|
//! - [ECMAScript reference][spec]
|
||||||
|
//!
|
||||||
|
//! [spec]: https://tc39.es/ecma262/#sec-uri-handling-functions
|
||||||
|
|
||||||
|
mod consts; |
||||||
|
|
||||||
|
use self::consts::{ |
||||||
|
is_uri_reserved_or_number_sign, is_uri_reserved_or_uri_unescaped_or_number_sign, |
||||||
|
is_uri_unescaped, |
||||||
|
}; |
||||||
|
|
||||||
|
use super::{string::code_point_at, BuiltIn}; |
||||||
|
use crate::{ |
||||||
|
builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, |
||||||
|
JsValue, |
||||||
|
}; |
||||||
|
|
||||||
|
/// URI Handling Functions
|
||||||
|
#[derive(Debug, Clone, Copy)] |
||||||
|
pub(crate) struct Uri; |
||||||
|
|
||||||
|
impl BuiltIn for Uri { |
||||||
|
const NAME: &'static str = "Uri"; |
||||||
|
|
||||||
|
fn init(context: &mut Context) -> Option<JsValue> { |
||||||
|
let decode_uri = FunctionBuilder::native(context, Self::decode_uri) |
||||||
|
.name("decodeURI") |
||||||
|
.length(1) |
||||||
|
.constructor(false) |
||||||
|
.build(); |
||||||
|
|
||||||
|
context.register_global_property( |
||||||
|
"decodeURI", |
||||||
|
decode_uri, |
||||||
|
Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, |
||||||
|
); |
||||||
|
|
||||||
|
let decode_uri_component = FunctionBuilder::native(context, Self::decode_uri_component) |
||||||
|
.name("decodeURIComponent") |
||||||
|
.length(1) |
||||||
|
.constructor(false) |
||||||
|
.build(); |
||||||
|
|
||||||
|
context.register_global_property( |
||||||
|
"decodeURIComponent", |
||||||
|
decode_uri_component, |
||||||
|
Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, |
||||||
|
); |
||||||
|
|
||||||
|
let encode_uri = FunctionBuilder::native(context, Self::encode_uri) |
||||||
|
.name("encodeURI") |
||||||
|
.length(1) |
||||||
|
.constructor(false) |
||||||
|
.build(); |
||||||
|
|
||||||
|
context.register_global_property( |
||||||
|
"encodeURI", |
||||||
|
encode_uri, |
||||||
|
Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, |
||||||
|
); |
||||||
|
|
||||||
|
let encode_uri_component = FunctionBuilder::native(context, Self::encode_uri_component) |
||||||
|
.name("encodeURIComponent") |
||||||
|
.length(1) |
||||||
|
.constructor(false) |
||||||
|
.build(); |
||||||
|
|
||||||
|
context.register_global_property( |
||||||
|
"encodeURIComponent", |
||||||
|
encode_uri_component, |
||||||
|
Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, |
||||||
|
); |
||||||
|
|
||||||
|
None |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
impl Uri { |
||||||
|
/// Builtin JavaScript `decodeURI ( encodedURI )` function.
|
||||||
|
///
|
||||||
|
/// This function computes a new version of a URI in which each escape sequence and UTF-8
|
||||||
|
/// encoding of the sort that might be introduced by the `encodeURI` function is replaced with
|
||||||
|
/// the UTF-16 encoding of the code points that it represents. Escape sequences that could not
|
||||||
|
/// have been introduced by `encodeURI` are not replaced.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
/// - [MDN documentation][mdn]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#sec-decodeuri-encodeduri
|
||||||
|
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI
|
||||||
|
pub(crate) fn decode_uri( |
||||||
|
_: &JsValue, |
||||||
|
args: &[JsValue], |
||||||
|
context: &mut Context, |
||||||
|
) -> JsResult<JsValue> { |
||||||
|
let encoded_uri = args.get_or_undefined(0); |
||||||
|
|
||||||
|
// 1. Let uriString be ? ToString(encodedURI).
|
||||||
|
let uri_string = encoded_uri.to_string(context)?; |
||||||
|
|
||||||
|
// 2. Let reservedURISet be a String containing one instance of each code unit valid in uriReserved plus "#".
|
||||||
|
let reserved_uri_set = is_uri_reserved_or_number_sign; |
||||||
|
|
||||||
|
// 3. Return ? Decode(uriString, reservedURISet).
|
||||||
|
Ok(JsValue::from(decode( |
||||||
|
context, |
||||||
|
&uri_string, |
||||||
|
reserved_uri_set, |
||||||
|
)?)) |
||||||
|
} |
||||||
|
|
||||||
|
/// Builtin JavaScript `decodeURIComponent ( encodedURIComponent )` function.
|
||||||
|
///
|
||||||
|
/// This function computes a new version of a URI in which each escape sequence and UTF-8
|
||||||
|
/// encoding of the sort that might be introduced by the `encodeURIComponent` function is
|
||||||
|
/// replaced with the UTF-16 encoding of the code points that it represents.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
/// - [MDN documentation][mdn]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#sec-decodeuricomponent-encodeduricomponent
|
||||||
|
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent
|
||||||
|
pub(crate) fn decode_uri_component( |
||||||
|
_: &JsValue, |
||||||
|
args: &[JsValue], |
||||||
|
context: &mut Context, |
||||||
|
) -> JsResult<JsValue> { |
||||||
|
let encoded_uri_component = args.get_or_undefined(0); |
||||||
|
|
||||||
|
// 1. Let componentString be ? ToString(encodedURIComponent).
|
||||||
|
let component_string = encoded_uri_component.to_string(context)?; |
||||||
|
|
||||||
|
// 2. Let reservedURIComponentSet be the empty String.
|
||||||
|
let reserved_uri_component_set = |_: u16| false; |
||||||
|
|
||||||
|
// 3. Return ? Decode(componentString, reservedURIComponentSet).
|
||||||
|
Ok(JsValue::from(decode( |
||||||
|
context, |
||||||
|
&component_string, |
||||||
|
reserved_uri_component_set, |
||||||
|
)?)) |
||||||
|
} |
||||||
|
|
||||||
|
/// Builtin JavaScript `encodeURI ( uri )` function.
|
||||||
|
///
|
||||||
|
/// This function computes a new version of a UTF-16 encoded (6.1.4) URI in which each instance
|
||||||
|
/// of certain code points is replaced by one, two, three, or four escape sequences
|
||||||
|
/// representing the UTF-8 encoding of the code points.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
/// - [MDN documentation][mdn]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#sec-encodeuri-uri
|
||||||
|
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
|
||||||
|
pub(crate) fn encode_uri( |
||||||
|
_: &JsValue, |
||||||
|
args: &[JsValue], |
||||||
|
context: &mut Context, |
||||||
|
) -> JsResult<JsValue> { |
||||||
|
let uri = args.get_or_undefined(0); |
||||||
|
|
||||||
|
// 1. Let uriString be ? ToString(uri).
|
||||||
|
let uri_string = uri.to_string(context)?; |
||||||
|
|
||||||
|
// 2. Let unescapedURISet be a String containing one instance of each code unit valid in uriReserved and uriUnescaped plus "#".
|
||||||
|
let unescaped_uri_set = is_uri_reserved_or_uri_unescaped_or_number_sign; |
||||||
|
|
||||||
|
// 3. Return ? Encode(uriString, unescapedURISet).
|
||||||
|
Ok(JsValue::from(encode( |
||||||
|
context, |
||||||
|
&uri_string, |
||||||
|
unescaped_uri_set, |
||||||
|
)?)) |
||||||
|
} |
||||||
|
|
||||||
|
/// Builtin JavaScript `encodeURIComponent ( uriComponent )` function.
|
||||||
|
///
|
||||||
|
/// This function computes a new version of a UTF-16 encoded (6.1.4) URI in which each instance
|
||||||
|
/// of certain code points is replaced by one, two, three, or four escape sequences
|
||||||
|
/// representing the UTF-8 encoding of the code point.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
/// - [MDN documentation][mdn]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent
|
||||||
|
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent
|
||||||
|
pub(crate) fn encode_uri_component( |
||||||
|
_: &JsValue, |
||||||
|
args: &[JsValue], |
||||||
|
context: &mut Context, |
||||||
|
) -> JsResult<JsValue> { |
||||||
|
let uri_component = args.get_or_undefined(0); |
||||||
|
|
||||||
|
// 1. Let componentString be ? ToString(uriComponent).
|
||||||
|
let component_string = uri_component.to_string(context)?; |
||||||
|
|
||||||
|
// 2. Let unescapedURIComponentSet be a String containing one instance of each code unit valid in uriUnescaped.
|
||||||
|
let unescaped_uri_component_set = is_uri_unescaped; |
||||||
|
|
||||||
|
// 3. Return ? Encode(componentString, unescapedURIComponentSet).
|
||||||
|
Ok(JsValue::from(encode( |
||||||
|
context, |
||||||
|
&component_string, |
||||||
|
unescaped_uri_component_set, |
||||||
|
)?)) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/// The `Encode ( string, unescapedSet )` abstract operation
|
||||||
|
///
|
||||||
|
/// The abstract operation Encode takes arguments `string` (a String) and `unescapedSet` (a String)
|
||||||
|
/// and returns either a normal completion containing a String or a throw completion. It performs
|
||||||
|
/// URI encoding and escaping.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#sec-encode
|
||||||
|
fn encode<F>(context: &mut Context, string: &JsString, unescaped_set: F) -> JsResult<String> |
||||||
|
where |
||||||
|
F: Fn(u16) -> bool, |
||||||
|
{ |
||||||
|
let code_units = string.encode_utf16().collect::<Vec<_>>(); |
||||||
|
|
||||||
|
// 1. Let strLen be the length of string.
|
||||||
|
let str_len = code_units.len(); |
||||||
|
|
||||||
|
// 2. Let R be the empty String.
|
||||||
|
let mut r = String::new(); |
||||||
|
|
||||||
|
// 3. Let k be 0.
|
||||||
|
let mut k = 0; |
||||||
|
// 4. Repeat,
|
||||||
|
loop { |
||||||
|
// a. If k = strLen, return R.
|
||||||
|
if k == str_len { |
||||||
|
return Ok(r); |
||||||
|
} |
||||||
|
|
||||||
|
// b. Let C be the code unit at index k within string.
|
||||||
|
let c = code_units[k]; |
||||||
|
|
||||||
|
// c. If C is in unescapedSet, then
|
||||||
|
if unescaped_set(c) { |
||||||
|
// i. Set k to k + 1.
|
||||||
|
k += 1; |
||||||
|
|
||||||
|
// ii. Set R to the string-concatenation of R and C.
|
||||||
|
r.push(char::from_u32(u32::from(c)).expect("char from code point cannot fail here")); |
||||||
|
} else { |
||||||
|
// d. Else,
|
||||||
|
// i. Let cp be CodePointAt(string, k).
|
||||||
|
let cp = code_point_at(string, k as u64); |
||||||
|
|
||||||
|
// ii. If cp.[[IsUnpairedSurrogate]] is true, throw a URIError exception.
|
||||||
|
if cp.is_unpaired_surrogate { |
||||||
|
context.throw_uri_error("trying to encode an invalid string")?; |
||||||
|
} |
||||||
|
|
||||||
|
// iii. Set k to k + cp.[[CodeUnitCount]].
|
||||||
|
k += cp.code_unit_count as usize; |
||||||
|
|
||||||
|
// iv. Let Octets be the List of octets resulting by applying the UTF-8 transformation
|
||||||
|
// to cp.[[CodePoint]].
|
||||||
|
let mut buff = [0_u8; 4]; // Will never be more than 4 bytes
|
||||||
|
|
||||||
|
let octets = char::from_u32(cp.code_point) |
||||||
|
.expect("valid unicode code point to char conversion failed") |
||||||
|
.encode_utf8(&mut buff); |
||||||
|
|
||||||
|
// v. For each element octet of Octets, do
|
||||||
|
for octet in octets.bytes() { |
||||||
|
// 1. Set R to the string-concatenation of:
|
||||||
|
// R
|
||||||
|
// "%"
|
||||||
|
// the String representation of octet, formatted as a two-digit uppercase
|
||||||
|
// hexadecimal number, padded to the left with a zero if necessary
|
||||||
|
r = format!("{r}%{octet:0>2X}"); |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/// The `Decode ( string, reservedSet )` abstract operation.
|
||||||
|
///
|
||||||
|
/// The abstract operation Decode takes arguments `string` (a String) and `reservedSet` (a String)
|
||||||
|
/// and returns either a normal completion containing a String or a throw completion. It performs
|
||||||
|
/// URI unescaping and decoding.
|
||||||
|
///
|
||||||
|
/// More information:
|
||||||
|
/// - [ECMAScript reference][spec]
|
||||||
|
///
|
||||||
|
/// [spec]: https://tc39.es/ecma262/#sec-decode
|
||||||
|
#[allow(clippy::many_single_char_names)] |
||||||
|
fn decode<F>(context: &mut Context, string: &JsString, reserved_set: F) -> JsResult<String> |
||||||
|
where |
||||||
|
F: Fn(u16) -> bool, |
||||||
|
{ |
||||||
|
let code_units = string.encode_utf16().collect::<Vec<_>>(); |
||||||
|
|
||||||
|
// 1. Let strLen be the length of string.
|
||||||
|
let str_len = code_units.len(); |
||||||
|
// 2. Let R be the empty String.
|
||||||
|
let mut r = Vec::new(); |
||||||
|
|
||||||
|
// 3. Let k be 0.
|
||||||
|
let mut k = 0; |
||||||
|
// 4. Repeat,
|
||||||
|
loop { |
||||||
|
// a. If k = strLen, return R.
|
||||||
|
if k == str_len { |
||||||
|
return Ok(String::from_utf16(&r).expect("invalid UTF-16 characters found")); |
||||||
|
} |
||||||
|
|
||||||
|
// b. Let C be the code unit at index k within string.
|
||||||
|
let c = code_units[k]; |
||||||
|
|
||||||
|
// c. If C is not the code unit 0x0025 (PERCENT SIGN), then
|
||||||
|
#[allow(clippy::if_not_else)] |
||||||
|
let s = if c != 0x0025_u16 { |
||||||
|
// i. Let S be the String value containing only the code unit C.
|
||||||
|
Vec::from([c]) |
||||||
|
} else { |
||||||
|
// d. Else,
|
||||||
|
// i. Let start be k.
|
||||||
|
let start = k; |
||||||
|
|
||||||
|
// ii. If k + 2 ≥ strLen, throw a URIError exception.
|
||||||
|
if k + 2 >= str_len { |
||||||
|
context.throw_uri_error("invalid escape character found")?; |
||||||
|
} |
||||||
|
|
||||||
|
// iii. If the code units at index (k + 1) and (k + 2) within string do not represent
|
||||||
|
// hexadecimal digits, throw a URIError exception.
|
||||||
|
// iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2).
|
||||||
|
let b = decode_hex_byte(code_units[k + 1], code_units[k + 2]) |
||||||
|
.ok_or_else(|| context.construct_uri_error("invalid hexadecimal digit found"))?; |
||||||
|
|
||||||
|
// v. Set k to k + 2.
|
||||||
|
k += 2; |
||||||
|
|
||||||
|
// vi. Let n be the number of leading 1 bits in B.
|
||||||
|
let n = leading_one_bits(b); |
||||||
|
|
||||||
|
// vii. If n = 0, then
|
||||||
|
if n == 0 { |
||||||
|
// 1. Let C be the code unit whose value is B.
|
||||||
|
let c = u16::from(b); |
||||||
|
|
||||||
|
// 2. If C is not in reservedSet, then
|
||||||
|
if !reserved_set(c) { |
||||||
|
// a. Let S be the String value containing only the code unit C.
|
||||||
|
Vec::from([c]) |
||||||
|
} else { |
||||||
|
// 3. Else,
|
||||||
|
// a. Let S be the substring of string from start to k + 1.
|
||||||
|
Vec::from(&code_units[start..=k]) |
||||||
|
} |
||||||
|
} else { |
||||||
|
// viii. Else,
|
||||||
|
// 1. If n = 1 or n > 4, throw a URIError exception.
|
||||||
|
if n == 1 || n > 4 { |
||||||
|
context.throw_uri_error("invalid escaped character found")?; |
||||||
|
} |
||||||
|
|
||||||
|
// 2. If k + (3 × (n - 1)) ≥ strLen, throw a URIError exception.
|
||||||
|
if k + (3 * (n - 1)) > str_len { |
||||||
|
context.throw_uri_error("non-terminated escape character found")?; |
||||||
|
} |
||||||
|
|
||||||
|
// 3. Let Octets be « B ».
|
||||||
|
let mut octets = Vec::from([b]); |
||||||
|
|
||||||
|
// 4. Let j be 1.
|
||||||
|
// 5. Repeat, while j < n,
|
||||||
|
for _j in 1..n { |
||||||
|
// a. Set k to k + 1.
|
||||||
|
k += 1; |
||||||
|
|
||||||
|
// b. If the code unit at index k within string is not the code unit 0x0025 (PERCENT SIGN), throw a URIError exception.
|
||||||
|
if code_units[k] != 0x0025 { |
||||||
|
context |
||||||
|
.throw_uri_error("escape characters must be preceded with a % sign")?; |
||||||
|
} |
||||||
|
|
||||||
|
// c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception.
|
||||||
|
// d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2).
|
||||||
|
let b = |
||||||
|
decode_hex_byte(code_units[k + 1], code_units[k + 2]).ok_or_else(|| { |
||||||
|
context.construct_uri_error("invalid hexadecimal digit found") |
||||||
|
})?; |
||||||
|
|
||||||
|
// e. Set k to k + 2.
|
||||||
|
k += 2; |
||||||
|
|
||||||
|
// f. Append B to Octets.
|
||||||
|
octets.push(b); |
||||||
|
|
||||||
|
// g. Set j to j + 1.
|
||||||
|
} |
||||||
|
|
||||||
|
// 6. Assert: The length of Octets is n.
|
||||||
|
assert_eq!(octets.len(), n); |
||||||
|
|
||||||
|
// 7. If Octets does not contain a valid UTF-8 encoding of a Unicode code point, throw a URIError exception.
|
||||||
|
match String::from_utf8(octets) { |
||||||
|
Err(_) => { |
||||||
|
return Err(context.construct_uri_error("invalid UTF-8 encoding found")) |
||||||
|
} |
||||||
|
Ok(v) => { |
||||||
|
// 8. Let V be the code point obtained by applying the UTF-8 transformation to Octets, that is, from a List of octets into a 21-bit value.
|
||||||
|
|
||||||
|
// 9. Let S be UTF16EncodeCodePoint(V).
|
||||||
|
// utf16_encode_codepoint(v)
|
||||||
|
v.encode_utf16().collect::<Vec<_>>() |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
// e. Set R to the string-concatenation of R and S.
|
||||||
|
r.extend_from_slice(&s); |
||||||
|
|
||||||
|
// f. Set k to k + 1.
|
||||||
|
k += 1; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/// Decodes a byte from two unicode code units.
|
||||||
|
fn decode_hex_byte(high: u16, low: u16) -> Option<u8> { |
||||||
|
match ( |
||||||
|
char::from_u32(u32::from(high)), |
||||||
|
char::from_u32(u32::from(low)), |
||||||
|
) { |
||||||
|
(Some(high), Some(low)) => match (high.to_digit(16), low.to_digit(16)) { |
||||||
|
(Some(high), Some(low)) => Some(((high as u8) << 4) + low as u8), |
||||||
|
_ => None, |
||||||
|
}, |
||||||
|
_ => None, |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/// Counts the number of leading 1 bits in a given byte.
|
||||||
|
#[inline] |
||||||
|
fn leading_one_bits(byte: u8) -> usize { |
||||||
|
// This uses a value table for speed
|
||||||
|
if byte == u8::MAX { |
||||||
|
8 |
||||||
|
} else if byte == 0b1111_1110 { |
||||||
|
7 |
||||||
|
} else if byte & 0b1111_1100 == 0b1111_1100 { |
||||||
|
6 |
||||||
|
} else if byte & 0b1111_1000 == 0b1111_1000 { |
||||||
|
5 |
||||||
|
} else if byte & 0b1111_0000 == 0b1111_0000 { |
||||||
|
4 |
||||||
|
} else if byte & 0b1110_0000 == 0b1110_0000 { |
||||||
|
3 |
||||||
|
} else if byte & 0b1100_0000 == 0b1100_0000 { |
||||||
|
2 |
||||||
|
} else if byte & 0b1000_0000 == 0b1000_0000 { |
||||||
|
1 |
||||||
|
} else { |
||||||
|
0 |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
#[cfg(test)] |
||||||
|
mod tests { |
||||||
|
use super::*; |
||||||
|
|
||||||
|
/// Checks if the `leading_one_bits()` function works as expected.
|
||||||
|
#[test] |
||||||
|
fn ut_leading_one_bits() { |
||||||
|
assert_eq!(leading_one_bits(0b1111_1111), 8); |
||||||
|
assert_eq!(leading_one_bits(0b1111_1110), 7); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b1111_1100), 6); |
||||||
|
assert_eq!(leading_one_bits(0b1111_1101), 6); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b1111_1011), 5); |
||||||
|
assert_eq!(leading_one_bits(0b1111_1000), 5); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b1111_0000), 4); |
||||||
|
assert_eq!(leading_one_bits(0b1111_0111), 4); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b1110_0000), 3); |
||||||
|
assert_eq!(leading_one_bits(0b1110_1111), 3); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b1100_0000), 2); |
||||||
|
assert_eq!(leading_one_bits(0b1101_1111), 2); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b1000_0000), 1); |
||||||
|
assert_eq!(leading_one_bits(0b1011_1111), 1); |
||||||
|
|
||||||
|
assert_eq!(leading_one_bits(0b0000_0000), 0); |
||||||
|
assert_eq!(leading_one_bits(0b0111_1111), 0); |
||||||
|
} |
||||||
|
|
||||||
|
/// Checks that the `decode_byte()` function works as expected.
|
||||||
|
#[test] |
||||||
|
fn ut_decode_byte() { |
||||||
|
// Sunny day tests
|
||||||
|
assert_eq!( |
||||||
|
decode_hex_byte(u16::from(b'2'), u16::from(b'0')).unwrap(), |
||||||
|
0x20 |
||||||
|
); |
||||||
|
assert_eq!( |
||||||
|
decode_hex_byte(u16::from(b'2'), u16::from(b'A')).unwrap(), |
||||||
|
0x2A |
||||||
|
); |
||||||
|
assert_eq!( |
||||||
|
decode_hex_byte(u16::from(b'3'), u16::from(b'C')).unwrap(), |
||||||
|
0x3C |
||||||
|
); |
||||||
|
assert_eq!( |
||||||
|
decode_hex_byte(u16::from(b'4'), u16::from(b'0')).unwrap(), |
||||||
|
0x40 |
||||||
|
); |
||||||
|
assert_eq!( |
||||||
|
decode_hex_byte(u16::from(b'7'), u16::from(b'E')).unwrap(), |
||||||
|
0x7E |
||||||
|
); |
||||||
|
assert_eq!( |
||||||
|
decode_hex_byte(u16::from(b'0'), u16::from(b'0')).unwrap(), |
||||||
|
0x00 |
||||||
|
); |
||||||
|
|
||||||
|
// Rainy day tests
|
||||||
|
assert!(decode_hex_byte(u16::from(b'-'), u16::from(b'0')).is_none()); |
||||||
|
assert!(decode_hex_byte(u16::from(b'f'), u16::from(b'~')).is_none()); |
||||||
|
assert!(decode_hex_byte(u16::from(b'A'), 0_u16).is_none()); |
||||||
|
assert!(decode_hex_byte(u16::from(b'%'), u16::from(b'&')).is_none()); |
||||||
|
|
||||||
|
assert!(decode_hex_byte(0xFACD_u16, u16::from(b'-')).is_none()); |
||||||
|
assert!(decode_hex_byte(u16::from(b'-'), 0xA0FD_u16).is_none()); |
||||||
|
} |
||||||
|
} |
Loading…
Reference in new issue