Files
rust/library/core/src/str/mod.rs

2948 lines
94 KiB
Rust
Raw Normal View History

// ignore-tidy-filelength
2019-02-08 14:53:55 +01:00
//! String manipulation.
//!
2020-03-05 08:52:46 +01:00
//! For more details, see the [`std::str`] module.
//!
//! [`std::str`]: ../../std/str/index.html
#![stable(feature = "rust1", since = "1.0.0")]
2020-09-04 07:34:23 +00:00
mod error;
mod iter;
mod traits;
2020-09-04 07:34:23 +00:00
use self::pattern::Pattern;
use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher};
2019-04-15 11:23:21 +09:00
use crate::char;
use crate::mem;
use crate::slice::{self, SliceIndex};
pub mod pattern;
#[unstable(feature = "str_internals", issue = "none")]
2018-04-05 15:55:28 +02:00
#[allow(missing_docs)]
pub mod lossy;
2020-09-04 07:34:23 +00:00
#[stable(feature = "rust1", since = "1.0.0")]
pub use error::{ParseBoolError, Utf8Error};
#[stable(feature = "rust1", since = "1.0.0")]
pub use traits::FromStr;
2014-11-15 15:52:00 +11:00
#[stable(feature = "rust1", since = "1.0.0")]
pub use iter::{Bytes, CharIndices, Chars, Lines, SplitWhitespace};
#[stable(feature = "rust1", since = "1.0.0")]
#[allow(deprecated)]
pub use iter::LinesAny;
#[stable(feature = "rust1", since = "1.0.0")]
pub use iter::{RSplit, RSplitTerminator, Split, SplitTerminator};
#[stable(feature = "rust1", since = "1.0.0")]
pub use iter::{RSplitN, SplitN};
#[stable(feature = "str_matches", since = "1.2.0")]
pub use iter::{Matches, RMatches};
#[stable(feature = "str_match_indices", since = "1.5.0")]
pub use iter::{MatchIndices, RMatchIndices};
#[stable(feature = "encode_utf16", since = "1.8.0")]
pub use iter::EncodeUtf16;
#[stable(feature = "str_escape", since = "1.34.0")]
pub use iter::{EscapeDebug, EscapeDefault, EscapeUnicode};
#[stable(feature = "split_ascii_whitespace", since = "1.34.0")]
pub use iter::SplitAsciiWhitespace;
#[unstable(feature = "split_inclusive", issue = "72360")]
use iter::SplitInclusive;
use iter::MatchIndicesInternal;
use iter::SplitInternal;
use iter::{MatchesInternal, SplitNInternal};
/*
Section: Creating a string
*/
/// Converts a slice of bytes to a string slice.
///
/// A string slice ([`&str`]) is made of bytes ([`u8`]), and a byte slice
/// ([`&[u8]`][byteslice]) is made of bytes, so this function converts between
/// the two. Not all byte slices are valid string slices, however: [`&str`] requires
/// that it is valid UTF-8. `from_utf8()` checks to ensure that the bytes are valid
/// UTF-8, and then does the conversion.
///
2020-07-17 12:44:44 -07:00
/// [`&str`]: str
/// [byteslice]: ../../std/primitive.slice.html
///
/// If you are sure that the byte slice is valid UTF-8, and you don't want to
/// incur the overhead of the validity check, there is an unsafe version of
/// this function, [`from_utf8_unchecked`], which has the same
/// behavior but skips the check.
///
/// If you need a `String` instead of a `&str`, consider
/// [`String::from_utf8`][string].
///
/// [string]: ../../std/string/struct.String.html#method.from_utf8
///
/// Because you can stack-allocate a `[u8; N]`, and you can take a
/// [`&[u8]`][byteslice] of it, this function is one way to have a
/// stack-allocated string. There is an example of this in the
/// examples section below.
///
/// [byteslice]: ../../std/primitive.slice.html
///
/// # Errors
std: Stabilize the std::str module This commit starts out by consolidating all `str` extension traits into one `StrExt` trait to be included in the prelude. This means that `UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into one `StrExt` exported by the standard library. Some functionality is currently duplicated with the `StrExt` present in libcore. This commit also currently avoids any methods which require any form of pattern to operate. These functions will be stabilized via a separate RFC. Next, stability of methods and structures are as follows: Stable * from_utf8_unchecked * CowString - after moving to std::string * StrExt::as_bytes * StrExt::as_ptr * StrExt::bytes/Bytes - also made a struct instead of a typedef * StrExt::char_indices/CharIndices - CharOffsets was renamed * StrExt::chars/Chars * StrExt::is_empty * StrExt::len * StrExt::lines/Lines * StrExt::lines_any/LinesAny * StrExt::slice_unchecked * StrExt::trim * StrExt::trim_left * StrExt::trim_right * StrExt::words/Words - also made a struct instead of a typedef Unstable * from_utf8 - the error type was changed to a `Result`, but the error type has yet to prove itself * from_c_str - this function will be handled by the c_str RFC * FromStr - this trait will have an associated error type eventually * StrExt::escape_default - needs iterators at least, unsure if it should make the cut * StrExt::escape_unicode - needs iterators at least, unsure if it should make the cut * StrExt::slice_chars - this function has yet to prove itself * StrExt::slice_shift_char - awaiting conventions about slicing and shifting * StrExt::graphemes/Graphemes - this functionality may only be in libunicode * StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in libunicode * StrExt::width - this functionality may only be in libunicode * StrExt::utf16_units - this functionality may only be in libunicode * StrExt::nfd_chars - this functionality may only be in libunicode * 
StrExt::nfkd_chars - this functionality may only be in libunicode * StrExt::nfc_chars - this functionality may only be in libunicode * StrExt::nfkc_chars - this functionality may only be in libunicode * StrExt::is_char_boundary - naming is uncertain with container conventions * StrExt::char_range_at - naming is uncertain with container conventions * StrExt::char_range_at_reverse - naming is uncertain with container conventions * StrExt::char_at - naming is uncertain with container conventions * StrExt::char_at_reverse - naming is uncertain with container conventions * StrVector::concat - this functionality may be replaced with iterators, but it's not certain at this time * StrVector::connect - as with concat, may be deprecated in favor of iterators Deprecated * StrAllocating and UnicodeStrPrelude have been merged into StrExit * eq_slice - compiler implementation detail * from_str - use the inherent parse() method * is_utf8 - call from_utf8 instead * replace - call the method instead * truncate_utf16_at_nul - this is an implementation detail of windows and does not need to be exposed. * utf8_char_width - moved to libunicode * utf16_items - moved to libunicode * is_utf16 - moved to libunicode * Utf16Items - moved to libunicode * Utf16Item - moved to libunicode * Utf16Encoder - moved to libunicode * AnyLines - renamed to LinesAny and made a struct * SendStr - use CowString<'static> instead * str::raw - all functionality is deprecated * StrExt::into_string - call to_string() instead * StrExt::repeat - use iterators instead * StrExt::char_len - use .chars().count() instead * StrExt::is_alphanumeric - use .chars().all(..) * StrExt::is_whitespace - use .chars().all(..) Pending deprecation -- while slicing syntax is being worked out, these methods are all #[unstable] * Str - while currently used for generic programming, this trait will be replaced with one of [], deref coercions, or a generic conversion trait. 
* StrExt::slice - use slicing syntax instead * StrExt::slice_to - use slicing syntax instead * StrExt::slice_from - use slicing syntax instead * StrExt::lev_distance - deprecated with no replacement Awaiting stabilization due to patterns and/or matching * StrExt::contains * StrExt::contains_char * StrExt::split * StrExt::splitn * StrExt::split_terminator * StrExt::rsplitn * StrExt::match_indices * StrExt::split_str * StrExt::starts_with * StrExt::ends_with * StrExt::trim_chars * StrExt::trim_left_chars * StrExt::trim_right_chars * StrExt::find * StrExt::rfind * StrExt::find_str * StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
///
/// Returns `Err` if the slice is not UTF-8 with a description as to why the
/// provided slice is not UTF-8.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // some bytes, in a vector
/// let sparkle_heart = vec![240, 159, 146, 150];
///
/// // We know these bytes are valid, so just use `unwrap()`.
/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap();
///
/// assert_eq!("💖", sparkle_heart);
/// ```
///
/// Incorrect bytes:
///
/// ```
/// use std::str;
///
/// // some invalid bytes, in a vector
/// let sparkle_heart = vec![0, 159, 146, 150];
///
/// assert!(str::from_utf8(&sparkle_heart).is_err());
/// ```
///
/// See the docs for [`Utf8Error`] for more details on the kinds of
/// errors that can be returned.
///
/// A "stack allocated string":
///
/// ```
/// use std::str;
///
/// // some bytes, in a stack-allocated array
/// let sparkle_heart = [240, 159, 146, 150];
///
/// // We know these bytes are valid, so just use `unwrap()`.
/// let sparkle_heart = str::from_utf8(&sparkle_heart).unwrap();
///
/// assert_eq!("💖", sparkle_heart);
/// ```
2015-01-23 21:48:20 -08:00
#[stable(feature = "rust1", since = "1.0.0")]
std: Stabilize the std::str module This commit starts out by consolidating all `str` extension traits into one `StrExt` trait to be included in the prelude. This means that `UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into one `StrExt` exported by the standard library. Some functionality is currently duplicated with the `StrExt` present in libcore. This commit also currently avoids any methods which require any form of pattern to operate. These functions will be stabilized via a separate RFC. Next, stability of methods and structures are as follows: Stable * from_utf8_unchecked * CowString - after moving to std::string * StrExt::as_bytes * StrExt::as_ptr * StrExt::bytes/Bytes - also made a struct instead of a typedef * StrExt::char_indices/CharIndices - CharOffsets was renamed * StrExt::chars/Chars * StrExt::is_empty * StrExt::len * StrExt::lines/Lines * StrExt::lines_any/LinesAny * StrExt::slice_unchecked * StrExt::trim * StrExt::trim_left * StrExt::trim_right * StrExt::words/Words - also made a struct instead of a typedef Unstable * from_utf8 - the error type was changed to a `Result`, but the error type has yet to prove itself * from_c_str - this function will be handled by the c_str RFC * FromStr - this trait will have an associated error type eventually * StrExt::escape_default - needs iterators at least, unsure if it should make the cut * StrExt::escape_unicode - needs iterators at least, unsure if it should make the cut * StrExt::slice_chars - this function has yet to prove itself * StrExt::slice_shift_char - awaiting conventions about slicing and shifting * StrExt::graphemes/Graphemes - this functionality may only be in libunicode * StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in libunicode * StrExt::width - this functionality may only be in libunicode * StrExt::utf16_units - this functionality may only be in libunicode * StrExt::nfd_chars - this functionality may only be in libunicode * 
StrExt::nfkd_chars - this functionality may only be in libunicode * StrExt::nfc_chars - this functionality may only be in libunicode * StrExt::nfkc_chars - this functionality may only be in libunicode * StrExt::is_char_boundary - naming is uncertain with container conventions * StrExt::char_range_at - naming is uncertain with container conventions * StrExt::char_range_at_reverse - naming is uncertain with container conventions * StrExt::char_at - naming is uncertain with container conventions * StrExt::char_at_reverse - naming is uncertain with container conventions * StrVector::concat - this functionality may be replaced with iterators, but it's not certain at this time * StrVector::connect - as with concat, may be deprecated in favor of iterators Deprecated * StrAllocating and UnicodeStrPrelude have been merged into StrExit * eq_slice - compiler implementation detail * from_str - use the inherent parse() method * is_utf8 - call from_utf8 instead * replace - call the method instead * truncate_utf16_at_nul - this is an implementation detail of windows and does not need to be exposed. * utf8_char_width - moved to libunicode * utf16_items - moved to libunicode * is_utf16 - moved to libunicode * Utf16Items - moved to libunicode * Utf16Item - moved to libunicode * Utf16Encoder - moved to libunicode * AnyLines - renamed to LinesAny and made a struct * SendStr - use CowString<'static> instead * str::raw - all functionality is deprecated * StrExt::into_string - call to_string() instead * StrExt::repeat - use iterators instead * StrExt::char_len - use .chars().count() instead * StrExt::is_alphanumeric - use .chars().all(..) * StrExt::is_whitespace - use .chars().all(..) Pending deprecation -- while slicing syntax is being worked out, these methods are all #[unstable] * Str - while currently used for generic programming, this trait will be replaced with one of [], deref coercions, or a generic conversion trait. 
* StrExt::slice - use slicing syntax instead * StrExt::slice_to - use slicing syntax instead * StrExt::slice_from - use slicing syntax instead * StrExt::lev_distance - deprecated with no replacement Awaiting stabilization due to patterns and/or matching * StrExt::contains * StrExt::contains_char * StrExt::split * StrExt::splitn * StrExt::split_terminator * StrExt::rsplitn * StrExt::match_indices * StrExt::split_str * StrExt::starts_with * StrExt::ends_with * StrExt::trim_chars * StrExt::trim_left_chars * StrExt::trim_right_chars * StrExt::find * StrExt::rfind * StrExt::find_str * StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
run_utf8_validation(v)?;
2019-12-26 12:56:34 -08:00
// SAFETY: Just ran validation.
std: Stabilize the std::str module This commit starts out by consolidating all `str` extension traits into one `StrExt` trait to be included in the prelude. This means that `UnicodeStrPrelude`, `StrPrelude`, and `StrAllocating` have all been merged into one `StrExt` exported by the standard library. Some functionality is currently duplicated with the `StrExt` present in libcore. This commit also currently avoids any methods which require any form of pattern to operate. These functions will be stabilized via a separate RFC. Next, stability of methods and structures are as follows: Stable * from_utf8_unchecked * CowString - after moving to std::string * StrExt::as_bytes * StrExt::as_ptr * StrExt::bytes/Bytes - also made a struct instead of a typedef * StrExt::char_indices/CharIndices - CharOffsets was renamed * StrExt::chars/Chars * StrExt::is_empty * StrExt::len * StrExt::lines/Lines * StrExt::lines_any/LinesAny * StrExt::slice_unchecked * StrExt::trim * StrExt::trim_left * StrExt::trim_right * StrExt::words/Words - also made a struct instead of a typedef Unstable * from_utf8 - the error type was changed to a `Result`, but the error type has yet to prove itself * from_c_str - this function will be handled by the c_str RFC * FromStr - this trait will have an associated error type eventually * StrExt::escape_default - needs iterators at least, unsure if it should make the cut * StrExt::escape_unicode - needs iterators at least, unsure if it should make the cut * StrExt::slice_chars - this function has yet to prove itself * StrExt::slice_shift_char - awaiting conventions about slicing and shifting * StrExt::graphemes/Graphemes - this functionality may only be in libunicode * StrExt::grapheme_indices/GraphemeIndices - this functionality may only be in libunicode * StrExt::width - this functionality may only be in libunicode * StrExt::utf16_units - this functionality may only be in libunicode * StrExt::nfd_chars - this functionality may only be in libunicode * 
StrExt::nfkd_chars - this functionality may only be in libunicode * StrExt::nfc_chars - this functionality may only be in libunicode * StrExt::nfkc_chars - this functionality may only be in libunicode * StrExt::is_char_boundary - naming is uncertain with container conventions * StrExt::char_range_at - naming is uncertain with container conventions * StrExt::char_range_at_reverse - naming is uncertain with container conventions * StrExt::char_at - naming is uncertain with container conventions * StrExt::char_at_reverse - naming is uncertain with container conventions * StrVector::concat - this functionality may be replaced with iterators, but it's not certain at this time * StrVector::connect - as with concat, may be deprecated in favor of iterators Deprecated * StrAllocating and UnicodeStrPrelude have been merged into StrExit * eq_slice - compiler implementation detail * from_str - use the inherent parse() method * is_utf8 - call from_utf8 instead * replace - call the method instead * truncate_utf16_at_nul - this is an implementation detail of windows and does not need to be exposed. * utf8_char_width - moved to libunicode * utf16_items - moved to libunicode * is_utf16 - moved to libunicode * Utf16Items - moved to libunicode * Utf16Item - moved to libunicode * Utf16Encoder - moved to libunicode * AnyLines - renamed to LinesAny and made a struct * SendStr - use CowString<'static> instead * str::raw - all functionality is deprecated * StrExt::into_string - call to_string() instead * StrExt::repeat - use iterators instead * StrExt::char_len - use .chars().count() instead * StrExt::is_alphanumeric - use .chars().all(..) * StrExt::is_whitespace - use .chars().all(..) Pending deprecation -- while slicing syntax is being worked out, these methods are all #[unstable] * Str - while currently used for generic programming, this trait will be replaced with one of [], deref coercions, or a generic conversion trait. 
* StrExt::slice - use slicing syntax instead * StrExt::slice_to - use slicing syntax instead * StrExt::slice_from - use slicing syntax instead * StrExt::lev_distance - deprecated with no replacement Awaiting stabilization due to patterns and/or matching * StrExt::contains * StrExt::contains_char * StrExt::split * StrExt::splitn * StrExt::split_terminator * StrExt::rsplitn * StrExt::match_indices * StrExt::split_str * StrExt::starts_with * StrExt::ends_with * StrExt::trim_chars * StrExt::trim_left_chars * StrExt::trim_right_chars * StrExt::find * StrExt::rfind * StrExt::find_str * StrExt::subslice_offset
2014-12-10 09:02:31 -08:00
Ok(unsafe { from_utf8_unchecked(v) })
}
/// Converts a mutable slice of bytes to a mutable string slice.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // "Hello, Rust!" as a mutable vector
/// let mut hellorust = vec![72, 101, 108, 108, 111, 44, 32, 82, 117, 115, 116, 33];
///
/// // As we know these bytes are valid, we can use `unwrap()`
/// let outstr = str::from_utf8_mut(&mut hellorust).unwrap();
///
/// assert_eq!("Hello, Rust!", outstr);
/// ```
2017-09-10 14:12:23 +01:00
///
2017-09-10 04:33:24 +01:00
/// Incorrect bytes:
///
/// ```
/// use std::str;
2017-09-10 14:25:23 +01:00
///
/// // Some invalid bytes in a mutable vector
/// let mut invalid = vec![128, 223];
///
/// assert!(str::from_utf8_mut(&mut invalid).is_err());
/// ```
/// See the docs for [`Utf8Error`] for more details on the kinds of
/// errors that can be returned.
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
run_utf8_validation(v)?;
2019-12-26 12:56:34 -08:00
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked_mut(v) })
}
/// Converts a slice of bytes to a string slice without checking
/// that the string contains valid UTF-8.
///
/// See the safe version, [`from_utf8`], for more information.
///
/// # Safety
///
/// This function is unsafe because it does not check that the bytes passed to
/// it are valid UTF-8. If this constraint is violated, undefined behavior
2017-03-29 13:21:31 -04:00
/// results, as the rest of Rust assumes that [`&str`]s are valid UTF-8.
///
2020-07-17 12:44:44 -07:00
/// [`&str`]: str
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// // some bytes, in a vector
/// let sparkle_heart = vec![240, 159, 146, 150];
///
/// let sparkle_heart = unsafe {
/// str::from_utf8_unchecked(&sparkle_heart)
/// };
///
/// assert_eq!("💖", sparkle_heart);
/// ```
#[inline]
2015-01-23 21:48:20 -08:00
#[stable(feature = "rust1", since = "1.0.0")]
2020-08-05 14:05:57 -03:00
#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked", issue = "75196")]
2020-08-04 14:38:42 -03:00
#[allow(unused_attributes)]
2020-08-13 15:56:23 -03:00
#[allow_internal_unstable(const_fn_transmute)]
2020-08-04 14:38:42 -03:00
pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
// SAFETY: the caller must guarantee that the bytes `v` are valid UTF-8.
// Also relies on `&str` and `&[u8]` having the same layout.
2020-08-13 16:08:22 -03:00
unsafe { mem::transmute(v) }
}
/// Converts a slice of bytes to a string slice without checking
/// that the string contains valid UTF-8; mutable version.
///
2020-07-17 12:44:44 -07:00
/// See the immutable version, [`from_utf8_unchecked()`] for more information.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::str;
///
/// let mut heart = vec![240, 159, 146, 150];
/// let heart = unsafe { str::from_utf8_unchecked_mut(&mut heart) };
///
/// assert_eq!("💖", heart);
/// ```
#[inline]
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
2020-06-25 18:46:59 +02:00
// SAFETY: the caller must guarantee that the bytes `v`
// are valid UTF-8, thus the cast to `*mut str` is safe.
// Also, the pointer dereference is safe because that pointer
// comes from a reference which is guaranteed to be valid for writes.
unsafe { &mut *(v as *mut [u8] as *mut str) }
2015-03-14 19:34:21 -04:00
}
/// Produces the starting code point accumulator from a sequence's first byte.
///
/// Only the low payload bits of the first byte are kept: the mask is
/// `0x7F >> width`, which leaves 5 bits for width 2, 4 bits for width 3 and
/// 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    u32::from(byte & payload_mask)
}
/// Folds the continuation byte `byte` into the accumulator `ch`.
///
/// The accumulator is shifted left by six bits and the bits of `byte`
/// selected by `CONT_MASK` are OR-ed into the vacated positions.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = (byte & CONT_MASK) as u32;
    let shifted = ch << 6;
    shifted | payload
}
/// Reports whether `byte` is a UTF-8 continuation byte, i.e. whether its
/// leading bits are `10`.
///
/// The bits outside `CONT_MASK` are isolated and compared against
/// `TAG_CONT_U8`.
#[inline]
fn utf8_is_cont_byte(byte: u8) -> bool {
    let tag_bits = byte & !CONT_MASK;
    tag_bits == TAG_CONT_U8
}
/// Yields the referenced byte, or `0` when no byte is available.
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    opt.copied().unwrap_or(0)
}
/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// Returns `None` only when the iterator yields no first byte. After a
/// first byte has been read, every further byte the iterator fails to supply
/// is substituted with `0` (see `unwrap_or_0`). The bytes are not validated
/// here; the first byte alone decides how many further bytes are consumed.
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
// Decode UTF-8
// An exhausted iterator propagates `None` through `?`.
let x = *bytes.next()?;
// Values below 128 are returned unchanged as single-byte code points.
if x < 128 {
return Some(x as u32);
}
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
// `init` is masked as for a two-byte sequence; the longer cases below
// reuse it and adjust for their narrower payload.
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(bytes.next());
// Two-byte result; overwritten below when `x` announces a longer sequence.
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
// Combine the two continuation bytes first, then place `init` above them.
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Some(ch)
}
/// Reads the last code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
#[inline]
fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
I: DoubleEndedIterator<Item = &'a u8>,
{
// Decode UTF-8
let w = match *bytes.next_back()? {
next_byte if next_byte < 128 => return Some(next_byte as u32),
back_byte => back_byte,
};
When possible without changing semantics, implement Iterator::last in terms of DoubleEndedIterator::next_back for types in liballoc and libcore. Provided that the iterator has finite length and does not trigger user-provided code, this is safe. What follows is a full list of the DoubleEndedIterators in liballoc/libcore and whether this optimization is safe, and if not, why not. src/liballoc/boxed.rs Box: Pass through to avoid defeating optimization of the underlying DoubleIterator implementation. This has no correctness impact. src/liballoc/collections/binary_heap.rs Iter: Pass through to avoid defeating optimizations on slice::Iter IntoIter: Not safe, changes Drop order Drain: Not safe, changes Drop order src/liballoc/collections/btree/map.rs Iter: Safe to call next_back, invokes no user defined code. IterMut: ditto IntoIter: Not safe, changes Drop order Keys: Safe to call next_back, invokes no user defined code. Values: ditto ValuesMut: ditto Range: ditto RangeMut: ditto src/liballoc/collections/btree/set.rs Iter: Safe to call next_back, invokes no user defined code. IntoIter: Not safe, changes Drop order Range: Safe to call next_back, invokes no user defined code. src/liballoc/collections/linked_list.rs Iter: Safe to call next_back, invokes no user defined code. IterMut: ditto IntoIter: Not safe, changes Drop order src/liballoc/collections/vec_deque.rs Iter: Safe to call next_back, invokes no user defined code. 
IterMut: ditto IntoIter: Not safe, changes Drop order Drain: ditto src/liballoc/string.rs Drain: Safe because return type is a primitive (char) src/liballoc/vec.rs IntoIter: Not safe, changes Drop order Drain: ditto Splice: ditto src/libcore/ascii.rs EscapeDefault: Safe because return type is a primitive (u8) src/libcore/iter/adapters/chain.rs Chain: Not safe, invokes user defined code (Iterator impl) src/libcore/iter/adapters/flatten.rs FlatMap: Not safe, invokes user defined code (Iterator impl) Flatten: ditto FlattenCompat: ditto src/libcore/iter/adapters/mod.rs Rev: Not safe, invokes user defined code (Iterator impl) Copied: ditto Cloned: Not safe, invokes user defined code (Iterator impl and T::clone) Map: Not safe, invokes user defined code (Iterator impl + closure) Filter: ditto FilterMap: ditto Enumerate: Not safe, invokes user defined code (Iterator impl) Skip: ditto Fuse: ditto Inspect: ditto src/libcore/iter/adapters/zip.rs Zip: Not safe, invokes user defined code (Iterator impl) src/libcore/iter/range.rs ops::Range: Not safe, changes Drop order, but ALREADY HAS SPECIALIZATION ops::RangeInclusive: ditto src/libcore/iter/sources.rs Repeat: Not safe, calling last should iloop. Empty: No point, iterator is at most one item long. Once: ditto OnceWith: ditto src/libcore/option.rs Item: No point, iterator is at most one item long. Iter: ditto IterMut: ditto IntoIter: ditto src/libcore/result.rs Iter: No point, iterator is at most one item long IterMut: ditto IntoIter: ditto src/libcore/slice/mod.rs Split: Not safe, invokes user defined closure SplitMut: ditto RSplit: ditto RSplitMut: ditto Windows: Safe, already has specialization Chunks: ditto ChunksMut: ditto ChunksExact: ditto ChunksExactMut: ditto RChunks: ditto RChunksMut: ditto RChunksExact: ditto RChunksExactMut: ditto src/libcore/str/mod.rs Chars: Safe, already has specialization CharIndices: ditto Bytes: ditto Lines: Safe to call next_back, invokes no user defined code. 
LinesAny: Deprecated Everything that is generic over P: Pattern: Not safe because Pattern invokes user defined code. SplitWhitespace: Safe to call next_back, invokes no user defined code. SplitAsciiWhitespace: ditto
2019-07-02 13:45:29 -07:00
// Multibyte case follows
// Decode from a byte combination out of: [x [y [z w]]]
let mut ch;
let z = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(z, 2);
if utf8_is_cont_byte(z) {
let y = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(y, 3);
if utf8_is_cont_byte(y) {
let x = unwrap_or_0(bytes.next_back());
ch = utf8_first_byte(x, 4);
ch = utf8_acc_cont_byte(ch, y);
}
ch = utf8_acc_cont_byte(ch, z);
When possible without changing semantics, implement Iterator::last in terms of DoubleEndedIterator::next_back for types in liballoc and libcore. Provided that the iterator has finite length and does not trigger user-provided code, this is safe. What follows is a full list of the DoubleEndedIterators in liballoc/libcore and whether this optimization is safe, and if not, why not. src/liballoc/boxed.rs Box: Pass through to avoid defeating optimization of the underlying DoubleIterator implementation. This has no correctness impact. src/liballoc/collections/binary_heap.rs Iter: Pass through to avoid defeating optimizations on slice::Iter IntoIter: Not safe, changes Drop order Drain: Not safe, changes Drop order src/liballoc/collections/btree/map.rs Iter: Safe to call next_back, invokes no user defined code. IterMut: ditto IntoIter: Not safe, changes Drop order Keys: Safe to call next_back, invokes no user defined code. Values: ditto ValuesMut: ditto Range: ditto RangeMut: ditto src/liballoc/collections/btree/set.rs Iter: Safe to call next_back, invokes no user defined code. IntoIter: Not safe, changes Drop order Range: Safe to call next_back, invokes no user defined code. src/liballoc/collections/linked_list.rs Iter: Safe to call next_back, invokes no user defined code. IterMut: ditto IntoIter: Not safe, changes Drop order src/liballoc/collections/vec_deque.rs Iter: Safe to call next_back, invokes no user defined code. 
IterMut: ditto IntoIter: Not safe, changes Drop order Drain: ditto src/liballoc/string.rs Drain: Safe because return type is a primitive (char) src/liballoc/vec.rs IntoIter: Not safe, changes Drop order Drain: ditto Splice: ditto src/libcore/ascii.rs EscapeDefault: Safe because return type is a primitive (u8) src/libcore/iter/adapters/chain.rs Chain: Not safe, invokes user defined code (Iterator impl) src/libcore/iter/adapters/flatten.rs FlatMap: Not safe, invokes user defined code (Iterator impl) Flatten: ditto FlattenCompat: ditto src/libcore/iter/adapters/mod.rs Rev: Not safe, invokes user defined code (Iterator impl) Copied: ditto Cloned: Not safe, invokes user defined code (Iterator impl and T::clone) Map: Not safe, invokes user defined code (Iterator impl + closure) Filter: ditto FilterMap: ditto Enumerate: Not safe, invokes user defined code (Iterator impl) Skip: ditto Fuse: ditto Inspect: ditto src/libcore/iter/adapters/zip.rs Zip: Not safe, invokes user defined code (Iterator impl) src/libcore/iter/range.rs ops::Range: Not safe, changes Drop order, but ALREADY HAS SPECIALIZATION ops::RangeInclusive: ditto src/libcore/iter/sources.rs Repeat: Not safe, calling last should iloop. Empty: No point, iterator is at most one item long. Once: ditto OnceWith: ditto src/libcore/option.rs Item: No point, iterator is at most one item long. Iter: ditto IterMut: ditto IntoIter: ditto src/libcore/result.rs Iter: No point, iterator is at most one item long IterMut: ditto IntoIter: ditto src/libcore/slice/mod.rs Split: Not safe, invokes user defined closure SplitMut: ditto RSplit: ditto RSplitMut: ditto Windows: Safe, already has specialization Chunks: ditto ChunksMut: ditto ChunksExact: ditto ChunksExactMut: ditto RChunks: ditto RChunksMut: ditto RChunksExact: ditto RChunksExactMut: ditto src/libcore/str/mod.rs Chars: Safe, already has specialization CharIndices: ditto Bytes: ditto Lines: Safe to call next_back, invokes no user defined code. 
LinesAny: Deprecated Everything that is generic over P: Pattern: Not safe because Pattern invokes user defined code. SplitWhitespace: Safe to call next_back, invokes no user defined code. SplitAsciiWhitespace: ditto
2019-07-02 13:45:29 -07:00
}
ch = utf8_acc_cont_byte(ch, w);
Some(ch)
2015-03-14 19:34:21 -04:00
}
impl_fn_for_zst! {
    /// A nameable, cloneable fn type
    #[derive(Clone)]
    struct LinesAnyMap impl<'a> Fn = |line: &'a str| -> &'a str {
        // Drop a single trailing `\r`, if present; otherwise return the
        // line unchanged.
        let l = line.len();
        if l > 0 && line.as_bytes()[l - 1] == b'\r' { &line[0 .. l - 1] }
        else { line }
    };
}
/*
Section: UTF-8 validation
*/
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
// A word with the high bit of every byte set (0x80 repeated).
// Written as a `u64` literal and cast with `as`, so on targets where `usize`
// is narrower than 64 bits the value is truncated to fit.
const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
///
/// A byte is >= 128 exactly when its top bit is set, so masking `x` with
/// `NONASCII_MASK` tests every byte of the word in a single operation.
#[inline]
fn contains_nonascii(x: usize) -> bool {
    (x & NONASCII_MASK) != 0
}
/// Walks through `v` checking that it's a valid UTF-8 sequence,
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
#[inline(always)]
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
let len = v.len();
let usize_bytes = mem::size_of::<usize>();
let ascii_block_size = 2 * usize_bytes;
let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
Optimize pointer alignment in utf8 validation This uses (and reuses) the u8 arrays's inherent block alignment when checking whether the current index is block aligned. I initially thought that this would just move the expensive `align_offset` call out of the while loop and replace it with a subtraction and bitwise AND. But it appears this optimizes much better, too... before: https://rust.godbolt.org/z/WIPvWl after: https://rust.godbolt.org/z/-jBPoW https://github.com/jridgewell/faster-from_utf8/tree/pointer-alignment ``` test from_utf8_2_bytes_fast ... bench: 310 ns/iter (+/- 42) = 1290 MB/s test from_utf8_2_bytes_regular ... bench: 309 ns/iter (+/- 24) = 1294 MB/s test from_utf8_3_bytes_fast ... bench: 1,027 ns/iter (+/- 62) = 1168 MB/s test from_utf8_3_bytes_regular ... bench: 1,513 ns/iter (+/- 611) = 793 MB/s test from_utf8_4_bytes_fast ... bench: 1,788 ns/iter (+/- 26) = 1342 MB/s test from_utf8_4_bytes_regular ... bench: 1,907 ns/iter (+/- 181) = 1258 MB/s test from_utf8_all_bytes_fast ... bench: 3,463 ns/iter (+/- 97) = 1155 MB/s test from_utf8_all_bytes_regular ... bench: 4,083 ns/iter (+/- 89) = 979 MB/s test from_utf8_ascii_fast ... bench: 88 ns/iter (+/- 4) = 28988 MB/s test from_utf8_ascii_regular ... bench: 88 ns/iter (+/- 8) = 28988 MB/s test from_utf8_cyr_fast ... bench: 7,707 ns/iter (+/- 531) = 665 MB/s test from_utf8_cyr_regular ... bench: 8,202 ns/iter (+/- 135) = 625 MB/s test from_utf8_enwik8_fast ... bench: 1,135,756 ns/iter (+/- 84,450) = 8804 MB/s test from_utf8_enwik8_regular ... bench: 1,145,468 ns/iter (+/- 79,601) = 8730 MB/s test from_utf8_jawik10_fast ... bench: 12,723,844 ns/iter (+/- 473,247) = 785 MB/s test from_utf8_jawik10_regular ... bench: 13,384,596 ns/iter (+/- 666,997) = 747 MB/s test from_utf8_mixed_fast ... bench: 2,321 ns/iter (+/- 123) = 2081 MB/s test from_utf8_mixed_regular ... bench: 2,702 ns/iter (+/- 408) = 1788 MB/s test from_utf8_mostlyasc_fast ... 
bench: 249 ns/iter (+/- 10) = 14666 MB/s test from_utf8_mostlyasc_regular ... bench: 276 ns/iter (+/- 5) = 13231 MB/s ```
2019-05-29 23:33:35 -04:00
let align = v.as_ptr().align_offset(usize_bytes);
while index < len {
let old_offset = index;
macro_rules! err {
($error_len: expr) => {
2019-12-22 17:42:04 -05:00
return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len });
};
}
2019-12-22 17:42:04 -05:00
macro_rules! next {
() => {{
index += 1;
// we needed data, but there was none: error!
if index >= len {
err!(None)
}
v[index]
}};
}
let first = v[index];
if first >= 128 {
2015-03-10 12:06:44 +01:00
let w = UTF8_CHAR_WIDTH[first as usize];
2014-12-09 14:08:10 -08:00
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
// first C2 80 last DF BF
2014-12-09 14:08:10 -08:00
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
// first E0 A0 80 last EF BF BF
2014-12-09 14:08:10 -08:00
// excluding surrogates codepoints \u{d800} to \u{dfff}
// ED A0 80 to ED BF BF
2014-12-09 14:08:10 -08:00
// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w {
2019-12-22 17:42:04 -05:00
2 => {
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(1))
}
}
3 => {
match (first, next!()) {
2019-12-22 17:42:04 -05:00
(0xE0, 0xA0..=0xBF)
| (0xE1..=0xEC, 0x80..=0xBF)
| (0xED, 0x80..=0x9F)
| (0xEE..=0xEF, 0x80..=0xBF) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
}
4 => {
match (first, next!()) {
2019-12-22 17:42:04 -05:00
(0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
_ => err!(Some(1)),
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(3))
}
}
2019-12-22 17:42:04 -05:00
_ => err!(Some(1)),
}
index += 1;
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
} else {
// Ascii case, try to skip forward quickly.
// When the pointer is aligned, read 2 words of data per iteration
// until we find a word containing a non-ascii byte.
2020-06-02 07:59:11 +00:00
if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
Optimize pointer alignment in utf8 validation This uses (and reuses) the u8 arrays's inherent block alignment when checking whether the current index is block aligned. I initially thought that this would just move the expensive `align_offset` call out of the while loop and replace it with a subtraction and bitwise AND. But it appears this optimizes much better, too... before: https://rust.godbolt.org/z/WIPvWl after: https://rust.godbolt.org/z/-jBPoW https://github.com/jridgewell/faster-from_utf8/tree/pointer-alignment ``` test from_utf8_2_bytes_fast ... bench: 310 ns/iter (+/- 42) = 1290 MB/s test from_utf8_2_bytes_regular ... bench: 309 ns/iter (+/- 24) = 1294 MB/s test from_utf8_3_bytes_fast ... bench: 1,027 ns/iter (+/- 62) = 1168 MB/s test from_utf8_3_bytes_regular ... bench: 1,513 ns/iter (+/- 611) = 793 MB/s test from_utf8_4_bytes_fast ... bench: 1,788 ns/iter (+/- 26) = 1342 MB/s test from_utf8_4_bytes_regular ... bench: 1,907 ns/iter (+/- 181) = 1258 MB/s test from_utf8_all_bytes_fast ... bench: 3,463 ns/iter (+/- 97) = 1155 MB/s test from_utf8_all_bytes_regular ... bench: 4,083 ns/iter (+/- 89) = 979 MB/s test from_utf8_ascii_fast ... bench: 88 ns/iter (+/- 4) = 28988 MB/s test from_utf8_ascii_regular ... bench: 88 ns/iter (+/- 8) = 28988 MB/s test from_utf8_cyr_fast ... bench: 7,707 ns/iter (+/- 531) = 665 MB/s test from_utf8_cyr_regular ... bench: 8,202 ns/iter (+/- 135) = 625 MB/s test from_utf8_enwik8_fast ... bench: 1,135,756 ns/iter (+/- 84,450) = 8804 MB/s test from_utf8_enwik8_regular ... bench: 1,145,468 ns/iter (+/- 79,601) = 8730 MB/s test from_utf8_jawik10_fast ... bench: 12,723,844 ns/iter (+/- 473,247) = 785 MB/s test from_utf8_jawik10_regular ... bench: 13,384,596 ns/iter (+/- 666,997) = 747 MB/s test from_utf8_mixed_fast ... bench: 2,321 ns/iter (+/- 123) = 2081 MB/s test from_utf8_mixed_regular ... bench: 2,702 ns/iter (+/- 408) = 1788 MB/s test from_utf8_mostlyasc_fast ... 
bench: 249 ns/iter (+/- 10) = 14666 MB/s test from_utf8_mostlyasc_regular ... bench: 276 ns/iter (+/- 5) = 13231 MB/s ```
2019-05-29 23:33:35 -04:00
let ptr = v.as_ptr();
while index < blocks_end {
2019-12-26 12:56:34 -08:00
// SAFETY: since `align - index` and `ascii_block_size` are
// multiples of `usize_bytes`, `block = ptr.add(index)` is
// always aligned with a `usize` so it's safe to dereference
// both `block` and `block.offset(1)`.
unsafe {
let block = ptr.add(index) as *const usize;
// break if there is a nonascii byte
let zu = contains_nonascii(*block);
let zv = contains_nonascii(*block.offset(1));
if zu | zv {
break;
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
}
}
index += ascii_block_size;
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
}
// step from the point where the wordwise loop stopped
while index < len && v[index] < 128 {
index += 1;
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
}
} else {
index += 1;
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
}
}
}
Add fast path for ASCII in UTF-8 validation This speeds up the ascii case (and long stretches of ascii in otherwise mixed UTF-8 data) when checking UTF-8 validity. Benchmark results suggest that on purely ASCII input, we can improve throughput (megabytes verified / second) by a factor of 13 to 14! On xml and mostly english language input (en.wikipedia xml dump), throughput increases by a factor 7. On mostly non-ASCII input, performance increases slightly or is the same. The UTF-8 validation is rewritten to use indexed access; since all access is preceded by a (mandatory for validation) length check, they are statically elided by llvm and this formulation is in fact the best for performance. A previous version had losses due to slice to iterator conversions. A large credit to Björn Steinbrink who improved this patch immensely, writing this second version. Benchmark results on x86-64 (Sandy Bridge) compiled with -C opt-level=3. Old code is `regular`, this PR is called `fast`. Datasets: - `ascii` is just ascii (2.5 kB) - `cyr` is cyrillic script with ascii spaces (5 kB) - `dewik10` is 10MB of a de.wikipedia xml dump - `enwik10` is 100MB of an en.wikipedia xml dump - `jawik10` is 10MB of a ja.wikipedia xml dump ``` test from_utf8_ascii_fast ... bench: 140 ns/iter (+/- 4) = 18221 MB/s test from_utf8_ascii_regular ... bench: 1,932 ns/iter (+/- 19) = 1320 MB/s test from_utf8_cyr_fast ... bench: 10,025 ns/iter (+/- 245) = 511 MB/s test from_utf8_cyr_regular ... bench: 12,250 ns/iter (+/- 437) = 418 MB/s test from_utf8_dewik10_fast ... bench: 6,017,909 ns/iter (+/- 105,755) = 1740 MB/s test from_utf8_dewik10_regular ... bench: 11,669,493 ns/iter (+/- 264,045) = 891 MB/s test from_utf8_enwik8_fast ... bench: 14,085,692 ns/iter (+/- 1,643,316) = 7000 MB/s test from_utf8_enwik8_regular ... bench: 93,657,410 ns/iter (+/- 5,353,353) = 1000 MB/s test from_utf8_jawik10_fast ... bench: 29,154,073 ns/iter (+/- 4,659,534) = 340 MB/s test from_utf8_jawik10_regular ... 
bench: 29,112,917 ns/iter (+/- 2,475,123) = 340 MB/s ``` Co-authored-by: Björn Steinbrink <bsteinbr@gmail.com>
2016-01-06 15:43:33 +01:00
Ok(())
}
// https://tools.ietf.org/html/rfc3629
//
// Lookup table indexed by the first byte of a UTF-8 sequence. Each entry is the
// total length in bytes (1 to 4) of the sequence that byte introduces, or 0 for
// bytes that cannot start a sequence (0x80..=0xC1 and 0xF5..=0xFF).
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, // 0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, // 0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, // 0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, // 0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, // 0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, // 0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, // 0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF
];
/// Returns the number of bytes in the UTF-8 sequence that starts with the byte `b`.
///
/// The result is a plain lookup in `UTF8_CHAR_WIDTH`, so a byte that cannot
/// begin a sequence yields 0.
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    let width = UTF8_CHAR_WIDTH[usize::from(b)];
    usize::from(width)
}
/// Mask of the value bits of a continuation byte.
///
/// Only the low six bits of a continuation byte are selected by this mask.
const CONT_MASK: u8 = 0b0011_1111;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte.
///
/// In other words, a continuation byte `b` satisfies `b & !CONT_MASK == TAG_CONT_U8`.
const TAG_CONT_U8: u8 = 0b1000_0000;
// Shortens `s` to at most `max` bytes without cutting a character in half.
// Yields `(was_truncated, resulting_str)`.
fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) {
    // Short enough already: hand the input back unchanged.
    if max >= s.len() {
        return (false, s);
    }
    // `max` now lies strictly inside `s`; walk back to the nearest boundary.
    // Index 0 is always a boundary, so the decrement cannot underflow.
    loop {
        if s.is_char_boundary(max) {
            break (true, &s[..max]);
        }
        max -= 1;
    }
}
#[inline(never)]
2015-11-23 09:34:01 +05:30
#[cold]
#[track_caller]
fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
const MAX_DISPLAY_LENGTH: usize = 256;
let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH);
let ellipsis = if truncated { "[...]" } else { "" };
// 1. out of bounds
if begin > s.len() || end > s.len() {
let oob_index = if begin > s.len() { begin } else { end };
panic!("byte index {} is out of bounds of `{}`{}", oob_index, s_trunc, ellipsis);
}
// 2. begin <= end
2019-12-22 17:42:04 -05:00
assert!(
begin <= end,
"begin <= end ({} <= {}) when slicing `{}`{}",
begin,
end,
s_trunc,
ellipsis
);
// 3. character boundary
let index = if !s.is_char_boundary(begin) { begin } else { end };
// find the character
let mut char_start = index;
while !s.is_char_boundary(char_start) {
char_start -= 1;
}
// `char_start` must be less than len and a char boundary
let ch = s[char_start..].chars().next().unwrap();
2019-12-22 17:42:04 -05:00
let char_range = char_start..char_start + ch.len_utf8();
panic!(
"byte index {} is not a char boundary; it is inside {:?} (bytes {:?}) of `{}`{}",
index, ch, char_range, s_trunc, ellipsis
);
}
2018-05-10 12:02:19 -06:00
#[lang = "str"]
#[cfg(not(test))]
impl str {
/// Returns the length of `self`.
///
/// This length is in bytes, not [`char`]s or graphemes. In other words,
/// it may not be what a human considers the length of the string.
///
/// [`char`]: prim@char
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let len = "foo".len();
/// assert_eq!(3, len);
///
/// assert_eq!("ƒoo".len(), 4); // fancy f!
/// assert_eq!("ƒoo".chars().count(), 3);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
2019-12-18 12:00:59 -05:00
#[rustc_const_stable(feature = "const_str_len", since = "1.32.0")]
2015-05-06 15:53:34 -07:00
#[inline]
pub const fn len(&self) -> usize {
2018-05-10 12:02:19 -06:00
self.as_bytes().len()
2015-05-06 15:53:34 -07:00
}
/// Returns `true` if `self` has a length of zero bytes.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = "";
/// assert!(s.is_empty());
///
/// let s = "not empty";
/// assert!(!s.is_empty());
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
2019-12-18 12:00:59 -05:00
#[rustc_const_stable(feature = "const_str_is_empty", since = "1.32.0")]
pub const fn is_empty(&self) -> bool {
2018-05-10 12:02:19 -06:00
self.len() == 0
}
2020-05-27 16:21:30 +03:00
/// Checks that `index`-th byte is the first byte in a UTF-8 code point
/// sequence or the end of the string.
///
/// The start and end of the string (when `index == self.len()`) are
/// considered to be boundaries.
///
/// Returns `false` if `index` is greater than `self.len()`.
///
/// # Examples
///
/// ```
/// let s = "Löwe 老虎 Léopard";
/// assert!(s.is_char_boundary(0));
/// // start of `老`
/// assert!(s.is_char_boundary(6));
/// assert!(s.is_char_boundary(s.len()));
///
/// // second byte of `ö`
/// assert!(!s.is_char_boundary(2));
///
/// // third byte of `老`
/// assert!(!s.is_char_boundary(8));
/// ```
#[stable(feature = "is_char_boundary", since = "1.9.0")]
#[inline]
pub fn is_char_boundary(&self, index: usize) -> bool {
    // 0 and len are always ok.
    // Test for 0 explicitly so that it can optimize out the check
    // easily and skip reading string data for that case.
    if index == 0 || index == self.len() {
        return true;
    }
    match self.as_bytes().get(index) {
        // `index` is past the end of the string.
        None => false,
        // This is bit magic equivalent to: b < 128 || b >= 192
        // (bytes 128..=191 reinterpret as i8 values -128..=-65, all below -0x40).
        Some(&b) => (b as i8) >= -0x40,
    }
}
/// Converts a string slice to a byte slice. To convert the byte slice back
/// into a string slice, use the [`from_utf8`] function.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let bytes = "bors".as_bytes();
/// assert_eq!(b"bors", bytes);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
2019-12-18 12:00:59 -05:00
#[rustc_const_stable(feature = "str_as_bytes", since = "1.32.0")]
#[inline(always)]
#[allow(unused_attributes)]
2020-08-13 15:56:23 -03:00
#[allow_internal_unstable(const_fn_transmute)]
pub const fn as_bytes(&self) -> &[u8] {
// SAFETY: const sound because we transmute two types with the same layout
2020-08-13 15:56:23 -03:00
unsafe { mem::transmute(self) }
}
/// Converts a mutable string slice to a mutable byte slice.
///
/// # Safety
///
/// The caller must ensure that the content of the slice is valid UTF-8
/// before the borrow ends and the underlying `str` is used.
///
/// Use of a `str` whose contents are not valid UTF-8 is undefined behavior.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let mut s = String::from("Hello");
/// let bytes = unsafe { s.as_bytes_mut() };
///
/// assert_eq!(b"Hello", bytes);
/// ```
///
/// Mutability:
///
/// ```
/// let mut s = String::from("🗻∈🌏");
///
/// unsafe {
/// let bytes = s.as_bytes_mut();
///
/// bytes[0] = 0xF0;
/// bytes[1] = 0x9F;
/// bytes[2] = 0x8D;
/// bytes[3] = 0x94;
/// }
///
/// assert_eq!("🍔∈🌏", s);
/// ```
#[stable(feature = "str_mut_extras", since = "1.20.0")]
#[inline(always)]
pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] {
    // SAFETY: the cast from `&mut str` to `&mut [u8]` is sound since `str`
    // has the same layout as `[u8]` (only libstd can make this guarantee).
    // The pointer dereference is safe since it comes from a mutable reference which
    // is guaranteed to be valid for writes.
    unsafe { &mut *(self as *mut str as *mut [u8]) }
}
/// Converts a string slice to a raw pointer.
///
/// As string slices are a slice of bytes, the raw pointer points to a
/// [`u8`]. This pointer will be pointing to the first byte of the string
/// slice.
///
/// The caller must ensure that the returned pointer is never written to.
/// If you need to mutate the contents of the string slice, use [`as_mut_ptr`].
///
/// [`as_mut_ptr`]: str::as_mut_ptr
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = "Hello";
/// let ptr = s.as_ptr();
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
2019-12-18 12:00:59 -05:00
#[rustc_const_stable(feature = "rustc_str_as_ptr", since = "1.32.0")]
#[inline]
pub const fn as_ptr(&self) -> *const u8 {
2018-05-10 12:02:19 -06:00
self as *const str as *const u8
}
/// Converts a mutable string slice to a raw pointer.
///
/// As string slices are a slice of bytes, the raw pointer points to a
/// [`u8`]. This pointer will be pointing to the first byte of the string
/// slice.
///
/// It is your responsibility to make sure that the string slice only gets
/// modified in a way that it remains valid UTF-8.
2019-04-29 02:33:50 +09:00
#[stable(feature = "str_as_mut_ptr", since = "1.36.0")]
#[inline]
pub fn as_mut_ptr(&mut self) -> *mut u8 {
self as *mut str as *mut u8
}
/// Returns a subslice of `str`.
///
/// This is the non-panicking alternative to indexing the `str`. Returns
/// [`None`] whenever equivalent indexing operation would panic.
///
/// # Examples
///
/// ```
/// let v = String::from("🗻∈🌏");
///
/// assert_eq!(Some("🗻"), v.get(0..4));
///
/// // indices not on UTF-8 sequence boundaries
/// assert!(v.get(1..).is_none());
/// assert!(v.get(..8).is_none());
///
/// // out of bounds
/// assert!(v.get(..42).is_none());
/// ```
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
#[inline]
pub fn get<I: SliceIndex<str>>(&self, i: I) -> Option<&I::Output> {
    // All validation is delegated to the index type's `SliceIndex<str>` impl.
    i.get(self)
}
/// Returns a mutable subslice of `str`.
///
/// This is the non-panicking alternative to indexing the `str`. Returns
/// [`None`] whenever equivalent indexing operation would panic.
///
/// # Examples
///
/// ```
/// let mut v = String::from("hello");
/// // correct length
/// assert!(v.get_mut(0..5).is_some());
/// // out of bounds
/// assert!(v.get_mut(..42).is_none());
/// assert_eq!(Some("he"), v.get_mut(0..2).map(|v| &*v));
///
/// assert_eq!("hello", v);
/// {
/// let s = v.get_mut(0..2);
/// let s = s.map(|s| {
/// s.make_ascii_uppercase();
/// &*s
/// });
/// assert_eq!(Some("HE"), s);
/// }
/// assert_eq!("HEllo", v);
/// ```
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
#[inline]
pub fn get_mut<I: SliceIndex<str>>(&mut self, i: I) -> Option<&mut I::Output> {
    // All validation is delegated to the index type's `SliceIndex<str>` impl.
    i.get_mut(self)
}
/// Returns an unchecked subslice of `str`.
///
/// This is the unchecked alternative to indexing the `str`.
///
/// # Safety
///
/// Callers of this function are responsible that these preconditions are
/// satisfied:
///
/// * The starting index must not exceed the ending index;
/// * Indexes must be within bounds of the original slice;
/// * Indexes must lie on UTF-8 sequence boundaries.
///
/// Failing that, the returned string slice may reference invalid memory or
/// violate the invariants communicated by the `str` type.
///
/// # Examples
///
/// ```
/// let v = "🗻∈🌏";
/// unsafe {
/// assert_eq!("🗻", v.get_unchecked(0..4));
/// assert_eq!("∈", v.get_unchecked(4..7));
/// assert_eq!("🌏", v.get_unchecked(7..11));
/// }
/// ```
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
#[inline]
pub unsafe fn get_unchecked<I: SliceIndex<str>>(&self, i: I) -> &I::Output {
    // SAFETY: upholding the safety contract of `get_unchecked` is the caller's job.
    // `self` is a safe reference, so the pointer handed to the index is dereferenceable,
    // and impls of `SliceIndex` have to guarantee that the pointer they return is
    // valid to reborrow.
    unsafe {
        let sub = i.get_unchecked(self);
        &*sub
    }
}
/// Returns a mutable, unchecked subslice of `str`.
///
/// This is the unchecked alternative to indexing the `str`.
///
/// # Safety
///
/// Callers of this function are responsible that these preconditions are
/// satisfied:
///
/// * The starting index must not exceed the ending index;
/// * Indexes must be within bounds of the original slice;
/// * Indexes must lie on UTF-8 sequence boundaries.
///
/// Failing that, the returned string slice may reference invalid memory or
/// violate the invariants communicated by the `str` type.
///
/// # Examples
///
/// ```
/// let mut v = String::from("🗻∈🌏");
/// unsafe {
/// assert_eq!("🗻", v.get_unchecked_mut(0..4));
/// assert_eq!("∈", v.get_unchecked_mut(4..7));
/// assert_eq!("🌏", v.get_unchecked_mut(7..11));
/// }
/// ```
#[stable(feature = "str_checked_slicing", since = "1.20.0")]
#[inline]
pub unsafe fn get_unchecked_mut<I: SliceIndex<str>>(&mut self, i: I) -> &mut I::Output {
    // SAFETY: upholding the safety contract of `get_unchecked_mut` is the caller's job.
    // `self` is a safe reference, so the pointer handed to the index is dereferenceable,
    // and impls of `SliceIndex` have to guarantee that the pointer they return is
    // valid to reborrow.
    unsafe {
        let sub = i.get_unchecked_mut(self);
        &mut *sub
    }
}
/// Creates a string slice from another string slice, bypassing safety
/// checks.
///
/// This is generally not recommended, use with caution! For a safe
/// alternative see [`str`] and [`Index`].
///
/// [`Index`]: crate::ops::Index
///
/// This new slice goes from `begin` to `end`, including `begin` but
/// excluding `end`.
///
/// To get a mutable string slice instead, see the
/// [`slice_mut_unchecked`] method.
///
/// [`slice_mut_unchecked`]: str::slice_mut_unchecked
///
/// # Safety
///
/// Callers of this function are responsible that three preconditions are
/// satisfied:
///
/// * `begin` must not exceed `end`.
/// * `begin` and `end` must be byte positions within the string slice.
/// * `begin` and `end` must lie on UTF-8 sequence boundaries.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = "Löwe 老虎 Léopard";
///
/// unsafe {
/// assert_eq!("Löwe 老虎 Léopard", s.slice_unchecked(0, 21));
/// }
///
/// let s = "Hello, world!";
///
/// unsafe {
/// assert_eq!("world", s.slice_unchecked(7, 12));
/// }
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
2018-07-12 19:49:55 +03:00
#[rustc_deprecated(since = "1.29.0", reason = "use `get_unchecked(begin..end)` instead")]
#[inline]
pub unsafe fn slice_unchecked(&self, begin: usize, end: usize) -> &str {
// SAFETY: the caller must uphold the safety contract for `get_unchecked`;
// the slice is dereferencable because `self` is a safe reference.
// The returned pointer is safe because impls of `SliceIndex` have to guarantee that it is.
unsafe { &*(begin..end).get_unchecked(self) }
}
/// Creates a string slice from another string slice, bypassing safety
/// checks.
///
/// This is generally not recommended, use with caution! For a safe
/// alternative see [`str`] and [`IndexMut`].
///
/// [`IndexMut`]: crate::ops::IndexMut
///
/// This new slice goes from `begin` to `end`, including `begin` but
/// excluding `end`.
///
/// To get an immutable string slice instead, see the
/// [`slice_unchecked`] method.
///
/// [`slice_unchecked`]: str::slice_unchecked
///
/// # Safety
///
/// Callers of this function are responsible that three preconditions are
/// satisfied:
///
/// * `begin` must not exceed `end`.
/// * `begin` and `end` must be byte positions within the string slice.
/// * `begin` and `end` must lie on UTF-8 sequence boundaries.
#[stable(feature = "str_slice_mut", since = "1.5.0")]
2018-07-12 19:49:55 +03:00
#[rustc_deprecated(since = "1.29.0", reason = "use `get_unchecked_mut(begin..end)` instead")]
#[inline]
pub unsafe fn slice_mut_unchecked(&mut self, begin: usize, end: usize) -> &mut str {
// SAFETY: the caller must uphold the safety contract for `get_unchecked_mut`;
// the slice is dereferencable because `self` is a safe reference.
// The returned pointer is safe because impls of `SliceIndex` have to guarantee that it is.
unsafe { &mut *(begin..end).get_unchecked_mut(self) }
}
/// Divide one string slice into two at an index.
///
/// The argument, `mid`, should be a byte offset from the start of the
/// string. It must also be on the boundary of a UTF-8 code point.
///
/// The two slices returned go from the start of the string slice to `mid`,
/// and from `mid` to the end of the string slice.
///
/// To get mutable string slices instead, see the [`split_at_mut`]
/// method.
///
/// [`split_at_mut`]: str::split_at_mut
///
/// # Panics
///
/// Panics if `mid` is not on a UTF-8 code point boundary, or if it is
/// past the end of the last code point of the string slice.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = "Per Martin-Löf";
///
/// let (first, last) = s.split_at(3);
///
/// assert_eq!("Per", first);
/// assert_eq!(" Martin-Löf", last);
/// ```
#[inline]
#[stable(feature = "str_split_at", since = "1.4.0")]
pub fn split_at(&self, mid: usize) -> (&str, &str) {
2018-05-10 12:02:19 -06:00
// is_char_boundary checks that the index is in [0, .len()]
if self.is_char_boundary(mid) {
2019-12-26 12:56:34 -08:00
// SAFETY: just checked that `mid` is on a char boundary.
2019-12-22 17:42:04 -05:00
unsafe { (self.get_unchecked(0..mid), self.get_unchecked(mid..self.len())) }
2018-05-10 12:02:19 -06:00
} else {
slice_error_fail(self, 0, mid)
}
}
/// Divide one mutable string slice into two at an index.
///
/// The argument, `mid`, should be a byte offset from the start of the
/// string. It must also be on the boundary of a UTF-8 code point.
///
/// The two slices returned go from the start of the string slice to `mid`,
/// and from `mid` to the end of the string slice.
///
/// To get immutable string slices instead, see the [`split_at`] method.
///
/// [`split_at`]: str::split_at
///
/// # Panics
///
/// Panics if `mid` is not on a UTF-8 code point boundary, or if it is
/// past the end of the last code point of the string slice.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let mut s = "Per Martin-Löf".to_string();
/// {
/// let (first, last) = s.split_at_mut(3);
/// first.make_ascii_uppercase();
/// assert_eq!("PER", first);
/// assert_eq!(" Martin-Löf", last);
/// }
/// assert_eq!("PER Martin-Löf", s);
/// ```
#[inline]
#[stable(feature = "str_split_at", since = "1.4.0")]
pub fn split_at_mut(&mut self, mid: usize) -> (&mut str, &mut str) {
2018-05-10 12:02:19 -06:00
// is_char_boundary checks that the index is in [0, .len()]
if self.is_char_boundary(mid) {
let len = self.len();
let ptr = self.as_mut_ptr();
2019-12-26 12:56:34 -08:00
// SAFETY: just checked that `mid` is on a char boundary.
2018-05-10 12:02:19 -06:00
unsafe {
2019-12-22 17:42:04 -05:00
(
from_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr, mid)),
from_utf8_unchecked_mut(slice::from_raw_parts_mut(ptr.add(mid), len - mid)),
)
2018-05-10 12:02:19 -06:00
}
} else {
slice_error_fail(self, 0, mid)
}
}
/// Returns an iterator over the [`char`]s of a string slice.
///
/// As a string slice consists of valid UTF-8, we can iterate through a
/// string slice by [`char`]. This method returns such an iterator.
///
/// It's important to remember that [`char`] represents a Unicode Scalar
/// Value, and may not match your idea of what a 'character' is. Iteration
/// over grapheme clusters may be what you actually want. This functionality
/// is not provided by Rust's standard library, check crates.io instead.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let word = "goodbye";
///
/// let count = word.chars().count();
/// assert_eq!(7, count);
///
/// let mut chars = word.chars();
///
/// assert_eq!(Some('g'), chars.next());
/// assert_eq!(Some('o'), chars.next());
/// assert_eq!(Some('o'), chars.next());
/// assert_eq!(Some('d'), chars.next());
/// assert_eq!(Some('b'), chars.next());
/// assert_eq!(Some('y'), chars.next());
/// assert_eq!(Some('e'), chars.next());
///
/// assert_eq!(None, chars.next());
/// ```
///
/// Remember, [`char`]s may not match your intuition about characters:
///
/// [`char`]: prim@char
///
/// ```
/// let y = "y̆";
///
/// let mut chars = y.chars();
///
/// assert_eq!(Some('y'), chars.next()); // not 'y̆'
/// assert_eq!(Some('\u{0306}'), chars.next());
///
/// assert_eq!(None, chars.next());
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2019-04-19 01:37:12 +02:00
pub fn chars(&self) -> Chars<'_> {
2019-12-22 17:42:04 -05:00
Chars { iter: self.as_bytes().iter() }
}
2018-05-10 12:02:19 -06:00
/// Returns an iterator over the [`char`]s of a string slice, and their
/// positions.
///
/// As a string slice consists of valid UTF-8, we can iterate through a
/// string slice by [`char`]. This method returns an iterator of both
/// these [`char`]s, as well as their byte positions.
///
/// The iterator yields tuples. The position is first, the [`char`] is
/// second.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let word = "goodbye";
///
/// let count = word.char_indices().count();
/// assert_eq!(7, count);
///
/// let mut char_indices = word.char_indices();
///
/// assert_eq!(Some((0, 'g')), char_indices.next());
/// assert_eq!(Some((1, 'o')), char_indices.next());
/// assert_eq!(Some((2, 'o')), char_indices.next());
/// assert_eq!(Some((3, 'd')), char_indices.next());
/// assert_eq!(Some((4, 'b')), char_indices.next());
/// assert_eq!(Some((5, 'y')), char_indices.next());
/// assert_eq!(Some((6, 'e')), char_indices.next());
///
/// assert_eq!(None, char_indices.next());
/// ```
///
/// Remember, [`char`]s may not match your intuition about characters:
///
/// [`char`]: prim@char
///
/// ```
/// let yes = "y̆es";
///
/// let mut char_indices = yes.char_indices();
///
/// assert_eq!(Some((0, 'y')), char_indices.next()); // not (0, 'y̆')
/// assert_eq!(Some((1, '\u{0306}')), char_indices.next());
///
/// // note the 3 here - the last character took up two bytes
/// assert_eq!(Some((3, 'e')), char_indices.next());
/// assert_eq!(Some((4, 's')), char_indices.next());
///
/// assert_eq!(None, char_indices.next());
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2019-04-19 01:37:12 +02:00
pub fn char_indices(&self) -> CharIndices<'_> {
2018-05-10 12:02:19 -06:00
CharIndices { front_offset: 0, iter: self.chars() }
}
/// An iterator over the bytes of a string slice.
///
/// As a string slice consists of a sequence of bytes, we can iterate
/// through a string slice by byte. This method returns such an iterator.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let mut bytes = "bors".bytes();
///
/// assert_eq!(Some(b'b'), bytes.next());
/// assert_eq!(Some(b'o'), bytes.next());
/// assert_eq!(Some(b'r'), bytes.next());
/// assert_eq!(Some(b's'), bytes.next());
///
/// assert_eq!(None, bytes.next());
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2019-04-19 01:37:12 +02:00
pub fn bytes(&self) -> Bytes<'_> {
2020-01-30 20:09:23 +00:00
Bytes(self.as_bytes().iter().copied())
}
2019-02-09 21:23:30 +00:00
/// Splits a string slice by whitespace.
///
/// The iterator returned will return string slices that are sub-slices of
/// the original string slice, separated by any amount of whitespace.
///
/// 'Whitespace' is defined according to the terms of the Unicode Derived
/// Core Property `White_Space`. If you only want to split on ASCII whitespace
/// instead, use [`split_ascii_whitespace`].
///
/// [`split_ascii_whitespace`]: str::split_ascii_whitespace
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let mut iter = "A few words".split_whitespace();
///
/// assert_eq!(Some("A"), iter.next());
/// assert_eq!(Some("few"), iter.next());
/// assert_eq!(Some("words"), iter.next());
///
/// assert_eq!(None, iter.next());
/// ```
///
/// All kinds of whitespace are considered:
///
/// ```
/// let mut iter = " Mary had\ta\u{2009}little \n\t lamb".split_whitespace();
/// assert_eq!(Some("Mary"), iter.next());
/// assert_eq!(Some("had"), iter.next());
/// assert_eq!(Some("a"), iter.next());
/// assert_eq!(Some("little"), iter.next());
/// assert_eq!(Some("lamb"), iter.next());
///
/// assert_eq!(None, iter.next());
/// ```
#[stable(feature = "split_whitespace", since = "1.1.0")]
#[inline]
2019-04-19 01:37:12 +02:00
pub fn split_whitespace(&self) -> SplitWhitespace<'_> {
2018-05-10 12:02:19 -06:00
SplitWhitespace { inner: self.split(IsWhitespace).filter(IsNotEmpty) }
}
2019-02-09 21:23:30 +00:00
/// Splits a string slice by ASCII whitespace.
///
/// The iterator returned will return string slices that are sub-slices of
/// the original string slice, separated by any amount of ASCII whitespace.
///
/// To split by Unicode `Whitespace` instead, use [`split_whitespace`].
///
/// [`split_whitespace`]: str::split_whitespace
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let mut iter = "A few words".split_ascii_whitespace();
///
/// assert_eq!(Some("A"), iter.next());
/// assert_eq!(Some("few"), iter.next());
/// assert_eq!(Some("words"), iter.next());
///
/// assert_eq!(None, iter.next());
/// ```
///
/// All kinds of ASCII whitespace are considered:
///
/// ```
/// let mut iter = " Mary had\ta little \n\t lamb".split_ascii_whitespace();
/// assert_eq!(Some("Mary"), iter.next());
/// assert_eq!(Some("had"), iter.next());
/// assert_eq!(Some("a"), iter.next());
/// assert_eq!(Some("little"), iter.next());
/// assert_eq!(Some("lamb"), iter.next());
///
/// assert_eq!(None, iter.next());
/// ```
#[stable(feature = "split_ascii_whitespace", since = "1.34.0")]
2018-05-05 00:33:20 -04:00
#[inline]
2019-04-19 01:37:12 +02:00
pub fn split_ascii_whitespace(&self) -> SplitAsciiWhitespace<'_> {
2019-12-22 17:42:04 -05:00
let inner =
self.as_bytes().split(IsAsciiWhitespace).filter(BytesIsNotEmpty).map(UnsafeBytesToStr);
2018-05-05 00:33:20 -04:00
SplitAsciiWhitespace { inner }
}
/// An iterator over the lines of a string, as string slices.
///
/// Lines are ended with either a newline (`\n`) or a carriage return with
/// a line feed (`\r\n`).
///
/// The final line ending is optional.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let text = "foo\r\nbar\n\nbaz\n";
/// let mut lines = text.lines();
///
/// assert_eq!(Some("foo"), lines.next());
/// assert_eq!(Some("bar"), lines.next());
/// assert_eq!(Some(""), lines.next());
/// assert_eq!(Some("baz"), lines.next());
///
/// assert_eq!(None, lines.next());
/// ```
///
/// The final line ending isn't required:
///
/// ```
/// let text = "foo\nbar\n\r\nbaz";
/// let mut lines = text.lines();
///
/// assert_eq!(Some("foo"), lines.next());
/// assert_eq!(Some("bar"), lines.next());
/// assert_eq!(Some(""), lines.next());
/// assert_eq!(Some("baz"), lines.next());
///
/// assert_eq!(None, lines.next());
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
2019-04-19 01:37:12 +02:00
pub fn lines(&self) -> Lines<'_> {
2018-05-10 12:02:19 -06:00
Lines(self.split_terminator('\n').map(LinesAnyMap))
}
/// An iterator over the lines of a string.
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_deprecated(since = "1.4.0", reason = "use lines() instead now")]
#[inline]
#[allow(deprecated)]
2019-04-19 01:37:12 +02:00
pub fn lines_any(&self) -> LinesAny<'_> {
2018-05-10 12:02:19 -06:00
LinesAny(self.lines())
}
/// Returns an iterator of `u16` over the string encoded as UTF-16.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let text = "Zażółć gęślą jaźń";
///
/// let utf8_len = text.len();
/// let utf16_len = text.encode_utf16().count();
///
/// assert!(utf16_len <= utf8_len);
/// ```
#[stable(feature = "encode_utf16", since = "1.8.0")]
2019-04-19 01:37:12 +02:00
pub fn encode_utf16(&self) -> EncodeUtf16<'_> {
2018-05-10 12:02:19 -06:00
EncodeUtf16 { chars: self.chars(), extra: 0 }
}
/// Returns `true` if the given pattern matches a sub-slice of
/// this string slice.
///
/// Returns `false` if it does not.
///
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
/// [pattern]: self::pattern
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let bananas = "bananas";
///
/// assert!(bananas.contains("nana"));
/// assert!(!bananas.contains("apples"));
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool {
    // The pattern type decides how containment is checked.
    pat.is_contained_in(self)
}
/// Returns `true` if the given pattern matches a prefix of this
/// string slice.
///
/// Returns `false` if it does not.
///
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
/// [pattern]: self::pattern
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let bananas = "bananas";
///
/// assert!(bananas.starts_with("bana"));
/// assert!(!bananas.starts_with("nana"));
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool {
    // The pattern type decides how the prefix check is performed.
    pat.is_prefix_of(self)
}
/// Returns `true` if the given pattern matches a suffix of this
/// string slice.
///
/// Returns `false` if it does not.
///
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
/// [pattern]: self::pattern
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let bananas = "bananas";
///
/// assert!(bananas.ends_with("anas"));
/// assert!(!bananas.ends_with("nana"));
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn ends_with<'a, P>(&'a self, pat: P) -> bool
where
    P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
    // Requires a pattern whose searcher can run in reverse; the pattern type
    // decides how the suffix check is performed.
    pat.is_suffix_of(self)
}
/// Returns the byte index of the first character of this string slice that
/// matches the pattern.
///
/// Returns [`None`] if the pattern doesn't match.
///
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
/// [pattern]: self::pattern
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// let s = "Löwe 老虎 Léopard Gepardi";
///
/// assert_eq!(s.find('L'), Some(0));
/// assert_eq!(s.find('é'), Some(14));
/// assert_eq!(s.find("pard"), Some(17));
/// ```
///
/// More complex patterns using point-free style and closures:
///
/// ```
/// let s = "Löwe 老虎 Léopard";
///
/// assert_eq!(s.find(char::is_whitespace), Some(5));
/// assert_eq!(s.find(char::is_lowercase), Some(1));
/// assert_eq!(s.find(|c: char| c.is_whitespace() || c.is_lowercase()), Some(1));
/// assert_eq!(s.find(|c: char| (c < 'o') && (c > 'a')), Some(4));
/// ```
///
/// Not finding the pattern:
///
/// ```
/// let s = "Löwe 老虎 Léopard";
/// let x: &[_] = &['1', '2'];
///
/// assert_eq!(s.find(x), None);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize> {
    // The searcher reports a match as a pair; its first element is the byte
    // index this method returns.
    pat.into_searcher(self).next_match().map(|(i, _)| i)
}
/// Returns the byte index for the first character of the rightmost match of the pattern in
/// this string slice.
///
/// Returns [`None`] if the pattern doesn't match.
///
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
/// [pattern]: self::pattern
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// let s = "Löwe 老虎 Léopard Gepardi";
///
/// assert_eq!(s.rfind('L'), Some(13));
/// assert_eq!(s.rfind('é'), Some(14));
/// assert_eq!(s.rfind("pard"), Some(24));
/// ```
///
/// More complex patterns with closures:
///
/// ```
/// let s = "Löwe 老虎 Léopard";
///
/// assert_eq!(s.rfind(char::is_whitespace), Some(12));
/// assert_eq!(s.rfind(char::is_lowercase), Some(20));
/// ```
///
/// Not finding the pattern:
///
/// ```
/// let s = "Löwe 老虎 Léopard";
/// let x: &[_] = &['1', '2'];
///
/// assert_eq!(s.rfind(x), None);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn rfind<'a, P>(&'a self, pat: P) -> Option<usize>
where
    P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
    // Search from the back; the searcher reports a match as a pair whose first
    // element is the byte index this method returns.
    pat.into_searcher(self).next_match_back().map(|(i, _)| i)
}
/// An iterator over substrings of this string slice, separated by
/// characters matched by a pattern.
///
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator will be a [`DoubleEndedIterator`] if the pattern
/// allows a reverse search and forward/reverse search yields the same
/// elements. This is true for, e.g., [`char`], but not for `&str`.
///
/// If the pattern allows a reverse search but its results might differ
/// from a forward search, the [`rsplit`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`rsplit`]: str::rsplit
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// let v: Vec<&str> = "Mary had a little lamb".split(' ').collect();
/// assert_eq!(v, ["Mary", "had", "a", "little", "lamb"]);
///
/// let v: Vec<&str> = "".split('X').collect();
/// assert_eq!(v, [""]);
///
/// let v: Vec<&str> = "lionXXtigerXleopard".split('X').collect();
/// assert_eq!(v, ["lion", "", "tiger", "leopard"]);
///
/// let v: Vec<&str> = "lion::tiger::leopard".split("::").collect();
/// assert_eq!(v, ["lion", "tiger", "leopard"]);
///
/// let v: Vec<&str> = "abc1def2ghi".split(char::is_numeric).collect();
/// assert_eq!(v, ["abc", "def", "ghi"]);
///
/// let v: Vec<&str> = "lionXtigerXleopard".split(char::is_uppercase).collect();
/// assert_eq!(v, ["lion", "tiger", "leopard"]);
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// let v: Vec<&str> = "abc1defXghi".split(|c| c == '1' || c == 'X').collect();
/// assert_eq!(v, ["abc", "def", "ghi"]);
/// ```
///
/// If a string contains multiple contiguous separators, you will end up
/// with empty strings in the output:
///
/// ```
/// let x = "||||a||b|c".to_string();
/// let d: Vec<_> = x.split('|').collect();
///
/// assert_eq!(d, &["", "", "", "", "a", "", "b", "c"]);
/// ```
///
/// Contiguous separators are separated by the empty string.
///
/// ```
/// let x = "(///)".to_string();
/// let d: Vec<_> = x.split('/').collect();
///
/// assert_eq!(d, &["(", "", "", ")"]);
/// ```
///
/// Separators at the start or end of a string are neighbored
/// by empty strings.
///
/// ```
/// let d: Vec<_> = "010".split("0").collect();
/// assert_eq!(d, &["", "1", ""]);
/// ```
///
/// When the empty string is used as a separator, it separates
/// every character in the string, along with the beginning
/// and end of the string.
///
/// ```
/// let f: Vec<_> = "rust".split("").collect();
/// assert_eq!(f, &["", "r", "u", "s", "t", ""]);
/// ```
///
/// Contiguous separators can lead to possibly surprising behavior
/// when whitespace is used as the separator. This code is correct:
///
/// ```
/// let x = " a b c".to_string();
/// let d: Vec<_> = x.split(' ').collect();
///
/// assert_eq!(d, &["", "", "", "", "a", "", "b", "c"]);
/// ```
///
/// It does _not_ give you:
///
/// ```,ignore
/// assert_eq!(d, &["a", "b", "c"]);
/// ```
///
/// Use [`split_whitespace`] for this behavior.
///
2020-07-17 12:44:44 -07:00
/// [`split_whitespace`]: str::split_whitespace
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> {
2018-05-10 12:02:19 -06:00
Split(SplitInternal {
start: 0,
end: self.len(),
matcher: pat.into_searcher(self),
allow_trailing_empty: true,
finished: false,
})
}
/// An iterator over substrings of this string slice, separated by
/// characters matched by a pattern. Differs from the iterator produced by
/// `split` in that `split_inclusive` leaves the matched part as the
/// terminator of the substring.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
2020-04-19 14:17:32 -07:00
///
/// # Examples
///
/// ```
/// #![feature(split_inclusive)]
/// let v: Vec<&str> = "Mary had a little lamb\nlittle lamb\nlittle lamb."
/// .split_inclusive('\n').collect();
/// assert_eq!(v, ["Mary had a little lamb\n", "little lamb\n", "little lamb."]);
/// ```
///
/// If the last element of the string is matched,
/// that element will be considered the terminator of the preceding substring.
/// That substring will be the last item returned by the iterator.
///
/// ```
/// #![feature(split_inclusive)]
/// let v: Vec<&str> = "Mary had a little lamb\nlittle lamb\nlittle lamb.\n"
/// .split_inclusive('\n').collect();
/// assert_eq!(v, ["Mary had a little lamb\n", "little lamb\n", "little lamb.\n"]);
/// ```
#[unstable(feature = "split_inclusive", issue = "72360")]
#[inline]
pub fn split_inclusive<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitInclusive<'a, P> {
SplitInclusive(SplitInternal {
start: 0,
end: self.len(),
matcher: pat.into_searcher(self),
allow_trailing_empty: false,
finished: false,
})
}
/// An iterator over substrings of the given string slice, separated by
/// characters matched by a pattern and yielded in reverse order.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator requires that the pattern supports a reverse
/// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse
/// search yields the same elements.
///
/// For iterating from the front, the [`split`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`split`]: str::split
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// let v: Vec<&str> = "Mary had a little lamb".rsplit(' ').collect();
/// assert_eq!(v, ["lamb", "little", "a", "had", "Mary"]);
///
/// let v: Vec<&str> = "".rsplit('X').collect();
/// assert_eq!(v, [""]);
///
/// let v: Vec<&str> = "lionXXtigerXleopard".rsplit('X').collect();
/// assert_eq!(v, ["leopard", "tiger", "", "lion"]);
///
/// let v: Vec<&str> = "lion::tiger::leopard".rsplit("::").collect();
/// assert_eq!(v, ["leopard", "tiger", "lion"]);
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// let v: Vec<&str> = "abc1defXghi".rsplit(|c| c == '1' || c == 'X').collect();
/// assert_eq!(v, ["ghi", "def", "abc"]);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn rsplit<'a, P>(&'a self, pat: P) -> RSplit<'a, P>
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
RSplit(self.split(pat).0)
}
/// An iterator over substrings of the given string slice, separated by
/// characters matched by a pattern.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// Equivalent to [`split`], except that the trailing substring
/// is skipped if empty.
///
2020-07-17 12:44:44 -07:00
/// [`split`]: str::split
///
/// This method can be used for string data that is _terminated_,
/// rather than _separated_ by a pattern.
///
/// # Iterator behavior
///
/// The returned iterator will be a [`DoubleEndedIterator`] if the pattern
/// allows a reverse search and forward/reverse search yields the same
/// elements. This is true for, e.g., [`char`], but not for `&str`.
///
/// If the pattern allows a reverse search but its results might differ
/// from a forward search, the [`rsplit_terminator`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`rsplit_terminator`]: str::rsplit_terminator
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let v: Vec<&str> = "A.B.".split_terminator('.').collect();
/// assert_eq!(v, ["A", "B"]);
///
/// let v: Vec<&str> = "A..B..".split_terminator(".").collect();
/// assert_eq!(v, ["A", "", "B", ""]);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> {
2019-12-22 17:42:04 -05:00
SplitTerminator(SplitInternal { allow_trailing_empty: false, ..self.split(pat).0 })
}
/// An iterator over substrings of `self`, separated by characters
/// matched by a pattern and yielded in reverse order.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// Equivalent to [`split`], except that the trailing substring is
/// skipped if empty.
///
2020-07-17 12:44:44 -07:00
/// [`split`]: str::split
///
/// This method can be used for string data that is _terminated_,
/// rather than _separated_ by a pattern.
///
/// # Iterator behavior
///
/// The returned iterator requires that the pattern supports a
/// reverse search, and it will be double ended if a forward/reverse
/// search yields the same elements.
///
/// For iterating from the front, the [`split_terminator`] method can be
/// used.
///
2020-07-17 12:44:44 -07:00
/// [`split_terminator`]: str::split_terminator
///
/// # Examples
///
/// ```
/// let v: Vec<&str> = "A.B.".rsplit_terminator('.').collect();
/// assert_eq!(v, ["B", "A"]);
///
/// let v: Vec<&str> = "A..B..".rsplit_terminator(".").collect();
/// assert_eq!(v, ["", "B", "", "A"]);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn rsplit_terminator<'a, P>(&'a self, pat: P) -> RSplitTerminator<'a, P>
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
RSplitTerminator(self.split_terminator(pat).0)
}
/// An iterator over substrings of the given string slice, separated by a
/// pattern, restricted to returning at most `n` items.
///
/// If `n` substrings are returned, the last substring (the `n`th substring)
/// will contain the remainder of the string.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator will not be double ended, because it is
/// not efficient to support.
///
/// If the pattern allows a reverse search, the [`rsplitn`] method can be
/// used.
///
2020-07-17 12:44:44 -07:00
/// [`rsplitn`]: str::rsplitn
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// let v: Vec<&str> = "Mary had a little lambda".splitn(3, ' ').collect();
/// assert_eq!(v, ["Mary", "had", "a little lambda"]);
///
/// let v: Vec<&str> = "lionXXtigerXleopard".splitn(3, "X").collect();
/// assert_eq!(v, ["lion", "", "tigerXleopard"]);
///
/// let v: Vec<&str> = "abcXdef".splitn(1, 'X').collect();
/// assert_eq!(v, ["abcXdef"]);
///
/// let v: Vec<&str> = "".splitn(1, 'X').collect();
/// assert_eq!(v, [""]);
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// let v: Vec<&str> = "abc1defXghi".splitn(2, |c| c == '1' || c == 'X').collect();
/// assert_eq!(v, ["abc", "defXghi"]);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn splitn<'a, P: Pattern<'a>>(&'a self, n: usize, pat: P) -> SplitN<'a, P> {
2019-12-22 17:42:04 -05:00
SplitN(SplitNInternal { iter: self.split(pat).0, count: n })
}
/// An iterator over substrings of this string slice, separated by a
/// pattern, starting from the end of the string, restricted to returning
/// at most `n` items.
///
/// If `n` substrings are returned, the last substring (the `n`th substring)
/// will contain the remainder of the string.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator will not be double ended, because it is not
/// efficient to support.
///
/// For splitting from the front, the [`splitn`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`splitn`]: str::splitn
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// let v: Vec<&str> = "Mary had a little lamb".rsplitn(3, ' ').collect();
/// assert_eq!(v, ["lamb", "little", "Mary had a"]);
///
/// let v: Vec<&str> = "lionXXtigerXleopard".rsplitn(3, 'X').collect();
/// assert_eq!(v, ["leopard", "tiger", "lionX"]);
///
/// let v: Vec<&str> = "lion::tiger::leopard".rsplitn(2, "::").collect();
/// assert_eq!(v, ["leopard", "lion::tiger"]);
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// let v: Vec<&str> = "abc1defXghi".rsplitn(2, |c| c == '1' || c == 'X').collect();
/// assert_eq!(v, ["ghi", "abc1def"]);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[inline]
pub fn rsplitn<'a, P>(&'a self, n: usize, pat: P) -> RSplitN<'a, P>
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
RSplitN(self.splitn(n, pat).0)
}
/// Cuts the string in two at the first occurrence of `delimiter`.
///
/// On a match, returns `Some((prefix, suffix))`: the part of the string
/// before the delimiter and the part after it. The delimiter itself is in
/// neither half. Returns [`None`] when the delimiter does not occur.
///
/// # Examples
///
/// ```
/// #![feature(str_split_once)]
///
/// assert_eq!("cfg".split_once('='), None);
/// assert_eq!("cfg=foo".split_once('='), Some(("cfg", "foo")));
/// assert_eq!("cfg=foo=bar".split_once('='), Some(("cfg", "foo=bar")));
/// ```
#[unstable(feature = "str_split_once", reason = "newly added", issue = "74773")]
#[inline]
pub fn split_once<'a, P: Pattern<'a>>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)> {
    let mut searcher = delimiter.into_searcher(self);
    // `head_end..tail_start` is the span of the first match.
    searcher
        .next_match()
        .map(|(head_end, tail_start)| (&self[..head_end], &self[tail_start..]))
}
/// Cuts the string in two at the last occurrence of `delimiter`.
///
/// On a match, returns `Some((prefix, suffix))`: the part of the string
/// before the delimiter and the part after it. The delimiter itself is in
/// neither half. Returns [`None`] when the delimiter does not occur.
///
/// # Examples
///
/// ```
/// #![feature(str_split_once)]
///
/// assert_eq!("cfg".rsplit_once('='), None);
/// assert_eq!("cfg=foo".rsplit_once('='), Some(("cfg", "foo")));
/// assert_eq!("cfg=foo=bar".rsplit_once('='), Some(("cfg=foo", "bar")));
/// ```
#[unstable(feature = "str_split_once", reason = "newly added", issue = "74773")]
#[inline]
pub fn rsplit_once<'a, P>(&'a self, delimiter: P) -> Option<(&'a str, &'a str)>
where
    P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
    let mut searcher = delimiter.into_searcher(self);
    // `head_end..tail_start` is the span of the last match.
    searcher
        .next_match_back()
        .map(|(head_end, tail_start)| (&self[..head_end], &self[tail_start..]))
}
/// An iterator over the disjoint matches of a pattern within the given string
/// slice.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator will be a [`DoubleEndedIterator`] if the pattern
/// allows a reverse search and forward/reverse search yields the same
/// elements. This is true for, e.g., [`char`], but not for `&str`.
///
/// If the pattern allows a reverse search but its results might differ
/// from a forward search, the [`rmatches`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`rmatches`]: str::matches
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let v: Vec<&str> = "abcXXXabcYYYabc".matches("abc").collect();
/// assert_eq!(v, ["abc", "abc", "abc"]);
///
/// let v: Vec<&str> = "1abc2abc3".matches(char::is_numeric).collect();
/// assert_eq!(v, ["1", "2", "3"]);
/// ```
#[stable(feature = "str_matches", since = "1.2.0")]
#[inline]
pub fn matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> Matches<'a, P> {
2018-05-10 12:02:19 -06:00
Matches(MatchesInternal(pat.into_searcher(self)))
}
/// An iterator over the disjoint matches of a pattern within this string slice,
/// yielded in reverse order.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator requires that the pattern supports a reverse
/// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse
/// search yields the same elements.
///
/// For iterating from the front, the [`matches`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`matches`]: str::matches
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let v: Vec<&str> = "abcXXXabcYYYabc".rmatches("abc").collect();
/// assert_eq!(v, ["abc", "abc", "abc"]);
///
/// let v: Vec<&str> = "1abc2abc3".rmatches(char::is_numeric).collect();
/// assert_eq!(v, ["3", "2", "1"]);
/// ```
#[stable(feature = "str_matches", since = "1.2.0")]
#[inline]
pub fn rmatches<'a, P>(&'a self, pat: P) -> RMatches<'a, P>
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
RMatches(self.matches(pat).0)
}
/// An iterator over the disjoint matches of a pattern within this string
/// slice as well as the index that the match starts at.
///
/// For matches of `pat` within `self` that overlap, only the indices
/// corresponding to the first match are returned.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator will be a [`DoubleEndedIterator`] if the pattern
/// allows a reverse search and forward/reverse search yields the same
/// elements. This is true for, e.g., [`char`], but not for `&str`.
///
/// If the pattern allows a reverse search but its results might differ
/// from a forward search, the [`rmatch_indices`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`rmatch_indices`]: str::match_indices
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let v: Vec<_> = "abcXXXabcYYYabc".match_indices("abc").collect();
/// assert_eq!(v, [(0, "abc"), (6, "abc"), (12, "abc")]);
///
/// let v: Vec<_> = "1abcabc2".match_indices("abc").collect();
/// assert_eq!(v, [(1, "abc"), (4, "abc")]);
///
/// let v: Vec<_> = "ababa".match_indices("aba").collect();
/// assert_eq!(v, [(0, "aba")]); // only the first `aba`
/// ```
#[stable(feature = "str_match_indices", since = "1.5.0")]
#[inline]
pub fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> {
2018-05-10 12:02:19 -06:00
MatchIndices(MatchIndicesInternal(pat.into_searcher(self)))
}
/// An iterator over the disjoint matches of a pattern within `self`,
/// yielded in reverse order along with the index of the match.
///
/// For matches of `pat` within `self` that overlap, only the indices
/// corresponding to the last match are returned.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Iterator behavior
///
/// The returned iterator requires that the pattern supports a reverse
/// search, and it will be a [`DoubleEndedIterator`] if a forward/reverse
/// search yields the same elements.
///
/// For iterating from the front, the [`match_indices`] method can be used.
///
2020-07-17 12:44:44 -07:00
/// [`match_indices`]: str::match_indices
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let v: Vec<_> = "abcXXXabcYYYabc".rmatch_indices("abc").collect();
/// assert_eq!(v, [(12, "abc"), (6, "abc"), (0, "abc")]);
///
/// let v: Vec<_> = "1abcabc2".rmatch_indices("abc").collect();
/// assert_eq!(v, [(4, "abc"), (1, "abc")]);
///
/// let v: Vec<_> = "ababa".rmatch_indices("aba").collect();
/// assert_eq!(v, [(2, "aba")]); // only the last `aba`
/// ```
#[stable(feature = "str_match_indices", since = "1.5.0")]
#[inline]
pub fn rmatch_indices<'a, P>(&'a self, pat: P) -> RMatchIndices<'a, P>
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
RMatchIndices(self.match_indices(pat).0)
}
/// Returns a string slice with leading and trailing whitespace removed.
///
/// 'Whitespace' is defined according to the terms of the Unicode Derived
/// Core Property `White_Space`.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = " Hello\tworld\t";
///
/// assert_eq!("Hello\tworld", s.trim());
/// ```
#[must_use = "this returns the trimmed string as a slice, \
              without modifying the original"]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn trim(&self) -> &str {
    // Delegates to `trim_matches` with `char::is_whitespace` as the predicate.
    self.trim_matches(|c: char| c.is_whitespace())
}
/// Returns a string slice with leading whitespace removed.
///
/// 'Whitespace' is defined according to the terms of the Unicode Derived
/// Core Property `White_Space`.
///
/// # Text directionality
///
/// A string is a sequence of bytes. `start` in this context means the first
/// position of that byte string; for a left-to-right language like English or
2019-02-09 22:16:58 +00:00
/// Russian, this will be left side, and for right-to-left languages like
/// Arabic or Hebrew, this will be the right side.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = " Hello\tworld\t";
/// assert_eq!("Hello\tworld\t", s.trim_start());
/// ```
///
/// Directionality:
///
/// ```
/// let s = " English ";
/// assert!(Some('E') == s.trim_start().chars().next());
///
/// let s = " עברית ";
/// assert!(Some('ע') == s.trim_start().chars().next());
/// ```
2018-12-26 22:03:04 +01:00
#[must_use = "this returns the trimmed string as a new slice, \
without modifying the original"]
#[stable(feature = "trim_direction", since = "1.30.0")]
pub fn trim_start(&self) -> &str {
self.trim_start_matches(|c: char| c.is_whitespace())
}
/// Returns a string slice with trailing whitespace removed.
///
/// 'Whitespace' is defined according to the terms of the Unicode Derived
/// Core Property `White_Space`.
///
/// # Text directionality
///
/// A string is a sequence of bytes. `end` in this context means the last
/// position of that byte string; for a left-to-right language like English or
2019-02-09 22:16:58 +00:00
/// Russian, this will be right side, and for right-to-left languages like
/// Arabic or Hebrew, this will be the left side.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = " Hello\tworld\t";
/// assert_eq!(" Hello\tworld", s.trim_end());
/// ```
///
/// Directionality:
///
/// ```
/// let s = " English ";
/// assert!(Some('h') == s.trim_end().chars().rev().next());
///
/// let s = " עברית ";
/// assert!(Some('ת') == s.trim_end().chars().rev().next());
/// ```
2018-12-26 22:03:04 +01:00
#[must_use = "this returns the trimmed string as a new slice, \
without modifying the original"]
#[stable(feature = "trim_direction", since = "1.30.0")]
pub fn trim_end(&self) -> &str {
self.trim_end_matches(|c: char| c.is_whitespace())
}
/// Returns a string slice with leading whitespace removed.
///
/// 'Whitespace' is defined according to the terms of the Unicode Derived
/// Core Property `White_Space`.
///
/// # Text directionality
///
/// A string is a sequence of bytes. 'Left' in this context means the first
/// position of that byte string; for a language like Arabic or Hebrew
/// which are 'right to left' rather than 'left to right', this will be
/// the _right_ side, not the left.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = " Hello\tworld\t";
///
/// assert_eq!("Hello\tworld\t", s.trim_left());
/// ```
///
/// Directionality:
///
/// ```
/// let s = " English";
/// assert!(Some('E') == s.trim_left().chars().next());
///
/// let s = " עברית";
/// assert!(Some('ע') == s.trim_left().chars().next());
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_deprecated(
    since = "1.33.0",
    reason = "superseded by `trim_start`",
    suggestion = "trim_start"
)]
pub fn trim_left(&self) -> &str {
    // Deprecated alias: forwards to `trim_start`.
    self.trim_start()
}
/// Returns a string slice with trailing whitespace removed.
///
/// 'Whitespace' is defined according to the terms of the Unicode Derived
/// Core Property `White_Space`.
///
/// # Text directionality
///
/// A string is a sequence of bytes. 'Right' in this context means the last
/// position of that byte string; for a language like Arabic or Hebrew
/// which are 'right to left' rather than 'left to right', this will be
/// the _left_ side, not the right.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// let s = " Hello\tworld\t";
///
/// assert_eq!(" Hello\tworld", s.trim_right());
/// ```
///
/// Directionality:
///
/// ```
/// let s = "English ";
/// assert!(Some('h') == s.trim_right().chars().rev().next());
///
/// let s = "עברית ";
/// assert!(Some('ת') == s.trim_right().chars().rev().next());
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_deprecated(
    since = "1.33.0",
    reason = "superseded by `trim_end`",
    suggestion = "trim_end"
)]
pub fn trim_right(&self) -> &str {
    // Deprecated alias: forwards to `trim_end`.
    self.trim_end()
}
/// Returns a string slice with all prefixes and suffixes that match a
/// pattern repeatedly removed.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a [`char`], a slice of [`char`]s, or a function
/// or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// assert_eq!("11foo1bar11".trim_matches('1'), "foo1bar");
/// assert_eq!("123foo1bar123".trim_matches(char::is_numeric), "foo1bar");
///
/// let x: &[_] = &['1', '2'];
/// assert_eq!("12foo1bar12".trim_matches(x), "foo1bar");
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// assert_eq!("1foo1barXX".trim_matches(|c| c == '1' || c == 'X'), "foo1bar");
/// ```
2018-12-26 22:03:04 +01:00
#[must_use = "this returns the trimmed string as a new slice, \
without modifying the original"]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn trim_matches<'a, P>(&'a self, pat: P) -> &'a str
where
P: Pattern<'a, Searcher: DoubleEndedSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
let mut i = 0;
let mut j = 0;
let mut matcher = pat.into_searcher(self);
if let Some((a, b)) = matcher.next_reject() {
i = a;
j = b; // Remember earliest known match, correct it below if
2019-12-22 17:42:04 -05:00
// last match is different
2018-05-10 12:02:19 -06:00
}
if let Some((_, b)) = matcher.next_reject_back() {
j = b;
}
2019-12-26 12:56:34 -08:00
// SAFETY: `Searcher` is known to return valid indices.
2020-01-16 18:50:53 -08:00
unsafe { self.get_unchecked(i..j) }
}
/// Returns a string slice with all prefixes that match a pattern
/// repeatedly removed.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Text directionality
///
/// A string is a sequence of bytes. `start` in this context means the first
/// position of that byte string; for a left-to-right language like English or
/// Russian, this will be left side, and for right-to-left languages like
/// Arabic or Hebrew, this will be the right side.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// assert_eq!("11foo1bar11".trim_start_matches('1'), "foo1bar11");
/// assert_eq!("123foo1bar123".trim_start_matches(char::is_numeric), "foo1bar123");
///
/// let x: &[_] = &['1', '2'];
/// assert_eq!("12foo1bar12".trim_start_matches(x), "foo1bar12");
/// ```
2018-12-26 22:03:04 +01:00
#[must_use = "this returns the trimmed string as a new slice, \
without modifying the original"]
#[stable(feature = "trim_direction", since = "1.30.0")]
pub fn trim_start_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str {
2018-05-10 12:02:19 -06:00
let mut i = self.len();
let mut matcher = pat.into_searcher(self);
if let Some((a, _)) = matcher.next_reject() {
i = a;
}
2019-12-26 12:56:34 -08:00
// SAFETY: `Searcher` is known to return valid indices.
2020-01-16 18:50:53 -08:00
unsafe { self.get_unchecked(i..self.len()) }
}
/// Returns a string slice with the prefix removed.
///
/// If the string starts with the pattern `prefix`, `Some` is returned with the substring where
/// the prefix is removed. Unlike `trim_start_matches`, this method removes the prefix exactly
/// once.
///
/// If the string does not start with `prefix`, `None` is returned.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
2020-04-19 14:17:32 -07:00
///
/// # Examples
///
/// ```
/// assert_eq!("foo:bar".strip_prefix("foo:"), Some("bar"));
/// assert_eq!("foo:bar".strip_prefix("bar"), None);
/// assert_eq!("foofoo".strip_prefix("foo"), Some("foo"));
/// ```
#[must_use = "this returns the remaining substring as a new slice, \
without modifying the original"]
2020-05-22 15:29:47 +00:00
#[stable(feature = "str_strip", since = "1.45.0")]
pub fn strip_prefix<'a, P: Pattern<'a>>(&'a self, prefix: P) -> Option<&'a str> {
prefix.strip_prefix_of(self)
}
/// Returns a string slice with the suffix removed.
///
/// If the string ends with the pattern `suffix`, `Some` is returned with the substring where
/// the suffix is removed. Unlike `trim_end_matches`, this method removes the suffix exactly
/// once.
///
/// If the string does not end with `suffix`, `None` is returned.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
2020-04-19 14:17:32 -07:00
///
/// # Examples
///
/// ```
/// assert_eq!("bar:foo".strip_suffix(":foo"), Some("bar"));
/// assert_eq!("bar:foo".strip_suffix("bar"), None);
/// assert_eq!("foofoo".strip_suffix("foo"), Some("foo"));
/// ```
#[must_use = "this returns the remaining substring as a new slice, \
without modifying the original"]
2020-05-22 15:29:47 +00:00
#[stable(feature = "str_strip", since = "1.45.0")]
pub fn strip_suffix<'a, P>(&'a self, suffix: P) -> Option<&'a str>
where
P: Pattern<'a>,
<P as Pattern<'a>>::Searcher: ReverseSearcher<'a>,
{
suffix.strip_suffix_of(self)
}
/// Returns a string slice with all suffixes that match a pattern
/// repeatedly removed.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Text directionality
///
/// A string is a sequence of bytes. `end` in this context means the last
/// position of that byte string; for a left-to-right language like English or
/// Russian, this will be right side, and for right-to-left languages like
/// Arabic or Hebrew, this will be the left side.
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// assert_eq!("11foo1bar11".trim_end_matches('1'), "11foo1bar");
/// assert_eq!("123foo1bar123".trim_end_matches(char::is_numeric), "123foo1bar");
///
/// let x: &[_] = &['1', '2'];
/// assert_eq!("12foo1bar12".trim_end_matches(x), "12foo1bar");
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// assert_eq!("1fooX".trim_end_matches(|c| c == '1' || c == 'X'), "1foo");
/// ```
2018-12-26 22:03:04 +01:00
#[must_use = "this returns the trimmed string as a new slice, \
without modifying the original"]
#[stable(feature = "trim_direction", since = "1.30.0")]
pub fn trim_end_matches<'a, P>(&'a self, pat: P) -> &'a str
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
2018-05-10 12:02:19 -06:00
let mut j = 0;
let mut matcher = pat.into_searcher(self);
if let Some((_, b)) = matcher.next_reject_back() {
j = b;
}
2019-12-26 12:56:34 -08:00
// SAFETY: `Searcher` is known to return valid indices.
2020-01-16 18:50:53 -08:00
unsafe { self.get_unchecked(0..j) }
}
/// Returns a string slice with all prefixes that match a pattern
/// repeatedly removed.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Text directionality
///
/// A string is a sequence of bytes. 'Left' in this context means the first
/// position of that byte string; for a language like Arabic or Hebrew
/// which are 'right to left' rather than 'left to right', this will be
/// the _right_ side, not the left.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// assert_eq!("11foo1bar11".trim_left_matches('1'), "foo1bar11");
/// assert_eq!("123foo1bar123".trim_left_matches(char::is_numeric), "foo1bar123");
///
/// let x: &[_] = &['1', '2'];
/// assert_eq!("12foo1bar12".trim_left_matches(x), "foo1bar12");
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_deprecated(
since = "1.33.0",
reason = "superseded by `trim_start_matches`",
2019-12-22 17:42:04 -05:00
suggestion = "trim_start_matches"
)]
pub fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str {
self.trim_start_matches(pat)
}
/// Returns a string slice with all suffixes that match a pattern
/// repeatedly removed.
///
2020-04-19 14:17:32 -07:00
/// The [pattern] can be a `&str`, [`char`], a slice of [`char`]s, or a
/// function or closure that determines if a character matches.
///
/// [`char`]: prim@char
2020-07-17 12:44:44 -07:00
/// [pattern]: self::pattern
///
/// # Text directionality
///
/// A string is a sequence of bytes. 'Right' in this context means the last
/// position of that byte string; for a language like Arabic or Hebrew
/// which are 'right to left' rather than 'left to right', this will be
/// the _left_ side, not the right.
///
/// # Examples
///
/// Simple patterns:
///
/// ```
/// assert_eq!("11foo1bar11".trim_right_matches('1'), "11foo1bar");
/// assert_eq!("123foo1bar123".trim_right_matches(char::is_numeric), "123foo1bar");
///
/// let x: &[_] = &['1', '2'];
/// assert_eq!("12foo1bar12".trim_right_matches(x), "12foo1bar");
/// ```
///
/// A more complex pattern, using a closure:
///
/// ```
/// assert_eq!("1fooX".trim_right_matches(|c| c == '1' || c == 'X'), "1foo");
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_deprecated(
since = "1.33.0",
reason = "superseded by `trim_end_matches`",
2019-12-22 17:42:04 -05:00
suggestion = "trim_end_matches"
)]
pub fn trim_right_matches<'a, P>(&'a self, pat: P) -> &'a str
where
P: Pattern<'a, Searcher: ReverseSearcher<'a>>,
{
self.trim_end_matches(pat)
}
/// Parses this string slice into another type.
///
/// Because `parse` is so general, it can cause problems with type
/// inference. As such, `parse` is one of the few times you'll see
/// the syntax affectionately known as the 'turbofish': `::<>`. This
/// helps the inference algorithm understand specifically which type
/// you're trying to parse into.
///
/// `parse` can parse any type that implements the [`FromStr`] trait.
2020-07-17 12:44:44 -07:00
///
/// # Errors
///
/// Will return [`Err`] if it's not possible to parse this string slice into
/// the desired type.
///
2020-07-17 12:44:44 -07:00
/// [`Err`]: FromStr::Err
///
/// # Examples
///
/// Basic usage
///
/// ```
/// let four: u32 = "4".parse().unwrap();
///
/// assert_eq!(4, four);
/// ```
///
/// Using the 'turbofish' instead of annotating `four`:
///
/// ```
/// let four = "4".parse::<u32>();
///
/// assert_eq!(Ok(4), four);
/// ```
///
/// Failing to parse:
///
/// ```
/// let nope = "j".parse::<u32>();
///
/// assert!(nope.is_err());
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn parse<F: FromStr>(&self) -> Result<F, F::Err> {
2018-05-10 12:02:19 -06:00
FromStr::from_str(self)
}
/// Reports whether every character of this string lies in the ASCII range.
///
/// # Examples
///
/// ```
/// let ascii = "hello!\n";
/// let non_ascii = "Grüße, Jürgen ❤";
///
/// assert!(ascii.is_ascii());
/// assert!(!non_ascii.is_ascii());
/// ```
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[inline]
pub fn is_ascii(&self) -> bool {
    // Inspecting raw bytes is enough: a multibyte character always begins
    // with a byte outside the ASCII range, so the byte-level check already
    // fails at that point.
    let bytes = self.as_bytes();
    bytes.is_ascii()
}
/// Compares two strings for equality, ignoring ASCII case differences.
///
/// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
/// but without allocating and copying temporaries.
///
/// # Examples
///
/// ```
/// assert!("Ferris".eq_ignore_ascii_case("FERRIS"));
/// assert!("Ferrös".eq_ignore_ascii_case("FERRöS"));
/// assert!(!"Ferrös".eq_ignore_ascii_case("FERRÖS"));
/// ```
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[inline]
pub fn eq_ignore_ascii_case(&self, other: &str) -> bool {
    // The comparison is delegated to the byte-slice method of the same name.
    let (lhs, rhs) = (self.as_bytes(), other.as_bytes());
    lhs.eq_ignore_ascii_case(rhs)
}
/// Converts this string to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
///
/// To return a new uppercased value without modifying the existing one, use
/// [`to_ascii_uppercase`].
///
/// [`to_ascii_uppercase`]: #method.to_ascii_uppercase
///
/// # Examples
///
/// ```
/// let mut s = String::from("Grüße, Jürgen ❤");
///
/// s.make_ascii_uppercase();
///
/// assert_eq!("GRüßE, JüRGEN ❤", s);
/// ```
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[inline]
pub fn make_ascii_uppercase(&mut self) {
    // SAFETY: only ASCII letters are rewritten, and each becomes another
    // ASCII byte, so the buffer is still valid UTF-8 afterwards.
    let bytes = unsafe { self.as_bytes_mut() };
    bytes.make_ascii_uppercase()
}
/// Converts this string to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
///
/// To return a new lowercased value without modifying the existing one, use
/// [`to_ascii_lowercase`].
///
/// [`to_ascii_lowercase`]: #method.to_ascii_lowercase
///
/// # Examples
///
/// ```
/// let mut s = String::from("GRÜßE, JÜRGEN ❤");
///
/// s.make_ascii_lowercase();
///
/// assert_eq!("grÜße, jÜrgen ❤", s);
/// ```
#[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
#[inline]
pub fn make_ascii_lowercase(&mut self) {
    // SAFETY: only ASCII letters are rewritten, and each becomes another
    // ASCII byte, so the buffer is still valid UTF-8 afterwards.
    let bytes = unsafe { self.as_bytes_mut() };
    bytes.make_ascii_lowercase()
}
2019-02-02 10:34:36 +01:00
/// Return an iterator that escapes each char in `self` with [`char::escape_debug`].
2019-02-02 10:34:36 +01:00
///
/// Note: only extended grapheme codepoints that begin the string will be
/// escaped.
///
2019-02-02 11:25:46 +01:00
/// # Examples
///
/// As an iterator:
///
/// ```
/// for c in "❤\n!".escape_debug() {
/// print!("{}", c);
/// }
/// println!();
/// ```
///
/// Using `println!` directly:
///
/// ```
/// println!("{}", "❤\n!".escape_debug());
/// ```
///
///
/// Both are equivalent to:
///
/// ```
/// println!("❤\\n!");
/// ```
///
/// Using `to_string`:
///
/// ```
/// assert_eq!("❤\n!".escape_debug().to_string(), "❤\\n!");
/// ```
2019-02-02 10:34:36 +01:00
#[stable(feature = "str_escape", since = "1.34.0")]
2019-04-19 01:37:12 +02:00
pub fn escape_debug(&self) -> EscapeDebug<'_> {
2019-02-02 10:34:36 +01:00
let mut chars = self.chars();
EscapeDebug {
2019-12-22 17:42:04 -05:00
inner: chars
.next()
2019-02-02 10:34:36 +01:00
.map(|first| first.escape_debug_ext(true))
.into_iter()
.flatten()
2019-12-22 17:42:04 -05:00
.chain(chars.flat_map(CharEscapeDebugContinue)),
2019-02-02 10:34:36 +01:00
}
}
/// Return an iterator that escapes each char in `self` with [`char::escape_default`].
2019-02-02 10:34:36 +01:00
///
2019-02-02 11:25:46 +01:00
/// # Examples
///
/// As an iterator:
///
/// ```
/// for c in "❤\n!".escape_default() {
/// print!("{}", c);
/// }
/// println!();
/// ```
///
/// Using `println!` directly:
///
/// ```
/// println!("{}", "❤\n!".escape_default());
/// ```
///
///
/// Both are equivalent to:
///
/// ```
/// println!("\\u{{2764}}\\n!");
2019-02-02 11:25:46 +01:00
/// ```
///
/// Using `to_string`:
///
/// ```
/// assert_eq!("❤\n!".escape_default().to_string(), "\\u{2764}\\n!");
/// ```
2019-02-02 10:34:36 +01:00
#[stable(feature = "str_escape", since = "1.34.0")]
2019-04-19 01:37:12 +02:00
pub fn escape_default(&self) -> EscapeDefault<'_> {
2019-02-02 10:34:36 +01:00
EscapeDefault { inner: self.chars().flat_map(CharEscapeDefault) }
}
/// Return an iterator that escapes each char in `self` with [`char::escape_unicode`].
2019-02-02 10:34:36 +01:00
///
2019-02-02 11:25:46 +01:00
/// # Examples
///
/// As an iterator:
///
/// ```
/// for c in "❤\n!".escape_unicode() {
/// print!("{}", c);
/// }
/// println!();
/// ```
///
/// Using `println!` directly:
///
/// ```
/// println!("{}", "❤\n!".escape_unicode());
/// ```
///
///
/// Both are equivalent to:
///
/// ```
/// println!("\\u{{2764}}\\u{{a}}\\u{{21}}");
/// ```
///
/// Using `to_string`:
///
/// ```
/// assert_eq!("❤\n!".escape_unicode().to_string(), "\\u{2764}\\u{a}\\u{21}");
/// ```
2019-02-02 10:34:36 +01:00
#[stable(feature = "str_escape", since = "1.34.0")]
2019-04-19 01:37:12 +02:00
pub fn escape_unicode(&self) -> EscapeUnicode<'_> {
2019-02-02 10:34:36 +01:00
EscapeUnicode { inner: self.chars().flat_map(CharEscapeUnicode) }
}
}
// Nameable function objects used as the per-char mapping step of the
// `escape_debug`, `escape_unicode` and `escape_default` string iterators.
impl_fn_for_zst! {
    // Debug-escapes a char, passing `false` to `escape_debug_ext`.
    #[derive(Clone)]
    struct CharEscapeDebugContinue impl Fn = |ch: char| -> char::EscapeDebug {
        ch.escape_debug_ext(false)
    };

    // Escapes a char with `char::escape_unicode`.
    #[derive(Clone)]
    struct CharEscapeUnicode impl Fn = |ch: char| -> char::EscapeUnicode {
        ch.escape_unicode()
    };

    // Escapes a char with `char::escape_default`.
    #[derive(Clone)]
    struct CharEscapeDefault impl Fn = |ch: char| -> char::EscapeDefault {
        ch.escape_default()
    };
}
2015-05-06 15:53:34 -07:00
#[stable(feature = "rust1", since = "1.0.0")]
impl AsRef<[u8]> for str {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_bytes()
}
}
2015-01-23 21:48:20 -08:00
#[stable(feature = "rust1", since = "1.0.0")]
impl Default for &str {
/// Creates an empty str
2019-12-22 17:42:04 -05:00
fn default() -> Self {
""
}
}
2018-06-03 00:29:50 +08:00
#[stable(feature = "default_mut_str", since = "1.28.0")]
impl Default for &mut str {
2018-06-03 00:29:50 +08:00
/// Creates an empty mutable str
2019-12-22 17:42:04 -05:00
fn default() -> Self {
2020-01-16 18:38:04 -08:00
// SAFETY: The empty string is valid UTF-8.
2019-12-22 17:42:04 -05:00
unsafe { from_utf8_unchecked_mut(&mut []) }
}
2018-06-03 00:29:50 +08:00
}
// Nameable function objects for simple predicates and conversions.
// NOTE(review): presumably consumed by the iterator adaptors in the
// `iter` submodule — the use sites are not in this part of the file.
impl_fn_for_zst! {
    // True when the char is whitespace (`char::is_whitespace`).
    #[derive(Clone)]
    struct IsWhitespace impl Fn = |ch: char| -> bool {
        ch.is_whitespace()
    };

    // True when the byte is ASCII whitespace.
    #[derive(Clone)]
    struct IsAsciiWhitespace impl Fn = |b: &u8| -> bool {
        b.is_ascii_whitespace()
    };

    // True when the string slice has at least one byte.
    #[derive(Clone)]
    struct IsNotEmpty impl<'a, 'b> Fn = |piece: &'a &'b str| -> bool {
        !piece.is_empty()
    };

    // True when the byte slice has at least one byte.
    #[derive(Clone)]
    struct BytesIsNotEmpty impl<'a, 'b> Fn = |piece: &'a &'b [u8]| -> bool {
        !piece.is_empty()
    };

    // Reinterprets bytes as a `str` without any validation.
    #[derive(Clone)]
    struct UnsafeBytesToStr impl<'a> Fn = |raw: &'a [u8]| -> &'a str {
        // SAFETY: nothing is checked here; this is only sound when the
        // bytes passed in are already valid UTF-8, which every user of
        // this helper must guarantee.
        // NOTE(review): confirm each use site only feeds it pieces of an
        // existing `str` cut at char boundaries.
        unsafe { from_utf8_unchecked(raw) }
    };
}