Auto merge of #49698 - SimonSapin:unicode-for-everyone, r=alexcrichton

Merge the std_unicode crate into the core crate [The standard library facade](https://github.com/rust-lang/rust/issues/27783) has historically contained a number of crates with different roles, but that number has decreased over time. `rand` and `libc` have moved to crates.io, and [`collections` was merged into `alloc`](https://github.com/rust-lang/rust/pull/42648). Today we have `core` that applies everywhere, `std` that expects a full operating system, and `alloc` in-between that only requires a memory allocator (which can be provided by users)… and `std_unicode`, which doesn’t really have a reason to be separate anymore. It contains functionality based on Unicode data tables that can be large, but as long as relevant functions are not called the tables should be removed from binaries by linkers. This deprecates the unstable `std_unicode` crate and moves all of its contents into `core`, replacing them with `pub use` reexports. The crate can be removed later. This also removes the `CharExt` trait (replaced with inherent methods in libcore) and `UnicodeStr` trait (merged into `StrExt`). There traits were both unstable and not intended to be used or named directly. A number of new items are newly-available in libcore and instantly stable there, but only if they were already stable in libstd. Fixes #49319.
2018-04-12 00:35:33 +00:00
parent e28ef22ae5 ef41788cf3
commit d26f9e42df
42 changed files with 1398 additions and 1537 deletions
--- a/src/liballoc/Cargo.toml
+++ b/src/liballoc/Cargo.toml
@@ -9,7 +9,6 @@ path = "lib.rs"

 [dependencies]
 core = { path = "../libcore" }
-std_unicode = { path = "../libstd_unicode" }
 compiler_builtins = { path = "../rustc/compiler_builtins_shim" }

 [dev-dependencies]
--- a/src/liballoc/lib.rs
+++ b/src/liballoc/lib.rs
@@ -113,7 +113,7 @@
 #![feature(trusted_len)]
 #![feature(try_reserve)]
 #![feature(unboxed_closures)]
-#![feature(unicode)]
+#![feature(unicode_internals)]
 #![feature(unsize)]
 #![feature(allocator_internals)]
 #![feature(on_unimplemented)]
@@ -135,8 +135,6 @@ extern crate test;
 #[cfg(test)]
 extern crate rand;

-extern crate std_unicode;
-
 // Module with internal macros used by other modules (needs to be included before other modules).
 #[macro_use]
 mod macros;
--- a/src/liballoc/str.rs
+++ b/src/liballoc/str.rs
@@ -45,12 +45,10 @@ use core::str::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher};
 use core::mem;
 use core::ptr;
 use core::iter::FusedIterator;
-use std_unicode::str::{UnicodeStr, Utf16Encoder};

 use vec_deque::VecDeque;
 use borrow::{Borrow, ToOwned};
 use string::String;
-use std_unicode;
 use vec::Vec;
 use slice::{SliceConcatExt, SliceIndex};
 use boxed::Box;
@@ -75,7 +73,7 @@ pub use core::str::{from_utf8, from_utf8_mut, Chars, CharIndices, Bytes};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut, ParseBoolError};
 #[stable(feature = "rust1", since = "1.0.0")]
-pub use std_unicode::str::SplitWhitespace;
+pub use core::str::SplitWhitespace;
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::str::pattern;

@@ -147,7 +145,8 @@ impl<S: Borrow<str>> SliceConcatExt<str> for [S] {
 #[derive(Clone)]
 #[stable(feature = "encode_utf16", since = "1.8.0")]
 pub struct EncodeUtf16<'a> {
-    encoder: Utf16Encoder<Chars<'a>>,
+    chars: Chars<'a>,
+    extra: u16,
 }

 #[stable(feature = "collection_debug", since = "1.17.0")]
@@ -163,12 +162,29 @@ impl<'a> Iterator for EncodeUtf16<'a> {

    #[inline]
    fn next(&mut self) -> Option<u16> {
-        self.encoder.next()
+        if self.extra != 0 {
+            let tmp = self.extra;
+            self.extra = 0;
+            return Some(tmp);
+        }
+
+        let mut buf = [0; 2];
+        self.chars.next().map(|ch| {
+            let n = ch.encode_utf16(&mut buf).len();
+            if n == 2 {
+                self.extra = buf[1];
+            }
+            buf[0]
+        })
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.encoder.size_hint()
+        let (low, high) = self.chars.size_hint();
+        // every char gets either one u16 or two u16,
+        // so this iterator is between 1 or 2 times as
+        // long as the underlying iterator.
+        (low, high.and_then(|n| n.checked_mul(2)))
    }
 }

@@ -801,7 +817,7 @@ impl str {
    #[stable(feature = "split_whitespace", since = "1.1.0")]
    #[inline]
    pub fn split_whitespace(&self) -> SplitWhitespace {
-        UnicodeStr::split_whitespace(self)
+        StrExt::split_whitespace(self)
    }

    /// An iterator over the lines of a string, as string slices.
@@ -871,7 +887,7 @@ impl str {
    /// ```
    #[stable(feature = "encode_utf16", since = "1.8.0")]
    pub fn encode_utf16(&self) -> EncodeUtf16 {
-        EncodeUtf16 { encoder: Utf16Encoder::new(self[..].chars()) }
+        EncodeUtf16 { chars: self[..].chars(), extra: 0 }
    }

    /// Returns `true` if the given pattern matches a sub-slice of
@@ -1571,7 +1587,7 @@ impl str {
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    pub fn trim(&self) -> &str {
-        UnicodeStr::trim(self)
+        StrExt::trim(self)
    }

    /// Returns a string slice with leading whitespace removed.
@@ -1607,7 +1623,7 @@ impl str {
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    pub fn trim_left(&self) -> &str {
-        UnicodeStr::trim_left(self)
+        StrExt::trim_left(self)
    }

    /// Returns a string slice with trailing whitespace removed.
@@ -1643,7 +1659,7 @@ impl str {
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    pub fn trim_right(&self) -> &str {
-        UnicodeStr::trim_right(self)
+        StrExt::trim_right(self)
    }

    /// Returns a string slice with all prefixes and suffixes that match a
@@ -1960,7 +1976,7 @@ impl str {
        }

        fn case_ignoreable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
-            use std_unicode::derived_property::{Cased, Case_Ignorable};
+            use core::unicode::derived_property::{Cased, Case_Ignorable};
            match iter.skip_while(|&c| Case_Ignorable(c)).next() {
                Some(c) => Cased(c),
                None => false,
@@ -2142,7 +2158,7 @@ impl str {
    #[stable(feature = "unicode_methods_on_intrinsics", since = "1.27.0")]
    #[inline]
    pub fn is_whitespace(&self) -> bool {
-        UnicodeStr::is_whitespace(self)
+        StrExt::is_whitespace(self)
    }

    /// Returns true if this `str` is entirely alphanumeric, and false otherwise.
@@ -2161,7 +2177,7 @@ impl str {
    #[stable(feature = "unicode_methods_on_intrinsics", since = "1.27.0")]
    #[inline]
    pub fn is_alphanumeric(&self) -> bool {
-        UnicodeStr::is_alphanumeric(self)
+        StrExt::is_alphanumeric(self)
    }

    /// Checks if all characters in this string are within the ASCII range.
--- a/src/liballoc/string.rs
+++ b/src/liballoc/string.rs
@@ -56,6 +56,7 @@

 #![stable(feature = "rust1", since = "1.0.0")]

+use core::char::{decode_utf16, REPLACEMENT_CHARACTER};
 use core::fmt;
 use core::hash;
 use core::iter::{FromIterator, FusedIterator};
@@ -63,8 +64,7 @@ use core::ops::Bound::{Excluded, Included, Unbounded};
 use core::ops::{self, Add, AddAssign, Index, IndexMut, RangeBounds};
 use core::ptr;
 use core::str::pattern::Pattern;
-use std_unicode::lossy;
-use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
+use core::str::lossy;

 use borrow::{Cow, ToOwned};
 use str::{self, from_boxed_utf8_unchecked, FromStr, Utf8Error, Chars};
--- a/src/liballoc/tests/lib.rs
+++ b/src/liballoc/tests/lib.rs
@@ -24,12 +24,11 @@
 #![feature(string_retain)]
 #![feature(try_reserve)]
 #![feature(unboxed_closures)]
-#![feature(unicode)]
 #![feature(exact_chunks)]
 #![feature(inclusive_range_fields)]

 extern crate alloc_system;
-extern crate std_unicode;
+extern crate core;
 extern crate rand;

 use std::hash::{Hash, Hasher};
--- a/src/liballoc/tests/str.rs
+++ b/src/liballoc/tests/str.rs
@@ -1204,8 +1204,7 @@ fn test_rev_split_char_iterator_no_trailing() {

 #[test]
 fn test_utf16_code_units() {
-    use std_unicode::str::Utf16Encoder;
-    assert_eq!(Utf16Encoder::new(vec!['é', '\u{1F4A9}'].into_iter()).collect::<Vec<u16>>(),
+    assert_eq!("é\u{1F4A9}".encode_utf16().collect::<Vec<u16>>(),
               [0xE9, 0xD83D, 0xDCA9])
 }

--- a/src/liballoc/tests/string.rs
+++ b/src/liballoc/tests/string.rs
@@ -132,7 +132,7 @@ fn test_from_utf16() {
        let s_as_utf16 = s.encode_utf16().collect::<Vec<u16>>();
        let u_as_string = String::from_utf16(&u).unwrap();

-        assert!(::std_unicode::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok()));
+        assert!(::core::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok()));
        assert_eq!(s_as_utf16, u);

        assert_eq!(u_as_string, s);