450 lines
16 KiB
Rust
450 lines
16 KiB
Rust
#![unstable(issue = "none", feature = "windows_stdio")]
|
|
|
|
use core::str::utf8_char_width;
|
|
|
|
use super::api::{self, WinError};
|
|
use crate::mem::MaybeUninit;
|
|
use crate::os::windows::io::{FromRawHandle, IntoRawHandle};
|
|
use crate::sys::handle::Handle;
|
|
use crate::sys::{c, cvt};
|
|
use crate::{cmp, io, ptr, str};
|
|
|
|
#[cfg(test)]
|
|
mod tests;
|
|
|
|
// Don't cache handles but get them fresh for every read/write. This allows us to track changes to
|
|
// the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490.
|
|
pub struct Stdin {
|
|
surrogate: u16,
|
|
incomplete_utf8: IncompleteUtf8,
|
|
}
|
|
|
|
pub struct Stdout {
|
|
incomplete_utf8: IncompleteUtf8,
|
|
}
|
|
|
|
pub struct Stderr {
|
|
incomplete_utf8: IncompleteUtf8,
|
|
}
|
|
|
|
// Small fixed-size store for the bytes of a single UTF-8 sequence that is only partly handled.
struct IncompleteUtf8 {
    // Storage; only the first `len` bytes are meaningful.
    bytes: [u8; 4],
    // Number of meaningful bytes at the front of `bytes`.
    len: u8,
}
|
|
|
|
impl IncompleteUtf8 {
|
|
// Implemented for use in Stdin::read.
|
|
fn read(&mut self, buf: &mut [u8]) -> usize {
|
|
// Write to buffer until the buffer is full or we run out of bytes.
|
|
let to_write = cmp::min(buf.len(), self.len as usize);
|
|
buf[..to_write].copy_from_slice(&self.bytes[..to_write]);
|
|
|
|
// Rotate the remaining bytes if not enough remaining space in buffer.
|
|
if usize::from(self.len) > buf.len() {
|
|
self.bytes.copy_within(to_write.., 0);
|
|
self.len -= to_write as u8;
|
|
} else {
|
|
self.len = 0;
|
|
}
|
|
|
|
to_write
|
|
}
|
|
}
|
|
|
|
// Windows apparently copes badly with large reads on stdin and large writes to stdout/stderr
// (details in #13304).
//
// From MSDN (2011): "The storage for this buffer is allocated from a shared heap for the
// process that is 64 KB in size. The maximum size of the buffer will depend on heap usage."
//
// The cap is set to 8 KiB because libuv uses the same value and it has been acceptable so far.
const MAX_BUFFER_SIZE: usize = 8 * 1024;

// The default `BufReader` capacity used for Stdin must hold three times as many bytes as there
// are `u16`s in MAX_BUFFER_SIZE, so that whatever is read can always be decoded completely from
// UTF-16 to UTF-8.
pub const STDIN_BUF_SIZE: usize = (MAX_BUFFER_SIZE / 2) * 3;
|
|
|
|
pub fn get_handle(handle_id: u32) -> io::Result<c::HANDLE> {
|
|
let handle = unsafe { c::GetStdHandle(handle_id) };
|
|
if handle == c::INVALID_HANDLE_VALUE {
|
|
Err(io::Error::last_os_error())
|
|
} else if handle.is_null() {
|
|
Err(io::Error::from_raw_os_error(c::ERROR_INVALID_HANDLE as i32))
|
|
} else {
|
|
Ok(handle)
|
|
}
|
|
}
|
|
|
|
// Reports whether `handle` refers to a Windows Console.
fn is_console(handle: c::HANDLE) -> bool {
    // The reported mode itself is irrelevant; only success or failure matters. `GetConsoleMode`
    // fails (returns 0) for a pipe, so this detects the Windows Console only and not other
    // terminals attached through a pipe, such as MSYS. That is exactly what is wanted, because
    // only the Windows Console needs the conversion to UTF-16.
    let mut ignored_mode = 0;
    let status = unsafe { c::GetConsoleMode(handle, &mut ignored_mode) };
    status != 0
}
|
|
|
|
fn write(handle_id: u32, data: &[u8], incomplete_utf8: &mut IncompleteUtf8) -> io::Result<usize> {
|
|
if data.is_empty() {
|
|
return Ok(0);
|
|
}
|
|
|
|
let handle = get_handle(handle_id)?;
|
|
if !is_console(handle) {
|
|
unsafe {
|
|
let handle = Handle::from_raw_handle(handle);
|
|
let ret = handle.write(data);
|
|
let _ = handle.into_raw_handle(); // Don't close the handle
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
if incomplete_utf8.len > 0 {
|
|
assert!(
|
|
incomplete_utf8.len < 4,
|
|
"Unexpected number of bytes for incomplete UTF-8 codepoint."
|
|
);
|
|
if data[0] >> 6 != 0b10 {
|
|
// not a continuation byte - reject
|
|
incomplete_utf8.len = 0;
|
|
return Err(io::const_error!(
|
|
io::ErrorKind::InvalidData,
|
|
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
|
|
));
|
|
}
|
|
incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0];
|
|
incomplete_utf8.len += 1;
|
|
let char_width = utf8_char_width(incomplete_utf8.bytes[0]);
|
|
if (incomplete_utf8.len as usize) < char_width {
|
|
// more bytes needed
|
|
return Ok(1);
|
|
}
|
|
let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]);
|
|
incomplete_utf8.len = 0;
|
|
match s {
|
|
Ok(s) => {
|
|
assert_eq!(char_width, s.len());
|
|
let written = write_valid_utf8_to_console(handle, s)?;
|
|
assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes
|
|
return Ok(1);
|
|
}
|
|
Err(_) => {
|
|
return Err(io::const_error!(
|
|
io::ErrorKind::InvalidData,
|
|
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
|
|
));
|
|
}
|
|
}
|
|
}
|
|
|
|
// As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
|
|
// which needs to be encoded as UTF-16.
|
|
//
|
|
// If the data is not valid UTF-8 we write out as many bytes as are valid.
|
|
// If the first byte is invalid it is either first byte of a multi-byte sequence but the
|
|
// provided byte slice is too short or it is the first byte of an invalid multi-byte sequence.
|
|
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2);
|
|
let utf8 = match str::from_utf8(&data[..len]) {
|
|
Ok(s) => s,
|
|
Err(ref e) if e.valid_up_to() == 0 => {
|
|
let first_byte_char_width = utf8_char_width(data[0]);
|
|
if first_byte_char_width > 1 && data.len() < first_byte_char_width {
|
|
incomplete_utf8.bytes[0] = data[0];
|
|
incomplete_utf8.len = 1;
|
|
return Ok(1);
|
|
} else {
|
|
return Err(io::const_error!(
|
|
io::ErrorKind::InvalidData,
|
|
"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
|
|
));
|
|
}
|
|
}
|
|
Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
|
|
};
|
|
|
|
write_valid_utf8_to_console(handle, utf8)
|
|
}
|
|
|
|
fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> {
|
|
debug_assert!(!utf8.is_empty());
|
|
|
|
let mut utf16 = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2];
|
|
let utf8 = &utf8[..utf8.floor_char_boundary(utf16.len())];
|
|
|
|
let utf16: &[u16] = unsafe {
|
|
// Note that this theoretically checks validity twice in the (most common) case
|
|
// where the underlying byte sequence is valid utf-8 (given the check in `write()`).
|
|
let result = c::MultiByteToWideChar(
|
|
c::CP_UTF8, // CodePage
|
|
c::MB_ERR_INVALID_CHARS, // dwFlags
|
|
utf8.as_ptr(), // lpMultiByteStr
|
|
utf8.len() as i32, // cbMultiByte
|
|
utf16.as_mut_ptr() as *mut c::WCHAR, // lpWideCharStr
|
|
utf16.len() as i32, // cchWideChar
|
|
);
|
|
assert!(result != 0, "Unexpected error in MultiByteToWideChar");
|
|
|
|
// Safety: MultiByteToWideChar initializes `result` values.
|
|
MaybeUninit::slice_assume_init_ref(&utf16[..result as usize])
|
|
};
|
|
|
|
let mut written = write_u16s(handle, utf16)?;
|
|
|
|
// Figure out how many bytes of as UTF-8 were written away as UTF-16.
|
|
if written == utf16.len() {
|
|
Ok(utf8.len())
|
|
} else {
|
|
// Make sure we didn't end up writing only half of a surrogate pair (even though the chance
|
|
// is tiny). Because it is not possible for user code to re-slice `data` in such a way that
|
|
// a missing surrogate can be produced (and also because of the UTF-8 validation above),
|
|
// write the missing surrogate out now.
|
|
// Buffering it would mean we have to lie about the number of bytes written.
|
|
let first_code_unit_remaining = utf16[written];
|
|
if matches!(first_code_unit_remaining, 0xDCEE..=0xDFFF) {
|
|
// low surrogate
|
|
// We just hope this works, and give up otherwise
|
|
let _ = write_u16s(handle, &utf16[written..written + 1]);
|
|
written += 1;
|
|
}
|
|
// Calculate the number of bytes of `utf8` that were actually written.
|
|
let mut count = 0;
|
|
for ch in utf16[..written].iter() {
|
|
count += match ch {
|
|
0x0000..=0x007F => 1,
|
|
0x0080..=0x07FF => 2,
|
|
0xDCEE..=0xDFFF => 1, // Low surrogate. We already counted 3 bytes for the other.
|
|
_ => 3,
|
|
};
|
|
}
|
|
debug_assert!(String::from_utf16(&utf16[..written]).unwrap() == utf8[..count]);
|
|
Ok(count)
|
|
}
|
|
}
|
|
|
|
fn write_u16s(handle: c::HANDLE, data: &[u16]) -> io::Result<usize> {
|
|
debug_assert!(data.len() < u32::MAX as usize);
|
|
let mut written = 0;
|
|
cvt(unsafe {
|
|
c::WriteConsoleW(handle, data.as_ptr(), data.len() as u32, &mut written, ptr::null_mut())
|
|
})?;
|
|
Ok(written as usize)
|
|
}
|
|
|
|
impl Stdin {
|
|
pub const fn new() -> Stdin {
|
|
Stdin { surrogate: 0, incomplete_utf8: IncompleteUtf8::new() }
|
|
}
|
|
}
|
|
|
|
impl io::Read for Stdin {
|
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
|
let handle = get_handle(c::STD_INPUT_HANDLE)?;
|
|
if !is_console(handle) {
|
|
unsafe {
|
|
let handle = Handle::from_raw_handle(handle);
|
|
let ret = handle.read(buf);
|
|
let _ = handle.into_raw_handle(); // Don't close the handle
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
// If there are bytes in the incomplete utf-8, start with those.
|
|
// (No-op if there is nothing in the buffer.)
|
|
let mut bytes_copied = self.incomplete_utf8.read(buf);
|
|
|
|
if bytes_copied == buf.len() {
|
|
Ok(bytes_copied)
|
|
} else if buf.len() - bytes_copied < 4 {
|
|
// Not enough space to get a UTF-8 byte. We will use the incomplete UTF8.
|
|
let mut utf16_buf = [MaybeUninit::new(0); 1];
|
|
// Read one u16 character.
|
|
let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, 1, &mut self.surrogate)?;
|
|
// Read bytes, using the (now-empty) self.incomplete_utf8 as extra space.
|
|
let read_bytes = utf16_to_utf8(
|
|
unsafe { MaybeUninit::slice_assume_init_ref(&utf16_buf[..read]) },
|
|
&mut self.incomplete_utf8.bytes,
|
|
)?;
|
|
|
|
// Read in the bytes from incomplete_utf8 until the buffer is full.
|
|
self.incomplete_utf8.len = read_bytes as u8;
|
|
// No-op if no bytes.
|
|
bytes_copied += self.incomplete_utf8.read(&mut buf[bytes_copied..]);
|
|
Ok(bytes_copied)
|
|
} else {
|
|
let mut utf16_buf = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2];
|
|
|
|
// In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So
|
|
// we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets
|
|
// lost.
|
|
let amount = cmp::min(buf.len() / 3, utf16_buf.len());
|
|
let read =
|
|
read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?;
|
|
// Safety `read_u16s_fixup_surrogates` returns the number of items
|
|
// initialized.
|
|
let utf16s = unsafe { MaybeUninit::slice_assume_init_ref(&utf16_buf[..read]) };
|
|
match utf16_to_utf8(utf16s, buf) {
|
|
Ok(value) => return Ok(bytes_copied + value),
|
|
Err(e) => return Err(e),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// We assume that if the last `u16` is an unpaired surrogate they got sliced apart by our
|
|
// buffer size, and keep it around for the next read hoping to put them together.
|
|
// This is a best effort, and might not work if we are not the only reader on Stdin.
|
|
fn read_u16s_fixup_surrogates(
|
|
handle: c::HANDLE,
|
|
buf: &mut [MaybeUninit<u16>],
|
|
mut amount: usize,
|
|
surrogate: &mut u16,
|
|
) -> io::Result<usize> {
|
|
// Insert possibly remaining unpaired surrogate from last read.
|
|
let mut start = 0;
|
|
if *surrogate != 0 {
|
|
buf[0] = MaybeUninit::new(*surrogate);
|
|
*surrogate = 0;
|
|
start = 1;
|
|
if amount == 1 {
|
|
// Special case: `Stdin::read` guarantees we can always read at least one new `u16`
|
|
// and combine it with an unpaired surrogate, because the UTF-8 buffer is at least
|
|
// 4 bytes.
|
|
amount = 2;
|
|
}
|
|
}
|
|
let mut amount = read_u16s(handle, &mut buf[start..amount])? + start;
|
|
|
|
if amount > 0 {
|
|
// Safety: The returned `amount` is the number of values initialized,
|
|
// and it is not 0, so we know that `buf[amount - 1]` have been
|
|
// initialized.
|
|
let last_char = unsafe { buf[amount - 1].assume_init() };
|
|
if matches!(last_char, 0xD800..=0xDBFF) {
|
|
// high surrogate
|
|
*surrogate = last_char;
|
|
amount -= 1;
|
|
}
|
|
}
|
|
Ok(amount)
|
|
}
|
|
|
|
// Returns `Ok(n)` if it initialized `n` values in `buf`.
|
|
fn read_u16s(handle: c::HANDLE, buf: &mut [MaybeUninit<u16>]) -> io::Result<usize> {
|
|
// Configure the `pInputControl` parameter to not only return on `\r\n` but also Ctrl-Z, the
|
|
// traditional DOS method to indicate end of character stream / user input (SUB).
|
|
// See #38274 and https://stackoverflow.com/questions/43836040/win-api-readconsole.
|
|
const CTRL_Z: u16 = 0x1A;
|
|
const CTRL_Z_MASK: u32 = 1 << CTRL_Z;
|
|
let input_control = c::CONSOLE_READCONSOLE_CONTROL {
|
|
nLength: crate::mem::size_of::<c::CONSOLE_READCONSOLE_CONTROL>() as u32,
|
|
nInitialChars: 0,
|
|
dwCtrlWakeupMask: CTRL_Z_MASK,
|
|
dwControlKeyState: 0,
|
|
};
|
|
|
|
let mut amount = 0;
|
|
loop {
|
|
cvt(unsafe {
|
|
c::SetLastError(0);
|
|
c::ReadConsoleW(
|
|
handle,
|
|
buf.as_mut_ptr() as *mut core::ffi::c_void,
|
|
buf.len() as u32,
|
|
&mut amount,
|
|
&input_control,
|
|
)
|
|
})?;
|
|
|
|
// ReadConsoleW returns success with ERROR_OPERATION_ABORTED for Ctrl-C or Ctrl-Break.
|
|
// Explicitly check for that case here and try again.
|
|
if amount == 0 && api::get_last_error() == WinError::OPERATION_ABORTED {
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
// Safety: if `amount > 0`, then that many bytes were written, so
|
|
// `buf[amount as usize - 1]` has been initialized.
|
|
if amount > 0 && unsafe { buf[amount as usize - 1].assume_init() } == CTRL_Z {
|
|
amount -= 1;
|
|
}
|
|
Ok(amount as usize)
|
|
}
|
|
|
|
fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> {
|
|
debug_assert!(utf16.len() <= i32::MAX as usize);
|
|
debug_assert!(utf8.len() <= i32::MAX as usize);
|
|
|
|
if utf16.is_empty() {
|
|
return Ok(0);
|
|
}
|
|
|
|
let result = unsafe {
|
|
c::WideCharToMultiByte(
|
|
c::CP_UTF8, // CodePage
|
|
c::WC_ERR_INVALID_CHARS, // dwFlags
|
|
utf16.as_ptr(), // lpWideCharStr
|
|
utf16.len() as i32, // cchWideChar
|
|
utf8.as_mut_ptr(), // lpMultiByteStr
|
|
utf8.len() as i32, // cbMultiByte
|
|
ptr::null(), // lpDefaultChar
|
|
ptr::null_mut(), // lpUsedDefaultChar
|
|
)
|
|
};
|
|
if result == 0 {
|
|
// We can't really do any better than forget all data and return an error.
|
|
Err(io::const_error!(
|
|
io::ErrorKind::InvalidData,
|
|
"Windows stdin in console mode does not support non-UTF-16 input; \
|
|
encountered unpaired surrogate",
|
|
))
|
|
} else {
|
|
Ok(result as usize)
|
|
}
|
|
}
|
|
|
|
impl IncompleteUtf8 {
    // Creates an empty store: zeroed storage and a length of 0.
    pub const fn new() -> IncompleteUtf8 {
        Self { len: 0, bytes: [0; 4] }
    }
}
|
|
|
|
impl Stdout {
|
|
pub const fn new() -> Stdout {
|
|
Stdout { incomplete_utf8: IncompleteUtf8::new() }
|
|
}
|
|
}
|
|
|
|
impl io::Write for Stdout {
|
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8)
|
|
}
|
|
|
|
fn flush(&mut self) -> io::Result<()> {
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
impl Stderr {
|
|
pub const fn new() -> Stderr {
|
|
Stderr { incomplete_utf8: IncompleteUtf8::new() }
|
|
}
|
|
}
|
|
|
|
impl io::Write for Stderr {
|
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8)
|
|
}
|
|
|
|
fn flush(&mut self) -> io::Result<()> {
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
pub fn is_ebadf(err: &io::Error) -> bool {
|
|
err.raw_os_error() == Some(c::ERROR_INVALID_HANDLE as i32)
|
|
}
|
|
|
|
pub fn panic_output() -> Option<impl io::Write> {
|
|
Some(Stderr::new())
|
|
}
|