Skip to content

Commit

Permalink
Rollup merge of rust-lang#115443 - epage:os_str, r=cuviper
Browse files Browse the repository at this point in the history
feat(std): Stabilize 'os_str_bytes' feature

Closes rust-lang#111544
  • Loading branch information
matthiaskrgr authored Sep 2, 2023
2 parents b44bf0d + 30292bb commit 43e1561
Show file tree
Hide file tree
Showing 17 changed files with 91 additions and 96 deletions.
4 changes: 2 additions & 2 deletions library/std/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,8 @@
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
//!
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
//! [`OsStr::from_os_str_bytes_unchecked`].
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_encoded_bytes`] and
//! [`OsStr::from_encoded_bytes_unchecked`].
//!
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
Expand Down
68 changes: 32 additions & 36 deletions library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,36 +154,34 @@ impl OsString {
/// # Safety
///
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same rust version
/// validated UTF-8 and bytes from [`OsStr::as_encoded_bytes`] from within the same rust version
/// built for the same target platform. For example, reconstructing an `OsString` from bytes sent
/// over the network or stored in a file will likely violate these safety rules.
///
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_encoded_bytes`] can be
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
///
/// # Example
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let bytes = os_str.as_encoded_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
/// // SAFETY:
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
/// // - Each `word` only contains content that originated from `OsStr::as_encoded_bytes`
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// unsafe { OsStr::from_encoded_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self {
OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) }
#[stable(feature = "os_str_bytes", since = "CURRENT_RUSTC_VERSION")]
pub unsafe fn from_encoded_bytes_unchecked(bytes: Vec<u8>) -> Self {
OsString { inner: Buf::from_encoded_bytes_unchecked(bytes) }
}

/// Converts to an [`OsStr`] slice.
Expand All @@ -205,7 +203,7 @@ impl OsString {
}

/// Converts the `OsString` into a byte slice. To convert the byte slice back into an
/// `OsString`, use the [`OsStr::from_os_str_bytes_unchecked`] function.
/// `OsString`, use the [`OsStr::from_encoded_bytes_unchecked`] function.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
Expand All @@ -219,9 +217,9 @@ impl OsString {
///
/// [`std::ffi`]: crate::ffi
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn into_os_str_bytes(self) -> Vec<u8> {
self.inner.into_os_str_bytes()
#[stable(feature = "os_str_bytes", since = "CURRENT_RUSTC_VERSION")]
pub fn into_encoded_bytes(self) -> Vec<u8> {
self.inner.into_encoded_bytes()
}

/// Converts the `OsString` into a [`String`] if it contains valid Unicode data.
Expand Down Expand Up @@ -745,36 +743,34 @@ impl OsStr {
/// # Safety
///
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same rust version
/// validated UTF-8 and bytes from [`OsStr::as_encoded_bytes`] from within the same rust version
/// built for the same target platform. For example, reconstructing an `OsStr` from bytes sent
/// over the network or stored in a file will likely violate these safety rules.
///
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_encoded_bytes`] can be
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
///
/// # Example
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let bytes = os_str.as_encoded_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
/// // SAFETY:
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
/// // - Each `word` only contains content that originated from `OsStr::as_encoded_bytes`
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// unsafe { OsStr::from_encoded_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub unsafe fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
#[stable(feature = "os_str_bytes", since = "CURRENT_RUSTC_VERSION")]
pub unsafe fn from_encoded_bytes_unchecked(bytes: &[u8]) -> &Self {
Self::from_inner(Slice::from_encoded_bytes_unchecked(bytes))
}

#[inline]
Expand Down Expand Up @@ -948,7 +944,7 @@ impl OsStr {
}

/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
/// string slice, use the [`OsStr::from_encoded_bytes_unchecked`] function.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
Expand All @@ -962,9 +958,9 @@ impl OsStr {
///
/// [`std::ffi`]: crate::ffi
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn as_os_str_bytes(&self) -> &[u8] {
self.inner.as_os_str_bytes()
#[stable(feature = "os_str_bytes", since = "CURRENT_RUSTC_VERSION")]
pub fn as_encoded_bytes(&self) -> &[u8] {
self.inner.as_encoded_bytes()
}

/// Converts this string to its ASCII lower case equivalent in-place.
Expand Down Expand Up @@ -1270,7 +1266,7 @@ impl Default for &OsStr {
impl PartialEq for OsStr {
#[inline]
fn eq(&self, other: &OsStr) -> bool {
self.as_os_str_bytes().eq(other.as_os_str_bytes())
self.as_encoded_bytes().eq(other.as_encoded_bytes())
}
}

Expand All @@ -1297,23 +1293,23 @@ impl Eq for OsStr {}
impl PartialOrd for OsStr {
#[inline]
fn partial_cmp(&self, other: &OsStr) -> Option<cmp::Ordering> {
self.as_os_str_bytes().partial_cmp(other.as_os_str_bytes())
self.as_encoded_bytes().partial_cmp(other.as_encoded_bytes())
}
#[inline]
fn lt(&self, other: &OsStr) -> bool {
self.as_os_str_bytes().lt(other.as_os_str_bytes())
self.as_encoded_bytes().lt(other.as_encoded_bytes())
}
#[inline]
fn le(&self, other: &OsStr) -> bool {
self.as_os_str_bytes().le(other.as_os_str_bytes())
self.as_encoded_bytes().le(other.as_encoded_bytes())
}
#[inline]
fn gt(&self, other: &OsStr) -> bool {
self.as_os_str_bytes().gt(other.as_os_str_bytes())
self.as_encoded_bytes().gt(other.as_encoded_bytes())
}
#[inline]
fn ge(&self, other: &OsStr) -> bool {
self.as_os_str_bytes().ge(other.as_os_str_bytes())
self.as_encoded_bytes().ge(other.as_encoded_bytes())
}
}

Expand All @@ -1332,7 +1328,7 @@ impl PartialOrd<str> for OsStr {
impl Ord for OsStr {
#[inline]
fn cmp(&self, other: &OsStr) -> cmp::Ordering {
self.as_os_str_bytes().cmp(other.as_os_str_bytes())
self.as_encoded_bytes().cmp(other.as_encoded_bytes())
}
}

Expand Down Expand Up @@ -1382,7 +1378,7 @@ impl_cmp!(Cow<'a, OsStr>, OsString);
impl Hash for OsStr {
#[inline]
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_os_str_bytes().hash(state)
self.as_encoded_bytes().hash(state)
}
}

Expand Down
34 changes: 17 additions & 17 deletions library/std/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ impl<'a> Prefix<'a> {
fn len(&self) -> usize {
use self::Prefix::*;
fn os_str_len(s: &OsStr) -> usize {
s.as_os_str_bytes().len()
s.as_encoded_bytes().len()
}
match *self {
Verbatim(x) => 4 + os_str_len(x),
Expand Down Expand Up @@ -316,31 +316,31 @@ fn has_physical_root(s: &[u8], prefix: Option<Prefix<'_>>) -> bool {

// basic workhorse for splitting stem and extension
fn rsplit_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) {
if file.as_os_str_bytes() == b".." {
if file.as_encoded_bytes() == b".." {
return (Some(file), None);
}

// The unsafety here stems from converting between &OsStr and &[u8]
// and back. This is safe to do because (1) we only look at ASCII
// contents of the encoding and (2) new &OsStr values are produced
// only from ASCII-bounded slices of existing &OsStr values.
let mut iter = file.as_os_str_bytes().rsplitn(2, |b| *b == b'.');
let mut iter = file.as_encoded_bytes().rsplitn(2, |b| *b == b'.');
let after = iter.next();
let before = iter.next();
if before == Some(b"") {
(Some(file), None)
} else {
unsafe {
(
before.map(|s| OsStr::from_os_str_bytes_unchecked(s)),
after.map(|s| OsStr::from_os_str_bytes_unchecked(s)),
before.map(|s| OsStr::from_encoded_bytes_unchecked(s)),
after.map(|s| OsStr::from_encoded_bytes_unchecked(s)),
)
}
}
}

fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
let slice = file.as_os_str_bytes();
let slice = file.as_encoded_bytes();
if slice == b".." {
return (file, None);
}
Expand All @@ -357,8 +357,8 @@ fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
let after = &slice[i + 1..];
unsafe {
(
OsStr::from_os_str_bytes_unchecked(before),
Some(OsStr::from_os_str_bytes_unchecked(after)),
OsStr::from_encoded_bytes_unchecked(before),
Some(OsStr::from_encoded_bytes_unchecked(after)),
)
}
}
Expand Down Expand Up @@ -739,7 +739,7 @@ impl<'a> Components<'a> {
// separately via `include_cur_dir`
b".." => Some(Component::ParentDir),
b"" => None,
_ => Some(Component::Normal(unsafe { OsStr::from_os_str_bytes_unchecked(comp) })),
_ => Some(Component::Normal(unsafe { OsStr::from_encoded_bytes_unchecked(comp) })),
}
}

Expand Down Expand Up @@ -896,7 +896,7 @@ impl<'a> Iterator for Components<'a> {
let raw = &self.path[..self.prefix_len()];
self.path = &self.path[self.prefix_len()..];
return Some(Component::Prefix(PrefixComponent {
raw: unsafe { OsStr::from_os_str_bytes_unchecked(raw) },
raw: unsafe { OsStr::from_encoded_bytes_unchecked(raw) },
parsed: self.prefix.unwrap(),
}));
}
Expand Down Expand Up @@ -968,7 +968,7 @@ impl<'a> DoubleEndedIterator for Components<'a> {
State::Prefix if self.prefix_len() > 0 => {
self.back = State::Done;
return Some(Component::Prefix(PrefixComponent {
raw: unsafe { OsStr::from_os_str_bytes_unchecked(self.path) },
raw: unsafe { OsStr::from_encoded_bytes_unchecked(self.path) },
parsed: self.prefix.unwrap(),
}));
}
Expand Down Expand Up @@ -1477,17 +1477,17 @@ impl PathBuf {
fn _set_extension(&mut self, extension: &OsStr) -> bool {
let file_stem = match self.file_stem() {
None => return false,
Some(f) => f.as_os_str_bytes(),
Some(f) => f.as_encoded_bytes(),
};

// truncate until right after the file stem
let end_file_stem = file_stem[file_stem.len()..].as_ptr().addr();
let start = self.inner.as_os_str_bytes().as_ptr().addr();
let start = self.inner.as_encoded_bytes().as_ptr().addr();
let v = self.as_mut_vec();
v.truncate(end_file_stem.wrapping_sub(start));

// add the new extension, if any
let new = extension.as_os_str_bytes();
let new = extension.as_encoded_bytes();
if !new.is_empty() {
v.reserve_exact(new.len() + 1);
v.push(b'.');
Expand Down Expand Up @@ -2007,11 +2007,11 @@ impl Path {
// The following (private!) function allows construction of a path from a u8
// slice, which is only safe when it is known to follow the OsStr encoding.
unsafe fn from_u8_slice(s: &[u8]) -> &Path {
unsafe { Path::new(OsStr::from_os_str_bytes_unchecked(s)) }
unsafe { Path::new(OsStr::from_encoded_bytes_unchecked(s)) }
}
// The following (private!) function reveals the byte encoding used for OsStr.
fn as_u8_slice(&self) -> &[u8] {
self.inner.as_os_str_bytes()
self.inner.as_encoded_bytes()
}

/// Directly wraps a string slice as a `Path` slice.
Expand Down Expand Up @@ -2609,7 +2609,7 @@ impl Path {

fn _with_extension(&self, extension: &OsStr) -> PathBuf {
let self_len = self.as_os_str().len();
let self_bytes = self.as_os_str().as_os_str_bytes();
let self_bytes = self.as_os_str().as_encoded_bytes();

let (new_capacity, slice_to_copy) = match self.extension() {
None => {
Expand Down
2 changes: 1 addition & 1 deletion library/std/src/sys/common/small_c_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub fn run_path_with_cstr<T, F>(path: &Path, f: F) -> io::Result<T>
where
F: FnOnce(&CStr) -> io::Result<T>,
{
run_with_cstr(path.as_os_str().as_os_str_bytes(), f)
run_with_cstr(path.as_os_str().as_encoded_bytes(), f)
}

#[inline]
Expand Down
4 changes: 2 additions & 2 deletions library/std/src/sys/common/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use core::iter::repeat;
fn stack_allocation_works() {
let path = Path::new("abc");
let result = run_path_with_cstr(path, |p| {
assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
assert_eq!(p, &*CString::new(path.as_os_str().as_encoded_bytes()).unwrap());
Ok(42)
});
assert_eq!(result.unwrap(), 42);
Expand All @@ -25,7 +25,7 @@ fn heap_allocation_works() {
let path = repeat("a").take(384).collect::<String>();
let path = Path::new(&path);
let result = run_path_with_cstr(path, |p| {
assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
assert_eq!(p, &*CString::new(path.as_os_str().as_encoded_bytes()).unwrap());
Ok(42)
});
assert_eq!(result.unwrap(), 42);
Expand Down
10 changes: 5 additions & 5 deletions library/std/src/sys/unix/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ impl AsInner<[u8]> for Buf {

impl Buf {
#[inline]
pub fn into_os_str_bytes(self) -> Vec<u8> {
pub fn into_encoded_bytes(self) -> Vec<u8> {
self.inner
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
pub unsafe fn from_encoded_bytes_unchecked(s: Vec<u8>) -> Self {
Self { inner: s }
}

Expand Down Expand Up @@ -203,18 +203,18 @@ impl Buf {

impl Slice {
#[inline]
pub fn as_os_str_bytes(&self) -> &[u8] {
pub fn as_encoded_bytes(&self) -> &[u8] {
&self.inner
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
pub unsafe fn from_encoded_bytes_unchecked(s: &[u8]) -> &Slice {
unsafe { mem::transmute(s) }
}

#[inline]
pub fn from_str(s: &str) -> &Slice {
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
unsafe { Slice::from_encoded_bytes_unchecked(s.as_bytes()) }
}

pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> {
Expand Down
Loading

0 comments on commit 43e1561

Please sign in to comment.