-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[libc++] Allows any types of size 4 and 8 to use native platform ulock_wait #161086
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-libcxx Author: Hui (huixie90) ChangesPatch is 21.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161086.diff 6 Files Affected:
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index b424427e65c33..489336688b090 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -212,6 +212,8 @@ struct __atomic_base<_Tp, true> : public __atomic_base<_Tp, false> {
// __atomic_base<int, false>. So specializing __atomic_base<_Tp> does not work
template <class _Tp, bool _IsIntegral>
struct __atomic_waitable_traits<__atomic_base<_Tp, _IsIntegral> > {
+ using __inner_type _LIBCPP_NODEBUG = _Tp;
+
static _LIBCPP_HIDE_FROM_ABI _Tp __atomic_load(const __atomic_base<_Tp, _IsIntegral>& __a, memory_order __order) {
return __a.load(__order);
}
diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index 5cc6fb0c55d09..f0fcffc9b0b07 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -82,6 +82,8 @@ struct atomic_flag {
template <>
struct __atomic_waitable_traits<atomic_flag> {
+ using __inner_type _LIBCPP_NODEBUG = _LIBCPP_ATOMIC_FLAG_TYPE;
+
static _LIBCPP_HIDE_FROM_ABI _LIBCPP_ATOMIC_FLAG_TYPE __atomic_load(const atomic_flag& __a, memory_order __order) {
return std::__cxx_atomic_load(&__a.__a_, __order);
}
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
index 9bdc6b1160d2c..4da7c208a9268 100644
--- a/libcxx/include/__atomic/atomic_ref.h
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -230,6 +230,8 @@ struct __atomic_ref_base {
template <class _Tp>
struct __atomic_waitable_traits<__atomic_ref_base<_Tp>> {
+ using __inner_type _LIBCPP_NODEBUG = _Tp;
+
static _LIBCPP_HIDE_FROM_ABI _Tp __atomic_load(const __atomic_ref_base<_Tp>& __a, memory_order __order) {
return __a.load(__order);
}
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index 0dae448d649be..d3574c69d4a46 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -38,6 +38,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// The below implementations look ugly to support C++03
template <class _Tp, class = void>
struct __atomic_waitable_traits {
+ using __inner_type _LIBCPP_NODEBUG = void;
+
template <class _AtomicWaitable>
static void __atomic_load(_AtomicWaitable&&, memory_order) = delete;
@@ -58,21 +60,26 @@ struct __atomic_waitable< _Tp,
#if _LIBCPP_STD_VER >= 20
# if _LIBCPP_HAS_THREADS
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_one(void const volatile*) _NOEXCEPT;
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile*) _NOEXCEPT;
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t
-__libcpp_atomic_monitor(void const volatile*) _NOEXCEPT;
+template <std::size_t _Size>
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void
-__libcpp_atomic_wait(void const volatile*, __cxx_contention_t) _NOEXCEPT;
+__libcpp_atomic_wait_native(void const volatile* __address, void const volatile* __old_value) _NOEXCEPT;
+
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t
+__libcpp_atomic_monitor_global(void const volatile* __address) _NOEXCEPT;
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void
-__cxx_atomic_notify_one(__cxx_atomic_contention_t const volatile*) _NOEXCEPT;
+__libcpp_atomic_wait_global_table(void const volatile* __address, __cxx_contention_t __monitor_value) _NOEXCEPT;
+
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_one_global_table(void const volatile*) _NOEXCEPT;
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all_global_table(void const volatile*) _NOEXCEPT;
+
+template <std::size_t _Size>
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void
-__cxx_atomic_notify_all(__cxx_atomic_contention_t const volatile*) _NOEXCEPT;
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t
-__libcpp_atomic_monitor(__cxx_atomic_contention_t const volatile*) _NOEXCEPT;
+__cxx_atomic_notify_one_native(const volatile void*) _NOEXCEPT;
+
+template <std::size_t _Size>
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void
-__libcpp_atomic_wait(__cxx_atomic_contention_t const volatile*, __cxx_contention_t) _NOEXCEPT;
+__cxx_atomic_notify_all_native(const volatile void*) _NOEXCEPT;
template <class _AtomicWaitable, class _Poll>
struct __atomic_wait_backoff_impl {
@@ -81,38 +88,25 @@ struct __atomic_wait_backoff_impl {
memory_order __order_;
using __waitable_traits _LIBCPP_NODEBUG = __atomic_waitable_traits<__decay_t<_AtomicWaitable> >;
-
- _LIBCPP_AVAILABILITY_SYNC
- _LIBCPP_HIDE_FROM_ABI bool
- __update_monitor_val_and_poll(__cxx_atomic_contention_t const volatile*, __cxx_contention_t& __monitor_val) const {
- // In case the contention type happens to be __cxx_atomic_contention_t, i.e. __cxx_atomic_impl<int64_t>,
- // the platform wait is directly monitoring the atomic value itself.
- // `__poll_` takes the current value of the atomic as an in-out argument
- // to potentially modify it. After it returns, `__monitor` has a value
- // which can be safely waited on by `std::__libcpp_atomic_wait` without any
- // ABA style issues.
- __monitor_val = __waitable_traits::__atomic_load(__a_, __order_);
- return __poll_(__monitor_val);
- }
-
- _LIBCPP_AVAILABILITY_SYNC
- _LIBCPP_HIDE_FROM_ABI bool
- __update_monitor_val_and_poll(void const volatile* __contention_address, __cxx_contention_t& __monitor_val) const {
- // In case the contention type is anything else, platform wait is monitoring a __cxx_atomic_contention_t
- // from the global pool, the monitor comes from __libcpp_atomic_monitor
- __monitor_val = std::__libcpp_atomic_monitor(__contention_address);
- auto __current_val = __waitable_traits::__atomic_load(__a_, __order_);
- return __poll_(__current_val);
- }
+ using __inner_type _LIBCPP_NODEBUG = typename __waitable_traits::__inner_type;
_LIBCPP_AVAILABILITY_SYNC
_LIBCPP_HIDE_FROM_ABI bool operator()(chrono::nanoseconds __elapsed) const {
if (__elapsed > chrono::microseconds(4)) {
auto __contention_address = __waitable_traits::__atomic_contention_address(__a_);
- __cxx_contention_t __monitor_val;
- if (__update_monitor_val_and_poll(__contention_address, __monitor_val))
- return true;
- std::__libcpp_atomic_wait(__contention_address, __monitor_val);
+
+ if constexpr (__is_atomic_wait_native_type<__inner_type>::value) {
+ auto __atomic_value = __waitable_traits::__atomic_load(__a_, __order_);
+ if (__poll_(__atomic_value))
+ return true;
+ std::__libcpp_atomic_wait_native<sizeof(__inner_type)>(__contention_address, &__atomic_value);
+ } else {
+ __cxx_contention_t __monitor_val = std::__libcpp_atomic_monitor_global(__contention_address);
+ auto __atomic_value = __waitable_traits::__atomic_load(__a_, __order_);
+ if (__poll_(__atomic_value))
+ return true;
+ std::__libcpp_atomic_wait_global_table(__contention_address, __monitor_val);
+ }
} else {
} // poll
return false;
@@ -144,13 +138,23 @@ __atomic_wait_unless(const _AtomicWaitable& __a, memory_order __order, _Poll&& _
template <class _AtomicWaitable>
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void __atomic_notify_one(const _AtomicWaitable& __a) {
static_assert(__atomic_waitable<_AtomicWaitable>::value, "");
- std::__cxx_atomic_notify_one(__atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__atomic_contention_address(__a));
+ using __inner_type _LIBCPP_NODEBUG = typename __atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__inner_type;
+ if constexpr (__is_atomic_wait_native_type<__inner_type>::value) {
+ std::__cxx_atomic_notify_one_native<sizeof(__inner_type)>(__atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__atomic_contention_address(__a));
+ } else {
+ std::__cxx_atomic_notify_one_global_table(__atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__atomic_contention_address(__a));
+ }
}
template <class _AtomicWaitable>
_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void __atomic_notify_all(const _AtomicWaitable& __a) {
static_assert(__atomic_waitable<_AtomicWaitable>::value, "");
- std::__cxx_atomic_notify_all(__atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__atomic_contention_address(__a));
+ using __inner_type _LIBCPP_NODEBUG = typename __atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__inner_type;
+ if constexpr (__is_atomic_wait_native_type<__inner_type>::value) {
+ std::__cxx_atomic_notify_all_native<sizeof(__inner_type)>(__atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__atomic_contention_address(__a));
+ } else {
+ std::__cxx_atomic_notify_all_global_table(__atomic_waitable_traits<__decay_t<_AtomicWaitable> >::__atomic_contention_address(__a));
+ }
}
# else // _LIBCPP_HAS_THREADS
diff --git a/libcxx/include/__atomic/contention_t.h b/libcxx/include/__atomic/contention_t.h
index 5b42a0125f875..bf14d076d6281 100644
--- a/libcxx/include/__atomic/contention_t.h
+++ b/libcxx/include/__atomic/contention_t.h
@@ -11,6 +11,10 @@
#include <__atomic/support.h>
#include <__config>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/integral_constant.h>
+#include <__type_traits/is_integral.h>
+#include <cstddef>
#include <cstdint>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -19,10 +23,23 @@
_LIBCPP_BEGIN_NAMESPACE_STD
+template <class _Tp, class = void>
+struct __is_atomic_wait_native_type : false_type {};
+
#if defined(__linux__) || (defined(_AIX) && !defined(__64BIT__))
using __cxx_contention_t _LIBCPP_NODEBUG = int32_t;
+
+template <class _Tp>
+struct __is_atomic_wait_native_type<_Tp, __enable_if_t<is_integral<_Tp>::value && sizeof(_Tp) == 4> > : true_type {};
+
#else
using __cxx_contention_t _LIBCPP_NODEBUG = int64_t;
+
+template <class _Tp>
+struct __is_atomic_wait_native_type<_Tp,
+ __enable_if_t<is_integral<_Tp>::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8)> >
+ : true_type {};
+
#endif // __linux__ || (_AIX && !__64BIT__)
using __cxx_atomic_contention_t _LIBCPP_NODEBUG = __cxx_atomic_impl<__cxx_contention_t>;
diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp
index b214ba1fd11c0..c5f03acdf1da1 100644
--- a/libcxx/src/atomic.cpp
+++ b/libcxx/src/atomic.cpp
@@ -9,6 +9,7 @@
#include <__thread/timed_backoff_policy.h>
#include <atomic>
#include <climits>
+#include <cstddef>
#include <functional>
#include <thread>
@@ -53,6 +54,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD
#ifdef __linux__
+
+// TODO : update
static void
__libcpp_platform_wait_on_address(__cxx_atomic_contention_t const volatile* __ptr, __cxx_contention_t __val) {
static constexpr timespec __timeout = {2, 0};
@@ -70,22 +73,32 @@ extern "C" int __ulock_wait(
extern "C" int __ulock_wake(uint32_t operation, void* addr, uint64_t wake_value);
// https://github.com/apple/darwin-xnu/blob/2ff845c2e033bd0ff64b5b6aa6063a1f8f65aa32/bsd/sys/ulock.h#L82
+# define UL_COMPARE_AND_WAIT 1
# define UL_COMPARE_AND_WAIT64 5
# define ULF_WAKE_ALL 0x00000100
-static void
-__libcpp_platform_wait_on_address(__cxx_atomic_contention_t const volatile* __ptr, __cxx_contention_t __val) {
- static_assert(sizeof(__cxx_atomic_contention_t) == 8, "Waiting on 8 bytes value");
- __ulock_wait(UL_COMPARE_AND_WAIT64, const_cast<__cxx_atomic_contention_t*>(__ptr), __val, 0);
+template <std::size_t _Size>
+static void __libcpp_platform_wait_on_address(void const volatile* __ptr, void const volatile* __val) {
+ static_assert(_Size == 8 || _Size == 4, "Can only wait on 8 bytes or 4 bytes value");
+ if constexpr (_Size == 4)
+ __ulock_wait(UL_COMPARE_AND_WAIT, const_cast<void*>(__ptr), *reinterpret_cast<uint32_t const volatile*>(__val), 0);
+ else
+ __ulock_wait(
+ UL_COMPARE_AND_WAIT64, const_cast<void*>(__ptr), *reinterpret_cast<uint64_t const volatile*>(__val), 0);
}
-static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const volatile* __ptr, bool __notify_one) {
- static_assert(sizeof(__cxx_atomic_contention_t) == 8, "Waking up on 8 bytes value");
- __ulock_wake(
- UL_COMPARE_AND_WAIT64 | (__notify_one ? 0 : ULF_WAKE_ALL), const_cast<__cxx_atomic_contention_t*>(__ptr), 0);
+template <std::size_t _Size>
+static void __libcpp_platform_wake_by_address(void const volatile* __ptr, bool __notify_one) {
+ static_assert(_Size == 8 || _Size == 4, "Can only wake up on 8 bytes or 4 bytes value");
+
+ if constexpr (_Size == 4)
+ __ulock_wake(UL_COMPARE_AND_WAIT | (__notify_one ? 0 : ULF_WAKE_ALL), const_cast<void*>(__ptr), 0);
+ else
+ __ulock_wake(UL_COMPARE_AND_WAIT64 | (__notify_one ? 0 : ULF_WAKE_ALL), const_cast<void*>(__ptr), 0);
}
#elif defined(__FreeBSD__) && __SIZEOF_LONG__ == 8
+// TODO : update
/*
* Since __cxx_contention_t is int64_t even on 32bit FreeBSD
* platforms, we have to use umtx ops that work on the long type, and
@@ -104,6 +117,7 @@ static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const vo
#else // <- Add other operating systems here
// Baseline is just a timed backoff
+// TODO : update
static void
__libcpp_platform_wait_on_address(__cxx_atomic_contention_t const volatile* __ptr, __cxx_contention_t __val) {
@@ -128,83 +142,115 @@ static __libcpp_contention_table_entry __libcpp_contention_table[__libcpp_conten
static hash<void const volatile*> __libcpp_contention_hasher;
-static __libcpp_contention_table_entry* __libcpp_contention_state(void const volatile* p) {
+static __libcpp_contention_table_entry* __get_global_contention_state(void const volatile* p) {
return &__libcpp_contention_table[__libcpp_contention_hasher(p) & (__libcpp_contention_table_size - 1)];
}
/* Given an atomic to track contention and an atomic to actually wait on, which may be
the same atomic, we try to detect contention to avoid spuriously calling the platform. */
-static void __libcpp_contention_notify(__cxx_atomic_contention_t volatile* __contention_state,
- __cxx_atomic_contention_t const volatile* __platform_state,
+template <std::size_t _Size>
+static void __libcpp_contention_notify(__cxx_atomic_contention_t volatile* __global_contention_state,
+ void const volatile* __address_to_notify,
bool __notify_one) {
- if (0 != __cxx_atomic_load(__contention_state, memory_order_seq_cst))
+ if (0 != __cxx_atomic_load(__global_contention_state, memory_order_seq_cst))
// We only call 'wake' if we consumed a contention bit here.
- __libcpp_platform_wake_by_address(__platform_state, __notify_one);
-}
-static __cxx_contention_t
-__libcpp_contention_monitor_for_wait(__cxx_atomic_contention_t volatile* /*__contention_state*/,
- __cxx_atomic_contention_t const volatile* __platform_state) {
- // We will monitor this value.
- return __cxx_atomic_load(__platform_state, memory_order_acquire);
+ __libcpp_platform_wake_by_address<_Size>(__address_to_notify, __notify_one);
}
+
+template <std::size_t _Size>
static void __libcpp_contention_wait(__cxx_atomic_contention_t volatile* __contention_state,
- __cxx_atomic_contention_t const volatile* __platform_state,
- __cxx_contention_t __old_value) {
+ void const volatile* __address_to_wait,
+ void const volatile* __old_value) {
__cxx_atomic_fetch_add(__contention_state, __cxx_contention_t(1), memory_order_relaxed);
// https://llvm.org/PR109290
// There are no platform guarantees of a memory barrier in the platform wait implementation
__cxx_atomic_thread_fence(memory_order_seq_cst);
// We sleep as long as the monitored value hasn't changed.
- __libcpp_platform_wait_on_address(__platform_state, __old_value);
+ __libcpp_platform_wait_on_address<_Size>(__address_to_wait, __old_value);
__cxx_atomic_fetch_sub(__contention_state, __cxx_contention_t(1), memory_order_release);
}
/* When the incoming atomic is the wrong size for the platform wait size, need to
launder the value sequence through an atomic from our table. */
-static void __libcpp_atomic_notify(void const volatile* __location) {
- auto const __entry = __libcpp_contention_state(__location);
+static void __atomic_notify_global_table(void const volatile* __location) {
+ auto const __entry = __get_global_contention_state(__location);
// The value sequence laundering happens on the next line below.
__cxx_atomic_fetch_add(&__entry->__platform_state, __cxx_contention_t(1), memory_order_seq_cst);
- __libcpp_contention_notify(
+ __libcpp_contention_notify<sizeof(__cxx_atomic_contention_t)>(
&__entry->__contention_state,
&__entry->__platform_state,
false /* when laundering, we can't handle notify_one */);
}
-_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_one(void const volatile* __location) noexcept {
- __libcpp_atomic_notify(__location);
+
+_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor_global(void const volatile* __location) noexcept {
+ auto const __entry = __get_global_contention_state(__location);
+ return __cxx_atomic_load(&__entry->__platform_state, memory_order_acquire);
}
-_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(void const volatile* __location) noexcept {
- __libcpp_atomic_notify(__location);
+
+_LIBCPP_EXPORTED_FROM_ABI void
+__libcpp_atomic_wait_global_table(void const volatile* __location, __cxx_contention_t __old_value) noexcept {
+ auto const __entry = __get_global_contention_state(__location);
+ __libcpp_contention_wait<sizeof(__cxx_atomic_contention_t)>(
+ &__entry->__contention_state, &__entry->__platform_state, &__old_value);
}
-_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t __libcpp_atomic_monitor(void const volatile* __location) noexcept {
- auto const __entry = __libcpp_contention_state(__location);
- return __libcpp_contention_monitor_for_wait(&__entry->__contention_state, &__entry->__platform_state);
+
+template <std::size_t _Size>
+_LIBCPP_AVAILABILITY_SYNC _LIBCPP_EXPORTED_FROM_ABI void
+__libcpp_atomic_wait_native(void const volatile* __address, void const volatile* __old_value) noexcept {
+ __libcpp_contention_wait<_Size>(
+ &__get_global_contention_state(__address)->__contention_state, __address, __old_value);
}
-_LIBCPP_EXPORTED_FROM_ABI void
-__libcpp_atomic_wait(void const volatile* __location, __cxx_contention_t __old_value) noexcept {
- auto const __entry = __libcpp_contention_state(__location);
- __libcpp_contention_wait(&__entry->__contention_state, &__entry->__platform_state, __old_value);
+
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_one_global_table(void const volatile* __location) noexcept {
+ __atomic_notify_global_table(__location);
+}
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all_global_table(void const volatile* __location) noexcept {
+ __atomic_notify_global_table(__location);
}
/* When the incoming atomic happens to be the platform wait size, we still need to use the
table for the contention detection, but we can use the atomic directly for the wait. */
-_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_one(__cxx_atomic_contention_t const volatile* __location) noexcept {
- __libcpp_contention_notify(&__libcpp_contention_state(__location)->__contention_state, __location, true);
-}
-_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_all(__cxx_atomic_contention_t const volatile* __location) noexcept {
- __libcpp_contention_notify(&__libcpp_contention_state(__location)->__contention_state, __location, false);
+template <std::size_t _Size>
+_LIBCPP_EXPORTED_FROM_ABI void __cxx_atomic_notify_one_native(void const volatile* __location) noexcept {
+ __libcpp_contention_notify<_Size>(&__get_global_contention_state(__location)->__contention_state, __location, true);
}
-// This function is never used, but still exported for ABI compatibility.
-_LIBCPP_EXPORTED_FROM_ABI __cxx_contention_t
-__libcpp_atomic_monitor(__cxx_atomic_contention_t const volatile* __location) noexcept {
- return __libcpp_contention_monitor_for_wait(&__libcpp_contention...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
ldionne
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CC @ogiroux in case you are curious about this patch
187e12d to
40b2e5c
Compare
e650e22 to
63bb534
Compare
| #if defined(__APPLE__) && defined(__aarch64__) | ||
| constexpr size_t __cache_line_size = 128; | ||
| #else | ||
| constexpr size_t __cache_line_size = 64; | ||
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's use the interference_size feature. Also, let's un-disable https://github.com/llvm/llvm-project/blob/main/libcxx/test/std/language.support/support.dynamic/hardware_inference_size.compile.pass.cpp#L10. Can you do that as a separate patch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
uses the hardware_destructive_interference_size now if it is defined. but i am not quite happy about the number. here on my M4 cpu
hw.cachelinesize: 128
std::hardware_destructive_interference_size == 256
std::hardware_constructive_interference_size == 64
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
and clang documented the values are unstable
https://clang.llvm.org/docs/LanguageExtensions.html#gcc-destructive-size-and-gcc-constructive-size
I don't feel very comfortable to use them , even though we don't have ABI problems here (since it is purely in dylib).
Co-authored-by: Louis Dionne <[email protected]>
Co-authored-by: Louis Dionne <[email protected]>
Co-authored-by: Louis Dionne <[email protected]>
Co-authored-by: Louis Dionne <[email protected]>
Co-authored-by: Louis Dionne <[email protected]>
Co-authored-by: Louis Dionne <[email protected]>
Co-authored-by: Louis Dionne <[email protected]>
c1a6abd to
94a5d68
Compare
ea65024 to
859a7e4
Compare
859a7e4 to
3959e16
Compare
ldionne
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a really nice change, thanks for being diligent about handling this tricky ABI situation!
|
I am going to pause the merge of this. I found that on the trunk, the apple back deployment seems already been problematic if I add a stress test like this #169177 |
|
|
||
| // UNSUPPORTED: no-threads | ||
| // UNSUPPORTED: c++03, c++11, c++14, c++17 | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After investigating this with Hui, we were basically not able to reproduce this issue when back-deploying to macOS 26.1. We were also unable to reproduce this issue when back-deploying to macOS 15.6 on Hui's system, which is strange since the Github runners are using macOS 15.7 (and both have the same libc++.dylib). Hence, it seems like this only reproduces on Github runners. Consequently, I suggest we still add the test but mark it as UNSUPPORTED with a TODO when back-deploying on macOS 15. At the moment, I am not convinced of the value of investigating this failure all the way through: it could be extremely time consuming and might not even be something that actually happens on Apple hardware (I'm not familiar with Github-hosted runners but I'm not sure they use actual Apple hardware).
TLDR, this seems like a reasonable approach for now:
// TODO: This test (unreliably) fails when back-deploying to macOS 15. However,
// we've only managed to observe the failure on the Github-provided CI
// runners, which is suspicious.
// UNSUPPORTED: stdlib=system && target={{.+}}-apple-macosx15{{.*}}
Also note that when we update to macOS 26 in the backdeployment CI, this test is going to start failing (if there's indeed a bug) -- at that point we might conclude that it wasn't a fluke and perhaps investigate this further.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I finally got to the bottom (sort of) of the failure.
First of all, this timeout was not due to the lost wake up. It was purely because the test is extremely slow on these runners. The test had 1 notify thread, 8 waiting threads, and the main thread. It was likely overloading the runner machine. I also saw that the trunk (non back deployment) version also failed once. I have put some debug print on the notifying thread. In the CI runner, the notifying thread was slowly making progress, but eventually did not make to the final iterations before the timeout killer killed the whole process. See the log here
I then tried to figure out why the tests were super slow on the runner (>1500s) , whereas it only took 1s on our own machines. I noticed that I have put a std::this_thread::yield in the notifier's loop in the test. I remember that macOS would put the yield thread on the lowest priority on the scheduler. If the machine was overloaded with other running jobs, our notifier thread would have taken long time to resume again. So as an experiment, I removed the yield from the test, and ran it again
while (waiter_ready.load() < num_waiters) {
- std::this_thread::yield();
}
We can see that without yield, the test only took 14s to finish on these runners. See log here (btw, i made the test always fail in order to see the logs)
So my final fix is that we don't need to disable the back deployment for this test. We can simply remove the yield from the test and the CI is green now
On the separate note, I am fully convinced that we should remove yield everywhere as it made the test 100x slower. I noticed that in the log the slowest test was
733.55s: apple-libc++-system.cfg.in :: std/thread/thread.semaphore/lost_wakeup.pass.cpp
In this test, we have a std::barrier and std::barrier's wait function uses __libcpp_timed_backoff_policy, which has a __libcpp_thread_yield in it. We have not removed it yet though it was discussed in #123855
I think it is time to get rid of it now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wow, thanks a lot for the analysis. And I strongly agree with your conclusion that we should remove that yield now.
6c476d6 to
6c14b2a
Compare
6c14b2a to
f3e5805
Compare
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/169/builds/17770 Here is the relevant piece of the build log for the reference |
…k_wait (llvm#161086) This is to address llvm#146145 The issue before was that, for `std::atomic::wait/notify`, we only support `uint64_t` to go through the native `ulock_wait` directly. Any other types will go through the global contention table's `atomic`, increasing the chances of spurious wakeup. This PR tries to allow any types that are of size 4 or 8 to directly go to the `ulock_wait`. This PR is just proof of concept. If we like this idea, I can go further to update the Linux/FreeBSD branch and add ABI macros so the existing behaviours are reserved under the stable ABI Here are some benchmark results ``` Benchmark Time CPU Time Old Time New CPU Old CPU New ---------------------------------------------------------------------------------------------------------------------------------------------------- BM_stop_token_single_thread_reg_unreg_callback/1024 -0.1113 -0.1165 51519 45785 51397 45408 BM_stop_token_single_thread_reg_unreg_callback/4096 -0.2727 -0.1447 249685 181608 211865 181203 BM_stop_token_single_thread_reg_unreg_callback/65536 -0.1241 -0.1237 3308930 2898396 3300986 2892608 BM_stop_token_single_thread_reg_unreg_callback/262144 +0.0335 -0.1920 13237682 13681632 13208849 10673254 OVERALL_GEOMEAN -0.1254 -0.1447 0 0 0 0 ``` ``` Benchmark Time CPU Time Old Time New CPU Old CPU New ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- BM_1_atomic_1_waiter_1_notifier<KeepNotifying, NumHighPrioTasks<0>>/65536 -0.3344 -0.2424 5960741 3967212 5232250 3964085 BM_1_atomic_1_waiter_1_notifier<KeepNotifying, NumHighPrioTasks<0>>/131072 -0.1474 -0.1475 9144356 7796745 9137547 7790193 BM_1_atomic_1_waiter_1_notifier<KeepNotifying, NumHighPrioTasks<0>>/262144 -0.1336 -0.1340 18333441 15883805 18323711 15868500 OVERALL_GEOMEAN -0.2107 -0.1761 0 0 0 0 ``` ``` Benchmark Time CPU Time Old Time New CPU Old CPU New -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<2>, NumHighPrioTasks<0>>/16384 +0.2321 -0.0081 836618 1030772 833197 826476 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<2>, NumHighPrioTasks<0>>/32768 -0.3034 -0.1329 2182721 1520569 1747211 1515028 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<2>, NumHighPrioTasks<0>>/65536 -0.0924 -0.0924 3389098 3075897 3378486 3066448 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<8>, NumHighPrioTasks<0>>/4096 +0.0464 +0.0474 664233 695080 657736 688892 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<8>, NumHighPrioTasks<0>>/8192 -0.0279 -0.0267 1336041 1298794 1324270 1288953 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<8>, NumHighPrioTasks<0>>/16384 +0.0270 +0.0304 2543004 2611786 2517471 2593975 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<32>, NumHighPrioTasks<0>>/1024 +0.0423 +0.0941 473621 493657 325604 356245 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<32>, NumHighPrioTasks<0>>/2048 +0.0420 +0.0675 906266 944349 636253 679169 BM_1_atomic_multi_waiter_1_notifier<KeepNotifying, NumWaitingThreads<32>, NumHighPrioTasks<0>>/4096 +0.0359 +0.0378 1761584 1824783 1015092 1053439 OVERALL_GEOMEAN -0.0097 -0.0007 0 0 0 0 ``` ``` Benchmark Time CPU Time Old Time New CPU Old CPU New --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<2>, NumHighPrioTasks<0>>/4096 -0.0990 -0.1001 371100 334370 369984 332955 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<2>, NumHighPrioTasks<0>>/8192 -0.0305 -0.0314 698228 676908 696418 674585 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<2>, NumHighPrioTasks<0>>/16384 -0.0258 -0.0268 1383530 1347894 1380665 1343680 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<8>, NumHighPrioTasks<0>>/1024 +0.0465 +0.4702 937821 981388 472087 694082 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<8>, NumHighPrioTasks<0>>/2048 +0.1596 +0.9140 1704819 1976899 616419 1179852 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<8>, NumHighPrioTasks<0>>/4096 -0.1018 -0.2316 3793976 3407609 1912209 1469331 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<32>, NumHighPrioTasks<0>>/256 +0.0395 +0.5818 30102662 31292982 174650 276270 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<32>, NumHighPrioTasks<0>>/512 -0.0065 +1.2860 33079634 32863968 162150 370680 BM_N_atomics_N_waiter_N_notifier<KeepNotifying, NumberOfAtomics<32>, NumHighPrioTasks<0>>/1024 -0.0325 +0.4683 36581740 35392385 282320 414520 OVERALL_GEOMEAN -0.0084 +0.2878 0 0 0 0 ``` --------- Co-authored-by: Louis Dionne <[email protected]>
This is to address #146145
The issue before was that, for
std::atomic::wait/notify, we only supportuint64_tto go through the nativeulock_waitdirectly. Any other types will go through the global contention table'satomic, increasing the chances of spurious wakeup. This PR tries to allow any types that are of size 4 or 8 to directly go to theulock_wait.This PR is just proof of concept. If we like this idea, I can go further to update the Linux/FreeBSD branch and add ABI macros so the existing behaviours are reserved under the stable ABI
Here are some benchmark results