diff --git a/include/os/windows/spl/sys/sysmacros.h b/include/os/windows/spl/sys/sysmacros.h
index cba1f5fc2d26..e53ae751953d 100644
--- a/include/os/windows/spl/sys/sysmacros.h
+++ b/include/os/windows/spl/sys/sysmacros.h
@@ -81,9 +81,9 @@ extern unsigned int num_ecores;
  * swap priority is at 92. Most ZFS priorities should probably
  * stay below this, but kmem_reap needs to be higher.
  */
-#define minclsyspri 81 /* BASEPRI_KERNEL */
-#define defclsyspri 81 /* BASEPRI_KERNEL */
-#define maxclsyspri 89
+#define minclsyspri 8 /* BASEPRI_KERNEL */
+#define defclsyspri 8 /* BASEPRI_KERNEL */
+#define maxclsyspri 12
 
 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
diff --git a/include/os/windows/zfs/sys/kstat_windows.h b/include/os/windows/zfs/sys/kstat_windows.h
index 644b01354177..842cc4e006d7 100644
--- a/include/os/windows/zfs/sys/kstat_windows.h
+++ b/include/os/windows/zfs/sys/kstat_windows.h
@@ -150,6 +150,7 @@ typedef struct windows_kstat {
 	kstat_named_t zfs_removal_suspend_progress;
 	kstat_named_t cpu_avx_supported;
 	kstat_named_t zvol_io_threads;
+	kstat_named_t zfs_prealloc_percent;
 } windows_kstat_t;
 
 
@@ -261,6 +262,7 @@ extern int zfs_autoimport_disable;
 extern int zfs_removal_suspend_progress;
 extern int cpu_avx_supported;
 extern int zvol_threads;
+extern int zfs_prealloc_percent;
 
 int kstat_windows_init(void *);
 void kstat_windows_fini(void);
diff --git a/module/os/windows/spl/CMakeLists.txt b/module/os/windows/spl/CMakeLists.txt
index b63c54e57921..b1840072271d 100644
--- a/module/os/windows/spl/CMakeLists.txt
+++ b/module/os/windows/spl/CMakeLists.txt
@@ -40,6 +40,42 @@ wdk_add_library(splkern
 	${TMH_FILE_LIST}
 )
 
+target_include_directories(splkern BEFORE PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/" PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/spl")
 # set(CMAKE_TOOLCHAIN_FILE $ENV{CMAKE_TOOLCHAIN_FILE})
-target_include_directories(splkern BEFORE PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/"
-	PUBLIC "${CMAKE_SOURCE_DIR}/include/os/windows/spl")
+find_program(MSVC_CL_EXECUTABLE cl.exe)
+if(NOT MSVC_CL_EXECUTABLE)
+	message(FATAL_ERROR "MSVC cl.exe not found in PATH. Ensure you're using Visual Studio Command Prompt or MSVC toolchain.")
+else()
+	message(STATUS "MSVC_CL_EXECUTABLE is set to: ${MSVC_CL_EXECUTABLE}")
+endif()
+
+find_program(MSVC_LIB_EXECUTABLE lib.exe)
+if(NOT MSVC_LIB_EXECUTABLE)
+	message(FATAL_ERROR "MSVC lib.exe not found in PATH. Ensure you're using Visual Studio Command Prompt or MSVC toolchain.")
+else()
+	message(STATUS "MSVC_LIB_EXECUTABLE is set to: ${MSVC_LIB_EXECUTABLE}")
+endif()
+
+set(CR_WRAPPERS_DIR "${CMAKE_BINARY_DIR}/module/os/windows/spl")
+file(MAKE_DIRECTORY "${CR_WRAPPERS_DIR}")
+
+set(CR_WRAPPERS_OBJ "${CR_WRAPPERS_DIR}/spl_cr_wrappers.obj")
+set(CR_WRAPPERS_LIB "${CR_WRAPPERS_DIR}/spl_cr_wrappers.lib")
+
+add_custom_command(
+	OUTPUT ${CR_WRAPPERS_LIB}
+	COMMAND ${MSVC_CL_EXECUTABLE} /nologo /c /Fo"${CR_WRAPPERS_OBJ}" ${CMAKE_SOURCE_DIR}/module/os/windows/spl/spl_cr_wrappers.c
+	COMMAND ${MSVC_LIB_EXECUTABLE} /nologo /OUT:${CR_WRAPPERS_LIB} /MACHINE:x64 ${CR_WRAPPERS_OBJ}
+	WORKING_DIRECTORY ${CR_WRAPPERS_DIR}
+	DEPENDS ${CMAKE_SOURCE_DIR}/module/os/windows/spl/spl_cr_wrappers.c
+	COMMENT "Building ${CR_WRAPPERS_LIB} in ${CR_WRAPPERS_DIR} using MSVC"
+)
+
+add_custom_target(cr_wrappers ALL DEPENDS ${CR_WRAPPERS_LIB})
+add_library(spl_cr_wrappers STATIC IMPORTED GLOBAL)
+set_target_properties(spl_cr_wrappers PROPERTIES
+	IMPORTED_LOCATION ${CR_WRAPPERS_LIB})
+# Targets linking the imported library must wait for the custom build step.
+add_dependencies(spl_cr_wrappers cr_wrappers)
+
diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c
index 3a57b55ea90d..48c680ee2672 100644
--- a/module/os/windows/spl/spl-kmem.c
+++ b/module/os/windows/spl/spl-kmem.c
@@ -76,6 +76,7 @@ static volatile _Atomic int64_t spl_free = 0;
 int64_t spl_free_delta_ema;
 
 static boolean_t spl_event_thread_exit = FALSE;
+static boolean_t spl_abd_prealloc_thread_exit = FALSE;
 PKEVENT low_mem_event = NULL;
 
 static volatile _Atomic int64_t spl_free_manual_pressure =
0;
@@ -131,6 +132,10 @@ extern uint64_t zfs_active_rwlock;
 extern uint64_t total_memory;
 extern uint64_t real_total_memory;
 
+extern kmem_cache_t *abd_chunk_cache;
+extern uint64_t zfs_arc_max;
+extern int zfs_prealloc_percent;
+
 #define MULT 1
 
 static const char *KMEM_VA_PREFIX = "kmem_va";
@@ -4230,7 +4235,7 @@ spl_free_wrapper(void)
 int64_t
 spl_free_manual_pressure_wrapper(void)
 {
-	return (spl_free_manual_pressure);
+	return (0);
 }
 
 uint64_t
@@ -4479,6 +4484,7 @@ spl_free_thread()
 		    spl_vm_pressure_level != MAGIC_PRESSURE_UNAVAILABLE) {
 			/* there is pressure */
 			lowmem = true;
+			KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pressure_level: %lu\n", spl_vm_pressure_level));
 			new_spl_free = -(2LL * PAGE_SIZE * spl_vm_pages_wanted);
 			if (spl_vm_pressure_level > 1) {
 				emergency_lowmem = true;
@@ -4533,6 +4539,7 @@ spl_free_thread()
 			int64_t old_pressure = spl_free_manual_pressure;
 			new_spl_free -= old_pressure * 2LL;
 			lowmem = true;
+			KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_free_manual_pressure: %lld\n", (long long)old_pressure));
 			if (spl_free_fast_pressure) {
 				emergency_lowmem = true;
 				new_spl_free -= old_pressure * 4LL;
@@ -4630,6 +4637,7 @@ spl_free_thread()
 			new_spl_free += bminus;
 			lowmem = true;
 			emergency_lowmem = true;
+			KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pages_wanted %lu\n", spl_vm_pages_wanted));
 			// atomic swaps to set these variables used in arc.c
 			int64_t previous_highest_pressure = 0;
 			int64_t new_p = -bminus;
@@ -4650,6 +4658,7 @@ spl_free_thread()
 			new_spl_free -= bytes_wanted;
 			if (reserve_low && !early_lots_free) {
 				lowmem = true;
+				KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "spl_vm_pages_wanted: %lu reserve_low: %lu early_lots_free: %lu\n", spl_vm_pages_wanted, reserve_low, early_lots_free));
 				if (recent_lowmem == 0) {
 					recent_lowmem = time_now;
 				}
@@ -4760,6 +4769,7 @@ spl_free_thread()
 			    real_total_memory) > 75) {
 				new_spl_free -= total_mem_used / 32;
 				lowmem = true;
+				KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "segkmem_total_mem_allocated: %llu real_total_memory: %llu\n", segkmem_total_mem_allocated, real_total_memory));
 			}
 		}
 
@@ -4914,6 +4924,67 @@ spl_event_thread(void *notused)
 	thread_exit();
 }
 
+/*
+ * One-shot thread: allocate from abd_chunk_cache until
+ * segkmem_total_mem_allocated reaches zfs_prealloc_percent of zfs_arc_max,
+ * then return every chunk to the cache and exit.
+ */
+static void
+spl_abd_prealloc_thread(void *notused)
+{
+	typedef struct abd_prealloc_node {
+		list_node_t node;
+	} abd_prealloc_node_t;
+
+	abd_prealloc_node_t *node;
+	list_t abd_prealloc_list;
+
+	KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL,
+	    "SPL: abd prealloc start segkmem_total_mem_allocated: %llu "
+	    "total_memory: %llu\n",
+	    segkmem_total_mem_allocated, total_memory));
+
+	dprintf("SPL: beginning spl_abd_prealloc_thread() loop\n");
+
+	list_create(&abd_prealloc_list, sizeof (abd_prealloc_node_t),
+	    offsetof(abd_prealloc_node_t, node));
+
+	while (!spl_abd_prealloc_thread_exit) {
+		/* Wait until the cache exists and zfs_arc_max is non-zero */
+		if (!abd_chunk_cache || !zfs_arc_max) {
+			delay(hz);
+			continue;
+		}
+
+		/* Clamp: an out-of-range percentage must not be chased */
+		uint64_t pct = MIN(MAX(zfs_prealloc_percent, 0), 100);
+
+		if (segkmem_total_mem_allocated >=
+		    (zfs_arc_max * pct) / 100) {
+			break;
+		}
+
+		/* The chunk itself doubles as the list node while held */
+		node = kmem_cache_alloc(abd_chunk_cache, KM_SLEEP);
+		list_insert_tail(&abd_prealloc_list, node);
+	}
+
+	while ((node = list_remove_head(&abd_prealloc_list)) != NULL) {
+		kmem_cache_free(abd_chunk_cache, node);
+	}
+	list_destroy(&abd_prealloc_list);
+
+	spl_abd_prealloc_thread_exit = FALSE;
+	dprintf("SPL: %s thread_exit\n", __func__);
+
+	KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL,
+	    "SPL: abd prealloc done segkmem_total_mem_allocated: %llu "
+	    "total_memory: %llu zfs_arc_max: %llu zfs_prealloc_percent: %d%%\n",
+	    segkmem_total_mem_allocated, total_memory, zfs_arc_max,
+	    zfs_prealloc_percent));
+	thread_exit();
+}
+
 static int
 spl_kstat_update(kstat_t *ksp, int rw)
 
@@ -5342,8 +5413,13 @@ spl_kmem_thread_init(void)
 	(void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92);
 	spl_free_thread_running = TRUE;
 
-	spl_event_thread_exit = FALSE;
-	(void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92);
+	if (zfs_prealloc_percent) {
+		spl_abd_prealloc_thread_exit = FALSE;
+		(void) thread_create(NULL, 0, spl_abd_prealloc_thread, 0, 0, 0, 0, 92);
+	} else {
+		spl_event_thread_exit = FALSE;
+		(void) thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92);
+	}
 }
 
 void
@@ -5351,6 +5427,7 @@ spl_kmem_thread_fini(void)
 {
 	shutting_down = 1;
 
+	spl_abd_prealloc_thread_exit = TRUE;
 	if (low_mem_event != NULL) {
 		dprintf("SPL: stopping spl_event_thread\n");
 		spl_event_thread_exit = TRUE;
diff --git a/module/os/windows/spl/spl-thread.c b/module/os/windows/spl/spl-thread.c
index 6ef65414c6d4..93cb51557fd8 100644
--- a/module/os/windows/spl/spl-thread.c
+++ b/module/os/windows/spl/spl-thread.c
@@ -39,7 +39,78 @@
 
 uint64_t zfs_threads = 0;
 
-kthread_t *
+/* Create a system thread running proc(arg); pri is clamped below. */
+kthread_t *
+spl_thread_create(
+	caddr_t stk,
+	size_t stksize,
+	void (*proc)(void *),
+	void *arg,
+	size_t len,
+	int state,
+#ifdef SPL_DEBUG_THREAD
+	char *filename,
+	int line,
+#endif
+	pri_t pri)
+{
+	NTSTATUS status;
+	HANDLE hThread = NULL;
+	PETHREAD eThread = NULL;
+
+#ifdef SPL_DEBUG_THREAD
+	dprintf("Start thread pri %d\n", pri);
+#endif
+
+	status = PsCreateSystemThread(
+	    &hThread,
+	    THREAD_ALL_ACCESS,
+	    NULL,
+	    NULL,
+	    NULL,
+	    proc,
+	    arg);
+
+	if (!NT_SUCCESS(status))
+		return (NULL);
+
+	/* Convert the HANDLE to an ETHREAD; this takes an object reference */
+	status = ObReferenceObjectByHandle(
+	    hThread,
+	    THREAD_ALL_ACCESS,
+	    *PsThreadType,
+	    KernelMode,
+	    (PVOID *)&eThread,
+	    NULL);
+
+	/* We no longer need the handle */
+	ZwClose(hThread);
+
+	if (!NT_SUCCESS(status))
+		return (NULL);
+
+	/* Clamp priority to [minclsyspri, maxclsyspri] */
+	KPRIORITY newPri = (KPRIORITY)pri;
+
+	if (newPri > maxclsyspri)
+		newPri = maxclsyspri;
+
+	if (newPri < minclsyspri)
+		newPri = minclsyspri;
+
+	/* Set absolute priority, then drop the reference taken above */
+	KeSetPriorityThread((PKTHREAD)eThread, newPri);
+	ObDereferenceObject(eThread);
+#ifdef SPL_DEBUG_THREAD
+	dprintf("Thread created with priority %d\n", newPri);
+#endif
+
+	atomic_inc_64(&zfs_threads);
+
+	return ((kthread_t *)eThread);
+}
+
+/*kthread_t*
 spl_thread_create(
 	caddr_t stk,
 	size_t stksize,
@@ -72,12 +143,7 @@ spl_thread_create(
 	if (result !=
STATUS_SUCCESS)
 		return (NULL);
 
-	/*
-	 * Improve the priority when asked to do so
-	 * Thread priorities range from 0 to 31, where 0 is the lowest
-	 * priority and 31 is the highest
-	 */
-
+
 	if (pri > minclsyspri) {
 		// thread_precedence_policy_data_t policy;
 		// policy.importance = pri - minclsyspri;
@@ -102,7 +168,7 @@ spl_thread_create(
 	ObDereferenceObject(eThread);
 	ZwClose(thread);
 	return ((kthread_t *)eThread);
-}
+}*/
 
 kthread_t *
 spl_current_thread(void)
diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c
index 2540dca395aa..4c3d0a645aea 100644
--- a/module/os/windows/spl/spl-vmem.c
+++ b/module/os/windows/spl/spl-vmem.c
@@ -448,6 +448,8 @@ uint64_t spl_frag_walk_cnt = 0;
 extern void spl_free_set_emergency_pressure(int64_t p);
 extern uint64_t segkmem_total_mem_allocated;
 extern uint64_t total_memory;
+extern uint64_t zfs_arc_max;
+extern int zfs_prealloc_percent;
 
 /*
  * Get a vmem_seg_t from the global segfree list.
@@ -1732,12 +1734,23 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size)
 		vsp = vprev;
 	}
 
+	// Calling vm_source_free hands the memory back to Windows. When
+	// zfs_prealloc_percent is enabled we only want that once usage
+	// has passed 102% of zfs_arc_max.
+	boolean_t allow_vm_source_free = true;
+	if (zfs_prealloc_percent) {
+		if (segkmem_total_mem_allocated <
+		    (zfs_arc_max * 102) / 100) {
+			allow_vm_source_free = false;
+		}
+	}
+
 	/*
 	 * If the entire span is free, return it to the source.
 	 */
 	if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL &&
 	    vsp->vs_aprev->vs_type == VMEM_SPAN &&
-	    vsp->vs_anext->vs_type == VMEM_SPAN) {
+	    vsp->vs_anext->vs_type == VMEM_SPAN && allow_vm_source_free) {
 		vaddr = (void *)vsp->vs_start;
 		size = VS_SIZE(vsp);
 		ASSERT(size == VS_SIZE(vsp->vs_aprev));
diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c
index ae3135208fef..423a6e02c124 100644
--- a/module/os/windows/spl/spl-windows.c
+++ b/module/os/windows/spl/spl-windows.c
@@ -53,12 +53,15 @@ volatile unsigned int vm_page_speculative_count = 5500;
 
 uint64_t spl_GetPhysMem(void);
 uint64_t spl_GetZfsTotalMemory(PUNICODE_STRING RegistryPath);
+uint64_t spl_getZfsPreallocSize(PUNICODE_STRING RegistryPath);
 
 #include
 #include
 
 // Size in bytes of the memory allocated in seg_kmem
 extern uint64_t segkmem_total_mem_allocated;
+extern int zfs_prealloc_percent;
+extern uint64_t read_cr8_msvc(void); /* defined in spl_cr_wrappers.c */
 
 #define MAXHOSTNAMELEN 64
 extern char hostname[MAXHOSTNAMELEN];
@@ -73,7 +76,7 @@ uint32_t spl_hostid = 0;
 uint64_t
 __readcr8(void)
 {
-	return (0ULL);
+	return (read_cr8_msvc());
 }
 
 unsigned long
@@ -528,6 +531,9 @@ spl_start(PUNICODE_STRING RegistryPath)
 	spl_mutex_subsystem_init();
 	spl_kmem_init(total_memory);
 
+	// Read the registry value now; zfs loads the registry a little later.
+	zfs_prealloc_percent = (int)spl_getZfsPreallocSize(RegistryPath);
+
 	spl_vnode_init();
 	spl_kmem_thread_init();
 	spl_kmem_mp_init();
@@ -755,3 +761,102 @@ spl_GetZfsTotalMemory(PUNICODE_STRING RegistryPath)
 	ZwClose(h);
 	return (newvalue);
 }
+
+/*
+ * Look up the REG_DWORD value "zfs_prealloc_percent" under RegistryPath.
+ * Returns the value capped at 100, or 0 when the key cannot be opened,
+ * the value is not found, or it has the wrong type or size.
+ */
+uint64_t
+spl_getZfsPreallocSize(PUNICODE_STRING RegistryPath)
+{
+	OBJECT_ATTRIBUTES ObjectAttributes;
+	HANDLE h;
+	NTSTATUS status;
+	uint64_t newvalue = 0;
+
+	InitializeObjectAttributes(&ObjectAttributes,
+	    RegistryPath,
+	    OBJ_KERNEL_HANDLE | OBJ_CASE_INSENSITIVE,
+	    NULL,
+	    NULL);
+
+	status = ZwOpenKey(&h, // KeyHandle
+	    KEY_ALL_ACCESS, // DesiredAccess
+	    &ObjectAttributes); // ObjectAttributes
+
+	if (!NT_SUCCESS(status)) {
+		dprintf("%s: Unable to open Registry %wZ: 0x%x. "
+		    "Going with defaults.\n", __func__, RegistryPath, status);
+		return (0);
+	}
+
+	ULONG index = 0;
+	ULONG length = 0;
+	PKEY_VALUE_FULL_INFORMATION regBuffer = NULL;
+
+	for (index = 0; status != STATUS_NO_MORE_ENTRIES; index++) {
+		// Get the buffer size necessary
+		status = ZwEnumerateValueKey(h, index, KeyValueFullInformation,
+		    NULL, 0, &length);
+
+		if ((status != STATUS_BUFFER_TOO_SMALL) &&
+		    (status != STATUS_BUFFER_OVERFLOW))
+			break; // Something is wrong - or we finished
+
+		// Allocate space to hold
+		regBuffer = (PKEY_VALUE_FULL_INFORMATION)ExAllocatePoolWithTag(
+		    NonPagedPoolNx, length, 'zfsr');
+
+		if (regBuffer == NULL)
+			break;
+
+		status = ZwEnumerateValueKey(h, index, KeyValueFullInformation,
+		    regBuffer, length, &length);
+		if (!NT_SUCCESS(status)) {
+			break;
+		}
+		// Convert name to straight ascii so we compare with kstat
+		ULONG outlen = 0;
+		char keyname[KSTAT_STRLEN + 1] = { 0 };
+		status = RtlUnicodeToUTF8N(keyname, KSTAT_STRLEN, &outlen,
+		    regBuffer->Name, regBuffer->NameLength);
+
+		// Conversion failed? Skip this value and keep looking.
+		if (status != STATUS_SUCCESS &&
+		    status != STATUS_SOME_NOT_MAPPED) {
+			ExFreePool(regBuffer);
+			regBuffer = NULL;
+			continue;
+		}
+
+		// Output string is only null terminated if input is,
+		// so do so now.
+		keyname[outlen] = 0;
+		if (strcasecmp("zfs_prealloc_percent", keyname) == 0) {
+			if (regBuffer->Type != REG_DWORD ||
+			    regBuffer->DataLength != sizeof (uint32_t)) {
+				dprintf("%s: registry '%s' did not match. "
+				    "Type needs to be REG_DWORD. (4 bytes)\n",
+				    __func__, keyname);
+			} else {
+				newvalue = *(uint32_t *)((uint8_t *)regBuffer +
+				    regBuffer->DataOffset);
+				// A percentage above 100 is meaningless
+				if (newvalue > 100)
+					newvalue = 100;
+				dprintf("%s: zfs_prealloc_percent is set to:"
+				    " %llu\n", __func__, newvalue);
+			}
+			break;
+		}
+		ExFreePool(regBuffer);
+		regBuffer = NULL;
+	}
+
+	if (regBuffer)
+		ExFreePool(regBuffer);
+
+	ZwClose(h);
+	return (newvalue);
+}
diff --git a/module/os/windows/spl/spl_cr_wrappers.c b/module/os/windows/spl/spl_cr_wrappers.c
new file mode 100644
index 000000000000..529fe45df92c
--- /dev/null
+++ b/module/os/windows/spl/spl_cr_wrappers.c
@@ -0,0 +1,21 @@
+
+#ifdef _MSC_VER
+#include
+#include
+
+// we can use the MSVC-specific intrinsic __readcr8() to read the value of the CR8
+// register directly. This intrinsic is part of MSVC's built-in functions,
+// which allows us to access hardware-level registers without writing assembly code.
+
+// Clang does not support the __readcr8 intrinsic, as it is specific to MSVC.
+// Clang does not have a direct equivalent
+// for accessing the CR8 register via a built-in function. Therefore, if we are using
+// Clang, we must either use inline assembly or a different method to access the register.
+// https://learn.microsoft.com/en-us/cpp/intrinsics/readcr8?view=msvc-170
+
+__declspec(dllexport) uint64_t read_cr8_msvc(void) {
+	return __readcr8();
+}
+#else
+#error "_MSC_VER not defined"
+#endif
diff --git a/module/os/windows/zfs/CMakeLists.txt b/module/os/windows/zfs/CMakeLists.txt
index 4a778d957f8f..ce832e0bb43c 100644
--- a/module/os/windows/zfs/CMakeLists.txt
+++ b/module/os/windows/zfs/CMakeLists.txt
@@ -47,4 +47,4 @@ zvol_os.c
 ${TMH_FILE_LIST}
 )
 
-target_link_libraries(zfskern_os PRIVATE splkern icpkern)
+target_link_libraries(zfskern_os PRIVATE splkern icpkern spl_cr_wrappers)
diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c
index 41eef66413f3..0d23f80677e2 100644
--- a/module/os/windows/zfs/arc_os.c
+++ b/module/os/windows/zfs/arc_os.c
@@ -59,6 +59,7 @@
 #include
 
 extern arc_stats_t arc_stats;
+extern uint64_t zfs_arc_max;
 
 static kmutex_t arc_reclaim_lock;
 static kcondvar_t arc_reclaim_thread_cv;
@@ -127,7 +128,10 @@ arc_free_memory(void)
 int64_t
 arc_available_memory(void)
 {
-	return (arc_free_memory() - arc_sys_free);
+	/* zfs_arc_max may still be 0; then keep the free-memory estimate */
+	if (zfs_arc_max == 0)
+		return (arc_free_memory() - arc_sys_free);
+	return (zfs_arc_max - aggsum_value(&arc_sums.arcstat_size));
 }
 
 int
@@ -137,8 +141,7 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
 
 	/* possibly wake up arc reclaim thread */
 	if (arc_reclaim_in_loop == B_FALSE) {
-		if (spl_free_manual_pressure_wrapper() != 0 ||
-		    !spl_minimal_physmem_p() ||
+		if (!spl_minimal_physmem_p() ||
 		    arc_reclaim_needed()) {
 			cv_signal(&arc_reclaim_thread_cv);
 			kpreempt(KPREEMPT_SYNC);
@@ -696,6 +699,7 @@ arc_kstat_update_windows(kstat_t *ksp, int rw)
 		zfs_arc_average_blocksize =
 		    ks->arc_zfs_arc_average_blocksize.value.ui64;
 		zvol_threads = ks->zvol_io_threads.value.ui32;
+		zfs_prealloc_percent = ks->zfs_prealloc_percent.value.ui32;
 
 #ifdef _KERNEL
 		if (ks->zfs_total_memory_limit.value.ui64 > total_memory &&
@@ -731,6 +735,7 @@ arc_kstat_update_windows(kstat_t *ksp, int rw)
 		ks->arc_zfs_arc_average_blocksize.value.ui64 =
 		    zfs_arc_average_blocksize;
 		ks->zvol_io_threads.value.ui32 = zvol_threads;
+		ks->zfs_prealloc_percent.value.ui32 = zfs_prealloc_percent;
 
 #ifdef _KERNEL
 		ks->zfs_total_memory_limit.value.ui64 = total_memory;
@@ -796,7 +801,10 @@ arc_prune_async(int64_t adjust)
 int64_t
 arc_available_memory(void)
 {
-	return (arc_free_memory() - arc_sys_free);
+	/* zfs_arc_max may still be 0; then keep the free-memory estimate */
+	if (zfs_arc_max == 0)
+		return (arc_free_memory() - arc_sys_free);
+	return (zfs_arc_max - aggsum_value(&arc_sums.arcstat_size));
 }
 
 int
diff --git a/module/os/windows/zfs/zfs_kstat_windows.c b/module/os/windows/zfs/zfs_kstat_windows.c
index 36799e2fdd40..8b0c4d9823a3 100644
--- a/module/os/windows/zfs/zfs_kstat_windows.c
+++ b/module/os/windows/zfs/zfs_kstat_windows.c
@@ -173,7 +173,8 @@ windows_kstat_t windows_kstat = {
 	{ "zfs_total_memory_limit", KSTAT_DATA_UINT64 },
 	{ "zfs_removal_suspend_progress", KSTAT_DATA_INT32 },
 	{ "cpu_avx_supported", KSTAT_DATA_UINT32 },
-	{ "zvol_io_threads", KSTAT_DATA_UINT32 }
+	{ "zvol_io_threads", KSTAT_DATA_UINT32 },
+	{ "zfs_prealloc_percent", KSTAT_DATA_UINT32 },
 };
 
 
@@ -383,6 +384,8 @@ windows_kstat_update(kstat_t *ksp, int rw)
 		    ks->zfs_removal_suspend_progress.value.i32;
 		cpu_avx_supported =
 		    ks->cpu_avx_supported.value.ui32;
+		zfs_prealloc_percent =
+		    ks->zfs_prealloc_percent.value.ui32;
 
 	} else {
 		/* kstat READ */
@@ -573,6 +576,8 @@ windows_kstat_update(kstat_t *ksp, int rw)
 		    cpu_avx_supported;
 		ks->zvol_io_threads.value.ui32 =
 		    zvol_threads;
+		ks->zfs_prealloc_percent.value.ui32 =
+		    zfs_prealloc_percent;
 	}
 	arc_kstat_update_windows(ksp, rw);
 	return (0);
diff --git a/module/os/windows/zfs/zvol_os.c b/module/os/windows/zfs/zvol_os.c
index 0500d2fdffc7..edcf319f0c07 100644
--- a/module/os/windows/zfs/zvol_os.c
+++ b/module/os/windows/zfs/zvol_os.c
@@ -46,6 +46,7 @@ unsigned int zvol_request_sync = 0;
 unsigned int zvol_prefetch_bytes = (128 * 1024);
 unsigned long zvol_max_discard_blocks = 16384;
 int zvol_threads = 0;
+int zfs_prealloc_percent = 0;
 
 taskq_t *zvol_taskq;
 
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 46c3bb1bcd0b..a940770c45bf 100644
--- a/module/zfs/txg.c
+++
b/module/zfs/txg.c
@@ -113,6 +113,200 @@ static void txg_quiesce_thread(void *arg);
 
 int zfs_txg_timeout = 1; /* max seconds worth of delta per txg */
 
+#define DIRTY_FLOOR_BYTES (153ULL << 20)
+#define DIRTY_CEIL_BYTES (4ULL << 30)
+uint_t zfs_adc_target_sync_pct = 75;
+/* PID gains, all scaled ×1000 to avoid floating point */
+int zfs_adc_kp = 200; /* Proportional: main corrective force */
+int zfs_adc_ki = 15; /* Integral: eliminates steady-state bias */
+int zfs_adc_kd = 100; /* Derivative: damping against oscillation */
+
+/* EMA smoothing: percent weight given to each new sample */
+uint_t zfs_adc_ema_alpha_pct = 25; /* 25% weight on new sample */
+/* Minimum TXGs between dirty_max updates (anti-flapping) */
+uint_t zfs_adc_holdoff_txgs = 2;
+/* Master enable: 0 stops further adjustment (dirty_max keeps its value) */
+int zfs_adc_enable = 1;
+
+typedef struct {
+	/* PID state */
+	int64_t adc_integral; /* accumulated error (I term) */
+	int64_t adc_prev_error; /* last error sample (D term) */
+	clock_t adc_ema_delta; /* smoothed spa_sync duration */
+	uint64_t adc_last_txg; /* TXG of last adjustment */
+
+	/* Bounds in bytes: DIRTY_FLOOR_BYTES / DIRTY_CEIL_BYTES */
+	uint64_t adc_min_dirty;
+	uint64_t adc_max_dirty;
+
+	/* Diagnostics / kstat shadow */
+	uint64_t adc_n_syncs; /* total TXGs observed */
+	uint64_t adc_n_raised; /* times dirty_max was raised */
+	uint64_t adc_n_lowered; /* times dirty_max was lowered */
+	uint64_t adc_n_clamped; /* times a bound was hit */
+	int64_t adc_last_p; /* last P term for debug */
+	int64_t adc_last_i; /* last I term for debug */
+	int64_t adc_last_d; /* last D term for debug */
+} txg_adc_t;
+
+static inline clock_t
+adc_ema(clock_t prev, clock_t sample, uint_t alpha_pct)
+{
+	return (prev + (clock_t)(((int64_t)sample - prev)
+	    * alpha_pct / 100));
+}
+
+static void
+adc_init(txg_adc_t *adc, clock_t target_ticks)
+{
+	bzero(adc, sizeof (*adc));
+
+	adc->adc_min_dirty = DIRTY_FLOOR_BYTES;
+	adc->adc_max_dirty = DIRTY_CEIL_BYTES;
+
+	/* Defensive: keep min < max should the bounds above be retuned */
+	if (adc->adc_min_dirty >= adc->adc_max_dirty)
+		adc->adc_min_dirty = adc->adc_max_dirty / 8;
+
+	/* Seed EMA at target — controller starts in steady state */
+	adc->adc_ema_delta = target_ticks;
+	adc->adc_prev_error = 0;
+	adc->adc_integral = 0;
+}
+
+static void
+adc_update(txg_adc_t *adc, uint64_t txg,
+    clock_t raw_delta, clock_t target_ticks)
+{
+	int64_t error; /* normalized error × 1000 */
+	int64_t p_term; /* proportional correction */
+	int64_t i_term; /* integral correction */
+	int64_t d_term; /* derivative correction */
+	int64_t pid_out; /* combined PID output */
+	int64_t adjustment; /* byte delta for dirty_max */
+	uint64_t cur, proposed, next;
+
+	adc->adc_n_syncs++;
+
+	/* Step 1: Update smoothed sync duration via EMA */
+	adc->adc_ema_delta = adc_ema(adc->adc_ema_delta,
+	    raw_delta, zfs_adc_ema_alpha_pct);
+
+	/* Step 2: Enforce adjustment holdoff (anti-flapping) */
+	if (adc->adc_last_txg != 0 &&
+	    (txg - adc->adc_last_txg) < zfs_adc_holdoff_txgs)
+		return;
+
+	/* Step 3: Compute normalized error
+	 *
+	 * error = (ema_delta - target) / target × 1000
+	 *
+	 * > 0 : sync taking too long → dirty_max too high → must shrink
+	 * < 0 : sync finishing early → dirty_max too low → can grow
+	 * = 0 : perfect operating point
+	 *
+	 * Example: ema=6s, target=5s → error = +200 (20% over)
+	 * Example: ema=3s, target=5s → error = -400 (40% under)
+	 */
+	if (target_ticks == 0)
+		return; /* Safety: avoid divide-by-zero on misconfiguration */
+
+	error = ((int64_t)adc->adc_ema_delta - (int64_t)target_ticks)
+	    * 1000LL / (int64_t)target_ticks;
+
+	/* Step 4: P term — immediate response to current error */
+	p_term = (int64_t)zfs_adc_kp * error / 1000LL;
+
+	/* Step 5: I term — accumulate to eliminate steady-state offset
+	 *
+	 * Anti-windup clamp: prevents integral from growing unboundedly
+	 * during sustained overload (e.g., pool degraded, resilver running).
+	 * The accumulated error is clamped at ±(30 × Kp).
+	 */
+	adc->adc_integral += error;
+	{
+		int64_t windup_limit = 30LL * (int64_t)zfs_adc_kp;
+		if (adc->adc_integral > windup_limit) adc->adc_integral = windup_limit;
+		if (adc->adc_integral < -windup_limit) adc->adc_integral = -windup_limit;
+	}
+	i_term = (int64_t)zfs_adc_ki * adc->adc_integral / 1000LL;
+
+	/* Step 6: D term — dampen oscillation via rate-of-change */
+	d_term = (int64_t)zfs_adc_kd
+	    * (error - adc->adc_prev_error) / 1000LL;
+	adc->adc_prev_error = error;
+
+	/* Save for diagnostics */
+	adc->adc_last_p = p_term;
+	adc->adc_last_i = i_term;
+	adc->adc_last_d = d_term;
+
+	/* Step 7: Combine PID output
+	 *
+	 * pid_out > 0 → sync was slow → DECREASE dirty_max
+	 * pid_out < 0 → sync was fast → INCREASE dirty_max
+	 * (sign inversion applied in Step 8)
+	 */
+	pid_out = p_term + i_term + d_term;
+
+	if (pid_out == 0)
+		return;
+
+	/* Step 8: Convert PID output to byte adjustment
+	 *
+	 * adjustment = -(pid_out / 1000) × dirty_max
+	 *
+	 * pid_out is in units of 0.1% so dividing by 1000 gives fraction.
+	 * Maximum single-step is capped at 20% of current dirty_max to
+	 * prevent catastrophic collapse from one anomalous TXG.
+	 */
+	cur = zfs_dirty_data_max;
+
+	adjustment = -((int64_t)cur / 1000LL) * pid_out;
+
+	/* Cap single-step adjustment at ±20% of current dirty_max */
+	{
+		int64_t max_step = (int64_t)(cur / 5);
+		if (adjustment > max_step) adjustment = max_step;
+		if (adjustment < -max_step) adjustment = -max_step;
+	}
+
+	/* Step 9: Apply bounds */
+	proposed = (int64_t)cur + adjustment;
+
+	if (proposed <= adc->adc_min_dirty) {
+		next = adc->adc_min_dirty;
+		adc->adc_n_clamped++;
+	}
+	else if (proposed >= adc->adc_max_dirty) {
+		next = adc->adc_max_dirty;
+		adc->adc_n_clamped++;
+	}
+	else {
+		next = proposed;
+	}
+
+	/* Step 10: Commit the new limit with a single store */
+	if (next != cur) {
+		zfs_dirty_data_max = next;
+		adc->adc_last_txg = txg;
+		if (next > cur) adc->adc_n_raised++;
+		else adc->adc_n_lowered++;
+
+		zfs_dbgmsg("txg_adc txg=%llu ema_delta=%ldms target=%ldms "
+		    "err=%lld P=%lld I=%lld D=%lld "
+		    "dirty_max %lluMB->%lluMB",
+		    (u_longlong_t)txg,
+		    (long)(((uint64_t)adc->adc_ema_delta * 1000ULL) / hz),
+		    (long)(((uint64_t)target_ticks * 1000ULL) / hz),
+		    (longlong_t)error,
+		    (longlong_t)p_term,
+		    (longlong_t)i_term,
+		    (longlong_t)d_term,
+		    (u_longlong_t)(cur >> 20),
+		    (u_longlong_t)(next >> 20));
+	}
+}
 /*
  * Prepare the txg subsystem.
  */
@@ -213,8 +407,9 @@ txg_sync_start(dsl_pool_t *dp)
 	 * 32-bit x86. This is due in part to nested pools and
 	 * scrub_visitbp() recursion.
 	 */
+
 	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
-	    dp, 0, &p0, TS_RUN, defclsyspri);
+	    dp, 0, &p0, TS_RUN, maxclsyspri);
 
 	mutex_exit(&tx->tx_sync_lock);
 }
@@ -530,11 +725,18 @@ txg_sync_thread(void *arg)
 	tx_state_t *tx = &dp->dp_tx;
 	callb_cpr_t cpr;
 	clock_t start, delta;
+	txg_adc_t adc;
 
 	(void) spl_fstrans_mark();
 	txg_thread_enter(tx, &cpr);
 
 	start = delta = 0;
+	clock_t adc_target = (clock_t)(zfs_txg_timeout * hz)
+	    * zfs_adc_target_sync_pct / 100;
+
+	/* Always initialise: zfs_adc_enable is re-read on every sync below */
+	adc_init(&adc, adc_target);
+
 	for (;;) {
 		clock_t timeout = zfs_txg_timeout * hz;
 		clock_t timer;
@@ -598,6 +800,12 @@ txg_sync_thread(void *arg)
 		delta = ddi_get_lbolt() - start;
 		spa_txg_history_fini_io(spa, ts);
 
+		if (zfs_adc_enable) {
+			adc_target = (clock_t)(zfs_txg_timeout * hz)
+			    * zfs_adc_target_sync_pct / 100;
+			adc_update(&adc, txg, delta, adc_target);
+		}
+
 		mutex_enter(&tx->tx_sync_lock);
 		tx->tx_synced_txg = txg;
 		tx->tx_syncing_txg = 0;