@@ -685,9 +685,9 @@ const ChunkBatch = struct {
 
     while (chunk_idx < ctx.end_chunk) {
         const remaining = ctx.end_chunk - chunk_idx;
-        const batch_size = @min(remaining, max_simd_degree);
+        const batch_size: usize = @min(remaining, max_simd_degree);
         const offset = chunk_idx * chunk_length;
-        const batch_len = @as(usize, batch_size) * chunk_length;
+        const batch_len = batch_size * chunk_length;
 
         const num_cvs = compressChunksParallel(
             ctx.input[offset..][0..batch_len],
@@ -723,6 +723,44 @@ fn processParentBatch(ctx: ParentBatchContext) void {
     }
 }
 
+fn processParentBatchSIMD(ctx: ParentBatchContext) void {
+    const num_parents = ctx.end_idx - ctx.start_idx;
+    if (num_parents == 0) return;
+
+    // Convert input CVs to bytes for SIMD processing
+    var input_bytes: [max_simd_degree * 2 * Blake3.digest_length]u8 = undefined;
+    var output_bytes: [max_simd_degree * Blake3.digest_length]u8 = undefined;
+    var parents_array: [max_simd_degree][*]const u8 = undefined;
+
+    var processed: usize = 0;
+    while (processed < num_parents) {
+        const batch_size: usize = @min(num_parents - processed, max_simd_degree);
+
+        // Convert CV pairs to byte blocks for this batch
+        for (0..batch_size) |i| {
+            const pair_idx = ctx.start_idx + processed + i;
+            const left_cv = ctx.input_cvs[pair_idx * 2];
+            const right_cv = ctx.input_cvs[pair_idx * 2 + 1];
+
+            // Write left CV || right CV to form a 64-byte parent block
+            for (0..8) |j| {
+                store32(input_bytes[i * 64 + j * 4 ..][0..4], left_cv[j]);
+                store32(input_bytes[i * 64 + 32 + j * 4 ..][0..4], right_cv[j]);
+            }
+            parents_array[i] = input_bytes[i * 64 ..].ptr;
+        }
+
+        hashMany(parents_array[0..batch_size], batch_size, 1, ctx.key, 0, false, ctx.flags.with(.{ .parent = true }), .{}, .{}, output_bytes[0 .. batch_size * Blake3.digest_length]);
+
+        for (0..batch_size) |i| {
+            const output_idx = ctx.start_idx + processed + i;
+            ctx.output_cvs[output_idx] = loadCvWords(output_bytes[i * Blake3.digest_length ..][0..Blake3.digest_length].*);
+        }
+
+        processed += batch_size;
+    }
+}
+
 fn buildMerkleTreeLayerParallel(
     input_cvs: [][8]u32,
     output_cvs: [][8]u32,
@@ -732,11 +770,17 @@ fn buildMerkleTreeLayerParallel(
 ) void {
     const num_parents = input_cvs.len / 2;
 
-    if (num_parents <= 16) {
-        for (0..num_parents) |i| {
-            const output = parentOutputFromCvs(input_cvs[i * 2], input_cvs[i * 2 + 1], key, flags);
-            output_cvs[i] = output.chainingValue();
-        }
+    // Process sequentially with SIMD for smaller tree layers to avoid thread overhead.
+    // Tree layers shrink quickly, so only parallelize the first few large layers.
+    if (num_parents <= 1024) {
+        processParentBatchSIMD(ParentBatchContext{
+            .input_cvs = input_cvs,
+            .output_cvs = output_cvs,
+            .start_idx = 0,
+            .end_idx = num_parents,
+            .key = key,
+            .flags = flags,
+        });
         return;
     }
 
@@ -748,7 +792,7 @@ fn buildMerkleTreeLayerParallel(
         const start_idx = worker_id * parents_per_worker;
         if (start_idx >= num_parents) break;
 
-        group.async(io, processParentBatch, .{ParentBatchContext{
+        group.async(io, processParentBatchSIMD, .{ParentBatchContext{
             .input_cvs = input_cvs,
             .output_cvs = output_cvs,
             .start_idx = start_idx,