Commit 5f73c01

crypto.blake3: sequentially process larger small tree layers (#26046)
Improves performance by spawning fewer threads: tree layers with at most 1024 parent nodes (previously 16) are now hashed sequentially with SIMD batching instead of being split across worker threads, so threads are only spawned for the few largest layers.
Parent commit: e23af9d
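In effect, the change is a threshold dispatch: hash a layer on the calling thread while it is small, and only fan the work out once the layer is large enough to amortize the spawn cost. Below is a minimal, self-contained sketch of that shape; sequential_threshold, worker_count, hashRange, and processLayer are illustrative names and values, not the stdlib's API (the real code dispatches onto an Io group via group.async rather than spawning raw threads).

const std = @import("std");

// Illustrative cutoff mirroring the commit: layers with at most this many
// parent nodes are hashed on the calling thread.
const sequential_threshold = 1024;

// Stand-in for hashing the parent CV pairs in [start, end); the real code
// batches them through SIMD compression.
fn hashRange(start: usize, end: usize) void {
    std.debug.assert(start <= end);
}

// Small layers run sequentially; large layers fan out across workers.
fn processLayer(num_parents: usize) !void {
    if (num_parents <= sequential_threshold) {
        hashRange(0, num_parents); // no thread-spawn overhead
        return;
    }
    const worker_count = 4; // made-up fixed fan-out for the sketch
    const per_worker = (num_parents + worker_count - 1) / worker_count;
    var threads: [worker_count]std.Thread = undefined;
    var spawned: usize = 0;
    defer for (threads[0..spawned]) |t| t.join();
    while (spawned < worker_count) : (spawned += 1) {
        const start = spawned * per_worker;
        if (start >= num_parents) break;
        const end = @min(start + per_worker, num_parents);
        threads[spawned] = try std.Thread.spawn(.{}, hashRange, .{ start, end });
    }
}

test processLayer {
    try processLayer(100); // sequential path
    try processLayer(5000); // parallel path
}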


lib/std/crypto/blake3.zig

Lines changed: 52 additions & 8 deletions
@@ -685,9 +685,9 @@ const ChunkBatch = struct {
 
     while (chunk_idx < ctx.end_chunk) {
         const remaining = ctx.end_chunk - chunk_idx;
-        const batch_size = @min(remaining, max_simd_degree);
+        const batch_size: usize = @min(remaining, max_simd_degree);
         const offset = chunk_idx * chunk_length;
-        const batch_len = @as(usize, batch_size) * chunk_length;
+        const batch_len = batch_size * chunk_length;
 
         const num_cvs = compressChunksParallel(
             ctx.input[offset..][0..batch_len],
@@ -723,6 +723,44 @@ fn processParentBatch(ctx: ParentBatchContext) void {
     }
 }
 
+fn processParentBatchSIMD(ctx: ParentBatchContext) void {
+    const num_parents = ctx.end_idx - ctx.start_idx;
+    if (num_parents == 0) return;
+
+    // Convert input CVs to bytes for SIMD processing
+    var input_bytes: [max_simd_degree * 2 * Blake3.digest_length]u8 = undefined;
+    var output_bytes: [max_simd_degree * Blake3.digest_length]u8 = undefined;
+    var parents_array: [max_simd_degree][*]const u8 = undefined;
+
+    var processed: usize = 0;
+    while (processed < num_parents) {
+        const batch_size: usize = @min(num_parents - processed, max_simd_degree);
+
+        // Convert CV pairs to byte blocks for this batch
+        for (0..batch_size) |i| {
+            const pair_idx = ctx.start_idx + processed + i;
+            const left_cv = ctx.input_cvs[pair_idx * 2];
+            const right_cv = ctx.input_cvs[pair_idx * 2 + 1];
+
+            // Write left CV || right CV to form 64-byte parent block
+            for (0..8) |j| {
+                store32(input_bytes[i * 64 + j * 4 ..][0..4], left_cv[j]);
+                store32(input_bytes[i * 64 + 32 + j * 4 ..][0..4], right_cv[j]);
+            }
+            parents_array[i] = input_bytes[i * 64 ..].ptr;
+        }
+
+        hashMany(parents_array[0..batch_size], batch_size, 1, ctx.key, 0, false, ctx.flags.with(.{ .parent = true }), .{}, .{}, output_bytes[0 .. batch_size * Blake3.digest_length]);
+
+        for (0..batch_size) |i| {
+            const output_idx = ctx.start_idx + processed + i;
+            ctx.output_cvs[output_idx] = loadCvWords(output_bytes[i * Blake3.digest_length ..][0..Blake3.digest_length].*);
+        }
+
+        processed += batch_size;
+    }
+}
+
 fn buildMerkleTreeLayerParallel(
     input_cvs: [][8]u32,
     output_cvs: [][8]u32,
@@ -732,11 +770,17 @@ fn buildMerkleTreeLayerParallel(
 ) void {
     const num_parents = input_cvs.len / 2;
 
-    if (num_parents <= 16) {
-        for (0..num_parents) |i| {
-            const output = parentOutputFromCvs(input_cvs[i * 2], input_cvs[i * 2 + 1], key, flags);
-            output_cvs[i] = output.chainingValue();
-        }
+    // Process sequentially with SIMD for smaller tree layers to avoid thread overhead
+    // Tree layers shrink quickly, so only parallelize the first few large layers
+    if (num_parents <= 1024) {
+        processParentBatchSIMD(ParentBatchContext{
+            .input_cvs = input_cvs,
+            .output_cvs = output_cvs,
+            .start_idx = 0,
+            .end_idx = num_parents,
+            .key = key,
+            .flags = flags,
+        });
         return;
     }
 
@@ -748,7 +792,7 @@
         const start_idx = worker_id * parents_per_worker;
        if (start_idx >= num_parents) break;
 
-        group.async(io, processParentBatch, .{ParentBatchContext{
+        group.async(io, processParentBatchSIMD, .{ParentBatchContext{
            .input_cvs = input_cvs,
            .output_cvs = output_cvs,
            .start_idx = start_idx,
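For reference, the byte layout the inner loop builds is the standard BLAKE3 parent block: the left child's chaining value followed by the right child's, 64 bytes in total, with each 8-word CV serialized as little-endian u32 words (which is what the file's store32 helper writes). A standalone sketch of that layout, with parentBlock as a hypothetical name and std.mem.writeInt standing in for store32:

const std = @import("std");

// A BLAKE3 parent block is left CV || right CV (64 bytes), each chaining
// value written as eight little-endian u32 words.
fn parentBlock(left_cv: [8]u32, right_cv: [8]u32) [64]u8 {
    var block: [64]u8 = undefined;
    for (0..8) |j| {
        std.mem.writeInt(u32, block[j * 4 ..][0..4], left_cv[j], .little);
        std.mem.writeInt(u32, block[32 + j * 4 ..][0..4], right_cv[j], .little);
    }
    return block;
}

test parentBlock {
    const block = parentBlock(
        .{ 1, 0, 0, 0, 0, 0, 0, 0 },
        .{ 2, 0, 0, 0, 0, 0, 0, 0 },
    );
    try std.testing.expectEqual(@as(u8, 1), block[0]); // first byte of left CV
    try std.testing.expectEqual(@as(u8, 2), block[32]); // first byte of right CV
}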
