diff --git a/cmd/entire/cli/checkpoint/checkpoint.go b/cmd/entire/cli/checkpoint/checkpoint.go index 9812b132d..cd8408d0e 100644 --- a/cmd/entire/cli/checkpoint/checkpoint.go +++ b/cmd/entire/cli/checkpoint/checkpoint.go @@ -334,6 +334,47 @@ type UpdateCommittedOptions struct { // CompactTranscript is the updated Entire Transcript Format bytes. // If non-nil, replaces the existing transcript.jsonl on v2 /main. CompactTranscript []byte + + // PrecomputedBlobs, if non-nil, provides chunk blob hashes and the + // content-hash blob hash computed once for this transcript. When set, + // UpdateCommitted skips the per-call ChunkTranscript + zlib work and + // reuses these hashes. Used by finalizeAllTurnCheckpoints to avoid + // re-compressing identical content N times. + PrecomputedBlobs *PrecomputedTranscriptBlobs +} + +// PrecomputedTranscriptBlobs holds blob hashes for a transcript that was +// chunked and written to the object store once, for reuse across multiple +// UpdateCommitted calls sharing the same transcript content. +// +// Blob hashes are content-addressed (SHA-1 of chunk bytes), so the same +// PrecomputedTranscriptBlobs works for both v1 (full.jsonl) and v2 +// (raw_transcript) paths — only the tree-entry filename differs. +// +// Callers should avoid constructing this for empty transcripts; agent.ChunkTranscript +// would otherwise produce a single zero-length chunk and a hash for an empty +// blob, which downstream stores would never reference. +type PrecomputedTranscriptBlobs struct { + // ChunkHashes are the blob hashes for each transcript chunk, in order. + // Always non-empty when built via PrecomputeTranscriptBlobs (a non-empty + // transcript chunks to at least one entry; callers should skip precompute + // for empty transcripts). + ChunkHashes []plumbing.Hash + + // ContentHashBlob is the blob hash of the "sha256:" content-hash + // string for the transcript. 
+ ContentHashBlob plumbing.Hash + + // ContentHash is the "sha256:" string itself, so the short-circuit + // path can compare without re-reading the blob. + ContentHash string +} + +// isUsable reports whether the precomputed blobs satisfy the invariants that +// consumers depend on: a non-zero content-hash blob and at least one chunk +// hash. Callers should fall back to the fresh-write path when this is false. +func (p *PrecomputedTranscriptBlobs) isUsable() bool { + return p != nil && !p.ContentHashBlob.IsZero() && len(p.ChunkHashes) > 0 } // CommittedInfo contains summary information about a committed checkpoint. diff --git a/cmd/entire/cli/checkpoint/committed.go b/cmd/entire/cli/checkpoint/committed.go index 7e73e1ade..98b83caa6 100644 --- a/cmd/entire/cli/checkpoint/committed.go +++ b/cmd/entire/cli/checkpoint/committed.go @@ -7,6 +7,7 @@ import ( "encoding/json" "errors" "fmt" + "io" "log/slog" "os" "path/filepath" @@ -40,6 +41,12 @@ import ( // errStopIteration is used to stop commit iteration early in GetCheckpointAuthor. var errStopIteration = errors.New("stop iteration") +// chunkTranscript is an indirection over agent.ChunkTranscript so tests can +// count or intercept chunking calls (e.g., to verify the short-circuit avoids +// re-chunking identical content). Production code paths always use the +// unwrapped function. +var chunkTranscript = agent.ChunkTranscript + // WriteCommitted writes a committed checkpoint to the entire/checkpoints/v1 branch. // Checkpoints are stored at sharded paths: // // @@ -1318,7 +1325,7 @@ func (s *GitStore) UpdateCommitted(ctx context.Context, opts UpdateCommittedOpti // Replace transcript (full replace, not append). // Transcript is pre-redacted by the caller (enforced by RedactedBytes type). 
if opts.Transcript.Len() > 0 { - if err := s.replaceTranscript(ctx, opts.Transcript, opts.Agent, sessionPath, entries); err != nil { + if err := s.replaceTranscript(ctx, opts.Transcript, opts.Agent, opts.PrecomputedBlobs, sessionPath, entries); err != nil { return fmt.Errorf("failed to replace transcript: %w", err) } } @@ -1365,7 +1372,41 @@ func (s *GitStore) UpdateCommitted(ctx context.Context, opts UpdateCommittedOpti // replaceTranscript writes the full transcript content, replacing any existing transcript. // Also removes any chunk files from a previous write and updates the content hash. -func (s *GitStore) replaceTranscript(ctx context.Context, transcript redact.RedactedBytes, agentType types.AgentType, sessionPath string, entries map[string]object.TreeEntry) error { +// +// Short-circuits when the existing content_hash.txt already matches the new +// transcript's sha256 — in that case the chunk entries are preserved as-is and +// no chunking/zlib happens. Use precomputed (non-nil) to reuse blob hashes +// computed once across multiple checkpoints. +func (s *GitStore) replaceTranscript(ctx context.Context, transcript redact.RedactedBytes, agentType types.AgentType, precomputed *PrecomputedTranscriptBlobs, sessionPath string, entries map[string]object.TreeEntry) error { + // Ignore precompute if invariants are violated — fall back to fresh chunking. + if precomputed != nil && !precomputed.isUsable() { + precomputed = nil + } + + // Compute the new content-hash string (cheap — SHA-256 over transcript bytes). + var newContentHash string + if precomputed != nil { + newContentHash = precomputed.ContentHash + } else { + newContentHash = fmt.Sprintf("sha256:%x", sha256.Sum256(transcript.Bytes())) + } + + // Short-circuit: if the existing content_hash.txt already matches, the + // chunk entries currently in `entries` represent the same content. Leave + // everything as-is and skip chunking + zlib. 
+ hashPath := sessionPath + paths.ContentHashFileName + if existing, ok := entries[hashPath]; ok { + if blob, err := s.repo.BlobObject(existing.Hash); err == nil { + if rdr, rerr := blob.Reader(); rerr == nil { + existingHash, readErr := io.ReadAll(rdr) + _ = rdr.Close() + if readErr == nil && string(existingHash) == newContentHash { + return nil + } + } + } + } + // Remove existing transcript files (base + any chunks) transcriptBase := sessionPath + paths.TranscriptFileName for key := range entries { @@ -1374,19 +1415,28 @@ func (s *GitStore) replaceTranscript(ctx context.Context, transcript redact.Reda } } - // Chunk the transcript (matches writeTranscript behavior) - chunks, err := agent.ChunkTranscript(ctx, transcript.Bytes(), agentType) - if err != nil { - return fmt.Errorf("failed to chunk transcript: %w", err) + // Resolve chunk hashes from precompute, or chunk + blob-write now. + var chunkHashes []plumbing.Hash + if precomputed != nil { + chunkHashes = precomputed.ChunkHashes + } else { + chunks, err := chunkTranscript(ctx, transcript.Bytes(), agentType) + if err != nil { + return fmt.Errorf("failed to chunk transcript: %w", err) + } + chunkHashes = make([]plumbing.Hash, len(chunks)) + for i, chunk := range chunks { + blobHash, err := CreateBlobFromContent(s.repo, chunk) + if err != nil { + return fmt.Errorf("failed to create transcript blob: %w", err) + } + chunkHashes[i] = blobHash + } } - // Write chunk files - for i, chunk := range chunks { + // Record chunk files in the tree at v1 (full.jsonl) naming. 
+ for i, blobHash := range chunkHashes { chunkPath := sessionPath + agent.ChunkFileName(paths.TranscriptFileName, i) - blobHash, err := CreateBlobFromContent(s.repo, chunk) - if err != nil { - return fmt.Errorf("failed to create transcript blob: %w", err) - } entries[chunkPath] = object.TreeEntry{ Name: chunkPath, Mode: filemode.Regular, @@ -1394,13 +1444,17 @@ func (s *GitStore) replaceTranscript(ctx context.Context, transcript redact.Reda } } - // Update content hash - contentHash := fmt.Sprintf("sha256:%x", sha256.Sum256(transcript.Bytes())) - hashBlob, err := CreateBlobFromContent(s.repo, []byte(contentHash)) - if err != nil { - return fmt.Errorf("failed to create content hash blob: %w", err) + // Content-hash blob. + var hashBlob plumbing.Hash + if precomputed != nil { + hashBlob = precomputed.ContentHashBlob + } else { + h, err := CreateBlobFromContent(s.repo, []byte(newContentHash)) + if err != nil { + return fmt.Errorf("failed to create content hash blob: %w", err) + } + hashBlob = h } - hashPath := sessionPath + paths.ContentHashFileName entries[hashPath] = object.TreeEntry{ Name: hashPath, Mode: filemode.Regular, @@ -1410,6 +1464,44 @@ func (s *GitStore) replaceTranscript(ctx context.Context, transcript redact.Reda return nil } +// PrecomputeTranscriptBlobs chunks the given transcript and writes each chunk +// plus the content-hash blob to the object store once, returning the resulting +// hashes for reuse across multiple UpdateCommitted calls that share the same +// transcript content. +// +// The returned blobs work for both v1 (full.jsonl) and v2 (raw_transcript) +// paths since blob hashes are content-addressed (SHA-1 of chunk bytes). Only +// the tree-entry filenames differ between v1 and v2. 
+func PrecomputeTranscriptBlobs(ctx context.Context, repo *git.Repository, transcript redact.RedactedBytes, agentType types.AgentType) (*PrecomputedTranscriptBlobs, error) { + raw := transcript.Bytes() + + chunks, err := chunkTranscript(ctx, raw, agentType) + if err != nil { + return nil, fmt.Errorf("failed to chunk transcript: %w", err) + } + + chunkHashes := make([]plumbing.Hash, len(chunks)) + for i, chunk := range chunks { + h, err := CreateBlobFromContent(repo, chunk) + if err != nil { + return nil, fmt.Errorf("failed to create transcript blob: %w", err) + } + chunkHashes[i] = h + } + + contentHash := fmt.Sprintf("sha256:%x", sha256.Sum256(raw)) + hashBlob, err := CreateBlobFromContent(repo, []byte(contentHash)) + if err != nil { + return nil, fmt.Errorf("failed to create content hash blob: %w", err) + } + + return &PrecomputedTranscriptBlobs{ + ChunkHashes: chunkHashes, + ContentHashBlob: hashBlob, + ContentHash: contentHash, + }, nil +} + // ensureSessionsBranch ensures the entire/checkpoints/v1 branch exists. func (s *GitStore) ensureSessionsBranch(ctx context.Context) error { refName := plumbing.NewBranchReferenceName(paths.MetadataBranchName) diff --git a/cmd/entire/cli/checkpoint/committed_update_test.go b/cmd/entire/cli/checkpoint/committed_update_test.go index b2d43df64..3f957f460 100644 --- a/cmd/entire/cli/checkpoint/committed_update_test.go +++ b/cmd/entire/cli/checkpoint/committed_update_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "testing" + "github.com/entireio/cli/cmd/entire/cli/agent/types" "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" "github.com/entireio/cli/cmd/entire/cli/paths" "github.com/entireio/cli/redact" @@ -586,3 +587,183 @@ func TestGetGitAuthorFromRepo_NoConfig(t *testing.T) { // Verify go-git config import is used (compile-time check). 
var _ = config.GlobalScope + +// TestUpdateCommitted_PrecomputedBlobs_Roundtrip verifies that passing +// precomputed blob hashes produces the same on-disk tree content as the +// non-precomputed path. +func TestUpdateCommitted_PrecomputedBlobs_Roundtrip(t *testing.T) { + t.Parallel() + _, store, cpID := setupRepoForUpdate(t) + + transcript := redact.AlreadyRedacted([]byte("line1\nline2\nline3 with some payload\n")) + + precomputed, err := PrecomputeTranscriptBlobs(context.Background(), store.repo, transcript, "") + if err != nil { + t.Fatalf("PrecomputeTranscriptBlobs() error = %v", err) + } + if len(precomputed.ChunkHashes) == 0 { + t.Fatal("precompute returned no chunk hashes") + } + if precomputed.ContentHashBlob.IsZero() { + t.Fatal("precompute returned zero content-hash blob") + } + + if err := store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: transcript, + PrecomputedBlobs: precomputed, + }); err != nil { + t.Fatalf("UpdateCommitted(precomputed) error = %v", err) + } + + content, err := store.ReadSessionContent(context.Background(), cpID, 0) + if err != nil { + t.Fatalf("ReadSessionContent() error = %v", err) + } + if string(content.Transcript) != string(transcript.Bytes()) { + t.Errorf("transcript mismatch via precomputed path\ngot: %q\nwant: %q", + string(content.Transcript), string(transcript.Bytes())) + } +} + +// TestUpdateCommitted_ContentHashShortCircuit verifies that a second update +// with identical transcript content skips chunking entirely (short-circuit +// fires before agent.ChunkTranscript is called). +func TestUpdateCommitted_ContentHashShortCircuit(t *testing.T) { + // Cannot run in parallel: patches the package-level chunkTranscript hook. 
+ _, store, cpID := setupRepoForUpdate(t) + + transcript := redact.AlreadyRedacted([]byte("stable transcript content\n")) + + if err := store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: transcript, + }); err != nil { + t.Fatalf("UpdateCommitted(first) error = %v", err) + } + + // Install a counter. The second UpdateCommitted with identical content + // should never touch the chunking function. + calls := installChunkCounter(t) + + if err := store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: transcript, + }); err != nil { + t.Fatalf("UpdateCommitted(second) error = %v", err) + } + + if *calls != 0 { + t.Errorf("short-circuit failed: chunkTranscript was called %d time(s) on a no-op re-update; expected 0", *calls) + } +} + +// installChunkCounter swaps the package-level chunkTranscript hook for a +// counter and restores it when the test completes. Returns a pointer the +// caller can dereference to read the running count. +func installChunkCounter(t *testing.T) *int { + t.Helper() + original := chunkTranscript + t.Cleanup(func() { chunkTranscript = original }) + var count int + chunkTranscript = func(ctx context.Context, content []byte, agentType types.AgentType) ([][]byte, error) { + count++ + return original(ctx, content, agentType) + } + return &count +} + +// TestUpdateCommitted_ContentChangedRewrites verifies the short-circuit does +// not fire when content actually differs. 
+func TestUpdateCommitted_ContentChangedRewrites(t *testing.T) { + t.Parallel() + repo, store, cpID := setupRepoForUpdate(t) + + first := redact.AlreadyRedacted([]byte("first version\n")) + second := redact.AlreadyRedacted([]byte("second version with more content\n")) + + if err := store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: first, + }); err != nil { + t.Fatalf("UpdateCommitted(first) error = %v", err) + } + hashBefore := readTranscriptBlobHash(t, repo, cpID) + + if err := store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: second, + }); err != nil { + t.Fatalf("UpdateCommitted(second) error = %v", err) + } + hashAfter := readTranscriptBlobHash(t, repo, cpID) + + if hashBefore == hashAfter { + t.Errorf("expected transcript blob to change; stayed at %v", hashBefore) + } + + content, err := store.ReadSessionContent(context.Background(), cpID, 0) + if err != nil { + t.Fatalf("ReadSessionContent() error = %v", err) + } + if string(content.Transcript) != string(second.Bytes()) { + t.Errorf("transcript content mismatch\ngot: %q\nwant: %q", + string(content.Transcript), string(second.Bytes())) + } +} + +// TestPrecomputeAndReuse_MatchesFreshWrite verifies that precomputed blob +// hashes match the hashes produced by a fresh chunk + blob-write pass. 
+func TestPrecomputeAndReuse_MatchesFreshWrite(t *testing.T) { + t.Parallel() + _, store, _ := setupRepoForUpdate(t) + + transcript := redact.AlreadyRedacted([]byte("deterministic content for hash comparison\n")) + + precomputed, err := PrecomputeTranscriptBlobs(context.Background(), store.repo, transcript, "") + if err != nil { + t.Fatalf("PrecomputeTranscriptBlobs() error = %v", err) + } + + freshBlob, err := CreateBlobFromContent(store.repo, transcript.Bytes()) + if err != nil { + t.Fatalf("CreateBlobFromContent() error = %v", err) + } + + if len(precomputed.ChunkHashes) != 1 { + t.Fatalf("expected 1 chunk for small transcript; got %d", len(precomputed.ChunkHashes)) + } + if precomputed.ChunkHashes[0] != freshBlob { + t.Errorf("precomputed chunk hash %v != fresh blob hash %v", + precomputed.ChunkHashes[0], freshBlob) + } +} + +// readTranscriptBlobHash reads the transcript blob hash at session 0 from the +// metadata branch. +func readTranscriptBlobHash(t *testing.T, repo *git.Repository, cpID id.CheckpointID) plumbing.Hash { + t.Helper() + ref, err := repo.Reference(plumbing.NewBranchReferenceName(paths.MetadataBranchName), true) + if err != nil { + t.Fatalf("failed to get ref: %v", err) + } + commit, err := repo.CommitObject(ref.Hash()) + if err != nil { + t.Fatalf("failed to get commit: %v", err) + } + tree, err := commit.Tree() + if err != nil { + t.Fatalf("failed to get tree: %v", err) + } + transcriptPath := cpID.Path() + "/0/" + paths.TranscriptFileName + file, err := tree.File(transcriptPath) + if err != nil { + t.Fatalf("failed to find transcript blob at %s: %v", transcriptPath, err) + } + return file.Hash +} diff --git a/cmd/entire/cli/checkpoint/v2_committed.go b/cmd/entire/cli/checkpoint/v2_committed.go index 5af03004b..bc824a760 100644 --- a/cmd/entire/cli/checkpoint/v2_committed.go +++ b/cmd/entire/cli/checkpoint/v2_committed.go @@ -5,6 +5,7 @@ import ( "crypto/sha256" "errors" "fmt" + "io" "log/slog" "os" "strings" @@ -208,10 +209,40 @@ func (s 
*V2GitStore) updateCommittedFullTranscript(ctx context.Context, opts Upd return err } - // Clear existing transcript artifacts for this session path before writing new ones. - // Preserve non-transcript metadata under the same session (e.g., tasks/*). + // Ignore precompute if invariants are violated — fall back to fresh chunking. + precomputed := opts.PrecomputedBlobs + if precomputed != nil && !precomputed.isUsable() { + precomputed = nil + } + + // Short-circuit: if the existing raw_transcript_hash.txt already matches + // the new transcript's sha256, the existing chunk entries represent the + // same content — preserve them and skip chunking + zlib. rawTranscriptPath := sessionPath + paths.V2RawTranscriptFileName rawHashPath := sessionPath + paths.V2RawTranscriptHashFileName + var newContentHash string + if precomputed != nil { + newContentHash = precomputed.ContentHash + } else { + newContentHash = fmt.Sprintf("sha256:%x", sha256.Sum256(opts.Transcript.Bytes())) + } + if existing, ok := entries[rawHashPath]; ok { + if blob, err := s.repo.BlobObject(existing.Hash); err == nil { + if rdr, rerr := blob.Reader(); rerr == nil { + existingHash, readErr := io.ReadAll(rdr) + _ = rdr.Close() + if readErr == nil && string(existingHash) == newContentHash { + // Content unchanged — skip tree surgery and ref advance to + // avoid a no-op commit on /full/current. The existing ref + // already references the correct tree. + return nil + } + } + } + } + + // Clear existing transcript artifacts for this session path before writing new ones. + // Preserve non-transcript metadata under the same session (e.g., tasks/*). 
for key := range entries { switch { case key == rawTranscriptPath: @@ -223,12 +254,11 @@ func (s *V2GitStore) updateCommittedFullTranscript(ctx context.Context, opts Upd } } - redactedTranscript, err := s.writeTranscriptBlobs(ctx, opts.Transcript, opts.Agent, sessionPath, entries) - if err != nil { + if err := s.writeTranscriptBlobs(ctx, opts.Transcript, opts.Agent, precomputed, sessionPath, entries); err != nil { return err } - if err := s.writeContentHash(redactedTranscript, sessionPath, entries); err != nil { + if err := s.writeContentHashFromPrecompute(newContentHash, precomputed, sessionPath, entries); err != nil { return err } @@ -417,21 +447,6 @@ func (s *V2GitStore) writeMainSessionToSubdirectory(opts WriteCommittedOptions, return filePaths, nil } -// writeContentHash computes and writes the content hash for already-redacted transcript bytes. -func (s *V2GitStore) writeContentHash(redactedTranscript []byte, sessionPath string, entries map[string]object.TreeEntry) error { - contentHash := fmt.Sprintf("sha256:%x", sha256.Sum256(redactedTranscript)) - hashBlob, err := CreateBlobFromContent(s.repo, []byte(contentHash)) - if err != nil { - return err - } - entries[sessionPath+paths.V2RawTranscriptHashFileName] = object.TreeEntry{ - Name: sessionPath + paths.V2RawTranscriptHashFileName, - Mode: filemode.Regular, - Hash: hashBlob, - } - return nil -} - // writeCompactTranscriptHash computes and writes the SHA-256 hash of the compact transcript. 
func (s *V2GitStore) writeCompactTranscriptHash(compactTranscript []byte, sessionPath string, entries map[string]object.TreeEntry) error { hash := fmt.Sprintf("sha256:%x", sha256.Sum256(compactTranscript)) @@ -505,12 +520,12 @@ func (s *V2GitStore) writeCommittedFullTranscript(ctx context.Context, opts Writ } } - redactedTranscript, err := s.writeTranscriptBlobs(ctx, transcript, opts.Agent, sessionPath, entries) - if err != nil { + if err := s.writeTranscriptBlobs(ctx, transcript, opts.Agent, nil, sessionPath, entries); err != nil { return err } - if err := s.writeContentHash(redactedTranscript, sessionPath, entries); err != nil { + contentHash := fmt.Sprintf("sha256:%x", sha256.Sum256(transcript.Bytes())) + if err := s.writeContentHashFromPrecompute(contentHash, nil, sessionPath, entries); err != nil { return err } @@ -548,20 +563,29 @@ func (s *V2GitStore) writeCommittedFullTranscript(ctx context.Context, opts Writ } // writeTranscriptBlobs writes pre-redacted, chunked transcript blobs to entries. -// Returns the transcript bytes so the caller can compute the content hash. -func (s *V2GitStore) writeTranscriptBlobs(ctx context.Context, transcript redact.RedactedBytes, agentType types.AgentType, sessionPath string, entries map[string]object.TreeEntry) ([]byte, error) { - raw := transcript.Bytes() - chunks, err := agent.ChunkTranscript(ctx, raw, agentType) - if err != nil { - return nil, fmt.Errorf("failed to chunk transcript: %w", err) +// When precomputed is non-nil, reuses its chunk blob hashes and skips both +// ChunkTranscript and CreateBlobFromContent. 
+func (s *V2GitStore) writeTranscriptBlobs(ctx context.Context, transcript redact.RedactedBytes, agentType types.AgentType, precomputed *PrecomputedTranscriptBlobs, sessionPath string, entries map[string]object.TreeEntry) error { + var chunkHashes []plumbing.Hash + if precomputed != nil { + chunkHashes = precomputed.ChunkHashes + } else { + chunks, err := chunkTranscript(ctx, transcript.Bytes(), agentType) + if err != nil { + return fmt.Errorf("failed to chunk transcript: %w", err) + } + chunkHashes = make([]plumbing.Hash, len(chunks)) + for i, chunk := range chunks { + h, err := CreateBlobFromContent(s.repo, chunk) + if err != nil { + return fmt.Errorf("failed to create transcript blob: %w", err) + } + chunkHashes[i] = h + } } - for i, chunk := range chunks { + for i, blobHash := range chunkHashes { chunkPath := sessionPath + agent.ChunkFileName(paths.V2RawTranscriptFileName, i) - blobHash, err := CreateBlobFromContent(s.repo, chunk) - if err != nil { - return nil, err - } entries[chunkPath] = object.TreeEntry{ Name: chunkPath, Mode: filemode.Regular, @@ -569,7 +593,29 @@ func (s *V2GitStore) writeTranscriptBlobs(ctx context.Context, transcript redact } } - return raw, nil + return nil +} + +// writeContentHashFromPrecompute writes the content-hash blob for the given +// transcript hash. When precomputed is non-nil, reuses its ContentHashBlob +// hash; otherwise creates a fresh blob. 
+func (s *V2GitStore) writeContentHashFromPrecompute(contentHash string, precomputed *PrecomputedTranscriptBlobs, sessionPath string, entries map[string]object.TreeEntry) error { + var hashBlob plumbing.Hash + if precomputed != nil { + hashBlob = precomputed.ContentHashBlob + } else { + h, err := CreateBlobFromContent(s.repo, []byte(contentHash)) + if err != nil { + return fmt.Errorf("failed to create content hash blob: %w", err) + } + hashBlob = h + } + entries[sessionPath+paths.V2RawTranscriptHashFileName] = object.TreeEntry{ + Name: sessionPath + paths.V2RawTranscriptHashFileName, + Mode: filemode.Regular, + Hash: hashBlob, + } + return nil } // validateWriteOpts validates identifiers in WriteCommittedOptions. diff --git a/cmd/entire/cli/checkpoint/v2_precompute_test.go b/cmd/entire/cli/checkpoint/v2_precompute_test.go new file mode 100644 index 000000000..afb8beb8d --- /dev/null +++ b/cmd/entire/cli/checkpoint/v2_precompute_test.go @@ -0,0 +1,150 @@ +package checkpoint + +import ( + "context" + "testing" + + "github.com/entireio/cli/cmd/entire/cli/agent" + "github.com/entireio/cli/cmd/entire/cli/checkpoint/id" + "github.com/entireio/cli/cmd/entire/cli/paths" + "github.com/entireio/cli/redact" + "github.com/stretchr/testify/require" + + "github.com/go-git/go-git/v6" + "github.com/go-git/go-git/v6/plumbing" +) + +// setupV2ForUpdate creates a V2 store and writes an initial committed +// checkpoint so subsequent UpdateCommitted calls have a target. 
+func setupV2ForUpdate(t *testing.T, initialTranscript []byte) (*git.Repository, *V2GitStore, id.CheckpointID) { + t.Helper() + repo := initTestRepo(t) + store := NewV2GitStore(repo, "origin") + cpID := id.MustCheckpointID("a1b2c3d4e5f6") + + err := store.WriteCommitted(context.Background(), WriteCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Strategy: "manual-commit", + Agent: agent.AgentTypeClaudeCode, + Transcript: redact.AlreadyRedacted(initialTranscript), + Prompts: []string{"initial prompt"}, + AuthorName: "Test", + AuthorEmail: "test@test.com", + }) + require.NoError(t, err) + + return repo, store, cpID +} + +// readV2TranscriptBlobHash reads the /full/current transcript blob hash at +// session 0 for the given checkpoint. +func readV2TranscriptBlobHash(t *testing.T, repo *git.Repository, cpID id.CheckpointID) plumbing.Hash { + t.Helper() + tree := v2FullTree(t, repo) + transcriptPath := cpID.Path() + "/0/" + paths.V2RawTranscriptFileName + file, err := tree.File(transcriptPath) + require.NoError(t, err, "transcript blob not found at %s", transcriptPath) + return file.Hash +} + +// TestV2UpdateCommitted_PrecomputedBlobs_Roundtrip verifies that passing +// precomputed blob hashes produces the same /full/current transcript content +// as the non-precomputed path. 
+func TestV2UpdateCommitted_PrecomputedBlobs_Roundtrip(t *testing.T) { + t.Parallel() + repo, store, cpID := setupV2ForUpdate(t, []byte(`{"type":"assistant","message":"initial"}`)) + + transcript := redact.AlreadyRedacted([]byte(`{"type":"assistant","message":"finalized content"}`)) + precomputed, err := PrecomputeTranscriptBlobs(context.Background(), repo, transcript, agent.AgentTypeClaudeCode) + require.NoError(t, err) + require.NotEmpty(t, precomputed.ChunkHashes) + require.False(t, precomputed.ContentHashBlob.IsZero()) + + err = store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: transcript, + Agent: agent.AgentTypeClaudeCode, + PrecomputedBlobs: precomputed, + }) + require.NoError(t, err) + + got := v2ReadFile(t, v2FullTree(t, repo), cpID.Path()+"/0/"+paths.V2RawTranscriptFileName) + require.Equal(t, string(transcript.Bytes()), got) +} + +// TestV2UpdateCommitted_ContentHashShortCircuit verifies that a second +// identical update to /full/current skips chunking entirely and does not +// advance the ref (no no-op commit). +func TestV2UpdateCommitted_ContentHashShortCircuit(t *testing.T) { + // Cannot run in parallel: patches the package-level chunkTranscript hook. + repo, store, cpID := setupV2ForUpdate(t, []byte(`{"type":"assistant","message":"initial"}`)) + + transcript := redact.AlreadyRedacted([]byte(`{"type":"assistant","message":"stable content"}`)) + + err := store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: transcript, + Agent: agent.AgentTypeClaudeCode, + }) + require.NoError(t, err) + + fullRefName := plumbing.ReferenceName(paths.V2FullCurrentRefName) + refBefore, err := repo.Reference(fullRefName, true) + require.NoError(t, err) + + // Install a counter. The second UpdateCommitted with identical content + // should skip chunking and leave /full/current's ref unchanged. 
+ calls := installChunkCounter(t) + + err = store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: transcript, + Agent: agent.AgentTypeClaudeCode, + }) + require.NoError(t, err) + + require.Equal(t, 0, *calls, + "short-circuit failed: chunkTranscript was called %d time(s) on a no-op re-update", *calls) + + refAfter, err := repo.Reference(fullRefName, true) + require.NoError(t, err) + require.Equal(t, refBefore.Hash(), refAfter.Hash(), + "short-circuit should skip the ref advance on /full/current to avoid a no-op commit") +} + +// TestV2UpdateCommitted_ContentChangedRewrites verifies the v2 short-circuit +// does NOT fire when content actually differs, and that the new content is +// persisted on /full/current. +func TestV2UpdateCommitted_ContentChangedRewrites(t *testing.T) { + t.Parallel() + repo, store, cpID := setupV2ForUpdate(t, []byte(`{"type":"assistant","message":"initial"}`)) + + first := redact.AlreadyRedacted([]byte(`{"type":"assistant","message":"first version"}`)) + second := redact.AlreadyRedacted([]byte(`{"type":"assistant","message":"second version with more content"}`)) + + require.NoError(t, store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: first, + Agent: agent.AgentTypeClaudeCode, + })) + blobBefore := readV2TranscriptBlobHash(t, repo, cpID) + + require.NoError(t, store.UpdateCommitted(context.Background(), UpdateCommittedOptions{ + CheckpointID: cpID, + SessionID: "session-001", + Transcript: second, + Agent: agent.AgentTypeClaudeCode, + })) + blobAfter := readV2TranscriptBlobHash(t, repo, cpID) + + require.NotEqual(t, blobBefore, blobAfter, + "expected /full/current transcript blob to change on content update") + + got := v2ReadFile(t, v2FullTree(t, repo), cpID.Path()+"/0/"+paths.V2RawTranscriptFileName) + require.Equal(t, string(second.Bytes()), got) +} diff --git 
a/cmd/entire/cli/strategy/manual_commit_hooks.go b/cmd/entire/cli/strategy/manual_commit_hooks.go index 6e5505f2a..78e1b3362 100644 --- a/cmd/entire/cli/strategy/manual_commit_hooks.go +++ b/cmd/entire/cli/strategy/manual_commit_hooks.go @@ -2625,6 +2625,29 @@ func (s *ManualCommitStrategy) HandleTurnEnd(ctx context.Context, state *Session return nil } +// precomputeTranscriptBlobsForFinalize chunks + zlib-compresses the redacted +// transcript once for reuse across every checkpoint in the turn. Returns nil +// (without error) when the transcript is empty — downstream stores skip +// transcript updates in that case, so precompute would only write a wasted +// empty-chunk blob to the object store. On failure, logs a warning and +// returns nil so the loop falls back to per-checkpoint chunking. +func precomputeTranscriptBlobsForFinalize(ctx context.Context, repo *git.Repository, transcript redact.RedactedBytes, state *SessionState) *checkpoint.PrecomputedTranscriptBlobs { + if transcript.Len() == 0 { + return nil + } + _, span := perf.Start(ctx, "precompute_transcript_blobs") + defer span.End() + precomputed, err := checkpoint.PrecomputeTranscriptBlobs(ctx, repo, transcript, state.AgentType) + if err != nil { + logging.Warn(ctx, "finalize: precompute transcript blobs failed, falling back to per-checkpoint work", + slog.String("session_id", state.SessionID), + slog.String("error", err.Error()), + ) + return nil + } + return precomputed +} + // finalizeAllTurnCheckpoints replaces the provisional transcript in each checkpoint // created during this turn with the full session transcript. 
// @@ -2729,6 +2752,8 @@ func (s *ManualCommitStrategy) finalizeAllTurnCheckpoints(ctx context.Context, s v2Store = checkpoint.NewV2GitStore(repo, ResolveCheckpointURL(logCtx, "origin")) } + precomputed := precomputeTranscriptBlobsForFinalize(logCtx, repo, redactedTranscript, state) + // Update each checkpoint with the full transcript for _, cpIDStr := range state.TurnCheckpointIDs { cpID, parseErr := id.NewCheckpointID(cpIDStr) @@ -2742,11 +2767,12 @@ func (s *ManualCommitStrategy) finalizeAllTurnCheckpoints(ctx context.Context, s } updateOpts := checkpoint.UpdateCommittedOptions{ - CheckpointID: cpID, - SessionID: state.SessionID, - Transcript: redactedTranscript, - Prompts: prompts, - Agent: state.AgentType, + CheckpointID: cpID, + SessionID: state.SessionID, + Transcript: redactedTranscript, + Prompts: prompts, + Agent: state.AgentType, + PrecomputedBlobs: precomputed, } // Generate compact transcript for v2 /main