Re: [PR] Fix snapshot handling to prevent crashes [skywalking-banyandb]

via GitHub Thu, 19 Mar 2026 02:50:03 -0700


Copilot commented on code in PR #1016:
URL: 
https://github.com/apache/skywalking-banyandb/pull/1016#discussion_r2958926794



##########
banyand/trace/tstable.go:
##########
@@ -153,38 +157,56 @@ func (tst *tsTable) mustWriteSnapshot(snapshot uint64, 
partNames []string) {
                logger.Panicf("cannot marshal partNames to JSON: %s", err)
        }
        snapshotPath := filepath.Join(tst.root, snapshotName(snapshot))
-       lf, err := tst.fileSystem.CreateLockFile(snapshotPath, storage.FilePerm)
+       snapshotTempPath := snapshotPath + ".tmp"
+       lf, err := tst.fileSystem.CreateLockFile(snapshotTempPath, 
storage.FilePerm)
        if err != nil {
-               logger.Panicf("cannot create lock file %s: %s", snapshotPath, 
err)
+               logger.Panicf("cannot create lock file %s: %s", 
snapshotTempPath, err)
        }
        n, err := lf.Write(data)
        if err != nil {
-               logger.Panicf("cannot write snapshot %s: %s", snapshotPath, err)
+               _ = lf.Close()
+               logger.Panicf("cannot write snapshot %s: %s", snapshotTempPath, 
err)
        }
        if n != len(data) {
-               logger.Panicf("unexpected number of bytes written to %s; got 
%d; want %d", snapshotPath, n, len(data))
+               _ = lf.Close()
+               logger.Panicf("unexpected number of bytes written to %s; got 
%d; want %d", snapshotTempPath, n, len(data))
        }
+       if closeErr := lf.Close(); closeErr != nil {
+               logger.Panicf("cannot close snapshot temp file %s: %s", 
snapshotTempPath, closeErr)
+       }
+       if renameErr := tst.fileSystem.Rename(snapshotTempPath, snapshotPath); 
renameErr != nil {
+               logger.Panicf("cannot rename snapshot %s to %s: %s", 
snapshotTempPath, snapshotPath, renameErr)
+       }
+       tst.fileSystem.SyncPath(tst.root)
 }

Review Comment:
   mustWriteSnapshot now writes to a ".tmp" file (".snp.tmp") and then renames. 
If the process crashes between write and rename, that temp file will be left 
behind and initTSTable currently ignores it (since filepath.Ext is ".tmp"), so 
these can accumulate indefinitely. Consider adding startup cleanup for leftover 
snapshot temp files (e.g., delete files matching snapshotSuffix+".tmp") or 
choosing a temp naming scheme that initTSTable explicitly handles/removes.



##########
banyand/stream/tstable.go:
##########
@@ -152,38 +156,56 @@ func (tst *tsTable) mustWriteSnapshot(snapshot uint64, 
partNames []string) {
                logger.Panicf("cannot marshal partNames to JSON: %s", err)
        }
        snapshotPath := filepath.Join(tst.root, snapshotName(snapshot))
-       lf, err := tst.fileSystem.CreateLockFile(snapshotPath, storage.FilePerm)
+       snapshotTempPath := snapshotPath + ".tmp"
+       lf, err := tst.fileSystem.CreateLockFile(snapshotTempPath, 
storage.FilePerm)
        if err != nil {
-               logger.Panicf("cannot create lock file %s: %s", snapshotPath, 
err)
+               logger.Panicf("cannot create lock file %s: %s", 
snapshotTempPath, err)
        }
        n, err := lf.Write(data)
        if err != nil {
-               logger.Panicf("cannot write snapshot %s: %s", snapshotPath, err)
+               _ = lf.Close()
+               logger.Panicf("cannot write snapshot %s: %s", snapshotTempPath, 
err)
        }
        if n != len(data) {
-               logger.Panicf("unexpected number of bytes written to %s; got 
%d; want %d", snapshotPath, n, len(data))
+               _ = lf.Close()
+               logger.Panicf("unexpected number of bytes written to %s; got 
%d; want %d", snapshotTempPath, n, len(data))
+       }
+       if closeErr := lf.Close(); closeErr != nil {
+               logger.Panicf("cannot close snapshot temp file %s: %s", 
snapshotTempPath, closeErr)
        }
+       if renameErr := tst.fileSystem.Rename(snapshotTempPath, snapshotPath); 
renameErr != nil {
+               logger.Panicf("cannot rename snapshot %s to %s: %s", 
snapshotTempPath, snapshotPath, renameErr)
+       }
+       tst.fileSystem.SyncPath(tst.root)
 }

Review Comment:
   mustWriteSnapshot now writes a snapshot to a ".tmp" file (".snp.tmp") and 
renames it. If the process crashes before the rename, the temp file will remain 
and initTSTable currently ignores it (ext ".tmp"), so these leftovers can 
accumulate over time. Consider deleting stale snapshot temp files during 
initTSTable or using a temp naming convention that initTSTable explicitly 
cleans up.



##########
banyand/trace/tstable.go:
##########
@@ -262,9 +284,20 @@ func initTSTable(fileSystem fs.FileSystem, rootPath 
string, p common.Position,
        sort.Slice(loadedSnapshots, func(i, j int) bool {
                return loadedSnapshots[i] > loadedSnapshots[j]
        })
-       epoch := loadedSnapshots[0]
-       tst.loadSnapshot(epoch, loadedParts)
-       return &tst, epoch
+       var failedSnapshots []uint64
+       for _, epoch := range loadedSnapshots {
+               loadErr := tst.loadSnapshot(epoch, loadedParts)
+               if loadErr == nil {
+                       return &tst, epoch
+               }
+               tst.l.Warn().Err(loadErr).Uint64("epoch", epoch).Msg("cannot 
load snapshot, trying next older")
+               failedSnapshots = append(failedSnapshots, epoch)
+       }
+       for _, id := range failedSnapshots {
+               tst.l.Info().Str("path", filepath.Join(rootPath, 
snapshotName(id))).Msg("delete unreadable snapshot file")
+               fileSystem.MustRMAll(filepath.Join(rootPath, snapshotName(id)))
+       }

Review Comment:
   initTSTable collects failedSnapshots but returns immediately on the first 
successfully loaded older snapshot, so any newer unreadable snapshots 
encountered earlier are never deleted. This will cause a warning (and fallback 
scan) on every restart. If the intent is to clean up corrupt snapshots, delete 
each failed snapshot immediately (or defer cleanup before returning on success).
   



##########
banyand/stream/tstable.go:
##########
@@ -273,9 +295,20 @@ func initTSTable(fileSystem fs.FileSystem, rootPath 
string, p common.Position,
        sort.Slice(loadedSnapshots, func(i, j int) bool {
                return loadedSnapshots[i] > loadedSnapshots[j]
        })
-       epoch := loadedSnapshots[0]
-       tst.loadSnapshot(epoch, loadedParts)
-       return &tst, epoch, nil
+       var failedSnapshots []uint64
+       for _, epoch := range loadedSnapshots {
+               loadErr := tst.loadSnapshot(epoch, loadedParts)
+               if loadErr == nil {
+                       return &tst, epoch, nil
+               }
+               tst.l.Warn().Err(loadErr).Uint64("epoch", epoch).Msg("cannot 
load snapshot, trying next older")
+               failedSnapshots = append(failedSnapshots, epoch)
+       }
+       for _, id := range failedSnapshots {
+               tst.l.Info().Str("path", filepath.Join(rootPath, 
snapshotName(id))).Msg("delete unreadable snapshot file")
+               fileSystem.MustRMAll(filepath.Join(rootPath, snapshotName(id)))
+       }

Review Comment:
   initTSTable appends epochs to failedSnapshots, but because it returns as 
soon as one older snapshot loads successfully, any newer unreadable snapshots 
are never removed. This will keep emitting warnings and doing fallback work on 
every restart. Consider removing a failed snapshot immediately after 
loadSnapshot fails (or ensure cleanup runs before the successful return).
   



##########
banyand/measure/tstable.go:
##########
@@ -117,9 +117,20 @@ func initTSTable(fileSystem fs.FileSystem, rootPath 
string, p common.Position,
        sort.Slice(loadedSnapshots, func(i, j int) bool {
                return loadedSnapshots[i] > loadedSnapshots[j]
        })
-       epoch := loadedSnapshots[0]
-       tst.loadSnapshot(epoch, loadedParts)
-       return &tst, epoch
+       var failedSnapshots []uint64
+       for _, epoch := range loadedSnapshots {
+               loadErr := tst.loadSnapshot(epoch, loadedParts)
+               if loadErr == nil {
+                       return &tst, epoch
+               }
+               tst.l.Warn().Err(loadErr).Uint64("epoch", epoch).Msg("cannot 
load snapshot, trying next older")
+               failedSnapshots = append(failedSnapshots, epoch)
+       }
+       for _, id := range failedSnapshots {
+               tst.l.Info().Str("path", filepath.Join(rootPath, 
snapshotName(id))).Msg("delete unreadable snapshot file")
+               fileSystem.MustRMAll(filepath.Join(rootPath, snapshotName(id)))
+       }

Review Comment:
   initTSTable records failedSnapshots but returns immediately when an older 
snapshot loads, so any newer unreadable snapshots encountered earlier are never 
deleted. This will cause repeated warnings and fallback scanning on every 
restart. If cleanup is desired, delete the failed snapshot right after 
loadSnapshot fails, or run the cleanup before returning on success (e.g., via 
defer).
   



##########
banyand/measure/tstable.go:
##########
@@ -235,38 +250,56 @@ func (tst *tsTable) mustWriteSnapshot(snapshot uint64, 
partNames []string) {
                logger.Panicf("cannot marshal partNames to JSON: %s", err)
        }
        snapshotPath := filepath.Join(tst.root, snapshotName(snapshot))
-       lf, err := tst.fileSystem.CreateLockFile(snapshotPath, storage.FilePerm)
+       snapshotTempPath := snapshotPath + ".tmp"
+       lf, err := tst.fileSystem.CreateLockFile(snapshotTempPath, 
storage.FilePerm)
        if err != nil {
-               logger.Panicf("cannot create lock file %s: %s", snapshotPath, 
err)
+               logger.Panicf("cannot create lock file %s: %s", 
snapshotTempPath, err)
        }
        n, err := lf.Write(data)
        if err != nil {
-               logger.Panicf("cannot write snapshot %s: %s", snapshotPath, err)
+               _ = lf.Close()
+               logger.Panicf("cannot write snapshot %s: %s", snapshotTempPath, 
err)
        }
        if n != len(data) {
-               logger.Panicf("unexpected number of bytes written to %s; got 
%d; want %d", snapshotPath, n, len(data))
+               _ = lf.Close()
+               logger.Panicf("unexpected number of bytes written to %s; got 
%d; want %d", snapshotTempPath, n, len(data))
        }
+       if closeErr := lf.Close(); closeErr != nil {
+               logger.Panicf("cannot close snapshot temp file %s: %s", 
snapshotTempPath, closeErr)
+       }
+       if renameErr := tst.fileSystem.Rename(snapshotTempPath, snapshotPath); 
renameErr != nil {
+               logger.Panicf("cannot rename snapshot %s to %s: %s", 
snapshotTempPath, snapshotPath, renameErr)
+       }
+       tst.fileSystem.SyncPath(tst.root)
 }

Review Comment:
   mustWriteSnapshot writes to snapshotPath+".tmp" and then renames. If a crash 
happens before rename, the ".snp.tmp" file will be left behind and initTSTable 
currently ignores it (ext ".tmp"), so temp files can accumulate over time. 
Consider cleaning up stale snapshot temp files during initTSTable or adopting a 
temp naming scheme that initTSTable explicitly detects/removes.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix snapshot handling to prevent crashes [skywalking-banyandb]

Reply via email to