Skip to content

Commit 380f113

Browse files
Fix memleak in chunkedFileWriter (#1695)
* Fix memleak in chunkedFileWriter
* Ensure Flush waits for workerRoutine to exit
* Rename variable to a more meaningful one
1 parent d76b7bf commit 380f113

File tree

1 file changed

+64
-18
lines changed

1 file changed

+64
-18
lines changed

common/chunkedFileWriter.go

+64-18
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ type chunkedFileWriter struct {
8484

8585
// used for completion
8686
successMd5 chan []byte
87-
failureError chan error
87+
chunkWriterDone chan bool
8888

8989
// controls body-read retries. Public so value can be shared with retryReader
9090
maxRetryPerDownloadBody int
@@ -93,6 +93,10 @@ type chunkedFileWriter struct {
9393
md5ValidationOption HashValidationOption
9494

9595
sourceMd5Exists bool
96+
97+
currentReservedCapacity int64
98+
99+
err error //This field should be set only by workerRoutine
96100
}
97101

98102
type fileChunk struct {
@@ -112,11 +116,12 @@ func NewChunkedFileWriter(ctx context.Context, slicePool ByteSlicePooler, cacheL
112116
cacheLimiter: cacheLimiter,
113117
chunkLogger: chunkLogger,
114118
successMd5: make(chan []byte),
115-
failureError: make(chan error, 1),
119+
chunkWriterDone: make(chan bool, 1),
116120
newUnorderedChunks: make(chan fileChunk, chanBufferSize),
117121
maxRetryPerDownloadBody: maxBodyRetries,
118122
md5ValidationOption: md5ValidationOption,
119123
sourceMd5Exists: sourceMd5Exists,
124+
currentReservedCapacity: 0,
120125
}
121126
go w.workerRoutine(ctx)
122127
return w
@@ -137,14 +142,15 @@ func (w *chunkedFileWriter) WaitToScheduleChunk(ctx context.Context, id ChunkID,
137142
w.chunkLogger.LogChunkStatus(id, EWaitReason.RAMToSchedule())
138143
err := w.cacheLimiter.WaitUntilAdd(ctx, chunkSize, w.shouldUseRelaxedRamThreshold)
139144
if err == nil {
145+
atomic.AddInt64(&w.currentReservedCapacity, chunkSize)
140146
atomic.AddInt32(&w.activeChunkCount, 1)
141147
}
142148
return err
149+
//At this point, the book-keeping of this memory is chunkedFileWriter's responsibility
143150
}
144151

145152
// Threadsafe method to enqueue a new chunk for processing
146-
func (w *chunkedFileWriter) EnqueueChunk(ctx context.Context, id ChunkID, chunkSize int64, chunkContents io.Reader, retryable bool) error {
147-
153+
func (w *chunkedFileWriter) EnqueueChunk(ctx context.Context, id ChunkID, chunkSize int64, chunkContents io.Reader, retryable bool) (err error) {
148154
readDone := make(chan struct{})
149155
if retryable {
150156
// if retryable == true, that tells us that closing the reader
@@ -158,8 +164,21 @@ func (w *chunkedFileWriter) EnqueueChunk(ctx context.Context, id ChunkID, chunkS
158164

159165
// read into a buffer
160166
buffer := w.slicePool.RentSlice(chunkSize)
167+
168+
defer func() {
169+
//cleanup stuff if we abruptly quit
170+
if err == nil {
171+
return //We've successfully queued, the worker will now takeover
172+
}
173+
w.cacheLimiter.Remove(chunkSize) // remove this from the tally of scheduled-but-unsaved bytes
174+
atomic.AddInt64(&w.currentReservedCapacity, -chunkSize)
175+
w.slicePool.ReturnSlice(buffer)
176+
atomic.AddInt32(&w.activeChunkCount, -1)
177+
w.chunkLogger.LogChunkStatus(id, EWaitReason.ChunkDone()) // this chunk is all finished
178+
}()
179+
161180
readStart := time.Now()
162-
_, err := io.ReadFull(chunkContents, buffer)
181+
_, err = io.ReadFull(chunkContents, buffer)
163182
close(readDone)
164183
if err != nil {
165184
return err
@@ -172,15 +191,14 @@ func (w *chunkedFileWriter) EnqueueChunk(ctx context.Context, id ChunkID, chunkS
172191
// enqueue it
173192
w.chunkLogger.LogChunkStatus(id, EWaitReason.Sorting())
174193
select {
175-
case err = <-w.failureError:
194+
case <-w.chunkWriterDone:
195+
err = w.err
176196
if err != nil {
177197
return err
178198
}
179199
return ChunkWriterAlreadyFailed // channel returned nil because it was closed and empty
180-
case <-ctx.Done():
181-
return ctx.Err()
182200
case w.newUnorderedChunks <- fileChunk{id: id, data: buffer}:
183-
return nil
201+
return
184202
}
185203
}
186204

@@ -189,15 +207,30 @@ func (w *chunkedFileWriter) Flush(ctx context.Context) ([]byte, error) {
189207
// let worker know that no more will be coming
190208
close(w.newUnorderedChunks)
191209

210+
/*
211+
* We clear accounted but unused memory, i.e capacity, here. This capacity was
212+
* requested from cacheLimiter when we were waiting to schedule this chunk.
213+
* The below statement needs to happen after we've waited for all the chunks.
214+
*
215+
* Why should we do this?
216+
* Ideally, the capacity should be zero here, because workerRoutine() would return
217+
* the slice after saving the chunk. However, transferProcessor() is designed such that
218+
* it has to schedule all chunks of jptm even if it has detected a failure in between.
219+
* In such a case, we'd have added to the capacity of the fileWriter, while the
220+
* workerRoutine() has already exited. We release that capacity here. When Flush() finds
221+
* active chunks here, it is only those which have not rented a slice.
222+
*/
223+
defer func() {
224+
w.cacheLimiter.Remove(atomic.LoadInt64(&w.currentReservedCapacity))
225+
}()
226+
192227
// wait until all written to disk
193228
select {
194-
case err := <-w.failureError:
195-
if err != nil {
196-
return nil, err
229+
case <-w.chunkWriterDone:
230+
if w.err != nil {
231+
return nil, w.err
197232
}
198233
return nil, ChunkWriterAlreadyFailed // channel returned nil because it was closed and empty
199-
case <-ctx.Done():
200-
return nil, ctx.Err()
201234
case md5AtCompletion := <-w.successMd5:
202235
return md5AtCompletion, nil
203236
}
@@ -221,6 +254,19 @@ func (w *chunkedFileWriter) workerRoutine(ctx context.Context) {
221254
md5Hasher = &nullHasher{}
222255
}
223256

257+
defer func() {
258+
//cleanup stuff if we abruptly quit
259+
for _, chunk := range unsavedChunksByFileOffset {
260+
w.cacheLimiter.Remove(int64(chunk.id.length)) // remove this from the tally of scheduled-but-unsaved bytes
261+
atomic.AddInt64(&w.currentReservedCapacity, -chunk.id.length)
262+
w.slicePool.ReturnSlice(chunk.data)
263+
atomic.AddInt32(&w.activeChunkCount, -1)
264+
w.chunkLogger.LogChunkStatus(chunk.id, EWaitReason.ChunkDone()) // this chunk is all finished
265+
}
266+
close(w.chunkWriterDone) // must close because many goroutines may be calling the public methods, and all need to be able to tell there's been an error, even tho only one will get the actual error
267+
unsavedChunksByFileOffset = nil
268+
}()
269+
224270
for {
225271
var newChunk fileChunk
226272
var channelIsOpen bool
@@ -236,7 +282,7 @@ func (w *chunkedFileWriter) workerRoutine(ctx context.Context) {
236282
return
237283
}
238284
case <-ctx.Done(): // If cancelled out in the middle of enqueuing chunks OR processing chunks, they will both cleanly cancel out and we'll get back to here.
239-
w.failureError <- ctx.Err()
285+
w.err = ctx.Err()
240286
return
241287
}
242288

@@ -248,8 +294,7 @@ func (w *chunkedFileWriter) workerRoutine(ctx context.Context) {
248294
w.setStatusForContiguousAvailableChunks(unsavedChunksByFileOffset, nextOffsetToSave, ctx) // update states of those that have all their prior ones already here
249295
err := w.sequentiallyProcessAvailableChunks(unsavedChunksByFileOffset, &nextOffsetToSave, md5Hasher, ctx)
250296
if err != nil {
251-
w.failureError <- err
252-
close(w.failureError) // must close because many goroutines may be calling the public methods, and all need to be able to tell there's been an error, even tho only one will get the actual error
297+
w.err = err
253298
return // no point in processing any more after a failure
254299
}
255300
}
@@ -305,8 +350,9 @@ func (w *chunkedFileWriter) setStatusForContiguousAvailableChunks(unsavedChunksB
305350
func (w *chunkedFileWriter) saveOneChunk(chunk fileChunk, md5Hasher hash.Hash) error {
306351
defer func() {
307352
w.cacheLimiter.Remove(int64(len(chunk.data))) // remove this from the tally of scheduled-but-unsaved bytes
308-
atomic.AddInt32(&w.activeChunkCount, -1)
309353
w.slicePool.ReturnSlice(chunk.data)
354+
atomic.AddInt32(&w.activeChunkCount, -1)
355+
atomic.AddInt64(&w.currentReservedCapacity, -chunk.id.length)
310356
w.chunkLogger.LogChunkStatus(chunk.id, EWaitReason.ChunkDone()) // this chunk is all finished
311357
}()
312358

0 commit comments

Comments (0)