1
1
// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0.
2
2
3
3
use std:: {
4
- sync:: { Arc , Mutex } ,
4
+ sync:: {
5
+ atomic:: { AtomicUsize , Ordering } ,
6
+ Arc , Mutex ,
7
+ } ,
5
8
time:: Instant ,
6
9
} ;
7
10
8
11
use common_util:: { runtime:: Runtime , time:: InstantExt } ;
9
12
use futures:: Future ;
10
- use log:: error;
13
+ use log:: { error, warn } ;
11
14
use table_engine:: table:: TableId ;
12
15
use tokio:: sync:: {
13
16
oneshot,
14
17
watch:: { self , Receiver , Sender } ,
15
18
} ;
16
19
20
+ use super :: flush_compaction:: { BackgroundFlushFailed , TableFlushOptions } ;
17
21
use crate :: {
18
- instance:: flush_compaction:: { BackgroundFlushFailed , Other , Result } ,
22
+ instance:: flush_compaction:: { Other , Result } ,
19
23
table:: metrics:: Metrics ,
20
24
} ;
21
25
@@ -34,6 +38,26 @@ type ScheduleSyncRef = Arc<ScheduleSync>;
34
38
struct ScheduleSync {
35
39
state : Mutex < FlushState > ,
36
40
notifier : Sender < ( ) > ,
41
+ continuous_flush_failure_count : AtomicUsize ,
42
+ }
43
+
44
+ impl ScheduleSync {
45
+ #[ inline]
46
+ pub fn should_retry_flush ( & self , max_retry_limit : usize ) -> bool {
47
+ self . continuous_flush_failure_count . load ( Ordering :: Relaxed ) < max_retry_limit
48
+ }
49
+
50
+ #[ inline]
51
+ pub fn reset_flush_failure_count ( & self ) {
52
+ self . continuous_flush_failure_count
53
+ . store ( 0 , Ordering :: Relaxed ) ;
54
+ }
55
+
56
+ #[ inline]
57
+ pub fn inc_flush_failure_count ( & self ) {
58
+ self . continuous_flush_failure_count
59
+ . fetch_add ( 1 , Ordering :: Relaxed ) ;
60
+ }
37
61
}
38
62
39
63
pub struct TableFlushScheduler {
@@ -47,6 +71,7 @@ impl Default for TableFlushScheduler {
47
71
let schedule_sync = ScheduleSync {
48
72
state : Mutex :: new ( FlushState :: Ready ) ,
49
73
notifier : tx,
74
+ continuous_flush_failure_count : AtomicUsize :: new ( 0 ) ,
50
75
} ;
51
76
Self {
52
77
schedule_sync : Arc :: new ( schedule_sync) ,
@@ -105,7 +130,7 @@ impl TableFlushScheduler {
105
130
flush_job : F ,
106
131
on_flush_success : T ,
107
132
block_on_write_thread : bool ,
108
- res_sender : Option < oneshot :: Sender < Result < ( ) > > > ,
133
+ opts : TableFlushOptions ,
109
134
runtime : & Runtime ,
110
135
metrics : & Metrics ,
111
136
) -> Result < ( ) >
@@ -131,7 +156,21 @@ impl TableFlushScheduler {
131
156
}
132
157
FlushState :: Flushing => ( ) ,
133
158
FlushState :: Failed { err_msg } => {
134
- return BackgroundFlushFailed { msg : err_msg } . fail ( ) ;
159
+ if self
160
+ . schedule_sync
161
+ . should_retry_flush ( opts. max_retry_flush_limit )
162
+ {
163
+ warn ! ( "Re-flush memory tables after background flush failed:{err_msg}" ) ;
164
+ // Mark the worker is flushing.
165
+ * flush_state = FlushState :: Flushing ;
166
+ break ;
167
+ } else {
168
+ return BackgroundFlushFailed {
169
+ msg : err_msg,
170
+ retry_count : opts. max_retry_flush_limit ,
171
+ }
172
+ . fail ( ) ;
173
+ }
135
174
}
136
175
}
137
176
@@ -164,7 +203,7 @@ impl TableFlushScheduler {
164
203
if flush_res. is_ok ( ) {
165
204
on_flush_success. await ;
166
205
}
167
- send_flush_result ( res_sender, flush_res) ;
206
+ send_flush_result ( opts . res_sender , flush_res) ;
168
207
} ;
169
208
170
209
if block_on_write_thread {
@@ -182,9 +221,11 @@ fn on_flush_finished(schedule_sync: ScheduleSyncRef, res: &Result<()>) {
182
221
let mut flush_state = schedule_sync. state . lock ( ) . unwrap ( ) ;
183
222
match res {
184
223
Ok ( ( ) ) => {
224
+ schedule_sync. reset_flush_failure_count ( ) ;
185
225
* flush_state = FlushState :: Ready ;
186
226
}
187
227
Err ( e) => {
228
+ schedule_sync. inc_flush_failure_count ( ) ;
188
229
let err_msg = e. to_string ( ) ;
189
230
* flush_state = FlushState :: Failed { err_msg } ;
190
231
}
0 commit comments