Skip to content

Commit 628dc13

Browse files
committed
Fixed some bugs, made sky puppy more crash-safe, and improved logging
1 parent 5e2b149 commit 628dc13

File tree

2 files changed

+104
-83
lines changed

2 files changed

+104
-83
lines changed

src/alerts.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class Alerts {
209209
await fasquest.request(JSON.parse(JSON.stringify(request)));
210210
} catch (e) {
211211
log.error(
212-
`ERROR: Alerter [${alert.alerter}] of type [${alert.type}] could not be reached. Errored with message ${e.err.message}`
212+
`ERROR: Alerter [${alert.alerter}] of type [${alert.type}] could not be reached. Errored with message ${e.err ? e.err.message : e.message}`
213213
);
214214
}
215215
} else {

src/health-check.js

+103-82
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ class HealthCheck {
173173
this.services[name] = nService;
174174

175175
this.services[name]._sTimeoutHandler = setTimeout(() => {
176-
this._runCheck(this.services[name]);
176+
this._run(this.services[name]);
177177
}, (nService.config.start_delay || 0) * 1000);
178178
}
179179
}
@@ -197,106 +197,127 @@ class HealthCheck {
197197
return message || '';
198198
}
199199

200-
async _runCheck(service) {
201-
if (service && service.enabled) {
202-
const startTime = process.hrtime.bigint();
203-
// const oldStatus = service.status.up;
200+
async _runChecker(service, startTime) {
201+
try {
202+
var res = await service.checker.check();
204203

205-
try {
206-
var res = await service.checker.check();
204+
service.status.time =
205+
Number(process.hrtime.bigint() - startTime) / 1000000;
206+
service.status.code = res.code;
207+
service.status.message = this._mapMessages(
208+
res.code,
209+
res.message,
210+
service
211+
);
212+
service.status.up = 1;
213+
214+
if (service.config.expected_status != service.status.code) {
215+
service.status.up = 0;
216+
service.status.count.unhealthy_status++;
217+
log.info(service.name, ' Unhealthy status: ' + service.status.code);
218+
}
207219

220+
if (service.status.time > service.config.expected_response_time) {
221+
service.status.up = 0;
222+
service.status.count.unhealthy_response_time++;
223+
log.info(
224+
service.name,
225+
' Unhealthy response time: ' + service.status.time.toFixed(2) + 'ms'
226+
);
227+
}
228+
229+
if (service.status.up > 0) {
230+
service.status.count.healthy++;
231+
} else {
232+
service.status.count.unhealthy++;
233+
}
234+
} catch (e) {
235+
if (e.message.indexOf('ETIMEDOUT') > -1) {
208236
service.status.time =
209237
Number(process.hrtime.bigint() - startTime) / 1000000;
210-
service.status.code = res.code;
238+
239+
service.status.count.unhealthy++;
240+
service.status.up = 0;
241+
service.status.code = 0;
242+
211243
service.status.message = this._mapMessages(
212-
res.code,
213-
res.message,
244+
service.status.code,
245+
'Timedout',
214246
service
215247
);
216-
service.status.up = 1;
217248

218-
if (service.config.expected_status != service.status.code) {
219-
service.status.up = 0;
220-
service.status.count.unhealthy_status++;
221-
log.info(service.name, ' Unhealthy status: ' + service.status.code);
222-
}
249+
log.info(service.name, ' Unhealthy ETIMEDOUT!');
250+
} else {
251+
service.status.time =
252+
Number(process.hrtime.bigint() - startTime) / 1000000;
253+
service.status.count.down++;
254+
service.status.up = -1;
255+
service.status.code = -1;
256+
service.status.message = this._mapMessages(
257+
service.status.code,
258+
e.message,
259+
service
260+
);
261+
log.info(service.name, ' Down! ', e.message);
262+
}
223263

224-
if (service.status.time > service.config.expected_response_time) {
225-
service.status.up = 0;
226-
service.status.count.unhealthy_response_time++;
227-
log.info(
228-
service.name,
229-
' Unhealthy response time: ' + service.status.time.toFixed(2) + 'ms'
230-
);
231-
}
264+
log.debug(service.name, e.message);
265+
}
232266

233-
if (service.status.up > 0) {
234-
service.status.count.healthy++;
235-
} else {
236-
service.status.count.unhealthy++;
237-
}
238-
} catch (e) {
239-
if (e.message.indexOf('ETIMEDOUT') > -1) {
240-
service.status.up = 0;
241-
service.status.count.unhealthy++;
242-
log.info(service.name, ' Unhealthy ETIMEDOUT!');
243-
} else {
244-
service.status.time =
245-
Number(process.hrtime.bigint() - startTime) / 1000000;
246-
service.status.count.down++;
247-
service.status.up = -1;
248-
service.status.code = 0;
249-
}
267+
if (service.status.last_status == null) {
268+
service.status.last_status = service.status.up;
269+
}
250270

251-
log.debug(service.name, e.message);
271+
if (service.status.up > 0) {
272+
if (!service.status.last_healthy) {
273+
service.status.last_healthy = process.hrtime.bigint();
252274
}
253-
254-
if (service.status.last_status == null) {
255-
service.status.last_status = service.status.up;
275+
if (service.status.last_status < 1 && service.status.last_healthy) {
276+
service.status.last_unhealthy_total_duration = (
277+
Number(process.hrtime.bigint() - service.status.last_unhealthy) /
278+
1000000000
279+
).toFixed(3);
280+
log.info(
281+
service.name,
282+
`healthy again after ${service.status.last_unhealthy_total_duration} second of down time!`
283+
);
284+
service.status.last_healthy = process.hrtime.bigint();
256285
}
286+
} else if (!service.status.last_unhealthy || (service.status.last_status > 0 && service.status.last_unhealthy)) {
287+
service.status.last_unhealthy = process.hrtime.bigint();
288+
}
289+
}
290+
async _run(service) {
291+
if (service && service.enabled) {
292+
const startTime = process.hrtime.bigint();
257293

258-
if (service.status.up > 0) {
259-
if (!service.status.last_healthy) {
260-
service.status.last_healthy = process.hrtime.bigint();
261-
}
262-
if (service.status.last_status < 1 && service.status.last_healthy) {
263-
service.status.last_unhealthy_total_duration = (
264-
Number(process.hrtime.bigint() - service.status.last_unhealthy) /
265-
1000000000
266-
).toFixed(3);
267-
log.info(
268-
service.name,
269-
`healthy again after ${service.status.last_unhealthy_total_duration} second of down time!`
270-
);
271-
service.status.last_healthy = process.hrtime.bigint();
272-
}
273-
} else {
274-
if (!service.status.last_unhealthy) {
275-
service.status.last_unhealthy = process.hrtime.bigint();
276-
}
277-
if (service.status.last_status > 0 && service.status.last_unhealthy) {
278-
service.status.last_unhealthy = process.hrtime.bigint();
279-
}
294+
try {
295+
await this._runChecker(service, startTime);
296+
this.stats.updateService(service.name, service.status);
297+
await this.alerts.alert(service);
298+
} catch (e) {
299+
log.error(e.message);
280300
}
281301

282-
this.stats.updateService(service.name, service.status);
302+
try {
303+
service.status.last_status = service.status.up;
304+
const tout =
305+
service.config.interval -
306+
Number(process.hrtime.bigint() - startTime) / 1000000;
283307

284-
await this.alerts.alert(service);
285-
service.status.last_status = service.status.up;
286-
const tout =
287-
service.config.interval -
288-
Number(process.hrtime.bigint() - startTime) / 1000000;
308+
if (tout <= 0) {
309+
log.debug(service.name + ' tout: ' + (tout > 0 ? tout : 0));
310+
}
289311

290-
if (tout <= 0) {
291-
log.debug(service.name + ' tout: ' + (tout > 0 ? tout : 0));
312+
this.services[service.name]._sTimeoutHandler = setTimeout(
313+
async () => {
314+
this._run(service);
315+
},
316+
tout > 0 ? tout : 0
317+
);
318+
} catch (e) {
319+
log.fatal('Could not run service: ' + (service ? service.name : 'Unknown' + ' e:' + e.message));
292320
}
293-
294-
this.services[service.name]._sTimeoutHandler = setTimeout(
295-
async () => {
296-
this._runCheck(service);
297-
},
298-
tout > 0 ? tout : 0
299-
);
300321
}
301322
}
302323
}

0 commit comments

Comments
 (0)