mirror of
https://github.com/openstf/stf
synced 2025-10-04 10:19:30 +02:00
React to device reboots more gracefully. Previously, the provider would attempt to restart the worker as soon as an essential service died, even if the device was soon released after. Now that doesn't happen.
This commit is contained in:
parent
8db22952bf
commit
816904b9bb
2 changed files with 106 additions and 27 deletions
|
@ -24,6 +24,10 @@ program
|
||||||
, 'name (or os.hostname())'
|
, 'name (or os.hostname())'
|
||||||
, String
|
, String
|
||||||
, os.hostname())
|
, os.hostname())
|
||||||
|
.option('-t, --restart-threshold <ms>'
|
||||||
|
, 'restart worker only if it stays alive for longer than this'
|
||||||
|
, Number
|
||||||
|
, 10000)
|
||||||
.action(function() {
|
.action(function() {
|
||||||
var serials = cliutil.allUnknownArgs(arguments)
|
var serials = cliutil.allUnknownArgs(arguments)
|
||||||
, options = cliutil.lastArg(arguments)
|
, options = cliutil.lastArg(arguments)
|
||||||
|
@ -37,6 +41,9 @@ program
|
||||||
|
|
||||||
require('./roles/provider')({
|
require('./roles/provider')({
|
||||||
name: options.name
|
name: options.name
|
||||||
|
, restartThreshold: options.restartThreshold
|
||||||
|
, restartTimeout: 1000
|
||||||
|
, killTimeout: 10000
|
||||||
, filter: function(device) {
|
, filter: function(device) {
|
||||||
return serials.length === 0 || serials.indexOf(device.id) !== -1
|
return serials.length === 0 || serials.indexOf(device.id) !== -1
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
var path = require('path')
|
var path = require('path')
|
||||||
|
var events = require('events')
|
||||||
|
|
||||||
var adb = require('adbkit')
|
var adb = require('adbkit')
|
||||||
var Promise = require('bluebird')
|
var Promise = require('bluebird')
|
||||||
|
@ -12,6 +13,7 @@ module.exports = function(options) {
|
||||||
var log = logger.createLogger('provider')
|
var log = logger.createLogger('provider')
|
||||||
var client = Promise.promisifyAll(adb.createClient())
|
var client = Promise.promisifyAll(adb.createClient())
|
||||||
var workers = Object.create(null)
|
var workers = Object.create(null)
|
||||||
|
var tracker = new events.EventEmitter()
|
||||||
|
|
||||||
// Output
|
// Output
|
||||||
var push = zmq.socket('push')
|
var push = zmq.socket('push')
|
||||||
|
@ -20,34 +22,46 @@ module.exports = function(options) {
|
||||||
push.connect(endpoint)
|
push.connect(endpoint)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
tracker.on('add', function(device) {
|
||||||
|
pushDeviceStatus(device, device.type)
|
||||||
|
maybeConnect(device)
|
||||||
|
})
|
||||||
|
|
||||||
|
tracker.on('change', function(device) {
|
||||||
|
pushDeviceStatus(device, device.type)
|
||||||
|
maybeConnect(device) || maybeDisconnect(device)
|
||||||
|
})
|
||||||
|
|
||||||
|
tracker.on('remove', function(device) {
|
||||||
|
pushDeviceStatus(device, 'absent')
|
||||||
|
maybeDisconnect(device)
|
||||||
|
})
|
||||||
|
|
||||||
client.trackDevicesAsync()
|
client.trackDevicesAsync()
|
||||||
.then(function(tracker) {
|
.then(function(unfilteredTracker) {
|
||||||
log.info('Tracking devices')
|
log.info('Tracking devices')
|
||||||
|
|
||||||
tracker.on('add', function(device) {
|
unfilteredTracker.on('add', function(device) {
|
||||||
if (isWantedDevice(device)) {
|
if (isWantedDevice(device)) {
|
||||||
log.info('Found device "%s" (%s)', device.id, device.type)
|
log.info('Found device "%s" (%s)', device.id, device.type)
|
||||||
pushDeviceStatus(device, device.type)
|
tracker.emit('add', device)
|
||||||
maybeConnect(device)
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
log.info('Ignoring device "%s" (%s)', device.id, device.type)
|
log.info('Ignoring device "%s" (%s)', device.id, device.type)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
tracker.on('change', function(device) {
|
unfilteredTracker.on('change', function(device) {
|
||||||
if (isWantedDevice(device)) {
|
if (isWantedDevice(device)) {
|
||||||
log.info('Device "%s" is now "%s"', device.id, device.type)
|
log.info('Device "%s" is now "%s"', device.id, device.type)
|
||||||
pushDeviceStatus(device, device.type)
|
tracker.emit('change', device)
|
||||||
maybeConnect(device) || maybeDisconnect(device)
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
tracker.on('remove', function(device) {
|
unfilteredTracker.on('remove', function(device) {
|
||||||
if (isWantedDevice(device)) {
|
if (isWantedDevice(device)) {
|
||||||
log.info('Lost device "%s" (%s)', device.id, device.type)
|
log.info('Lost device "%s" (%s)', device.id, device.type)
|
||||||
pushDeviceStatus(device, 'absent')
|
tracker.emit('remove', device)
|
||||||
maybeDisconnect(device)
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
@ -77,13 +91,15 @@ module.exports = function(options) {
|
||||||
|
|
||||||
function maybeConnect(device) {
|
function maybeConnect(device) {
|
||||||
if (isConnectable(device) && !isConnected(device)) {
|
if (isConnectable(device) && !isConnected(device)) {
|
||||||
log.info('Spawning worker for device "%s"', device.id)
|
log.info('Spawning device worker "%s"', device.id)
|
||||||
var proc = options.fork(device)
|
var proc = options.fork(device)
|
||||||
proc.on('error', function(err) {
|
|
||||||
|
function errorListener(err) {
|
||||||
log.error('Device worker "%s" had an error: %s',
|
log.error('Device worker "%s" had an error: %s',
|
||||||
device.id, err.message)
|
device.id, err.message)
|
||||||
})
|
}
|
||||||
proc.on('exit', function(code, signal) {
|
|
||||||
|
function exitListener(code, signal) {
|
||||||
var data = workers[device.id]
|
var data = workers[device.id]
|
||||||
delete workers[device.id]
|
delete workers[device.id]
|
||||||
switch (code) {
|
switch (code) {
|
||||||
|
@ -95,23 +111,54 @@ module.exports = function(options) {
|
||||||
, device.id)
|
, device.id)
|
||||||
break
|
break
|
||||||
default:
|
default:
|
||||||
log.error('Device worker "%s" had a dirty exit (code %d)',
|
if (Date.now() - data.started < options.restartThreshold) {
|
||||||
device.id, code)
|
log.error(
|
||||||
if (Date.now() - data.started < 10000) {
|
'Device worker "%s" died with exit code %d, ' +
|
||||||
log.error('Device worker "%s" failed within 10 seconds of startup,' +
|
'NOT restarting due to threshold of %dms not being met'
|
||||||
' will not attempt to restart', device.id)
|
, device.id
|
||||||
|
, code
|
||||||
|
, options.restartThreshold
|
||||||
|
)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
log.info('Restarting worker of "%s"', device.id)
|
log.error(
|
||||||
maybeConnect(device)
|
'Device worker "%s" died with exit code %d, ' +
|
||||||
|
'attempting to restart in %dms if device is still around'
|
||||||
|
, device.id
|
||||||
|
, code
|
||||||
|
, options.restartTimeout
|
||||||
|
)
|
||||||
|
waitForAnyChanges(device)
|
||||||
|
.timeout(options.restartTimeout)
|
||||||
|
.then(function(device) {
|
||||||
|
// Most likely we lost the device, but our tracker didn't
|
||||||
|
// see it before the process died
|
||||||
|
log.warn(
|
||||||
|
'Not restarting device worker "%s" due to tracker ' +
|
||||||
|
'activity (but the change may cause it to start)'
|
||||||
|
, device.id
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.catch(function() {
|
||||||
|
log.info('Restarting device worker "%s"', device.id)
|
||||||
|
maybeConnect(device)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
})
|
}
|
||||||
|
|
||||||
|
proc.on('error', errorListener)
|
||||||
|
proc.on('exit', exitListener)
|
||||||
|
|
||||||
workers[device.id] = {
|
workers[device.id] = {
|
||||||
device: device
|
device: device
|
||||||
, proc: proc
|
, proc: proc
|
||||||
, started: Date.now()
|
, started: Date.now()
|
||||||
|
, unbind: function() {
|
||||||
|
proc.removeListener('error', errorListener)
|
||||||
|
proc.removeListener('exit', exitListener)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
@ -120,21 +167,44 @@ module.exports = function(options) {
|
||||||
|
|
||||||
function maybeDisconnect(device) {
|
function maybeDisconnect(device) {
|
||||||
if (isConnected(device)) {
|
if (isConnected(device)) {
|
||||||
log.info('Releasing worker of %s', device.id)
|
log.info('Releasing device worker "%s"', device.id)
|
||||||
gracefullyKillWorker(device.id)
|
gracefullyKillWorker(device.id)
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function waitForAnyChanges(device) {
|
||||||
|
var resolver = Promise.defer()
|
||||||
|
|
||||||
|
function maybeResolve(otherDevice) {
|
||||||
|
if (otherDevice.id === device.id) {
|
||||||
|
resolver.resolve(otherDevice)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tracker.on('add', maybeResolve)
|
||||||
|
tracker.on('change', maybeResolve)
|
||||||
|
tracker.on('remove', maybeResolve)
|
||||||
|
|
||||||
|
return resolver.promise.finally(function() {
|
||||||
|
tracker.removeListener('add', maybeResolve)
|
||||||
|
tracker.removeListener('change', maybeResolve)
|
||||||
|
tracker.removeListener('remove', maybeResolve)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
function tryKillWorker(id) {
|
function tryKillWorker(id) {
|
||||||
var deferred = Promise.defer(),
|
var deferred = Promise.defer(),
|
||||||
worker = workers[id]
|
worker = workers[id]
|
||||||
|
|
||||||
function onExit() {
|
function onExit() {
|
||||||
|
delete workers[id]
|
||||||
|
log.info('Gracefully killed device worker "%s"', id)
|
||||||
deferred.resolve()
|
deferred.resolve()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
worker.unbind()
|
||||||
worker.proc.once('exit', onExit)
|
worker.proc.once('exit', onExit)
|
||||||
worker.proc.kill('SIGTERM')
|
worker.proc.kill('SIGTERM')
|
||||||
|
|
||||||
|
@ -144,15 +214,18 @@ module.exports = function(options) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function forceKillWorker(id) {
|
function forceKillWorker(id) {
|
||||||
log.warn('Force killing worker of device "%s"', id)
|
log.warn('Force killing device worker "%s"', id)
|
||||||
|
|
||||||
var deferred = Promise.defer()
|
var deferred = Promise.defer()
|
||||||
, worker = workers[id]
|
, worker = workers[id]
|
||||||
|
|
||||||
function onExit() {
|
function onExit() {
|
||||||
|
delete workers[id]
|
||||||
|
log.warn('Force killed device worker "%s"', id)
|
||||||
deferred.resolve()
|
deferred.resolve()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
worker.unbind()
|
||||||
worker.proc.once('exit', onExit)
|
worker.proc.once('exit', onExit)
|
||||||
worker.proc.kill('SIGKILL')
|
worker.proc.kill('SIGKILL')
|
||||||
|
|
||||||
|
@ -163,12 +236,11 @@ module.exports = function(options) {
|
||||||
|
|
||||||
function gracefullyKillWorker(id) {
|
function gracefullyKillWorker(id) {
|
||||||
return tryKillWorker(id)
|
return tryKillWorker(id)
|
||||||
.timeout(10000)
|
.timeout(options.killTimeout)
|
||||||
.catch(function() {
|
.catch(function() {
|
||||||
log.error('Device worker "%s" did not stop in time', id)
|
log.error('Device worker "%s" did not stop in time', id)
|
||||||
return forceKillWorker(id)
|
return forceKillWorker(id)
|
||||||
.timeout(10000)
|
.timeout(options.killTimeout)
|
||||||
.then(deferred.resolve)
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue