The balance_pools log for 2017-01-23 contains the following exception:
Traceback (most recent call last):
File "site_utils/balance_pools.py", line 599, in <module>
main(sys.argv)
File "site_utils/balance_pools.py", line 593, in main
parallel.RunTasksInProcessPool(balancer, board_info, processes=8)
File "/usr/local/autotest/site-packages/chromite/lib/parallel.py", line 809, in RunTasksInProcessPool
queue.put((idx, input_args))
File "/usr/lib/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/usr/local/autotest/site-packages/chromite/lib/parallel.py", line 750, in BackgroundTaskRunner
queue.put(_AllTasksComplete())
File "/usr/local/autotest/site-packages/chromite/lib/parallel.py", line 750, in BackgroundTaskRunner
queue.put(_AllTasksComplete())
File "/usr/lib/python2.7/contextlib.py", line 24, in __exit__
self.gen.next()
File "/usr/local/autotest/site-packages/chromite/lib/parallel.py", line 561, in ParallelTasks
raise BackgroundFailure(exc_infos=errors)
chromite.lib.parallel.BackgroundFailure: <class 'urllib2.URLError'>: <urlopen error [Errno 110] Connection timed out>
Traceback (most recent call last):
File "/usr/local/autotest/site-packages/chromite/lib/parallel.py", line 602, in TaskRunner
task(*x, **task_kwargs)
File "/usr/local/autotest/site-packages/chromite/lib/parallel.py", line 800, in <lambda>
fn = lambda idx, task_args: out_queue.put((idx, task(*task_args)))
File "site_utils/balance_pools.py", line 562, in balancer
_balance_board(arguments, afe, board, pool, start_time, end_time)
File "site_utils/balance_pools.py", line 329, in _balance_board
start_time, end_time)
File "site_utils/balance_pools.py", line 179, in __init__
self.total_hosts = self._get_hosts(afe, start_time, end_time)
File "site_utils/balance_pools.py", line 195, in _get_hosts
diag = h.last_diagnosis()[0]
File "/usr/local/autotest/server/lib/status_history.py", line 573, in last_diagnosis
self._init_status_task()
File "/usr/local/autotest/server/lib/status_history.py", line 502, in _init_status_task
self._afe, self._host.id, self.end_time)
File "/usr/local/autotest/server/lib/status_history.py", line 235, in get_status_task
task = afe.get_host_status_task(host_id, query_end)
File "/usr/local/autotest/server/frontend.py", line 648, in get_host_status_task
host_id=host_id, end_time=end_time)
File "/usr/local/autotest/server/frontend.py", line 104, in run
result = utils.strip_unicode(rpc_call(**dargs))
File "/usr/local/autotest/frontend/afe/json_rpc/proxy.py", line 114, in __call__
respdata = urllib2.urlopen(request).read()
File "/usr/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
URLError: <urlopen error [Errno 110] Connection timed out>
This caused the script to exit early and not balance the easy pools for me, so I complain. ;)
Comment 1 by jrbarnette@chromium.org, Jan 23 2017
The key line in the traceback is this one:
File "/usr/local/autotest/frontend/afe/json_rpc/proxy.py", line 114, in __call__
respdata = urllib2.urlopen(request).read()
That is, the call to urlopen was made on behalf of an RPC call to cautotest. So, the timeout occurred because cautotest was slow to respond to that particular RPC call. Looking at the traceback, it seems we're using server.frontend.AFE rather than the RetryingAFE class. Retrying in this case _probably_ would make things better.