taskman_atf/plan.inc.php

707 lines
18 KiB
PHP
Raw Permalink Normal View History

2023-11-07 15:42:40 +03:00
<?php
namespace ATF;
use Exception;
use Amp;
use taskman;
class Plan
{
const EXCEPTIONS_THRESHOLD = 5;
const STUCK_THRESHOLD = 50;
const GONE_THRESHOLD = 20;
const HUNG_THRESHOLD = 20;
const BOGUS_DEVICE_COUNT_THRESHOLD = 6;
const BOGUS_REINSTALL_APP_EVERY_N = 2;
const IM_THREAD_UPDATE_INTERVAL = 5;
public Session $session;
public string $name;
public array $tasks = array();
public string $im_thread_id = '';
public int $im_thread_last_update = 0;
public int $start_time;
//execution history of all tasks
//array of tuples [code, msg]
public array $status_codes = array();
function __construct(string $name)
{
$this->name = $name;
}
function getTitle() : string
{
$issues_summary = '';
foreach($this->getProblemsHistogram() as $code => $count)
{
if($code == Task::CODE_GONE)
$issues_summary .= " gones:$count";
else if($code == Task::CODE_NSTART)
$issues_summary .= " nstrts:$count";
else if($code == Task::CODE_STUCK)
$issues_summary .= " stucks:$count";
else if($code == Task::CODE_HUNG)
$issues_summary .= " hungs:$count";
else if($code == Task::CODE_EXCEPTION)
$issues_summary .= " excepts:$count";
else if($code == Task::CODE_WARN)
$issues_summary .= " warns:$count";
}
$running_count = $this->countRunningTasks();
$over_count = $this->countFinishedTasks();
$predicted_total_duration = $this->getPredictedTotalDuration();
$progress_txt = ' tasks:' . ($running_count > 0 ? $running_count . '|' : '') . $over_count . '/' . sizeof($this->tasks) .
' (' . round($this->getProgress()*100, 2) . '%) (' .
round($this->getDuration()/60, 1) . '/' . ($predicted_total_duration > 0 ? round($predicted_total_duration/60, 1) : '?') .
' min)';
return "({$this->session->id}) Plan '{$this->name}'" . $issues_summary . $progress_txt;
}
function getDuration() : int
{
return time() - $this->start_time;
}
function getPredictedTotalDuration()
{
if($this->getDuration() <= 0)
return 0;
if($this->getProgress() <= 0)
return 0;
return $this->getDuration() / $this->getProgress();
}
function _createMessengerThread()
{
if($this->im_thread_id)
return;
for($i=0;$i<5;++$i)
{
$resp = $this->session->conf->im->post($this->getTitle(), $this->_getMessengerThreadProps());
if(isset($resp['ok']))
{
$this->im_thread_id = $resp['id'];
return;
}
sleep(1);
}
throw new Exception("Could not create IM thread");
}
function _updateMessengerThread(bool $force)
{
if(!$force && (time() - $this->im_thread_last_update) < self::IM_THREAD_UPDATE_INTERVAL)
return;
$this->im_thread_last_update = time();
if(!$this->im_thread_id)
return;
$this->session->conf->im->updatePost($this->im_thread_id, $this->_getMessengerThreadProps());
}
function _getMessengerThreadProps() : array
{
return ['props' => ['attachments' => [
[
'text' => '* ' . $this->getTitle(),
'color' => $this->_getThreadColor()
]
]]];
}
function addStatusCode($code, $msg = '')
{
$this->status_codes[] = array($code, $msg);
}
function getProblemsHistogram()
{
$histogram = array();
foreach($this->status_codes as $item)
{
list($code, $_) = $item;
if(!Task::isProblemCode($code))
continue;
if(!isset($histogram[$code]))
$histogram[$code] = 0;
++$histogram[$code];
}
return $histogram;
}
function hasStatusCode($code) : bool
{
return $this->countStatusCode($code) > 0;
}
function countStatusCode($code) : int
{
$c = 0;
foreach($this->status_codes as $item)
{
if($item[0] === $code)
$c++;
}
return $c;
}
function getStats() : Stats
{
$stats = new Stats();
$stats->gones = $this->countStatusCode(Task::CODE_GONE);
$stats->hungs = $this->countStatusCode(Task::CODE_HUNG);
$stats->stucks = $this->countStatusCode(Task::CODE_STUCK);
$stats->nostarts = $this->countStatusCode(Task::CODE_NSTART);
$stats->excepts = $this->countStatusCode(Task::CODE_EXCEPTION);
$stats->warns = $this->countStatusCode(Task::CODE_WARN);
return $stats;
}
function _getThreadColor() : string
{
if(!$this->session->getDevices())
return "#000000";
$stats = $this->getStats();
return $stats->getColor($this->isOver());
}
function addTask(Task $task)
{
$task->plan = $this;
$this->tasks[] = $task;
}
function isOver() : bool
{
if($this->hasTooManyFatalProblems())
return true;
foreach($this->tasks as $task)
{
if(!$task->isDone())
return false;
}
return true;
}
function hasTooManyFatalProblems() : bool
{
return
($this->countStatusCode(Task::CODE_EXCEPTION) >= self::EXCEPTIONS_THRESHOLD) ||
($this->countStatusCode(Task::CODE_STUCK) >= self::STUCK_THRESHOLD) ||
($this->countStatusCode(Task::CODE_GONE) >= self::GONE_THRESHOLD) ||
($this->countStatusCode(Task::CODE_HUNG) >= self::HUNG_THRESHOLD)
;
}
function getProgress() : float
{
$total_progress = 0;
foreach($this->tasks as $task)
$total_progress += $task->getProgress();
return $total_progress / sizeof($this->tasks);
}
function countRunningTasks() : int
{
$running_count = 0;
foreach($this->tasks as $task)
{
if($task->device)
$running_count++;
}
return $running_count;
}
function countFinishedTasks() : int
{
$done_count = 0;
foreach($this->tasks as $task)
{
if($task->isDone())
$done_count++;
}
return $done_count;
}
function runTaskAsync(Task $task) : Amp\Promise
{
return Amp\call(function() use($task) {
yield $this->_startAsync($task);
while(!$this->isOver())
{
$this->_updateMessengerThread(false);
yield Amp\delay((int)($this->session->conf->sleep_time*1000));
log("Task #{$task->run_id} {$task->cmd}(".sizeof($task->args).") progress {$task->getProgress()}, last ext time {$task->last_ext_status_item_time} *{$task->device}*...");
$check_error = yield $this->_tryCheckExtStatusAsync($task, /*attempts*/3, /*timeout*/20);
if($check_error !== null)
{
//something wrong with device, let's ignore the device and reschedule the task
$this->_ignoreDevice($task->device, $check_error);
$task->reschedule();
break;
}
if(!$task->isDone())
yield $this->_checkHealthAsync($task);
if($task->hasFatalProblem())
{
yield $this->_processFatalProblemAsync($task);
break;
}
else if($task->isDone())
{
//let's reset bogus stats for this device
$this->session->resetBogusDeviceCount($task->device);
//let's free the device
$task->device = null;
break;
}
}
$this->_updateMessengerThread(true);
});
}
function _onStart()
{
$this->start_time = time();
log("Starting " . $this->getTitle() . "...");
$this->_createMessengerThread();
}
function _startAsync(Task $task) : Amp\Promise
{
//let's create a Slack thread synchronously if it's not present yet
if(!$this->im_thread_id)
$this->_onStart();
return Amp\call(function() use($task) {
try
{
log("Preparing *{$task->device}*");
yield $this->session->apk_installer->installAsync($task->device);
yield start_ext_cmd_on_device_async(
$this->session->conf->atf_host,
$task->device,
$task->getCmd(),
$task->getCmdArgs()
);
}
catch(Exception $e)
{
$this->_ignoreDevice($task->device, $e->getMessage());
$task->reschedule();
return false;
}
$task->start();
return true;
});
}
function _ignoreDevice($device, $reason)
{
$this->session->ignoreDevice($device, $reason);
$this->post("Ignoring device *$device*: $reason");
}
function _processFatalProblemAsync(Task $task) : Amp\Promise
{
return Amp\call(function() use($task) {
$fatal_msg = "Fatal problem ({$task->getLastFatalProblem()} - ".Task::code2string($task->getLastFatalProblem())."), attempt:{$task->attempts} *{$task->device}*";
err("[FTL] $fatal_msg");
$this->post($fatal_msg);
$this->_postScreenToMessenger($task);
if($task->getLastFatalProblem() == Task::CODE_EXCEPTION)
yield $this->_postLastReplayToMessengerAsync($task);
else if($task->getLastFatalProblem() == Task::CODE_GONE ||
$task->getLastFatalProblem() == Task::CODE_HUNG)
{
$app_log = get_logcat_unity($this->session->conf->atf_host, $task->device, 300);
$app_log = _atf_trim_start($app_log, 2000);
$this->post("Last logs: \n```\n$app_log\n```\n *{$task->device}*");
}
if($this->session->conf->error_pause)
$this->_ignoreDevice($task->device, "Ignored for error investigation");
//Let's reinstall application on this device if it's failing too often
else
$this->_incAndCheckBogusDevice($task->device);
$task->onFatalProblem();
//let's free the device anyway, it's up to Task::onFatalProblem() to reschedule it or not
$task->device = null;
});
}
function _tryCheckExtStatusAsync(Task $task, int $attempts, int $timeout) : Amp\Promise
{
return Amp\call(function() use($task, $attempts, $timeout) {
$last_error = null;
for($i=0;$i<$attempts;++$i)
{
try
{
yield $this->_checkExtStatusAsync($task, $timeout);
break;
}
catch(Exception $e)
{
$last_error = $e->getMessage();
continue;
}
}
return $last_error;
});
}
function _checkExtStatusAsync(Task $task, int $timeout) : Amp\Promise
{
return Amp\call(function() use($task, $timeout) {
$ext_status = yield get_ext_status_async($this->session->conf->atf_host, $task->device, $timeout);
if(!is_array($ext_status))
return;
$new_items = $this->_getExtStatusItemsSince($task->last_ext_status_item_time, $ext_status);
if(!$new_items)
return;
$task->last_ext_status_item_time = end($new_items)['time'];
foreach($new_items as $item)
yield $this->_analyzeExtStatusItemAsync($task, $item);
});
}
function _getExtInputRequestsSince($id, array $all_reqs) : array
{
$new_reqs = array();
foreach($all_reqs as $req)
{
if($req['id'] > $id)
$new_reqs[] = $req;
}
return $new_reqs;
}
function _getExtStatusItemsSince($time, array $ext_status) : array
{
$new_items = array();
foreach($ext_status['entries'] as $item)
{
if($item['time'] > $time)
$new_items[] = $item;
}
return $new_items;
}
function _checkIfAppRunning(Task $task) : Amp\Promise
{
return Amp\call(function() use($task) {
list($status, $_) =
yield host_exec_async($this->session->conf->atf_host, "%{adb}% -s {$task->device} shell pidof %{package_id}%", DEPLOY_OPT_ERR_OK);
return $status == 0;
});
}
function _reportGone(Task $task)
{
//check it application hasn't even started
if($task->last_alive_check_time === $task->reset_time)
{
$task->addStatusCode(Task::CODE_NSTART);
$this->session->trySendStatsEvent($task, 'nstart');
err("[NFD] No app started after {$this->session->conf->dead_threshold} seconds *{$task->device}*");
}
else
{
$task->addStatusCode(Task::CODE_GONE);
$this->session->trySendStatsEvent($task, 'gone');
err("[GNE] App is gone after {$this->session->conf->dead_threshold} seconds *{$task->device}*");
}
}
function _checkHealthAsync(Task $task) : Amp\Promise
{
return Amp\call(function() use($task) {
$not_alive_time = microtime(true) - $task->last_alive_check_time;
$stuck_time = microtime(true) - $task->last_stuck_check_time;
if($not_alive_time > $this->session->conf->dead_threshold)
{
if(yield $this->_checkIfAppRunning($task))
{
$task->addStatusCode(Task::CODE_HUNG);
$this->session->trySendStatsEvent($task, 'hung');
err("[HNG] No activity for {$this->session->conf->dead_threshold} seconds *{$task->device}*");
}
else
$this->_reportGone($task);
$this->_reportErrorFromLogcatToMessenger($task, 1000);
}
else if($stuck_time > $this->session->conf->stuck_threshold)
{
if(yield $this->_checkIfAppRunning($task))
{
$task->addStatusCode(Task::CODE_STUCK);
$this->session->trySendStatsEvent($task, 'stuck');
err("[STK] Stuck for {$this->session->conf->stuck_threshold} seconds *{$task->device}*");
}
else
$this->_reportGone($task);
}
});
}
function _incAndCheckBogusDevice(string $device)
{
$count = $this->session->incBogusDevice($device);
if($count > self::BOGUS_DEVICE_COUNT_THRESHOLD)
{
$this->_ignoreDevice($device, "Too many bogus errors");
}
else if(($count % self::BOGUS_REINSTALL_APP_EVERY_N) == 0)
{
$this->session->apk_installer->forgetInstall($device);
}
}
static function _parseExtMessage(string $message)
{
$msg_code = null;
$msg_text = $message;
//example: [DBG] this a debug message
if(preg_match('~^(\[[^\]]+\])(.*)$~', $message, $matches))
{
$msg_code = $matches[1];
$msg_text = $matches[2];
}
return array($msg_code, $msg_text);
}
static function _printToShellExtItem(Task $task, array $item)
{
$shell_msg = _atf_trim($item['message'], 200);
if(Task::isProblemCode($item['error']))
$shell_msg = "[PRB] Code:{$item['error']}, $shell_msg";
$shell_msg = "(".round($item['time'],1)."s) {$shell_msg} *{$task->device}*";
log($shell_msg);
}
function _postToMessengerExtStatusItem(Task $task, array $item)
{
$orig_msg = _atf_trim($item['message'], 3000);
$mm_msg = $orig_msg;
if($item['error'] == Task::CODE_EXCEPTION)
$mm_msg = "\n```\n$mm_msg\n```\n";
$mm_msg = '('.round($item['time'],1).'s) '.$mm_msg.' *'.$task->device.'*';
$resp = $this->post($mm_msg);
if(isset($resp['ok']) && $item['error'] == Task::CODE_EXCEPTION)
$this->session->tryShareToQAChannel($resp['id'], $orig_msg);
}
function post($msg, array $args = array())
{
$args['root_id'] = $this->im_thread_id;
return $this->session->conf->im->post($msg, $args);
}
function _analyzeExtStatusItemAsync(Task $task, array $item) : Amp\Promise
{
return Amp\call(function() use($task, $item) {
self::_printToShellExtItem($task, $item);
$task->addStatusCode($item['error'], $item['message']);
if($item['error'] == Task::CODE_EXCEPTION)
$this->session->trySendStatsEvent($task, 'exception');
list($msg_type, $msg_text) = self::_parseExtMessage($item['message']);
//NOTE: alive system messages are sent periodically by ExtBot itself,
// we'd like to ignore them for stuck detection
if($msg_type !== '[ALIVE]')
$task->last_stuck_check_time = microtime(true);
//NOTE: in case of any message from the device we update the alive check timestamp
$task->last_alive_check_time = microtime(true);
yield $this->_processExtStatusMessageAsync($task, $item, $msg_type, $msg_text);
});
}
function _processExtStatusMessageAsync($task, $item, $msg_type, $msg_text) : Amp\Promise
{
return Amp\call(function() use($task, $item, $msg_type, $msg_text) {
if($item['error'] == Task::CODE_EXCEPTION)
{
$this->_postToMessengerExtStatusItem($task, $item);
}
else if($msg_type === '[WRN]')
{
$task->addStatusCode(Task::CODE_WARN);
$this->_postToMessengerExtStatusItem($task, $item);
}
else if($msg_type === '[PRG]')
{
//let's reset hung stats since progress is going
$this->session->resetBogusDeviceCount($task->device);
$task->onProgress($msg_text);
}
else if($msg_type === '[STAT]')
yield $this->session->trySendStatsFromJzonAsync($task, $msg_text);
else if($msg_type === '[RPL]')
$this->_postReplayToMessenger($task, $msg_text);
});
}
function _postLastReplayToMessengerAsync(Task $task) : Amp\Promise
{
return Amp\call(function() use($task) {
if($this->session->conf->device_replay_path)
{
list($repl_err, $repl_txt) =
yield get_last_replay_async($this->session->conf->atf_host, $task->device, $this->session->conf->device_replay_path);
if($repl_err === 0)
$this->_postReplayToMessenger($task, $repl_txt);
}
});
}
function _postReplayToMessenger(Task $task, $repl_txt)
{
$repl_txt = _atf_try_lz4_replay($repl_txt);
$this->post("Last Replay: \n```\n$repl_txt\n```\n *{$task->device}*");
}
function _reportErrorFromLogcatToMessenger(Task $task, $limit)
{
$errors_log = _atf_trim(get_logcat_errors($task->device, $limit), 3950);
if($errors_log)
$this->post("\n```\n$errors_log\n```\n *{$task->device}*");
}
function _postScreenToMessenger(Task $task)
{
global $GAME_ROOT;
$png_data = device_screen($this->session->conf->atf_host, $task->device);
if($png_data)
{
$tmp_png = tempnam(sys_get_temp_dir(), 'device_screen') . '.png';
taskman\ensure_write($tmp_png, $png_data);
try
{
$this->session->conf->im->postPNG("screen: ".$task->device, $tmp_png, array('root_id' => $this->im_thread_id));
}
catch(Exception $e)
{}
finally
{
taskman\ensure_rm($tmp_png);
}
}
}
}
class Stats
{
public int $gones = 0;
public int $hungs = 0;
public int $stucks = 0;
public int $nostarts = 0;
public int $excepts = 0;
public int $warns = 0;
function add(Stats $other)
{
$this->gones += $other->gones;
$this->hungs += $other->hungs;
$this->stucks += $other->stucks;
$this->nostarts += $other->nostarts;
$this->excepts += $other->excepts;
$this->warns += $other->warns;
}
function getColor(bool $is_over) : string
{
if($this->gones > 0 ||
$this->hungs > 0 ||
$this->warns > 0 ||
$this->excepts > 0 ||
$this->stucks > 0 ||
$this->nostarts > 0
)
{
if($this->excepts)
return $is_over ? "danger" : "#FFCCCB";
else
return $is_over ? "warning" : "#FFCF9E";
}
else if($is_over)
return "good";
else
return "#D3D3D3";
}
}