Skip to content

Commit

Permalink
[Watchdog] Support non-unique client names. (#1446)
Browse files Browse the repository at this point in the history
Support non-unique client names by adding client_list_, a data structure
that parallels client_map_. Adds new register, ping, and unregister
functions: register returns a client pointer, which the caller maintains
and passes to ping and unregister to identify the client.

b/296467581

Change-Id: I48017eda5630ee1ebc6eb5489f22ae6c0934ffe0
  • Loading branch information
briantting authored Sep 11, 2023
1 parent 419286c commit 7583e2c
Show file tree
Hide file tree
Showing 3 changed files with 298 additions and 93 deletions.
245 changes: 172 additions & 73 deletions cobalt/watchdog/watchdog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -208,39 +208,24 @@ void* Watchdog::Monitor(void* context) {
starboard::ScopedLock scoped_lock(static_cast<Watchdog*>(context)->mutex_);
while (1) {
SbTimeMonotonic current_monotonic_time = SbTimeGetMonotonicNow();

// Iterates through client map to monitor all registered clients.
bool watchdog_violation = false;

// Iterates through client map to monitor all name registered clients.
for (auto& it : static_cast<Watchdog*>(context)->client_map_) {
Client* client = it.second.get();
// Ignores and resets clients in idle states, clients whose monitor_state
// is below the current application state. Resets time_wait_microseconds
// and time_interval_microseconds start values.
if (static_cast<Watchdog*>(context)->state_ > client->monitor_state) {
client->time_registered_monotonic_microseconds = current_monotonic_time;
client->time_last_updated_monotonic_microseconds =
current_monotonic_time;
continue;
if (MonitorClient(context, client, current_monotonic_time)) {
watchdog_violation = true;
}
}

SbTimeMonotonic time_delta =
current_monotonic_time -
client->time_last_updated_monotonic_microseconds;
SbTimeMonotonic time_wait =
current_monotonic_time -
client->time_registered_monotonic_microseconds;

// Watchdog violation
if (time_delta > client->time_interval_microseconds &&
time_wait > client->time_wait_microseconds) {
// Iterates through client list to monitor all client registered clients.
for (auto& it : static_cast<Watchdog*>(context)->client_list_) {
Client* client = it.get();
if (MonitorClient(context, client, current_monotonic_time)) {
watchdog_violation = true;
UpdateViolationsMap(context, client, time_delta);

// Resets time last updated.
client->time_last_updated_monotonic_microseconds =
current_monotonic_time;
}
}

if (static_cast<Watchdog*>(context)->pending_write_)
MaybeWriteWatchdogViolations(context);
if (watchdog_violation) MaybeTriggerCrash(context);
Expand All @@ -255,6 +240,33 @@ void* Watchdog::Monitor(void* context) {
return nullptr;
}

bool Watchdog::MonitorClient(void* context, Client* client,
SbTimeMonotonic current_monotonic_time) {
// Ignores and resets clients in idle states, clients whose monitor_state
// is below the current application state. Resets time_wait_microseconds
// and time_interval_microseconds start values.
if (static_cast<Watchdog*>(context)->state_ > client->monitor_state) {
client->time_registered_monotonic_microseconds = current_monotonic_time;
client->time_last_updated_monotonic_microseconds = current_monotonic_time;
return false;
}

SbTimeMonotonic time_delta =
current_monotonic_time - client->time_last_updated_monotonic_microseconds;
SbTimeMonotonic time_wait =
current_monotonic_time - client->time_registered_monotonic_microseconds;

// Watchdog violation
if (time_delta > client->time_interval_microseconds &&
time_wait > client->time_wait_microseconds) {
UpdateViolationsMap(context, client, time_delta);
// Resets time last updated.
client->time_last_updated_monotonic_microseconds = current_monotonic_time;
return true;
}
return false;
}

void Watchdog::UpdateViolationsMap(void* context, Client* client,
SbTimeMonotonic time_delta) {
// Gets violation dictionary with key client name from violations map.
Expand Down Expand Up @@ -309,6 +321,9 @@ void Watchdog::UpdateViolationsMap(void* context, Client* client,
for (auto& it : static_cast<Watchdog*>(context)->client_map_) {
registered_clients.GetList().emplace_back(base::Value(it.first));
}
for (auto& it : static_cast<Watchdog*>(context)->client_list_) {
registered_clients.GetList().emplace_back(base::Value(it->name));
}
violation.SetKey("registeredClients", registered_clients.Clone());

// Adds new violation to violations map.
Expand Down Expand Up @@ -423,19 +438,6 @@ bool Watchdog::Register(std::string name, std::string description,
int64_t time_wait_microseconds, Replace replace) {
if (is_disabled_) return true;

// Validates parameters.
if (time_interval_microseconds < watchdog_monitor_frequency_ ||
time_wait_microseconds < 0) {
SB_DLOG(ERROR) << "[Watchdog] Unable to Register: " << name;
if (time_interval_microseconds < watchdog_monitor_frequency_) {
SB_DLOG(ERROR) << "[Watchdog] Time interval less than min: "
<< watchdog_monitor_frequency_;
} else {
SB_DLOG(ERROR) << "[Watchdog] Time wait is negative.";
}
return false;
}

starboard::ScopedLock scoped_lock(mutex_);

int64_t current_time = SbTimeToPosix(SbTimeGetNow());
Expand All @@ -457,6 +459,65 @@ bool Watchdog::Register(std::string name, std::string description,
}
}

// Creates new client.
std::unique_ptr<Client> client = CreateClient(
name, description, monitor_state, time_interval_microseconds,
time_wait_microseconds, current_time, current_monotonic_time);
if (client == nullptr) return false;

// Registers.
auto result = client_map_.emplace(name, std::move(client));

if (result.second) {
SB_DLOG(INFO) << "[Watchdog] Registered: " << name;
} else {
SB_DLOG(ERROR) << "[Watchdog] Unable to Register: " << name;
}
return result.second;
}

std::shared_ptr<Client> Watchdog::RegisterByClient(
std::string name, std::string description,
base::ApplicationState monitor_state, int64_t time_interval_microseconds,
int64_t time_wait_microseconds) {
if (is_disabled_) return nullptr;

starboard::ScopedLock scoped_lock(mutex_);

int64_t current_time = SbTimeToPosix(SbTimeGetNow());
SbTimeMonotonic current_monotonic_time = SbTimeGetMonotonicNow();

// Creates new client.
std::shared_ptr<Client> client = CreateClient(
name, description, monitor_state, time_interval_microseconds,
time_wait_microseconds, current_time, current_monotonic_time);
if (client == nullptr) return nullptr;

// Registers.
client_list_.emplace_back(client);

SB_DLOG(INFO) << "[Watchdog] Registered: " << name;
return client;
}

std::unique_ptr<Client> Watchdog::CreateClient(
std::string name, std::string description,
base::ApplicationState monitor_state, int64_t time_interval_microseconds,
int64_t time_wait_microseconds, int64_t current_time,
SbTimeMonotonic current_monotonic_time) {
// Validates parameters.
if (time_interval_microseconds < watchdog_monitor_frequency_ ||
time_wait_microseconds < 0) {
SB_DLOG(ERROR) << "[Watchdog] Unable to Register: " << name;
if (time_interval_microseconds < watchdog_monitor_frequency_) {
SB_DLOG(ERROR) << "[Watchdog] Time interval less than min: "
<< watchdog_monitor_frequency_;
} else {
SB_DLOG(ERROR) << "[Watchdog] Time wait is negative.";
}
return nullptr;
}

// Creates new Client.
std::unique_ptr<Client> client(new Client);
client->name = name;
Expand All @@ -470,15 +531,7 @@ bool Watchdog::Register(std::string name, std::string description,
client->time_last_pinged_microseconds = current_time;
client->time_last_updated_monotonic_microseconds = current_monotonic_time;

// Registers.
auto result = client_map_.emplace(name, std::move(client));

if (result.second) {
SB_DLOG(INFO) << "[Watchdog] Registered: " << name;
} else {
SB_DLOG(ERROR) << "[Watchdog] Unable to Register: " << name;
}
return result.second;
return std::move(client);
}

bool Watchdog::Unregister(const std::string& name, bool lock) {
Expand All @@ -497,11 +550,68 @@ bool Watchdog::Unregister(const std::string& name, bool lock) {
return result;
}

// Unregisters the client identified by |client| from client_list_.
//
// Returns true when the Watchdog is disabled or when the client was found and
// removed; returns false (and logs an error) when |client| is not present in
// client_list_.
bool Watchdog::UnregisterByClient(std::shared_ptr<Client> client) {
  if (is_disabled_) {
    return true;
  }

  starboard::ScopedLock scoped_lock(mutex_);

  // The name is only used for log messages; it stays empty for a null client.
  const std::string name = client ? client->name : "";

  // Locates the first list entry that is the same client.
  auto position = client_list_.begin();
  while (position != client_list_.end() && !(client == *position)) {
    ++position;
  }

  if (position == client_list_.end()) {
    SB_DLOG(ERROR) << "[Watchdog] Unable to Unregister: " << name;
    return false;
  }

  client_list_.erase(position);
  SB_DLOG(INFO) << "[Watchdog] Unregistered: " << name;
  return true;
}

// Pings the name-registered client |name| without attaching any info; forwards
// to the two-argument overload with an empty info string.
bool Watchdog::Ping(const std::string& name) {
  const std::string no_info = "";
  return Ping(name, no_info);
}

// Pings the name-registered client |name|, optionally attaching |info|.
//
// Returns true when the Watchdog is disabled. Otherwise looks |name| up in
// client_map_ and returns the result of PingHelper for that client, or false
// (with an error log) when no client is registered under |name|.
bool Watchdog::Ping(const std::string& name, const std::string& info) {
  if (is_disabled_) {
    return true;
  }

  starboard::ScopedLock scoped_lock(mutex_);

  const auto entry = client_map_.find(name);
  if (entry == client_map_.end()) {
    SB_DLOG(ERROR) << "[Watchdog] Unable to Ping: " << name;
    return false;
  }

  return PingHelper(entry->second.get(), name, info);
}

// Pings the client identified by |client| without attaching any info; forwards
// to the two-argument overload with an empty info string.
bool Watchdog::PingByClient(std::shared_ptr<Client> client) {
  const std::string no_info = "";
  return PingByClient(client, no_info);
}

// Pings the client identified by |client|, optionally attaching |info|.
//
// Returns true when the Watchdog is disabled. Otherwise returns the result of
// PingHelper when |client| is present in client_list_, or false (with an
// error log) when it is not.
bool Watchdog::PingByClient(std::shared_ptr<Client> client,
                            const std::string& info) {
  if (is_disabled_) {
    return true;
  }

  // The name is only used for logging and by PingHelper's messages; it stays
  // empty for a null client.
  const std::string name = client ? client->name : "";

  starboard::ScopedLock scoped_lock(mutex_);

  // Only clients currently held in client_list_ may be pinged.
  for (const auto& registered : client_list_) {
    if (client == registered) {
      return PingHelper(client.get(), name, info);
    }
  }

  SB_DLOG(ERROR) << "[Watchdog] Unable to Ping: " << name;
  return false;
}

bool Watchdog::PingHelper(Client* client, const std::string& name,
const std::string& info) {
// Validates parameters.
if (info.length() > kWatchdogMaxPingInfoLength) {
SB_DLOG(ERROR) << "[Watchdog] Unable to Ping: " << name;
Expand All @@ -510,36 +620,25 @@ bool Watchdog::Ping(const std::string& name, const std::string& info) {
return false;
}

starboard::ScopedLock scoped_lock(mutex_);

auto it = client_map_.find(name);
bool client_exists = it != client_map_.end();
int64_t current_time = SbTimeToPosix(SbTimeGetNow());
SbTimeMonotonic current_monotonic_time = SbTimeGetMonotonicNow();

if (client_exists) {
int64_t current_time = SbTimeToPosix(SbTimeGetNow());
SbTimeMonotonic current_monotonic_time = SbTimeGetMonotonicNow();
// Updates last ping.
client->time_last_pinged_microseconds = current_time;
client->time_last_updated_monotonic_microseconds = current_monotonic_time;

Client* client = it->second.get();
// Updates last ping.
client->time_last_pinged_microseconds = current_time;
client->time_last_updated_monotonic_microseconds = current_monotonic_time;
if (info != "") {
// Creates new ping_info.
base::Value ping_info(base::Value::Type::DICTIONARY);
ping_info.SetKey("timestampMilliseconds",
base::Value(std::to_string(current_time / 1000)));
ping_info.SetKey("info", base::Value(info));

if (info != "") {
// Creates new ping_info.
base::Value ping_info(base::Value::Type::DICTIONARY);
ping_info.SetKey("timestampMilliseconds",
base::Value(std::to_string(current_time / 1000)));
ping_info.SetKey("info", base::Value(info));

client->ping_infos.GetList().emplace_back(ping_info.Clone());
if (client->ping_infos.GetList().size() > kWatchdogMaxPingInfos)
client->ping_infos.GetList().erase(
client->ping_infos.GetList().begin());
}
} else {
SB_DLOG(ERROR) << "[Watchdog] Unable to Ping: " << name;
client->ping_infos.GetList().emplace_back(ping_info.Clone());
if (client->ping_infos.GetList().size() > kWatchdogMaxPingInfos)
client->ping_infos.GetList().erase(client->ping_infos.GetList().begin());
}
return client_exists;
return true;
}

std::string Watchdog::GetWatchdogViolations(
Expand Down
24 changes: 23 additions & 1 deletion cobalt/watchdog/watchdog.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,19 @@ class Watchdog : public Singleton<Watchdog> {
base::ApplicationState monitor_state,
int64_t time_interval_microseconds,
int64_t time_wait_microseconds = 0, Replace replace = NONE);
std::shared_ptr<Client> RegisterByClient(std::string name,
std::string description,
base::ApplicationState monitor_state,
int64_t time_interval_microseconds,
int64_t time_wait_microseconds = 0);
bool Unregister(const std::string& name, bool lock = true);
bool UnregisterByClient(std::shared_ptr<Client> client);
bool Ping(const std::string& name);
bool Ping(const std::string& name, const std::string& info);
bool PingByClient(std::shared_ptr<Client> client);
bool PingByClient(std::shared_ptr<Client> client, const std::string& info);
bool PingHelper(Client* client, const std::string& name,
const std::string& info);
std::string GetWatchdogViolations(
const std::vector<std::string>& clients = {}, bool clear = true);
bool GetPersistentSettingWatchdogEnable();
Expand All @@ -107,7 +117,16 @@ class Watchdog : public Singleton<Watchdog> {
std::shared_ptr<base::Value> GetViolationsMap();
void WriteWatchdogViolations();
void EvictOldWatchdogViolations();
std::unique_ptr<Client> CreateClient(std::string name,
std::string description,
base::ApplicationState monitor_state,
int64_t time_interval_microseconds,
int64_t time_wait_microseconds,
int64_t current_time,
SbTimeMonotonic current_monotonic_time);
static void* Monitor(void* context);
static bool MonitorClient(void* context, Client* client,
SbTimeMonotonic current_monotonic_time);
static void UpdateViolationsMap(void* context, Client* client,
SbTimeMonotonic time_delta);
static void EvictWatchdogViolation(void* context);
Expand Down Expand Up @@ -139,8 +158,11 @@ class Watchdog : public Singleton<Watchdog> {
SbTimeMonotonic time_last_written_microseconds_ = 0;
// Number of microseconds between writes.
int64_t write_wait_time_microseconds_;
// Dictionary of registered Watchdog clients.
// Dictionary of name registered Watchdog clients.
std::unordered_map<std::string, std::unique_ptr<Client>> client_map_;
// List of client registered Watchdog clients, parallel data structure to
// client_map_.
std::vector<std::shared_ptr<Client>> client_list_;
// Dictionary of lists of Watchdog violations represented as dictionaries.
std::shared_ptr<base::Value> violations_map_;
// Monitor thread.
Expand Down
Loading

0 comments on commit 7583e2c

Please sign in to comment.