From b5c946c82ebc3649340296c9b60d4e6e65cfa1fa Mon Sep 17 00:00:00 2001 From: Igor Zolotarev <63460867+yngvar-antonsson@users.noreply.github.com> Date: Wed, 8 Nov 2023 13:10:14 +0100 Subject: [PATCH] Add election leader idle issue and graphql (#2150) --- CHANGELOG.rst | 8 ++++++++ cartridge/issues.lua | 29 +++++++++++++++++++++++++++-- cartridge/lua-api/boxinfo.lua | 1 + cartridge/webui/gql-boxinfo.lua | 5 ++++- doc/schema.graphql | 17 ++++++++++------- test/integration/api_edit_test.lua | 5 ++++- test/integration/issues_test.lua | 30 ++++++++++++++++++++++++++++++ 7 files changed, 84 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 571af62ad..142fc236a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,14 @@ and this project adheres to Unreleased ------------------------------------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Added +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- ``election_leader_idle`` field to GraphQL API. + +- New issue when ``box.info.election.leader_idle`` reaches ``4 * replication_timeout``. + ------------------------------------------------------------------------------- [2.8.4] - 2023-10-31 ------------------------------------------------------------------------------- diff --git a/cartridge/issues.lua b/cartridge/issues.lua index d37047b31..c7821ef16 100644 --- a/cartridge/issues.lua +++ b/cartridge/issues.lua @@ -59,6 +59,11 @@ -- -- * warning: "Instance ... has spaces with deprecated format: space1, ..." -- +-- Raft issues: +-- +-- * warning: "Raft leader idle is 10.000000 on ... . +-- Is raft leader alive and connection is healthy?" 
+-- -- Custom issues (defined by user): -- -- * Custom roles can announce more issues with their own level, topic @@ -203,8 +208,8 @@ local function list_on_instance(opts) } table.insert(ret, issue) end - - for _, replication_info in pairs(box.info.replication) do + local box_info = box.info + for _, replication_info in pairs(box_info.replication) do local replica = enabled_servers[replication_info.uuid] if replica == nil then goto continue @@ -292,6 +297,26 @@ local function list_on_instance(opts) ::continue:: end + if box_info.election then + local leader_idle = box_info.election.leader_idle + if leader_idle ~= nil + and leader_idle >= 4 * box.cfg.replication_timeout then + local issue = { + level = 'warning', + topic = 'raft', + replicaset_uuid = replicaset_uuid, + instance_uuid = instance_uuid, + message = string.format( + "Raft leader idle is %f on %s. ".. + "Is raft leader alive and connection is healthy?", + leader_idle, + instance_uuid + ) + } + table.insert(ret, issue) + end + end + local failover_error = failover.get_error() if failover_error ~= nil then table.insert(ret, { diff --git a/cartridge/lua-api/boxinfo.lua b/cartridge/lua-api/boxinfo.lua index e5e514e31..ddd3140d4 100644 --- a/cartridge/lua-api/boxinfo.lua +++ b/cartridge/lua-api/boxinfo.lua @@ -142,6 +142,7 @@ local function get_info(uri) ro = box_info.ro, ro_reason = box_info.ro_reason, election_state = box_info.election and box_info.election.state, + election_leader_idle = box_info.election and box_info.election.leader_idle, election_mode = box.cfg.election_mode or "off", synchro_queue_owner = box_info.synchro and box_info.synchro.queue.owner or 0, }, diff --git a/cartridge/webui/gql-boxinfo.lua b/cartridge/webui/gql-boxinfo.lua index ccd9be0de..ad870aab5 100644 --- a/cartridge/webui/gql-boxinfo.lua +++ b/cartridge/webui/gql-boxinfo.lua @@ -140,7 +140,10 @@ local boxinfo_schema = { kind = gql_types.string, description = 'Current read-only state reason', }, - + election_leader_idle = { + kind 
= gql_types.float, + description = 'Leader idle value in seconds', + }, election_state = { kind = gql_types.string, description = 'State after Raft leader election', diff --git a/doc/schema.graphql b/doc/schema.graphql index b82841c9c..32fa76f79 100644 --- a/doc/schema.graphql +++ b/doc/schema.graphql @@ -1,5 +1,5 @@ # source: http://127.0.0.1:8081/admin/api -# timestamp: Fri Mar 17 2023 16:41:38 GMT+0300 (Москва, стандартное время) +# timestamp: Fri Nov 03 2023 16:54:14 GMT+0300 (Moscow Standard Time) """Custom scalar specification.""" directive @specifiedBy( @@ -747,27 +747,30 @@ type ServerInfoGeneral { """State after Raft leader election""" election_state: String + """The number of seconds since the instance started""" + uptime: Float! + """ The maximum number of threads to use during execution of certain internal processes (currently socket.getaddrinfo() and coio_call()) """ worker_pool_threads: Int - """The number of seconds since the instance started""" - uptime: Float! - """The UUID of the replica set""" replicaset_uuid: String! - """Current working directory of a process""" - work_dir: String - """A directory where write-ahead log (.xlog) files are stored""" wal_dir: String + """Current working directory of a process""" + work_dir: String + """Current read-only state reason""" ro_reason: String + """Leader idle value in seconds""" + election_leader_idle: Float + """Id of current queue owner""" synchro_queue_owner: Int! 
diff --git a/test/integration/api_edit_test.lua b/test/integration/api_edit_test.lua index 4977bb2f1..150a71a22 100644 --- a/test/integration/api_edit_test.lua +++ b/test/integration/api_edit_test.lua @@ -266,7 +266,9 @@ local function test_all_rw(all_rw) servers { uuid boxinfo { - general { ro ro_reason election_state election_mode synchro_queue_owner} + general { ro ro_reason election_leader_idle + election_state election_mode synchro_queue_owner + } } } master { @@ -292,6 +294,7 @@ local function test_all_rw(all_rw) if helpers.tarantool_version_ge('2.10.0') then t.assert_equals(srv['boxinfo']['general']['election_state'], 'follower') + t.assert_equals(srv['boxinfo']['general']['election_leader_idle'], box.NULL) t.assert_equals(srv['boxinfo']['general']['ro_reason'], box.NULL) end else diff --git a/test/integration/issues_test.lua b/test/integration/issues_test.lua index ea0a846d3..2cdfc257a 100644 --- a/test/integration/issues_test.lua +++ b/test/integration/issues_test.lua @@ -419,6 +419,36 @@ function g.test_state_hangs() t.assert_equals(helpers.list_cluster_issues(g.master), {}) end +g.before_test('test_election_leader_high_idle', function() + t.skip_if(not helpers.tarantool_version_ge('2.10.0'), 'leader_idle is not supported') + g.master:exec(function() + rawset(_G, 'old_info_election', box.info.election) + local elect = table.deepcopy(_G.old_info_election) + elect.leader_idle = 10 + rawset(box.info, 'election', elect) + end) +end) + +function g.test_election_leader_high_idle() + t.assert_items_equals(helpers.list_cluster_issues(g.master), { + { + level = 'warning', + topic = 'raft', + message = ("Raft leader idle is 10.000000 on %s. ".. 
+ "Is raft leader alive and connection is healthy?"): + format(g.master.instance_uuid), + instance_uuid = g.master.instance_uuid, + replicaset_uuid = g.master.replicaset_uuid, + }, + }) +end + +g.after_test('test_election_leader_high_idle', function() + g.master:exec(function() + rawset(box.info, 'election', nil) -- drop the raw override instead of pinning a stale snapshot, so later tests read live election info + end) +end) + function g.test_aliens() g.alien:start()