From 68140139eb6d309d069e8aae717e4523b7fdf20e Mon Sep 17 00:00:00 2001 From: Dashuai Zhang <164845223+sdszhang@users.noreply.github.com> Date: Fri, 17 Jan 2025 17:04:34 +1100 Subject: [PATCH] flap interface after sfp reset (#16375) Description of PR Summary: Fixes interface stays down after tests/platform_tests/api/test_sfp.py::sfp_reset() And causing error during shutdown_ebgp fixture teardown. Type of change Bug fix Testbed and Framework(new/improvement) Test case(new/improvement) Back port request 202012 202205 202305 202311 202405 202411 Approach What is the motivation for this PR? keep interface up after sfp_reset if it's T2 and QSFP-DD SFP. How did you do it? flap the interface after sfp_reset to restore the interface state. How did you verify/test it? passed on physical testbed with admin@svcstr2-8800-lc1-1:~$ sudo sfputil show eeprom -d -p Ethernet0 Ethernet0: SFP EEPROM detected ... Application Advertisement: 400GAUI-8 C2M (Annex 120E) - Host Assign (0x1) - Active Cable assembly with BER < 5x10^-5 - Media Assign (0x1) CAUI-4 C2M (Annex 83E) - Host Assign (0x1) - Active Cable assembly with BER < 5x10^-5 - Media Assign (0x1) CMIS Revision: 4.0 Connector: No separable connector Encoding: N/A Extended Identifier: Power Class 5 (10.0W Max) Extended RateSelect Compliance: N/A Hardware Revision: 1.0 Host Electrical Interface: 400GAUI-8 C2M (Annex 120E) Host Lane Assignment Options: 1 Host Lane Count: 8 Identifier: QSFP-DD Double Density 8X Pluggable Transceiver Length Cable Assembly(m): 1.0 ...... platform_tests/api/test_sfp.py::TestSfpApi::test_reset[xxx-lc1-1] PASSED [ 73%] ...... =========================== short test summary info ============================ FAILED platform_tests/api/test_sfp.py::TestSfpApi::test_lpmode[svcstr2-8800-lc1-1] <<<< this is separate issue, not related to this PR. ========================= 1 failed, 22 passed, 1 warning in 2104.41s (0:35:04) ========================= `` ============= 1 failed, 22 passed, 1 warning in 2104.41s (0:35:04) ============= Any platform specific information? Supported testbed topology if it's a new test case? Co-authored-by: jianquanye@microsoft.com --- tests/common/devices/multi_asic.py | 28 +++++++++++++++ tests/common/platform/interface_utils.py | 6 ++++ tests/common/platform/transceiver_utils.py | 2 ++ tests/common/port_toggle.py | 1 + tests/platform_tests/api/test_sfp.py | 41 ++++++++++++++++++++++ tests/platform_tests/sfp/test_sfputil.py | 2 +- 6 files changed, 79 insertions(+), 1 deletion(-) diff --git a/tests/common/devices/multi_asic.py b/tests/common/devices/multi_asic.py index d879e468481..deef9323751 100644 --- a/tests/common/devices/multi_asic.py +++ b/tests/common/devices/multi_asic.py @@ -8,6 +8,7 @@ from tests.common.devices.sonic_asic import SonicAsic from tests.common.helpers.assertions import pytest_assert from tests.common.helpers.constants import DEFAULT_ASIC_ID, DEFAULT_NAMESPACE, ASICS_PRESENT +from tests.common.platform.interface_utils import get_dut_interfaces_status logger = logging.getLogger(__name__) @@ -856,3 +857,30 @@ def ports_list(self): """ mg_facts = self.sonichost.minigraph_facts(host=self.sonichost.hostname) return list(mg_facts['ansible_facts']['minigraph_ports'].keys()) + + def get_admin_up_ports(self): + intf_status = get_dut_interfaces_status(self.sonichost) + admin_up_ports = [k for k, v in intf_status.items() if v['admin'] == "up"] + return admin_up_ports + + def shutdown_interface(self, port): + """ + This function works for both multi/single-asic dut + """ + logging.info("Shutting down {}".format(port)) + if self.sonichost.is_multi_asic: + asic_ns = self.get_port_asic_instance(port).namespace + return self.command(f"sudo config interface -n {asic_ns} shutdown {port}") + else: + return self.command(f"sudo config interface shutdown {port}") + + def no_shutdown_interface(self, port): + """ + This function works for both multi/single-asic dut + """ + logging.info("Bring up {}".format(port)) + if self.sonichost.is_multi_asic: + asic_ns = self.get_port_asic_instance(port).namespace + return self.command(f"sudo config interface -n {asic_ns} startup {port}") + else: + return self.command(f"sudo config interface startup {port}") diff --git a/tests/common/platform/interface_utils.py b/tests/common/platform/interface_utils.py index 1c3c77c9fea..66c8a3196c3 100644 --- a/tests/common/platform/interface_utils.py +++ b/tests/common/platform/interface_utils.py @@ -43,6 +43,12 @@ def parse_intf_status(lines): return result +def get_dut_interfaces_status(duthost): + output = duthost.command("show interface description") + intf_status = parse_intf_status(output["stdout_lines"][2:]) + return intf_status + + def check_interface_status_of_up_ports(duthost): if duthost.facts['asic_type'] == 'vs' and duthost.is_supervisor_node(): return True diff --git a/tests/common/platform/transceiver_utils.py b/tests/common/platform/transceiver_utils.py index d4ac0b37c71..4409a702ccb 100644 --- a/tests/common/platform/transceiver_utils.py +++ b/tests/common/platform/transceiver_utils.py @@ -7,6 +7,8 @@ import re from copy import deepcopy +I2C_WAIT_TIME_AFTER_SFP_RESET = 5 # in seconds + def parse_transceiver_info(output_lines): """ diff --git a/tests/common/port_toggle.py b/tests/common/port_toggle.py index d387c7229d4..961b1b15b5e 100644 --- a/tests/common/port_toggle.py +++ b/tests/common/port_toggle.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) BASE_PORT_COUNT = 28.0 # default t0 topology has 28 ports to toggle +WAIT_TIME_AFTER_INTF_SHUTDOWN = 5 # in seconds def port_toggle(duthost, tbinfo, ports=None, wait_time_getter=None, wait_after_ports_up=60, watch=False): diff --git a/tests/platform_tests/api/test_sfp.py b/tests/platform_tests/api/test_sfp.py index da61175b566..f2f66d5d60f 100644 --- a/tests/platform_tests/api/test_sfp.py +++ b/tests/platform_tests/api/test_sfp.py @@ -1,12 +1,16 @@ import ast import logging import pytest +import time from tests.common.helpers.assertions import pytest_assert from tests.common.helpers.platform_api import sfp from tests.common.utilities import skip_release from tests.common.utilities import skip_release_for_platform from tests.common.platform.interface_utils import get_physical_port_indices +from tests.common.platform.interface_utils import check_interface_status_of_up_ports +from tests.common.port_toggle import default_port_toggle_wait_time, WAIT_TIME_AFTER_INTF_SHUTDOWN +from tests.common.platform.transceiver_utils import I2C_WAIT_TIME_AFTER_SFP_RESET from tests.common.utilities import wait_until from tests.common.fixtures.conn_graph_facts import conn_graph_facts # noqa F401 from tests.common.fixtures.duthost_utils import shutdown_ebgp # noqa F401 @@ -43,6 +47,7 @@ def setup(request, duthosts, enum_rand_one_per_hwsku_hostname, # We are interested only in ports that are used for device connection physical_intfs = conn_graph_facts["device_conn"][duthost.hostname] + sfp_setup["conn_interfaces"] = physical_intfs physical_port_index_map = get_physical_port_indices(duthost, physical_intfs) sfp_setup["physical_port_index_map"] = physical_port_index_map @@ -682,17 +687,53 @@ def test_reset(self, request, duthosts, enum_rand_one_per_hwsku_hostname, localh # TODO: Verify that the transceiver was actually reset duthost = duthosts[enum_rand_one_per_hwsku_hostname] skip_release_for_platform(duthost, ["202012"], ["arista", "mlnx"]) + port_index_to_info_dict = {} for i in self.sfp_setup["sfp_test_port_indices"]: info_dict = sfp.get_transceiver_info(platform_api_conn, i) if not self.expect(info_dict is not None, "Unable to retrieve transceiver {} info".format(i)): continue + port_index_to_info_dict[i] = info_dict ret = sfp.reset(platform_api_conn, i) if self.is_xcvr_resettable(request, info_dict): self.expect(ret is True, "Failed to reset transceiver {}".format(i)) else: self.expect(ret is False, "Resetting transceiver {} succeeded but should have failed".format(i)) + + # allow the I2C interface to recover post sfp reset + time.sleep(I2C_WAIT_TIME_AFTER_SFP_RESET) + + # shutdown and bring up in batch so that we don't have to add delay for each interface. + intfs_changed = [] + admin_up_port_list = duthost.get_admin_up_ports() + for intf in self.sfp_setup['conn_interfaces']: + if intf not in admin_up_port_list: + # skip interfaces which are not in admin up state. + continue + + sfp_port_idx = self.sfp_setup['physical_port_index_map'][intf] + # skip if info_dict is not retrieved during reset, which also means reset was not performed. + if sfp_port_idx not in port_index_to_info_dict: + continue + info_dict = port_index_to_info_dict[sfp_port_idx] + + # only flap interfaces where are CMIS optics, + # non-CMIS optics should stay up after sfp_reset(), no need to flap. + if "cmis_rev" in info_dict: + duthost.shutdown_interface(intf) + intfs_changed.append(intf) + + time.sleep(WAIT_TIME_AFTER_INTF_SHUTDOWN) + + for intf in intfs_changed: + duthost.no_shutdown_interface(intf) + + _, port_up_wait_time = default_port_toggle_wait_time(duthost, len(intfs_changed)) + if not wait_until(port_up_wait_time, 10, 0, + check_interface_status_of_up_ports, duthost): + self.expect(False, "Not all interfaces are up after reset") + self.assert_expectations() def test_tx_disable(self, duthosts, enum_rand_one_per_hwsku_hostname, localhost, platform_api_conn): # noqa F811 diff --git a/tests/platform_tests/sfp/test_sfputil.py b/tests/platform_tests/sfp/test_sfputil.py index f088a360b8f..334dea5dc85 100644 --- a/tests/platform_tests/sfp/test_sfputil.py +++ b/tests/platform_tests/sfp/test_sfputil.py @@ -16,6 +16,7 @@ from tests.common.utilities import skip_release, wait_until from tests.common.fixtures.duthost_utils import shutdown_ebgp # noqa F401 from tests.common.port_toggle import default_port_toggle_wait_time +from tests.common.platform.transceiver_utils import I2C_WAIT_TIME_AFTER_SFP_RESET from tests.common.platform.interface_utils import get_physical_port_indices cmd_sfp_presence = "sudo sfputil show presence" @@ -34,7 +35,6 @@ DOM_ENABLED = "enabled" DOM_POLLING_CONFIG_VALUES = [DOM_DISABLED, DOM_ENABLED] -I2C_WAIT_TIME_AFTER_SFP_RESET = 5 # in seconds WAIT_TIME_AFTER_LPMODE_SET = 3 # in seconds logger = logging.getLogger(__name__)