diff --git a/lab/README.md b/lab/README.md index 8d49c1a4..f22b6d43 100644 --- a/lab/README.md +++ b/lab/README.md @@ -1,41 +1,4 @@ # OPI Lab -- [Goals and Requirments](goals-and-requirements.md) -- Setup - - [Cables/Fibers/transceivers](./cables.md) - - [LAB IPs allocation](./ips.md) and [Inventory](./ansible/inventory) - - [Open Telemetry Observability](./otel.md) - - [Server Software Setup](server-setup.md) - - [Bill of Materials](bom.md) - - [Physical Testbed Setup](physical-testbed.md) - - [ToR - Arista DCS-7280TR-48C6-R](./hardware/ToR/README.md) - - [TS - Serial Consoles - Avocent ACS8000](./hardware/TS/README.md) - - [KVM - KVM Switch - Avocent MPU8032DAC](./hardware/KVM/README.md) - - [VPN - F5 BIG IP i4000](./hardware/VPN/README.md) - - [nPDU - Network PDU - Vertiv VP7N3001](./hardware/nPDU/README.md) - - [DH1 - DPU Host 1 - Dell R650](./hardware/dh1/README.md) - - [DH2 - DPU Host 2 - HPE DL360](./hardware/dh2/README.md) - - [DH3 - DPU Host 3 - HP RL300](./hardware/dh3/README.md) - - [DH4 - DPU Host 4 - Dell R760](./hardware/dh4/README.md) - - [A100G - 100G Switch - Arista DCS-7280CR](./hardware/A100G/README.md) - - [TGEN1 - Traffic Generator 1 - Supermicro](./hardware/tgen1/README.md) -- [Running the Test Cases](running-the-tests.md) +Moved to -## Passwords - -We manage all passwords in - -## Access the LAB - -- ask in [Opi Slack](https://join.slack.com/t/opi-project/shared_invite/zt-1ctqtrgkz-WJZrcVPp3P1ACZWjpZP2KQ) for initial user/password -- log into [vpn.opiproject-lab.org](http://vpn.opiproject-lab.org) via web browser and change first time password -- either allow the browser based Network Access application to be installed and the VPN will be launched -- or if your company restricts browser-based installs you can: - - If on Windows, install [F5 Access](./images/f5-vpn-msft-app.png) app from [www.microsoft.com/store/productId/9WZDNCRDSFN0](https://www.microsoft.com/store/productId/9WZDNCRDSFN0) - - If on Mac, install the F5 Access app from the Mac App store from [apps.apple.com/us/app/f5access/id1243219105](https://apps.apple.com/us/app/f5access/id1243219105) -- create new OPI [VPN connection](./images/add-vpn-windows.png) and enter your new credentials -- test it by going to [https://172.22.4.2/redfish/v1/](https://172.22.4.2/redfish/v1/) - -## Rack diagram - -![lab cabling diagram](./images/opi-lab-cabling.drawio.svg) diff --git a/lab/ansible/README.md b/lab/ansible/README.md deleted file mode 100644 index 5a44c70d..00000000 --- a/lab/ansible/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# What is this? - -This is an ansible playbook for helping set up an OPI LAB environment. - -## How do you use this? - -The ansible_user should have sudo privileges. Preferably, set up password-less -ssh and password-less sudo for that user, though you can instead instruct -ansible to ask for passwords. - -This is an example how to set password-less: - -```bash - ssh-copy-id root@172.22.X.X -``` - -Install dependencies: - -```bash -python3 -m pip install ansible-pylibssh -ansible-galaxy collection install -r collections/requirements.yml -``` - -Then run the playbook: - -```bash -ansible-playbook -i inventory -e ansible_password='TAKE-FROM-1-PASSWORD' setup.yml -``` - -Run the playbook only on specific host or group: - -```bash -ansible-playbook -i inventory -l mev -e ansible_password='TAKE-FROM-1-PASSWORD' setup.yml -``` - -## Example log - -```bash -root@dh1:~/opi-poc/lab/ansible# ansible-playbook -i inventory setup.yml - -PLAY [Set up for OPI LAB environment] ******************************************************************************************************************************************************* - -TASK [Gathering Facts] ********************************************************************************************************************************************************************** -ok: [dh2] -ok: [dh4] -ok: [dh1] -fatal: [dh3]: UNREACHABLE! => {"changed": false, "msg": "Failed to connect to the host via ssh: ssh: connect to host 172.22.1.3 port 22: No route to host", "unreachable": true} -fatal: [amd]: UNREACHABLE! => {"changed": false, "msg": "Failed to connect to the host via ssh: ssh: connect to host 172.22.3.1 port 22: No route to host", "unreachable": true} -fatal: [mrv]: UNREACHABLE! => {"changed": false, "msg": "Failed to connect to the host via ssh: ssh: connect to host 172.22.3.3 port 22: No route to host", "unreachable": true} -ok: [mev] -ok: [bf2] - -TASK [Testing] ****************************************************************************************************************************************************************************** -changed: [dh2] -changed: [dh4] -changed: [dh1] -changed: [bf2] -changed: [mev] - -PLAY RECAP ********************************************************************************************************************************************************************************** -amd : ok=0 changed=0 unreachable=1 failed=0 skipped=0 rescued=0 ignored=0 -bf2 : ok=2 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 -dh1 : ok=2 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 -dh2 : ok=2 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 -dh3 : ok=0 changed=0 unreachable=1 failed=0 skipped=0 rescued=0 ignored=0 -dh4 : ok=2 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 -mev : ok=2 changed=1 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0 -mrv : ok=0 changed=0 unreachable=1 failed=0 skipped=0 rescued=0 ignored=0 -``` diff --git a/lab/ansible/collections/requirements.yml b/lab/ansible/collections/requirements.yml deleted file mode 100644 index d04f555e..00000000 --- a/lab/ansible/collections/requirements.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -collections: - - community.docker - - community.general - - arista.eos diff --git a/lab/ansible/inventory b/lab/ansible/inventory deleted file mode 100644 index fc7cb885..00000000 --- a/lab/ansible/inventory +++ /dev/null @@ -1,37 +0,0 @@ -# from https://github.com/opiproject/opi-poc/blob/main/lab/ips.md - -[hostservers] -dh1 ansible_host=172.22.1.1 ansible_connection=ssh ansible_user=root -dh2 ansible_host=172.22.1.2 ansible_connection=ssh ansible_user=root -dh3 ansible_host=172.22.1.3 ansible_connection=ssh ansible_user=root -dh4 ansible_host=172.22.1.4 ansible_connection=ssh ansible_user=root - -[hostbmcs] -dh1bmc ansible_host=172.22.2.1 ansible_connection=local ansible_user=root -dh2bmc ansible_host=172.22.2.2 ansible_connection=local ansible_user=Administrator -dh3bmc ansible_host=172.22.2.3 ansible_connection=local ansible_user=Administrator -dh4bmc ansible_host=172.22.2.4 ansible_connection=local ansible_user=root - -[DPUs] -amd ansible_host=172.22.3.1 ansible_connection=ssh ansible_user=root -bf2 ansible_host=172.22.3.2 ansible_connection=ssh ansible_user=root -mrv ansible_host=172.22.3.3 ansible_connection=ssh ansible_user=root -mev ansible_host=192.168.0.2 ansible_connection=ssh ansible_user=root ansible_ssh_common_args='-J root@172.22.4.4' - -[dpubmcs] -bf2bmc ansible_host=172.22.4.2 ansible_connection=local ansible_user=root -mrvbmc ansible_host=172.22.4.3 ansible_connection=local ansible_user=root -mevbmc ansible_host=172.22.4.4 ansible_connection=ssh ansible_user=root - -[tgens] -tgen1 ansible_host=172.22.1.100 ansible_connection=ssh ansible_user=root - -[tgenbmcs] -tgen1bmc ansible_host=172.22.2.100 ansible_connection=local ansible_user=ADMIN - -[Management] -mgmt ansible_host=172.22.0.1 ansible_connection=ssh ansible_user=root - -[switches] -ToR ansible_host=172.22.0.5 ansible_connection=ansible.netcommon.network_cli ansible_user=arista ansible_ssh_pass=arista ansible_network_os=arista.eos.eos -A100G ansible_host=172.22.1.250 ansible_connection=ansible.netcommon.network_cli ansible_user=arista ansible_ssh_pass=arista ansible_network_os=arista.eos.eos diff --git a/lab/ansible/setup.yml b/lab/ansible/setup.yml deleted file mode 100644 index e11d63de..00000000 --- a/lab/ansible/setup.yml +++ /dev/null @@ -1,382 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) 2022 Dell Inc, or its subsidiaries. ---- - -- name: - hosts: all - gather_facts: false - tasks: - - name: Sync all clocks - ansible.builtin.shell: "date -s '{{ now() }}'" - -- name: - hosts: switches - gather_facts: false - tasks: - - name: run show version on remote devices - #become: yes - arista.eos.eos_command: - commands: - - show version - - show interfaces - #- show running-config - - # TODO: send lab/hardware/A100G/arista.config to the switch - # TODO: send lab/hardware/ToR/arista.config to the switch - - #- name: load config file onto an Arista switch - # become: yes - # arista.eos.eos_config: - # backup: true - # src: ../hardware/{{ inventory_hostname }}/arista.config - -- name: - hosts: hostbmcs - become: yes - tasks: - - name: Get Firmware Inventory - community.general.redfish_info: - category: Update - command: GetFirmwareInventory - baseuri: "{{ ansible_host }}" - username: "{{ ansible_user | default(ansible_env.USER) }}" - password: "{{ ansible_password }}" - register: result - - - name: Debug print first firmware entry version - ansible.builtin.debug: var=result.redfish_facts.firmware.entries[0].Version - - - name: Get BIOS attributes - community.general.redfish_info: - category: Systems - command: GetBiosAttributes - baseuri: "{{ ansible_host }}" - username: "{{ ansible_user | default(ansible_env.USER) }}" - password: "{{ ansible_password }}" - register: result - - - name: Debug print bios serial number - ansible.builtin.debug: msg={{ result.redfish_facts.bios_attribute.entries[0][1].SerialNumber | default(result.redfish_facts.bios_attribute.entries[0][1].SystemServiceTag) }} - - # TODO: configre BIOS to be always on ( see lab/hardware/dh123) and any virtualization or hyper threading settings we might need - - # TODO: consider flushing OS image (we use ubuntu currently) on the Host x86 servers, since we do it manually will now - - -- name: Intel MEV | Enable Proxy and Port Forwarding - hosts: mevbmc - become: yes - gather_facts: false - tasks: - - ansible.builtin.package: name=socat,squid state=present - - ansible.builtin.copy: src=../hardware/dh4/fs/etc/systemd/system/socat-otel.service dest=/etc/systemd/system/socat-otel.service - - ansible.builtin.service: name=socat-otel enabled=yes state=started daemon_reload=yes - - ansible.builtin.shell: sed -i 's/http_access deny all/http_access allow all/g' /etc/squid/squid.conf - - ansible.builtin.service: name=squid enabled=yes state=started - -- name: Install docker and other packages - hosts: hostservers,tgens,DPUs,Management - become: yes - environment: - http_proxy: "{{ 'http://192.168.0.1:3128' if inventory_hostname == 'mev' else '' }}" - https_proxy: "{{ 'http://192.168.0.1:3128' if inventory_hostname == 'mev' else '' }}" - tasks: - - ansible.builtin.package: state=present name=python3-pip,sshpass,git - - when: ansible_os_family == 'Debian' - block: - - ansible.builtin.get_url: url=https://download.docker.com/linux/ubuntu/gpg dest=/etc/apt/keyrings/docker.asc mode='0644' force=true - - ansible.builtin.shell: dpkg --print-architecture - register: deb_architecture - - ansible.builtin.apt_repository: state=present repo="deb [arch={{ deb_architecture.stdout}} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable" - - ansible.builtin.package: state=absent name=docker.io,docker-doc,docker-compose,docker-compose-v2,podman-docker,containerd,runc - - ansible.builtin.package: state=present name=docker-ce,docker-ce-cli,containerd.io,docker-buildx-plugin,docker-compose-plugin - - ansible.builtin.systemd: state=started name=docker - -- name: Monitoring - # Management server runs it via compose, see below. So skip it here - hosts: hostservers,tgens,DPUs - become: yes - tasks: - - # TODO: redfish fix IP and credentials for the BMC - - - name: Copy telegraf folder to remote - ansible.builtin.copy: src=../telegraf.d dest=/root - - # TODO: create new telegraf container or use same for Marvell card - - # TODO: see if there is an opportunity to consolidate and code dup removal - - - name: Nvidia | telegraf otel monitoring - when: inventory_hostname == 'bf2' - block: - - name: Nvidia | make sure emulation is running for temperature - ansible.builtin.systemd: state=started name=set_emu_param - - ansible.builtin.systemd: state=stopped name=mlnx_snap - - ansible.builtin.systemd: state=started name=spdk_tgt - - name: Nvidia | Run telegraf container on Nvidia BF - community.docker.docker_container: - name: telegraf - image: docker.io/library/telegraf:1.29 - state: started - restart: true - detach: true - network_mode: host - restart_policy: always - mounts: - - type: bind - source: /root/telegraf.d/telegraf.conf.bf2 - target: /etc/telegraf/telegraf.conf - read_only: true - - type: bind - source: /run/emu_param - target: /run/emu_param - read_only: true - - # TODO: see if there is an opportunity to consolidate and code dup removal - - - name: Intel | telegraf otel monitoring - when: inventory_hostname == 'mev' - environment: - http_proxy: http://192.168.0.1:3128 - https_proxy: http://192.168.0.1:3128 - block: - - name: Intel | Downgrade requests package due to bug https://github.com/ansible-collections/community.docker/issues/868 - ansible.builtin.pip: name=requests<2.32 - - name: Intel | Run telegraf container on Intel MEV - community.docker.docker_container: - name: telegraf - image: docker.io/library/telegraf:1.29 - state: started - restart: true - detach: true - network_mode: host - restart_policy: always - mounts: - - type: bind - source: /root/telegraf.d/telegraf.conf.mev - target: /etc/telegraf/telegraf.conf - read_only: true - - - name: Run telegraf container on others - when: - - inventory_hostname != 'mev' - - inventory_hostname != 'bf2' - community.docker.docker_container: - name: telegraf - image: docker.io/library/telegraf:1.29 - state: started - restart: true - detach: true - network_mode: host - restart_policy: always - mounts: - - type: bind - source: /root/telegraf.d - target: /etc/telegraf/telegraf.d - read_only: true - -- name: - hosts: Management - become: yes - tasks: - - name: Run Monitoring OTEL, Prometheus, Grafana - community.docker.docker_compose_v2: - project_src: /root/opi-poc/lab - register: output - -- name: Intel | Upgrade Intel Mev IMC and ACC FWs - hosts: mev - become: yes - vars: - imc_version: 1.4.0.8469 - imc_local_file: /root/intel-ipu-pldm-image-{{ imc_version }}.tar.gz - imc_remote_file: /work/intel-ipu-pldm-image-{{ imc_version }}/intel-ipu-pldm-{{ imc_version }}.bin - acc_version: "{{ imc_version }}" - acc_local_file: /root/intel-ipu-acc-eval-image-{{ acc_version }}.tar.gz - acc_remote_file: /work/intel-ipu-acc-eval-image-{{ acc_version }}/ACC/OS/acc-os-kernel.bin - ansible_remote_tmp: /tmp - tasks: - - - name: Fetch Intel Mev ACC running version - block: - - ansible.builtin.shell: cat /etc/issue - - ansible.builtin.shell: cat /etc/issue.net - register: result - - ansible.builtin.set_fact: acc_run_version={{ result.stdout | trim }} - - ansible.builtin.debug: var=acc_run_version - - ansible.builtin.fail: msg="Could not find ACC string in the running {{ acc_run_version }}" - when: not '"ACC" in acc_run_version' - - - name: Fetch Intel Mev IMC running version - delegate_to: mevbmc - block: - - ansible.builtin.shell: cat /etc/issue - - ansible.builtin.shell: cat /etc/issue.net - register: result - - ansible.builtin.set_fact: imc_run_version={{ result.stdout | trim }} - - ansible.builtin.debug: var=imc_run_version - - ansible.builtin.shell: /usr/bin/ipu-update -i - - ansible.builtin.fail: msg="Could not find IMC string in the running {{ imc_run_version }}" - when: not '"IMC" in imc_run_version' - - - name: Upgrade Intel Mev IMC FW to {{ imc_version }} - when: not imc_version in imc_run_version - delegate_to: mevbmc - block: - - name: Check if firmware image exists remotely {{ imc_remote_file }} - ansible.builtin.stat: path={{ imc_remote_file }} - register: imc_remote_file_check - - # FW file doesn't exist, copy and unpack it - - - name: Copy and Extract {{ imc_local_file }} into remote /work folder - when: not imc_remote_file_check.stat.exists - block: - - name: Check if firmware image exists locally {{ imc_local_file }} - delegate_to: localhost - ansible.builtin.stat: path={{ imc_local_file }} - register: imc_local_file_check - - - name: Copy and Unpack {{ imc_local_file }} into remote /work folder - when: imc_local_file_check.stat.exists - ansible.builtin.unarchive: src={{ imc_local_file }} dest=/work - - - name: Check again if firmware image exists remotely {{ imc_remote_file }} after copy and unpack - ansible.builtin.stat: path={{ imc_remote_file }} - register: imc_remote_file_check - - # FW file exists, use it to start upgrade - - - name: Start upgrade Intel Mev IMC FW using existing {{ imc_remote_file }} - when: imc_remote_file_check.stat.exists - block: - - ansible.builtin.shell: /usr/bin/ipu-update -i {{ imc_remote_file }} - # TODO: remove echo below - - ansible.builtin.shell: echo /usr/bin/ipu-update -u {{ imc_remote_file }} - - ansible.builtin.shell: echo reboot - - # TODO: now update ACC using /usr/bin/imc-scripts/acc_os_partition_provision.sh - - - name: Upgrade Intel Mev ACC FW to {{ acc_version }} - when: not acc_version in acc_run_version - delegate_to: mevbmc - block: - - name: Check if firmware image exists remotely {{ acc_remote_file }} - ansible.builtin.stat: path={{ acc_remote_file }} - register: acc_remote_file_check - - # FW file doesn't exist, copy and unpack it - - - name: Copy and Extract {{ acc_local_file }} into remote /work folder - when: not acc_remote_file_check.stat.exists - block: - - name: Check if firmware image exists locally {{ acc_local_file }} - delegate_to: localhost - ansible.builtin.stat: path={{ acc_local_file }} - register: acc_local_file_check - - - name: Copy and Unpack {{ acc_local_file }} into remote /work folder - when: acc_local_file_check.stat.exists - ansible.builtin.unarchive: src={{ acc_local_file }} dest=/work - - - name: Check again if firmware image exists remotely {{ acc_remote_file }} after copy and unpack - ansible.builtin.stat: path={{ acc_remote_file }} - register: acc_remote_file_check - - # FW file exists, use it to start upgrade - - - name: Start upgrade Intel Mev ACC FW using existing {{ acc_remote_file }} - when: acc_remote_file_check.stat.exists - block: - # TODO: remove echo below - - ansible.builtin.shell: ls /usr/bin/imc-scripts/acc_os_partition_provision.sh {{ acc_remote_file }} - -- name: - hosts: bf2 - become: yes - vars: - bfb_local_file: /root/bf-bundle-2.7.0-33_24.04_ubuntu-22.04_prod.bfb - bfb_url: https://content.mellanox.com/BlueField/BFBs/Ubuntu22.04/{{ bfb_local_file | basename }} - tasks: - - name: Nvidia | Fetch BlueField runnikng version - block: - - ansible.builtin.shell: cat /etc/mlnx-release - register: result - - ansible.builtin.set_fact: bf_version={{ result.stdout | trim }} - - ansible.builtin.debug: var=bf_version - - - name: Nvidia | Update BlueField using BFB image from the Host - delegate_to: dh2 - block: - - ansible.builtin.shell: lspci | grep BlueField - - ansible.builtin.service: name=rshim enabled=yes state=started - - ansible.builtin.shell: cat /dev/rshim0/misc - - ansible.builtin.shell: ifconfig tmfifo_net0 192.168.100.1/30 up - - - name: Check if firmware image exists locally {{ bfb_local_file }} - ansible.builtin.stat: path={{ bfb_local_file }} - register: bfb_local_file_check - - - name: Download firmware image {{ bfb_url }} - ansible.builtin.get_url: url={{ bfb_url }} dest={{ bfb_local_file }} mode='0440' - when: not bfb_local_file_check.stat.exists - - # TODO: handle ubuntu bfb cfg password - - # TODO: ansible.builtin.shell: "bfb-install --bfb {{ local_bfb }} --config /tmp/bf.cfg --rshim {{ rshim.dev }}" - - - name: Flush BFB image and reboot BF card - ansible.builtin.shell: cat {{ bfb_local_file }} > /dev/rshim0/boot - when: not hostvars.bf2.bf_version in bfb_local_file - -# from https://www.kernel.org/doc/html/v5.8/networking/device_drivers/pensando/ionic.html -- name: Pensando DSC installation-update - hosts: dh1 - become: yes - tasks: - - ansible.builtin.shell: lspci -d 1dd8:1002 - - ansible.builtin.shell: lspci | grep Pensando - - ansible.builtin.shell: dmesg | grep ionic - - ansible.builtin.shell: ls -l /sys/class/net/*/device - - ansible.builtin.shell: ls -l /sys/class/net/*/device/driver - - ansible.builtin.shell: ethtool -i enp25s0np0 - - ansible.builtin.shell: devlink dev info pci/0000:19:00.0 - - # TODO: update FW for Pensando DSC - -- name: Marvell CN106 installation-update - hosts: dh3 - become: yes - tasks: - - ansible.builtin.shell: lspci | grep Cavium - - ansible.builtin.shell: lsusb | grep CP2105 - - ansible.builtin.shell: ls -l /dev/ttyUSB* - - # TODO: update FW for Marvell CN106 - -- name: Secure Zero Touch Provisioning - hosts: DPUs - become: yes - tasks: - # TODO: enable this - - name: Nvidia | Fix dhcp client config and then run sztp agent container - when: inventory_hostname == 'bf123' - block: - - ansible.builtin.lineinfile: backup=true path=/etc/dhcp/dhclient.conf insertbefore='^option(.*)code(.*)$' line='option sztp-redirect-urls code 143 = text;' - - ansible.builtin.lineinfile: backup=true path=/etc/dhcp/dhclient.conf regexp='^(request .*)' line='\g<1> sztp-redirect-urls,' - - name: Nvidia | Run sztp agent container on Nvidia BF - community.docker.docker_container: - name: sztp - image: ghcr.io/opiproject/opi-sztp-client:main - state: started - restart: true - detach: true - network_mode: host - restart_policy: always - mounts: - - type: bind - source: /var/lib/NetworkManager/dhclient-aa93b667-6aac-3804-91e9-4958e07fdb2f-oob_net0.lease - target: /var/lib/dhclient/dhclient.leases - read_only: true - command: /opi-sztp-agent daemon --bootstrap-trust-anchor-cert /mnt/opi.pem --device-end-entity-cert /mnt/opi_cert.pem --device-private-key /mnt/opi_private_key.pem diff --git a/lab/bom.md b/lab/bom.md deleted file mode 100644 index baad5135..00000000 --- a/lab/bom.md +++ /dev/null @@ -1,219 +0,0 @@ -# Lab Bill of Materials - -[Spending Guidelines](https://github.com/opiproject/opi/blob/main/Policies/spending_guidelines.md) - -No vendor selection has happened yet, bellow table is for illustration/estimate only. - -| Item | Manufacturer | Part Number | Quantity | MSRP | Street | Budget | references | -|----------------------------------|--------------|--------------------|----------|----------|---------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 4 post rack | | | 1 | | $800 | $800 | | -| | | | | | $0 | $0 | | -| DPUs/IPUs | | | | | $0 | $0 | | -| Intel Mount Evans | Intel | | 2 | $4,000 | $4,000 | $8,000 | | -| Intel IPU E2000 | Intel | | 2 | $4,000 | $4,000 | $8,000 | | -| Marvell Octeon 10 | Marvell | | 2 | $4,000 | $4,000 | $8,000 | | -| Nvidia BlueField 2 | Nvidia | | 2 | $4,000 | $4,000 | $8,000 | [Product](https://www.nvidia.com/en-us/networking/products/data-processing-unit/) | -| Nvidia BlueField 3 | Nvidia | | 2 | $4,000 | $4,000 | $8,000 | [Product](https://www.nvidia.com/en-us/networking/products/data-processing-unit/) | -| AMD/Pensando DSC200 | AMD | | 2 | $4,000 | $4,000 | $8,000 | [Product](https://www.amd.com/system/files/documents/pensando-dsc-200-product-brief.pdf) | -| need full list of supported _PUs | | | | | $0 | $0 | | -| | | | | | $0 | $0 | | -| Servers | | | | | $0 | $0 | | -| VM Hypervisor | Dell | RAX QH12-22E4-2GPU | 1 | | $26,020 | $26,020 | [Shop](https://www.thinkmate.com/quotation-request?a=YToxOntzOjI6ImlkIjtpOjYxOTQ2Njt9) | -| DPU Host (1 or 2 _PUs) | Supermicro | RAX XS4-11S3-10G | 10 | | $5,131 | $51,310 | [Shop](https://www.thinkmate.com) | -| Storage Host | ASUS | ESC8000A-E11 | 1 | | $14,000 | $14,000 | [Product](https://www.asus.com/commercial-servers-workstations/esc8000a-e11/) | -| | | | | | $0 | $0 | | -| Traffic generators | | | | | $0 | $0 | | -| Server with 100G NICs | Supermicro | RAX XS4-11S3-10G | 2 | | $5,131 | $10,262 | [Shop](https://www.thinkmate.com) | -| hardware traffic generator | | | | | $0 | $0 | | -| | | | | | $0 | $0 | | -| Switches | | | | | $0 | $0 | | -| ToR 1G with 10G uplink | Cisco | | 1 | | $0 | $0 | | -| _PU to tgen | Arista | DCS-7280CR2-60-F | 2 | $149,995 | $20,000 | $40,000 | [Product](https://www.arista.com/assets/data/pdf/Datasheets/7280R-DataSheet.pdf) | -| 400G ? | | | | | | | | -| | | | | | $0 | $0 | | -| PDUs | | | | | $0 | $0 | | -| APC 885-1935 | APC | APDU9941 | 2 | $1,960 | $1,170 | $2,340 | [Product](https://www.apc.com/us/en/product/APDU9941/apc-rack-pdu-9000-switched-0u-30a-200v-and-208v-21-c13-and-c15-3-c19-and-c21-sockets/?range=61799-netshelter-switched-rack-pdus&selected-node-id=27602435913) [Shop](https://www.provantage.com/apc-apdu9941~7AMP987M.htm) | -| | | | | | $0 | $0 | | -| KVMs | | | | | $0 | $0 | | -| MergePoint Unity 8032 | Vertiv | MPU8032DAC-400 | 1 | $14,601 | $8,942 | $8,942 | [Product](https://www.vertiv.com/en-us/products-catalog/monitoring-control-and-management/ip-kvm/avocent-mergepoint-unity-digital-kvm-switches/) [Shop](https://www.provantage.com/vertiv-mpu8032dac-400~7LBRT80Q.htm) | -| KVM dongle | Vertiv | MPUIQ-VMCHS | 6 | $220 | $143 | $2,002 | [Product](https://www.provantage.com/vertiv-mpuiq-vmchs-g01~7AVOE04X.htm) | -| KVM to dongle cable 5m | | | 6 | $10 | $10 | $140 | | -| | | | | | $0 | $0 | | -| Terminal servers | | | | | $0 | $0 | | -| Avocent ACS8000 48p serial | Vertiv | ACS8048DAC-400 | 1 | $7,720 | $5,647 | $5,647 | [Product](https://www.vertiv.com/en-us/products-catalog/monitoring-control-and-management/serial-consoles-and-gateways/avocent-acs-8000-serial-consoles/) [Shop](https://www.amazon.com/Vertiv-Avocent-48-port-Console-ACS8048DAC-400/dp/B01N64R35P?th=1) | -| RJ45 5m serial cables | | | 14 | $10 | $10 | $140 | | -| DB9 to RJ45 addapter | | | 5 | $10 | $10 | $50 | | -| | | | | | | $0 | | -| Cables | | | | | | $0 | | -| 3M QSFP28 DAC cable | Molex | 1002971301 | 40 | $135 | $125 | $5,000 | | -| | | | | | | $0 | | -| Licenses | | | | | | 0 | | -| ESXi 8.0 license for 128 cores | VmWare | VS8-STD-C | 4 | | $1,100 | $4,400 | | -| | | | | | | | | -| TOTAL | | | | | | $237,327 | | - -another table probably needed to include the actual deployment info: - -* rack space for each device -* power used by each device -* power to heat to HVAC math - -## Phase 1 testbed diagram - -![xPU Rack phase 1](./images/opi-rack-phase1.svg) - -## Phase 1 bill of materials - -| Item | Manufacturer | Part Number | Quantity | MSRP | Street | Budget | references | -|----------------------------------|--------------|--------------------|----------|----------|---------|----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 4 post rack | | | 1 | | $800 | $800 | | -| | | | | | $0 | $0 | | -| DPUs/IPUs | | | | | $0 | $0 | | -| Intel Mount Evans | Intel | | 1 | $4,000 | $4,000 | $4,000 | | -| Marvell Octeon 10 | Marvell | | 1 | $4,000 | $4,000 | $4,000 | | -| Nvidia BlueField 2 | Nvidia | | 1 | $4,000 | $4,000 | $4,000 | [Product](https://www.nvidia.com/en-us/networking/products/data-processing-unit/) | -| AMD/Pensando DSC200 | AMD | | 1 | $4,000 | $4,000 | $4,000 | [Product](https://www.amd.com/system/files/documents/pensando-dsc-200-product-brief.pdf) | -| need full list of supported _PUs | | | | | $0 | $0 | | -| | | | | | $0 | $0 | | -| Servers | | | | | $0 | $0 | | -| VM Hypervisor | Dell | RAX QH12-22E4-2GPU | 1 | | $26,020 | $26,020 | [Shop](https://www.thinkmate.com/quotation-request?a=YToxOntzOjI6ImlkIjtpOjYxOTQ2Njt9) | -| DPU Host (1 or 2 _PUs) | Supermicro | RAX XS4-11S3-10G | 3 | | $5,131 | $15,393 | [Shop](https://www.thinkmate.com) | -| | | | | | $0 | $0 | | -| Traffic generators | | | | | $0 | $0 | | -| Server with 100G NICs | Supermicro | RAX XS4-11S3-10G | 1 | | $10,262 | $10,262 | [Shop](https://www.thinkmate.com) | -| | | | | | $0 | $0 | | -| Switches | | | | | $0 | $0 | | -| ToR 1G with 10G uplink | Cisco | | 1 | | $0 | $0 | | -| _PU to tgen | Arista | DCS-7280CR2-60-F | 1 | $149,995 | $20,000 | $20,000 | [Product](https://www.arista.com/assets/data/pdf/Datasheets/7280R-DataSheet.pdf) | -| 400G ? | | | | | | | | -| | | | | | $0 | $0 | | -| PDUs | | | | | $0 | $0 | | -| APC 885-1935 | APC | APDU9941 | 2 | $1,960 | $1,170 | $2,340 | [Product](https://www.apc.com/us/en/product/APDU9941/apc-rack-pdu-9000-switched-0u-30a-200v-and-208v-21-c13-and-c15-3-c19-and-c21-sockets/?range=61799-netshelter-switched-rack-pdus&selected-node-id=27602435913) [Shop](https://www.provantage.com/apc-apdu9941~7AMP987M.htm) | -| | | | | | $0 | $0 | | -| KVMs | | | | | $0 | $0 | | -| MergePoint Unity 8032 | Vertiv | MPU8032DAC-400 | 1 | $14,601 | $8,942 | $8,942 | [Product](https://www.vertiv.com/en-us/products-catalog/monitoring-control-and-management/ip-kvm/avocent-mergepoint-unity-digital-kvm-switches/) [Shop](https://www.provantage.com/vertiv-mpu8032dac-400~7LBRT80Q.htm) | -| KVM dongle | Vertiv | MPUIQ-VMCHS | 6 | $220 | $143 | $858 | [Product](https://www.provantage.com/vertiv-mpuiq-vmchs-g01~7AVOE04X.htm) | -| KVM to dongle cable 5m | | | 6 | $10 | $10 | $60 | | -| | | | | | $0 | $0 | | -| Terminal servers | | | | | $0 | $0 | | -| Avocent ACS8000 48p serial | Vertiv | ACS8048DAC-400 | 1 | $7,720 | $5,647 | $5,647 | [Product](https://www.vertiv.com/en-us/products-catalog/monitoring-control-and-management/serial-consoles-and-gateways/avocent-acs-8000-serial-consoles/) [Shop](https://www.amazon.com/Vertiv-Avocent-48-port-Console-ACS8048DAC-400/dp/B01N64R35P?th=1) | -| RJ45 5m serial cables | | | 7 | $10 | $10 | $70 | | -| DB9 to RJ45 addapter | | | 5 | $10 | $10 | $50 | | -| | | | | | | $0 | | -| Cables | | | | | | $0 | | -| 3M QSFP28 DAC cable | Molex | 1002971301 | 20 | $135 | $125 | $2,500 | | -| | | | | | | $0 | | -| Licenses | | | | | | 0 | | -| ESXi 8.0 license for 128 cores | VmWare | VS8-STD-C | 4 | | $1,100 | $4,400 | ? | -| | | | | | | | | -| TOTAL | | | | | | $113,342 | | - -## Selection criteria - -| Item | how | -|---------------------------------|-----------------------------------------------------------------------------------------------------------------------| -| features | must do the job required, expected, have the feature set | -| waranty | 1 year standard waranty as a minimum, can be extended one more year in the next year budged | -| support | no or standard support for 1 year, can be extended one more year in the next year budged | -| end of life date | product must not be on an end of life, end of support anouncement | -| api availability | functionality must be accesible via API for devices part of CI pipeline, not mandatory for manual testbed but desired | -| lab management tool integration | TBD | -| availability | must be available for purchase with max 30 days lead time | -| provenience | must be a genuine device | -| vendor / mfg | both vendor and product manufacturer must have a good past history | -| price | best price that meets the above criterias | -| DPU host vendor | strive for vendor heterogeniety (multiple vendors) | -| DPU host compatibility | DPU host must be on DPU vendor's compatibility list (if possible) | - -## Server details - -* cost to host each _PU -* many in a server or each in a unique server -* mandatory features to test interaction with the host - -### DPU HOST - RAX XS4-11S3-10G - -* 1U -* Intel C621A Chipset - 4x NVMe/SATA - 1x M.2 - Dual Intel 10-Gigabit Ethernet (RJ45) - 600W Power Supply -* 2x PCIE 4.0 x16 -* Intel Xeon Gold 6312U Processor 24-Core 2.4GHz 36MB Cache (185W) -* 8 x 16GB PC4-25600 3200MHz DDR4 ECC RDIMM -* 960GB Micron 7450 PRO Series M.2 PCIe 4.0 x4 NVMe Solid State Drive (80mm) -* Connect X6 2x100G nic -* Thinkmate Server Manager (Datacenter Management Package) -* Thinkmate 3 Year Depot Warranty (Return for Repair) -* ??? does it has PCIE 4/6/8 pin power adapter since some _PU needs it - -### DPU HOST - RAX QS4-11E4 - -* 1U -* has PCIE 5.0 x16 - -### DPU HOST - ESC8000A-E11 - -* 4U -* has 8 slots of PCIE 4.0 x16 with dedicated PCIE power connector for each slot -* a reboot will restart all 8 _PUs - -### GPU servers may be more apropiate for _PU hosting - -* better cooling -* more room (2 PCIE slots per PCIE connector) -* dedicated PCIE power connector - -### VM HOST - RAX QH12-22E4-2GPU - -* AMD EPYC 9004 Series - 2U - 2 GPUs - 8x 3.5" SAS/SATA + 4x 3.5" Hybrid - 2x M.2 NVMe - Dual GbE (RJ45) - 2400W Redundant -* 2 x AMD EPYC 9554 Processor 64-core 3.10GHz 256MB Cache (360W) -* 24 x 16GB PC5-38400 4800MHz DDR5 ECC RDIMM -* 2 x 3.84TB Samsung PM9A3 Series M.2 PCIe 4.0 x4 NVMe Solid State Drive -* Trusted Platform Module - TPM 2.0 -* Thinkmate Server Management Software Package - IPMI and Redfish Compatible -* 2 x AC Power Cord (North America), C13, NEMA 5-15P, 2.1m CAB-AC -* Thinkmate Standard Rail Kit for 1U/2U Servers (Square Hole) (Included) -* Thinkmate 3 Year Advanced Parts Replacement Warranty (Zone 0) - -### Pedestal servers , why ??? - -### storage server or back up in AWS/Azure/etc - -### Cables - -* Optical cables -* Transceivers -* DAC cables - -### Switches - -* Top of Rack Layer 2/3 switches -* Layer 1 switches -* Programmable switches - -### PDUs - -* PDUs -* C13/C14/C15 power cables - -### KVMs - -* KVMs -* Dongles - -### Terminal servers - -### DPUs/IPUs - -* Generally available DPUs -* Generally available IPUs - -### Software - -* OS licenses -* Software required to enable hardware - -### Hosting services (University of New Hamshire Interoperatability Lab) - -* Rack space -* Electricity -* HVAC -* Services (lab tech to change cables) diff --git a/lab/cables.md b/lab/cables.md deleted file mode 100644 index 8fdaa603..00000000 --- a/lab/cables.md +++ /dev/null @@ -1,26 +0,0 @@ -# Cables and Transceivers - -## SFP28/SFP+ to 1G RJ45 - -Used on servers that have management 10G/25G management nic. - -- HP 453154-B21 - 1G SFP RJ-45 Transceiver -- HP 813874-B21 - 10GBase-T SFP+ Transceiver - -## 100G DAC cables - -Used for all 100G QSFP28 ports - -- Molex 1002971101 1M -- Molex 1002971201 2M -- Molex 1002971301 3M - -## 1G Cat6 Network cables - -Used for 1G or 10G RJ45 connectivity for management network or for KVM Switch to KVM dongles - -- various lengths and colors - -## RJ45 Serial cables - -Used to connect the Serial Switch to the serial management port. diff --git a/lab/config/grafana-dashboard-servers.json b/lab/config/grafana-dashboard-servers.json deleted file mode 100644 index a0c7ca79..00000000 --- a/lab/config/grafana-dashboard-servers.json +++ /dev/null @@ -1,440 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 1, - "links": [], - "liveNow": false, - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 11, - "x": 0, - "y": 0 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "temp_temp", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Temperature", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 13, - "x": 11, - "y": 0 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "disk_free", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Disk", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 11, - "x": 0, - "y": 12 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "mem_available", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Memory", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 12, - "w": 13, - "x": 11, - "y": 12 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "cpu_usage_idle", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "CPU", - "type": "timeseries" - } - ], - "refresh": "", - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Memory & Temp", - "uid": "ddb82bae-1e00-4c70-aba6-b03249088133", - "version": 1, - "weekStart": "" -} diff --git a/lab/config/grafana-dashboard-switches.json b/lab/config/grafana-dashboard-switches.json deleted file mode 100644 index ef720589..00000000 --- a/lab/config/grafana-dashboard-switches.json +++ /dev/null @@ -1,758 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "description": "Switches : Interface Rate (Prometheus)", - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 2, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 7, - "panels": [], - "repeat": "netif", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "refId": "A" - } - ], - "title": "$netif", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [ - { - "options": { - "down": { - "text": "DOWN" - }, - "up": { - "text": "UP" - } - }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 2, - "x": 0, - "y": 1 - }, - "id": 9, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "/^operstate$/", - "values": false - }, - "textMode": "value", - "wideLayout": true - }, - "pluginVersion": "10.2.3", - "targets": [ - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "interfaces_ifindex{source=~\"[[switch]]\",name=~\"[[netif]]\"}", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Link Status", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "indexByName": {}, - "renameByName": {} - } - } - ], - "transparent": true, - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 2, - "x": 2, - "y": 1 - }, - "id": 10, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "10.2.3", - "targets": [ - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "interfaces_mtu{source=~\"[[switch]]\",name=~\"[[netif]]\"}", - "instant": true, - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Link MTU", - "transparent": true, - "type": "stat" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 6, - "x": 4, - "y": 1 - }, - "hiddenSeries": false, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "10.2.3", - "pointradius": 2, - "points": false, - "renderer": "flot", - "repeatDirection": "v", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "ifcounters_in_bits_per_second{host=~\"[[switch]]\",name=~\"[[netif]]\"}", - "interval": "", - "legendFormat": "IN (bps)", - "queryType": "randomWalk", - "range": true, - "refId": "A" - }, - { - "datasource": { - "uid": "Prometheus" - }, - "expr": "rate(ifcounters_out_bits_per_second{host=~\"[[switch]]\",name=~\"[[netif]]\"}[20s])", - "interval": "", - "legendFormat": "OUT (bps)", - "refId": "B" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "$netif : Bandwidth", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transparent": true, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 6, - "w": 6, - "x": 10, - "y": 1 - }, - "hiddenSeries": false, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "10.2.3", - "pointradius": 2, - "points": false, - "renderer": "flot", - "repeatDirection": "v", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "interfaces_counters_in_unicast_pkts{source=~\"[[switch]]\",name=~\"[[netif]]\"}", - "interval": "", - "legendFormat": "IN (pps)", - "queryType": "randomWalk", - "range": true, - "refId": "A" - }, - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "rate(interfaces_counters_out_unicast_pkts{source=~\"[[switch]]\",name=~\"[[netif]]\"}[20s])", - "interval": "", - "legendFormat": "OUT (pps)", - "range": true, - "refId": "B" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "$netif : Packets", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transparent": true, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "pps", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 6, - "w": 6, - "x": 16, - "y": 1 - }, - "hiddenSeries": false, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "10.2.3", - "pointradius": 2, - "points": false, - "renderer": "flot", - "repeatDirection": "v", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "interfaces_counters_in_discards{source=~\"[[switch]]\",name=~\"[[netif]]\"}", - "interval": "", - "legendFormat": "IN (discards)", - "queryType": "randomWalk", - "range": true, - "refId": "A" - }, - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "rate(interfaces_counters_out_discards{source=~\"[[switch]]\",name=~\"[[netif]]\"}[20s])", - "interval": "", - "legendFormat": "OUT (discards)", - "range": true, - "refId": "B" - }, - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "rate(interfaces_counters_in_errors{source=~\"[[switch]]\",name=~\"[[netif]]\"}[20s])", - "interval": "", - "legendFormat": "IN (errors)", - "range": true, - "refId": "C" - }, - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "rate(interfaces_counters_out_errors{source=~\"[[switch]]\",name=~\"[[netif]]\"}[20s])", - "interval": "", - "legendFormat": "OUT (errors)", - "range": true, - "refId": "D" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "$netif : Drops/Errors", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transparent": true, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "fixed" - }, - "mappings": [], - "noValue": "NA", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Gbits" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 4, - "x": 0, - "y": 4 - }, - "id": 11, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "value", - "wideLayout": true - }, - "pluginVersion": "10.2.3", - "targets": [ - { - "datasource": { - "uid": "Prometheus" - }, - "editorMode": "code", - "expr": "((interfaces_ifindex{source=~\"[[switch]]\",name=~\"[[netif]]\"})*8)/1000000000", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "", - "queryType": "randomWalk", - "refId": "A" - } - ], - "title": "Link Speed", - "transparent": true, - "type": "stat" - } - ], - "refresh": "", - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [ - { - "auto": true, - "auto_count": 30, - "auto_min": "10s", - "current": { - "selected": false, - "text": "1m", - "value": "1m" - }, - "hide": 0, - "label": "Sampling", - "name": "Sampling", - "options": [ - { - "selected": false, - "text": "auto", - "value": "$__auto_interval_Sampling" - }, - { - "selected": false, - "text": "10s", - "value": "10s" - }, - { - "selected": false, - "text": "30s", - "value": "30s" - }, - { - "selected": true, - "text": "1m", - "value": "1m" - }, - { - "selected": false, - "text": "2m", - "value": "2m" - }, - { - "selected": false, - "text": "5m", - "value": "5m" - }, - { - "selected": false, - "text": "10m", - "value": "10m" - }, - { - "selected": false, - "text": "30m", - "value": "30m" - }, - { - "selected": false, - "text": "1h", - "value": "1h" - } - ], - "query": "10s,30s,1m,2m,5m,10m,30m,1h", - "queryValue": "", - "refresh": 2, - "skipUrlSync": false, - "type": "interval" - }, - { - "current": { - "selected": false, - "text": "172.22.0.5", - "value": "172.22.0.5" - }, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "definition": "label_values(interfaces_counters_in_unicast_pkts,source)", - "hide": 0, - "includeAll": false, - "label": "switch", - "multi": false, - "name": "switch", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(interfaces_counters_in_unicast_pkts,source)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": { - "selected": true, - "text": [ - "Management1" - ], - "value": [ - "Management1" - ] - }, - "datasource": { - "type": "prometheus", - "uid": "PBFA97CFB590B2093" - }, - "definition": "label_values(interfaces_counters_in_unicast_pkts{source=\"$switch\"},name)", - "hide": 0, - "includeAll": true, - "label": "Network Interface", - "multi": true, - "name": "netif", - "options": [], - "query": { - "qryType": 1, - "query": "label_values(interfaces_counters_in_unicast_pkts{source=\"$switch\"},name)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-5m", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "Switches : Interface Rate (P)", - "uid": "467bc3PMz", - "version": 14, - "weekStart": "" -} diff --git a/lab/config/grafana-dashboards.yaml b/lab/config/grafana-dashboards.yaml deleted file mode 100644 index 4f0d356e..00000000 --- a/lab/config/grafana-dashboards.yaml +++ /dev/null @@ -1,11 +0,0 @@ -# config file version -apiVersion: 1 - -providers: - - name: 'default' - orgId: 1 - folder: '' - folderUid: '' - type: file - options: - path: /var/lib/grafana/dashboards diff --git a/lab/config/grafana-datasources.yml b/lab/config/grafana-datasources.yml deleted file mode 100644 index d3ac2652..00000000 --- a/lab/config/grafana-datasources.yml +++ /dev/null @@ -1,13 +0,0 @@ -# config file version -apiVersion: 1 - -datasources: - - name: Prometheus - type: prometheus - access: proxy - # Access mode - proxy (server in the UI) or direct (browser in the UI). - url: http://prometheus:9090 - jsonData: - httpMethod: POST - manageAlerts: true - disableRecordingRules: false diff --git a/lab/config/grafana.ini b/lab/config/grafana.ini deleted file mode 100644 index 16427069..00000000 --- a/lab/config/grafana.ini +++ /dev/null @@ -1,7 +0,0 @@ -[auth] -disable_login_form = true - -[auth.anonymous] -enabled = true -org_name = Main Org. -org_role = Admin diff --git a/lab/config/otel-collector-config.yaml b/lab/config/otel-collector-config.yaml deleted file mode 100644 index 512203f4..00000000 --- a/lab/config/otel-collector-config.yaml +++ /dev/null @@ -1,35 +0,0 @@ ---- -receivers: - otlp: - protocols: - grpc: - -exporters: - prometheus: - endpoint: "0.0.0.0:8889" - const_labels: - label1: value1 - - logging: - -processors: - batch: - -extensions: - health_check: - pprof: - endpoint: :1888 - zpages: - endpoint: :55679 - -service: - extensions: [pprof, zpages, health_check] - pipelines: - traces: - receivers: [otlp] - processors: [batch] - exporters: [logging] - metrics: - receivers: [otlp] - processors: [batch] - exporters: [logging, prometheus] diff --git a/lab/config/prometheus.yaml b/lab/config/prometheus.yaml deleted file mode 100644 index 4c82de85..00000000 --- a/lab/config/prometheus.yaml +++ /dev/null @@ -1,7 +0,0 @@ ---- -scrape_configs: - - job_name: 'otel-collector' - scrape_interval: 10s - static_configs: - - targets: ['otel-gw-collector:8889'] - - targets: ['otel-gw-collector:8888'] diff --git a/lab/docker-compose.yml b/lab/docker-compose.yml deleted file mode 100644 index 24caef4f..00000000 --- a/lab/docker-compose.yml +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (c) 2022-2023 Dell Inc, or its subsidiaries. ---- -version: "3.7" - -services: - - telegraf: - restart: unless-stopped - image: docker.io/library/telegraf:1.29 - volumes: - - /:/hostfs:ro - - ./telegraf.d:/etc/telegraf/telegraf.d:ro - network_mode: host - environment: - HOST_MOUNT_PREFIX: /hostfs - HOST_PROC: /hostfs/proc - - otel-gw-collector: - restart: unless-stopped - image: docker.io/otel/opentelemetry-collector:0.92.0 - command: ["--config=/etc/otel-collector-config.yaml"] - volumes: - - ./config/otel-collector-config.yaml:/etc/otel-collector-config.yaml:z - ports: - - "1888:1888" # pprof extension - - "8888:8888" # Prometheus metrics exposed by the collector - - "8889:8889" # Prometheus exporter metrics - - "13133:13133" # health_check extension - - "4317:4317" # OTLP gRPC receiver - - "55679:55679" # zpages extension - networks: - - opi - healthcheck: - test: wget --no-verbose --tries=1 --spider http://localhost:13133/ping || exit 1 - - prometheus: - restart: unless-stopped - image: docker.io/prom/prometheus:v2.48.1 - volumes: - - ./config/prometheus.yaml:/etc/prometheus/prometheus.yml:z - ports: - - "9091:9090" - networks: - - opi - healthcheck: - test: ["CMD", "wget", "http://localhost:9090"] - - grafana: - restart: unless-stopped - image: docker.io/grafana/grafana:10.2.3 - volumes: - - ./config/grafana.ini:/etc/grafana/grafana.ini - - ./config/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasource.yaml - - ./config/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/default.yaml - - ./config/grafana-dashboard-servers.json:/var/lib/grafana/dashboards/servers.json - - ./config/grafana-dashboard-switches.json:/var/lib/grafana/dashboards/switches.json - ports: - - "3000:3000" - networks: - - opi - healthcheck: - test: wget --no-verbose --tries=1 --spider http://localhost:3000/ || exit 1 - interval: 6s - timeout: 10s - retries: 3 - - mysql: - restart: unless-stopped - image: mysql:8.0 - hostname: mysql - networks: - - opi - volumes: - - semaphore-mysql:/var/lib/mysql - environment: - MYSQL_RANDOM_ROOT_PASSWORD: 'yes' - MYSQL_DATABASE: semaphore - MYSQL_USER: semaphore - MYSQL_PASSWORD: semaphore - - semaphore: - restart: unless-stopped - networks: - - opi - ports: - - 4000:3000 - image: semaphoreui/semaphore:v2.9.75 - environment: - SEMAPHORE_DB_USER: semaphore - SEMAPHORE_DB_PASS: semaphore - SEMAPHORE_DB_HOST: mysql # for postgres, change to: postgres - SEMAPHORE_DB_PORT: 3306 # change to 5432 for postgres - SEMAPHORE_DB_DIALECT: mysql # for postgres, change to: postgres - SEMAPHORE_DB: semaphore - SEMAPHORE_PLAYBOOK_PATH: /tmp/semaphore/ - SEMAPHORE_ADMIN_PASSWORD: changeme - SEMAPHORE_ADMIN_NAME: admin - SEMAPHORE_ADMIN_EMAIL: admin@localhost - SEMAPHORE_ADMIN: admin - SEMAPHORE_ACCESS_KEY_ENCRYPTION: gs72mPntFATGJs9qK0pQ0rKtfidlexiMjYCH9gWKhTU= - TZ: UTC - depends_on: - - mysql - - portainer: - restart: unless-stopped - image: portainer/portainer-ce:2.0.0 - command: -H unix:///var/run/docker.sock - networks: - - opi - ports: - - 9000:9000 - volumes: - - /var/run/docker.sock:/var/run/docker.sock - - portainer_data:/data - - bootstrap: - image: ghcr.io/opiproject/opi-sztp-server:main - environment: - SZTPD_INIT_PORT: 6080 - SZTPD_NBI_PORT: 7080 - SZTPD_SBI_PORT: 8080 - SZTPD_INIT_MODE: 1 - SZTPD_ACCEPT_CONTRACT: "Yes" - SZTPD_INIT_ADDR: 0.0.0.0 - SZTPD_OPI_MODE: "running" - SZTPD_RETRY_ATTEMPTS: 30 - ports: - - 7080:7080 - - 8080:8080 - networks: - - opi - healthcheck: - test: ["CMD-SHELL", "curl --fail -H Accept:application/yang-data+json http://127.0.0.1:$$SZTPD_NBI_PORT/.well-known/host-meta || exit 1"] - - web: - image: docker.io/library/httpd:2.4.57-alpine3.17 - volumes: - - ./sztp/my-boot-image.img:/usr/local/apache2/htdocs/my-boot-image.img - ports: - - 80:80 - networks: - - opi - -volumes: - semaphore-mysql: - portainer_data: - -networks: - opi: diff --git a/lab/goals-and-requirements.md b/lab/goals-and-requirements.md deleted file mode 100644 index 1e872b9f..00000000 --- a/lab/goals-and-requirements.md +++ /dev/null @@ -1,115 +0,0 @@ -# OPI Lab Goals - -The OPI members may contribute some hardware -to be co-managed between the member and the lab staff. -Having hardware in the OPI lab helps -to validate compatibility and to avoid regressions. - -## Usage - -The OPI lab should be used for: - -Phase 0 - -- define partitioning -- budgeting -- hardware duplication only when needed and if needed - -- automatic testing - - Phase 1: - - Functional validation and regression - - scheduled functional validation - - patch validation (CI) - - No Access except the lab owner(s) responsible to maintain the CI/CD - - Conformance - - Phase 2: - - Interoperability - - Reserve IPU/DPUs from multiple vendors to run the test - - Test portability, compatibility -- manual - - Phase 3: - - Drive adoption - - - Open Access All IPU/DPUs accessible to all OPI members - - Demos, Conferences - OPI Lab preferred - - Learning, Training - Vendor Lab preferred - - Development lab for SW vendors - - - Phase 4: - - certification - -The performance data generated in the lab shall not be used -for competitive marketing purposes. -Conformance data or certifications from OPI labs can be used for marketing purposes. - -## Properties - -In order to serve the OPI community, these properties should be targeted: - -- reliability: always running, without false positive result -- neutrality: same automatic tests and configurations on all HW/SW platforms -- security: access is restricted to avoid unexpected changes - -## Hosting considerations - -- 2 years minimum engagement -- 1 year billing intervals - -## For consideration - -- ease to replicate the testbed -- ease of use -- separation (physical or by time bound reservation) of HW/SW used for automated regression from manual -- clean start before each reservation (restore to factory defaults functionality) - -## OPI Lab Requirements - -## Access - -- restricted access to the automated setup (only maintainers get access) -- reservation based access to the manual setup - -## Backup - -- All critical servers must be backed up. -- Restoration of critical servers must be done within a couple of hours. -- Hosting provider must be trained to do backup/restore. - -## Capacity - -- Wait queue for automatic tests should be less than 1 hour. -- Sufficient excess capacity so that manual workloads does not affect automatic tests too much. -- System must support firmware updates. - -## Process - -- End users given 2 weeks notice for major lab works -- End users given 1 week notice for regular lab maintenance works -- End users notified as soon as it is known immediate emergency lab work need to be done -- Hosting provider must be notified 2 days in advance of a change on a machine - in order to avoid false negative results due to an upgrade in process. -- Any environment change (firmware, OS update or new software) - and test changes (new or updated use case) - must be validated first before being used in automatic tests. -- Lab inventory management system in place -- Lab tech updates inventory as needed -- Lab manager assigned -- Process in place to forecast equipment needs -- Questionnaire in place to be handed out to lab users to help with equipment forecast -- Lab requirements reviewed by OPI TSC -- Sufficient budget in place to fund initial capital costs -- Sufficient budget in place to fund ongoing operations -- Ticketing system in place to request lab services - -## Documentation - -- Wiki showing servers, assigned owners, emergency console and SSH access info -- Documentation in place for browsing, requesting, and releasing lab resources - - e.g. how to request for new physical servers with power cycler -- Documentation in place for lab resources and interconnection - - example: - -## SLA - -- SSH access at 99% availability. -- Automatic tests running at 95% availability (includes maintenance downtime). diff --git a/lab/hardware/A100G/README.md b/lab/hardware/A100G/README.md deleted file mode 100644 index 382cbdd6..00000000 --- a/lab/hardware/A100G/README.md +++ /dev/null @@ -1,115 +0,0 @@ -# 100G Switch - -This device is intended to mold the test traffic between the trafic generators and the devices under test - -## HW spec - -Arista 7280R, 48x100GbE QSFP and 8x40GbE QSFP+ switch, front to rear air, 2 x AC and 2 x C19-C20 cords - -- Part number: Arista DCS-7280CR-48-F -- Hardware version: 11.01 -- Software image version: 4.20.10M - -```bash -A100G#show version -Arista DCS-7280CR-48-F -Hardware version: 11.01 -Serial number: JPE16491339 -System MAC address: 444c.a897.b5cf - -Software image version: 4.20.10M -Architecture: i386 -Internal build version: 4.20.10M-10040268.42010M -Internal build ID: 5ba83857-5952-4713-b850-6e3c7c79cac3 - -Uptime: 4 weeks, 6 days, 19 hours and 1 minutes -Total memory: 15992160 kB -Free memory: 13583756 kB -``` - -## Management - -- serial port -- 1G ethernet management port - -## gNMI - -```bash -enable -configure -management api gnmi - transport grpc openmgmt - port 5900 -``` - -and - -```bash -A100G#show management api gnmi -Enabled: Yes -Server: running on port 5900, in default VRF -SSL Profile: none -``` - -test - -```bash -root@dh1:~# docker run --network host --rm ghcr.io/openconfig/gnmic get --log --username arista --password arista --insecure --address 172.22.1.250 --port 5900 --path /openconfig-interfaces:interfaces/interface[name=Management1]/state -2024/05/16 18:50:27.749705 [gnmic] version=0.37.0, commit=05a3e785, date=2024-05-13T23:27:31Z, gitURL=https://github.com/openconfig/gnmic, docs=https://gnmic.openconfig.net -2024/05/16 18:50:27.749742 [gnmic] using config file "" -2024/05/16 18:50:27.750028 [gnmic] sending gNMI GetRequest: prefix='', path='[elem:{name:"openconfig-interfaces:interfaces"} elem:{name:"interface" key:{key:"name" value:"Management1"}} elem:{name:"state"}]', type='ALL', encoding='JSON', models='[]', extension='[]' to 172.22.1.250 -2024/05/16 18:50:27.750656 [gnmic] creating gRPC client for target "172.22.1.250" -[ - { - "source": "172.22.1.250", - "time": "1970-01-01T00:00:00Z", - "updates": [ - { - "Path": "interfaces/interface[name=Management1]/state", - "values": { - "interfaces/interface/state": { - "arista-intf-augments:inactive": false, - "openconfig-interfaces:admin-status": "UP", - "openconfig-interfaces:counters": { - "in-broadcast-pkts": "36076006", - "in-discards": "0", - "in-errors": "0", - "in-multicast-pkts": "184661", - "in-octets": "10041381967", - "in-unicast-pkts": "93505125", - "out-broadcast-pkts": "8649", - "out-discards": "0", - "out-errors": "0", - "out-multicast-pkts": "184034", - "out-octets": "23569377019", - "out-unicast-pkts": "116336808" - }, - "openconfig-interfaces:description": "", - "openconfig-interfaces:enabled": true, - "openconfig-interfaces:ifindex": 999001, - "openconfig-interfaces:last-change": "171036192364", - "openconfig-interfaces:mtu": 0, - "openconfig-interfaces:name": "Management1", - "openconfig-interfaces:oper-status": "UP", - "openconfig-interfaces:type": "ethernetCsmacd" - } - } - } - ] - } -] -``` - -## Docs - -[7280R-DataSheet](https://www.arista.com/assets/data/pdf/Datasheets/7280R-DataSheet.pdf) - -## Config - -[Config](arista.config) - -## Pictures - -![front](front.jpg) - -![back](back.png) diff --git a/lab/hardware/A100G/arista.config b/lab/hardware/A100G/arista.config deleted file mode 100644 index 98712871..00000000 --- a/lab/hardware/A100G/arista.config +++ /dev/null @@ -1,189 +0,0 @@ -! Command: show running-config -! device: A100G (DCS-7280CR-48, EOS-4.20.10M) -! -! boot system flash:/EOS-4.20.10.M.swi -! -transceiver qsfp default-mode 4x10G -! -hostname A100G -ip name-server vrf default 8.8.8.8 -! -spanning-tree mode mstp -! -no aaa root -! -username admin role network-admin secret sha512 zzzzzzzzzzzzzzzzzzzzzz -! -interface Ethernet1/1 -! -interface Ethernet2/1 -! -interface Ethernet3/1 -! -interface Ethernet4/1 -! -interface Ethernet5/1 - error-correction encoding reed-solomon -! -interface Ethernet6/1 -! -interface Ethernet7/1 -! -interface Ethernet8/1 -! -interface Ethernet9/1 -! -interface Ethernet10/1 -! -interface Ethernet11/1 -! -interface Ethernet12/1 -! -interface Ethernet13/1 -! -interface Ethernet14/1 -! -interface Ethernet15/1 -! -interface Ethernet16/1 -! -interface Ethernet17/1 -! -interface Ethernet18/1 -! -interface Ethernet19/1 -! -interface Ethernet20/1 -! -interface Ethernet21/1 -! -interface Ethernet22/1 -! -interface Ethernet23/1 -! -interface Ethernet24/1 -! -interface Ethernet25/1 -! -interface Ethernet26/1 -! -interface Ethernet27/1 -! -interface Ethernet28/1 -! -interface Ethernet29/1 -! -interface Ethernet30/1 -! -interface Ethernet31/1 -! -interface Ethernet32/1 -! -interface Ethernet33/1 -! -interface Ethernet34/1 -! -interface Ethernet35/1 -! -interface Ethernet36/1 -! -interface Ethernet37/1 -! -interface Ethernet38/1 -! -interface Ethernet39/1 -! -interface Ethernet40/1 -! -interface Ethernet41/1 -! -interface Ethernet42/1 -! -interface Ethernet43/1 -! -interface Ethernet44/1 -! -interface Ethernet45/1 -! -interface Ethernet46/1 -! -interface Ethernet47/1 -! -interface Ethernet48/1 - shutdown -! -interface Ethernet49/1 -! -interface Ethernet49/2 -! -interface Ethernet49/3 -! -interface Ethernet49/4 -! -interface Ethernet50/1 -! -interface Ethernet50/2 -! -interface Ethernet50/3 -! -interface Ethernet50/4 -! -interface Ethernet51/1 -! -interface Ethernet51/2 -! -interface Ethernet51/3 -! -interface Ethernet51/4 -! -interface Ethernet52/1 -! -interface Ethernet52/2 -! -interface Ethernet52/3 -! -interface Ethernet52/4 -! -interface Ethernet53/1 -! -interface Ethernet53/2 -! -interface Ethernet53/3 -! -interface Ethernet53/4 -! -interface Ethernet54/1 -! -interface Ethernet54/2 -! -interface Ethernet54/3 -! -interface Ethernet54/4 -! -interface Ethernet55/1 -! -interface Ethernet55/2 -! -interface Ethernet55/3 -! -interface Ethernet55/4 -! -interface Ethernet56/1 -! -interface Ethernet56/2 -! -interface Ethernet56/3 -! -interface Ethernet56/4 -! -interface Management1 - ip address dhcp - dhcp client accept default-route -! -no ip routing -! -management api gnmi - transport grpc default - port 5900 -! -end diff --git a/lab/hardware/A100G/back.png b/lab/hardware/A100G/back.png deleted file mode 100644 index 9245c48e..00000000 Binary files a/lab/hardware/A100G/back.png and /dev/null differ diff --git a/lab/hardware/A100G/front.jpg b/lab/hardware/A100G/front.jpg deleted file mode 100644 index 7ca35cf6..00000000 Binary files a/lab/hardware/A100G/front.jpg and /dev/null differ diff --git a/lab/hardware/KVM/README.md b/lab/hardware/KVM/README.md deleted file mode 100644 index 79476ee8..00000000 --- a/lab/hardware/KVM/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Keyboard Video Mouse Switch - -KVM switches provide remote management to access and control servers. - -## HW spec - -- Part Number: MPU8032DAC-35-B4-3E -- Version: 2.12.4.26064 -- KVM dongle: MPUIQ-VMCHS - -## Management - -- serial port (SETUP) -- 1G ethernet management port - -## Docs - -[User guide](https://www.vertiv.com/4a6ff9/globalassets/shared/mergepoint-unity-kvm-over-ip-and-serial-console-switch-installeruser-guide_00.pdf) - -## Pictures - -![front](front.png) diff --git a/lab/hardware/KVM/front.png b/lab/hardware/KVM/front.png deleted file mode 100644 index b476875e..00000000 Binary files a/lab/hardware/KVM/front.png and /dev/null differ diff --git a/lab/hardware/README.md b/lab/hardware/README.md deleted file mode 100644 index 580f9c8d..00000000 --- a/lab/hardware/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# OPI Lab - -- [Cables](../cables.md) -- [LAB IPs allocation](../ips.md) and [Inventory](../ansible/inventory) -- [Server Software Setup](../server-setup.md) -- [Bill of Materials](../bom.md) -- [Physical Testbed Setup](../physical-testbed.md) -- [Running the Test Cases](../running-the-tests.md) -- [Goals and Requirments](../goals-and-requirements.md) - -## OPI Rack - -![lab cabling diagram](../images/opi-lab-cabling.drawio.svg) diff --git a/lab/hardware/TS/README.md b/lab/hardware/TS/README.md deleted file mode 100644 index 3f913bf1..00000000 --- a/lab/hardware/TS/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Serial Consoles - -IP to serial switch for remote access to the devices serial port -ACS8000 48-port unit dual AC power supply - -## HW spec - -- Part Number: ACS8048DAC-400 -- Firmware: 2.26.1.5874+902+37+17 (Nov 7 2023 - 11:53:23) - -## Management - -- 1G ethernet management port - -## Docs - -[User guide](https://www.vertiv.com/49db15/globalassets/shared/avocent-acs-8000-installer-userguide.pdf) - -## Devices connected - -- Network PDU -- 100G Switch -- KVM - -## Pictures - -![front](front.png) - -![back](back.jpg) diff --git a/lab/hardware/TS/back.jpg b/lab/hardware/TS/back.jpg deleted file mode 100644 index 8e2723d1..00000000 Binary files a/lab/hardware/TS/back.jpg and /dev/null differ diff --git a/lab/hardware/TS/front.png b/lab/hardware/TS/front.png deleted file mode 100644 index 4c507e7f..00000000 Binary files a/lab/hardware/TS/front.png and /dev/null differ diff --git a/lab/hardware/ToR/README.md b/lab/hardware/ToR/README.md deleted file mode 100644 index fcc0b9d9..00000000 --- a/lab/hardware/ToR/README.md +++ /dev/null @@ -1,111 +0,0 @@ -# Top of Rack - -ToR switch/router provides network conectivity for management and access to the lab devices. All non test traffic goes through this device. - -## HW spec - -- Part Number: Arista DCS-7280TR-48C6-R -- Hardware version: 11.04 -- Software image version: 4.22.3M - -```bash -tor>show vers -Arista DCS-7280TR-48C6-R -Hardware version: 11.04 -Serial number: SSJ18302393 -System MAC address: 2899.3a9c.3965 - -Software image version: 4.22.3M -Architecture: i686 -Internal build version: 4.22.3M-14418192.4223M -Internal build ID: a077fcd6-7c48-4b5d-9d76-bd5f7a250bd5 - -Uptime: 16 weeks, 2 days, 6 hours and 37 minutes -Total memory: 8104364 kB -Free memory: 6384448 kB -``` - -## Management - -- serial port -- 1G ethernet management port - -## gNMI - -```bash -enable -configure -management api gnmi - transport grpc openmgmt - port 5900 -``` - -and - -```bash -tor#show management api gnmi -Enabled: Yes -Server: running on port 5900, in default VRF -SSL Profile: none -``` - -test - -```bash -root@dh1:~# docker run --network host --rm ghcr.io/openconfig/gnmic get --log --username arista --password arista --insecure --address 172.22.0.5 --port 5900 --path /openconfig-interfaces:interfaces/interface[name=Management1]/state -2024/05/16 18:52:10.247944 [gnmic] version=0.37.0, commit=05a3e785, date=2024-05-13T23:27:31Z, gitURL=https://github.com/openconfig/gnmic, docs=https://gnmic.openconfig.net -2024/05/16 18:52:10.248005 [gnmic] using config file "" -2024/05/16 18:52:10.248420 [gnmic] sending gNMI GetRequest: prefix='', path='[elem:{name:"openconfig-interfaces:interfaces"} elem:{name:"interface" key:{key:"name" value:"Management1"}} elem:{name:"state"}]', type='ALL', encoding='JSON', models='[]', extension='[]' to 172.22.0.5 -2024/05/16 18:52:10.249123 [gnmic] creating gRPC client for target "172.22.0.5" -[ - { - "source": "172.22.0.5", - "time": "1970-01-01T00:00:00Z", - "updates": [ - { - "Path": "interfaces/interface[name=Management1]/state", - "values": { - "interfaces/interface/state": { - "arista-intf-augments:inactive": false, - "openconfig-interfaces:admin-status": "UP", - "openconfig-interfaces:counters": { - "in-broadcast-pkts": "52578878", - "in-discards": "0", - "in-errors": "0", - "in-multicast-pkts": "332586", - "in-octets": "4134370723", - "in-unicast-pkts": "5137160", - "out-broadcast-pkts": "43", - "out-discards": "0", - "out-errors": "0", - "out-multicast-pkts": "331867", - "out-octets": "1259190372", - "out-unicast-pkts": "5140141" - }, - "openconfig-interfaces:description": "", - "openconfig-interfaces:enabled": true, - "openconfig-interfaces:ifindex": 999001, - "openconfig-interfaces:last-change": "170595730184", - "openconfig-interfaces:loopback-mode": false, - "openconfig-interfaces:mtu": 0, - "openconfig-interfaces:name": "Management1", - "openconfig-interfaces:oper-status": "UP", - "openconfig-interfaces:type": "iana-if-type:ethernetCsmacd", - "openconfig-vlan:tpid": "openconfig-vlan-types:TPID_0X8100" - } - } - } - ] - } -] -``` - -## Docs - -[Data Sheet](https://www.arista.com/assets/data/pdf/Datasheets/7280R-DataSheet.pdf) - -## Pictures - -![front](front.png) - -![back](back.png) diff --git a/lab/hardware/ToR/arista.config b/lab/hardware/ToR/arista.config deleted file mode 100644 index 7c1ac0c3..00000000 --- a/lab/hardware/ToR/arista.config +++ /dev/null @@ -1,194 +0,0 @@ -! Command: show running-config -! device: tor (DCS-7280TR-48C6, EOS-4.22.3M) -! -! boot system flash:/EOS-4.22.3M.swi -! -dhcp server -! -transceiver qsfp default-mode 4x10G -! -hostname tor -! -spanning-tree mode mstp -! -enable password sha512 $6$kWDV2wbi4BjUTaWk$ol.SlGBWKIkxobuttOVl71a6ABb/.nzXymp3KdCMOE8WV7oS7g9qv.tK/a5y2ezAuMDcTrhwXwCFwYjOodTdB/ -no aaa root -! -username admin privilege 15 role network-admin secret sha512 $6$m4L1NvdUYdBGkgN/$AS63fLihLk3ofFmtZ8A/Coh6O9YxOB5mV6eMFPl5C3Vy3IfbPdoK/mBFHg6rr3M1rSURXH65YoejOt6IZuQKb/ -username arista secret sha512 $6$XSWQaaBNh..Tfg0L$rCVurZ3ffUZ41xGTv0jIE9TtRK95iR7vqGskQyDaRMDzZmdALmxvI4L2imKjHl6cpcY6OXiDLZpWw0h/ADUqB0 -! -interface Ethernet1 - speed forced 1000full -! -interface Ethernet2 - speed forced 1000full -! -interface Ethernet3 - speed forced 1000full -! -interface Ethernet4 - speed forced 1000full -! -interface Ethernet5 - speed forced 1000full -! -interface Ethernet6 - speed forced 1000full -! -interface Ethernet7 - speed forced 1000full -! -interface Ethernet8 - speed forced 1000full -! -interface Ethernet9 - speed forced 1000full -! -interface Ethernet10 - speed forced 1000full -! -interface Ethernet11 - speed forced 1000full -! -interface Ethernet12 - speed forced 1000full -! -interface Ethernet13 - speed forced 1000full -! -interface Ethernet14 - speed forced 1000full -! -interface Ethernet15 - speed forced 1000full -! -interface Ethernet16 - speed forced 1000full -! -interface Ethernet17 - speed forced 1000full -! -interface Ethernet18 - speed forced 1000full -! -interface Ethernet19 - speed forced 1000full -! -interface Ethernet20 - speed forced 1000full -! -interface Ethernet21 - speed forced 1000full -! -interface Ethernet22 - speed forced 1000full -! -interface Ethernet23 - speed forced 1000full -! -interface Ethernet24 - speed forced 1000full -! -interface Ethernet25 - speed forced 1000full -! -interface Ethernet26 - speed forced 1000full -! -interface Ethernet27 - speed forced 1000full -! -interface Ethernet28 - speed forced 1000full -! -interface Ethernet29 - speed forced 1000full -! -interface Ethernet30 - speed forced 1000full -! -interface Ethernet31 - speed forced 1000full -! -interface Ethernet32 - speed forced 1000full -! -interface Ethernet33 - speed forced 1000full -! -interface Ethernet34 - speed forced 1000full -! -interface Ethernet35 - speed forced 1000full -! -interface Ethernet36 - speed forced 1000full -! -interface Ethernet37 - speed forced 1000full -! -interface Ethernet38 - speed forced 1000full -! -interface Ethernet39 - speed forced 1000full -! -interface Ethernet40 - speed forced 1000full -! -interface Ethernet41 - speed forced 1000full -! -interface Ethernet42 - speed forced 1000full -! -interface Ethernet43 - speed forced 1000full -! -interface Ethernet44 - speed forced 1000full -! -interface Ethernet45 - speed forced 1000full -! -interface Ethernet46 - speed forced 1000full -! -interface Ethernet47 - speed forced 1000full -! -interface Ethernet48 - speed forced 1000full -! -interface Ethernet49/1 - speed forced 100gfull -! -interface Ethernet50/1 - speed forced 100gfull -! -interface Ethernet51/1 - speed forced 100gfull -! -interface Ethernet52/1 - speed forced 100gfull -! -interface Ethernet53/1 - speed forced 100gfull -! -interface Ethernet54/1 - shutdown - speed forced 100gfull -! -interface Management1 - ip address 172.22.0.5/16 -! -ip route 0.0.0.0/0 172.22.0.1 -! -no ip routing -! -management api gnmi - transport grpc openmgmt - port 5900 -! -end diff --git a/lab/hardware/ToR/back.png b/lab/hardware/ToR/back.png deleted file mode 100644 index d9737097..00000000 Binary files a/lab/hardware/ToR/back.png and /dev/null differ diff --git a/lab/hardware/ToR/front.png b/lab/hardware/ToR/front.png deleted file mode 100644 index 31995659..00000000 Binary files a/lab/hardware/ToR/front.png and /dev/null differ diff --git a/lab/hardware/dh1/README.md b/lab/hardware/dh1/README.md deleted file mode 100644 index 1fc266e8..00000000 --- a/lab/hardware/dh1/README.md +++ /dev/null @@ -1,151 +0,0 @@ -# DPU Host 1 - -Dell PowerEdge R650 - -## BIOS settings - -- change power to always on - -## Otel or telegraf - -Run telegraf container: - -```bash -sudo docker run -d --restart=always --network=host -v ./telegraf.d/telegraf.conf:/etc/telegraf/telegraf.conf docker.io/library/telegraf:1.29 -``` - -## Pensando - -:exclamation: register management to the ionic driver (if using standard linux drivers ) - -```bash -echo "1dd8 1004" > /sys/bus/pci/drivers/ionic/new_id -``` - -lspci - -```bash -root@dh1:~# lspci -d 1dd8:1002 -19:00.0 Ethernet controller: AMD Pensando Systems DSC Ethernet Controller -1a:00.0 Ethernet controller: AMD Pensando Systems DSC Ethernet Controller - -root@dh1:~# lspci | grep Pensando -17:00.0 PCI bridge: AMD Pensando Systems DSC2 Elba Upstream Port -18:00.0 PCI bridge: AMD Pensando Systems DSC Virtual Downstream Port -18:01.0 PCI bridge: AMD Pensando Systems DSC Virtual Downstream Port -18:02.0 PCI bridge: AMD Pensando Systems DSC Virtual Downstream Port -19:00.0 Ethernet controller: AMD Pensando Systems DSC Ethernet Controller -1a:00.0 Ethernet controller: AMD Pensando Systems DSC Ethernet Controller -1b:00.0 Ethernet controller: AMD Pensando Systems DSC Management Controller -``` - -serial and mac - -```bash -root@dh1:~# lspci -vvv -s 1b:00.0 | grep -A 19 "Vital Product Data" - Capabilities: [c0] Vital Product Data - Product Name: Pensando DSC2-100 100G 2p QSFP56 DPU - Read-only fields: - [PN] Part number: 0PCFPCA00 - [SN] Serial number: MYFLEPK31D02ZH - [EC] Engineering changes: 0 - [MN] Manufacture ID: 1028 - [V3] Vendor specific: 1.46.0-E-28 - [V4] Vendor specific: 00aecd5be7ee - [V5] Vendor specific: MKY=V0-d78e97440a382c8825cd4320627bba5d - [VA] Vendor specific: DSV1028VPDR.VER2.2 - [VB] Vendor specific: NMVPensando Systems - [VC] Vendor specific: FFV01.46.00.28 - [VD] Vendor specific: DTINIC - [VE] Vendor specific: NPY1 - [VF] Vendor specific: PMTD - [VG] Vendor specific: DCM3001FFFFFF - [VH] Vendor specific: UUID1dd8000000004000800000aecd5be7ec - [RV] Reserved: checksum good, 128 byte(s) reserved - End -``` - -drivers - -```bash -root@dh1:~# dmesg | grep ionic -[ 2.462867] integrity: Loaded X.509 cert 'pensando: ionic.ko: 1941fcb3df8deb68fe3aed35aab0f867032935c9' -[ 2.804739] ionic 0000:19:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link) -[ 2.808758] ionic 0000:19:00.0: FW: 1.46.0-E-28 -[ 3.253894] ionic 0000:1a:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link) -[ 3.260512] ionic 0000:1a:00.0: FW: 1.46.0-E-28 -[ 3.450751] ionic 0000:1a:00.0 enp26s0np0: renamed from eth1 -[ 3.459006] ionic 0000:19:00.0 enp25s0np0: renamed from eth0 -[149888.934142] ionic 0000:1b:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link) -[149888.937785] ionic 0000:1b:00.0: FW: 1.46.0-E-28 -[149889.003793] ionic 0000:1b:00.0 eth0: Link up - 1 Gbps - -root@dh1:~# ls -l /sys/class/net/*/device -lrwxrwxrwx 1 root root 0 May 29 23:10 /sys/class/net/eno12399np0/device -> ../../../0000:31:00.0 -lrwxrwxrwx 1 root root 0 May 29 23:10 /sys/class/net/eno12409np1/device -> ../../../0000:31:00.1 -lrwxrwxrwx 1 root root 0 May 31 16:28 /sys/class/net/enp25s0np0/device -> ../../../0000:19:00.0 -lrwxrwxrwx 1 root root 0 May 31 16:28 /sys/class/net/enp26s0np0/device -> ../../../0000:1a:00.0 -lrwxrwxrwx 1 root root 0 May 31 16:48 /sys/class/net/enp27s0np0/device -> ../../../0000:1b:00.0 -lrwxrwxrwx 1 root root 0 May 30 17:45 /sys/class/net/idrac/device -> ../../../1-14.3:1.0 - -root@dh1:~# ls -l /sys/class/net/*/device/driver -lrwxrwxrwx 1 root root 0 May 29 23:10 /sys/class/net/eno12399np0/device/driver -> ../../../../bus/pci/drivers/bnxt_en -lrwxrwxrwx 1 root root 0 May 29 23:10 /sys/class/net/eno12409np1/device/driver -> ../../../../bus/pci/drivers/bnxt_en -lrwxrwxrwx 1 root root 0 May 31 16:28 /sys/class/net/enp25s0np0/device/driver -> ../../../../../../bus/pci/drivers/ionic -lrwxrwxrwx 1 root root 0 May 31 16:28 /sys/class/net/enp26s0np0/device/driver -> ../../../../../../bus/pci/drivers/ionic -lrwxrwxrwx 1 root root 0 May 31 16:48 /sys/class/net/enp27s0np0/device/driver -> ../../../../../../bus/pci/drivers/ionic -lrwxrwxrwx 1 root root 0 May 29 23:10 /sys/class/net/idrac/device/driver -> ../../../../../../../bus/usb/drivers/cdc_ether -``` - -network - -```bash -root@dh1:~# ethtool -i enp25s0np0 -driver: ionic -version: 6.8.0-31-generic -firmware-version: 1.46.0-E-28 -expansion-rom-version: -bus-info: 0000:19:00.0 -supports-statistics: yes -supports-test: no -supports-eeprom-access: no -supports-register-dump: yes -supports-priv-flags: no - -root@dh1:~# devlink dev info pci/0000:19:00.0 -pci/0000:19:00.0: - driver ionic - serial_number MYFLEPK31D02ZH - versions: - fixed: - asic.id 0x0 - asic.rev 0x0 - running: - fw 1.46.0-E-28 - -root@dh1:~# ifconfig -... -enp25s0np0: flags=4099 mtu 1500 - ether 00:ae:cd:5b:e7:ec txqueuelen 1000 (Ethernet) - RX packets 0 bytes 0 (0.0 B) - RX errors 0 dropped 0 overruns 0 frame 0 - TX packets 0 bytes 0 (0.0 B) - TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 - -enp26s0np0: flags=4099 mtu 1500 - ether 00:ae:cd:5b:e7:ed txqueuelen 1000 (Ethernet) - RX packets 0 bytes 0 (0.0 B) - RX errors 0 dropped 0 overruns 0 frame 0 - TX packets 0 bytes 0 (0.0 B) - TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 - -enp27s0np0: flags=4163 mtu 1500 - inet 169.254.27.2 netmask 255.255.255.0 broadcast 169.254.27.255 - inet6 fe80::2ae:cdff:fe5b:e7ee prefixlen 64 scopeid 0x20 - ether 00:ae:cd:5b:e7:ee txqueuelen 1000 (Ethernet) - RX packets 0 bytes 0 (0.0 B) - RX errors 0 dropped 0 overruns 0 frame 0 - TX packets 280 bytes 12304 (12.3 KB) - TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 -... -``` diff --git a/lab/hardware/dh2/README.md b/lab/hardware/dh2/README.md deleted file mode 100644 index ff8ee05a..00000000 --- a/lab/hardware/dh2/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# DPU Host 2 - -HPE DL360 - -## BIOS settings - -- change power to always on - -## Otel or telegraf - -Run telegraf container: - -```bash -sudo docker run -d --restart=always --network=host -v ./telegraf.d/telegraf.conf:/etc/telegraf/telegraf.conf docker.io/library/telegraf:1.29 -``` - -## Nvidia - -lspci - -```bash -root@dh2:~# lspci | grep BlueField-2 -0f:00.0 Ethernet controller: Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 Dx network controller (rev 01) -0f:00.1 Ethernet controller: Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 Dx network controller (rev 01) -0f:00.2 DMA controller: Mellanox Technologies MT42822 BlueField-2 SoC Management Interface (rev 01) -``` - -serial - -```bash -root@dh2:~# lspci -vvv -s 0f:00.2 | grep -A 11 "Vital Product Data" - Capabilities: [48] Vital Product Data - Product Name: BlueField-2 E-Series DPU 100GbE Dual-Port QSFP56, integrated BMC, Secure Boot Enabled, Crypto Enabled, 16GB on-board DDR, 1GbE OOB management, FHHL - Read-only fields: - [PN] Part number: MBF2M516C-CECOT - [EC] Engineering changes: A9 - [V2] Vendor specific: MBF2M516C-CECOT - [SN] Serial number: MT2321XZ0KVX - [V3] Vendor specific: 1814c074ae05ee118000946daeb98570 - [VA] Vendor specific: MLX:MN=MLNX:CSKU=V2:UUID=V3:PCI=V0:MODL=BF2M516C - [V0] Vendor specific: PCIeGen4 x16 - [RV] Reserved: checksum good, 1 byte(s) reserved - End -``` - -rshim - -```bash -root@dh2:~# cat /dev/rshim0/misc -DISPLAY_LEVEL 0 (0:basic, 1:advanced, 2:log) -BOOT_MODE 1 (0:rshim, 1:emmc, 2:emmc-boot-swap) -BOOT_TIMEOUT 150 (seconds) -DROP_MODE 0 (0:normal, 1:drop) -SW_RESET 0 (1: reset) -DEV_NAME pcie-0000:0f:00.2 -DEV_INFO BlueField-2(Rev 1) -OPN_STR N/A -``` - -ssh over rshim - -```bash -root@dh2:~# ifconfig tmfifo_net0 192.168.100.1/30 up -root@dh2:~# ping 192.168.100.2 -root@dh2:~# ssh ubuntu@192.168.100.2 -``` - -flush new BFB image - -```bash -root@dh2:~# wget https://content.mellanox.com/BlueField/BFBs/Ubuntu22.04/bf-bundle-2.7.0-33_24.04_ubuntu-22.04_prod.bfb -root@dh2:~# cat bf-bundle-2.7.0-33_24.04_ubuntu-22.04_prod.bfb > /dev/rshim0/boot -``` diff --git a/lab/hardware/dh3/README.md b/lab/hardware/dh3/README.md deleted file mode 100644 index 55a9c98e..00000000 --- a/lab/hardware/dh3/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# DPU Host 3 - -HP RL300 - -## BIOS settings - -- change power to always on - -## Link speed - -an aditional service was created in order to set the nic at boot in 1G mode -otherwise it starts in 10G/25G mode and link will be down - -- run manually: - -```bash -ethtool -s enP2p1s0f0np0 speed 1000 -``` - -- or via systemctl - -```bash -systemctl edit --force --full link-speed.service -systemctl enable link-speed.service -systemctl start link-speed.service -``` - -where `link-speed.service` is: - -```ini -[Unit] -After=network.target - -[Service] -ExecStart=ethtool -s enP2p1s0f0np0 speed 1000 - -[Install] -WantedBy=default.target -``` - -## Otel or telegraf - -Run telegraf container: - -```bash -sudo docker run -d --restart=always --network=host -v ./telegraf.d/telegraf.conf:/etc/telegraf/telegraf.conf docker.io/library/telegraf:1.29 -``` diff --git a/lab/hardware/dh3/fs/etc/systemd/system/link-speed.service b/lab/hardware/dh3/fs/etc/systemd/system/link-speed.service deleted file mode 100644 index 27623962..00000000 --- a/lab/hardware/dh3/fs/etc/systemd/system/link-speed.service +++ /dev/null @@ -1,8 +0,0 @@ -[Unit] -After=network.target - -[Service] -ExecStart=/usr/local/bin/link-speed.sh - -[Install] -WantedBy=default.target \ No newline at end of file diff --git a/lab/hardware/dh3/fs/etc/usr/local/bin/link-speed.sh b/lab/hardware/dh3/fs/etc/usr/local/bin/link-speed.sh deleted file mode 100644 index 0cab645b..00000000 --- a/lab/hardware/dh3/fs/etc/usr/local/bin/link-speed.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -ethtool -s enP2p1s0f0np0 speed 1000 \ No newline at end of file diff --git a/lab/hardware/dh4/README.md b/lab/hardware/dh4/README.md deleted file mode 100644 index 28f0b40c..00000000 --- a/lab/hardware/dh4/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# DPU Host 4 - -Dell PowerEdge R760 - -## BIOS settings - -- change power to always on - -## IPU - -Intel Dayton Peak - -- to enable port forwarding for `OTEL` from `ACC` through `IMC`, run manually: - -```bash -dnf install -y socat -socat tcp-l:4317,fork,reuseaddr tcp:172.22.0.1:4317 -``` - -- or via systemctl - -```bash -systemctl edit --force --full socat-otel.service -systemctl enable socat-otel.service -systemctl start socat-otel.service -``` - -where `socat-otel.service` is: - -```ini -[Unit] -After=network.target - -[Service] -ExecStart=socat tcp-l:4317,fork,reuseaddr tcp:172.22.0.1:4317 - -[Install] -WantedBy=default.target -``` - -## Otel or telegraf - -Run telegraf container: - -```bash -sudo docker run -d --restart=always --network=host -v ./telegraf.d/telegraf.conf:/etc/telegraf/telegraf.conf docker.io/library/telegraf:1.29 -``` diff --git a/lab/hardware/dh4/fs/etc/systemd/system/socat-otel.service b/lab/hardware/dh4/fs/etc/systemd/system/socat-otel.service deleted file mode 100644 index 961fac70..00000000 --- a/lab/hardware/dh4/fs/etc/systemd/system/socat-otel.service +++ /dev/null @@ -1,8 +0,0 @@ -[Unit] -After=network.target - -[Service] -ExecStart=socat tcp-l:4317,fork,reuseaddr tcp:172.22.0.1:4317 - -[Install] -WantedBy=default.target diff --git a/lab/hardware/mgmt/README.md b/lab/hardware/mgmt/README.md deleted file mode 100644 index 8b355385..00000000 --- a/lab/hardware/mgmt/README.md +++ /dev/null @@ -1,64 +0,0 @@ -# Management server - -as of now this is merged with dh1 until new server arives - -## Roles - -- DHCP server for the lab -- GitHub self hosted runners -- OTEL, Prometheus and Grafana servers for the lab monitoring -- SZTP bootstrap server for the lab -- network gateway between private network 172.22 and the internet -- Other containers and vms for remaining services - -## BIOS settings - -- change power to always on - -## DHCP server - -run bare metal or via docker: - -```bash -sudo docker run --rm -it --network=host --privileged --restart always -v "$(PWD)"/fs/etc/dhcp/dhcpd.conf:/etc/dhcp/dhcpd.conf docker.io/networkboot/dhcpd:1.3.0 -``` - -or [compose](https://github.com/opiproject/sztp/blob/0addb57154332e7ecdebe4ba18b2633278857ef5/docker-compose.yml#L51-L68) - -## GitHub self hosted runners - -Follow GitHub [instructions](https://github.com/opiproject/opi-poc/settings/actions/runners/new?arch=x64&os=linux) - -## Ansible - -- Run `docker compose up -d semaphore` or `docker-compose up -d semaphore` from [here](../../otel). -- This will start [Ansible Semaphore](https://docs.semui.co). -- Access it via - -## Portainer - -- Run `docker compose up -d portainer` or `docker-compose up -d portainer` from [here](../../otel). -- This will start [Portainer Community Edition](https://www.portainer.io/). -- Access it via - -## Monitoring & Telemetry - -Run `docker compose up -d` or `docker-compose up -d` from [here](../../otel) - -1. [OTEL Gateway Collector](https://opentelemetry.io/docs/collector/deployment/gateway/) to aggregate telemetry from all DPUs and IPUs. -2. [Prometheus](https://prometheus.io/) Monitoring system & time series database -3. [Grafana](https://grafana.com/) Open source analytics & monitoring solution for every database. - -Acccess it via: - -1. -2. -3. - -## Paswordless - -for all servers, dpu and ipus from [here](../../ips.md) run: - -```bash - ssh-copy-id root@172.22.X.X -``` diff --git a/lab/hardware/mgmt/fs/etc/dhcp/dhcpd.conf b/lab/hardware/mgmt/fs/etc/dhcp/dhcpd.conf deleted file mode 100644 index 02111f37..00000000 --- a/lab/hardware/mgmt/fs/etc/dhcp/dhcpd.conf +++ /dev/null @@ -1,172 +0,0 @@ -# dhcpd.conf -# -# Sample configuration file for ISC dhcpd -# -# Attention: If /etc/ltsp/dhcpd.conf exists, that will be used as -# configuration file instead of this file. -# - -# option definitions common to all supported networks... -option domain-name "lab.opiproject.org"; -option domain-name-servers 8.8.8.8, 1.1.1.1; -option routers 172.22.0.1; - -option sztp-redirect-urls code 143 = text; -option sztp-redirect-urls "https://bootstrap:8080/restconf/operations/ietf-sztp-bootstrap-server:get-bootstrapping-data"; - -default-lease-time 600; -max-lease-time 7200; - -# The ddns-updates-style parameter controls whether or not the server will -# attempt to do a DNS update when a lease is confirmed. We default to the -# behavior of the version 2 packages ('none', since DHCP v2 didn't -# have support for DDNS.) -ddns-update-style none; - -# If this DHCP server is the official DHCP server for the local -# network, the authoritative directive should be uncommented. -authoritative; - -# Use this to send dhcp log messages to a different log file (you also -# have to hack syslog.conf to complete the redirection). -log-facility local7; - -# No service will be given on this subnet, but declaring it helps the -# DHCP server to understand the network topology. - -subnet 172.22.0.0 netmask 255.255.0.0 { -} - -# This is a very basic subnet declaration. - -subnet 172.22.0.0 netmask 255.255.0.0 { - range 172.22.222.1 172.22.222.254; -} - -# This are for known devices - -host npdu { - hardware ethernet 00:19:85:0f:37:aa; - fixed-address 172.22.0.10; -} - -host kvm { - hardware ethernet 00:e0:86:35:b4:3e; - fixed-address 172.22.0.20; -} - -# do not enable this yet -# host ts { -# hardware ethernet 00:e0:86:35:b8:56; -# fixed-address 172.22.0.30; -# } - -host dh1 { - hardware ethernet 04:32:01:18:ac:60; - fixed-address 172.22.1.1; -} - -host dh1idrac { - hardware ethernet c8:4b:d6:a6:b4:cb; - fixed-address 172.22.2.1; -} - -# host dh1pen { -# hardware ethernet zzzzzzzzzz; -# fixed-address 172.22.3.1; -# } - -host dh2 { - hardware ethernet 00:62:0b:a2:59:8e; - fixed-address 172.22.1.2; -} - -host dh2ilo { - hardware ethernet 5c:ed:8c:b4:41:c6; - fixed-address 172.22.2.2; -} - -host dh2bf2 { - hardware ethernet 94:6d:ae:b9:85:7e; - fixed-address 172.22.3.2; -} - -host dh2bf2bmc { - hardware ethernet 94:6d:ae:b9:85:72; - fixed-address 172.22.4.2; -} - -host dh3 { - hardware ethernet b8:3f:d2:48:ca:66; - fixed-address 172.22.1.3; -} - -host dh3ilo { - hardware ethernet 5c:ed:8c:6b:11:41; - fixed-address 172.22.2.3; -} - -host dh4 { - hardware ethernet d4:04:e6:2a:b0:20; - fixed-address 172.22.1.4; -} - -host dh4idrac { - hardware ethernet c4:5a:b1:b9:d0:45; - fixed-address 172.22.2.4; -} - -host dh4dypimc { - hardware ethernet 10:2e:00:01:b4:6f; - fixed-address 172.22.4.4; -} - -# host dh3cn106 { -# hardware ethernet zzzzzzzzzzzzzz; -# fixed-address 172.22.3.3; -# } - -host dh3cn106mcu { - hardware ethernet 02:04:9F:00:00:00; - fixed-address 172.22.4.3; -} - -host tgen1 { - hardware ethernet 7c:c2:55:25:1f:9a; - fixed-address 172.22.1.100; -} - -host tgen1bmc { - hardware ethernet 7c:c2:55:18:f1:68; - fixed-address 172.22.2.100; -} - -host arista100g { - hardware ethernet 44:4C:A8:97:B5:CE; - fixed-address 172.22.1.250; -} - -host tor { - hardware ethernet 28:99:3a:9c:39:65; - fixed-address 172.22.1.250; -} - - - - -# PXE BOOT -# allow booting; -# allow bootp; -# option arch code 93 = unsigned integer 16; -# host ubuntu { -# hardware ethernet xx:xx:xx:xx:xx:xx; -# if option arch = 00:07 { -# filename "boot/bootx64.efi"; -# } else { -# filename "boot/pxelinux.0"; -# } -# next-server x.x.x.x; -# fixed-address x.x.x.x; -# } - - diff --git a/lab/hardware/mgmt/fs/etc/hostname b/lab/hardware/mgmt/fs/etc/hostname deleted file mode 100644 index 10a92cbb..00000000 --- a/lab/hardware/mgmt/fs/etc/hostname +++ /dev/null @@ -1 +0,0 @@ -mgmt diff --git a/lab/hardware/mgmt/fs/etc/nftables.conf b/lab/hardware/mgmt/fs/etc/nftables.conf deleted file mode 100644 index 8f4a7b27..00000000 --- a/lab/hardware/mgmt/fs/etc/nftables.conf +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/sbin/nft -f - -flush ruleset - -table inet filter { - chain input { - type filter hook input priority filter; policy accept; - } - - chain forward { - type filter hook forward priority filter; policy accept; - } - - chain output { - type filter hook output priority filter; policy accept; - } -} -table inet nat { - chain prerouting { - type nat hook prerouting priority dstnat; policy accept; - } - - chain postrouting { - type nat hook postrouting priority srcnat; policy accept; - ip saddr 172.22.0.0/16 masquerade - } -} diff --git a/lab/hardware/nPDU/README.md b/lab/hardware/nPDU/README.md deleted file mode 100644 index d12d13e9..00000000 --- a/lab/hardware/nPDU/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Network controled power delivery unit - -This device is intended to provide remote capability to cold boot the lab devices -When all hopes are lost bring down and up the power and you may recover your device - -## HW spec - -Upgradeable Switched Outlet Level Monitored rPDUs provide reliable power distribution to critical IT equipment while delivering individual outlet level control and a comprehensive view of outlet level power usage via remote network access. - -- Part Number: VP7N3001 -- Model Number: MNU3EGB1-24CF18-3TL6A0A10-S -- Version: 5.10.8 - -## Management - -- serial port -- 1G ethernet management port - -## Docs - -[User guide](https://www.geistglobal.com/sites/all/files/Manuals/vm1221_590-2196-501a.pdf) - -## Pictures - -![front](front.png) - -![interface](interface.png) diff --git a/lab/hardware/nPDU/front.png b/lab/hardware/nPDU/front.png deleted file mode 100644 index cb9a8af9..00000000 Binary files a/lab/hardware/nPDU/front.png and /dev/null differ diff --git a/lab/hardware/nPDU/interface.png b/lab/hardware/nPDU/interface.png deleted file mode 100644 index e77fc1f5..00000000 Binary files a/lab/hardware/nPDU/interface.png and /dev/null differ diff --git a/lab/hardware/tgen1/README.md b/lab/hardware/tgen1/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/lab/images/add-vpn-windows.png b/lab/images/add-vpn-windows.png deleted file mode 100644 index 89c4f98d..00000000 Binary files a/lab/images/add-vpn-windows.png and /dev/null differ diff --git a/lab/images/f5-vpn-msft-app.png b/lab/images/f5-vpn-msft-app.png deleted file mode 100644 index c78a8472..00000000 Binary files a/lab/images/f5-vpn-msft-app.png and /dev/null differ diff --git a/lab/images/opi-lab-cabling.drawio.svg b/lab/images/opi-lab-cabling.drawio.svg deleted file mode 100644 index 8f12c3e1..00000000 --- a/lab/images/opi-lab-cabling.drawio.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
IP
VGA
USB
USB
P1
P2
NIC1
SW TGEN 1
RAX XS4-11S3-10G
IPMI
P1
P2
NIC2
tgen1
172.22.1.100
172.22.2.100
18
IP
IP
13
15
17
19
21
23
14
16
20
22
24
1
3
5
7
2
4
6
8
9
11
10
12
29
31
30
32
25
27
26
28
TS
18
IP
COM
13
15
17
19
21
23
14
16
20
22
24
1
3
5
7
2
4
6
8
9
11
10
12
ToR
KVM
DO NOT USE, RESERVED FOR AIRFLOW
DO NOT USE, RESERVED FOR AIRFLOW
1
2
3
4
5
6
7
8
9
10
11
12
EMPTY
13
EMPTY
14
EMPTY
15
EMPTY
16
EMPTY
17
EMPTY
18
EMPTY
19
20
21
DO NOT USE, RESERVED FOR AIRFLOW
22
DO NOT USE, RESERVED FOR AIRFLOW
23
24
EMPTY
25
EMPTY
26
EMPTY
27
EMPTY
28
EMPTY
29
EMPTY
30
EMPTY
31
EMPTY
32
EMPTY
33
EMPTY
34
EMPTY
35
EMPTY
36
EMPTY
37
EMPTY
38
EMPTY
39
EMPTY
40
EMPTY
41
42
DO NOT USE, RESERVED FOR AIRFLOW
43
DO NOT USE, RESERVED FOR AIRFLOW
44
NETWORK PDU
COM
IP
53
54
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
55
56
49
50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
51
52
Arista DCS-7280CR-48-F
172.22.1.250
IP
SFP28
VGA
USB
USB
P1
P2
DPU
IP
SERVER DPU HOST 1
IP
SFP28
VGA
USB
USB
P1
P2
DPU
IP
SERVER DPU HOST 2
IP
SFP28
VGA
USB
USB
SERVER DPU HOST 3
Vertiv Avocent 8032DAC
Vertiv Avocent ACS8000
18
13
15
17
19
21
23
14
16
20
22
24
1
3
5
7
2
4
6
8
9
11
10
12
29
31
30
32
25
27
26
28
IP
IP
COM
USB
USB
LAN 2
LAN 1
MOD
SETUP
PDU1
PDU2
inf
USB
USB
USB
USB
VGA
COM
IP
AMD/Pensando DSC2-100
Nvidia BlueField2
24
23
22
21
20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
.
172.22.0.20
Vertiv Geist GU2 rack PDU VP7N3001
172.22.0.10
172.22.0.1
172.22.1.2
172.22.1.3
172.22.3.1
172.22.3.2
iDRAC
iLO
iLO
dh1
dh2
dh3
18
COM
IP
13
15
17
19
21
23
14
16
20
22
24
1
3
5
7
2
4
6
8
9
11
10
12
old ToR unconnected
49
50
51
52
53
54
42
37
39
41
43
45
47
38
40
44
46
48
25
27
29
31
26
28
30
32
33
35
34
36
CON
FAIL
IP
IP
MGMT
USB
IP
IP
IP
IP
IP
IP
IP
IP
IP
IP
CON
FAIL
IP
IP
MGMT
USB
IP
IP
IP
IP
IP
IP
IP
IP
IP
IP
F5A
F5B
.
.
.
F5 BIGI-IP i4000
F5 BIGI-IP i4000
DCS-7280TR-48C6-R
172.22.0.5
P1
DPU
IP
Marvell CN106
USB
USB
USB
172.22.3.3
internet
IP
VGA
USB
USB
SERVER DPU HOST 4
Dell R760
172.22.1.4
iDRAC
dh4
P1
DPU
IP
Intel DYP
USB
172.22.3.4
P2
172.22.2.2
172.22.2.1
Dell PowerEdge R650
HPE DL360 Gen10+
172.22.2.3
172.22.2.4
HP RL300 Gen11
\ No newline at end of file diff --git a/lab/images/opi-rack-phase1.svg b/lab/images/opi-rack-phase1.svg deleted file mode 100644 index 19006f01..00000000 --- a/lab/images/opi-rack-phase1.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -123456789101112
ToR
ToR
KVM
KVM
TS
TS
PDU1
PDU1
PDU2
PDU2
Virt
Virt
Test SW
Test SW
DPU Host 1
DPU Host 1
DPU Host 2
DPU Host 2
Tgen
Tgen
Storage
Storage
DPU Host 3
DPU Host 3
Text is not SVG - cannot display
\ No newline at end of file diff --git a/lab/ips.md b/lab/ips.md deleted file mode 100644 index 1a1a0f56..00000000 --- a/lab/ips.md +++ /dev/null @@ -1,55 +0,0 @@ -# IP Allocations - -Proposal is to use 172.22.0.0/16 network for all ip addresing in the lab, and have 1 gateway that will handle internet access and be entry point for access in the lab. - -## Code duplication - -:exclamation: with [ansible inventory](./ansible/inventory) - -## Device types ip allocation - -| IR Range | Purpose | -|----------------- |--------------------------------------------------------- | -| 172.22.0.0/24 | All management devices like, gateway, kvm, ts, pdu, ... | -| 172.22.1.0/24 | All servers, dpu hosts, test, ... | -| 172.22.2.0/24 | All iDRAC, iLO, BMC... | -| 172.22.3.0/24 | All DPUs | -| 172.22.4.0/24 | All BMC of the DPU | -| 172.22.222.0/24 | dhcp allocated to any unknown device | -| 172.22.254.0/24 | VPN Lease Pool -- IPs dynamically assigned to VPN clients | - -## Per Device IP allocation - -| IPv4 | IPv6 | DEVICE | Helpful URLs | -|--------------------------- |------ |----------------- |----------------| -| public ip / 172.22.0.1 | | gateway/mgmt | and | -| 172.22.0.5 | | ToR | | -| 172.22.0.10 | | Network PDU | | -| 172.22.0.20 | | KVM | | -| 172.22.0.30 | | TS | | -| 172.22.0.226 | | F5 VPN Floating | | -| 172.22.0.227 | | F5 BIG-IP-01 | | -| 172.22.0.228 | | F5 VPN Floating | | -| | | | | -| 172.22.1.1 | | DPU Host 1 | | -| 172.22.2.1 | | DH1 iDRAC | | -| 172.22.3.1 | | AMD DSC200 | | -| | | | | -| 172.22.1.2 | | DPU Host 2 | | -| 172.22.2.2 | | DH2 iLO | | -| 172.22.3.2 | | Nvidia BF2 | | -| 172.22.4.2 | | Nvidia BF2 bmc | | -| | | | | -| 172.22.1.3 | | DPU Host 3 | | -| 172.22.2.3 | | DH3 iLO | | -| 172.22.3.3 | | Marvell CN106 | | -| 172.22.4.3 | | Marvell CN106 bmc | | -| | | | | -| 172.22.1.4 | | DPU Host 4 | | -| 172.22.2.4 | | DH4 iDRAC | | -| 172.22.4.4 | | Intel DYP imc | | -| | | | | -| 172.22.1.100 | | Tgen1 | | -| 172.22.2.100 | | Tgen1 bmc | | -| | | | | -| 172.22.1.250 | | Arista 100G | | diff --git a/lab/otel.md b/lab/otel.md deleted file mode 100644 index 122bfdda..00000000 --- a/lab/otel.md +++ /dev/null @@ -1,154 +0,0 @@ -# Monitoring & Telemetry - -Took from - -![OPI Telemetry Deploy Option](https://github.com/opiproject/otel/blob/main/doc/dpu-otel.png) - -## Standartization - -OPI standardized on OTEL, but is agnostic to actual collector or agent implementation that each vendor decides to run inside. - -For example, Nvidia is using [Nvidia Doca Telemetry Service](https://docs.nvidia.com/doca/sdk/nvidia+doca+telemetry+service+guide/index.html), unfortanutely it does not support OTEL as of writing this paragraph. - -## On DPUs and IPUs - -### Configuration - -Create `telegraf.conf` file, see example [here](./telegraf.d/telegraf.conf.bf2) - -- change `172.22.0.1` in `outputs.opentelemetry` to the correct management server name/ip -- change `192.168.240.1` and credentails to the internal DPU/IPU AMC/BMC for redfish collection - -### Service - -Run telegraf container: - -```bash -sudo docker run -d --restart=always --network=host -v ./telegraf.d/telegraf.conf.bf2:/etc/telegraf/telegraf.conf docker.io/library/telegraf:1.29 -``` - -### Optional SPDK - -To monitor [SPDK](https://spdk.io/) storage metrics, make sure correct service is running: - -```bash -systemctl stop mlnx_snap -systemctl start spdk_tgt -``` - -And few block devices exist to monitor, like: - -```bash -spdk_rpc.py bdev_malloc_create -b Malloc0 64 512 -spdk_rpc.py bdev_malloc_create -b Malloc1 64 512 -``` - -And [Proxy](https://github.com/spdk/spdk/blob/v24.01.x/scripts/rpc_http_proxy.py) script is running: - -```bash -# TODO: make it a service -spdk_rpc_http_proxy.py 0.0.0.0 9009 spdkuser spdkpass -``` - -And add this to your config file: - -```ini -[[inputs.http]] - urls = ["http://localhost:9009"] - headers = {"Content-Type" = "application/json"} - method = "POST" - username = "spdkuser" - password = "spdkpass" - body = '{"id":1, "method": "bdev_get_iostat"}' - data_format = "json" - name_override = "spdk" - json_strict = true - tag_keys = ["name"] - json_query = "result.bdevs" -``` - -### Optional Temperature - -For regular Servers, add to your config file: - -```ini -[[inputs.temp]] - # no configuration -``` - -For `Nvidia BlueField` cards, to monitor temperature, add to your telegraf config file: - -```ini -[[inputs.file]] - files = ["/run/emu_param/bluefield_temp"] - name_override = "temp" - value_field_name="temp" - data_format = "value" - data_type = "integer" - file_tag = "sensor" -``` - -and add to your docker run command: - -```text --v /run/emu_param:/run/emu_param -``` - -and make sure emulation service is running: - -```bash -systemctl start set_emu_param -``` - -For `Intel MEV` cards the temperature is on the ICC chip, no easy access to it: - -```ini -[[inputs.exec]] - commands = ["iset-cli get-temperature"] - name_override = "temp" - data_format = "json" -``` - -## On Management server - -See management server details [here](../hardware/mgmt) - -### Run - -Run `docker compose up -d` or `docker-compose up -d` - -:exclamation: `docker-compose` is deprecated. For details, see [Migrate to Compose V2](https://docs.docker.com/compose/migrate/). - -This will start those services: - -1. [OTEL Gateway Collector](https://opentelemetry.io/docs/collector/deployment/gateway/) to aggregate telemetry from all DPUs and IPUs. -2. [Prometheus](https://prometheus.io/) Monitoring system & time series database -3. [Grafana](https://grafana.com/) Open source analytics & monitoring solution for every database. - -### Otel Gateway Collector - -1. - health check -2. - my own metrics -3. - real metrics - -### Prometheus - -1. Open to explore Prometheus UI -2. or via API examples: - -```bash -curl --fail http://172.22.0.1:9091/api/v1/query?query=mem_free | grep mem_free -curl --fail http://172.22.0.1:9091/api/v1/query?query=cpu_usage_user | grep cpu_usage_user -curl --fail http://172.22.0.1:9091/api/v1/query?query=spdk_num_read_ops | grep spdk_num_read_ops -curl --fail http://172.22.0.1:9091/api/v1/query?query=nstat_TcpActiveOpens | grep nstat_TcpActiveOpens -curl --fail http://172.22.0.1:9091/api/v1/query?query=redfish_power_powercontrol_interval_in_min | grep redfish_power_powercontrol_interval_in_min -``` - -### Grafana - -1. Open to explore Grafana UI -2. or via API examples: - -```bash -curl -s http://172.22.0.1:3000/api/datasources | jq . -``` diff --git a/lab/physical-testbed.md b/lab/physical-testbed.md deleted file mode 100644 index 46dd5cd8..00000000 --- a/lab/physical-testbed.md +++ /dev/null @@ -1,5 +0,0 @@ -# OPI Testbed Topology - -## Cabling diagram - -![lab cabling diagram](./images/opi-lab-cabling.drawio.svg) diff --git a/lab/running-the-tests.md b/lab/running-the-tests.md deleted file mode 100644 index 6750e520..00000000 --- a/lab/running-the-tests.md +++ /dev/null @@ -1,12 +0,0 @@ -# OPI test frmework - -will be using [pytest](https://github.com/opiproject/pydpu) - -## to run the test cases - -```Shell -docker run --rm --network host \ - --mount src=./,target=/opi/test-cases,type=bind \ - --it opi/test-framework bash -pytest /opi/test-cases/test_sample.py -``` diff --git a/lab/server-setup.md b/lab/server-setup.md deleted file mode 100644 index e7933fcb..00000000 --- a/lab/server-setup.md +++ /dev/null @@ -1,155 +0,0 @@ -# Prepare Server - -## Automation - -Then run the [playbook](./ansible) to automate all the lab steps: - -```bash -ansible-playbook -i inventory setup.yml -``` - -## Manual - -* Install Ubuntu[^1] 22.04 x64 on the server. ([ubuntu-22.04.1-live-server-amd64.iso](https://releases.ubuntu.com/22.04/)) - * select all default options (unless otherwise noted bellow) - * on disk setup: disable LVM (optional) - * on profile setup: put name, servername, username, password all as `opi` for example purposes - * on ssh setup: enable `install OpenSSH server` -* Install Ubuntu prerequisites - -```Shell -sudo apt -y update -sudo apt -y upgrade -sudo apt -y autoremove -sudo apt -y install \ - python3 \ - python3-pip \ - python-is-python3 \ - net-tools \ - curl \ - git \ - make -sudo apt -y install ubuntu-desktop (TODO: remove this depedency) -``` - -* install Docker (all credits to [Docker manual](https://docs.docker.com/engine/install/ubuntu/) ) - -```Shell -sudo apt-get -y remove docker docker-engine docker.io containerd runc -sudo apt-get update -sudo apt-get -y install \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg-agent \ - gnupg \ - lsb-release \ - software-properties-common -sudo mkdir -p /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null -sudo apt-get update -sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-compose-plugin -sudo docker run hello-world -``` - -* add your user to docker group - -```Shell -sudo usermod -aG docker $USER -``` - -* enable root (optional) - -```Shell - sudo sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/g" /etc/ssh/sshd_config - echo 'root:opi' | sudo chpasswd - sudo systemctl restart sshd -``` - -* setup management port configuration using this sample `/etc/netplan/00-installer-config.yaml`: - -```code - --- - network: - ethernets: - ens160: - dhcp4: false - dhcp6: false - bridges: - br1: - interfaces: [ens160] - addresses: [10.36.118.210/24] - routes: - - to: default - via: 10.36.118.1 - mtu: 1500 - nameservers: - addresses: [4.4.4.4, 8.8.8.8] - parameters: - stp: false - forward-delay: 0 - max-age: 0 - dhcp4: false - dhcp6: false - version: 2 -``` - -* check the yaml file is ok (optional) - -```Shell -sudo apt -y install yamllint -yamllint /etc/netplan/00-installer-config.yaml -``` - -* reboot - * ensure networking is ok - * this is needed also for the permissions to be update, otherwise next step will fail - -## DHCP Server - -```Shell -apt-get install -y isc-dhcp-server -systemctl enable isc-dhcp-server -systemctl start isc-dhcp-server - -git clone https://github.com/opiproject/opi-poc.git -cp ./opi-poc/lab/hardware/mgmt/fs/etc/dhcp/dhcpd.conf /etc/dhcp/dhcpd.conf - -systemctl restart isc-dhcp-server -``` - -to see new devices use `cat /var/lib/dhcp/dhcpd.leases` - -## Others - -* install ansible - -```bash -sudo apt -y install ansible-core -``` - -* Fix pasword-less access - -```bash -ssh-keygen -ssh-copy-id 172.22.X.X -``` - -## Testing - -* clone the `opiproject/testing` repository into your working directory: - -```Shell -# TBD right now work is happening under opi-poc repo -git clone https://github.com/opiproject/testing -``` - -* build container - -```Shell -docker build --no-cache --tag opi/test-framework:latest ./testing/framework -docker tag opi/test-framework:latest opi/test-framework:1.0.0 # we can chose a versioning schema for the containers -``` diff --git a/lab/sztp.md b/lab/sztp.md deleted file mode 100644 index 83502986..00000000 --- a/lab/sztp.md +++ /dev/null @@ -1,68 +0,0 @@ -# Secure Zero Touch Provisioning (sZTP) - -Took from - -## Run on Management server - -Start Bootstrap and Web servers from [compose](./docker-compose.yml): - -```bash -docker compose up -d -``` - -Add SZTP options to your DHCP server [config](./hardware/mgmt/fs/etc/dhcp/dhcpd.conf), [for example](https://github.com/opiproject/sztp/blob/main/dhcp/dhcpd.conf.template): - -```bash -$ grep sztp /etc/dhcp/dhcpd.conf -option sztp-redirect-urls code 143 = text; -option sztp-redirect-urls "https://bootstrap:8080/restconf/operations/ietf-sztp-bootstrap-server:get-bootstrapping-data"; -``` - -Extract certificates from Bootstrap server: - -```bash -docker compose cp bootstrap:/opi.pem /tmp/opi.pem -docker compose cp bootstrap:/tmp/sztpd-simulator/pki/client/end-entity/my_cert.pem /tmp/opi_cert.pem -docker compose cp bootstrap:/tmp/sztpd-simulator/pki/client/end-entity/private_key.pem /tmp/opi_private_key.pem -``` - -Copy extracted certificates to DPUs: - -```bash -scp /tmp/opi*.pem root@172.22.3.2:/mnt/ -``` - -## Run on DPUs - -Add sztp option to the dhcp client, [example](https://github.com/opiproject/sztp/blob/main/dhcp/dhclient.conf): - -```bash -root@bf2:~# grep sztp /etc/dhcp/dhclient.conf -option sztp-redirect-urls code 143 = text; -request subnet-mask, broadcast-address, time-offset, routers, sztp-redirect-urls, -``` - -Make sure lease file received the correct option: - -```bash -root@bf2:~# DHCLIENT_LEASE_FILE=/var/lib/NetworkManager/dhclient-aa93b667-6aac-3804-91e9-4958e07fdb2f-oob_net0.lease -root@bf2:~# grep sztp ${DHCLIENT_LEASE_FILE} - option sztp-redirect-urls "https://bootstrap:8080/restconf/operations/ietf-sztp-bootstrap-server:get-bootstrapping-data"; -``` - -Add static hostname resolution per certificate dns limitation: - -```bash -# add hostnames per certificate dns generation -root@bf2:~# grep bootstrap /etc/hosts -172.22.0.1 bootstrap web -``` - -Finally run SZTP agent/client: - -```bash -docker run --rm -it --network=host -v /mnt/:/mnt \ - --mount type=bind,source=${DHCLIENT_LEASE_FILE},target=/var/lib/dhclient/dhclient.leases \ - ghcr.io/opiproject/opi-sztp-client:main \ - /opi-sztp-agent daemon --bootstrap-trust-anchor-cert /mnt/opi.pem --device-end-entity-cert /mnt/opi_cert.pem --device-private-key /mnt/opi_private_key.pem -``` diff --git a/lab/sztp/my-boot-image.img b/lab/sztp/my-boot-image.img deleted file mode 100644 index 43bf08fa..00000000 Binary files a/lab/sztp/my-boot-image.img and /dev/null differ diff --git a/lab/telegraf.d/arista.conf b/lab/telegraf.d/arista.conf deleted file mode 100644 index b6401f76..00000000 --- a/lab/telegraf.d/arista.conf +++ /dev/null @@ -1,62 +0,0 @@ -[[inputs.gnmi]] - ## Address and port of the GNMI GRPC server - addresses = ["172.22.0.5:5900", "172.22.1.250:5900"] - - ## credentials - username = "arista" - password = "arista" - - ## redial in case of failures after - redial = "10s" - - ## Guess the path-tag if an update does not contain a prefix-path - ## If enabled, the common-path of all elements in the update is used. - guess_path_tag = true - - [[inputs.gnmi.subscription]] - ## Name of the measurement - name = "interfaces" - origin = "openconfig" - path = "/interfaces/interface/state" - subscription_mode = "TARGET_DEFINED" - sample_interval = "10s" - - [[inputs.gnmi.subscription]] - ## Name of the measurement - name = "openconfig_bgp" - origin = "openconfig" - path = "/network-instances/network-instance/protocols/protocol/bgp/" - subscription_mode = "TARGET_DEFINED" - sample_interval = "10s" - -[[inputs.gnmi]] - ## Address and port of the GNMI GRPC server - addresses = ["172.22.1.250:5900"] - - ## credentials - username = "arista" - password = "arista" - - ## redial in case of failures after - redial = "10s" - - [[inputs.gnmi.subscription]] - name = "eos_bgp" - origin = "eos_native" - subscription_mode = "TARGET_DEFINED" - path = "/Sysdb/routing/bgp/export/" - sample_interval = "10s" - - [[inputs.gnmi.subscription]] - name = "CPU" - origin = "eos_native" - subscription_mode = "TARGET_DEFINED" - path = "/Kernel/proc/cpu/utilization/total" - sample_interval = "10s" - - [[inputs.gnmi.subscription]] - name = "RAM" - origin = "eos_native" - subscription_mode = "TARGET_DEFINED" - path = "/Kernel/proc/meminfo" - sample_interval = "10s" diff --git a/lab/telegraf.d/hostmetrics.conf b/lab/telegraf.d/hostmetrics.conf deleted file mode 100644 index 360664d7..00000000 --- a/lab/telegraf.d/hostmetrics.conf +++ /dev/null @@ -1,18 +0,0 @@ -[[inputs.cpu]] - percpu = true - totalcpu = true - collect_cpu_time = false - report_active = false - -[[inputs.mem]] - # no configuration - -[[inputs.nstat]] - # no configuration - -[[inputs.disk]] - ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] - -[[inputs.temp]] - # no configuration - # for DPU cards this is not applicable, see readme.md below diff --git a/lab/telegraf.d/output.conf b/lab/telegraf.d/output.conf deleted file mode 100644 index 73b69cbb..00000000 --- a/lab/telegraf.d/output.conf +++ /dev/null @@ -1,6 +0,0 @@ -[[outputs.file]] - files = ["stdout"] - data_format = "influx" - -[[outputs.opentelemetry]] - service_address = "172.22.0.1:4317" diff --git a/lab/telegraf.d/redfish.conf b/lab/telegraf.d/redfish.conf deleted file mode 100644 index f0e15d47..00000000 --- a/lab/telegraf.d/redfish.conf +++ /dev/null @@ -1,6 +0,0 @@ -[[inputs.redfish]] - address = "https://192.168.240.1" - username = "root" - password = "123456" - computer_system_id="XXX" - insecure_skip_verify = true diff --git a/lab/telegraf.d/telegraf.conf.bf2 b/lab/telegraf.d/telegraf.conf.bf2 deleted file mode 100644 index 63fcacdc..00000000 --- a/lab/telegraf.d/telegraf.conf.bf2 +++ /dev/null @@ -1,51 +0,0 @@ -# Also check https://docs.nvidia.com/doca/sdk/nvidia+doca+telemetry+service+guide/index.html - -[[inputs.redfish]] - address = "https://172.22.4.2" - username = "root" - password = "NvidiaBf2#Pass" - computer_system_id="Bluefield" - insecure_skip_verify = true - -[[inputs.http]] - urls = ["http://localhost:9009"] - headers = {"Content-Type" = "application/json"} - method = "POST" - username = "spdkuser" - password = "spdkpass" - body = '{"id":1, "method": "bdev_get_iostat"}' - data_format = "json" - name_override = "spdk" - json_strict = true - tag_keys = ["name"] - json_query = "result.bdevs" - -[[inputs.cpu]] - percpu = true - totalcpu = true - collect_cpu_time = false - report_active = false - -[[inputs.mem]] - # no configuration - -[[inputs.nstat]] - # no configuration - -[[inputs.file]] - files = ["/run/emu_param/bluefield_temp"] - name_override = "temp" - value_field_name="temp" - data_format = "value" - data_type = "integer" - file_tag = "sensor" - -[[inputs.infiniband]] - # no configuration - -[[outputs.file]] - files = ["stdout"] - data_format = "influx" - -[[outputs.opentelemetry]] - service_address = "172.22.0.1:4317" diff --git a/lab/telegraf.d/telegraf.conf.mev b/lab/telegraf.d/telegraf.conf.mev deleted file mode 100644 index 3df3db38..00000000 --- a/lab/telegraf.d/telegraf.conf.mev +++ /dev/null @@ -1,23 +0,0 @@ -[[inputs.cpu]] - percpu = true - totalcpu = true - collect_cpu_time = false - report_active = false - -[[inputs.mem]] - # no configuration - -[[inputs.nstat]] - # no configuration - -#[[inputs.exec]] -# commands = ["ssh root@192.168.0.1 iset-cli get-temperature"] -# name_override = "temp" -# data_format = "json" - -[[outputs.file]] - files = ["stdout"] - data_format = "influx" - -[[outputs.opentelemetry]] - service_address = "192.168.0.1:4317" diff --git a/lab/telegraf.d/telegraf.conf.mrv b/lab/telegraf.d/telegraf.conf.mrv deleted file mode 100644 index f4aa7f74..00000000 --- a/lab/telegraf.d/telegraf.conf.mrv +++ /dev/null @@ -1,66 +0,0 @@ -# Read metrics about cpu usage -[[inputs.cpu]] - ## Whether to report per-cpu stats or not - percpu = true - ## Whether to report total system cpu stats or not - totalcpu = true - ## If true, collect raw CPU time metrics - collect_cpu_time = false - ## If true, compute and report the sum of all non-idle CPU states - ## NOTE: The resulting 'time_active' field INCLUDES 'iowait'! - report_active = false - ## If true and the info is available then add core_id and physical_id tags - core_tags = false - -# Diskio Input Plugin -[[inputs.diskio]] - -# Disk Input Plugin -[[inputs.disk]] - -# Internal Input Plugin -[[inputs.internal]] - -# Kernel Input Plugin -[[inputs.kernel]] - -# Linux Sysctl FS Input Plugin -[[inputs.linux_sysctl_fs]] - -# Mem Input Plugin -[[inputs.mem]] - -# Nstat Input Plugin -[[inputs.nstat]] - -# Net Input Plugin -[[inputs.net]] - -# Netstat Input Plugin -[[inputs.netstat]] - -# Processes Input Plugin -[[inputs.processes]] - -# Sensors Input Plugin -[[inputs.sensors]] - -# Swap Input Plugin -[[inputs.swap]] - -# System Input Plugin -[[inputs.system]] - -# ZFS Input Plugin -[[inputs.zfs]] - poolMetrics = true - -# Docker Input Plugin -[[inputs.docker]] - -[[outputs.file]] - files = ["stdout"] - data_format = "influx" - -[[outputs.opentelemetry]] - service_address = "172.22.0.1:4317"