= hirsute verification =
ubuntu@blanka:~/nvidia-dgx-2/tests$ cat /proc/version
Linux version 5.11.0-42-generic (buildd@lgw01-amd64-041) (gcc (Ubuntu 10.3.0-1ubuntu1) 10.3.0, GNU ld (GNU Binutils for Ubuntu) 2.36.1) #46-Ubuntu SMP Fri Nov 26 12:04:17 UTC 2021
ubuntu@blanka:~/nvidia-dgx-2/tests$ ./nvidia-peermem-test.sh 
+ export DEBCONF_FRONTEND=noninteractive
+ DEBCONF_FRONTEND=noninteractive
+ export DEBIAN_PRIORITY=critical
+ DEBIAN_PRIORITY=critical
+ SERVER_IFACE=enp148s0
+ SERVER_IP=192.168.5.1/24
+ SERVER_IB_BDF=0000:4b:00.0
+ CLIENT_IFACE=enp18s0
+ CLIENT_IP=192.168.5.2/24
+ CLIENT_IB_BDF=0000:ba:00.0
+ trap cleanup EXIT
+ sudo service unattended-upgrades stop
+ install_cuda_perftest
+ local release
+ local components
+ dpkg-query -W -f '${Version}' perftest
+ grep -q '+cuda.1$'
+ return
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_0
++++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband/mlx5_0
+++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband
++ basename 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0
+ bdf=0000:0c:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_1
++++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband/mlx5_1
+++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband
++ basename 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0
+ bdf=0000:12:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_2
++++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband/mlx5_2
+++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband
++ basename 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0
+ bdf=0000:4b:00.0
+ case "$bdf" in
++ basename /sys/class/infiniband/mlx5_2
+ server_ib_dev=mlx5_2
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_3
++++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband/mlx5_3
+++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband
++ basename 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0
+ bdf=0000:54:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_4
++++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband/mlx5_4
+++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband
++ basename 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0
+ bdf=0000:8d:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_5
++++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband/mlx5_5
+++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband
++ basename 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0
+ bdf=0000:94:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_6
++++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband/mlx5_6
+++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband
++ basename 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0
+ bdf=0000:ba:00.0
+ case "$bdf" in
++ basename /sys/class/infiniband/mlx5_6
+ client_ib_dev=mlx5_6
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_7
++++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband/mlx5_7
+++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband
++ basename 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0
+ bdf=0000:cc:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_8
++++ dirname 
../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband/mlx5_8
+++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband
++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0
+ bdf=0000:e1:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_9
++++ dirname 
../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband/mlx5_9
+++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband
++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1
+ bdf=0000:e1:00.1
+ case "$bdf" in
+ '[' -z mlx5_6 ']'
+ '[' -z mlx5_2 ']'
+ sudo rdma system set netns exclusive
+ sudo ip netns add peermemclient
+ sudo rdma dev set mlx5_6 netns peermemclient
+ sudo ip netns exec peermemclient ip link set dev lo up
+ sudo ip link set netns peermemclient enp18s0
+ sudo ip netns exec peermemclient ip addr add dev enp18s0 192.168.5.2/24
+ sudo ip netns exec peermemclient ip link set dev enp18s0 up
+ sudo ip addr add dev enp148s0 192.168.5.1/24
+ sudo ip link set dev enp148s0 up
+ sudo modprobe ib_umad
+ sudo modprobe nvidia-peermem
+ sudo_apt install -y opensm
+ sudo --preserve-env=DEBCONF_FRONTEND,DEBIAN_PRIORITY apt install -y opensm
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
opensm is already the newest version (3.3.23-2).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
+ sudo service opensm start
+ use_cuda_needs_devid
+ ib_write_bw --help
+ grep use_cuda=
      --use_cuda=<cuda device id> Use CUDA specific device for GPUDirect RDMA testing
+ return 0
+ server_use_cuda_arg=--use_cuda=0
+ client_use_cuda_arg=--use_cuda=1
+ srvpid=7209
+ sleep 5
+ sudo ib_write_bw -a -d mlx5_2 --use_cuda=0

************************************
* Waiting for client to connect... *
************************************
+ sudo ip netns exec peermemclient ib_write_bw -a -d mlx5_6 192.168.5.1 --use_cuda=1
initializing CUDA
initializing CUDA
Listing all CUDA devices in system:
CUDA device 0: PCIe address is 07:00
CUDA device 1: PCIe address is 0F:00
CUDA device 2: PCIe address is 47:00
CUDA device 3: PCIe address is 4E:00
CUDA device 4: PCIe address is 87:00
CUDA device 5: PCIe address is 90:00
CUDA device 6: PCIe address is B7:00
CUDA device 7: PCIe address is BD:00

Picking device No. 1
[pid = 7216, dev = 1] device name = [NVIDIA A100-SXM4-40GB]
creating CUDA Ctx
Listing all CUDA devices in system:
CUDA device 0: PCIe address is 07:00
CUDA device 1: PCIe address is 0F:00
CUDA device 2: PCIe address is 47:00
CUDA device 3: PCIe address is 4E:00
CUDA device 4: PCIe address is 87:00
CUDA device 5: PCIe address is 90:00
CUDA device 6: PCIe address is B7:00
CUDA device 7: PCIe address is BD:00

Picking device No. 0
[pid = 7211, dev = 0] device name = [NVIDIA A100-SXM4-40GB]
creating CUDA Ctx
making it the current CUDA Ctx
cuMemAlloc() of a 16777216 bytes GPU buffer
allocated GPU buffer address at 00007f0eba000000 pointer=0x7f0eba000000
---------------------------------------------------------------------------------------
                    RDMA_Write BW Test
 Dual-port       : OFF          Device         : mlx5_6
 Number of qps   : 1            Transport type : IB
 Connection type : RC           Using SRQ      : OFF
 PCIe relax order: ON
 ibv_wr* API     : ON
 TX depth        : 128
 CQ Moderation   : 100
 Mtu             : 4096[B]
 Link type       : IB
 Max inline data : 0[B]
 rdma_cm QPs     : OFF
 Data ex. method : Ethernet
---------------------------------------------------------------------------------------
making it the current CUDA Ctx
cuMemAlloc() of a 16777216 bytes GPU buffer
allocated GPU buffer address at 00007f682e000000 pointer=0x7f682e000000
---------------------------------------------------------------------------------------
                    RDMA_Write BW Test
 Dual-port       : OFF          Device         : mlx5_2
 Number of qps   : 1            Transport type : IB
 Connection type : RC           Using SRQ      : OFF
 PCIe relax order: ON
 ibv_wr* API     : ON
 CQ Moderation   : 100
 Mtu             : 4096[B]
 Link type       : IB
 Max inline data : 0[B]
 rdma_cm QPs     : OFF
 Data ex. method : Ethernet
---------------------------------------------------------------------------------------
 local address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr 0x007f682e800000
 local address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr 0x007f0eba800000
 remote address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr 0x007f0eba800000
 remote address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr 0x007f682e800000
---------------------------------------------------------------------------------------
 #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
---------------------------------------------------------------------------------------
 #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
Conflicting CPU frequency values detected: 1500.000000 != 3391.375000. CPU 
Frequency is not max.
 2          5000             4.11               4.10               2.151153
Conflicting CPU frequency values detected: 1500.000000 != 3345.763000. CPU 
Frequency is not max.
 4          5000             8.07               8.04               2.108648
Conflicting CPU frequency values detected: 1500.000000 != 3362.509000. CPU 
Frequency is not max.
 8          5000             16.13              16.13              2.113996
Conflicting CPU frequency values detected: 1500.000000 != 3335.048000. CPU 
Frequency is not max.
 16         5000             32.30              32.19              2.109436
Conflicting CPU frequency values detected: 1500.000000 != 3339.906000. CPU 
Frequency is not max.
 32         5000             64.41              64.38              2.109663
Conflicting CPU frequency values detected: 1500.000000 != 3333.100000. CPU 
Frequency is not max.
 64         5000             129.43             129.12             2.115557
Conflicting CPU frequency values detected: 1500.000000 != 3349.864000. CPU 
Frequency is not max.
 128        5000             257.89             257.16             2.106668
Conflicting CPU frequency values detected: 1500.000000 != 3350.294000. CPU 
Frequency is not max.
 256        5000             516.27             515.84             2.112864
Conflicting CPU frequency values detected: 1500.000000 != 3340.996000. CPU 
Frequency is not max.
 512        5000             1024.81            1024.72            2.098633
Conflicting CPU frequency values detected: 1500.000000 != 3356.251000. CPU 
Frequency is not max.
 1024       5000             2053.47            2053.08            2.102352
Conflicting CPU frequency values detected: 1500.000000 != 3339.107000. CPU 
Frequency is not max.
 2048       5000             3864.52            3720.22            1.904755
Conflicting CPU frequency values detected: 1500.000000 != 3355.693000. CPU 
Frequency is not max.
 4096       5000             4494.10            4083.37            1.045344
Conflicting CPU frequency values detected: 1500.000000 != 3342.793000. CPU 
Frequency is not max.
 8192       5000             4590.54            4425.60            0.566476
Conflicting CPU frequency values detected: 1500.000000 != 3351.159000. CPU 
Frequency is not max.
 16384      5000             4517.28            4279.27            0.273873
Conflicting CPU frequency values detected: 1500.000000 != 3314.743000. CPU 
Frequency is not max.
 32768      5000             4460.95            4387.03            0.140385
Conflicting CPU frequency values detected: 1500.000000 != 3305.732000. CPU 
Frequency is not max.
 65536      5000             4465.92            4408.98            0.070544
Conflicting CPU frequency values detected: 1500.000000 != 3310.266000. CPU 
Frequency is not max.
 131072     5000             4449.90            4422.93            0.035383
Conflicting CPU frequency values detected: 1500.000000 != 3364.586000. CPU 
Frequency is not max.
 262144     5000             4443.64            4439.50            0.017758
Conflicting CPU frequency values detected: 1500.000000 != 3325.738000. CPU 
Frequency is not max.
 524288     5000             4444.42            4441.08            0.008882
Conflicting CPU frequency values detected: 1500.000000 != 3391.764000. CPU 
Frequency is not max.
 1048576    5000             4453.77            4452.52            0.004453
Conflicting CPU frequency values detected: 1500.000000 != 3391.441000. CPU 
Frequency is not max.
 2097152    5000             4450.29            4449.44            0.002225
Conflicting CPU frequency values detected: 1500.000000 != 1958.593000. CPU 
Frequency is not max.
 4194304    5000             4452.98            4451.38            0.001113
Conflicting CPU frequency values detected: 1500.000000 != 2246.050000. CPU 
Frequency is not max.
 8388608    5000             4453.11            4452.79            0.000557
---------------------------------------------------------------------------------------
 8388608    5000             4453.11            4452.79            0.000557
---------------------------------------------------------------------------------------
deallocating RX GPU buffer 00007f682e000000
deallocating RX GPU buffer 00007f0eba000000
destroying current CUDA Ctx
destroying current CUDA Ctx
+ cleanup
+ '[' -n 7209 ']'
+ test -d /proc/7209
+ sudo kill 7209
kill: (7209): No such process
+ /bin/true
+ '[' -z '' ']'
+ sudo ip addr del dev enp148s0 192.168.5.1/24
+ sudo ip netns exec peermemclient ip addr del dev enp18s0 192.168.5.2/24
+ sudo ip netns delete peermemclient
ubuntu@blanka:~/nvidia-dgx-2/tests$ echo $?
0


** Tags removed: verification-needed-hirsute
** Tags added: verification-done-hirsute

-- 
You received this bug notification because you are a member of Ubuntu
Bugs, which is subscribed to Ubuntu.
https://bugs.launchpad.net/bugs/1947206

Title:
  Updates to ib_peer_memory requested by Nvidia

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1947206/+subscriptions


-- 
ubuntu-bugs mailing list
ubuntu-bugs@lists.ubuntu.com
https://lists.ubuntu.com/mailman/listinfo/ubuntu-bugs

Reply via email to