OpenStack GPU passthrough

Enable IOMMU (AMD-Vi / Intel VT-d) in the BIOS

Get IOMMU groups

# Print every IOMMU group followed by the PCI devices it contains.
shopt -s nullglob  # a group with an empty devices/ directory yields no iterations
# find enumerates the group directories itself. Expanding the glob in the shell
# would, under nullglob on a host without IOMMU groups, call find with no path
# at all, which then reports "." and produces a bogus "IOMMU Group .:" entry.
# sort -V keeps the groups in numeric order (2 before 10).
while IFS= read -r group_dir; do
    echo "IOMMU Group ${group_dir##*/}:"
    for dev in "${group_dir}"/devices/*; do
        # The entry name under devices/ is the PCI address lspci -s expects.
        echo -e "\t$(lspci -nns "${dev##*/}")"
    done
done < <(find /sys/kernel/iommu_groups/ -mindepth 1 -maxdepth 1 -type d | sort -V)

Configure compute node

# Run as root on the compute node.
#
# Collect the vendor:device IDs of all NVIDIA PCI functions (the GPU and, if
# present, the audio device from the same IOMMU group) as one comma-separated
# list, e.g. ids=10de:xxxx,10de:xxxx.
# lspci -n prints numeric columns only, so the ID is always the third field even
# when the device name itself contains brackets; -d 10de: selects NVIDIA's
# vendor ID.
NVIDIA_PCI_ID=$(lspci -n -d 10de: | awk '{print $3}' | sort -u | paste -sd, -)
# Aborts a script here when nothing was found; when pasting into an interactive
# shell, stop if this line prints an error.
: "${NVIDIA_PCI_ID:?no NVIDIA PCI device found}"

# Keep the host GPU drivers away from the card.
echo "blacklist nouveau" >> /etc/modprobe.d/blacklist-nvidia.conf
echo "blacklist nvidiafb" >> /etc/modprobe.d/blacklist-nvidia.conf
# Load vfio-pci at boot and let it claim the collected IDs.
echo vfio-pci >> /etc/modules-load.d/vfio-pci.conf
echo "options vfio-pci ids=${NVIDIA_PCI_ID}" >> /etc/modprobe.d/gpu-vfio.conf

# Append amd_iommu=on to the existing kernel command line instead of replacing
# it, so options that are already configured survive; skipped when the option is
# already there. Expects the usual double-quoted GRUB_CMDLINE_LINUX_DEFAULT="..."
# form. On Intel hosts use intel_iommu=on instead.
if ! grep -q '^GRUB_CMDLINE_LINUX_DEFAULT=.*amd_iommu=on' /etc/default/grub; then
    sed -i 's/^\(GRUB_CMDLINE_LINUX_DEFAULT="[^"]*\)"/\1 amd_iommu=on"/' /etc/default/grub
fi

update-grub

reboot
#/etc/modprobe.d/kvm.conf
#options kvm ignore_msrs=Y
 
#echo "options vfio-pci ids=${NVIDIA_PCI_ID}" >> /etc/modprobe.d/vfio.conf
#echo "softdep drm pre: vfio-pci" >> /etc/modprobe.d/vfio.conf
#echo "options kvm ignore_msrs=1" >> /etc/modprobe.d/kvm.conf
#echo "vfio vfio_iommu_type1 vfio_virqfd vfio_pci ids=${NVIDIA_PCI_ID}" >> /etc/initramfs-tools/modules
#echo GRUB_CMDLINE_LINUX_DEFAULT=\"intel_iommu=on vfio-pci.ids=${NVIDIA_PCI_ID} vfio_iommu_type1.allow_unsafe_interrupts=1 modprobe.blacklist=nvidiafb,nouveau\" >> /etc/default/grub
#echo GRUB_CMDLINE_LINUX_DEFAULT=\"amd_iommu=on vfio-pci.ids=${NVIDIA_PCI_ID} vfio_iommu_type1.allow_unsafe_interrupts=1 modprobe.blacklist=nvidiafb,nouveau\" >> /etc/default/grub
#echo 'GRUB_CMDLINE_LINUX_DEFAULT="amd_iommu=on kvm.ignore_msrs=1 vfio-pci.ids=10de:27b8"' >> /etc/default/grub
#echo 'GRUB_CMDLINE_LINUX_DEFAULT="amd_iommu=on kvm.ignore_msrs=1"' >> /etc/default/grub
 
# iommu=pt

Test whether the vfio driver is loaded

# Show the kernel driver in use for every NVIDIA PCI device.
# The first column of the lspci output is the PCI address of the device.
mapfile -t nvidia_slots < <(lspci | awk '/NVIDIA/ {print $1}')
for slot in "${nvidia_slots[@]}"; do
    sudo lspci -k -s "${slot}"
done
 
02:00.0 3D controller: NVIDIA Corporation Device 27b8 (rev a1)
	Subsystem: NVIDIA Corporation Device 16ca
	Kernel driver in use: vfio-pci
	Kernel modules: nvidiafb, nouveau

Configure OpenStack
type-PF or type-PCI depending on the GPU

# /etc/kolla/config/ew/nova/nova-compute.conf
...
[pci]
# vendor_id/product_id are the GPU's PCI ID (10de:27b8 in the lspci example output).
device_spec = { "vendor_id": "10de", "product_id": "27b8" }
# "name" is what the flavor property pci_passthrough:alias refers to;
# device_type is type-PF or type-PCI depending on the GPU.
alias = { "vendor_id":"10de", "product_id":"27b8", "device_type":"type-PF", "name":"gpu" }
 
# /etc/kolla/config/ew/nova/nova-api.conf
...
[pci]
# Same alias definition as in nova-compute.conf; "name" is what the flavor
# property pci_passthrough:alias refers to.
alias = { "vendor_id":"10de", "product_id":"27b8", "device_type":"type-PF", "name":"gpu" }
 
 
# /etc/kolla/config/ew/nova/nova-scheduler.conf
[filter_scheduler]
...
available_filters = nova.scheduler.filters.all_filters
# The leading "..." presumably stands for the filters that are already enabled;
# PciPassthroughFilter is added at the end of that list.
enabled_filters = ...,PciPassthroughFilter

Create flavor

# Flavor with 8 vCPUs, 64 GiB RAM and a 100 GB disk that requests the "gpu"
# PCI alias through the pci_passthrough:alias property.
openstack flavor create \
    --vcpus 8 --ram 65536 --disk 100 \
    --property pci_passthrough:alias=gpu:1 \
    gpu-8c64g

Create VM

# Boot a test VM with the gpu-8c64g flavor from the "Create flavor" step
# (substitute your own flavor name if you created a different one).
# ${RANDOM} gives each test VM a different name suffix.
openstack server create \
    --flavor gpu-8c64g \
    --image "Ubuntu 22.04" \
    --network test-network \
    --security-group test-secgroup \
    --key-name test-keypair \
    "gpu-test-${RANDOM}"

OPTIONAL: Auto-install GPU driver

# Write the cloud-init user-data file used to install the GPU driver on first boot.
# The heredoc delimiter is quoted so the shell copies the body literally.
cat <<'EOF' > gpu.cloud-config
#cloud-config
package_upgrade: true
packages:
- ubuntu-drivers-common
#- nvidia-driver-550-server
runcmd:
- ubuntu-drivers install
- printf '#!/bin/bash\n\nnvidia-smi\n' > /etc/update-motd.d/99-gpu
- chmod 755 /etc/update-motd.d/99-gpu
EOF
# The motd script is written with a shebang line, matching the manual variant
# under "Install GPU driver inside of the VM"; without an interpreter line the
# file cannot be executed directly when the motd scripts are run.
 
# create VM
# Both commands use the gpu-8c64g flavor from the "Create flavor" step
# (substitute your own flavor name if you created a different one) and a
# ${RANDOM} suffix, so running them from the same shell does not produce two
# servers with the same name.

# Variant without user-data.
openstack server create \
    --flavor gpu-8c64g \
    --image "Ubuntu 24.04 GPU" \
    --network test-network \
    --security-group test-secgroup \
    --key-name test-keypair \
    "gpu-test-${RANDOM}"

# Variant that passes gpu.cloud-config as user-data.
openstack server create \
    --flavor gpu-8c64g \
    --image "Ubuntu 24.04 GPU" \
    --network test-network \
    --security-group test-secgroup \
    --key-name test-keypair \
    --user-data gpu.cloud-config \
    "gpu-test-${RANDOM}"

Install GPU driver inside of the VM

# Install the recommended GPU driver inside the VM.
sudo apt update && sudo apt install -y ubuntu-drivers-common
#sudo apt install -y nvidia-driver-550-server

# apt install ubuntu-drivers-common
# ubuntu-drivers devices
sudo ubuntu-drivers install

# show GPU usage on login
# printf interprets the \n escapes itself, so this does not depend on echo -e.
printf '#!/bin/bash\n\nnvidia-smi\n' | sudo tee /etc/update-motd.d/99-gpu
sudo chmod 755 /etc/update-motd.d/99-gpu

# Rebooting needs root like the other steps in this section.
sudo reboot

GPU Benchmark
https://docs.fuga.cloud/cloud/benchmark/how-to-do-a-gpu-benchmark-test/

# Download and unpack Geekbench 5.4.1, then run its CUDA compute benchmark.
GEEKBENCH=Geekbench-5.4.1-Linux
wget "https://cdn.geekbench.com/${GEEKBENCH}.tar.gz"
tar -xzvf "${GEEKBENCH}.tar.gz"
cd "${GEEKBENCH}"
./geekbench5 --compute CUDA

CUDA
https://developer.nvidia.com/cuda-gpus

https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network

# Install the cuda-keyring package from NVIDIA's Ubuntu 22.04 repository and
# refresh the package index.
CUDA_KEYRING=cuda-keyring_1.1-1_all.deb
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${CUDA_KEYRING}"
sudo dpkg -i "${CUDA_KEYRING}"
sudo apt-get update

# CUDA toolkit 12.5
sudo apt-get install -y cuda-toolkit-12-5

# Driver packages, installed one after another.
# NOTE(review): these look like alternative driver choices rather than a set
# that is meant to be installed together - confirm which one is wanted.
for pkg in cuda-drivers nvidia-driver-555-open cuda-drivers-555; do
    sudo apt-get install -y "${pkg}"
done

nvtop
 
# hashcat -b

gpu-burn
https://github.com/wilicc/gpu-burn

# Install gpu-burn together with the libcublas12 package.
sudo apt install --yes gpu-burn libcublas12
# Burn all GPUs for an hour (3600 seconds).
gpu-burn 3600

Links
https://docs.openstack.org/nova/latest/admin/pci-passthrough.html
https://wiki.archlinux.org/title/PCI_passthrough_via_OVMF
https://superuser.openinfra.dev/articles/a-comprehensive-guide-to-configuring-gpu-passthrough-in-openstack-for-high-performance-computing/
https://satishdotpatel.github.io/gpu-passthrough-for-openstack/
https://gitlab.com/polloloco/vgpu-proxmox
https://www.reddit.com/r/openstack/comments/1aoi3sg/how_to_create_instances_with_gpu/
https://128b.xyz/ops/initial_setup
https://doc.opensuse.org/documentation/leap/virtualization/html/book-virtualization/app-gpu-passthru.html
https://www.jimmdenton.com/gpu-offloading-openstack/