Dockerfile
以下环境均为:Ubuntu22.04
# 查看主机显卡
lspci | grep NVIDIA
# 安装Nvidia驱动
主机上需要安装 NVIDIA 驱动,可进入 NVIDIA 官网下载对应系统的驱动。
参考:Ubuntu 官方安装文档
version=570.133.20
distro=ubuntu2204
arch_ext=amd64
apt install linux-headers-$(uname -r)
注意:变量名后紧跟下划线时必须写成 ${version},否则 shell 会把 $version_ 当作另一个(未定义的)变量;实际文件名可能还带有包修订号(例如 _1.0-1_),请以 NVIDIA 下载页面上的文件名为准。
wget https://developer.download.nvidia.com/compute/nvidia-driver/$version/local_installers/nvidia-driver-local-repo-$distro-${version}_${arch_ext}.deb
dpkg -i nvidia-driver-local-repo-$distro-${version}_${arch_ext}.deb
# 更新源
apt update
cp /var/nvidia-driver-local-repo-$distro-$version/nvidia-driver-*-keyring.gpg /usr/share/keyrings/
# 查看驱动
apt search nvidia-driver
apt install nvidia-driver-570
或者
## 查看当前系统推荐的 NVIDIA 驱动版本。
sudo ubuntu-drivers devices
## 搭配 autoinstall 自动安装推荐驱动
sudo ubuntu-drivers autoinstall
运行命令查看驱动是否正常安装
nvidia-smi
示例
root@ecs001:~# nvidia-smi
Thu May 22 14:43:38 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20 Driver Version: 570.133.20 CUDA Version: 12.8 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 Tesla T4 Off | 00000000:00:07.0 Off | 0 |
| N/A 34C P8 9W / 70W | 8MiB / 15360MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 1140 G /usr/lib/xorg/Xorg 4MiB |
+-----------------------------------------------------------------------------------------+
# cuda-toolkit-12-9
提示
深度学习CUDA环境,不需要可以不安装
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
sudo mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
wget https://developer.download.nvidia.com/compute/cuda/12.9.0/local_installers/cuda-repo-ubuntu2204-12-9-local_12.9.0-575.51.03-1_amd64.deb
sudo dpkg -i cuda-repo-ubuntu2204-12-9-local_12.9.0-575.51.03-1_amd64.deb
sudo cp /var/cuda-repo-ubuntu2204-12-9-local/cuda-*-keyring.gpg /usr/share/keyrings/
# 更新源
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-9
# 安装 nvidia-container-toolkit
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
# 配置 docker 使用nvidia runtime
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
查看配置文件
cat /etc/docker/daemon.json
应该出现如下结果
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"args": [],
"path": "nvidia-container-runtime"
}
}
}
# 使用
# docker使用
--gpus all:表示将所有 GPU 都分配给该容器
--gpus "device=$id,":分配指定 GPU
测试
docker run --rm --gpus all nvidia/cuda:12.8.0-runtime-ubuntu22.04 nvidia-smi
# Docker Swarm
services:
cuda12:
image: nvidia/cuda:12.8.0-runtime-ubuntu22.04
container_name: cuda12.8
restart: "no"
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,video,utility
command: ["nvidia-smi"]
deploy:
resources:
reservations:
devices:
- capabilities: [gpu] # 旧 Swarm 用法,
# docker-compose v2使用
services:
cuda12:
image: nvidia/cuda:12.8.0-runtime-ubuntu22.04
container_name: cuda12.8
restart: "no"
environment:
NVIDIA_DRIVER_CAPABILITIES: compute,video,utility # compute:计算 video:视频编解码 utility:工具 all:所有模块
command: ["nvidia-smi"]
# runtime: nvidia
# 新版 Docker Compose 也可以用 device_requests 和 runtime: nvidia 取其一
device_requests:
- driver: nvidia
count: all
capabilities: ["gpu"]
runtime: nvidia:告诉 Docker 使用 NVIDIA 容器运行时替代默认的 runc 运行时。
device_requests:运行时请求,声明要暴露哪些设备给容器。
# 配置默认
提示
docker-compose 配置可省略 runtime: nvidia
vi /etc/docker/daemon.json
添加 "default-runtime": "nvidia",示例如下
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"args": [],
"path": "nvidia-container-runtime"
}
}
}
上次更新: 2025/05/22, 07:52:48