最近课题组新到了一台8卡GPU服务器,为了更有效地利用计算资源,准备安装一个slurm任务提交系统。
cat /etc/os-release
NAME="CentOS Linux"
VERSION="8"
ID="centos"
ID_LIKE="rhel fedora"
VERSION_ID="8"
PLATFORM_ID="platform:el8"
PRETTY_NAME="CentOS Linux 8"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:centos:centos:8"
HOME_URL="https://centos.org/"
BUG_REPORT_URL="https://bugs.centos.org/"
CENTOS_MANTISBT_PROJECT="CentOS-8"
CENTOS_MANTISBT_PROJECT_VERSION="8"
1.换清华源(非必要;以下是 Ubuntu 20.04 focal 的 apt 源配置,其他发行版请使用对应的镜像配置)
#
sudo cp /etc/apt/sources.list /etc/apt/sources.list.bak
rm /etc/apt/sources.list
cat >> /etc/apt/sources.list << "EOF"
#添加清华源
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse
deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse
EOF
# 更新配置文件
sudo apt-get update
sudo apt-get upgrade
2.安装slurm
# 下载 slurm_install.sh;如果因为网络问题无法下载,可以直接从 GitHub 上把脚本内容复制粘贴到本地(我就是这样做的)
wget --no-check-certificate https://raw.githubusercontent.com/NISP-GmbH/SLURM/main/slurm_install.sh
export VER=23.02.0
# 如果运行脚本时报错 "line 1239: bc: command not found",需要先安装 bc
sudo apt-get install bc
bash slurm_install.sh
# Do you want to enable Slurm accounting support? Possible answers: [yes/no] <- no
# 如果选 yes,则还需要额外配置数据库(slurmdbd);这里选 no
# 按要求修改配置文件
vi /etc/slurm/slurm.conf
# 设置CPU数量和内存
# 详细信息可以通过 systemctl status slurmd 得到,有冲突他会提示
# 直接填提示的就行
NodeName=citrus CPUs=256 RealMemory=1031612 CoresPerSocket=64 SocketsPerBoard=2 ThreadsPerCore=2 State=idle Feature=dcv2,other
# 更新配置文件
sudo systemctl restart slurmctld
sudo systemctl restart slurmd
# 检查服务运行状态
sudo systemctl status slurmctld
sudo systemctl status slurmd
# 查看slurm 系统配置
sinfo -o "%P %D %c %m %C"
# 设置开机自启动
sudo systemctl enable slurmctld # 守护进程
sudo systemctl enable slurmd
sudo systemctl enable munge # 一个用于安全认证的守护进程
# sudo systemctl enable slurmdbd 没有装DB 不需要
3.GPU配置
我们服务器有8张GPUs,现在配置下GPU的设置,让slurm能正确识别GPUs。
#
vi /etc/slurm/slurm.conf
"
GresTypes=gpu
NodeName=citrus CPUs=256 Gres=gpu:8 RealMemory=1031612 CoresPerSocket=64 SocketsPerBoard=2 ThreadsPerCore=2 State=idle Feature=dcv2,other
"
# Gres
cat >> /etc/slurm/gres.conf << "EOF"
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia0
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia1
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia2
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia3
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia4
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia5
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia6
NodeName=citrus Name=gpu Type=nvidia File=/dev/nvidia7
EOF
# 更新配置文件
sudo systemctl restart slurmctld
sudo systemctl restart slurmd
systemctl restart munge.service
# 检查是否识别GPU
sinfo -o "%n %T %C %G" # 正确识别8张GPU卡
# citrus idle 0/256/0/256 gpu:8
4.测试
# 创建测试脚本 resource_test.sh
cat > resource_test.sh << "EOF"
#!/bin/bash
#SBATCH --job-name=resource_test
#SBATCH --output=resource_test.out
#SBATCH --error=resource_test.err
#SBATCH --ntasks=1
#SBATCH --mem=40G
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
echo "Hello world!"
EOF
sbatch resource_test.sh
squeue
scancel jobid
5.删除slurm
sudo systemctl stop slurmctld
sudo systemctl stop slurmd
sudo systemctl stop slurmdbd
# sudo yum remove slurm*
sudo apt-get remove --purge slurm*
sudo rm -rf /etc/slurm/
sudo rm -rf /var/log/slurm/
sudo rm -rf /var/spool/slurm/
sudo userdel slurm
sudo groupdel slurm
sudo rm -rf /var/log/slurm/
sudo rm /etc/systemd/system/slurmctld.service
sudo rm /etc/systemd/system/slurmd.service
sudo rm /etc/systemd/system/slurmdbd.service
sudo systemctl daemon-reload
# 谨慎:下面的命令会删除整个系统中所有名称包含 slurm 的文件和目录,执行前请先确认匹配到的内容
sudo find / -name "*slurm*" -exec rm -rf {} +
普通用户提交任务报错:
sbatch: error: Batch job submission failed: Unexpected message received
检查 /var/log/slurmctld.log 是否有写入权限。
一个比较直接的方法是创建一个新的 group,将需要提交任务的用户添加到这个 group 里,并将 /var/log 下相应日志文件所属的 group 也改为这个 group。大部分问题都出在用户权限上,仔细排查应该不会有什么难度。

