Setting up a Slurm cluster on Ubuntu 24.04

Warning: Ubuntu has serious problems as an NFS server (see here), which is why I later switched to openSUSE Leap; it works well.

You do not have to switch systems: just avoid the default NFSv4 mount and use NFSv3 (the `vers=3` option below). Even on openSUSE Leap, NFSv4 occasionally fails to mount; I have not bothered to track down the cause, so I simply use NFSv3.

```
icpcs:/opt /opt nfs defaults,vers=3 0 2
```

Set up the basic services first.

mysql

The database service for Slurm.

```bash
apt install mysql-server
sudo mysql
```

```sql
# Create a dedicated MySQL user for Slurm; local access is enough.
# The password mysql_slurm_passwd is used again later, keep them in sync.
create user 'slurm'@'localhost' identified by 'mysql_slurm_passwd';

# Create this database as well. It is not strictly required, but recommended:
# without it the Slurm configuration below would need changes, so just create it.
create database slurm_job_db;

grant all on slurm_job_db.* TO 'slurm'@'localhost';
```

```bash
systemctl enable --now mysql
```

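Before wiring it into Slurm, you can check that the new account works by logging in as the slurm MySQL user and listing its databases (the password is the placeholder from the SQL above):

```bash
# Should list slurm_job_db among the databases visible to the slurm user
mysql -u slurm -p'mysql_slurm_passwd' -e 'SHOW DATABASES;'
```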

munge

The authentication/encryption service for Slurm.

```bash
apt install munge
```

```bash
# Copy the head node's munge key to every compute node, then fix its ownership
./nodes 1 14 "scp" ":/etc/munge/munge.key /etc/munge/"
./nodes 1 14 "ssh" " chown munge:munge /etc/munge/munge.key"
```

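The `./nodes` helper used throughout this post is a small fan-out script that is not shown here. Below is a minimal sketch of what such a script might look like, assuming the compute nodes are named node1..node14 and are reachable as root over SSH; the argument conventions are my assumption, not the author's exact script.

```bash
#!/usr/bin/env bash
# Hypothetical fan-out helper (not the author's exact script).
# Assumed usage:
#   ./nodes 1 14 "ssh" " <command>"             # run <command> on node1..node14
#   ./nodes 1 14 "scp" ":<path> <remote_dir>"   # copy local <path> into <remote_dir> on each node
first=$1; last=$2; mode=$3; rest=$4
for i in $(seq "$first" "$last"); do
    node="node$i"
    echo "== $node =="
    case "$mode" in
        ssh) ssh "root@$node" "$rest" ;;
        scp) src=${rest%% *}; dest=${rest#* }
             scp -r "${src#:}" "root@$node:$dest" ;;
    esac
done
```
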
```bash
systemctl enable --now munge.service
```

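To confirm that the shared key actually works across nodes, a credential generated on the head node should decode successfully on a compute node (munge and unmunge ship with the munge package; node1 is just an example host):

```bash
# "STATUS: Success" on the remote side means the keys match
munge -n | ssh node1 unmunge
```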

nfs

The main directories are shared from the head node to the compute nodes.

The compute nodes mount three directories from the head node: /home, /usr/local, and /opt.

```bash
sudo apt install nfs-kernel-server
```

On the head node, export the shared directories.

```bash
echo "/home 192.168.3.0/24(rw,sync,no_subtree_check)" >> /etc/exports
echo "/usr/local 192.168.3.0/24(rw,sync,no_subtree_check)" >> /etc/exports
echo "/opt 192.168.3.0/24(rw,sync,no_subtree_check)" >> /etc/exports

systemctl enable --now nfs-server.service
```

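If nfs-server was already running, re-read /etc/exports and check what is actually being exported (exportfs and showmount come with nfs-kernel-server):

```bash
# Re-export everything in /etc/exports and list it
exportfs -ra
showmount -e localhost
```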

Configuration on the compute nodes.

First move the old directories out of the way.

```bash
./nodes 1 14 "ssh" " mv /opt /opt_old && mkdir /opt"
./nodes 1 14 "ssh" " mv /home /home_old && mkdir /home"
./nodes 1 14 "ssh" " mv /usr/local /usr/local_old && mkdir /usr/local"
```

On the compute nodes, mount the new shared directories via /etc/fstab.

```bash
# Note the quoting: keep the fstab entry in single quotes inside the remote command
./nodes 1 14 "ssh" " echo 'icpcs:/opt /opt nfs defaults 0 2' >> /etc/fstab"
./nodes 1 14 "ssh" " echo 'icpcs:/usr/local /usr/local nfs defaults 0 2' >> /etc/fstab"
./nodes 1 14 "ssh" " echo 'icpcs:/home /home nfs defaults 0 2' >> /etc/fstab"
```

Warning: Ubuntu has serious problems as an NFS server (see here), which is why I later switched to openSUSE Leap; it works well.

You do not have to switch systems: just avoid the default NFSv4 mount and use NFSv3 (the `vers=3` option below). Even on openSUSE Leap, NFSv4 occasionally fails to mount; I have not bothered to track down the cause, so I simply use NFSv3.

```
icpcs:/opt /opt nfs defaults,vers=3 0 2
```

On the compute nodes, activate the shared mounts.

```bash
./nodes 1 14 "ssh" " systemctl daemon-reload && mount -a && df -Th | grep icpcs"
```

nis

The compute nodes get the head node's user account information via NIS.

See [this reference](https://blog.csdn.net/snow5255/article/details/131531103).

Install NIS on both the head node and the compute nodes, then enable it.

```bash
sudo apt install nis
```

Head node configuration.

```bash
# Only serve clients on the 192.168.3.* network
vim /etc/ypserv.securenets
# Comment out the following two lines:
# 0.0.0.0 0.0.0.0
# ::/0
# and add this line:
# 255.255.255.0 192.168.3.0

vim /etc/yp.conf
# Add the following line:
# domain cluster server icpcs

vim /etc/defaultdomain
# Add the following line:
# cluster
# Activate the domain name
domainname cluster

# Enable the NIS server on the head node
systemctl enable --now ypserv

# Initialize the NIS maps
/usr/lib/yp/ypinit -m
```

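Whenever accounts on the head node are added or changed later, the NIS maps have to be rebuilt before the compute nodes can see the change (assuming the standard /var/yp layout created by ypinit):

```bash
# Rebuild the NIS maps after adding or modifying users on the head node
cd /var/yp && make
```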

Compute node configuration.

```bash
./nodes 1 14 "ssh" " echo 'domain cluster server icpcs' > /etc/yp.conf"

./nodes 1 14 "ssh" " echo 'cluster' > /etc/defaultdomain"

# Test the binding
./nodes 1 14 "ssh" " yptest"

# Enable the NIS client on the compute nodes
./nodes 1 14 "ssh" " systemctl enable --now ypbind.service"
```

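To check from a compute node that the head node's accounts are really visible (ypwhich, ypcat and getent are standard tools; node1 is just an example):

```bash
# Show which NIS server node1 is bound to, then list a few NIS accounts
ssh node1 "ypwhich && ypcat passwd | head"
# NIS users should also appear in the normal lookup path; if not, check that
# the passwd/group/shadow lines in /etc/nsswitch.conf include nis (or compat)
ssh node1 "getent passwd | tail"
```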

slurm services

Install on the head node.

```bash
apt install libopenmpi-dev slurmdbd slurmctld slurm-wlm-mysql-plugin slurm-wlm-ipmi-plugins
```

Install on the compute nodes.

```bash
./nodes 1 14 "ssh" " apt install libopenmpi-dev slurmd slurm-wlm-mysql-plugin slurm-wlm-ipmi-plugins"
```

Note for the head node: in /usr/lib/systemd/system/slurmctld.service, change the User and Group to root; when it runs as the slurm user it cannot manage the cluster properly.

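Instead of editing the packaged unit file in place, the same change can be made with a systemd drop-in, which survives package upgrades. A minimal sketch:

```bash
# Override User/Group for slurmctld without touching the packaged unit file
mkdir -p /etc/systemd/system/slurmctld.service.d
cat > /etc/systemd/system/slurmctld.service.d/override.conf <<'EOF'
[Service]
User=root
Group=root
EOF
systemctl daemon-reload
```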

For the configuration itself you can refer to this guide, which is relatively up to date.

The configuration files below can be used as a reference.

```
# /etc/slurm/slurm.conf

ClusterName=cluster
SlurmctldHost=icpcs
MaxJobCount=10000
MpiDefault=pmix
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmdUser=root
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0

DefMemPerCPU=100
#DefMemPerNode=100

SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_CPU_Memory
#
#
PriorityFlags=CALCULATE_RUNNING
PriorityType=priority/multifactor
PriorityWeightAge=1000
PriorityWeightFairshare=100000
#
#
AccountingStorageHost=localhost
AccountingStoragePass=/var/run/munge/munge.socket.2
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageUser=slurm
JobCompHost=localhost
JobCompLoc=slurm_job_db
JobCompPass=mysql_slurm_passwd
JobCompType=jobcomp/mysql
JobCompUser=slurm
JobAcctGatherType=jobacct_gather/cgroup

SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
#
#
NodeName=node[1-2] CPUs=144 Sockets=4 CoresPerSocket=18 ThreadsPerCore=2 RealMemory=1031534 State=UNKNOWN

PartitionName=ptt1 Nodes=node[1-2] MaxTime=INFINITE State=UP AllowAccounts=icpcs
```

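The CPUs/Sockets/CoresPerSocket/ThreadsPerCore/RealMemory values in the NodeName line must match the real hardware; the easiest way to get them is to ask slurmd on a compute node, since `slurmd -C` prints the node description in slurm.conf syntax:

```bash
# Print node1's hardware description in slurm.conf format
ssh node1 "slurmd -C"
```
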
```
# /etc/slurm/cgroup.conf
CgroupAutomount=yes
CgroupMountpoint=/sys/fs/cgroup

# Constraints enforced by the task/cgroup plugin
ConstrainCores=yes
ConstrainRAMSpace=yes
```

```
# /etc/slurm/slurmdbd.conf
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSteps=no
ArchiveSuspend=no
ArchiveTXN=no
ArchiveUsage=no
AuthInfo=/var/run/munge/munge.socket.2
AuthType=auth/munge
DbdHost=localhost
DebugLevel=info
PurgeEventAfter=1month
PurgeJobAfter=12month
PurgeResvAfter=1month
PurgeStepAfter=1month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=24month
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
SlurmUser=slurm
StoragePass=mysql_slurm_passwd
StorageType=accounting_storage/mysql
StorageUser=slurm
StorageHost=localhost
StoragePort=3306
```

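slurmdbd refuses to start if slurmdbd.conf is readable by other users, so make sure it is owned by the slurm user and not world-readable:

```bash
chown slurm:slurm /etc/slurm/slurmdbd.conf
chmod 600 /etc/slurm/slurmdbd.conf
```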

Start everything.

```bash
systemctl daemon-reload
systemctl enable --now slurmctld.service
systemctl enable --now slurmdbd.service

./nodes 1 14 "scp" ":/etc/slurm/cgroup.conf /etc/slurm/"
./nodes 1 14 "scp" ":/etc/slurm/slurm.conf /etc/slurm/"
./nodes 1 14 "ssh" " systemctl enable --now slurmd"
```

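Because the partition above uses AllowAccounts=icpcs, the cluster and that account must exist in the accounting database before jobs are accepted. A sketch of the usual sacctmgr steps plus a quick health check; the account/user name icpcs is taken from the partition definition above, adjust it to your own users:

```bash
# Register the cluster and an account/user in the accounting database
sacctmgr -i add cluster cluster
sacctmgr -i add account icpcs Description="icpc group" Organization="icpc"
sacctmgr -i add user icpcs Account=icpcs

# Quick check: nodes should show up idle, and a trivial job should run
sinfo
srun -p ptt1 -N2 hostname
```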

Author: yuhldr
Original post: https://yuhldr.github.io/posts/49ae9ac0.html
Copyright notice: please credit the source when reposting!