1. 生成集群ID
1
| cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 16 | head -n 1
|
下面以 CLUSTER_ID=fKbIga9RHP79OIx1 为例
2. 第一个节点上(请先执行下方的 export 环境变量,再执行清理和启动命令,否则 $CONTAINER_CLI、$FDB_DIR 等变量为空)
1
2
3
| $CONTAINER_CLI rm -f $FDB_INSTANCE_NAME
mv $FDB_DIR $FDB_DIR.$(date +%Y%m%d%H%M%S).bak
mkdir -p $FDB_DIR
|
1
2
3
4
5
6
7
8
9
| export CONTAINER_CLI=nerdctl
export IMAGE=foundationdb/foundationdb:7.1.26
export CLUSTER_ID=fKbIga9RHP79OIx1
export FDB_INSTANCE_NAME=fdb_server
export FDB_CLUSTER_FIRST_IP=$(hostname -I | awk '{print $1}')
export FDB_PUBLIC_IP=$FDB_CLUSTER_FIRST_IP
export FDB_PORT=4500
export FDB_DIR=/data/ops/fdb/$FDB_INSTANCE_NAME
|
1
2
| echo "${FDB_INSTANCE_NAME}:${CLUSTER_ID}@${FDB_CLUSTER_FIRST_IP}:4500" > $FDB_DIR/fdb.cluster
cat $FDB_DIR/fdb.cluster
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
| mkdir -p $FDB_DIR/data $FDB_DIR/logs
$CONTAINER_CLI run -d \
--name $FDB_INSTANCE_NAME \
--restart always \
--network host \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--ulimit nofile=1048576:1048576 \
--memory-swappiness=0 \
-h $(hostname) \
-v $FDB_DIR/fdb.cluster:/var/fdb/fdb.cluster \
-v $FDB_DIR/data:/var/fdb/data \
-v $FDB_DIR/logs:/var/fdb/logs \
--entrypoint "" \
$IMAGE \
fdbserver --listen-address 0.0.0.0:$FDB_PORT --public-address $FDB_PUBLIC_IP:$FDB_PORT
|
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "configure new ssd single"
|
先将集群初始化为 single 模式,否则会因为副本数不够而 hang 住。
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "status"
|
1
2
3
4
5
6
7
8
9
10
| Configuration:
Redundancy mode - single
Storage engine - ssd-2
Coordinators - 1
Usable Regions - 1
Cluster:
FoundationDB processes - 1
Zones - 1
Machines - 1
|
3. 其他节点
其他节点都需要加入到第一个节点新建的集群中。在每个节点上请先执行下方的 export 环境变量,再依次执行其余命令。
1
2
3
| $CONTAINER_CLI rm -f $FDB_INSTANCE_NAME
mv $FDB_DIR $FDB_DIR.$(date +%Y%m%d%H%M%S).bak
mkdir -p $FDB_DIR
|
1
2
3
4
5
6
7
8
9
10
11
12
| export CONTAINER_CLI=nerdctl
export IMAGE=foundationdb/foundationdb:7.1.26
export FDB_INSTANCE_NAME=fdb_server
export FDB_DIR=/data/ops/fdb/$FDB_INSTANCE_NAME
export CLUSTER_ID=fKbIga9RHP79OIx1
export FDB_PUBLIC_IP=$(hostname -I | awk '{print $1}')
export FDB_PORT=4500
export FDB_CLUSTER_FIRST_IP="10.0.0.1"
export FDB_CLUSTER_FIRST_PORT=4500
|
1
| echo "$FDB_INSTANCE_NAME:$CLUSTER_ID@$FDB_CLUSTER_FIRST_IP:$FDB_CLUSTER_FIRST_PORT" > $FDB_DIR/fdb.cluster
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
| $CONTAINER_CLI run -d \
--name $FDB_INSTANCE_NAME \
--restart always \
--network host \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--ulimit nofile=1048576:1048576 \
--memory-swappiness=0 \
-h $(hostname) \
-v $FDB_DIR/fdb.cluster:/var/fdb/fdb.cluster:ro \
-v $FDB_DIR/data:/var/fdb/data \
-v $FDB_DIR/logs:/var/fdb/logs \
--entrypoint "" \
$IMAGE \
fdbserver --listen-address 0.0.0.0:$FDB_PORT --public-address $FDB_PUBLIC_IP:$FDB_PORT
|
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "status"
|
全部节点添加完成后,应该可以看到:
1
2
3
4
5
6
7
8
9
10
| Configuration:
Redundancy mode - single
Storage engine - ssd-2
Coordinators - 1
Usable Regions - 1
Cluster:
FoundationDB processes - 5
Zones - 5
Machines - 5
|
此时应该有 5 个服务器节点。
4. 调整副本数量为 triple
single 模式下数据为单副本,不是生产环境推荐配置,需要调整为 triple 模式以提高数据可靠性。
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "configure ssd triple"
|
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "status"
|
1
2
3
4
5
6
7
8
9
10
| Configuration:
Redundancy mode - triple
Storage engine - ssd-2
Coordinators - 1
Usable Regions - 1
Cluster:
FoundationDB processes - 5
Zones - 5
Machines - 5
|
此时 Coordinators 的数量仍为 1,还不是生产可用状态。
5. 更新全部节点的 cluster 文件
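在每个节点上执行以下命令,将全部节点的地址都写入 cluster 文件,然后重启该节点上的 fdb 容器。注意:官方推荐通过 fdbcli 的 coordinators 命令(例如 coordinators auto)来变更 Coordinators,此时要求 cluster 文件对 fdbserver 可写(不能以 :ro 方式挂载)。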
1
| echo "$FDB_INSTANCE_NAME:$CLUSTER_ID@10.0.0.1:4500,10.0.0.2:4500,10.0.0.3:4500,10.0.0.4:4500,10.0.0.5:4500" > $FDB_DIR/fdb.cluster
|
1
| $CONTAINER_CLI restart $FDB_INSTANCE_NAME
|
6. 查看集群状态
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "status"
|
1
2
3
4
5
6
7
8
9
10
11
| Configuration:
Redundancy mode - triple
Storage engine - ssd-2
Coordinators - 5
Usable Regions - 1
Cluster:
FoundationDB processes - 5
Zones - 5
Machines - 5
Fault Tolerance - 2 machines
|
此时 Coordinators 的数量应该为 5,集群可以容忍 2 台机器故障。
7. 启动备份 agent
1
| mkdir -p $FDB_DIR/backup
|
1
2
3
4
5
6
7
8
9
| $CONTAINER_CLI run -d \
--name fdb_backup_agent \
--restart always \
--network host \
-v $FDB_DIR/fdb.cluster:/var/fdb/fdb.cluster:ro \
-v $FDB_DIR/backup:/var/fdb/backup \
--entrypoint "" \
$IMAGE \
backup_agent --cluster_file /var/fdb/fdb.cluster
|
8. 备份数据
1
2
3
4
5
| $CONTAINER_CLI run -it --rm \
-v $FDB_DIR/fdb.cluster:/var/fdb/fdb.cluster:ro \
-v $FDB_DIR/backup:/var/fdb/backup \
--entrypoint "" \
$IMAGE bash
|
1
| fdbbackup start -d file:///var/fdb/backup
|
9. 恢复数据
1
2
3
4
5
| $CONTAINER_CLI run -it --rm \
-v $FDB_DIR/fdb.cluster:/var/fdb/fdb.cluster:ro \
-v $FDB_DIR/backup:/var/fdb/backup \
--entrypoint "" \
$IMAGE bash
|
1
| fdbcli --exec "writemode on; clearrange '' \xFF"
|
1
2
3
| fdbrestore start \
-r file:///var/fdb/backup/backup-2026-02-10-06-09-38.111134 \
--dest-cluster-file /var/fdb/fdb.cluster
|
备份目录下的每个 backup-* 子目录对应一份备份数据,恢复时通过 -r 指定其中一个子目录。
1
| fdbrestore status --dest-cluster-file /var/fdb/fdb.cluster
|
确认恢复状态为 State: completed 即恢复完成。
1
| fdbcli --exec "getrange '' \xFF"
|
10. 监控集群状态
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "status"
|
1
| $CONTAINER_CLI exec -it $FDB_INSTANCE_NAME fdbcli --exec "status details"
|
11. 集群监控(Prometheus Exporter)
1
2
3
4
5
| export CONTAINER_CLI=nerdctl
export IMAGE=aikoven/foundationdb-exporter
export FDB_INSTANCE_NAME=fdb_server
export FDB_DIR=/data/ops/fdb/$FDB_INSTANCE_NAME
|
1
2
3
4
5
6
7
| $CONTAINER_CLI run -d \
--name fdb-exporter \
--restart always \
-v $FDB_DIR/fdb.cluster:/etc/foundationdb/fdb.cluster:ro \
-p 9444:9444 \
$IMAGE \
exporter
|
1
| curl http://localhost:9444/metrics
|
指标含义参考 https://github.com/aikoven/foundationdb-exporter
在 Prometheus 中配置采集之后,将面板 https://github.com/aikoven/foundationdb-exporter/blob/master/grafana/foundationdb.json 导入 Grafana,稍微调整一下即可看到如下面板:
