Database

Deploying a Production MySQL MGR High-Availability Cluster Offline on Hosts

1. Deployment Preparation

IP Description
10.0.0.161 node1 (master)
10.0.0.162 node2 (slave)
10.0.0.163 node3 (slave)
  • Download the installation package

    • One-step download (can be copied and run as-is)
sudo wget -O /tmp/mysql.tar.gz https://cdn.mysql.com//Downloads/MySQL-5.7/mysql-5.7.35-linux-glibc2.12-x86_64.tar.gz
  • Install (can be copied and run as-is)
sudo mkdir -p /usr/lib/mysql-current
sudo tar -xf /tmp/mysql.tar.gz --strip-components=1 -C /usr/lib/mysql-current
#sudo rm -rf /tmp/mysql.tar.gz

Note: the listen address differs on each node. loose-group_replication_local_address = "ip:port" must use that node's own IP, and the port must match the one listed for that node in group_replication_group_seeds (23366 in this article).
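
For orientation, the MGR-related settings provided by the configuration template (downloaded in step 3) typically look like the sketch below. This is illustrative only: the group name UUID is a placeholder, and the option names/values in the actual my.cnf.tpl are authoritative.

# Illustrative MGR settings for node1 (adjust IP and server_id per node)
server_id                                = 1
loose-group_replication_group_name      = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"   # placeholder UUID
loose-group_replication_local_address   = "10.0.0.161:23366"    # this node's IP + group-communication port
loose-group_replication_group_seeds     = "10.0.0.161:23366,10.0.0.162:23366,10.0.0.163:23366"
loose-group_replication_start_on_boot   = OFF
loose-group_replication_bootstrap_group = OFF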

2. Environment Configuration (can be copied and run as-is)

  • Run on node1, node2, and node3 respectively
/etc/profile.d/profile-mysqld.sh

# Download the environment configuration
sudo curl -sSL -o /etc/profile.d/profile-mysqld.sh https://gitee.com/wl4g/blogs/raw/master/docs/articles/database/hosts-mysql-mgr-deployment/resources/profile-mysqld.sh

# Apply it to the current shell
. /etc/profile

# Create the user and directories
sudo groupadd $MYSQL_GROUP
sudo useradd -g $MYSQL_GROUP $MYSQL_USER
sudo mkdir -p $MYSQL_DATA_DIR
sudo mkdir -p ${MYSQL_LOG_DIR}
# Set permissions
sudo chmod -R 755 ${MYSQL_HOME}
sudo chmod -R 700 ${MYSQL_DATA_DIR}
sudo chmod -R 755 ${MYSQL_LOG_DIR}
sudo chown -R ${MYSQL_USER}:${MYSQL_GROUP} ${MYSQL_HOME}
sudo chown -R ${MYSQL_USER}:${MYSQL_GROUP} ${MYSQL_DATA_DIR}
sudo chown -R ${MYSQL_USER}:${MYSQL_GROUP} ${MYSQL_LOG_DIR}
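
For reference, the environment script defines variables roughly along these lines. This is an illustrative sketch inferred from the paths used later in this article; the downloaded profile-mysqld.sh is authoritative.

# Illustrative only -- values inferred from paths used elsewhere in this article
export MYSQL_USER=mysql
export MYSQL_GROUP=mysql
export MYSQL_HOME=/usr/lib/mysql-current
export MYSQL_CONF_FILE=/etc/my.cnf
export MYSQL_DATA_DIR=/mnt/disk1/mysql
export MYSQL_LOG_DIR=/mnt/disk1/log/mysql
export MYSQL_LOG_ERR=${MYSQL_LOG_DIR}/mysqld.err
export MYSQL_SOCKET=${MYSQL_DATA_DIR}/mysqld.sock
export MYSQL_PLUGIN_DIR=${MYSQL_HOME}/lib/plugin
export PATH=${MYSQL_HOME}/bin:$PATH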


3. Runtime Configuration (cannot be copied and run as-is; note that the IP differs when generating each of the 3 nodes' configurations)

/etc/my.cnf

# Generate a tuned configuration based on the host's resources
totalMemKB=$(cat /proc/meminfo|grep MemTotal|sed -r 's|[^0-9]+([0-9]+).*$|\1|')
innodbBufferPoolSizeMB=$(($totalMemKB/1024*7/10)) # For a dedicated server, roughly 70% of total host memory; apply this value to innodb_buffer_pool_size in the generated config as needed

# Download the configuration template
sudo curl -sSL -o /tmp/my.cnf.tpl https://gitee.com/wl4g/blogs/raw/master/docs/articles/database/hosts-mysql-mgr-deployment/resources/my.cnf.tpl

# Generate the configuration for the 3 nodes (*** adjust to your actual environment ***)
export host1='10.0.0.161'
export host2='10.0.0.162'
export host3='10.0.0.163'
export seeds="$host1:23366,$host2:23366,$host3:23366"
sudo cat /tmp/my.cnf.tpl | sed "s/HOST/$host1/g" | sed 's/SERVER_ID/1/g' | sed "s/SEEDS/$seeds/g" > /tmp/my1.cnf
sudo cat /tmp/my.cnf.tpl | sed "s/HOST/$host2/g" | sed 's/SERVER_ID/2/g' | sed "s/SEEDS/$seeds/g" > /tmp/my2.cnf
sudo cat /tmp/my.cnf.tpl | sed "s/HOST/$host3/g" | sed 's/SERVER_ID/3/g' | sed "s/SEEDS/$seeds/g" > /tmp/my3.cnf

# Copy to each node
sudo cp /tmp/my1.cnf /etc/my.cnf
sudo scp /tmp/my2.cnf $host2:/etc/my.cnf
sudo scp /tmp/my3.cnf $host3:/etc/my.cnf

# Set permissions (repeat on node2 and node3 as well)
sudo chmod 755 ${MYSQL_CONF_FILE}
sudo chown ${MYSQL_USER}:${MYSQL_GROUP} ${MYSQL_CONF_FILE}
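
As a quick sanity check of the generated files (a sketch; the exact option names depend on the template, so adjust the pattern if needed):

# The per-node values should differ across the three files
grep -E 'server.id|local_address|report_host' /tmp/my1.cnf /tmp/my2.cnf /tmp/my3.cnf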


4. Initialization (can be copied and run as-is)

  • Run on node1, node2, and node3 respectively
Initializing MySQL metadata

. /etc/profile # Load the environment

# (Recommended, optional) Stop mysqld first if it is already running (avoids errors on repeated runs)
systemctl stop mysqld
# The data directory must be empty or initialization will fail. [For general safety, only mv is used here rather than rm]
trashTmpDir="/tmp/removed_mysql_data_$(date +%Y%m%d%H%M%S)"
mkdir -p $trashTmpDir
mv ${MYSQL_DATA_DIR}/* $trashTmpDir

# Switch to the mysql user
su $MYSQL_USER

# Load the environment (again, in the new shell)
. /etc/profile

# Run the initialization
${MYSQL_HOME}/bin/mysqld \
--defaults-file=${MYSQL_CONF_FILE} \
--initialize \
--user=${MYSQL_USER} \
--basedir=${MYSQL_HOME} \
--datadir=${MYSQL_DATA_DIR} \
--plugin-dir=${MYSQL_PLUGIN_DIR} > "$MYSQL_LOG_ERR" 2>&1

# [IMPORTANT] Check the log for the temporary password generated by the initialization
cat ${MYSQL_LOG_ERR}
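
# Or extract just the password line (MySQL 5.7 logs it as
# "A temporary password is generated for root@localhost: ...")
grep 'temporary password' ${MYSQL_LOG_ERR}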

# Return to the admin account (guarded so repeated runs do not exit the wrong shell)
[ "$USER" == "$MYSQL_USER" ] && exit


5. Service Configuration (can be copied and run as-is)

  • Run on node1, node2, and node3 respectively

  • mysqld.sh

/etc/init.d/mysqld.sh

sudo curl -sSL -o /etc/init.d/mysqld.sh https://gitee.com/wl4g/blogs/raw/master/docs/articles/database/hosts-mysql-mgr-deployment/resources/mysqld.sh

# Set permissions
sudo chmod -R 755 /etc/init.d/mysqld.sh
sudo chown -R mysql:mysql /etc/init.d/mysqld.sh


  • mysqld.service
/etc/systemd/system/mysqld.service

# Download the service unit
sudo curl -sSL -o /etc/systemd/system/mysqld.service https://gitee.com/wl4g/blogs/raw/master/docs/articles/database/hosts-mysql-mgr-deployment/resources/mysqld.service

# Start on each node
sudo systemctl daemon-reload
sudo systemctl enable mysqld
sudo systemctl restart mysqld
sudo systemctl status mysqld
sudo journalctl -afu mysqld
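
For reference only, a unit of this kind typically looks roughly like the sketch below, assuming it simply wraps the init script installed above; the downloaded mysqld.service is authoritative.

[Unit]
Description=MySQL Server
After=network.target

[Service]
Type=forking
ExecStart=/etc/init.d/mysqld.sh start
ExecStop=/etc/init.d/mysqld.sh stop
Restart=on-failure
LimitNOFILE=65535

[Install]
WantedBy=multi-user.target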


6. Allow Remote Access & Change the Password (cannot be copied and run as-is; reset the password using the temporary password generated in the previous step)

  • Run on node1, node2, and node3 respectively

  • The password must be changed after the first initialization

$MYSQL_HOME/bin/mysqladmin -u root -p password --socket=/mnt/disk1/mysql/mysqld.sock
  • Connect locally
$MYSQL_HOME/bin/mysql -S $MYSQL_SOCKET -uroot -p
Enter password: <enter the temporary password generated during initialization>
  • Change the password
-- Change the password, method 1 (right after initialization the account is root@'localhost'; use that host until it is changed to '%')
mysql> alter user root@'%' identified by '123456';
-- Change the password, method 2 (MySQL 5.7)
mysql> set password for root@'%'=password('123456');
-- Note: MySQL 8.x must explicitly specify the mysql_native_password auth plugin
mysql> alter user root@'%' identified with mysql_native_password by '123456';
-- Allow remote login, method 1
mysql> grant all privileges on *.* to root@'%' identified by '123456';
-- Allow remote login, method 2
mysql> use mysql; update user set host='%' where user='root' and host='localhost';
-- Reload the privilege tables
mysql> flush privileges;
  • User management (use root to create a user and schema, then grant privileges)
-- Create the user and database.
mysql> create schema testdb;
mysql> create user test@'%' identified by '123456';

-- [IMPORTANT]: root must first be granted the privilege to run GRANT before it can grant to other users, otherwise: ERROR 1044 (42000): Access denied for user 'root'@'%' to database 'testdb'

mysql> update mysql.user set Grant_priv='Y',Super_priv='Y' where user = 'root' and host = '%';
mysql> flush privileges;
# A restart is also required.
sudo systemctl restart mysqld
# Log back in as root
$MYSQL_HOME/bin/mysql -S $MYSQL_SOCKET -uroot -p'123456'
-- Now grant to the other user.
mysql> grant all privileges on testdb.* to 'test'@'%' identified by '123456' with grant option;
  • Verify by logging in again
$MYSQL_HOME/bin/mysql -h10.0.0.161 -utest -p'123456'
-- Show the databases visible to this user
mysql> show databases;
+--------------------+
| Database           |
+--------------------+
| information_schema |
| testdb             |
+--------------------+
2 rows in set (0.00 sec)

7. Configure MGR

# (Optional) Download the mgrctl helper script
sudo curl -sSL -o /bin/mgrctl https://gitee.com/wl4g/blogs/raw/master/docs/articles/database/docker-mysql-mgr-simple-deployment/resources/mgrctl.sh

7.1 node1 (start first, the bootstrap node)

Initializing the primary node

-- Install the plugin (not installed by default on MySQL 8+)
INSTALL PLUGIN group_replication SONAME 'group_replication.so';

-- Reset the replication configuration (optional; only if a previously created configuration needs to be reset)
RESET MASTER;
RESET SLAVE ALL;

-- Disable super read-only (after MGR starts, secondaries are set to read-only automatically)
SET GLOBAL SUPER_READ_ONLY = OFF;
SET GLOBAL READ_ONLY = OFF;

-- Create the replication account so the other nodes can connect
SET SQL_LOG_BIN=0;
GRANT REPLICATION SLAVE ON *.* TO repl@'%'  IDENTIFIED BY '123456';
SET SQL_LOG_BIN=1;

-- Set the credentials used to connect to the other nodes (recovery channel)
CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='123456' FOR CHANNEL 'group_replication_recovery';

-- Only node1 needs to execute this (bootstrap the group)
SET GLOBAL group_replication_bootstrap_group=ON;
START GROUP_REPLICATION;
SET GLOBAL group_replication_bootstrap_group=OFF;
-- STOP GROUP_REPLICATION;

-- Check the cluster membership status
select * from performance_schema.replication_group_members;
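
To confirm the plugin actually loaded (useful before START GROUP_REPLICATION or when troubleshooting), a quick check against information_schema:

SELECT PLUGIN_NAME, PLUGIN_STATUS
  FROM information_schema.PLUGINS
 WHERE PLUGIN_NAME = 'group_replication';  -- should report ACTIVE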


7.2. node2 and node3 (member nodes)

Initializing the standby nodes

-- Install the plugin (not installed by default on MySQL 8+)
INSTALL PLUGIN group_replication SONAME 'group_replication.so';

-- Reset the replication configuration (optional; only if a previously created configuration needs to be reset)
RESET MASTER;
RESET SLAVE ALL;

-- Disable super read-only (after MGR starts, secondaries are set to read-only automatically)
SET GLOBAL SUPER_READ_ONLY = OFF;
SET GLOBAL READ_ONLY = OFF;

-- Create the replication account so the other nodes can connect
SET SQL_LOG_BIN=0;
GRANT REPLICATION SLAVE ON *.* TO repl@'%'  IDENTIFIED BY '123456';
SET SQL_LOG_BIN=1;

-- Set the credentials used to connect to the other nodes (recovery channel)
CHANGE MASTER TO MASTER_USER='repl', MASTER_PASSWORD='123456' FOR CHANNEL 'group_replication_recovery';  

-- All nodes except node1 need this setting
SET GLOBAL group_replication_allow_local_disjoint_gtids_join=ON; # deprecated/removed in newer versions
SET GLOBAL group_replication_bootstrap_group=OFF;
START GROUP_REPLICATION;
-- STOP GROUP_REPLICATION;

-- Re-enable super read-only (in case MGR did not automatically set this secondary to read-only after startup)
SET GLOBAL SUPER_READ_ONLY = ON;
SET GLOBAL READ_ONLY = ON;

-- Check the cluster membership status
select * from performance_schema.replication_group_members;
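
While a member is joining, you can watch its state move from RECOVERING to ONLINE and inspect the applier workers for errors (standard performance_schema tables):

SELECT MEMBER_HOST, MEMBER_STATE FROM performance_schema.replication_group_members;

-- If a member stays in RECOVERING, look for applier errors
SELECT CHANNEL_NAME, LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
  FROM performance_schema.replication_applier_status_by_worker;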


7.3. Query the Primary/Secondary Status

SELECT
    rgm.CHANNEL_NAME AS channelName,
    rgm.MEMBER_ID AS nodeId,
    rgm.MEMBER_HOST AS nodeHost,
    rgm.MEMBER_PORT AS nodePort,
    rgm.MEMBER_STATE AS nodeState,
    @@read_only AS readOnly,
    @@super_read_only AS superReadOnly,(
    CASE (SELECT TRIM(VARIABLE_VALUE) FROM `performance_schema`.`global_status` WHERE VARIABLE_NAME = 'group_replication_primary_member')
      WHEN '' THEN 'UNKNOWN'
      WHEN rgm.MEMBER_ID THEN 'PRIMARY'
      ELSE 'STANDBY' END
    ) AS nodeRole
FROM
    `performance_schema`.`replication_group_members` rgm;
  • Sample output
+----------------------------+--------------------------------------+----------+----------+-----------+----------+---------------+----------+
| channelName                | nodeId                               | nodeHost | nodePort | nodeState | readOnly | superReadOnly | nodeRole |
+----------------------------+--------------------------------------+----------+----------+-----------+----------+---------------+----------+
| group_replication_applier  | 290cfb2f-4123-11ec-a209-00163e0bdf1e | node1    |    13306 | ONLINE    |        1 |             1 | STANDBY  |
| group_replication_applier  | 7a9dd78b-4122-11ec-ac63-00163e0c24a7 | node2    |    13306 | ONLINE    |        1 |             1 | STANDBY  |
| group_replication_applier  | df6b5139-4121-11ec-bb72-00163e11baa2 | node3    |    13306 | ONLINE    |        1 |             1 | PRIMARY  |
+----------------------------+--------------------------------------+----------+----------+-----------+----------+---------------+----------+
3 rows in set (0.00 sec)

7.4. Other Useful Commands

SET GLOBAL read_only=0; -- 1: on / 0: off. E.g. in multi-primary mode, STOP GROUP_REPLICATION automatically sets this node to ON, and starting it sets it back to OFF
FLUSH PRIVILEGES;
SHOW VARIABLES like 'read_only';
SHOW VARIABLES like '%slow_query_log%';
SHOW VARIABLES like '%long_query_time%';
SHOW VARIABLES like '%log_queries_not_using_indexes%';

-- Slow query logging
SET GLOBAL slow_query_log=on;
SET GLOBAL long_query_time=0.1;
SET GLOBAL log_queries_not_using_indexes=0;

8. Data Backup

Note: adjust the database address, account, password, etc. in the scripts below to your own environment.

  • Create the backup account
create user `wordpress_backup`@`127.0.0.1` identified with mysql_native_password by '123456';
grant Lock Tables, Select ON `wordpress`.* TO `wordpress_backup`@`127.0.0.1`;
grant Reload ON *.* TO `wordpress_backup`@`127.0.0.1`;
/usr/lib/mysql-current/bin/mysql-backup.sh

# Load environment variables
. /etc/profile

# Download the backup script (latest)
sudo curl -sSL -o $MYSQL_HOME/bin/mysql-backup.sh https://gitee.com/wl4g/blogs/raw/master/docs/articles/database/hosts-mysql-mgr-deployment/resources/mysql-backup.sh

# Or download from here (may lag behind)
curl -sSL -o $MYSQL_HOME/bin/mysql-backup.sh http://pkg.wl4g.com/software/mysql/script-tools/mysql-backup.sh
sudo chmod +x ${MYSQL_HOME}/bin/mysql-backup.sh

# Add a crontab entry (tee so the append itself runs with root privileges)
echo "0  *  *  *  * root  /bin/bash ${MYSQL_HOME}/bin/mysql-backup.sh -f" | sudo tee -a /etc/crontab
sudo systemctl restart crond
tail -f /var/log/messages
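
The backup behaviour itself is defined by mysql-backup.sh; as a rough illustration of the kind of dump such a script performs (a sketch only, using the backup account created above and a hypothetical backup directory):

# Illustrative manual backup (not the actual mysql-backup.sh logic)
. /etc/profile
backupDir=/tmp/mysql-backup          # hypothetical backup location
mkdir -p ${backupDir}
${MYSQL_HOME}/bin/mysqldump \
  --host=127.0.0.1 --user=wordpress_backup --password='123456' \
  --single-transaction --skip-triggers \
  wordpress > ${backupDir}/wordpress_$(date +%Y%m%d%H%M%S).sql
# --skip-triggers because the backup account above was not granted the TRIGGER privilege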


9. FAQ

9.1 The initial start group_replication; fails to start the MGR plugin

...
2021-04-15T12:55:23.019059Z 3 [ERROR] Plugin group_replication reported: '[GCS] Invalid hostname or IP address ( "127.0.0.1:3306") assigned to the parameter local_node!'
2021-04-15T12:55:23.019091Z 3 [ERROR] Plugin group_replication reported: 'Unable to initialize the group communication engine'
2021-04-15T12:55:23.019102Z 3 [ERROR] Plugin group_replication reported: 'Error on group communication engine initialization'
...

From the log, group_replication_local_address was set to an address the node cannot use ("127.0.0.1:3306"); it must be the node's own real IP with the group-communication port, as described in the note in section 1.

9.2 During operation, a host failure or mis-operation takes some nodes down, and recovery with start group_replication; fails to start the MGR plugin

...
2021-04-17T13:39:57.759602Z 121 [ERROR] Slave SQL for channel 'group_replication_applier': Could not execute Update_rows event on table aabbcc.t_test; Can't find record in 't_test', Error_code: 1032; handler error HA_ERR_KEY_NOT_FOUND, Error_code: 1032
2021-04-17T13:39:57.759627Z 121 [Warning] Slave: Can't find record in 't_test' Error_code: 1032
2021-04-17T13:39:57.759641Z 121 [ERROR] Plugin group_replication reported: 'The applier thread execution was aborted. Unable to process more transactions, this member will now leave the group.'
2021-04-17T13:39:57.759662Z 121 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'FIRST' position 0.
2021-04-17T13:39:57.759674Z 118 [ERROR] Plugin group_replication reported: 'Fatal error during execution on the Applier process of Group Replication. The server will now leave the group.'
2021-04-17T13:39:57.759727Z 118 [ERROR] Plugin group_replication reported: '[GCS] The member is already leaving or joining a group.'
2021-04-17T13:39:57.759757Z 118 [ERROR] Plugin group_replication reported: 'Unable to confirm whether the server has left the group or not. Check performance_schema.replication_group_members to check group membership information.'
2021-04-17T13:39:57.759775Z 118 [Note] Plugin group_replication reported: 'Going to wait for view modification'
2021-04-17T13:39:58.760466Z 0 [ERROR] Plugin group_replication reported: 'There was a previous plugin error while the member joined the group. The member will now exit the group.'
...

From the log, Can't find record in 't_test', Error_code: 1032; handler error HA_ERR_KEY_NOT_FOUND, Error_code: 1032 indicates that the data has diverged between the primaries (or primary and secondary). A fix for reference:

STOP GROUP_REPLICATION; -- Stop MGR on the problematic node
RESET SLAVE ALL FOR CHANNEL "group_replication_applier"; -- Reset the replication channel (binlog/relay log inconsistency)
START GROUP_REPLICATION; -- Restart MGR

If that still fails, try stopping MGR replication on all nodes with stop group_replication, then run RESET SLAVE ALL FOR CHANNEL "group_replication_applier" on each, and start them again one by one with start group_replication. Note: use this with caution; before doing it, disconnect all application JDBC connections and back up the data on every node.

Note: on a node that has gone offline, the MGR plugin automatically sets read_only=ON. Never set read_only=OFF manually and modify data there; otherwise, even after the node later recovers, the forcibly modified rows can no longer be updated and you will get error 3101 - Plugin instructed the server to rollback the current transaction.


9.3 Starting MGR node by node from the master, a secondary stays in the RECOVERING state

[ERROR] Slave SQL for channel 'group_replication_recovery': Error 'Operation CREATE USER failed for 'piriineos'@'localhost'' on query. Default database: ''. Query: 'CREATE USER 'piriineos'@'localhost' IDENTIFIED WITH 'mysql_native_password' AS '*077BFD72D9E814194FBA9A90A6A8DB13BC476718'', Error_code: 1396

The slave applied a transaction in which the master modified system tables: here, a CREATE USER was executed on the master without set sql_log_bin=0; first (any statement you do not want replicated must be wrapped with it). The fix is to add that transaction's GTID to the executed/purged set so it is skipped.
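
One common way to mark a single transaction as already applied is to commit an empty transaction under the offending GTID on the stuck member; a minimal sketch, with a placeholder GTID that must be taken from the logs:

STOP GROUP_REPLICATION;
SET GLOBAL SUPER_READ_ONLY = OFF;
SET GTID_NEXT = 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee:123';  -- placeholder: the offending transaction's GTID
BEGIN; COMMIT;   -- the empty transaction adds that GTID to this node's executed set
SET GTID_NEXT = 'AUTOMATIC';
START GROUP_REPLICATION;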

Resetting Group Replication

-- To rebuild MGR replication, first stop all nodes
STOP GROUP_REPLICATION; -- Mind the order: stop the secondaries first and the primary last; start in the reverse order, primary first
-- Reset the primary
-- Effects:
-- Deletes all binlog files listed in the binlog index file;
-- Empties the binlog index file;
-- Creates a new binlog file;
-- Clears the gtid_purged and gtid_executed system variables;
-- In MySQL 5.7.5 and later, RESET MASTER also empties the mysql.gtid_executed table (https://dev.mysql.com/doc/refman/5.7/en/replication-gtids-concepts.html#replication-gtids-gtid-executed-table);
RESET MASTER;

-- Reset the secondaries
-- Effects:
-- Clears the recorded master binlog position used for replication;
-- Clears master info and relay log info;
-- Deletes all relay log files and creates a new relay log file;
-- Resets the replication delay (MASTER_DELAY of CHANGE MASTER TO) to 0;
-- Does not change gtid_executed or gtid_purged, nor the replication connection parameters such as master host, port, user, or password;
-- If RESET SLAVE ALL is used instead, CHANGE MASTER TO must be run again to specify the connection parameters;
RESET SLAVE;

-- Then repeat 7.1 and 7.2 to restart MGR on all nodes


9.4 START GROUP_REPLICATION fails with: Error on opening a connection to 192.168.2.1:33061 on local port: 33061.

https://forums.percona.com/t/mysql-group-replication-error-on-opening-a-connection-to-192-168-2-1-33061-on-local-port-33061/8087

mysql> START GROUP_REPLICATION;

[ERROR] Plugin group_replication reported: '[GCS] Error on opening a connection to 192.168.2.1:33061 on local port: 33061.'
[ERROR] Plugin group_replication reported: '[GCS] Error on opening a connection to 192.168.2.2:33061 on local port: 33061.'
[ERROR] Plugin group_replication reported: '[GCS] Error connecting to all peers. Member join failed. Local port: 33061'
[Warning] Plugin group_replication reported: 'read failed'
Error reading relay log event for channel 'group_replication_applier': slave SQL thread was killed

The group was probably not bootstrapped correctly on the bootstrap node; re-do the bootstrap and restart Group Replication:

SET GLOBAL group_replication_bootstrap_group = ON;
START GROUP_REPLICATION;
SET GLOBAL group_replication_bootstrap_group = OFF;

9.5 mysqld fails to start: InnoDB: mmap(136019968 bytes) failed; errno 12

> tail -99f /mnt/disk1/log/mysql/mysqld.err
InnoDB: mmap(136019968 bytes) failed; errno 12
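
errno 12 is ENOMEM: the InnoDB buffer pool could not be allocated. A quick check and workaround sketch (assuming the configuration path used above):

# Check free memory and swap
free -m
# Lower innodb_buffer_pool_size in /etc/my.cnf (or add swap), then restart
sudo systemctl restart mysqld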

9.6 mysqld fails to start: mysqld.service: Main process exited, code=killed, status=9/KILL

  • Logs
> journalctl -u mysqld.service -f
mysqld.service: Main process exited, code=killed, status=9/KILL
Nov 09 10:51:56 k8s-master-node1 systemd[1]: mysqld.service: Failed with result 'signal'.
Nov 09 10:51:56 k8s-master-node1 systemd[1]: mysqld.service: Scheduled restart job, restart counter is at 5.
Nov 09 10:51:56 k8s-master-node1 systemd[1]: Stopped MySQL Server.

> tail -99f /mnt/disk1/log/mysql/mysqld.err
...
2021-11-09T02:58:42.778133Z 0 [Note] InnoDB: Using Linux native AIO
2021-11-09T02:58:42.779655Z 0 [Note] InnoDB: Number of pools: 1
2021-11-09T02:58:42.779802Z 0 [Note] InnoDB: Using CPU crc32 instructions
2021-11-09T02:58:42.782165Z 0 [Note] InnoDB: Initializing buffer pool, total size = 2G, instances = 8, chunk size = 128M
  • Analysis

The error log shows startup hanging at Initializing buffer pool, while the journal shows Main process exited, code=killed, status=9/KILL, i.e. the process was killed. After commenting out innodb_buffer_pool_size in /etc/my.cnf, or lowering it to 512M, and running systemctl restart mysqld.service, startup succeeded every time. The likely cause is that allocating the InnoDB buffer pool hit a system memory limit and the process was killed (this environment was Alibaba Cloud Ubuntu 20.04).
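
One way to confirm the OOM-kill hypothesis (a sketch; kernel log wording varies):

# Look for OOM-killer activity around the failed start
dmesg -T | grep -iE 'out of memory|killed process'
journalctl -k | grep -i oom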

  • Solution

TODO


10. MGR Ongoing Operations and Maintenance
