From 5739eb0de26328cc894212ec6b842abb741d7e61 Mon Sep 17 00:00:00 2001 From: ligang Date: Wed, 22 May 2019 16:16:01 +0800 Subject: [PATCH 1/7] add monitorServerState --- install.sh | 9 +++++++++ script/monitor_server.py | 42 +++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/install.sh b/install.sh index 9023af86be..2186f974d0 100644 --- a/install.sh +++ b/install.sh @@ -98,6 +98,8 @@ xlsFilePath="/tmp/xls" # 不启动设置为false,如果为false,以下配置不需要修改 hdfsStartupSate="false" +#是否启动自启动脚本 +monitorServerState="true" # namenode地址,支持HA,需要将core-site.xml和hdfs-site.xml放到conf目录下 namenodeFs="hdfs://mycluster:8020" @@ -364,3 +366,10 @@ fi # 6,启动 echo "6,启动" sh ${workDir}/script/start_all.sh + +# 7启动自启动脚本 +if [ "true" = $monitorServerState ];then + echo 'start monitor server' + nohup python -u ${workDir}/script/monitor_server.py $installPath $zkQuorum $zkMasters $zkWorkers > ${workDir}/monitor_server.log 2>&1 & +fi + diff --git a/script/monitor_server.py b/script/monitor_server.py index a0d1066661..169b14c269 100644 --- a/script/monitor_server.py +++ b/script/monitor_server.py @@ -10,9 +10,13 @@ pip install kazoo 安装 conda install -c conda-forge kazoo 安装 运行脚本: -nohup python -u monitor_server.py > nohup.out 2>&1 & +/data1_1T/escheduler的值来自install.sh中的installPath +192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181的值来自install.sh中的zkQuorum +/escheduler/masters的值来自install.sh中的zkMasters +/escheduler/workers的值来自install.sh中的zkWorkers +nohup python -u monitor_server.py /data1_1T/escheduler 192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181 /escheduler/masters /escheduler/workers> nohup.out 2>&1 & ''' - +import sys import socket import os import sched @@ -20,14 +24,17 @@ import time from datetime import datetime from kazoo.client import KazooClient - schedule = sched.scheduler(time.time, time.sleep) class ZkClient: def __init__(self): # hosts配置zk地址集群 - self.zk = KazooClient(hosts='ark0:2181,ark1:2181,ark2:2181') - self.zk.start() + #self.zk = KazooClient(hosts='192.168.220.188:2181,192.168.220.189:2181,192.168.220.190:2181') + print zookeepers + #zookeepers1 = zookeepers + self.zk = KazooClient(hosts=zookeepers) + print "ready start" + self.zk.start() # 读取配置文件,组装成字典 def read_file(self,path): @@ -45,35 +52,37 @@ class ZkClient: # 重启服务 def restart_server(self,inc): - config_dict = self.read_file('/data1_1T/escheduler/conf/config/run_config.conf') + config_dict = self.read_file(install_path + '/conf/config/run_config.conf') master_list = config_dict.get('masters').split(',') + print master_list master_list = list(map(lambda item : self.get_ip_by_hostname(item),master_list)) worker_list = config_dict.get('workers').split(',') + print worker_list worker_list = list(map(lambda item: self.get_ip_by_hostname(item), worker_list)) - if (self.zk.exists('/escheduler/masters')): + if (self.zk.exists(masters_zk_path)): zk_master_list = [] - zk_master_nodes = self.zk.get_children('/escheduler/masters') + zk_master_nodes = self.zk.get_children(masters_zk_path) for zk_master_node in zk_master_nodes: zk_master_list.append(zk_master_node.split('_')[0]) restart_master_list = list(set(master_list) - set(zk_master_list)) if (len(restart_master_list) != 0): for master in restart_master_list: print("master " + self.get_ip_by_hostname(master) + " 服务已经掉了") - os.system('ssh ' + self.get_ip_by_hostname(master) + ' sh /data1_1T/escheduler/bin/escheduler-daemon.sh start master-server') + os.system('ssh ' + self.get_ip_by_hostname(master) + ' sh ' + install_path + '/bin/escheduler-daemon.sh start master-server') - if (self.zk.exists('/escheduler/workers')): + if (self.zk.exists(workers_zk_path)): zk_worker_list = [] - zk_worker_nodes = self.zk.get_children('/escheduler/workers') + zk_worker_nodes = self.zk.get_children(workers_zk_path) for zk_worker_node in zk_worker_nodes: zk_worker_list.append(zk_worker_node.split('_')[0]) restart_worker_list = list(set(worker_list) - set(zk_worker_list)) if (len(restart_worker_list) != 0): for worker in restart_worker_list: print("worker " + self.get_ip_by_hostname(worker) + " 服务已经掉了") - os.system('ssh ' + self.get_ip_by_hostname(worker) + ' sh /data1_1T/escheduler/bin/escheduler-daemon.sh start worker-server') + os.system('ssh ' + self.get_ip_by_hostname(worker) + ' sh ' + install_path + '/bin/escheduler-daemon.sh start worker-server') print(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) schedule.enter(inc, 0, self.restart_server, (inc,)) @@ -84,5 +93,12 @@ class ZkClient: schedule.enter(0, 0, self.restart_server, (inc,)) schedule.run() if __name__ == '__main__': + if (len(sys.argv) < 4): + print('please input install_path,zookeepers,masters_zk_path and worker_zk_path') + install_path = sys.argv[1] + #zookeepers = "'" + sys.argv[2] + "'" + zookeepers = sys.argv[2] + masters_zk_path = sys.argv[3] + workers_zk_path = sys.argv[4] zkClient = ZkClient() - zkClient.main(300) \ No newline at end of file + zkClient.main(300) From 39f94615b2c4964f184c6c82508aaa50a3aae586 Mon Sep 17 00:00:00 2001 From: ligang Date: Wed, 22 May 2019 16:19:29 +0800 Subject: [PATCH 2/7] remove annotation --- script/monitor_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/script/monitor_server.py b/script/monitor_server.py index 169b14c269..73f654fd7d 100644 --- a/script/monitor_server.py +++ b/script/monitor_server.py @@ -96,7 +96,6 @@ if __name__ == '__main__': if (len(sys.argv) < 4): print('please input install_path,zookeepers,masters_zk_path and worker_zk_path') install_path = sys.argv[1] - #zookeepers = "'" + sys.argv[2] + "'" zookeepers = sys.argv[2] masters_zk_path = sys.argv[3] workers_zk_path = sys.argv[4] From 9b206778ead611b4bc28780567e9dfb163a86999 Mon Sep 17 00:00:00 2001 From: ligang Date: Wed, 22 May 2019 16:22:34 +0800 Subject: [PATCH 3/7] remove annotation --- script/monitor_server.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/script/monitor_server.py b/script/monitor_server.py index 73f654fd7d..fcb4006b2c 100644 --- a/script/monitor_server.py +++ b/script/monitor_server.py @@ -29,12 +29,8 @@ schedule = sched.scheduler(time.time, time.sleep) class ZkClient: def __init__(self): # hosts配置zk地址集群 - #self.zk = KazooClient(hosts='192.168.220.188:2181,192.168.220.189:2181,192.168.220.190:2181') - print zookeepers - #zookeepers1 = zookeepers - self.zk = KazooClient(hosts=zookeepers) - print "ready start" - self.zk.start() + self.zk = KazooClient(hosts=zookeepers) + self.zk.start() # 读取配置文件,组装成字典 def read_file(self,path): From 1d72cfc5b0c9fa53c4619ee8228a592fdfe71038 Mon Sep 17 00:00:00 2001 From: ligang Date: Thu, 23 May 2019 11:02:42 +0800 Subject: [PATCH 4/7] update install.sh --- install.sh | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/install.sh b/install.sh index 2186f974d0..6c7d140d41 100644 --- a/install.sh +++ b/install.sh @@ -99,7 +99,7 @@ xlsFilePath="/tmp/xls" hdfsStartupSate="false" #是否启动自启动脚本 -monitorServerState="true" +monitorServerState="false" # namenode地址,支持HA,需要将core-site.xml和hdfs-site.xml放到conf目录下 namenodeFs="hdfs://mycluster:8020" @@ -368,8 +368,28 @@ echo "6,启动" sh ${workDir}/script/start_all.sh # 7启动自启动脚本 +monitor_pid=${workDir}/monitor_server.pid if [ "true" = $monitorServerState ];then - echo 'start monitor server' - nohup python -u ${workDir}/script/monitor_server.py $installPath $zkQuorum $zkMasters $zkWorkers > ${workDir}/monitor_server.log 2>&1 & + if [ -f $monitor_pid ]; then + TARGET_PID=`cat $monitor_pid` + if kill -0 $TARGET_PID > /dev/null 2>&1; then + echo "monitor server running as process ${TARGET_PID}.Stopping" + kill $TARGET_PID + sleep 5 + if kill -0 $TARGET_PID > /dev/null 2>&1; then + echo "$command did not stop gracefully after 5 seconds: killing with kill -9" + kill -9 $TARGET_PID + fi + else + echo "no monitor server to stop" + fi + echo "monitor server running as process ${TARGET_PID}.Stopped success" + rm -f $monitor_pid + fi + nohup python -u ${workDir}/script/monitor_server.py $installPath $zkQuorum $zkMasters $zkWorkers > ${workDir}/monitor_server +.log 2>&1 & + echo $! > $monitor_pid + echo "start monitor server success as process `cat $monitor_pid`" + fi From 2ef349372307c161e47afe1d4be650c5cb71cdf8 Mon Sep 17 00:00:00 2001 From: ligang Date: Thu, 23 May 2019 11:05:37 +0800 Subject: [PATCH 5/7] update install.sh --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 6c7d140d41..bf66ef1e3b 100644 --- a/install.sh +++ b/install.sh @@ -377,7 +377,7 @@ if [ "true" = $monitorServerState ];then kill $TARGET_PID sleep 5 if kill -0 $TARGET_PID > /dev/null 2>&1; then - echo "$command did not stop gracefully after 5 seconds: killing with kill -9" + echo "monitor server did not stop gracefully after 5 seconds: killing with kill -9" kill -9 $TARGET_PID fi else From d442acb15bf300c915ec2fadde74f57055650a5e Mon Sep 17 00:00:00 2001 From: ligang Date: Thu, 23 May 2019 11:18:18 +0800 Subject: [PATCH 6/7] update monitor_server.py --- script/monitor_server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/script/monitor_server.py b/script/monitor_server.py index fcb4006b2c..5f236cac7e 100644 --- a/script/monitor_server.py +++ b/script/monitor_server.py @@ -9,12 +9,13 @@ yum -y install python-pip pip install kazoo 安装 conda install -c conda-forge kazoo 安装 -运行脚本: +运行脚本及参数说明: +nohup python -u monitor_server.py /data1_1T/escheduler 192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181 /escheduler/masters /escheduler/workers> monitor_server.log 2>&1 & +参数说明如下: /data1_1T/escheduler的值来自install.sh中的installPath 192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181的值来自install.sh中的zkQuorum /escheduler/masters的值来自install.sh中的zkMasters /escheduler/workers的值来自install.sh中的zkWorkers -nohup python -u monitor_server.py /data1_1T/escheduler 192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181 /escheduler/masters /escheduler/workers> nohup.out 2>&1 & ''' import sys import socket From d40117799d18bdc5b8d034e81a7b0324d46e8c30 Mon Sep 17 00:00:00 2001 From: ligang Date: Thu, 23 May 2019 11:22:25 +0800 Subject: [PATCH 7/7] update annotation --- install.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/install.sh b/install.sh index bf66ef1e3b..e0c1878c37 100644 --- a/install.sh +++ b/install.sh @@ -98,8 +98,9 @@ xlsFilePath="/tmp/xls" # 不启动设置为false,如果为false,以下配置不需要修改 hdfsStartupSate="false" -#是否启动自启动脚本 +#是否启动监控自启动脚本 monitorServerState="false" + # namenode地址,支持HA,需要将core-site.xml和hdfs-site.xml放到conf目录下 namenodeFs="hdfs://mycluster:8020" @@ -367,7 +368,7 @@ fi echo "6,启动" sh ${workDir}/script/start_all.sh -# 7启动自启动脚本 +# 7启动监控自启动脚本 monitor_pid=${workDir}/monitor_server.pid if [ "true" = $monitorServerState ];then if [ -f $monitor_pid ]; then