horovod安装及PyCharm
一系统选型操作系统Ubuntu-16.04-desktop-amd64.isoAnacondaAnaconda3-5.2.0-Linux-x86_64.sh 二安装操作系统及基础软件包ubuntu操作系...
·
一系统选型
操作系统 | Ubuntu-16.04-desktop-amd64.iso |
Anaconda | Anaconda3-5.2.0-Linux-x86_64.sh |
|
|
|
|
二安装操作系统及基础软件包
ubuntu操作系统安装
hadoop@ubuntu:~$ sudo su -
[sudo] password for hadoop:
root@ubuntu:~# cd /etc/apt/
root@ubuntu:~# mv sources.list sources.list.backup
更换163源
root@ubuntu:/etc/apt# cat sources.list
deb http://mirrors.163.com/ubuntu/ xenial main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ xenial-security main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ xenial-updates main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ xenial-proposed main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ xenial-backports main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ xenial main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ xenial-security main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ xenial-updates main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ xenial-proposed main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ xenial-backports main restricted universe multiverse
root@ubuntu:/etc/apt#
root@ubuntu:~# apt-get install vim openssh-server -y
root@ubuntu:~# service ssh restart
#这一步不执行
#root@ubuntu:~# sudo apt-get install libopenmpi-dev openmpi-bin openmpi-doc
root@ubuntu:~# apt-get install mpich -y
root@ubuntu:~# which mpicc
/usr/bin/mpicc
root@ubuntu:~# which mpiexec
/usr/bin/mpiexec
hadoop@ubuntu:~$ bash Anaconda3-5.2.0-Linux-x86_64.sh
hadoop@ubuntu:~$ source .bashrc
hadoop@ubuntu:~$ conda install tensorflow
hadoop@ubuntu:~$ pip install horovod
hadoop@ubuntu:~$ pip install msgpack
hadoop@ubuntu:~$ conda install mpi4py
hadoop@ubuntu:~$ pip install --upgrade h5py
二centos环境安装
[root@localhost ~]# yum install mpich mpich-devel mpich-doc -y
yum install bzip2 -y
yum install gcc gcc-c++ -y
[root@tfcentos-1 ~]# vi /etc/profile
MPI_ROOT=/usr/lib64/mpich
export PATH=$MPI_ROOT/bin:$PATH
[root@tfcentos-1 ~]# source /etc/profile
[centos@tfcentos-1 ~]$ source /etc/profile
[centos@tfcentos-1 ~]$ bash Anaconda3-5.2.0-Linux-x86_64.sh
[centos@tfcentos-1 ~]$ source .bashrc
[centos@tfcentos-1 ~]$ conda install tensorflow -y
[centos@tfcentos-1 ~]$ pip install horovod
[centos@tfcentos-1 ~]$ pip install msgpack
hadoop@ubuntu:~$ python
Python 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56)
[GCC 7.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
/home/hadoop/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
>>> import horovod.tensorflow as hvd
>>> hvd.init()
0
>>>
Centos安装Pycharm
wget https://download.jetbrains.com/python/pycharm-professional-2018.1.tar.gz
[hadoop@master ~]$ tar -xzvf pycharm-professional-2018.1.tar.gz
注册http://idea.lanyus.com 获取注册码
[root@master ~]# cat /etc/hosts
0.0.0.0 account.jetbrains.com
[root@master ~]#
[hadoop@master pycharm-2018.1]$ cd
[hadoop@master ~]$ cd pycharm-2018.1/bin/
[hadoop@master bin]$ ./pycharm.sh
环境变量设置路径
/home/hadoop/anaconda3/bin/python3.6
三 API例子(运行C语言)
hadoop@conda1:~$ cat mpi_hello.c
/* Minimal MPI example: every process reports its rank and the world size. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int world_rank;  /* id of this process within MPI_COMM_WORLD */
    int world_size;  /* total number of processes launched */

    MPI_Init(&argc, &argv);                      /* start the MPI runtime */
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    printf("Hello world from process %d of %d\n", world_rank, world_size);
    MPI_Finalize();                              /* shut MPI down cleanly */
    return 0;
}
hadoop@conda1:~$ mpic
mpic++ mpicc.openmpi mpicxx
mpicc mpichversion mpicxx.mpich
mpicc.mpich mpic++.openmpi mpicxx.openmpi
hadoop@conda1:~$ mpicc mpi_hello.c -o hello
四运行
4.1#在一台机器两个CPU核上进行测试
hadoop@conda1:~$ mpirun -np 2 ./hello
Hello world from process 1 of 2
Hello world from process 0 of 2
4.2在同一个网段中所有机器上运行
在启动MPI时明确指明要使用的网卡或网段
默认情况下,OpenMPI会尝试选择使用所有已启用的网络设备(自环虚拟设备lo/loopback除外),但是不能保证它一定选对。
4.2.1显式指定方法
使用 btl_tcp_if_include 或 btl_tcp_if_exclude 选项启动MPI,来指定使用的或排除的网段设备
hadoop@conda1:~$ mpirun --mca btl_tcp_if_include 192.168.229.0/24 ./hello
五实际例子
hadoop@conda2:~/examples$ pwd
/home/hadoop/examples
hadoop@conda2:~/examples$ cat hello_c.c
/*
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 * University Research and Technology
 * Corporation. All rights reserved.
 * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
 *
 * Sample MPI "hello world" application in C
 */
#include <stdio.h>
#include "mpi.h"

/* Each rank prints its id, the communicator size and the MPI library
 * version string, then a tag line identifying the host binary (conda2). */
int main(int argc, char *argv[])
{
    int my_rank, num_procs, version_len;
    char lib_version[MPI_MAX_LIBRARY_VERSION_STRING];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    MPI_Get_library_version(lib_version, &version_len);
    printf("Hello, world, I am %d of %d, (%s, %d)\n",
           my_rank, num_procs, lib_version, version_len);
    printf("conda2----222222222\n");
    MPI_Finalize();
    return 0;
}
hadoop@conda2:~/examples$
hadoop@conda2:~/examples$ make
hadoop@conda1:~/examples$ pwd
/home/hadoop/examples
hadoop@conda1:~/examples$ cat hello_c.c
/*
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 * University Research and Technology
 * Corporation. All rights reserved.
 * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
 *
 * Sample MPI "hello world" application in C
 */
#include <stdio.h>
#include "mpi.h"

/* Each rank prints its id, the communicator size and the MPI library
 * version string, then a tag line identifying the host binary (conda1). */
int main(int argc, char *argv[])
{
    int my_rank, num_procs, version_len;
    char lib_version[MPI_MAX_LIBRARY_VERSION_STRING];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
    MPI_Get_library_version(lib_version, &version_len);
    printf("Hello, world, I am %d of %d, (%s, %d)\n",
           my_rank, num_procs, lib_version, version_len);
    printf("conda1----11111111111111\n");
    MPI_Finalize();
    return 0;
}
hadoop@conda1:~/examples$
hadoop@conda1:~/examples$ make
hadoop@conda1:~$ cat hostfile
conda1 slots=3
conda2 slots=3
hadoop@conda1:~$ pwd
/home/hadoop
hadoop@conda1:~$
hadoop@conda1:~$ mpiexec --hostfile /home/hadoop/hostfile -np 6 examples/hello_c
Hello, world, I am 2 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda1----11111111111111
Hello, world, I am 1 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda1----11111111111111
Hello, world, I am 0 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda1----11111111111111
Hello, world, I am 5 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda2----222222222
Hello, world, I am 4 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda2----222222222
Hello, world, I am 3 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda2----222222222
hadoop@conda1:~$
六tensorflow
hadoop@conda2:~$ cat test.py
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
from mpi4py import MPI
import tensorflow as tf
# Constant op: a 1x2 matrix (one row, two columns).
m1 = tf.constant([[3,3]])
# Constant op: a 2x1 matrix (two rows, one column).
m2 = tf.constant([[2],[3]])
# Matrix-multiply op fed with m1 and m2; result is 1x1: [[3*2+3*3]] = [[15]].
product = tf.matmul(m1,m2)
# In TF1 graph mode this prints the symbolic Tensor, not the computed value.
print(product)
# Create a session, which launches the default graph.
sess = tf.Session()
# sess.run(product) triggers the 3 ops in the graph and returns the value.
result = sess.run(product)
print(result)
# Release the session's resources.
sess.close()
hadoop@conda2:~$
hadoop@conda1:~$ mpiexec -hosts conda1,conda2 -np 6 python test.py
七tensorflow主机名称写入到配置文件中
hadoop@conda1:~$ cat hostfile1
conda1:2
conda2:2
hadoop@conda1:~$ cat test.py
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
from mpi4py import MPI
import tensorflow as tf
import os
import socket
# Print the hostname so each MPI rank's output line is identifiable.
print (socket.gethostname())
# Silence INFO/WARNING/ERROR logging from the TensorFlow C++ backend.
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
# Constant op: a 1x2 matrix (one row, two columns).
m1 = tf.constant([[3,3]])
# Constant op: a 2x1 matrix (two rows, one column).
m2 = tf.constant([[2],[3]])
# Matrix-multiply op fed with m1 and m2; result is 1x1: [[3*2+3*3]] = [[15]].
product = tf.matmul(m1,m2)
# In TF1 graph mode this prints the symbolic Tensor, not the computed value.
print(product)
# Create a session, which launches the default graph.
sess = tf.Session()
# sess.run(product) triggers the 3 ops in the graph and returns the value.
result = sess.run(product)
print(result)
# Release the session's resources.
sess.close()
hadoop@conda1:~$ mpiexec -f hostfile1 -np 4 python test.py
conda2
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
conda1
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
conda2
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
conda1
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
hadoop@conda1:~$
7.1消除h5py问题/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated.
pip install --upgrade h5py
分布式测试
数据集 MNIST-data-0
代码
hadoop@conda1:~$ cat tensorflow_mnist.py
70 mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
hadoop@conda1:~$ cat hostfile1
conda1:1
conda2:1
hadoop@conda1:~$ mpiexec -f hostfile1 -np 2 python tensorflow_mnist.py
hadoop@conda1:~$ mpiexec -hosts conda1,conda2 -np 2 python tensorflow_mnist.py
更多推荐
已为社区贡献1条内容
所有评论(0)