一系统选型

操作系统

ubuntu-18.04-desktop-amd64.iso（注：后文软件源使用 bionic 代号及 Open MPI 2.1.1，对应 Ubuntu 18.04，而非 16.04/xenial）

Anaconda

Anaconda3-5.2.0-Linux-x86_64.sh

 

 

 

 

二安装操作系统及基础软件包
ubuntu操作系统安装

hadoop@ubuntu:~$ sudo su -
[sudo] password for hadoop: 
root@ubuntu:~# cd /etc/apt/
root@ubuntu:~# mv sources.list sources.list.backup
更换163源
root@ubuntu:/etc/apt# cat sources.list
deb http://mirrors.163.com/ubuntu/ bionic main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ bionic-security main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ bionic-updates main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb http://mirrors.163.com/ubuntu/ bionic-backports main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ bionic main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ bionic-security main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ bionic-updates main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb-src http://mirrors.163.com/ubuntu/ bionic-backports main restricted universe multiverse
root@ubuntu:/etc/apt# 

root@ubuntu:~# apt-get install vim openssh-server -y
root@ubuntu:~# service ssh restart    # Ubuntu 下 openssh-server 的服务名是 ssh 而非 sshd

#这一步不执行
#root@ubuntu:~# sudo apt-get install libopenmpi-dev openmpi-bin openmpi-doc
root@ubuntu:~# apt-get install mpich -y
root@ubuntu:~# which mpicc
/usr/bin/mpicc
root@ubuntu:~# which mpiexec
/usr/bin/mpiexec
hadoop@ubuntu:~$  bash Anaconda3-5.2.0-Linux-x86_64.sh 
hadoop@ubuntu:~$  source .bashrc
hadoop@ubuntu:~$ conda install tensorflow
hadoop@ubuntu:~$ pip install horovod
hadoop@ubuntu:~$ pip install msgpack
hadoop@ubuntu:~$ conda install mpi4py
hadoop@ubuntu:~$ pip install --upgrade h5py


二（备选）CentOS环境安装（与上述Ubuntu环境二选一）
[root@localhost ~]# yum install mpich mpich-devel mpich-doc -y
yum install bzip2 -y
yum install gcc gcc-c++ -y
[root@tfcentos-1 ~]# vi /etc/profile
MPI_ROOT=/usr/lib64/mpich
export PATH=$MPI_ROOT/bin:$PATH
[root@tfcentos-1 ~]# source /etc/profile
[centos@tfcentos-1 ~]$ source /etc/profile
[centos@tfcentos-1 ~]$ bash Anaconda3-5.2.0-Linux-x86_64.sh
[centos@tfcentos-1 ~]$ source .bashrc
[centos@tfcentos-1 ~]$ conda install tensorflow -y
[centos@tfcentos-1 ~]$ pip install horovod
[centos@tfcentos-1 ~]$ pip install msgpack


hadoop@ubuntu:~$ python

Python 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56) 

[GCC 7.2.0] on linux

Type "help", "copyright", "credits" or "license" for more information.

>>> import tensorflow as tf

/home/hadoop/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

  from ._conv import register_converters as _register_converters

>>> import horovod.tensorflow as hvd

>>> hvd.init()
0
>>> 

Centos安装Pycharm

wget https://download.jetbrains.com/python/pycharm-professional-2018.1.tar.gz

[hadoop@master ~]$ tar -xzvf pycharm-professional-2018.1.tar.gz 

注册http://idea.lanyus.com 获取注册码

[root@master ~]# cat /etc/hosts
0.0.0.0 account.jetbrains.com
[root@master ~]# 


[hadoop@master pycharm-2018.1]$ cd
[hadoop@master ~]$ cd pycharm-2018.1/bin/
[hadoop@master bin]$ ./pycharm.sh 

环境变量设置路径

/home/hadoop/anaconda3/bin/python3.6



三 API例子(运行C语言)

hadoop@conda1:~$ cat mpi_hello.c 
/* C Example */
#include <mpi.h>
#include <stdio.h>
 
/* Entry point: every MPI process prints one greeting line with its rank. */
int main (int argc, char* argv[])
{
    int rank, size;
 
    MPI_Init (&argc, &argv);   /* starts MPI; must be called before any other MPI routine */
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);    /* get current process id (0..size-1) */
    MPI_Comm_size (MPI_COMM_WORLD, &size);    /* get number of processes in MPI_COMM_WORLD */
    printf( "Hello world from process %d of %d\n", rank, size );
    MPI_Finalize();   /* shut down MPI; no MPI calls may follow */
    return 0;
}
hadoop@conda1:~$ mpic
mpic++          mpicc.openmpi   mpicxx          
mpicc           mpichversion    mpicxx.mpich    
mpicc.mpich     mpic++.openmpi  mpicxx.openmpi  
hadoop@conda1:~$ mpicc mpi_hello.c -o hello

四运行

4.1 在一台机器的两个CPU核上进行测试
hadoop@conda1:~$ mpirun -np 2 ./hello   
Hello world from process 1 of 2
Hello world from process 0 of 2
4.2在同一个网段中所有机器上运行
    在启动MPI时明确指明要使用的网卡或网段
    默认情况下,OpenMPI会尝试选择使用所有已启用的网络设备(回环虚拟设备lo/loopback除外),但不能保证它一定选对
4.2.1显式指定方法
    使用btl_tcp_if_include 或 btl_tcp_if_exclude 选项启动MPI,来指定使用的或排除的网络设备/网段
    hadoop@conda1:~$ mpirun --mca btl_tcp_if_include 192.168.229.0/24 ./hello

 

五实际例子

hadoop@conda2:~/examples$ pwd
/home/hadoop/examples

hadoop@conda2:~/examples$ cat hello_c.c 
/*
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 *
 * Sample MPI "hello world" application in C
 */

#include <stdio.h>
#include "mpi.h"

/* Entry point: each MPI rank reports its id, the world size, and the MPI
 * library version string, then prints a node-specific marker line so the
 * output of this copy (built on conda2) can be told apart in mixed runs. */
int main(int argc, char* argv[])
{
    int rank, size, len;
    char version[MPI_MAX_LIBRARY_VERSION_STRING];

    MPI_Init(&argc, &argv);                  /* start the MPI runtime */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    /* this process's id (0..size-1) */
    MPI_Comm_size(MPI_COMM_WORLD, &size);    /* total number of processes */
    MPI_Get_library_version(version, &len);  /* fills 'version'; 'len' = string length */
    printf("Hello, world, I am %d of %d, (%s, %d)\n",
           rank, size, version, len);
    printf("conda2----222222222\n");         /* marker: identifies the conda2 binary */
    MPI_Finalize();                          /* shut down MPI; no MPI calls after this */

    return 0;
}
hadoop@conda2:~/examples$ 

hadoop@conda2:~/examples$ make

hadoop@conda1:~/examples$ pwd
/home/hadoop/examples

hadoop@conda1:~/examples$ cat hello_c.c 
/*
 * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2006      Cisco Systems, Inc.  All rights reserved.
 *
 * Sample MPI "hello world" application in C
 */

#include <stdio.h>
#include "mpi.h"

/* Entry point: each MPI rank reports its id, the world size, and the MPI
 * library version string, then prints a node-specific marker line so the
 * output of this copy (built on conda1) can be told apart in mixed runs. */
int main(int argc, char* argv[])
{
    int rank, size, len;
    char version[MPI_MAX_LIBRARY_VERSION_STRING];

    MPI_Init(&argc, &argv);                  /* start the MPI runtime */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    /* this process's id (0..size-1) */
    MPI_Comm_size(MPI_COMM_WORLD, &size);    /* total number of processes */
    MPI_Get_library_version(version, &len);  /* fills 'version'; 'len' = string length */
    printf("Hello, world, I am %d of %d, (%s, %d)\n",
           rank, size, version, len);
    printf("conda1----11111111111111\n");    /* marker: identifies the conda1 binary */
    MPI_Finalize();                          /* shut down MPI; no MPI calls after this */

    return 0;
}
hadoop@conda1:~/examples$ 

hadoop@conda1:~/examples$ make



hadoop@conda1:~$ cat hostfile 
conda1 slots=3
conda2 slots=3
hadoop@conda1:~$ pwd
/home/hadoop
hadoop@conda1:~$ 

hadoop@conda1:~$ mpiexec --hostfile /home/hadoop/hostfile -np 6 examples/hello_c
Hello, world, I am 2 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda1----11111111111111
Hello, world, I am 1 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda1----11111111111111
Hello, world, I am 0 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda1----11111111111111
Hello, world, I am 5 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda2----222222222
Hello, world, I am 4 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda2----222222222
Hello, world, I am 3 of 6, (Open MPI v2.1.1, package: Open MPI buildd@lcy01-amd64-009 Distribution, ident: 2.1.1, repo rev: v2.1.0-100-ga2fdb5b, May 10, 2017, 130)
conda2----222222222
hadoop@conda1:~$ 
六tensorflow
hadoop@conda2:~$ cat test.py
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# Minimal TF1.x graph-mode demo: multiply two constant matrices.
# mpi4py is imported only so the script can be launched under mpiexec.
from mpi4py import MPI
import tensorflow as tf
# Constant op: a 1x2 matrix (one row, two columns).
m1 = tf.constant([[3,3]])

# Constant op: a 2x1 matrix (two rows, one column).
m2 = tf.constant([[2],[3]])

# Matmul op combining m1 and m2; the result is a 1x1 matrix.
product = tf.matmul(m1,m2)

# Prints the symbolic Tensor description, not the value
# (the graph has not been executed yet).
print(product)

# Create a session, which launches the default graph.
sess = tf.Session()
# sess.run(product) triggers the three ops in the graph
# (the two constants and the matmul) and returns the numeric result.
result = sess.run(product)
print(result)
# Release the session's resources.
sess.close()
hadoop@conda2:~$
hadoop@conda1:~$ mpiexec -hosts conda1,conda2 -np 6 python test.py
七tensorflow主机名称写入到配置文件中
hadoop@conda1:~$ cat hostfile1 
conda1:2
conda2:2

hadoop@conda1:~$ cat test.py 
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# Same TF1.x constant-matmul demo as before, extended to print the hostname
# so each MPI rank's output can be attributed to the node it ran on.
from mpi4py import MPI
import tensorflow as tf
import os
import socket
# Print this rank's hostname first, to label the output lines per node.
print (socket.gethostname())
# Quiet TensorFlow's C++ logging ("3" suppresses INFO/WARNING/ERROR messages).
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"
# Constant op: a 1x2 matrix (one row, two columns).
m1 = tf.constant([[3,3]])

# Constant op: a 2x1 matrix (two rows, one column).
m2 = tf.constant([[2],[3]])

# Matmul op combining m1 and m2; the result is a 1x1 matrix.
product = tf.matmul(m1,m2)

# Prints the symbolic Tensor description, not the value
# (the graph has not been executed yet).
print(product)

# Create a session, which launches the default graph.
sess = tf.Session()
# sess.run(product) triggers the three ops in the graph
# (the two constants and the matmul) and returns the numeric result.
result = sess.run(product)
print(result)
# Release the session's resources.
sess.close()
hadoop@conda1:~$ mpiexec -f hostfile1 -np 4 python test.py
conda2
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
conda1
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
conda2
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
conda1
Tensor("MatMul:0", shape=(1, 1), dtype=int32)
[[15]]
hadoop@conda1:~$ 
7.1消除h5py问题/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
 pip install --upgrade h5py
分布式测试
数据集 MNIST-data-0
代码
hadoop@conda1:~$ cat tensorflow_mnist.py
70   mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

hadoop@conda1:~$ cat hostfile1 
conda1:1
conda2:1

hadoop@conda1:~$ mpiexec -f hostfile1 -np 2 python tensorflow_mnist.py

hadoop@conda1:~$  mpiexec -hosts conda1,conda2 -np 2 python tensorflow_mnist.py
Logo

更多推荐