1、下载数据包

wget http://archive.apache.org/dist/hbase/hbase-0.90.4/hbase-0.90.4.tar.gz

wget http://archive.apache.org/dist/hadoop/common/hadoop-0.20.2/hadoop-0.20.2.tar.gz

wget http://mirrors.ustc.edu.cn/apache/nutch/2.2/apache-nutch-2.2-src.tar.gz

wget http://mirror.bit.edu.cn/apache/zookeeper/zookeeper-3.4.5/zookeeper-3.4.5.tar.gz

wget http://mirror.bit.edu.cn/apache/gora/0.3/apache-gora-0.3-src.tar.gz

2、解压:

tar -zxvf hbase-0.90.4.tar.gz

tar -zxvf hadoop-0.20.2.tar.gz

tar -zxvf apache-nutch-2.2-src.tar.gz

tar -zxvf zookeeper-3.4.5.tar.gz

tar -zxvf apache-gora-0.3-src.tar.gz

3、安装所需插件

sudo apt-get install maven2

4、安装hadoop

1)、cd $HADOOP_HOME

2)、mkdir data //用于指定hadoop的hadoop.tmp.dir目录

3)、cd $HADOOP_HOME/conf

4)、vim hadoop-env.sh

//将JAVA_HOME修改为自己的JAVA_HOME路径

export JAVA_HOME=/usr/lib/jvm/java-7-oracle

5)、vim core-site.xml

在configuration标签中添加如下信息

<configuration>
    <property>
       <name>fs.default.name</name>
       <value>hdfs://nutch1:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
       <value>/data/projects/hadoop-0.20.2/data</value>
    </property>
</configuration>

6)、vim hdfs-site.xml

在configuration标签中添加如下信息

<configuration>
    <property>
       <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
       <name>dfs.permissions</name>
        <value>false</value>
    </property>
    <property>
       <name>dfs.datanode.max.xcievers</name>
        <value>4096</value>
    </property>
</configuration>

7)、vim mapred-site.xml

在configuration标签中添加如下信息

<configuration>
    <property>
       <name>mapred.job.tracker</name>
        <value>nutch1:9001</value>
    </property>
</configuration>

8)、vim masters

//设置master

nutch1

9)、vim slaves

//设置slaves

nutch1
nutch2

10)、将hadoop项目通过scp拷贝到其他服务器上

scp -r $HADOOP_HOME hadoop@nutch2:/data/projects

11)、格式化hadoop文件系统

cd $HADOOP_HOME/bin

./hadoop namenode -format

12)、启动hadoop

./start-all.sh

13)、检查hadoop运行情况

使用JAVA中的jps命令查询

Master

hadoop@nutch1:/$ jps

21832 SecondaryNameNode

22031 TaskTracker

25971 Jps

21695 DataNode

21914 JobTracker

21575 NameNode

Slave

hadoop@nutch2:/$ jps

29939 DataNode

30044 TaskTracker

704 Jps

5、配置Zookeeper

1)、创建zoo.cfg配置文件

cd $ZOOKEEPER_HOME/conf

cp zoo_sample.cfg zoo.cfg

2)、配置zoo.cfg配置文件

//修改dataDir
dataDir=$ZOOKEEPER_HOME/data
//添加dataLogDir
dataLogDir=$ZOOKEEPER_HOME/data
//添加服务器信息
server.1=10.68.237.26:2888:3888
server.2=10.68.237.27:2888:3888

3)、创建id文件

cd $ZOOKEEPER_HOME

mkdir data

mkdir log

cd data

vim myid

//在myid文件中添加zoo.cfg中对应的服务器id

//如10.68.237.26服务器中的myid文件。添加信息1

1

4)、通过scp拷贝项目到其他的服务器

scp -r $ZOOKEEPER_HOME hadoop@nutch2:/data/projects/

5)、修改其他服务器上的myid文件

ssh nutch2

cd $ZOOKEEPER_HOME/data

vim myid

//修改myid内容为2

2

6)、分别启动两台服务器的zookeeper服务

nutch1

$ZOOKEEPER_HOME/bin/zkServer.sh start

nutch2

$ZOOKEEPER_HOME/bin/zkServer.sh start

7)、检查zookeeper服务状态

nutch1

hadoop@nutch1:/data/projects/zookeeper-3.4.5/bin$ ./zkServer.sh status

JMX enabled by default

Using config: /data/projects/zookeeper-3.4.5/bin/../conf/zoo.cfg

Mode: follower

nutch2

hadoop@nutch2:/data/projects/zookeeper-3.4.5/bin$ ./zkServer.sh status

JMX enabled by default

Using config: /data/projects/zookeeper-3.4.5/bin/../conf/zoo.cfg

Mode: leader

6、安装hbase

1)、cd $HBASE_HOME/conf

2)、配置hbase-env.sh

vim hbase-env.sh

//将JAVA_HOME修改为自己的JAVA_HOME路径
export JAVA_HOME=/usr/lib/jvm/java-7-oracle
//修改HBASE_MANAGES_ZK为false,这边我们用上面搭建好的zookeeper集群
export HBASE_MANAGES_ZK=false

3)、修改hbase-site.xml,在configuration中添加如下信息

<configuration>
   <property>
        <name>hbase.rootdir</name>
       <value>hdfs://nutch1:9000/hbase</value>
    </property>
    <property>
       <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>
    <property>
       <name>hbase.zookeeper.quorum</name>
        <value>nutch2</value>
    </property>
    <property>
        <name>hbase.zookeeper.session.timeout</name>
        <value>60000</value>
    </property>
    <property>
       <name>hbase.zookeeper.property.clientPort</name>
        <value>2181</value>
    </property>
    <property>
        <name>hbase.master</name>
        <value>nutch1</value>
    </property>
    <property>
       <name>hbase.regionserver.lease.period</name>
        <value>60000</value>
    </property>
    <property>
       <name>hbase.rpc.timeout</name>
        <value>60000</value>
    </property>
    <property>
       <name>hbase.master.maxclockskew</name>
        <value>180000</value>
   </property>
</configuration>

4)、修改regionservers

nutch2

5)、将hbase中的hadoop jar包版本与hadoop集群版本一致

rm $HBASE_HOME/lib/hadoop-*

cp $HADOOP_HOME/hadoop-0.20.2-core.jar $HBASE_HOME/lib

6)、通过scp将hbase工程拷贝到其他服务器上

scp -r $HBASE_HOME hadoop@nutch2:/data/projects

7)、启动hbase

$HBASE_HOME/bin/start-hbase.sh

8)、通过jps命令检查服务是否正常运行

Masters

hadoop@nutch1:/data/projects/hbase-0.90.4/lib$ jps

26394 Jps

21832 SecondaryNameNode

22031 TaskTracker

21695 DataNode

24953 HMaster

21914 JobTracker

24791 QuorumPeerMain

21575 NameNode

Slaves

hadoop@nutch2:~$ jps

29939 DataNode

30044 TaskTracker

32270 QuorumPeerMain

1126 Jps

32493 HRegionServer

7、安装nutch

1)、修改$NUTCH_HOME/ivy/ivy.xml

将被注释掉的<dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default"/>去除注释

2)、修改gora.properties文件

vim $NUTCH_HOME/runtime/local/conf/gora.properties

修改以下内容

#gora.datastore.default=org.apache.gora.mock.store.MockDataStore

修改为

gora.datastore.default=org.apache.gora.hbase.store.HBaseStore

3)、修改nutch-site.xml配置

//在configuration标签中添加如下内容

<configuration>
    <property> 
       <name>http.agent.name</name> 
       <value>test-nutch</value> 
    </property>
    <property> 
       <name>http.robots.agents</name> 
       <value>test-nutch,*</value> 
    </property>
    <property> 
        <name>generate.batch.id</name> 
        <value>1</value> 
    </property>
    <property> 
       <name>http.agent.name.check</name> 
       <value>true</value> 
    </property>
    <property> 
       <name>distributed.search.test.port</name> 
        <value>60000</value> 
        <description>TCP port used during junit testing.</description> 
    </property>
    <property> 
       <name>http.accept.language</name> 
        <value>ja-jp,en-us,en-gb,en;q=0.7,*;q=0.3</value> 
        <description>Value of the "Accept-Language" request header field. 
            This 
            allows selecting non-English language as default one to retrieve. 
            It 
            is a useful setting for search engines build for certain national 
            group. 
        </description> 
    </property>
    <property> 
       <name>parser.character.encoding.default</name> 
        <value>utf-8</value> 
        <description>The character encoding to fall back to when no other 
            information 
            is available 
        </description> 
    </property>
    <property> 
        <name>storage.data.store.class</name> 
       <value>org.apache.gora.hbase.store.HBaseStore</value> 
        <description>The Gora DataStore class for storing and retrieving data. 
            Currently the following stores are available: …. 
        </description> 
    </property>
    <property> 
       <name>hadoop.tmp.dir</name> 
       <value>/data/zqhadoop/data</value> 
        <description>此处设置hadoop根目录</description> 
    </property>
    <property> 
      <name>plugin.folders</name> 
     <value>/home/zqgame/apache-nutch/runtime/local/plugins</value> 
     <description>Directories where nutch plugins are located. Each 
      element may be a relative or absolute path.  If absolute, it is used 
     as is.  If relative, it is searched for on the classpath.</description>  
    </property>
</configuration>

4)、将$HBASE_HOME/conf中的hbase-site.xml拷贝到$NUTCH_HOME/runtime/local/conf目录下覆盖

5)、export JAVA_HOME

export JAVA_HOME=/usr/lib/jvm/java-7-oracle

6)、编译nutch

cd $NUTCH_HOME

ant

7)、替换nutch中对应的hadoop、hbase 的jar包版本

将hadoop-core-0.20.2.jar,hbase-0.90.4.jar  拷贝到$NUTCH_HOME/runtime/local/lib目录下

8)、创建拦截的url文件

//在$NUTCH_HOME/runtime/local/目录中创建一个目录urls,且创建一个存放url路径的文件

mkdir urls

cd urls

vim seed.txt

//输入想要的url

http://nutch.apache.org/

9)、设置正则拦截规则

vim $NUTCH_HOME/runtime/local/conf/regex-urlfilter.txt

//修改

# accept anything else
+.

//为以下内容

# accept anything else
+^http://([a-z0-9]*\.)*nutch.apache.org/

10)、inject任务url到nutch中

bin/nutch inject urls/seed.txt

11)、crawl你的任务

bin/nutch crawl urls -depth 3 -topN 5

12)、执行成功之后可以在hbase看到一个webpage的表,且日志不会报错


 

Logo

权威|前沿|技术|干货|国内首个API全生命周期开发者社区

更多推荐