Hadoop之日志分析

环境说明

名称 | 版本 | 备注
宿主系统 | Win7【64位】 |
VMware | 12 |
虚拟机镜像 | CentOS-6.5-x86_64-minimal.iso | 下载地址(不同版本):http://vault.centos.org/
jdk | jdk-8u65-linux-x64.tar.gz | linux版
hadoop | hadoop-2.6.0-cdh5.7.0.tar.gz | linux版

软件安装路径

软件名称 | 路径
jdk | /software/jdk/jdk8
hadoop | /software/hadoop/hadoop

主机名称

  • centos01

主机网卡连接方式

  • NAT 静态固定IP
  • IP:192.168.66.66

免密登录

  • 已经免密
  • 免密步骤
    1. ssh-keygen
    2. cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

环境关键配置文件

  • /etc/profile

     

    # JDK: install location, binaries on PATH, and the classic tool jars on CLASSPATH.
    JAVA_HOME=/software/jdk/jdk8
    PATH=$JAVA_HOME/bin:$PATH
    # Both jars live under $JAVA_HOME/lib (the original "lb" directory does not exist).
    CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
    export JAVA_HOME PATH CLASSPATH

    # Hadoop: install location, with bin/ (client commands) and sbin/ (start/stop scripts) on PATH.
    export HADOOP_HOME=/software/hadoop/hadoop
    export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
    
  • /software/hadoop/hadoop/etc/hadoop/hadoop-env.sh【将export JAVA_HOME=${JAVA_HOME}所在的地方进行修改】

     

    。。。
    export JAVA_HOME=/software/jdk/jdk8
    。。。
    
  • /software/hadoop/hadoop/etc/hadoop/core-site.xml

     

    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>

        <!-- HDFS address that client code uses to reach the file system. -->
        <property>
            <name>fs.defaultFS</name>
            <value>hdfs://centos01:8020</value>
        </property>

        <!-- Base directory for Hadoop working data; set explicitly so data is not lost after a restart. -->
        <property>
            <name>hadoop.tmp.dir</name>
            <value>/software/hadoop/tmp</value>
        </property>

    </configuration>
    
  • /software/hadoop/hadoop/etc/hadoop/hdfs-site.xml

     

    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>

        <!-- Number of block replicas; 1 because this is a single-node setup. -->
        <property>
            <name>dfs.replication</name>
            <value>1</value>
        </property>

        <!-- Directory where the NameNode keeps its metadata. -->
        <property>
            <name>dfs.namenode.name.dir</name>
            <value>file:/software/hadoop/tmp/dfs/name</value>
        </property>

        <!-- Directory where the DataNode keeps its blocks.
             The key is dfs.datanode.data.dir; "dfs.namenode.data.dir" is not a
             recognised property and would be silently ignored. -->
        <property>
            <name>dfs.datanode.data.dir</name>
            <value>file:/software/hadoop/tmp/dfs/data</value>
        </property>

    </configuration>
    
  • /software/hadoop/hadoop/etc/hadoop/slaves

     

    centos01
    
  • /software/hadoop/hadoop/etc/hadoop/mapred-site.xml

     

    <?xml version="1.0"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

    <configuration>

        <!-- Framework that executes MapReduce jobs. -->
        <property>
            <name>mapreduce.framework.name</name>
            <value>yarn</value>
        </property>

        <!-- Job history server address. -->
        <property>
            <name>mapreduce.jobhistory.address</name>
            <value>centos01:10020</value>
            <description>MapReduce JobHistory Server IPC host:port</description>
        </property>

        <!-- Job history web page address. -->
        <property>
            <name>mapreduce.jobhistory.webapp.address</name>
            <value>centos01:19888</value>
            <description>MapReduce JobHistory Server Web UI host:port</description>
        </property>

        <!-- Where finished job history is stored (a path on HDFS). -->
        <property>
            <name>mapreduce.jobhistory.done-dir</name>
            <value>/history/done</value>
        </property>

        <!-- Where intermediate job history is stored (a path on HDFS). -->
        <property>
            <name>mapreduce.jobhistory.intermediate-done-dir</name>
            <value>/history/done_intermediate</value>
        </property>

    </configuration>
    
  • /software/hadoop/hadoop/etc/hadoop/yarn-site.xml

     

    <?xml version="1.0"?>
    <configuration>

        <!-- Auxiliary service run by the NodeManager (the MapReduce shuffle). -->
        <property>
            <name>yarn.nodemanager.aux-services</name>
            <value>mapreduce_shuffle</value>
        </property>

        <!-- Turn on log aggregation. -->
        <property>
            <name>yarn.log-aggregation-enable</name>
            <value>true</value>
        </property>

    </configuration>
    

日志信息

  • 模拟数据8_access.log

     

    182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/userdynamic HTTP/1.1" 200 19501 "www.imooc.com" "-" cid=0&timestamp=1478707261847&uid=2871142&touid=2871142&page=1&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=3837a5bf27ea718fe18bda6c53fbbc14 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.136.65:80 200 0.195 0.195
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    114.248.161.26 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getcourseintro HTTP/1.1" 200 2510 "www.imooc.com" "-" cid=283&secrect=86b720f312c2b25da3b20e59e7c89780&timestamp=1478707261951&token=4c144b3f4314178b9527d1e91ecc0fac&uid=3372975 "mukewang/5.0.2 (iPhone; iOS 8.4.1; Scale/2.00)" "-" 10.100.136.65:80 200 0.007 0.008
    120.52.94.105 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getmediainfo_ver2 HTTP/1.1" 200 633 "www.imooc.com" "-" cid=608&secrect=e25994750eb2bbc7ade1a36708b999a5&timestamp=1478707261945&token=9bbdba949aec02735e59e0868b538e19&uid=4203162 "mukewang/5.0.2 (iPhone; iOS 10.0.1; Scale/3.00)" "-" 10.100.136.65:80 200 0.049 0.049
    10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
    112.10.136.45 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.006 0.006
    

idea项目

  • maven配置文件

     

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>

        <groupId>com.peng</groupId>
        <artifactId>hdfstest</artifactId>
        <version>1.0-SNAPSHOT</version>

        <build>
            <plugins>
                <!-- Compile for Java 8, matching the JDK installed on the cluster node. -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <configuration>
                        <source>1.8</source>
                        <target>1.8</target>
                    </configuration>
                </plugin>
            </plugins>
        </build>


        <repositories>
            <!-- Maven Central must be reached over HTTPS; plain HTTP requests are rejected. -->
            <repository>
                <id>repo</id>
                <url>https://repo1.maven.org/maven2/</url>
            </repository>
            <!-- Cloudera repository, needed for the "-cdh" Hadoop artifacts. -->
            <repository>
                <id>cloudera</id>
                <url>https://repository.cloudera.com/content/repositories/releases/</url>
            </repository>
        </repositories>


        <dependencies>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.6.0-cdh5.7.0</version>
            </dependency>

            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.10</version>
                <scope>test</scope>
            </dependency>

            <dependency>
                <groupId>org.mortbay.jetty</groupId>
                <artifactId>jetty</artifactId>
                <version>6.1.26</version>
            </dependency>

            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
                <version>1.7.25</version>
                <scope>test</scope>
            </dependency>

        </dependencies>

    </project>
    
  • 主程序

    • useragent解析https://github.com/LeeKemp/UserAgentParser
    • LogTest.java

       

      package com.peng.logtest;
      
      import com.peng.utils.position.PositionUtils;
      import com.peng.utils.useragent.UserAgent;
      import com.peng.utils.useragent.UserAgentParser;
      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.fs.FileSystem;
      import org.apache.hadoop.fs.Path;
      import org.apache.hadoop.io.LongWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Job;
      import org.apache.hadoop.mapreduce.Mapper;
      import org.apache.hadoop.mapreduce.Reducer;
      import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
      import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
      
      import java.io.IOException;
      
      /**
       * MapReduce job that counts how many access-log lines were produced by each browser.
       *
       * <p>Usage: {@code LogTest <input path> <output path>}. The output path is deleted
       * first if it already exists, so the job can be re-run without manual cleanup.
       */
      public class LogTest {

          /**
           * Configures and submits the job, then exits with 0 on success and 1 on failure.
           *
           * @param args {@code args[0]} is the input path, {@code args[1]} the output path
           * @throws Exception if the file system cannot be reached or the job cannot be submitted
           */
          public static void main(String[] args) throws Exception {

              if (args.length < 2) {
                  // Tell the caller why the job did not start instead of exiting silently.
                  System.err.println("Usage: LogTest <input path> <output path>");
                  System.exit(1);
              }

              Configuration configuration = new Configuration();

              // Heap size for the map-side JVMs.
              configuration.set("mapreduce.admin.map.child.java.opts", "-Xmx1024m");
              configuration.set("mapred.map.child.java.opts", "-Xmx1024m");

              // Heap size for the reduce-side JVMs.
              configuration.set("mapreduce.admin.reduce.child.java.opts", "-Xmx1024m");
              configuration.set("mapred.reduce.child.java.opts", "-Xmx1024m");

              Path inFilePath = new Path(args[0]);
              Path outFilePath = new Path(args[1]);

              // Resolve the file system from the output path itself, so a fully qualified
              // URI (e.g. hdfs://host:port/...) works even when it is not the default
              // file system of this configuration.
              FileSystem fileSystem = outFilePath.getFileSystem(configuration);

              // Remove a previous result, if any.
              if (fileSystem.exists(outFilePath)) {
                  fileSystem.delete(outFilePath, true);
              }

              Job job = Job.getInstance(configuration, "logtest");
              job.setJarByClass(LogTest.class);
              FileInputFormat.setInputPaths(job, inFilePath);

              // Map side.
              job.setMapperClass(LogTest.MyMapper.class);
              job.setMapOutputKeyClass(Text.class);
              job.setMapOutputValueClass(LongWritable.class);

              // Summing is associative and commutative and the reducer's input and output
              // types are identical, so the reducer doubles as a combiner to cut shuffle traffic.
              job.setCombinerClass(LogTest.MyReduce.class);

              // Reduce side.
              job.setReducerClass(LogTest.MyReduce.class);
              job.setOutputKeyClass(Text.class);
              job.setOutputValueClass(LongWritable.class);

              FileOutputFormat.setOutputPath(job, outFilePath);

              System.exit(job.waitForCompletion(true) ? 0 : 1);

          }

          /**
           * Emits {@code (browser, 1)} for every log line whose user-agent field can be parsed.
           */
          public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
              UserAgentParser userAgentParser = null;
              LongWritable one = null;
              // Reused for every record so that no new Text is allocated per line.
              private final Text browserKey = new Text();

              /** Creates the parser and the constant count once per task. */
              @Override
              protected void setup(Context context) throws IOException, InterruptedException {
                  if (userAgentParser == null) {
                      userAgentParser = new UserAgentParser();
                  }

                  if (one == null) {
                      one = new LongWritable(1);
                  }
              }

              /**
               * Extracts the text between the 7th and 8th double quote of the line, parses it
               * as a user agent and writes the browser name with a count of one.
               */
              @Override
              protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                  String line = value.toString();

                  // NOTE(review): PositionUtils.getCharacterPosition is assumed to return the
                  // index of the n-th occurrence of the given string - confirm against its source.
                  int startIndex = PositionUtils.getCharacterPosition(line, "\"", 7);
                  int endIndex = PositionUtils.getCharacterPosition(line, "\"", 8);

                  // Skip lines without a complete, non-empty quoted field. An end index of 0
                  // was already skipped by the original code; the extra range checks keep
                  // substring() from throwing on malformed lines.
                  if (endIndex == 0 || startIndex < 0 || endIndex <= startIndex + 1) {
                      return;
                  }

                  String userAgentData = line.substring(startIndex + 1, endIndex);
                  UserAgent userAgent = userAgentParser.parse(userAgentData);
                  if (userAgent == null || userAgent.getBrowser() == null) {
                      return;
                  }

                  browserKey.set(userAgent.getBrowser());
                  context.write(browserKey, one);
              }
          }

          /**
           * Sums the counts received for one browser and writes {@code (browser, total)}.
           */
          public static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
              @Override
              protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
                  // Total number of occurrences of this browser.
                  long sum = 0;
                  for (LongWritable value : values) {
                      sum += value.get();
                  }
                  context.write(key, new LongWritable(sum));
              }
          }

      }
      

测试过程

  1. 初始化hdfs【一次就好】
    • 在目录/software/hadoop/hadoop/bin下
    • 执行hadoop namenode -format
  2. 开启hadoop服务
    • 在目录/software/hadoop/hadoop/sbin下
    • 执行sh start-all.sh
  3. 打包主程序为jar包,并拷入虚拟机/hadoop_lib下
    • 打jar包
    • 拷贝
  4. 拷贝数据8_access.log到虚拟机/hadoop-data下,并上传到hdfs服务的/data下
    • 拷贝
  5. 执行日志分析主程序
    • hadoop jar hdfstest-1.0-SNAPSHOT.jar com.peng.logtest.LogTest hdfs://centos01:8020/data/8_access.log hdfs://centos01:8020/result/logtest

执行结果

补充:执行10000_access.log的结果

Logo

更多推荐