
    paulwong

    A variant of WordCount…Hadoop

    A counter that tallies domain names (actually hosts).

    Input: a folder containing a pile of text files, one URL per line; each line can be thought of as a record in a database.
    Process: extract the domain from each URL and add 1 to that domain's count.
    Output: domain, domain count.
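
    For example, with a hypothetical input file like this (the URLs are made up for illustration):

    http://www.example.com/page/1
    http://www.example.com/page/2
    http://blog.example.org/post

    the job would output one tab-separated line per domain (TextOutputFormat separates key and value with a tab):

    blog.example.org	1
    www.example.com	2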

    The code is as follows:
    Mapper
    package com.keseek.hadoop;

    import java.io.IOException;
    import java.net.URI;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;

    public class DomainCountMapper implements
            Mapper<LongWritable, Text, Text, LongWritable> {

        @Override
        public void configure(JobConf conf) {
            // Initialize the reusable output key and value once per task.
            domain = new Text();
            one = new LongWritable(1);
        }

        @Override
        public void close() throws IOException {
            // Nothing to clean up.
        }

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            // Each input line is one URL.
            String url = value.toString().trim();

            // URL -> domain, then emit (domain, 1).
            domain.set(ParseDomain(url));
            if (domain.getLength() != 0) {
                output.collect(domain, one);
            }
        }

        public String ParseDomain(String url) {
            try {
                String host = URI.create(url).getHost();
                // getHost() returns null when the URL has no authority part;
                // map null to "" so such lines are skipped by map().
                return host == null ? "" : host;
            } catch (Exception e) {
                return "";
            }
        }

        // Text and LongWritable instances are reused across map() calls
        // to avoid allocating a new object per record.
        private Text domain;

        private LongWritable one;

    }
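
    As a quick sanity check on the parsing logic, here is a minimal standalone sketch (not part of the job; the sample URLs are invented for illustration) showing what URI.getHost() returns for a few kinds of input:

    import java.net.URI;

    public class ParseDomainDemo {
        public static void main(String[] args) {
            String[] samples = {
                "http://www.example.com/page/1",  // host: www.example.com
                "https://blog.example.org",       // host: blog.example.org
                "not a url",                      // invalid -> exception -> ""
                "mailto:user@example.com"         // no authority -> getHost() is null
            };
            for (String s : samples) {
                try {
                    System.out.println(s + " -> " + URI.create(s).getHost());
                } catch (Exception e) {
                    System.out.println(s + " -> (unparseable)");
                }
            }
        }
    }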

    Reducer

    package com.keseek.hadoop;

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;

    public class DomainCountReducer implements
            Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        public void configure(JobConf conf) {
            // Nothing to configure.
        }

        @Override
        public void close() throws IOException {
            // Nothing to clean up.
        }

        @Override
        public void reduce(Text key, Iterator<LongWritable> values,
                OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            // Sum all the counts for this domain.
            long cnt = 0;
            while (values.hasNext()) {
                cnt += values.next().get();
            }

            // Emit (domain, total count).
            output.collect(key, new LongWritable(cnt));
        }

    }
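
    Because the reduce step is a plain sum, which is associative and commutative, the same class can double as the combiner (Main below registers it via setCombinerClass). For example, if one map task emits (www.example.com, 1) three times and another emits it twice, the combiners pre-aggregate these to (www.example.com, 3) and (www.example.com, 2), and the reducer only has to add two values to produce (www.example.com, 5). The domain here is invented for illustration.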

    Main

    package com.keseek.hadoop;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.RunningJob;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;

    public class DomainCountMain {

        public static void main(String[] args) throws Exception {
            // Input and output paths come from the command line.
            if (args.length != 2) {
                System.out.println("Usage:");
                System.out.println("DomainCountMain.jar <Input_Path> <Output_Path>");
                System.exit(-1);
            }

            // Configure the job.
            JobConf jobconf = new JobConf(DomainCountMain.class);

            jobconf.setJobName("Domain Counter by Coder4");

            FileInputFormat.setInputPaths(jobconf, new Path(args[0]));
            FileOutputFormat.setOutputPath(jobconf, new Path(args[1]));

            jobconf.setInputFormat(TextInputFormat.class);
            jobconf.setOutputFormat(TextOutputFormat.class);

            jobconf.setMapperClass(DomainCountMapper.class);
            jobconf.setReducerClass(DomainCountReducer.class);
            jobconf.setCombinerClass(DomainCountReducer.class);

            jobconf.setMapOutputKeyClass(Text.class);
            jobconf.setMapOutputValueClass(LongWritable.class);
            jobconf.setOutputKeyClass(Text.class);
            jobconf.setOutputValueClass(LongWritable.class);

            // Run the job; JobClient.runJob() blocks until it finishes.
            RunningJob run = JobClient.runJob(jobconf);
            if (run.isSuccessful()) {
                System.out.println("<<<DomainCount Main>>> success.");
            } else {
                System.out.println("<<<DomainCount Main>>> error.");
            }

        }

    }
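
    To try it out, package the three classes into a jar and submit it with the standard hadoop jar command; the jar name and HDFS paths below are assumptions for illustration:

    hadoop jar domaincount.jar com.keseek.hadoop.DomainCountMain /input/urls /output/domaincount

    The per-domain counts land in the part files (e.g. part-00000) under the output directory, which must not already exist when the job starts.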

    posted on 2012-09-08 15:30 by paulwong, filed under: HADOOP / cloud computing
