<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    paulwong

    WordCount的一個變種版本…Hadoop

    統計域名(實際是host)的計數器。

    輸入:一個文件夾中有一堆的文本文件,內容是一行一個的url,可以想像為數據庫中的一條記錄
    流程:提取url的domain,對domain計數+1
    輸出:域名,域名計數

    代碼如下:
    Mapper
    package com.keseek.hadoop;

    import java.io.IOException;
    import java.net.URI;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.Mapper;

    public class DomainCountMapper implements
            Mapper
    <LongWritable, Text, Text, LongWritable> {

        @Override
       
    public void configure(JobConf arg0) {
           
    // Init Text and LongWritable
            domain = new Text();
            one
    = new LongWritable(1);
        }


        @Override
       
    public void close() throws IOException {
           
    // TODO Auto-generated method stub
        }


        @Override
       
    public void map(LongWritable key, Text value,
                OutputCollector
    <Text, LongWritable> output, Reporter reporter)
               
    throws IOException {
           
    // Get URL
            String url = value.toString().trim();

           
    // URL->Domain && Collect
            domain.set(ParseDomain(url));
           
    if (domain.getLength() != 0) {
                output.collect(domain, one);
            }


        }


       
    public String ParseDomain(String url) {
           
    try {
                URI uri
    = URI.create(url);
               
    return uri.getHost();
            }
    catch (Exception e) {
               
    return "";
            }

        }


       
    // Shared used Text domain
        private Text domain;

       
    // One static
        private LongWritable one;

    }

    Reducer

    package com.keseek.hadoop;

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.Reducer;

    public class DomainCountReducer implements
            Reducer
    <Text, LongWritable, Text, LongWritable> {

        @Override
       
    public void configure(JobConf arg0) {
           
    // TODO Auto-generated method stub

        }


        @Override
       
    public void close() throws IOException {
           
    // TODO Auto-generated method stub

        }


        @Override
       
    public void reduce(Text key, Iterator<LongWritable> values,
                OutputCollector
    <Text, LongWritable> output, Reporter reporter)
               
    throws IOException {
           
    // Count the domain
            long cnt = 0;
           
    while (values.hasNext()) {
                cnt
    += values.next().get();
            }

           
    // Output
            output.collect(key, new LongWritable(cnt));
        }


    }

    Main

    package com.keseek.hadoop;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.FileOutputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.RunningJob;
    import org.apache.hadoop.mapred.TextInputFormat;
    import org.apache.hadoop.mapred.TextOutputFormat;

    public class DomainCountMain {
       
    public static void main(String[] args) throws Exception {
           
    // Param for path
            if (args.length != 2) {
                System.out.println(
    "Usage:");
                System.out
                        .println(
    "DomainCountMain.jar  <Input_Path>  <Outpu_Path>");
                System.exit(
    -1);
            }


           
    // Configure JobConf
            JobConf jobconf = new JobConf(DomainCountMain.class);

            jobconf.setJobName(
    "Domain Counter by Coder4");

            FileInputFormat.setInputPaths(jobconf,
    new Path(args[0]));
           FileOutputFormat.setOutputPath(jobconf,
    new Path(args[1]));

            jobconf.setInputFormat(TextInputFormat.
    class);
           jobconf.setOutputFormat(TextOutputFormat.
    class);

            jobconf.setMapperClass(DomainCountMapper.
    class);
            jobconf.setReducerClass(DomainCountReducer.
    class);
           jobconf.setCombinerClass(DomainCountReducer.
    class);

            jobconf.setMapOutputKeyClass(Text.
    class);
            jobconf.setMapOutputValueClass(LongWritable.
    class);
            jobconf.setOutputKeyClass(Text.
    class);
            jobconf.setOutputValueClass(LongWritable.
    class);

           
    // Run job
            RunningJob run = JobClient.runJob(jobconf);
            run.waitForCompletion();
           
    if (run.isSuccessful()) {
                System.out.println(
    "<<<DomainCount Main>>> success.");
            }
    else {
                System.out.println(
    "<<<DomainCount Main>>> error.");
            }

        }

    }

    posted on 2012-09-08 15:30 paulwong 閱讀(270) 評論(0)  編輯  收藏 所屬分類: HADOOP 、云計算

    主站蜘蛛池模板: 免费无码又爽又高潮视频| 一个人看的免费观看日本视频www 一个人看的免费视频www在线高清动漫 | 50岁老女人的毛片免费观看| 丝袜熟女国偷自产中文字幕亚洲| 亚洲成AV人影片在线观看| 成年女人看片免费视频播放器| 亚洲中文字幕乱码AV波多JI| 亚洲免费观看在线视频| 亚洲视频免费在线播放| 亚洲精品免费在线观看| 亚洲精品高清国产一久久| 99热在线观看免费| 亚洲av丰满熟妇在线播放| 无码人妻一区二区三区免费看| 久久精品国产亚洲香蕉| 久久国产免费观看精品| 91天堂素人精品系列全集亚洲| 亚洲美女免费视频| 亚洲色偷偷色噜噜狠狠99网| 国产精品免费一级在线观看| 特级aa**毛片免费观看| 亚洲中文字幕无码久久2017| 最近中文字幕免费大全| 亚洲天堂中文字幕在线观看| 天天拍拍天天爽免费视频| 黄色毛片免费网站| 亚洲国产精品VA在线观看麻豆| 一级毛片在线观看免费| 亚洲成人福利在线| 国产免费av片在线无码免费看| 中文字幕无码毛片免费看| 亚洲最大黄色网站| 亚洲av中文无码| 久久国产高潮流白浆免费观看| 亚洲日韩中文字幕无码一区| 亚洲爽爽一区二区三区| 日本不卡免费新一区二区三区| 亚洲网址在线观看| 女人毛片a级大学毛片免费| 亚洲高清国产拍精品熟女| 亚洲热线99精品视频|