<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    Terry.Li-彬

    虛其心,可解天下之問;專其心,可治天下之學;靜其心,可悟天下之理;恒其心,可成天下之業。

      BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
      143 隨筆 :: 344 文章 :: 130 評論 :: 0 Trackbacks
    基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    public?class?MM2?
    {
    ????
    private?static?final?Log?log?=?LogFactory.getLog(MM2.class);
    ????
    ????
    private?static?HashMap<String,?Integer>?dictionary?=?null;?
    ????
    private?static?final?int?WORD_MAX_LENGTH?=?9;
    ????
    private?Reader?reader;
    ????
    ????
    static
    ????
    {
    ????????loadDictionary();
    ????}

    ????
    ????
    public?MM2(Reader?reader)?
    ????
    {?
    ????????
    this.reader?=?reader;?
    ????}
    ?
    ????
    ????
    //切分出由中文、字母、數(shù)字組成的句子
    ????public?ArrayList<Sentence>?getSentence()?throws?IOException
    ????
    {???
    ????????ArrayList
    <Sentence>?list=new?ArrayList<Sentence>();
    ????????StringBuffer?cb
    =new?StringBuffer();
    ????????
    int?d=reader.read();
    ????????
    int?offset=0;
    ????????
    boolean?b=false;
    ????????
    while(d>-1)
    ????????
    {
    ????????????
    int?type=Character.getType(d);
    ????????????
    if(type==2?||?type==9?||?type==5)
    ????????????
    {
    ????????????????d
    =toAscii(d);
    ????????????????cb.append((
    char)d);
    ????????????}

    ????????????
    else
    ????????????
    {
    ????????????????b
    =true;
    ????????????}

    ????????????d
    =reader.read();
    ????????????
    if(d==-1?||?b)
    ????????????
    {
    ????????????????
    if(d==-1)?offset++;
    ????????????????b
    =false;
    ????????????????
    char[]?ioBuffer?=?new?char[cb.length()];
    ????????????????cb.getChars(
    0,?cb.length(),?ioBuffer,?0);
    ????????????????Sentence?sen
    =new?Sentence(ioBuffer,offset-cb.length());
    ????????????????list.add(sen);
    ????????????????cb.setLength(
    0);
    ????????????}

    ????????????offset
    ++;
    ????????}

    ????????
    return?list;
    ????}

    ????
    ????
    //將句子切分出詞
    ????public?ArrayList<Token>?getToken(ArrayList<Sentence>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????
    for(Sentence?sen:list)
    ????????
    {
    ????????????StringBuffer?word?
    =?new?StringBuffer();
    ????????????
    int?offset=sen.getStartOffset();
    ????????????
    int?bufferIndex?=?0;
    ????????????
    char?c;
    ????????????
    boolean?b=false;
    ????????????
    while(bufferIndex<sen.getText().length)
    ????????????
    {
    ????????????????offset
    ++;
    ????????????????c
    =sen.getText()[bufferIndex++];
    ????????????????
    if(word.length()==0)
    ????????????????????word.append(c);
    ????????????????
    else
    ????????????????
    {
    ????????????????????String?temp?
    =?(word.toString()?+?c).intern();
    ????????????????????
    if(dictionary.containsKey(temp)?&&?dictionary.get(temp)==1)
    ????????????????????????word.append(c);
    ????????????????????
    else?if(dictionary.containsKey(temp)?&&?bufferIndex<sen.getText().length)
    ????????????????????????word.append(c);
    ????????????????????
    else
    ????????????????????
    {
    ????????????????????????bufferIndex
    --;
    ????????????????????????offset
    --;
    ????????????????????????
    while(word.length()>1?&&?dictionary.get(word.toString())!=null?&&?dictionary.get(word.toString())==2)
    ????????????????????????
    {
    ????????????????????????????word.deleteCharAt(word.length()
    -1);
    ????????????????????????????bufferIndex
    --;
    ????????????????????????????offset
    --;
    ????????????????????????}

    ????????????????????????b
    =true;
    ????????????????????}

    ????????????????}

    ????????????????
    if(b?||?bufferIndex==sen.getText().length)
    ????????????????
    {
    ????????????????????Token?token?
    =?new?Token(word.toString(),offset-word.length(),offset,"word");
    ????????????????????word.setLength(
    0);
    ????????????????????tokenlist.add(token);
    ????????????????????b
    =false;
    ????????????????}

    ????????????}

    ????????}

    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //將相連的單個(gè)英文或數(shù)字組合成詞
    ????public?ArrayList<Token>?getNewToken(ArrayList<Token>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????Token?word
    =null;
    ????????
    for(int?i=0;i<list.size();i++)
    ????????
    {
    ????????????Token?t
    =list.get(i);
    ????????????
    if(t.getWord().length()==1?&&?Character.getType((int)t.getWord().charAt(0))!=5)
    ????????????
    {
    ????????????????
    if(word==null)
    ????????????????????word
    =t;
    ????????????????
    else?if(word.getEnd()==t.getStart())
    ????????????????
    {
    ????????????????????word.setEnd(t.getEnd());
    ????????????????????word.setWord(word.getWord()
    +t.getWord());
    ????????????????}

    ????????????????
    else
    ????????????????
    {
    ????????????????????tokenlist.add(word);
    ????????????????????word
    =t;
    ????????????????}

    ????????????}

    ????????????
    else?if(word!=null)
    ????????????
    {
    ????????????????tokenlist.add(word);
    ????????????????word
    =null;
    ????????????????tokenlist.add(t);
    ????????????}

    ????????????
    else
    ????????????????tokenlist.add(t);
    ????????}

    ????????
    if(word!=null)
    ????????????tokenlist.add(word);
    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //雙角轉(zhuǎn)單角
    ????public?static?int?toAscii(int?codePoint)?
    ????
    {
    ????????
    if((codePoint>=65296?&&?codePoint<=65305)????//0-9
    ????????????????||?(codePoint>=65313?&&?codePoint<=65338)????//A-Z
    ????????????????||?(codePoint>=65345?&&?codePoint<=65370)????//a-z
    ????????????????)
    ????????
    {????
    ????????????codePoint?
    -=?65248;
    ????????}

    ????????
    return?codePoint;
    ????}

    ????
    ????
    //加載詞典
    ????public?static?void?loadDictionary()?
    ????
    {??
    ????????
    if?(dictionary?==?null)?
    ????????
    {????
    ????????????dictionary?
    =?new?HashMap<String,?Integer>();????
    ????????????InputStream?is?
    =?null;????
    ????????????BufferedReader?br?
    =?null;????????????
    ????????????
    try
    ????????????
    {
    ????????????????is?
    =?new?FileInputStream(new?File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
    ????????????????br?
    =?new?BufferedReader(new?InputStreamReader(is,?"UTF-8"));
    ????????????????String?word?
    =?null;
    ????????????????
    while?((word?=?br.readLine())?!=?null)?
    ????????????????
    {
    ????????????????????word
    =word.toLowerCase();
    ????????????????????
    if?((word.indexOf("#")?==?-1)?&&?(word.length()?<=?WORD_MAX_LENGTH))
    ????????????????????
    {
    ????????????????????????dictionary.put(word.intern(),?
    1);????
    ????????????????????????
    int?i?=?word.length()-1;?
    ????????????????????????
    while(i?>=?2)
    ????????????????????????
    {
    ????????????????????????????String?temp?
    =?word.substring(0,?i).intern();?
    ????????????????????????????
    if?(!dictionary.containsKey(temp))
    ????????????????????????????????dictionary.put(temp,
    2);?
    ????????????????????????????i
    --;
    ????????????????????????}

    ????????????????????}

    ????????????????}

    ????????????}

    ????????????
    catch?(Exception?e)?
    ????????????
    {??????
    ????????????????log.info(e);
    ????????????}

    ????????????
    finally
    ????????????
    {
    ????????????????
    try?
    ????????????????
    {??????
    ????????????????????
    if(br!=null)
    ????????????????????????br.close();???
    ????????????????????
    if(is!=null)
    ????????????????????????is.close();??
    ????????????????}

    ????????????????
    catch?(IOException?e)
    ????????????????
    {?????
    ????????????????????log.info(e);
    ????????????????}
    ????????????
    ????????????}
    ?
    ????????}
    ?
    ????}

    ????
    ????
    public?static?String[]?segWords(Reader?input)
    ????
    {
    ????????ArrayList
    <String>?list=new?ArrayList<String>();
    ????????
    try
    ????????
    {
    ????????????MM2?f
    =new?MM2(input);
    ????????????ArrayList
    <Token>?tlist=?f.getNewToken(f.getToken(f.getSentence()));
    ????????????
    for(Token?t:tlist)
    ????????????
    {
    ????????????????list.add(t.getWord());
    ????????????}

    ????????}

    ????????
    catch(IOException?e)
    ????????
    {
    ????????????log.info(e);
    ????????}

    ????????
    return?(String[])list.toArray(new?String[0]);
    ????}

    ????
    ????
    public?static?void?main(String[]?args)?
    ????
    {
    ????????String[]?cc
    =MM2.segWords(new?StringReader("ibm商務(wù)機(jī)t60p".toLowerCase()));
    ????????
    for(String?c:cc)
    ????????
    {
    ????????????System.out.println(c);
    ????????}

    ????}

    }
    posted on 2011-08-05 08:34 禮物 閱讀(2118) 評論(2)  編輯  收藏

    評論

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-07-25 22:09 yi
    這是全的么,樓主?我導入到MyEclipse里好多錯誤呀,除了import包之外還有好多錯,看不懂。。。  回復  更多評論
      

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-08-22 20:01 love code
    麻煩 博主把dictionary.txt發給我吧,讓我學習學習
    1182787467@qq.com
    謝謝  回復  更多評論
      


    只有注冊用戶登錄后才能發表評論。

    網站導航:
     
    主站蜘蛛池模板: 亚洲GV天堂GV无码男同| 亚洲色欲色欲www| 日韩激情淫片免费看| 亚洲国产综合精品一区在线播放| 亚洲日本va中文字幕久久| 免费看成人AA片无码视频羞羞网| 精品国产一区二区三区免费看| 免费人成年激情视频在线观看| 最新免费jlzzjlzz在线播放| 成人免费的性色视频| 亚洲无人区一区二区三区| 羞羞的视频在线免费观看| 91精品成人免费国产片| 国产午夜精品久久久久免费视 | 一级毛片免费播放试看60分钟 | 亚洲成a人无码av波多野按摩| 亚洲三级电影网站| 久久亚洲sm情趣捆绑调教| 美女内射无套日韩免费播放 | 免费A级毛片无码免费视| 亚洲成色999久久网站| 亚洲香蕉久久一区二区三区四区| 日韩精品无码免费专区午夜 | 亚洲色偷偷综合亚洲AV伊人| 亚洲男人的天堂在线播放| 少妇性饥渴无码A区免费 | 一级毛片免费观看| 一二三四免费观看在线视频中文版 | 国产大片免费网站不卡美女| 亚洲AV无码乱码在线观看裸奔 | 亚洲人成影院在线| 亚洲男同帅GAY片在线观看| 91九色老熟女免费资源站| 亚洲人成综合在线播放| 国产偷国产偷亚洲高清在线 | 七次郎成人免费线路视频| 亚洲国产精品无码久久久秋霞2 | 亚洲欧美精品午睡沙发| 国产精品免费一区二区三区四区| 999国内精品永久免费观看| 黄色毛片视频免费|