<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    Terry.Li-彬

    虛其心,可解天下之問;專其心,可治天下之學;靜其心,可悟天下之理;恒其心,可成天下之業。

      BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
      143 隨筆 :: 344 文章 :: 130 評論 :: 0 Trackbacks
    基於詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    public?class?MM2?
    {
    ????
    private?static?final?Log?log?=?LogFactory.getLog(MM2.class);
    ????
    ????
    private?static?HashMap<String,?Integer>?dictionary?=?null;?
    ????
    private?static?final?int?WORD_MAX_LENGTH?=?9;
    ????
    private?Reader?reader;
    ????
    ????
    static
    ????
    {
    ????????loadDictionary();
    ????}

    ????
    ????
    public?MM2(Reader?reader)?
    ????
    {?
    ????????
    this.reader?=?reader;?
    ????}
    ?
    ????
    ????
    //切分出由中文、字母、數字組成的句子
    ????public?ArrayList<Sentence>?getSentence()?throws?IOException
    ????
    {???
    ????????ArrayList
    <Sentence>?list=new?ArrayList<Sentence>();
    ????????StringBuffer?cb
    =new?StringBuffer();
    ????????
    int?d=reader.read();
    ????????
    int?offset=0;
    ????????
    boolean?b=false;
    ????????
    while(d>-1)
    ????????
    {
    ????????????
    int?type=Character.getType(d);
    ????????????
    if(type==2?||?type==9?||?type==5)
    ????????????
    {
    ????????????????d
    =toAscii(d);
    ????????????????cb.append((
    char)d);
    ????????????}

    ????????????
    else
    ????????????
    {
    ????????????????b
    =true;
    ????????????}

    ????????????d
    =reader.read();
    ????????????
    if(d==-1?||?b)
    ????????????
    {
    ????????????????
    if(d==-1)?offset++;
    ????????????????b
    =false;
    ????????????????
    char[]?ioBuffer?=?new?char[cb.length()];
    ????????????????cb.getChars(
    0,?cb.length(),?ioBuffer,?0);
    ????????????????Sentence?sen
    =new?Sentence(ioBuffer,offset-cb.length());
    ????????????????list.add(sen);
    ????????????????cb.setLength(
    0);
    ????????????}

    ????????????offset
    ++;
    ????????}

    ????????
    return?list;
    ????}

    ????
    ????
    //將句子切分出詞
    ????public?ArrayList<Token>?getToken(ArrayList<Sentence>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????
    for(Sentence?sen:list)
    ????????
    {
    ????????????StringBuffer?word?
    =?new?StringBuffer();
    ????????????
    int?offset=sen.getStartOffset();
    ????????????
    int?bufferIndex?=?0;
    ????????????
    char?c;
    ????????????
    boolean?b=false;
    ????????????
    while(bufferIndex<sen.getText().length)
    ????????????
    {
    ????????????????offset
    ++;
    ????????????????c
    =sen.getText()[bufferIndex++];
    ????????????????
    if(word.length()==0)
    ????????????????????word.append(c);
    ????????????????
    else
    ????????????????
    {
    ????????????????????String?temp?
    =?(word.toString()?+?c).intern();
    ????????????????????
    if(dictionary.containsKey(temp)?&&?dictionary.get(temp)==1)
    ????????????????????????word.append(c);
    ????????????????????
    else?if(dictionary.containsKey(temp)?&&?bufferIndex<sen.getText().length)
    ????????????????????????word.append(c);
    ????????????????????
    else
    ????????????????????
    {
    ????????????????????????bufferIndex
    --;
    ????????????????????????offset
    --;
    ????????????????????????
    while(word.length()>1?&&?dictionary.get(word.toString())!=null?&&?dictionary.get(word.toString())==2)
    ????????????????????????
    {
    ????????????????????????????word.deleteCharAt(word.length()
    -1);
    ????????????????????????????bufferIndex
    --;
    ????????????????????????????offset
    --;
    ????????????????????????}

    ????????????????????????b
    =true;
    ????????????????????}

    ????????????????}

    ????????????????
    if(b?||?bufferIndex==sen.getText().length)
    ????????????????
    {
    ????????????????????Token?token?
    =?new?Token(word.toString(),offset-word.length(),offset,"word");
    ????????????????????word.setLength(
    0);
    ????????????????????tokenlist.add(token);
    ????????????????????b
    =false;
    ????????????????}

    ????????????}

    ????????}

    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //將相連的單個英文或數字組合成詞
    ????public?ArrayList<Token>?getNewToken(ArrayList<Token>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????Token?word
    =null;
    ????????
    for(int?i=0;i<list.size();i++)
    ????????
    {
    ????????????Token?t
    =list.get(i);
    ????????????
    if(t.getWord().length()==1?&&?Character.getType((int)t.getWord().charAt(0))!=5)
    ????????????
    {
    ????????????????
    if(word==null)
    ????????????????????word
    =t;
    ????????????????
    else?if(word.getEnd()==t.getStart())
    ????????????????
    {
    ????????????????????word.setEnd(t.getEnd());
    ????????????????????word.setWord(word.getWord()
    +t.getWord());
    ????????????????}

    ????????????????
    else
    ????????????????
    {
    ????????????????????tokenlist.add(word);
    ????????????????????word
    =t;
    ????????????????}

    ????????????}

    ????????????
    else?if(word!=null)
    ????????????
    {
    ????????????????tokenlist.add(word);
    ????????????????word
    =null;
    ????????????????tokenlist.add(t);
    ????????????}

    ????????????
    else
    ????????????????tokenlist.add(t);
    ????????}

    ????????
    if(word!=null)
    ????????????tokenlist.add(word);
    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //雙角轉單角
    ????public?static?int?toAscii(int?codePoint)?
    ????
    {
    ????????
    if((codePoint>=65296?&&?codePoint<=65305)????//0-9
    ????????????????||?(codePoint>=65313?&&?codePoint<=65338)????//A-Z
    ????????????????||?(codePoint>=65345?&&?codePoint<=65370)????//a-z
    ????????????????)
    ????????
    {????
    ????????????codePoint?
    -=?65248;
    ????????}

    ????????
    return?codePoint;
    ????}

    ????
    ????
    //加載詞典
    ????public?static?void?loadDictionary()?
    ????
    {??
    ????????
    if?(dictionary?==?null)?
    ????????
    {????
    ????????????dictionary?
    =?new?HashMap<String,?Integer>();????
    ????????????InputStream?is?
    =?null;????
    ????????????BufferedReader?br?
    =?null;????????????
    ????????????
    try
    ????????????
    {
    ????????????????is?
    =?new?FileInputStream(new?File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
    ????????????????br?
    =?new?BufferedReader(new?InputStreamReader(is,?"UTF-8"));
    ????????????????String?word?
    =?null;
    ????????????????
    while?((word?=?br.readLine())?!=?null)?
    ????????????????
    {
    ????????????????????word
    =word.toLowerCase();
    ????????????????????
    if?((word.indexOf("#")?==?-1)?&&?(word.length()?<=?WORD_MAX_LENGTH))
    ????????????????????
    {
    ????????????????????????dictionary.put(word.intern(),?
    1);????
    ????????????????????????
    int?i?=?word.length()-1;?
    ????????????????????????
    while(i?>=?2)
    ????????????????????????
    {
    ????????????????????????????String?temp?
    =?word.substring(0,?i).intern();?
    ????????????????????????????
    if?(!dictionary.containsKey(temp))
    ????????????????????????????????dictionary.put(temp,
    2);?
    ????????????????????????????i
    --;
    ????????????????????????}

    ????????????????????}

    ????????????????}

    ????????????}

    ????????????
    catch?(Exception?e)?
    ????????????
    {??????
    ????????????????log.info(e);
    ????????????}

    ????????????
    finally
    ????????????
    {
    ????????????????
    try?
    ????????????????
    {??????
    ????????????????????
    if(br!=null)
    ????????????????????????br.close();???
    ????????????????????
    if(is!=null)
    ????????????????????????is.close();??
    ????????????????}

    ????????????????
    catch?(IOException?e)
    ????????????????
    {?????
    ????????????????????log.info(e);
    ????????????????}
    ????????????
    ????????????}
    ?
    ????????}
    ?
    ????}

    ????
    ????
    public?static?String[]?segWords(Reader?input)
    ????
    {
    ????????ArrayList
    <String>?list=new?ArrayList<String>();
    ????????
    try
    ????????
    {
    ????????????MM2?f
    =new?MM2(input);
    ????????????ArrayList
    <Token>?tlist=?f.getNewToken(f.getToken(f.getSentence()));
    ????????????
    for(Token?t:tlist)
    ????????????
    {
    ????????????????list.add(t.getWord());
    ????????????}

    ????????}

    ????????
    catch(IOException?e)
    ????????
    {
    ????????????log.info(e);
    ????????}

    ????????
    return?(String[])list.toArray(new?String[0]);
    ????}

    ????
    ????
    public?static?void?main(String[]?args)?
    ????
    {
    ????????String[]?cc
    =MM2.segWords(new?StringReader("ibm商務機t60p".toLowerCase()));
    ????????
    for(String?c:cc)
    ????????
    {
    ????????????System.out.println(c);
    ????????}

    ????}

    }
    posted on 2011-08-05 08:34 禮物 閱讀(2118) 評論(2)  編輯  收藏

    評論

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-07-25 22:09 yi
    這是全的么,樓主?我導入到MyEclipse里好多錯誤呀,除了import包之外還有好多錯,看不懂。。。  回復  更多評論
      

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-08-22 20:01 love code
    麻煩 博主把dictionary.txt發給我吧,讓我學習學習
    1182787467@qq.com
    謝謝  回復  更多評論
      


    只有註冊用戶登錄後才能發表評論。

    網站導航:
     
    主站蜘蛛池模板: 亚洲精品中文字幕无码AV| 国产亚洲av片在线观看播放| 久爱免费观看在线网站| 精品久久久久久亚洲综合网| 亚洲午夜国产精品| 思思99re66在线精品免费观看| 亚洲最大中文字幕无码网站| 亚洲国产综合专区电影在线| 免费va在线观看| 午夜免费福利片观看| 成人免费乱码大片A毛片| 91久久亚洲国产成人精品性色| 性xxxx视频播放免费| 免费三级毛片电影片| 国产精品怡红院永久免费| 老妇激情毛片免费| 日本亚洲色大成网站www久久| 免费人成在线观看网站视频 | 阿v免费在线观看| 亚洲中文字幕无码av在线| 久久亚洲熟女cc98cm| 国产偷v国产偷v亚洲高清| 亚洲精品无码久久千人斩| 亚洲黄黄黄网站在线观看| 国产在线98福利播放视频免费| 中文字幕免费播放| 久久久久久久久久久免费精品| 亚洲av片不卡无码久久| 亚洲理论片在线中文字幕| 亚洲无成人网77777| 亚洲午夜电影在线观看高清| 亚洲国产亚洲片在线观看播放| 亚洲国产精品第一区二区三区| 最近2019中文免费字幕在线观看| 亚洲天堂2017无码中文| 亚洲视频在线观看2018| 久久青青成人亚洲精品| 亚洲成a人一区二区三区| 在线观看亚洲精品福利片| 亚洲AV永久无码区成人网站| 99久久亚洲精品无码毛片|