<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    Terry.Li-彬

    虛其心,可解天下之問;專其心,可治天下之學;靜其心,可悟天下之理;恒其心,可成天下之業。

      BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
      143 隨筆 :: 344 文章 :: 130 評論 :: 0 Trackbacks
    基於詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    public?class?MM2?
    {
    ????
    private?static?final?Log?log?=?LogFactory.getLog(MM2.class);
    ????
    ????
    private?static?HashMap<String,?Integer>?dictionary?=?null;?
    ????
    private?static?final?int?WORD_MAX_LENGTH?=?9;
    ????
    private?Reader?reader;
    ????
    ????
    static
    ????
    {
    ????????loadDictionary();
    ????}

    ????
    ????
    public?MM2(Reader?reader)?
    ????
    {?
    ????????
    this.reader?=?reader;?
    ????}
    ?
    ????
    ????
    //切分出由中文、字母、數字組成的句子
    ????public?ArrayList<Sentence>?getSentence()?throws?IOException
    ????
    {???
    ????????ArrayList
    <Sentence>?list=new?ArrayList<Sentence>();
    ????????StringBuffer?cb
    =new?StringBuffer();
    ????????
    int?d=reader.read();
    ????????
    int?offset=0;
    ????????
    boolean?b=false;
    ????????
    while(d>-1)
    ????????
    {
    ????????????
    int?type=Character.getType(d);
    ????????????
    if(type==2?||?type==9?||?type==5)
    ????????????
    {
    ????????????????d
    =toAscii(d);
    ????????????????cb.append((
    char)d);
    ????????????}

    ????????????
    else
    ????????????
    {
    ????????????????b
    =true;
    ????????????}

    ????????????d
    =reader.read();
    ????????????
    if(d==-1?||?b)
    ????????????
    {
    ????????????????
    if(d==-1)?offset++;
    ????????????????b
    =false;
    ????????????????
    char[]?ioBuffer?=?new?char[cb.length()];
    ????????????????cb.getChars(
    0,?cb.length(),?ioBuffer,?0);
    ????????????????Sentence?sen
    =new?Sentence(ioBuffer,offset-cb.length());
    ????????????????list.add(sen);
    ????????????????cb.setLength(
    0);
    ????????????}

    ????????????offset
    ++;
    ????????}

    ????????
    return?list;
    ????}

    ????
    ????
    //將句子切分出詞
    ????public?ArrayList<Token>?getToken(ArrayList<Sentence>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????
    for(Sentence?sen:list)
    ????????
    {
    ????????????StringBuffer?word?
    =?new?StringBuffer();
    ????????????
    int?offset=sen.getStartOffset();
    ????????????
    int?bufferIndex?=?0;
    ????????????
    char?c;
    ????????????
    boolean?b=false;
    ????????????
    while(bufferIndex<sen.getText().length)
    ????????????
    {
    ????????????????offset
    ++;
    ????????????????c
    =sen.getText()[bufferIndex++];
    ????????????????
    if(word.length()==0)
    ????????????????????word.append(c);
    ????????????????
    else
    ????????????????
    {
    ????????????????????String?temp?
    =?(word.toString()?+?c).intern();
    ????????????????????
    if(dictionary.containsKey(temp)?&&?dictionary.get(temp)==1)
    ????????????????????????word.append(c);
    ????????????????????
    else?if(dictionary.containsKey(temp)?&&?bufferIndex<sen.getText().length)
    ????????????????????????word.append(c);
    ????????????????????
    else
    ????????????????????
    {
    ????????????????????????bufferIndex
    --;
    ????????????????????????offset
    --;
    ????????????????????????
    while(word.length()>1?&&?dictionary.get(word.toString())!=null?&&?dictionary.get(word.toString())==2)
    ????????????????????????
    {
    ????????????????????????????word.deleteCharAt(word.length()
    -1);
    ????????????????????????????bufferIndex
    --;
    ????????????????????????????offset
    --;
    ????????????????????????}

    ????????????????????????b
    =true;
    ????????????????????}

    ????????????????}

    ????????????????
    if(b?||?bufferIndex==sen.getText().length)
    ????????????????
    {
    ????????????????????Token?token?
    =?new?Token(word.toString(),offset-word.length(),offset,"word");
    ????????????????????word.setLength(
    0);
    ????????????????????tokenlist.add(token);
    ????????????????????b
    =false;
    ????????????????}

    ????????????}

    ????????}

    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //將相連的單個英文或數字組合成詞
    ????public?ArrayList<Token>?getNewToken(ArrayList<Token>?list)?throws?IOException
    ????
    {
    ????????ArrayList
    <Token>?tokenlist=new?ArrayList<Token>();
    ????????Token?word
    =null;
    ????????
    for(int?i=0;i<list.size();i++)
    ????????
    {
    ????????????Token?t
    =list.get(i);
    ????????????
    if(t.getWord().length()==1?&&?Character.getType((int)t.getWord().charAt(0))!=5)
    ????????????
    {
    ????????????????
    if(word==null)
    ????????????????????word
    =t;
    ????????????????
    else?if(word.getEnd()==t.getStart())
    ????????????????
    {
    ????????????????????word.setEnd(t.getEnd());
    ????????????????????word.setWord(word.getWord()
    +t.getWord());
    ????????????????}

    ????????????????
    else
    ????????????????
    {
    ????????????????????tokenlist.add(word);
    ????????????????????word
    =t;
    ????????????????}

    ????????????}

    ????????????
    else?if(word!=null)
    ????????????
    {
    ????????????????tokenlist.add(word);
    ????????????????word
    =null;
    ????????????????tokenlist.add(t);
    ????????????}

    ????????????
    else
    ????????????????tokenlist.add(t);
    ????????}

    ????????
    if(word!=null)
    ????????????tokenlist.add(word);
    ????????
    return?tokenlist;
    ????}

    ????
    ????
    //雙角轉單角
    ????public?static?int?toAscii(int?codePoint)?
    ????
    {
    ????????
    if((codePoint>=65296?&&?codePoint<=65305)????//0-9
    ????????????????||?(codePoint>=65313?&&?codePoint<=65338)????//A-Z
    ????????????????||?(codePoint>=65345?&&?codePoint<=65370)????//a-z
    ????????????????)
    ????????
    {????
    ????????????codePoint?
    -=?65248;
    ????????}

    ????????
    return?codePoint;
    ????}

    ????
    ????
    //加載詞典
    ????public?static?void?loadDictionary()?
    ????
    {??
    ????????
    if?(dictionary?==?null)?
    ????????
    {????
    ????????????dictionary?
    =?new?HashMap<String,?Integer>();????
    ????????????InputStream?is?
    =?null;????
    ????????????BufferedReader?br?
    =?null;????????????
    ????????????
    try
    ????????????
    {
    ????????????????is?
    =?new?FileInputStream(new?File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
    ????????????????br?
    =?new?BufferedReader(new?InputStreamReader(is,?"UTF-8"));
    ????????????????String?word?
    =?null;
    ????????????????
    while?((word?=?br.readLine())?!=?null)?
    ????????????????
    {
    ????????????????????word
    =word.toLowerCase();
    ????????????????????
    if?((word.indexOf("#")?==?-1)?&&?(word.length()?<=?WORD_MAX_LENGTH))
    ????????????????????
    {
    ????????????????????????dictionary.put(word.intern(),?
    1);????
    ????????????????????????
    int?i?=?word.length()-1;?
    ????????????????????????
    while(i?>=?2)
    ????????????????????????
    {
    ????????????????????????????String?temp?
    =?word.substring(0,?i).intern();?
    ????????????????????????????
    if?(!dictionary.containsKey(temp))
    ????????????????????????????????dictionary.put(temp,
    2);?
    ????????????????????????????i
    --;
    ????????????????????????}

    ????????????????????}

    ????????????????}

    ????????????}

    ????????????
    catch?(Exception?e)?
    ????????????
    {??????
    ????????????????log.info(e);
    ????????????}

    ????????????
    finally
    ????????????
    {
    ????????????????
    try?
    ????????????????
    {??????
    ????????????????????
    if(br!=null)
    ????????????????????????br.close();???
    ????????????????????
    if(is!=null)
    ????????????????????????is.close();??
    ????????????????}

    ????????????????
    catch?(IOException?e)
    ????????????????
    {?????
    ????????????????????log.info(e);
    ????????????????}
    ????????????
    ????????????}
    ?
    ????????}
    ?
    ????}

    ????
    ????
    public?static?String[]?segWords(Reader?input)
    ????
    {
    ????????ArrayList
    <String>?list=new?ArrayList<String>();
    ????????
    try
    ????????
    {
    ????????????MM2?f
    =new?MM2(input);
    ????????????ArrayList
    <Token>?tlist=?f.getNewToken(f.getToken(f.getSentence()));
    ????????????
    for(Token?t:tlist)
    ????????????
    {
    ????????????????list.add(t.getWord());
    ????????????}

    ????????}

    ????????
    catch(IOException?e)
    ????????
    {
    ????????????log.info(e);
    ????????}

    ????????
    return?(String[])list.toArray(new?String[0]);
    ????}

    ????
    ????
    public?static?void?main(String[]?args)?
    ????
    {
    ????????String[]?cc
    =MM2.segWords(new?StringReader("ibm商務機t60p".toLowerCase()));
    ????????
    for(String?c:cc)
    ????????
    {
    ????????????System.out.println(c);
    ????????}

    ????}

    }
    posted on 2011-08-05 08:34 禮物 閱讀(2118) 評論(2)  編輯  收藏

    評論

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-07-25 22:09 yi
    這是全的么,樓主?我導入到MyEclipse里好多錯誤呀,除了import包之外還有好多錯,看不懂。。。  回復  更多評論
      

    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-08-22 20:01 love code
    麻煩 博主把dictionary.txt發給我吧,讓我學習學習
    1182787467@qq.com
    謝謝  回復  更多評論
      


    只有註冊用戶登錄後才能發表評論。

    網站導航:
     
    主站蜘蛛池模板: 97免费人妻无码视频| 一级A毛片免费观看久久精品| 亚洲精品不卡视频| 亚洲黄色片免费看| 亚洲高清无在码在线电影不卡| 亚洲黄色在线网站| 久久亚洲AV成人无码| 亚洲AV无码乱码在线观看裸奔| 亚洲AV无码不卡在线播放| 亚洲va国产va天堂va久久| 亚洲av日韩av天堂影片精品| 亚洲AV无码成人精品区在线观看 | 无码国模国产在线观看免费| 日韩在线看片免费人成视频播放| 日本无吗免费一二区| 免费A级毛片无码久久版| 亚洲男人的天堂在线va拉文 | 久久午夜伦鲁片免费无码| 久久久久久一品道精品免费看 | 噜噜噜亚洲色成人网站| 四虎一区二区成人免费影院网址| 亚洲一级片免费看| 久久免费区一区二区三波多野| ww在线观视频免费观看| 在线观看无码的免费网站| yy6080亚洲一级理论| 亚洲精品tv久久久久久久久| 亚洲国产精品婷婷久久| 亚洲精品一二三区| 美女视频黄频a免费大全视频| aa毛片免费全部播放完整| 99久久综合精品免费| 免费观看理论片毛片| 亚洲视频在线免费| 亚洲精品私拍国产福利在线| 亚洲免费视频播放| 一边摸一边爽一边叫床免费视频| 嫩草成人永久免费观看| 成年女人毛片免费视频| 国产亚洲老熟女视频| 亚洲毛片免费观看|