<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    隨筆-23  評論-58  文章-0  trackbacks-0
    基于詞典的逆向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤。實際分詞效果比正向最大匹配分詞的效果好。

    查看第2版:逆向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

    public class RMM
    {
        
    private static final Log log = LogFactory.getLog(RMM.class);
        
        
    private static HashMap<String, Integer> dictionary = null
        
    private static final int WORD_MAX_LENGTH = 9;
        
        
    static
        
    {
            loadDictionary();
        }

        
        
    //將句子切分出詞,逆向最大匹配
        public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
        
    {
            Collections.reverse(list);
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            
    for(Sentence sen:list)
            
    {
                StringBuffer word 
    = new StringBuffer();
                
    int offset=sen.getStartOffset()+sen.getText().length;
                
    int bufferIndex = sen.getText().length-1;
                
    char c;
                
    boolean b=false;
                
    while(bufferIndex>-1)
                
    {
                    offset
    --;
                    c
    =sen.getText()[bufferIndex--];
                    
    if(word.length()==0)
                        word.append(c);
                    
    else
                    
    {
                        String temp 
    = (c+word.toString()).intern();
                        
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                            word.insert(
    0, c);
                        
    else if(dictionary.containsKey(temp) && bufferIndex>-1)
                            word.insert(
    0, c);
                        
    else
                        
    {
                            bufferIndex
    ++;
                            offset
    ++;
                            
    while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                            
    {
                                word.deleteCharAt(
    0);
                                bufferIndex
    ++;
                                offset
    ++;
                            }

                            b
    =true;
                        }

                    }

                    
    if(b || bufferIndex==-1)
                    
    {
                        Token token 
    = new Token(word.toString(),offset,offset+word.length(),"word");
                        word.setLength(
    0);
                        tokenlist.add(token);
                        b
    =false;
                    }

                }

            }

            Collections.reverse(tokenlist);
            
    return tokenlist;
        }

        
        
    //加載詞典
        public static void loadDictionary() 
        
    {  
            
    if (dictionary == null
            
    {    
                dictionary 
    = new HashMap<String, Integer>();    
                InputStream is 
    = null;    
                BufferedReader br 
    = null;            
                
    try
                
    {
                    is 
    = new FileInputStream(new File(RMM.class.getClassLoader().getResource("dictionary.txt").toURI()));
                    br 
    = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                    String word 
    = null;
                    
    while ((word = br.readLine()) != null
                    
    {
                        word
    =word.toLowerCase();
                        
    if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                        
    {
                            dictionary.put(word.intern(), 
    1);    
                            
    int i = 1
                            
    while(i < word.length()-1)
                            
    {
                                String temp 
    = word.substring(i,word.length()).intern(); 
                                
    if (!dictionary.containsKey(temp))
                                    dictionary.put(temp,
    2); 
                                i
    ++;
                            }

                        }

                    }

                }

                
    catch (Exception e) 
                
    {      
                    log.info(e);
                }

                
    finally
                
    {
                    
    try 
                    
    {      
                        
    if(br!=null)
                            br.close();   
                        
    if(is!=null)
                            is.close();  
                    }

                    
    catch (IOException e)
                    
    {     
                        log.info(e);
                    }
                
                }
     
            }
     
        }

        
        
    public static String[] segWords(Reader reader)
        
    {
            ArrayList
    <String> list=new ArrayList<String>();
            
    try
            
    {
                ArrayList
    <Token> tlist= Util.getNewToken(getToken(Util.getSentence(reader)));
                
    for(Token t:tlist)
                
    {
                    list.add(t.getWord());
                }

            }

            
    catch(IOException e)
            
    {
                log.info(e);
            }

            
    return (String[])list.toArray(new String[0]);
        }

        
        
    public static void main(String[] args) 
         
    {
            String[] cc
    =RMM.segWords(new StringReader("急、急、急、花里林居,二房二廳,業主誠心,出租".toLowerCase()));
            
    for(String c:cc)
            
    {
                System.out.println(c);
            }

        }

    }


    public class Util
    {
     //切分出由中文、字母、數字組成的句子
     public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
     {  
      ArrayList<Sentence> list=new ArrayList<Sentence>();
      StringBuffer cb=new StringBuffer();
      int d=reader.read();
      int offset=0;
      boolean b=false;
      while(d>-1)
      {
       int type=Character.getType(d);
       if(type==2 || type==9 || type==5)
       {
        d=toAscii(d);
        cb.append((char)d);
       }
       else
       {
        b=true;
       }
       d=reader.read();
       if(d==-1 || b)
       {
        if(d==-1) offset++;
        b=false;
        char[] ioBuffer = new char[cb.length()];
        cb.getChars(0, cb.length(), ioBuffer, 0);
        Sentence sen=new Sentence(ioBuffer,offset-cb.length());
        list.add(sen);
        cb.setLength(0);
       }
       offset++;
      }
      return list;
     }
     
     //將相連的單個英文或數字組合成詞
     public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
     {
      ArrayList<Token> tokenlist=new ArrayList<Token>();
      Token word=null;
      for(int i=0;i<list.size();i++)
      {
       Token t=list.get(i);
       if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
       {
        if(word==null)
         word=t;
        else if(word.getEnd()==t.getStart())
        {
         word.setEnd(t.getEnd());
         word.setWord(word.getWord()+t.getWord());
        }
        else
        {
         tokenlist.add(word);
         word=t;
        }
       }
       else if(word!=null)
       {
        tokenlist.add(word);
        word=null;
        tokenlist.add(t);
       }
       else
        tokenlist.add(t);
      }
      if(word!=null)
       tokenlist.add(word);
      return tokenlist;
     }
     
     //雙角轉單角
     public static int toAscii(int codePoint)
     {
      if((codePoint>=65296 && codePoint<=65305) //0-9
        || (codePoint>=65313 && codePoint<=65338) //A-Z
        || (codePoint>=65345 && codePoint<=65370) //a-z
        )
      { 
       codePoint -= 65248;
      }
      return codePoint;
     }
    }








    posted on 2011-08-19 13:22 nianzai 閱讀(4487) 評論(2)  編輯  收藏 所屬分類: 中文分詞

    評論:
    # re: 基于詞典的逆向最大匹配中文分詞算法,逆向分詞比正向分詞效果好 [未登錄] 2011-10-21 16:38 | zxj
    樓主,代碼中的Sentence 類呢?  回復  更多評論
      
    # re: 基于詞典的逆向最大匹配中文分詞算法,逆向分詞比正向分詞效果好 2011-11-08 15:55 | nianzai
    參考正向最大匹配中文分詞算法  回復  更多評論
      
    主站蜘蛛池模板: 中文字幕的电影免费网站| A在线观看免费网站大全| 亚洲自偷自偷在线成人网站传媒 | 嘿嘿嘿视频免费网站在线观看| 免费精品视频在线| 亚洲乱码一区二区三区国产精品| 午夜影视日本亚洲欧洲精品一区| 免费观看亚洲人成网站| 毛片免费在线观看网站| 67194成手机免费观看| 中文字幕乱码系列免费| 特级做a爰片毛片免费看| 亚洲小视频在线观看| 亚洲日韩精品无码专区网站| 妞干网手机免费视频| 一本岛高清v不卡免费一三区| 免费精品99久久国产综合精品| 国产精品免费αv视频| 青娱乐在线视频免费观看| 亚洲爆乳大丰满无码专区| 国产jizzjizz免费视频| 岛国av无码免费无禁网站| 免费A级毛片无码无遮挡内射| 亚洲成人免费网站| 57pao一国产成视频永久免费| 午夜无码A级毛片免费视频| 99视频在线观看免费| 日韩久久无码免费毛片软件 | 国产禁女女网站免费看| 免费毛片在线视频| 免费高清小黄站在线观看| 毛片网站免费在线观看| 成人黄动漫画免费网站视频| 欧美在线看片A免费观看| 久久精品无码一区二区三区免费| 妻子5免费完整高清电视| 日本精品人妻无码免费大全| 在线jlzzjlzz免费播放| 免费观看的av毛片的网站| 又黄又爽的视频免费看| 亚洲片国产一区一级在线观看|