<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    隨筆-23  評論-58  文章-0  trackbacks-0
    基于詞典的逆向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤。實際分詞效果比正向最大匹配分詞的效果好。

    查看第2版:逆向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

    public class RMM
    {
        
    private static final Log log = LogFactory.getLog(RMM.class);
        
        
    private static HashMap<String, Integer> dictionary = null
        
    private static final int WORD_MAX_LENGTH = 9;
        
        
    static
        
    {
            loadDictionary();
        }

        
        
    //將句子切分出詞,逆向最大匹配
        public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
        
    {
            Collections.reverse(list);
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            
    for(Sentence sen:list)
            
    {
                StringBuffer word 
    = new StringBuffer();
                
    int offset=sen.getStartOffset()+sen.getText().length;
                
    int bufferIndex = sen.getText().length-1;
                
    char c;
                
    boolean b=false;
                
    while(bufferIndex>-1)
                
    {
                    offset
    --;
                    c
    =sen.getText()[bufferIndex--];
                    
    if(word.length()==0)
                        word.append(c);
                    
    else
                    
    {
                        String temp 
    = (c+word.toString()).intern();
                        
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                            word.insert(
    0, c);
                        
    else if(dictionary.containsKey(temp) && bufferIndex>-1)
                            word.insert(
    0, c);
                        
    else
                        
    {
                            bufferIndex
    ++;
                            offset
    ++;
                            
    while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                            
    {
                                word.deleteCharAt(
    0);
                                bufferIndex
    ++;
                                offset
    ++;
                            }

                            b
    =true;
                        }

                    }

                    
    if(b || bufferIndex==-1)
                    
    {
                        Token token 
    = new Token(word.toString(),offset,offset+word.length(),"word");
                        word.setLength(
    0);
                        tokenlist.add(token);
                        b
    =false;
                    }

                }

            }

            Collections.reverse(tokenlist);
            
    return tokenlist;
        }

        
        
    //加載詞典
        public static void loadDictionary() 
        
    {  
            
    if (dictionary == null
            
    {    
                dictionary 
    = new HashMap<String, Integer>();    
                InputStream is 
    = null;    
                BufferedReader br 
    = null;            
                
    try
                
    {
                    is 
    = new FileInputStream(new File(RMM.class.getClassLoader().getResource("dictionary.txt").toURI()));
                    br 
    = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                    String word 
    = null;
                    
    while ((word = br.readLine()) != null
                    
    {
                        word
    =word.toLowerCase();
                        
    if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                        
    {
                            dictionary.put(word.intern(), 
    1);    
                            
    int i = 1
                            
    while(i < word.length()-1)
                            
    {
                                String temp 
    = word.substring(i,word.length()).intern(); 
                                
    if (!dictionary.containsKey(temp))
                                    dictionary.put(temp,
    2); 
                                i
    ++;
                            }

                        }

                    }

                }

                
    catch (Exception e) 
                
    {      
                    log.info(e);
                }

                
    finally
                
    {
                    
    try 
                    
    {      
                        
    if(br!=null)
                            br.close();   
                        
    if(is!=null)
                            is.close();  
                    }

                    
    catch (IOException e)
                    
    {     
                        log.info(e);
                    }
                
                }
     
            }
     
        }

        
        
    public static String[] segWords(Reader reader)
        
    {
            ArrayList
    <String> list=new ArrayList<String>();
            
    try
            
    {
                ArrayList
    <Token> tlist= Util.getNewToken(getToken(Util.getSentence(reader)));
                
    for(Token t:tlist)
                
    {
                    list.add(t.getWord());
                }

            }

            
    catch(IOException e)
            
    {
                log.info(e);
            }

            
    return (String[])list.toArray(new String[0]);
        }

        
        
    public static void main(String[] args) 
         
    {
            String[] cc
    =RMM.segWords(new StringReader("急、急、急、花里林居,二房二廳,業主誠心,出租".toLowerCase()));
            
    for(String c:cc)
            
    {
                System.out.println(c);
            }

        }

    }


    public class Util
    {
     //切分出由中文、字母、數字組成的句子
     public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
     {  
      ArrayList<Sentence> list=new ArrayList<Sentence>();
      StringBuffer cb=new StringBuffer();
      int d=reader.read();
      int offset=0;
      boolean b=false;
      while(d>-1)
      {
       int type=Character.getType(d);
       if(type==2 || type==9 || type==5)
       {
        d=toAscii(d);
        cb.append((char)d);
       }
       else
       {
        b=true;
       }
       d=reader.read();
       if(d==-1 || b)
       {
        if(d==-1) offset++;
        b=false;
        char[] ioBuffer = new char[cb.length()];
        cb.getChars(0, cb.length(), ioBuffer, 0);
        Sentence sen=new Sentence(ioBuffer,offset-cb.length());
        list.add(sen);
        cb.setLength(0);
       }
       offset++;
      }
      return list;
     }
     
     //將相連的單個英文或數字組合成詞
     public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
     {
      ArrayList<Token> tokenlist=new ArrayList<Token>();
      Token word=null;
      for(int i=0;i<list.size();i++)
      {
       Token t=list.get(i);
       if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
       {
        if(word==null)
         word=t;
        else if(word.getEnd()==t.getStart())
        {
         word.setEnd(t.getEnd());
         word.setWord(word.getWord()+t.getWord());
        }
        else
        {
         tokenlist.add(word);
         word=t;
        }
       }
       else if(word!=null)
       {
        tokenlist.add(word);
        word=null;
        tokenlist.add(t);
       }
       else
        tokenlist.add(t);
      }
      if(word!=null)
       tokenlist.add(word);
      return tokenlist;
     }
     
     //雙角轉單角
     public static int toAscii(int codePoint)
     {
      if((codePoint>=65296 && codePoint<=65305) //0-9
        || (codePoint>=65313 && codePoint<=65338) //A-Z
        || (codePoint>=65345 && codePoint<=65370) //a-z
        )
      { 
       codePoint -= 65248;
      }
      return codePoint;
     }
    }








    posted on 2011-08-19 13:22 nianzai 閱讀(4487) 評論(2)  編輯  收藏 所屬分類: 中文分詞

    評論:
    # re: 基于詞典的逆向最大匹配中文分詞算法,逆向分詞比正向分詞效果好 [未登錄] 2011-10-21 16:38 | zxj
    樓主,代碼中的Sentence 類呢?  回復  更多評論
      
    # re: 基于詞典的逆向最大匹配中文分詞算法,逆向分詞比正向分詞效果好 2011-11-08 15:55 | nianzai
    參考正向最大匹配中文分詞算法  回復  更多評論
      
    主站蜘蛛池模板: 国产色婷婷精品免费视频| 一个人免费日韩不卡视频| 麻豆国产精品入口免费观看| 亚洲午夜久久久久久尤物| 91成人在线免费视频| 亚洲成a人片在线观看中文!!!| 69视频在线观看免费| 亚洲第一网站免费视频| 黄网站色在线视频免费观看| 亚洲中文字幕乱码熟女在线| 四虎在线免费播放| 日本特黄特色AAA大片免费| 亚洲综合久久夜AV | 在线观看免费无码视频| 亚洲va在线va天堂va不卡下载| 色猫咪免费人成网站在线观看| 亚洲黄色免费在线观看| 国产在线观看免费观看不卡| 亚洲日本人成中文字幕| 免费A级毛片无码久久版| 东北美女野外bbwbbw免费| 亚洲国产成人精品不卡青青草原| 18以下岁毛片在免费播放| 亚洲乱人伦中文字幕无码| 久久丫精品国产亚洲av不卡| 中国人xxxxx69免费视频| 亚洲永久网址在线观看| 亚洲男人天堂2020| 99久久免费精品视频| 久久亚洲精品专区蓝色区| 亚洲国产av无码精品| 久久免费视频精品| 亚洲国产一区二区三区在线观看| 亚洲精品WWW久久久久久| 外国成人网在线观看免费视频| 亚洲一本一道一区二区三区| 中文字幕在线亚洲精品| 亚洲香蕉免费有线视频| 免费国产a理论片| 在线观看亚洲人成网站| 免费乱理伦在线播放|