<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    隨筆-23  評論-58  文章-0  trackbacks-0
    基於詞典的正向最大匹配中文分詞算法，能實現中英文數字混合分詞。比如能分出這樣的詞：bb霜、3室、樂phone、touch4、mp3、T恤

    第一次寫中文分詞程序,歡迎拍磚。

    查看第2版:正向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

    public class MM2 
    {
        
    private static final Log log = LogFactory.getLog(MM2.class);
        
        
    private static HashMap<String, Integer> dictionary = null
        
    private static final int WORD_MAX_LENGTH = 9;
        
    private Reader reader;
        
        
    static
        
    {
            loadDictionary();
        }

        
        
    public MM2(Reader reader) 
        

            
    this.reader = reader; 
        }
     
        
        
    //切分出由中文、字母、數字組成的句子
        public ArrayList<Sentence> getSentence() throws IOException
        
    {   
            ArrayList
    <Sentence> list=new ArrayList<Sentence>();
            StringBuffer cb
    =new StringBuffer();
            
    int d=reader.read();
            
    int offset=0;
            
    boolean b=false;
            
    while(d>-1)
            
    {
                
    int type=Character.getType(d);
                
    if(type==2 || type==9 || type==5)
                
    {
                    d
    =toAscii(d);
                    cb.append((
    char)d);
                }

                
    else
                
    {
                    b
    =true;
                }

                d
    =reader.read();
                
    if(d==-1 || b)
                
    {
                    
    if(d==-1) offset++;
                    b
    =false;
                    
    char[] ioBuffer = new char[cb.length()];
                    cb.getChars(
    0, cb.length(), ioBuffer, 0);
                    Sentence sen
    =new Sentence(ioBuffer,offset-cb.length());
                    list.add(sen);
                    cb.setLength(
    0);
                }

                offset
    ++;
            }

            
    return list;
        }

        
        
    //將句子切分出詞
        public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
        
    {
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            
    for(Sentence sen:list)
            
    {
                StringBuffer word 
    = new StringBuffer();
                
    int offset=sen.getStartOffset();
                
    int bufferIndex = 0;
                
    char c;
                
    boolean b=false;
                
    while(bufferIndex<sen.getText().length)
                
    {
                    offset
    ++;
                    c
    =sen.getText()[bufferIndex++];
                    
    if(word.length()==0)
                        word.append(c);
                    
    else
                    
    {
                        String temp 
    = (word.toString() + c).intern();
                        
    if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                            word.append(c);
                        
    else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
                            word.append(c);
                        
    else
                        
    {
                            bufferIndex
    --;
                            offset
    --;
                            
    while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                            
    {
                                word.deleteCharAt(word.length()
    -1);
                                bufferIndex
    --;
                                offset
    --;
                            }

                            b
    =true;
                        }

                    }

                    
    if(b || bufferIndex==sen.getText().length)
                    
    {
                        Token token 
    = new Token(word.toString(),offset-word.length(),offset,"word");
                        word.setLength(
    0);
                        tokenlist.add(token);
                        b
    =false;
                    }

                }

            }

            
    return tokenlist;
        }

        
        
    //將相連的單個英文或數字組合成詞
        public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
        
    {
            ArrayList
    <Token> tokenlist=new ArrayList<Token>();
            Token word
    =null;
            
    for(int i=0;i<list.size();i++)
            
    {
                Token t
    =list.get(i);
                
    if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
                
    {
                    
    if(word==null)
                        word
    =t;
                    
    else if(word.getEnd()==t.getStart())
                    
    {
                        word.setEnd(t.getEnd());
                        word.setWord(word.getWord()
    +t.getWord());
                    }

                    
    else
                    
    {
                        tokenlist.add(word);
                        word
    =t;
                    }

                }

                
    else if(word!=null)
                
    {
                    tokenlist.add(word);
                    word
    =null;
                    tokenlist.add(t);
                }

                
    else
                    tokenlist.add(t);
            }

            
    if(word!=null)
                tokenlist.add(word);
            
    return tokenlist;
        }

        
        
    //雙角轉單角
        public static int toAscii(int codePoint) 
        
    {
            
    if((codePoint>=65296 && codePoint<=65305)    //0-9
                    || (codePoint>=65313 && codePoint<=65338)    //A-Z
                    || (codePoint>=65345 && codePoint<=65370)    //a-z
                    )
            
    {    
                codePoint 
    -= 65248;
            }

            
    return codePoint;
        }

        
        
    //加載詞典
        public static void loadDictionary() 
        
    {  
            
    if (dictionary == null
            
    {    
                dictionary 
    = new HashMap<String, Integer>();    
                InputStream is 
    = null;    
                BufferedReader br 
    = null;            
                
    try
                
    {
                    is 
    = new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
                    br 
    = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                    String word 
    = null;
                    
    while ((word = br.readLine()) != null
                    
    {
                        word
    =word.toLowerCase();
                        
    if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                        
    {
                            dictionary.put(word.intern(), 
    1);    
                            
    int i = word.length()-1
                            
    while(i >= 2)
                            
    {
                                String temp 
    = word.substring(0, i).intern(); 
                                
    if (!dictionary.containsKey(temp))
                                    dictionary.put(temp,
    2); 
                                i
    --;
                            }

                        }

                    }

                }

                
    catch (Exception e) 
                
    {      
                    log.info(e);
                }

                
    finally
                
    {
                    
    try 
                    
    {      
                        
    if(br!=null)
                            br.close();   
                        
    if(is!=null)
                            is.close();  
                    }

                    
    catch (IOException e)
                    
    {     
                        log.info(e);
                    }
                
                }
     
            }
     
        }

        
        
    public static String[] segWords(Reader input)
        
    {
            ArrayList
    <String> list=new ArrayList<String>();
            
    try
            
    {
                MM2 f
    =new MM2(input);
                ArrayList
    <Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
                
    for(Token t:tlist)
                
    {
                    list.add(t.getWord());
                }

            }

            
    catch(IOException e)
            
    {
                log.info(e);
            }

            
    return (String[])list.toArray(new String[0]);
        }

        
        
    public static void main(String[] args) 
        
    {
            String[] cc
    =MM2.segWords(new StringReader("ibm商務機t60p".toLowerCase()));
            
    for(String c:cc)
            
    {
                System.out.println(c);
            }

        }

    }
    posted on 2011-08-04 15:31 nianzai 閱讀(3461) 評論(1)  編輯  收藏 所屬分類: 中文分詞

    評論:
    # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2014-09-13 18:30 | 余道
    您好,您沒有給出Sentence和Token的定義,我猜不出啊

    hdwgz@qq.com  回復  更多評論
      
    主站蜘蛛池模板: 伊人久久亚洲综合影院首页| 国产成人综合亚洲绿色| 国产成人午夜精品免费视频| 亚洲av成人无码网站…| 亚洲视频在线精品| 久久不见久久见免费视频7| 亚洲欧美日韩久久精品| 亚洲综合另类小说色区| 国产精品成人免费福利| 男女猛烈激情xx00免费视频| 亚洲色图.com| 免费吃奶摸下激烈视频| 亚洲一级免费毛片| fc2成年免费共享视频18| 4480yy私人影院亚洲| 四虎永久免费影院在线| 四虎国产精品永久免费网址| 色偷偷亚洲男人天堂| 亚洲电影免费在线观看| 四虎免费久久影院| 91黑丝国产线观看免费| 一级毛片在线免费播放| 亚洲第一男人天堂| 亚洲AV永久青草无码精品| 日韩在线看片免费人成视频播放| 久久青草精品38国产免费| 日韩在线视频播放免费视频完整版 | 日本红怡院亚洲红怡院最新| 永久免费视频v片www| 久久久久免费精品国产小说| 日本特黄特色AAA大片免费| 中文字幕 亚洲 有码 在线| 久久精品九九亚洲精品天堂| 免费国产成人高清在线观看麻豆 | 日本免费xxxx色视频| 国产成人精品免费视频大全| 亚洲国产午夜精品理论片在线播放 | 亚洲精品成人av在线| 国产亚洲精aa成人网站| 四虎影视永久免费视频观看| 欧美三级在线电影免费|