亚洲爆乳精品无码一区二区,亚洲a在线视频视频,亚洲一区电影在线观看

<rt id="bn8ez"></rt>

<label id="bn8ez"></label>

<span id="bn8ez"></span>

<label id="bn8ez"><meter id="bn8ez"></meter></label>

<button id="msamy"></button>

<dl id="msamy"><tr id="msamy"></tr></dl>

隨筆-23 評論-58 文章-0 trackbacks-0

基于詞典的逆向最大匹配中文分詞算法，逆向分詞比正向分詞效果好

基于詞典的逆向最大匹配中文分詞算法，能實現中英文數字混合分詞。比如能分出這樣的詞：bb霜、3室、樂phone、touch4、mp3、T恤。實際分詞效果比正向分詞效果好

查看第2版：逆向最大匹配分詞程序，能實現中英文數字混合分詞 (第二版)

/**
 * Dictionary-based reverse maximum matching (RMM) word segmenter.
 *
 * <p>Each sentence is scanned from its last character towards its first. The
 * scanner grows a candidate word by prepending characters for as long as the
 * grown candidate is a key of the dictionary, and emits a token when it can
 * no longer grow.
 *
 * <p>The dictionary is loaded once, from the classpath resource
 * {@code dictionary.txt}, when the class is initialised.
 */
public class RMM
{
    private static final Log log = LogFactory.getLog(RMM.class);

    /** Dictionary value for a key that is a complete dictionary word. */
    private static final int FULL_WORD = 1;

    /** Dictionary value for a key that is only a proper suffix of a dictionary word. */
    private static final int SUFFIX_ONLY = 2;

    /** Maps lower-cased words and their suffixes to {@link #FULL_WORD} or {@link #SUFFIX_ONLY}. */
    private static HashMap<String, Integer> dictionary = null;

    /** Dictionary lines longer than this are ignored when loading. */
    private static final int WORD_MAX_LENGTH = 9;

    static
    {
        loadDictionary();
    }

    /**
     * Splits sentences into word tokens using reverse maximum matching.
     *
     * <p>The sentences are processed from the last to the first and each one
     * from its end to its start; the collected tokens are reversed before
     * returning, so the result is in ascending sentence order. The list passed
     * in is only read and is left in its original order.
     *
     * @param list sentences to segment
     * @return the tokens of all sentences; every token has the type {@code "word"}
     * @throws IOException declared for callers; nothing in this method raises it
     */
    public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();

        // Walk the sentences backwards by index rather than reversing the caller's list in place.
        for (int s = list.size() - 1; s >= 0; s--)
        {
            Sentence sen = list.get(s);
            char[] text = sen.getText();

            StringBuffer word = new StringBuffer();
            // offset tracks the position of the character at bufferIndex.
            int offset = sen.getStartOffset() + text.length;
            int bufferIndex = text.length - 1;

            while (bufferIndex > -1)
            {
                offset--;
                char c = text[bufferIndex--];
                boolean emit = false;

                if (word.length() == 0)
                {
                    word.append(c);
                }
                else
                {
                    Integer kind = dictionary.get(c + word.toString());

                    // A suffix-only match may keep growing only while characters remain;
                    // at the start of the sentence only a complete word is accepted.
                    if (kind != null && (kind.intValue() == FULL_WORD || bufferIndex > -1))
                    {
                        word.insert(0, c);
                    }
                    else
                    {
                        // c does not extend the word: push it back for the next token.
                        bufferIndex++;
                        offset++;

                        // The candidate grew through suffix-only keys: give leading
                        // characters back until it is no longer marked suffix-only.
                        while (word.length() > 1 && isSuffixOnly(word.toString()))
                        {
                            word.deleteCharAt(0);
                            bufferIndex++;
                            offset++;
                        }

                        emit = true;
                    }
                }

                // Emit when the word cannot grow or the start of the sentence was reached.
                if (emit || bufferIndex == -1)
                {
                    tokenlist.add(new Token(word.toString(), offset, offset + word.length(), "word"));
                    word.setLength(0);
                }
            }
        }

        // Tokens were collected back-to-front.
        Collections.reverse(tokenlist);
        return tokenlist;
    }

    /** Returns true when the candidate is in the dictionary only as a suffix of a longer word. */
    private static boolean isSuffixOnly(String candidate)
    {
        Integer kind = dictionary.get(candidate);
        return kind != null && kind.intValue() == SUFFIX_ONLY;
    }

    /**
     * Loads the dictionary from the classpath resource {@code dictionary.txt}
     * (read as UTF-8, one entry per line). Does nothing if the dictionary
     * has already been created.
     *
     * <p>Lines containing {@code #} or longer than {@link #WORD_MAX_LENGTH}
     * are skipped. Every accepted line is lower-cased and stored as a full
     * word; its proper suffixes of at least two characters are stored as
     * suffix-only keys unless the key already exists.
     *
     * <p>A missing resource or a read failure is logged and leaves the
     * dictionary empty or partially filled; no exception is propagated.
     */
    public static void loadDictionary()
    {
        if (dictionary != null)
        {
            return;
        }

        dictionary = new HashMap<String, Integer>();
        InputStream is = null;
        BufferedReader br = null;

        try
        {
            // Read through the class loader so the resource is also found inside a jar.
            is = RMM.class.getClassLoader().getResourceAsStream("dictionary.txt");
            if (is == null)
            {
                log.error("dictionary.txt was not found on the classpath; the dictionary is empty");
                return;
            }

            br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String word = null;

            while ((word = br.readLine()) != null)
            {
                word = word.toLowerCase();

                if ((word.indexOf("#") == -1) && (word.length() <= WORD_MAX_LENGTH))
                {
                    // A full word always wins over an earlier suffix-only entry.
                    dictionary.put(word, Integer.valueOf(FULL_WORD));

                    // Register the suffixes starting at index 1 .. length-2.
                    for (int i = 1; i < word.length() - 1; i++)
                    {
                        String suffix = word.substring(i);
                        if (!dictionary.containsKey(suffix))
                        {
                            dictionary.put(suffix, Integer.valueOf(SUFFIX_ONLY));
                        }
                    }
                }
            }
        }
        catch (IOException e)
        {
            log.error("Failed to load dictionary.txt", e);
        }
        finally
        {
            try
            {
                if (br != null)
                {
                    br.close();
                }
                if (is != null)
                {
                    is.close();
                }
            }
            catch (IOException e)
            {
                log.error("Failed to close dictionary.txt", e);
            }
        }
    }

    /**
     * Segments the text supplied by the reader and returns the words as strings.
     *
     * <p>The text is cut into sentences with {@code Util.getSentence}, segmented
     * with {@link #getToken}, and post-processed with {@code Util.getNewToken}.
     * The dictionary keys are lower-cased when loaded, and this method does not
     * change the case of the input.
     *
     * @param reader source of the text to segment
     * @return the segmented words; empty if reading the text failed
     */
    public static String[] segWords(Reader reader)
    {
        ArrayList<String> list = new ArrayList<String>();

        try
        {
            ArrayList<Token> tlist = Util.getNewToken(getToken(Util.getSentence(reader)));
            for (Token t : tlist)
            {
                list.add(t.getWord());
            }
        }
        catch (IOException e)
        {
            // Best effort: report the failure and return what was collected.
            log.error("Failed to segment the input", e);
        }

        return list.toArray(new String[list.size()]);
    }

    /** Demo entry point: segments a sample sentence and prints one word per line. */
    public static void main(String[] args)
    {
        String[] cc = RMM.segWords(new StringReader("急、急、急、花里林居,二房二廳,業主誠心,出租".toLowerCase()));

        for (String c : cc)
        {
            System.out.println(c);
        }
    }
}

/**
 * Helper routines for the segmenter: cutting raw text into sentences,
 * merging adjacent single-character tokens, and folding full-width
 * characters to their ASCII equivalents.
 */
public class Util
{
    /** Distance between a full-width digit or Latin letter and its ASCII counterpart. */
    private static final int FULL_WIDTH_SHIFT = 65248;

    /**
     * Cuts the text into runs of lower-case letters, decimal digits and
     * "other letter" characters (the category CJK ideographs belong to).
     * Any other character ends the current run and is itself dropped.
     * Full-width characters are folded with {@link #toAscii} before being stored.
     *
     * <p>A sentence is produced at every separator and at end of input, so
     * consecutive separators yield sentences with empty text.
     *
     * <p>NOTE(review): upper-case letters fall in a different category and are
     * therefore treated as separators; the input is presumably expected to be
     * lower-cased by the caller - confirm.
     *
     * @param reader source of the text; it is read one char at a time until exhausted
     * @return the sentences in input order, each with the offset of its first char
     * @throws IOException if reading from the reader fails
     */
    public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
    {
        ArrayList<Sentence> list = new ArrayList<Sentence>();
        StringBuffer cb = new StringBuffer();
        int offset = 0; // index of the char currently held in d
        int d = reader.read();

        while (d > -1)
        {
            int type = Character.getType(d);
            boolean separator = !(type == Character.LOWERCASE_LETTER
                    || type == Character.DECIMAL_DIGIT_NUMBER
                    || type == Character.OTHER_LETTER);

            if (!separator)
            {
                cb.append((char) toAscii(d));
            }

            d = reader.read();

            if (d == -1 || separator)
            {
                // The run stops before a separator but includes a final accepted char,
                // so only the latter case ends one position past the current index.
                int end = separator ? offset : offset + 1;
                char[] ioBuffer = cb.toString().toCharArray();
                list.add(new Sentence(ioBuffer, end - ioBuffer.length));
                cb.setLength(0);
            }

            offset++;
        }

        return list;
    }

    /**
     * Joins runs of adjacent single-character tokens into one token.
     *
     * <p>A token takes part in a run when its word is exactly one character
     * and that character is not in the "other letter" category. Two such
     * tokens are joined when the end of the first equals the start of the
     * second. The first token of a run is updated in place with
     * {@code setEnd} and {@code setWord}; all other tokens pass through unchanged.
     *
     * @param list tokens in ascending position order
     * @return a new list with the joined tokens, in the same order
     * @throws IOException declared for callers; nothing in this method raises it
     */
    public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
    {
        ArrayList<Token> tokenlist = new ArrayList<Token>();
        Token pending = null; // run being assembled, not yet added to the result

        for (int i = 0; i < list.size(); i++)
        {
            Token t = list.get(i);
            boolean joinable = t.getWord().length() == 1
                    && Character.getType((int) t.getWord().charAt(0)) != Character.OTHER_LETTER;

            if (!joinable)
            {
                // Flush the run, if any, before the ordinary token.
                if (pending != null)
                {
                    tokenlist.add(pending);
                    pending = null;
                }
                tokenlist.add(t);
            }
            else if (pending == null)
            {
                pending = t;
            }
            else if (pending.getEnd() == t.getStart())
            {
                pending.setEnd(t.getEnd());
                pending.setWord(pending.getWord() + t.getWord());
            }
            else
            {
                // Not contiguous: close the current run and start a new one.
                tokenlist.add(pending);
                pending = t;
            }
        }

        if (pending != null)
        {
            tokenlist.add(pending);
        }

        return tokenlist;
    }

    /**
     * Converts a full-width digit or Latin letter to its ASCII counterpart.
     *
     * @param codePoint the character value to convert
     * @return the ASCII value for full-width 0-9, A-Z and a-z; otherwise the argument unchanged
     */
    public static int toAscii(int codePoint)
    {
        boolean fullWidthDigit = codePoint >= '\uFF10' && codePoint <= '\uFF19'; // 65296-65305
        boolean fullWidthUpper = codePoint >= '\uFF21' && codePoint <= '\uFF3A'; // 65313-65338
        boolean fullWidthLower = codePoint >= '\uFF41' && codePoint <= '\uFF5A'; // 65345-65370

        if (fullWidthDigit || fullWidthUpper || fullWidthLower)
        {
            return codePoint - FULL_WIDTH_SHIFT;
        }
        return codePoint;
    }
}

posted on 2011-08-19 13:22 nianzai 閱讀(4487) 評論(2) 編輯收藏所屬分類: 中文分詞

評論:

# re: 基于詞典的逆向最大匹配中文分詞算法，逆向分詞比正向分詞效果好 [未登錄] 2011-10-21 16:38 | zxj

樓主，代碼中的 Sentence 類呢？ 回復 | 更多評論

# re: 基于詞典的逆向最大匹配中文分詞算法，逆向分詞比正向分詞效果好 2011-11-08 15:55 | nianzai

參考《正向最大匹配中文分詞算法》一文。 回復 | 更多評論

新用戶注冊刷新評論列表


只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客博問管理
相關文章: 隱馬可夫(HMM)中文分詞詞性標注程序 | 最大概率分詞程序 | 最短路徑分詞程序 | 全切分分詞程序,能實現中英文數字混合分詞 | 逆向最大匹配分詞程序，能實現中英文數字混合分詞 (第二版) | 正向最大匹配分詞程序，能實現中英文數字混合分詞 (第二版) | 基于詞典的逆向最大匹配中文分詞算法，逆向分詞比正向分詞效果好 | 基于詞典的正向最大匹配中文分詞算法，能實現中英文數字混合分詞

主站蜘蛛池模板：中文字幕的电影免费网站| A在线观看免费网站大全| 亚洲自偷自偷在线成人网站传媒 | 嘿嘿嘿视频免费网站在线观看| 免费精品视频在线| 亚洲乱码一区二区三区国产精品| 午夜影视日本亚洲欧洲精品一区| 免费观看亚洲人成网站| 毛片免费在线观看网站| 67194成手机免费观看| 中文字幕乱码系列免费| 特级做a爰片毛片免费看| 亚洲小视频在线观看| 亚洲日韩精品无码专区网站| 妞干网手机免费视频| 一本岛高清v不卡免费一三区| 免费精品99久久国产综合精品| 国产精品免费αv视频| 青娱乐在线视频免费观看| 亚洲爆乳大丰满无码专区| 国产jizzjizz免费视频| 岛国av无码免费无禁网站| 免费A级毛片无码无遮挡内射| 亚洲成人免费网站| 57pao一国产成视频永久免费| 午夜无码A级毛片免费视频| 99视频在线观看免费| 日韩久久无码免费毛片软件 | 国产禁女女网站免费看| 免费毛片在线视频| 免费高清小黄站在线观看| 毛片网站免费在线观看| 成人黄动漫画免费网站视频| 欧美在线看片A免费观看| 久久精品无码一区二区三区免费| 妻子5免费完整高清电视| 日本精品人妻无码免费大全| 在线jlzzjlzz免费播放| 免费观看的av毛片的网站| 又黄又爽的视频免费看| 亚洲片国产一区一级在线观看|

<code id="syssi"><delect id="syssi"></delect></code>

<li id="syssi"></li>

<tfoot id="syssi"></tfoot>

<code id="syssi"><acronym id="syssi"></acronym></code>

<dl id="syssi"><tr id="syssi"></tr></dl>

<center id="syssi"><tr id="syssi"></tr></center>

<code id="syssi"><xmp id="syssi"></xmp></code>

<nav id="syssi"></nav>