<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    Rising Sun

      BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
      148 隨筆 :: 0 文章 :: 22 評論 :: 0 Trackbacks

    看了網上的許多對于lucene 分詞解析的文章一知半解且代碼比較老舊,為透徹、系統、全面、深刻的了解分詞是怎么一個過程,通過自定義一個分詞器來分析理解。 其中分詞部分利用ICTCLAS4j接口實現。結構如下所示:


     

             要實現自定義的ICTCLAS4jAnalyzer必須繼承Analyzer類,并重寫createComponents方法。直接上代碼,看到了吧是從StandardAnalyzer 類中直接復制過來的。把其中的Tokenizer實現替換為ICTCLAS4jTokenizer就搞定了。

     @Override

        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

          final ICTCLAS4jTokenizer src = new ICTCLAS4jTokenizer(reader);

          //src.setMaxTokenLength(maxTokenLength);

          TokenStream tok = new ICTCLAS4jFilter(matchVersion, src);

          tok = new LowerCaseFilter(matchVersion, tok);

          tok = new StopFilter(matchVersion, tok, STOP_WORDS_SET);

          return new TokenStreamComponents(src, tok) {

            @Override

            protected void setReader(final Reader reader) throws IOException {

              //src.setMaxTokenLength(ICTCLAS4jAnalyzer.this.maxTokenLength);

              super.setReader(reader);

            }

          };

    }

     

             ICTCLAS4jTokenizer需重寫incrementToken方法,并設定CharTermAttribute(存放詞條),OffsetAttribute(存放詞條的偏移地址),構造函數中寫入需分詞的字符串,通過ICTCLAS4j返回分詞列表,再通過incrementToken實現分詞。代碼如下:

     

    package com.zhy.analysis.ictclas4j;

     

    import java.io.FileInputStream;

    import java.io.IOException;

    import java.io.InputStream;

    import java.io.Reader;

    import java.util.ArrayList;

     

    import org.apache.lucene.analysis.Tokenizer;

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    import org.ictclas4j.bean.SegResult;

    import org.ictclas4j.segment.SegTag;

     

    /**

     * @author brockhong

     *

     */

     

    public class ICTCLAS4jTokenizer extends Tokenizer {

     

         private static SegTag segment;

         private StringBuilder sb = new StringBuilder();

         private ArrayList<String> words = new ArrayList<String>();

         private int startOffest = 0;

         private int length = 0;

         private int wordIdx = 0;

     

         public ICTCLAS4jTokenizer(Reader input) {

             super(input);

             char[] buf = new char[8192];

             int d = -1;

             try {

                  while ((d = input.read(buf)) != -1) {

                       sb.append(buf, 0, d);

                  }

             } catch (IOException e) {

                  e.printStackTrace();

             }

             SegResult sr = seg().split(sb.toString());

             words = sr.getWords();

         }

             private static SegTag seg() {

             try {

                  if (segment == null) {

                       final InputStream coreDictIn = new FileInputStream(

                                "data/coreDict.dct");

                       final InputStream bigramDictIn = new FileInputStream(

                                "data/BigramDict.dct");

                       final InputStream personTaggerDctIn = new FileInputStream(

                                "data/nr.dct");

                       final InputStream personTaggerCtxIn = new FileInputStream(

                                "data/nr.ctx");

                       final InputStream transPersonTaggerDctIn = new FileInputStream(

                                "data/tr.dct");

                       final InputStream transPersonTaggerCtxIn = new FileInputStream(

                                "data/tr.ctx");

                       final InputStream placeTaggerDctIn = new FileInputStream(

                                "data/ns.dct");

                       final InputStream placeTaggerCtxIn = new FileInputStream(

                                "data/ns.ctx");

                      final InputStream lexTaggerCtxIn = new FileInputStream(

                                "data/lexical.ctx");

                       segment = new SegTag(1, coreDictIn, bigramDictIn,

                                personTaggerDctIn, personTaggerCtxIn,

                                transPersonTaggerDctIn, transPersonTaggerCtxIn,

                                placeTaggerDctIn, placeTaggerCtxIn, lexTaggerCtxIn);

                  }

             } catch (Exception e) {

                  e.printStackTrace();

             }

             return segment;

         }

         private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

         private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

     

         @Override

         public boolean incrementToken() throws IOException {

             while (true) {

                  length = 0;

                  if (wordIdx < words.size()) {

                       String word = words.get(wordIdx);

     

                       termAtt.copyBuffer(word.toCharArray(), 0, word.length());

                       offsetAtt.setOffset(correctOffset(startOffest),

                                correctOffset(startOffest + length));

                       wordIdx++;

                       startOffest += length;

                       return true;

                  } else {

                       return false;

                  }

     

             }

         }

    }   

     

             ICTCLAS4jFilter 分詞過濾器直接使用StandardAnalyzer的過濾器,作為自定義過濾器。

     

    ICTCLAS4j改造過程來自網上,修改SegTag的outputResult讓其輸出的分詞輸入到列表中。并修復了ICTCLAS4j 在分詞中沒有時報錯的代碼。

    附上analyzer 測試類如下:

     

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;

    import org.apache.lucene.analysis.TokenStream;

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    import org.apache.lucene.util.Version;

    import java.io.StringReader;

    import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

    /**

     *  @author brockhong

     */

    public class Ictclas4janalyzer {

          public static void main(String[] args) throws Exception {

               Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);

              

               Reader r = new StringReader("張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經(jīng)濟(jì)的航空領(lǐng)域中的航空母艦地點(diǎn)在深圳。ABCD.#$% Hello World!\n又一段文本123 3.0");     

               TokenStream ts=analyzer.tokenStream("fff", r);     

                CharTermAttribute term=ts.addAttribute(CharTermAttribute.class); 

            ts.reset(); 

            while(ts.incrementToken()){ 

                System.out.println(term.toString()); 

            } 

            ts.end(); 

            ts.close(); 

          }

    }

    Lucene寫入測試類:

    import java.io.File;

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    import org.apache.lucene.document.Document;

    import org.apache.lucene.document.StringField;

    import org.apache.lucene.document.TextField;

    import org.apache.lucene.document.Field.Store;

    import org.apache.lucene.index.IndexWriter;

    import org.apache.lucene.index.IndexWriterConfig;

    import org.apache.lucene.store.Directory;

    import org.apache.lucene.store.FSDirectory;

    import org.apache.lucene.util.Version;

    import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

    /** @author brockhong */

    public class Testictclas4j {

                  public static void main(String[] args) throws Exception {

                                // 設(shè)置寫入目錄(好幾種呵呵)

                                Directory d = FSDirectory.open(new File("D:/luceneTest2"));

                                // 設(shè)置分詞 StandardAnalyzer(會把句子中的字單個(gè)分詞)

                                Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);

                                // 設(shè)置索引寫入配置

                                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45,          analyzer);

                                IndexWriter indexwriter = new IndexWriter(d, config);

                                Document doc = new Document();

                                doc.add(new StringField("id", "1", Store.YES));

                                doc.add(new StringField("name", "brockhong", Store.YES));

                                doc.add(new TextField("content",

                                                            "張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經(jīng)濟(jì)的航空領(lǐng)域中的航空母艦地點(diǎn)在深圳。ABCD.#$% Hello World!\n又一段文本123 3.0",Store.YES));

                                // 寫入數(shù)據(jù)

                                indexwriter.addDocument(doc);

                                // 提交

                                indexwriter.commit();             }}



    下載jar: /Files/brock/ictclas4j.7z

    posted on 2015-01-07 10:11 brock 閱讀(1098) 評論(0)  編輯  收藏 所屬分類: Lucene

    只有注冊用戶登錄后才能發表評論。


    網站導航:
     
    主站蜘蛛池模板: 亚洲啪啪免费视频| 亚洲视频精品在线| 亚洲国产亚洲综合在线尤物| 免费看男女下面日出水视频| 久久永久免费人妻精品| 免费无遮挡无码视频在线观看| 亚洲av无码片在线观看| 婷婷亚洲久悠悠色悠在线播放| 亚洲第一页综合图片自拍| 成年女人男人免费视频播放| 一级毛片免费观看不卡视频| 男人天堂免费视频| 成人a毛片视频免费看| 亚洲av色香蕉一区二区三区蜜桃| 亚洲性色成人av天堂| 亚洲国产高清在线| 亚洲熟妇无码另类久久久| 亚洲精品无码久久久| 四虎永久精品免费观看| 韩国日本好看电影免费看| 波多野结衣在线免费观看| 99久久免费精品视频| 亚洲国产精品网站在线播放| 亚洲成aⅴ人片在线影院八| 亚洲精品自产拍在线观看动漫| 在线亚洲人成电影网站色www | 久久精品国产亚洲AV麻豆不卡| 亚洲日韩中文字幕日韩在线 | 18禁亚洲深夜福利人口| 男人天堂2018亚洲男人天堂| 亚洲国产成人91精品| 亚洲精品在线电影| 亚洲网址在线观看| 亚洲欧洲国产综合| 亚洲春色另类小说| 亚洲一区二区三区亚瑟| 亚洲av成人综合网| 亚洲欧洲免费无码| 日韩国产欧美亚洲v片| 黄色免费网址在线观看| 久久亚洲精品无码aⅴ大香|