<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    Rising Sun

      BlogJava :: 首頁 :: 新隨筆 :: 聯繫 :: 聚合  :: 管理 ::
      148 隨筆 :: 0 文章 :: 22 評論 :: 0 Trackbacks

    看了網上的許多對于lucene 分詞解析的文章一知半解且代碼比較老舊,為透徹、系統、全面、深刻的了解分詞是怎么一個過程,通過自定義一個分詞器來分析理解。 其中分詞部分利用ICTCLAS4j接口實現。結構如下所示:


     

             要實現自定義的ICTCLAS4jAnalyzer必須繼承Analyzer類,并重寫createComponents方法。直接上代碼,看到了吧是從StandardAnalyzer 類中直接復制過來的。把其中的Tokenizer實現替換為ICTCLAS4jTokenizer就搞定了。

     @Override

        protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

          final ICTCLAS4jTokenizer src = new ICTCLAS4jTokenizer(reader);

          //src.setMaxTokenLength(maxTokenLength);

          TokenStream tok = new ICTCLAS4jFilter(matchVersion, src);

          tok = new LowerCaseFilter(matchVersion, tok);

          tok = new StopFilter(matchVersion, tok, STOP_WORDS_SET);

          return new TokenStreamComponents(src, tok) {

            @Override

            protected void setReader(final Reader reader) throws IOException {

              //src.setMaxTokenLength(ICTCLAS4jAnalyzer.this.maxTokenLength);

              super.setReader(reader);

            }

          };

    }

     

             ICTCLAS4jTokenizer需重寫incrementToken方法,并設定CharTermAttribute(存放詞條),OffsetAttribute(存放詞條的偏移地址),構造函數中讀入需分詞的字符串,通過ICTCLAS4j返回分詞列表,再通過incrementToken實現分詞。代碼如下:

     

    package com.zhy.analysis.ictclas4j;

     

    import java.io.FileInputStream;

    import java.io.IOException;

    import java.io.InputStream;

    import java.io.Reader;

    import java.util.ArrayList;

     

    import org.apache.lucene.analysis.Tokenizer;

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    import org.ictclas4j.bean.SegResult;

    import org.ictclas4j.segment.SegTag;

     

    /**

     * @author brockhong

     *

     */

     

    public class ICTCLAS4jTokenizer extends Tokenizer {

     

         private static SegTag segment;

         private StringBuilder sb = new StringBuilder();

         private ArrayList<String> words = new ArrayList<String>();

         private int startOffest = 0;

         private int length = 0;

         private int wordIdx = 0;

     

         public ICTCLAS4jTokenizer(Reader input) {

             super(input);

             char[] buf = new char[8192];

             int d = -1;

             try {

                  while ((d = input.read(buf)) != -1) {

                       sb.append(buf, 0, d);

                  }

             } catch (IOException e) {

                  e.printStackTrace();

             }

             SegResult sr = seg().split(sb.toString());

             words = sr.getWords();

         }

             private static SegTag seg() {

             try {

                  if (segment == null) {

                       final InputStream coreDictIn = new FileInputStream(

                                "data/coreDict.dct");

                       final InputStream bigramDictIn = new FileInputStream(

                                "data/BigramDict.dct");

                       final InputStream personTaggerDctIn = new FileInputStream(

                                "data/nr.dct");

                       final InputStream personTaggerCtxIn = new FileInputStream(

                                "data/nr.ctx");

                       final InputStream transPersonTaggerDctIn = new FileInputStream(

                                "data/tr.dct");

                       final InputStream transPersonTaggerCtxIn = new FileInputStream(

                                "data/tr.ctx");

                       final InputStream placeTaggerDctIn = new FileInputStream(

                                "data/ns.dct");

                       final InputStream placeTaggerCtxIn = new FileInputStream(

                                "data/ns.ctx");

                      final InputStream lexTaggerCtxIn = new FileInputStream(

                                "data/lexical.ctx");

                       segment = new SegTag(1, coreDictIn, bigramDictIn,

                                personTaggerDctIn, personTaggerCtxIn,

                                transPersonTaggerDctIn, transPersonTaggerCtxIn,

                                placeTaggerDctIn, placeTaggerCtxIn, lexTaggerCtxIn);

                  }

             } catch (Exception e) {

                  e.printStackTrace();

             }

             return segment;

         }

         private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

         private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

     

         @Override

         public boolean incrementToken() throws IOException {

             while (true) {

                  length = 0;

                  if (wordIdx < words.size()) {

                       String word = words.get(wordIdx);

     

                       termAtt.copyBuffer(word.toCharArray(), 0, word.length());

                       offsetAtt.setOffset(correctOffset(startOffest),

                                correctOffset(startOffest + length));

                       wordIdx++;

                       startOffest += length;

                       return true;

                  } else {

                       return false;

                  }

     

             }

         }

    }   

     

             ICTCLAS4jFilter 分詞過濾器直接使用StandardAnalyzer的過濾器,作為自定義過濾器。

     

    ICTCLAS4j改造過程來自網上,修改SegTag的outputResult讓其輸出的分詞輸入到列表中。并修復了ICTCLAS4j 在沒有分詞結果時報錯的代碼。

    附上analyzer 測試類如下:

     

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;

    import org.apache.lucene.analysis.TokenStream;

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    import org.apache.lucene.util.Version;

    import java.io.StringReader;

    import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

    /**

     *  @author brockhong

     */

    public class Ictclas4janalyzer {

          public static void main(String[] args) throws Exception {

               Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);

              

               Reader r = new StringReader("張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經(jīng)濟的航空領域中的航空母艦地點在深圳。ABCD.#$% Hello World!\n又一段文本123 3.0");     

               TokenStream ts=analyzer.tokenStream("fff", r);     

                CharTermAttribute term=ts.addAttribute(CharTermAttribute.class); 

            ts.reset(); 

            while(ts.incrementToken()){ 

                System.out.println(term.toString()); 

            } 

            ts.end(); 

            ts.close(); 

          }

    }

    Lucene寫入測試類:

    import java.io.File;

    import java.io.IOException;

    import org.apache.lucene.analysis.Analyzer;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    import org.apache.lucene.document.Document;

    import org.apache.lucene.document.StringField;

    import org.apache.lucene.document.TextField;

    import org.apache.lucene.document.Field.Store;

    import org.apache.lucene.index.IndexWriter;

    import org.apache.lucene.index.IndexWriterConfig;

    import org.apache.lucene.store.Directory;

    import org.apache.lucene.store.FSDirectory;

    import org.apache.lucene.util.Version;

    import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

    /** @author brockhong */

    public class Testictclas4j {

                  public static void main(String[] args) throws Exception {

                                // 設置寫入目錄(好幾種呵呵)

                                Directory d = FSDirectory.open(new File("D:/luceneTest2"));

                                // 設置分詞 StandardAnalyzer(會把句子中的字單個分詞)

                                Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);

                                // 設置索引寫入配置

                                IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45,          analyzer);

                                IndexWriter indexwriter = new IndexWriter(d, config);

                                Document doc = new Document();

                                doc.add(new StringField("id", "1", Store.YES));

                                doc.add(new StringField("name", "brockhong", Store.YES));

                                doc.add(new TextField("content",

                                                            "張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經(jīng)濟的航空領域中的航空母艦地點在深圳。ABCD.#$% Hello World!\n又一段文本123 3.0",Store.YES));

                                // 寫入數(shù)據(jù)

                                indexwriter.addDocument(doc);

                                // 提交

                                indexwriter.commit();             }}



    下載jar: /Files/brock/ictclas4j.7z

    posted on 2015-01-07 10:11 brock 閱讀(1098) 評論(0)  編輯  收藏 所屬分類: Lucene

    只有注冊用戶登錄后才能發表評論。


    網站導航:
     
    主站蜘蛛池模板: 亚洲精品视频在线观看视频| 亚洲中文字幕无码mv| 久久99青青精品免费观看| 亚洲精品第一国产综合精品| 日韩激情淫片免费看| a级精品九九九大片免费看| 一本色道久久综合亚洲精品蜜桃冫| 免费jjzz在线播放国产| 无码国产精品一区二区免费16| 亚洲综合一区国产精品| 亚洲精品你懂的在线观看| 免费看韩国黄a片在线观看| 国产精品hd免费观看| 亚洲av无码不卡久久| 亚洲日韩av无码| 卡一卡二卡三在线入口免费| 久久精品免费电影| 美女免费视频一区二区| 亚洲免费电影网站| 亚洲欧洲日产国码无码网站 | 免费看一级高潮毛片| 亚洲成a人片在线观看播放| 国产成人综合亚洲AV第一页| 亚色九九九全国免费视频| 中文在线日本免费永久18近| 亚洲妇女无套内射精| 亚洲欧洲国产视频| 日本亚洲成高清一区二区三区| 国产性生交xxxxx免费| 又黄又爽又成人免费视频| 精品国产污污免费网站| 欧洲精品码一区二区三区免费看| 亚洲免费视频观看| 久久亚洲一区二区| 国内精品99亚洲免费高清| 国产一区视频在线免费观看| 成全高清视频免费观看| 久久99国产综合精品免费| 久久久久国产精品免费看| 中文字幕在线免费视频| 日亚毛片免费乱码不卡一区|