search鐨凟xtractor浠g爜濡備笅錛岋紙鍒拰涔︿笂瀹炰緥鐩稿悓錛変緵澶у鍙傝冿細闄勪歡閲屾湁瀹屾暣浠g爜
package com.luceneheritrixbook.extractor.younet;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;
/**
* <p></p>
* @author cnyqiao@hotmail.com
* @date Feb 6, 2009
*/
public class ExtractYounetMoblie extends Extractor {
@Override
public void extract() {
BufferedWriter bw = null;
NodeFilter title_filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mo_tit"));
NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"))));
NodeFilter img_filter = new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
//鎻愬彇鏍囬淇℃伅
try {
//Parser鏍規嵁榪囨護鍣ㄨ繑鍥炴墍鏈夋弧瓚寵繃婊ゆ潯浠剁殑鑺傜偣
// 榪唬閫愭笎鏌ユ壘
NodeList nodeList=this.getParser().parse(title_filter);
NodeIterator it = nodeList.elements();
StringBuffer title = new StringBuffer();
while (it.hasMoreNodes()) {
Node node = (Node) it.nextNode();
String[] names = node.toPlainTextString().split(" ");
for(int i = 0; i < names.length; i++)
title.append(names[i]).append("-");
title.append(new Date().getTime());
//鍒涘緩瑕佺敓鎴愮殑鏂囦歡
bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
//鑾峰彇褰撳墠鎻愬彇欏電殑瀹屾暣URL鍦板潃
int startPos = this.getInuputFilePath().indexOf("mirror") + 6;
String url_seg = this.getInuputFilePath().substring(startPos);
url_seg = url_seg.replaceAll("\\\\", "/");
String url = "http:/" + url_seg;
//鍐欏叆褰撳墠鎻愬彇欏電殑瀹屾暣URL鍦板潃
bw.write(url + NEWLINE);
bw.write(names[0] + NEWLINE);
bw.write(names[1] + NEWLINE);
}
// 閲嶇疆Parser
this.getParser().reset();
Parser attNameParser = null;
Parser attValueParser = null;
//Parser parser=new Parser("http://www.sina.com.cn");
NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"));
NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp2"));
String attName = "";
String attValue = "";
// 榪唬閫愭笎鏌ユ壘
nodeList=this.getParser().parse(attribute_filter);
it = nodeList.elements();
while (it.hasMoreNodes()) {
Node node = (Node) it.nextNode();
attNameParser = new Parser();
attNameParser.setEncoding("GB2312");
attNameParser.setInputHTML(node.toHtml());
NodeList attNameNodeList = attNameParser.parse(attributeName_filter);
attName = attNameNodeList.elements().nextNode().toPlainTextString();
attValueParser = new Parser();
attValueParser.setEncoding("GB2312");
attValueParser.setInputHTML(node.toHtml());
NodeList attValueNodeList = attValueParser.parse(attributeValue_filter);
attValue = attValueNodeList.elements().nextNode().toPlainTextString();
bw.write(attName.trim() + attValue.trim());
bw.newLine();
}
// 閲嶇疆Parser
this.getParser().reset();
String imgUrl = "";
String fileType ="";
// 榪唬閫愭笎鏌ユ壘
nodeList=this.getParser().parse(img_filter);
it = nodeList.elements();
while (it.hasMoreNodes()) {
Node node = (Node) it.nextNode();
ImageTag imgNode = (ImageTag)node.getChildren().elements().nextNode();
imgUrl = imgNode.getAttribute("src");
fileType = imgUrl.trim().substring(imgUrl
.lastIndexOf(".") + 1);
//鐢熸垚鏂扮殑鍥劇墖鐨勬枃浠跺悕
String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
//imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
//鍒╃敤miorr鐩綍涓嬬殑鍥劇墖鐢熸垚鐨勬柊鐨勫浘鐗?/span>
this.copyImage(imgUrl, new_iamge_file);
bw.write(SEPARATOR + NEWLINE);
bw.write(new_iamge_file + NEWLINE);
}
} catch(Exception e) {
e.printStackTrace();
} finally {
try{
if (bw != null)
bw.close();
}catch(IOException e){
e.printStackTrace();
}
}
}
}
榪愯涔︿笂鐨刪eritrix瀹炰緥錛屽茍鎸変功涓婄殑榛樿璁劇疆榪涜鎶撳彇濡備笅錛碉疾錛╋細錛堣鑷繁鍒嗘瀽鏁寸悊錛?br />
http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html


]]>