<rt id="bn8ez"></rt>
<label id="bn8ez"></label>

  • <span id="bn8ez"></span>

    <label id="bn8ez"><meter id="bn8ez"></meter></label>

    隨筆 - 24, 文章 - 6, 評論 - 70, 引用 - 0
    數據加載中……

    Search Crawler 源碼

    /*
    RComponent network component (.java edition)
    Provides programming components for FTP, NTP, POP3 and SMTP.
    Download:
    http://www.rcomponet.com

    */

    import java.awt.BorderLayout;
    import java.awt.Cursor;
    import java.awt.Font;
    import java.awt.GridBagConstraints;
    import java.awt.GridBagLayout;
    import java.awt.Insets;
    import java.awt.event.ActionEvent;
    import java.awt.event.ActionListener;
    import java.awt.event.KeyEvent;
    import java.awt.event.WindowAdapter;
    import java.awt.event.WindowEvent;
    import java.io.BufferedReader;
    import java.io.FileWriter;
    import java.io.InputStreamReader;
    import java.io.PrintWriter;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.LinkedHashSet;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import javax.swing.BorderFactory;
    import javax.swing.JButton;
    import javax.swing.JCheckBox;
    import javax.swing.JComboBox;
    import javax.swing.JFrame;
    import javax.swing.JLabel;
    import javax.swing.JMenu;
    import javax.swing.JMenuBar;
    import javax.swing.JMenuItem;
    import javax.swing.JOptionPane;
    import javax.swing.JPanel;
    import javax.swing.JProgressBar;
    import javax.swing.JScrollPane;
    import javax.swing.JSeparator;
    import javax.swing.JTable;
    import javax.swing.JTextField;
    import javax.swing.table.DefaultTableModel;

    /**
     * Search Crawler: a Swing GUI application that crawls the web starting
     * from a user-supplied URL.
     *
     * The user enters a start URL, an optional maximum number of URLs to
     * crawl, a matches log file path and a search string. The crawler walks
     * pages breadth-first from the start URL (honoring robots.txt Disallow
     * rules), records every page whose contents contain all search terms in
     * both the matches table and the log file, and shows live crawl stats.
     */
    public class SearchCrawler extends JFrame {
      // Max URLs drop-down values.
      private static final String[] MAX_URLS = { "50", "100", "500", "1000" };

      // Cache of robots.txt disallow lists, keyed by lowercase host name.
      // Values are ArrayLists of disallowed path prefixes.
      private HashMap disallowListCache = new HashMap();

      // Search GUI controls.
      private JTextField startTextField;
      private JComboBox maxComboBox;
      private JCheckBox limitCheckBox;
      private JTextField logTextField;
      private JTextField searchTextField;
      private JCheckBox caseCheckBox;
      private JButton searchButton;

      // Search stats GUI controls.
      private JLabel crawlingLabel2;
      private JLabel crawledLabel2;
      private JLabel toCrawlLabel2;
      private JProgressBar progressBar;
      private JLabel matchesLabel2;

      // Table listing search matches.
      private JTable table;

      // Whether crawling is underway. Written by the Stop button on the event
      // dispatch thread and read by the crawl loop on the worker thread, so it
      // is volatile to guarantee a stop request is seen promptly.
      private volatile boolean crawling;

      // Matches log file print writer.
      private PrintWriter logFileWriter;

      /**
       * Builds the GUI: menu bar, search form, stats labels and matches table.
       */
      public SearchCrawler() {
        // Set application title.
        setTitle("Search Crawler");

        // Set window size.
        setSize(600, 600);

        // Handle window closing events.
        addWindowListener(new WindowAdapter() {
          public void windowClosing(WindowEvent e) {
            actionExit();
          }
        });

        // Set up File menu.
        JMenuBar menuBar = new JMenuBar();
        JMenu fileMenu = new JMenu("File");
        fileMenu.setMnemonic(KeyEvent.VK_F);
        JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener() {
          public void actionPerformed(ActionEvent e) {
            actionExit();
          }
        });
        fileMenu.add(fileExitMenuItem);
        menuBar.add(fileMenu);
        setJMenuBar(menuBar);

        // Set up search panel.
        JPanel searchPanel = new JPanel();
        GridBagConstraints constraints;
        GridBagLayout layout = new GridBagLayout();
        searchPanel.setLayout(layout);

        JLabel startLabel = new JLabel("Start URL:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(startLabel, constraints);
        searchPanel.add(startLabel);

        startTextField = new JTextField();
        // NOTE(review): this statement was corrupted in the pasted source
        // ("startTextField.Text=\"..."); restored to the plain constraints
        // reset used for every other control in this panel.
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(startTextField, constraints);
        searchPanel.add(startTextField);

        JLabel maxLabel = new JLabel("Max URLs to Crawl:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(maxLabel, constraints);
        searchPanel.add(maxLabel);

        maxComboBox = new JComboBox(MAX_URLS);
        maxComboBox.setEditable(true);
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(maxComboBox, constraints);
        searchPanel.add(maxComboBox);

        limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.WEST;
        constraints.insets = new Insets(0, 10, 0, 0);
        layout.setConstraints(limitCheckBox, constraints);
        searchPanel.add(limitCheckBox);

        JLabel blankLabel = new JLabel();
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        layout.setConstraints(blankLabel, constraints);
        searchPanel.add(blankLabel);

        JLabel logLabel = new JLabel("Matches Log File:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(logLabel, constraints);
        searchPanel.add(logLabel);

        // Default the log file to "crawler.log" in the working directory.
        String file = System.getProperty("user.dir")
            + System.getProperty("file.separator") + "crawler.log";
        logTextField = new JTextField(file);
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(logTextField, constraints);
        searchPanel.add(logTextField);

        JLabel searchLabel = new JLabel("Search String:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(searchLabel, constraints);
        searchPanel.add(searchLabel);

        searchTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.insets = new Insets(5, 5, 0, 0);
        constraints.gridwidth = 2;
        constraints.weightx = 1.0d;
        layout.setConstraints(searchTextField, constraints);
        searchPanel.add(searchTextField);

        caseCheckBox = new JCheckBox("Case Sensitive");
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 5);
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        layout.setConstraints(caseCheckBox, constraints);
        searchPanel.add(caseCheckBox);

        searchButton = new JButton("Search");
        searchButton.addActionListener(new ActionListener() {
          public void actionPerformed(ActionEvent e) {
            actionSearch();
          }
        });
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        layout.setConstraints(searchButton, constraints);
        searchPanel.add(searchButton);

        JSeparator separator = new JSeparator();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        layout.setConstraints(separator, constraints);
        searchPanel.add(separator);

        JLabel crawlingLabel1 = new JLabel("Crawling:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(crawlingLabel1, constraints);
        searchPanel.add(crawlingLabel1);

        crawlingLabel2 = new JLabel();
        crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(crawlingLabel2, constraints);
        searchPanel.add(crawlingLabel2);

        JLabel crawledLabel1 = new JLabel("Crawled URLs:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(crawledLabel1, constraints);
        searchPanel.add(crawledLabel1);

        crawledLabel2 = new JLabel();
        crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(crawledLabel2, constraints);
        searchPanel.add(crawledLabel2);

        JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(toCrawlLabel1, constraints);
        searchPanel.add(toCrawlLabel1);

        toCrawlLabel2 = new JLabel();
        toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(toCrawlLabel2, constraints);
        searchPanel.add(toCrawlLabel2);

        JLabel progressLabel = new JLabel("Crawling Progress:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(progressLabel, constraints);
        searchPanel.add(progressLabel);

        progressBar = new JProgressBar();
        progressBar.setMinimum(0);
        progressBar.setStringPainted(true);
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(progressBar, constraints);
        searchPanel.add(progressBar);

        JLabel matchesLabel1 = new JLabel("Search Matches:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 10, 0);
        layout.setConstraints(matchesLabel1, constraints);
        searchPanel.add(matchesLabel1);

        matchesLabel2 = new JLabel();
        matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 10, 5);
        layout.setConstraints(matchesLabel2, constraints);
        searchPanel.add(matchesLabel2);

        // Set up matches table (read-only, single "URL" column).
        table = new JTable(new DefaultTableModel(new Object[][] {},
            new String[] { "URL" }) {
          public boolean isCellEditable(int row, int column) {
            return false;
          }
        });

        // Set up Matches panel.
        JPanel matchesPanel = new JPanel();
        matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
        matchesPanel.setLayout(new BorderLayout());
        matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);

        // Add panels to display.
        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(searchPanel, BorderLayout.NORTH);
        getContentPane().add(matchesPanel, BorderLayout.CENTER);
      }

      // Exit this program.
      private void actionExit() {
        System.exit(0);
      }

      /**
       * Handles the Search/Stop button. When a crawl is running the button
       * acts as Stop; otherwise it validates the form and starts a crawl.
       */
      private void actionSearch() {
        // If stop button clicked, turn crawling flag off.
        if (crawling) {
          crawling = false;
          return;
        }

        ArrayList errorList = new ArrayList();

        // Validate that start URL has been entered.
        String startUrl = startTextField.getText().trim();
        if (startUrl.length() < 1) {
          errorList.add("Missing Start URL.");
        }
        // Verify start URL.
        else if (verifyUrl(startUrl) == null) {
          errorList.add("Invalid Start URL.");
        }

        // Validate that Max URLs is either empty or is a number.
        int maxUrls = 0;
        String max = ((String) maxComboBox.getSelectedItem()).trim();
        if (max.length() > 0) {
          try {
            maxUrls = Integer.parseInt(max);
          } catch (NumberFormatException ignored) {
            // Deliberately ignored: maxUrls stays 0, which the range check
            // below reports as an invalid value.
          }
          if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
          }
        }

        // Validate that matches log file has been entered.
        String logFile = logTextField.getText().trim();
        if (logFile.length() < 1) {
          errorList.add("Missing Matches Log File.");
        }

        // Validate that search string has been entered.
        String searchString = searchTextField.getText().trim();
        if (searchString.length() < 1) {
          errorList.add("Missing Search String.");
        }

        // Show errors, if any, and return.
        if (errorList.size() > 0) {
          StringBuffer message = new StringBuffer();

          // Concatenate errors into single message.
          for (int i = 0; i < errorList.size(); i++) {
            message.append(errorList.get(i));
            if (i + 1 < errorList.size()) {
              message.append("\n");
            }
          }

          showError(message.toString());
          return;
        }

        // Remove "www" from start URL if present.
        startUrl = removeWwwFromUrl(startUrl);

        // Start the Search Crawler.
        search(logFile, startUrl, maxUrls, searchString);
      }

      /**
       * Runs a crawl on a background thread: disables the form, resets stats,
       * crawls, then restores the form.
       *
       * NOTE(review): the worker updates Swing components directly from the
       * background thread, as the original code did; strictly those updates
       * belong on the event dispatch thread — confirm before tightening.
       */
      private void search(final String logFile, final String startUrl,
          final int maxUrls, final String searchString) {
        // Start the search in a new thread.
        Thread thread = new Thread(new Runnable() {
          public void run() {
            // Open the matches log file FIRST, before any controls are
            // disabled, so a failure here leaves the UI fully usable.
            // (Previously a failed open returned with every control disabled
            // and the wait cursor stuck.)
            try {
              logFileWriter = new PrintWriter(new FileWriter(logFile));
            } catch (Exception e) {
              showError("Unable to open matches log file.");
              return;
            }

            // Show hour glass cursor while crawling is under way.
            setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

            // Disable search controls.
            startTextField.setEnabled(false);
            maxComboBox.setEnabled(false);
            limitCheckBox.setEnabled(false);
            logTextField.setEnabled(false);
            searchTextField.setEnabled(false);
            caseCheckBox.setEnabled(false);

            // Switch Search button to "Stop."
            searchButton.setText("Stop");

            // Reset stats.
            table.setModel(new DefaultTableModel(new Object[][] {},
                new String[] { "URL" }) {
              public boolean isCellEditable(int row, int column) {
                return false;
              }
            });
            updateStats(startUrl, 0, 0, maxUrls);

            // Turn crawling flag on.
            crawling = true;

            // Perform the actual crawling.
            crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                searchString, caseCheckBox.isSelected());

            // Turn crawling flag off.
            crawling = false;

            // Close matches log file.
            try {
              logFileWriter.close();
            } catch (Exception e) {
              showError("Unable to close matches log file.");
            }

            // Mark search as done.
            crawlingLabel2.setText("Done");

            // Enable search controls.
            startTextField.setEnabled(true);
            maxComboBox.setEnabled(true);
            limitCheckBox.setEnabled(true);
            logTextField.setEnabled(true);
            searchTextField.setEnabled(true);
            caseCheckBox.setEnabled(true);

            // Switch search button back to "Search."
            searchButton.setText("Search");

            // Return to default cursor.
            setCursor(Cursor.getDefaultCursor());

            // Show message if search string not found.
            if (table.getRowCount() == 0) {
              JOptionPane
                  .showMessageDialog(
                      SearchCrawler.this,
                      "Your Search String was not found. Please try another.",
                      "Search String Not Found",
                      JOptionPane.WARNING_MESSAGE);
            }
          }
        });
        thread.start();
      }

      // Show dialog box with error message.
      private void showError(String message) {
        JOptionPane.showMessageDialog(this, message, "Error",
            JOptionPane.ERROR_MESSAGE);
      }

      /**
       * Updates the crawling stats labels and progress bar.
       *
       * @param crawling the URL currently being crawled
       * @param crawled  number of URLs crawled so far
       * @param toCrawl  number of URLs still queued
       * @param maxUrls  crawl limit, or -1 for unlimited
       */
      private void updateStats(String crawling, int crawled, int toCrawl,
          int maxUrls) {
        crawlingLabel2.setText(crawling);
        crawledLabel2.setText("" + crawled);
        toCrawlLabel2.setText("" + toCrawl);

        // Update progress bar: when unlimited, treat "everything seen so far"
        // as the maximum so the bar still moves.
        if (maxUrls == -1) {
          progressBar.setMaximum(crawled + toCrawl);
        } else {
          progressBar.setMaximum(maxUrls);
        }
        progressBar.setValue(crawled);

        matchesLabel2.setText("" + table.getRowCount());
      }

      // Add match to matches table and log file.
      private void addMatch(String url) {
        // Add URL to matches table.
        DefaultTableModel model = (DefaultTableModel) table.getModel();
        model.addRow(new Object[] { url });

        // Add URL to matches log file.
        try {
          logFileWriter.println(url);
        } catch (Exception e) {
          showError("Unable to log match.");
        }
      }

      /**
       * Verifies the format of a URL string.
       *
       * @return the parsed URL, or null if the string is not a valid
       *         "http://" URL (only HTTP URLs are accepted)
       */
      private URL verifyUrl(String url) {
        // Only allow HTTP URLs.
        if (!url.toLowerCase().startsWith("http://"))
          return null;

        // Verify format of URL.
        URL verifiedUrl = null;
        try {
          verifiedUrl = new URL(url);
        } catch (Exception e) {
          return null;
        }

        return verifiedUrl;
      }

      /**
       * Checks the host's robots.txt to see whether this crawler may access
       * the given URL. Disallow lists are cached per host.
       */
      private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();

        // Retrieve host's disallow list from cache.
        ArrayList disallowList = (ArrayList) disallowListCache.get(host);

        // If list is not in the cache, download and cache it.
        if (disallowList == null) {
          disallowList = new ArrayList();

          try {
            URL robotsFileUrl = new URL("http://" + host + "/robots.txt");

            // Open connection to robot file URL for reading.
            BufferedReader reader = new BufferedReader(
                new InputStreamReader(robotsFileUrl.openStream()));
            try {
              // Read robot file, creating list of disallowed paths.
              String line;
              while ((line = reader.readLine()) != null) {
                if (line.indexOf("Disallow:") == 0) {
                  String disallowPath = line.substring("Disallow:"
                      .length());

                  // Check disallow path for comments and remove if
                  // present.
                  int commentIndex = disallowPath.indexOf("#");
                  if (commentIndex != -1) {
                    disallowPath = disallowPath.substring(0,
                        commentIndex);
                  }

                  // Remove leading or trailing spaces from disallow path.
                  disallowPath = disallowPath.trim();

                  // Add disallow path to list.
                  disallowList.add(disallowPath);
                }
              }
            } finally {
              // Always release the connection's stream.
              reader.close();
            }

            // Add new disallow list to cache.
            disallowListCache.put(host, disallowList);
          } catch (Exception e) {
            /*
             * Assume robot is allowed since an exception is thrown if the
             * robot file doesn't exist.
             */
            return true;
          }
        }

        /*
         * Loop through disallow list to see if crawling is allowed for the
         * given URL.
         */
        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
          String disallow = (String) disallowList.get(i);
          if (file.startsWith(disallow)) {
            return false;
          }
        }

        return true;
      }

      /**
       * Downloads the page at the given URL.
       *
       * @return the page contents with line breaks removed, or null if the
       *         page could not be downloaded
       */
      private String downloadPage(URL pageUrl) {
        try {
          // Open connection to URL for reading.
          BufferedReader reader = new BufferedReader(new InputStreamReader(
              pageUrl.openStream()));
          try {
            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
              pageBuffer.append(line);
            }

            return pageBuffer.toString();
          } finally {
            // Always release the connection's stream.
            reader.close();
          }
        } catch (Exception e) {
          // Treat any I/O failure as "page unavailable".
        }

        return null;
      }

      /**
       * Removes a leading "www." from a URL's host if present, so that
       * "http://www.host/x" and "http://host/x" dedupe to the same entry.
       */
      private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
          return url.substring(0, index + 3) + url.substring(index + 7);
        }

        return (url);
      }

      /**
       * Parses the page contents and returns the list of crawlable links:
       * absolute "http://" URLs, de-anchored, de-www'd, not yet crawled,
       * and (optionally) restricted to the page's own host.
       */
      private ArrayList retrieveLinks(URL pageUrl, String pageContents,
          HashSet crawledList, boolean limitHost) {
        // Compile link matching pattern.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]",
            Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        // Create list of link matches.
        ArrayList linkList = new ArrayList();
        while (m.find()) {
          String link = m.group(1).trim();

          // Skip empty links.
          if (link.length() < 1) {
            continue;
          }

          // Skip links that are just page anchors.
          if (link.charAt(0) == '#') {
            continue;
          }

          // Skip mailto links.
          if (link.indexOf("mailto:") != -1) {
            continue;
          }

          // Skip JavaScript links.
          if (link.toLowerCase().indexOf("javascript") != -1) {
            continue;
          }

          // Prefix absolute and relative URLs if necessary.
          if (link.indexOf("://") == -1) {
            // Handle absolute URLs.
            if (link.charAt(0) == '/') {
              link = "http://" + pageUrl.getHost() + link;
              // Handle relative URLs.
            } else {
              String file = pageUrl.getFile();
              if (file.indexOf('/') == -1) {
                link = "http://" + pageUrl.getHost() + "/" + link;
              } else {
                String path = file.substring(0,
                    file.lastIndexOf('/') + 1);
                link = "http://" + pageUrl.getHost() + path + link;
              }
            }
          }

          // Remove anchors from link.
          int index = link.indexOf('#');
          if (index != -1) {
            link = link.substring(0, index);
          }

          // Remove leading "www" from URL's host if present.
          link = removeWwwFromUrl(link);

          // Verify link and skip if invalid.
          URL verifiedLink = verifyUrl(link);
          if (verifiedLink == null) {
            continue;
          }

          /*
           * If specified, limit links to those having the same host as the
           * start URL.
           */
          if (limitHost
              && !pageUrl.getHost().toLowerCase().equals(
                  verifiedLink.getHost().toLowerCase())) {
            continue;
          }

          // Skip link if it has already been crawled.
          if (crawledList.contains(link)) {
            continue;
          }

          // Add link to list.
          linkList.add(link);
        }

        return (linkList);
      }

      /**
       * Determines whether every whitespace-separated term of the search
       * string appears in the given page contents.
       */
      private boolean searchStringMatches(String pageContents,
          String searchString, boolean caseSensitive) {
        String searchContents = pageContents;

        /*
         * For a case-INsensitive search, lowercase the page contents so both
         * sides of the comparison are normalized.
         */
        if (!caseSensitive) {
          searchContents = pageContents.toLowerCase();
        }

        // Split search string into individual terms.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);

        // Check to see if each term matches.
        for (int i = 0; i < terms.length; i++) {
          if (caseSensitive) {
            if (searchContents.indexOf(terms[i]) == -1) {
              return false;
            }
          } else {
            if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
              return false;
            }
          }
        }

        return true;
      }

      /**
       * Performs the actual breadth-first crawl, searching each downloaded
       * page for the search string. Runs until the To Crawl list is empty,
       * the max URL count is reached, or the crawling flag is cleared.
       *
       * @param startUrl      URL to begin crawling from
       * @param maxUrls       crawl limit, or -1 for unlimited
       * @param limitHost     restrict links to the start URL's host
       * @param searchString  whitespace-separated search terms
       * @param caseSensitive whether term matching is case sensitive
       */
      public void crawl(String startUrl, int maxUrls, boolean limitHost,
          String searchString, boolean caseSensitive) {
        // Set up crawl lists.
        HashSet crawledList = new HashSet();
        LinkedHashSet toCrawlList = new LinkedHashSet();

        // Add start URL to the to crawl list.
        toCrawlList.add(startUrl);

        /*
         * Perform actual crawling by looping through the To Crawl list.
         */
        while (crawling && toCrawlList.size() > 0) {
          /*
           * Check to see if the max URL count has been reached, if it was
           * specified.
           */
          if (maxUrls != -1) {
            if (crawledList.size() == maxUrls) {
              break;
            }
          }

          // Get URL at bottom of the list.
          String url = (String) toCrawlList.iterator().next();

          // Remove URL from the To Crawl list.
          toCrawlList.remove(url);

          // Convert string url to URL object; skip it if it can no longer be
          // verified (previously a null here caused an NPE in
          // isRobotAllowed).
          URL verifiedUrl = verifyUrl(url);
          if (verifiedUrl == null) {
            continue;
          }

          // Skip URL if robots are not allowed to access it.
          if (!isRobotAllowed(verifiedUrl)) {
            continue;
          }

          // Update crawling stats.
          updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

          // Add page to the crawled list.
          crawledList.add(url);

          // Download the page at the given URL.
          String pageContents = downloadPage(verifiedUrl);

          /*
           * If the page was downloaded successfully, retrieve all its links
           * and then see if it contains the search string.
           */
          if (pageContents != null && pageContents.length() > 0) {
            // Retrieve list of valid links from page.
            ArrayList links = retrieveLinks(verifiedUrl, pageContents,
                crawledList, limitHost);

            // Add links to the To Crawl list.
            toCrawlList.addAll(links);

            /*
             * Check if search string is present in page, and if so, record
             * a match.
             */
            if (searchStringMatches(pageContents, searchString,
                caseSensitive)) {
              addMatch(url);
            }
          }

          // Update crawling stats.
          updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
        }
      }

      // Run the Search Crawler.
      public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler();
        // setVisible(true) replaces the deprecated Frame.show().
        crawler.setVisible(true);
      }
    }
    /**
    A quantifier determines how many times an expression is matched. The quantifiers are shown here:
    +    Match one or more.
    *    Match zero or more.
    ?    Match zero or one.

    */

    /*
    Character Sequence Explanation

    <a Look for the characters "<a".

    \\s+ Look for one or more space characters.

    href Look for the characters "href".

    \\s* Look for zero or more space characters.

    =    Look for the character "=".

    \\s* Look for zero or more space characters.

    \"?  Look for zero or one quote character.

    (.*?)Look for zero or more of any character until the next part of the pattern is matched, and place the results in a group.

    [\">]Look for quote character or greater than (">") character.

    */

    posted on 2006-04-12 13:14 大雁北飛 閱讀(858) 評論(2)  編輯  收藏

    評論

    # re: Search Crawler 源碼  回復  更多評論   

    是java的嗎? 

    2006-05-12 11:15 | liufei

    # re: Search Crawler 源碼  回復  更多評論   

    no comment for the code ?!
    2009-10-20 11:20 | yakasima

    只有注冊用戶登錄后才能發表評論。


    網站導航:
     
    主站蜘蛛池模板: 亚洲精品成人无码中文毛片不卡| 妞干网免费视频观看| 亚洲国产精品毛片av不卡在线| 亚洲嫩草影院在线观看| 最近免费中文字幕高清大全| 亚洲va在线va天堂va888www| 亚洲免费精彩视频在线观看| 亚洲国产综合专区电影在线| 1000部无遮挡拍拍拍免费视频观看 | 华人在线精品免费观看| 亚洲一区二区三区影院| 色www永久免费网站| 国产精品亚洲一区二区三区在线| a级成人免费毛片完整版| 亚洲成a人片在线观看中文动漫 | 日韩在线观看视频免费| 国产精品亚洲产品一区二区三区| 国产成人无码精品久久久免费| 亚洲精品tv久久久久| 日韩精品无码免费专区网站| 亚洲一区二区三区电影| 最新欧洲大片免费在线| 日日摸日日碰夜夜爽亚洲| 激情97综合亚洲色婷婷五 | 免费看一级毛片在线观看精品视频| 亚洲午夜爱爱香蕉片| 国产午夜无码精品免费看动漫 | 亚洲乱人伦精品图片| 卡1卡2卡3卡4卡5免费视频 | 国产精品手机在线亚洲| 中文字幕久久亚洲一区| 最近中文字幕大全中文字幕免费| 国产精品亚洲午夜一区二区三区| 国产aa免费视频| 日本免费久久久久久久网站| 久久精品亚洲AV久久久无码| 亚洲成AⅤ人影院在线观看| 日韩视频在线观看免费| 亚洲AV无码一区二区乱子仑| 亚洲乱码一区二区三区在线观看 | 免费大片在线观看网站|