import java.awt.BorderLayout;
import java.awt.Cursor;
import java.awt.Font;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JScrollPane;
import javax.swing.JSeparator;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.table.DefaultTableModel;
// The Search Web Crawler
public class SearchCrawler extends JFrame {
  // Max URLs drop-down values.
  private static final String[] MAX_URLS = { "50", "100", "500", "1000" };
  // Cache of robot disallow lists.
  private HashMap disallowListCache = new HashMap();
  // Search GUI controls.
  private JTextField startTextField;
  private JComboBox maxComboBox;
  private JCheckBox limitCheckBox;
  private JTextField logTextField;
  private JTextField searchTextField;
  private JCheckBox caseCheckBox;
  private JButton searchButton;
  // Search stats GUI controls.
  private JLabel crawlingLabel2;
  private JLabel crawledLabel2;
  private JLabel toCrawlLabel2;
  private JProgressBar progressBar;
  private JLabel matchesLabel2;
  // Table listing search matches.
  private JTable table;
  // Flag for whether or not crawling is underway.
  private boolean crawling;
  // Matches log file print writer.
  private PrintWriter logFileWriter;
  // Constructor for Search Web Crawler.
  public SearchCrawler() {
    // Set application title.
    setTitle("Search Crawler");
    // Set window size.
    setSize(600, 600);
    // Handle window closing events.
    addWindowListener(new WindowAdapter() {
      public void windowClosing(WindowEvent e) {
        actionExit();
      }
    });
    // Set up File menu.
    JMenuBar menuBar = new JMenuBar();
    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionExit();
      }
    });
    fileMenu.add(fileExitMenuItem);
    menuBar.add(fileMenu);
    setJMenuBar(menuBar);
    // Set up search panel.
    JPanel searchPanel = new JPanel();
    GridBagConstraints constraints;
    GridBagLayout layout = new GridBagLayout();
    searchPanel.setLayout(layout);
    JLabel startLabel = new JLabel("Start URL:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(startLabel, constraints);
    searchPanel.add(startLabel);
    startTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(startTextField, constraints);
    searchPanel.add(startTextField);
    JLabel maxLabel = new JLabel("Max URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxLabel, constraints);
    searchPanel.add(maxLabel);
    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxComboBox, constraints);
    searchPanel.add(maxComboBox);
    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.WEST;
    constraints.insets = new Insets(0, 10, 0, 0);
    layout.setConstraints(limitCheckBox, constraints);
    searchPanel.add(limitCheckBox);
    JLabel blankLabel = new JLabel();
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(blankLabel, constraints);
    searchPanel.add(blankLabel);
    JLabel logLabel = new JLabel("Matches Log File:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(logLabel, constraints);
    searchPanel.add(logLabel);
    String file = System.getProperty("user.dir")
        + System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(logTextField, constraints);
    searchPanel.add(logTextField);
    JLabel searchLabel = new JLabel("Search String:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(searchLabel, constraints);
    searchPanel.add(searchLabel);
    searchTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.insets = new Insets(5, 5, 0, 0);
    constraints.gridwidth = 2;
    constraints.weightx = 1.0d;
    layout.setConstraints(searchTextField, constraints);
    searchPanel.add(searchTextField);
    caseCheckBox = new JCheckBox("Case Sensitive");
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 5);
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(caseCheckBox, constraints);
    searchPanel.add(caseCheckBox);
    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionSearch();
      }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(searchButton, constraints);
    searchPanel.add(searchButton);
    JSeparator separator = new JSeparator();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(separator, constraints);
    searchPanel.add(separator);
    JLabel crawlingLabel1 = new JLabel("Crawling:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawlingLabel1, constraints);
    searchPanel.add(crawlingLabel1);
    crawlingLabel2 = new JLabel();
    crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawlingLabel2, constraints);
    searchPanel.add(crawlingLabel2);
    JLabel crawledLabel1 = new JLabel("Crawled URLs:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawledLabel1, constraints);
    searchPanel.add(crawledLabel1);
    crawledLabel2 = new JLabel();
    crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawledLabel2, constraints);
    searchPanel.add(crawledLabel2);
    JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(toCrawlLabel1, constraints);
    searchPanel.add(toCrawlLabel1);
    toCrawlLabel2 = new JLabel();
    toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(toCrawlLabel2, constraints);
    searchPanel.add(toCrawlLabel2);
    JLabel progressLabel = new JLabel("Crawling Progress:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(progressLabel, constraints);
    searchPanel.add(progressLabel);
    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(progressBar, constraints);
    searchPanel.add(progressBar);
    JLabel matchesLabel1 = new JLabel("Search Matches:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 10, 0);
    layout.setConstraints(matchesLabel1, constraints);
    searchPanel.add(matchesLabel1);
    matchesLabel2 = new JLabel();
    matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 10, 5);
    layout.setConstraints(matchesLabel2, constraints);
    searchPanel.add(matchesLabel2);
    // Set up matches table.
    table = new JTable(new DefaultTableModel(new Object[][] {},
        new String[] { "URL" }) {
      public boolean isCellEditable(int row, int column) {
        return false;
      }
    });
    // Set up Matches panel.
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
    // Add panels to display.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(searchPanel, BorderLayout.NORTH);
    getContentPane().add(matchesPanel, BorderLayout.CENTER);
  }
  // Exit this program.
  private void actionExit() {
    System.exit(0);
  }
  // Handle Search/Stop button being clicked.
  private void actionSearch() {
    // If stop button clicked, turn crawling flag off.
    if (crawling) {
      crawling = false;
      return;
    }
    ArrayList errorList = new ArrayList();
    // Validate that start URL has been entered.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
      errorList.add("Missing Start URL.");
    }
    // Verify start URL.
    else if (verifyUrl(startUrl) == null) {
      errorList.add("Invalid Start URL.");
    }
    // Validate that Max URLs is either empty (no limit) or a number.
    int maxUrls = -1;
    String max = ((String) maxComboBox.getSelectedItem()).trim();
    if (max.length() > 0) {
      try {
        maxUrls = Integer.parseInt(max);
      } catch (NumberFormatException e) {
      }
      if (maxUrls < 1) {
        errorList.add("Invalid Max URLs value.");
      }
    }
    // Validate that matches log file has been entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
      errorList.add("Missing Matches Log File.");
    }
    // Validate that search string has been entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
      errorList.add("Missing Search String.");
    }
    // Show errors, if any, and return.
    if (errorList.size() > 0) {
      StringBuffer message = new StringBuffer();
      // Concatenate errors into single message.
      for (int i = 0; i < errorList.size(); i++) {
        message.append(errorList.get(i));
        if (i + 1 < errorList.size()) {
          message.append("\n");
        }
      }
      showError(message.toString());
      return;
    }
    // Remove "www" from start URL if present.
    startUrl = removeWwwFromUrl(startUrl);
    // Start the Search Crawler.
    search(logFile, startUrl, maxUrls, searchString);
  }
  private void search(final String logFile, final String startUrl,
      final int maxUrls, final String searchString) {
    // Start the search in a new thread.
    Thread thread = new Thread(new Runnable() {
      public void run() {
        // Show hourglass cursor while crawling is under way.
        setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
        // Disable search controls.
        startTextField.setEnabled(false);
        maxComboBox.setEnabled(false);
        limitCheckBox.setEnabled(false);
        logTextField.setEnabled(false);
        searchTextField.setEnabled(false);
        caseCheckBox.setEnabled(false);
        // Switch Search button to "Stop."
        searchButton.setText("Stop");
        // Reset stats.
        table.setModel(new DefaultTableModel(new Object[][] {},
            new String[] { "URL" }) {
          public boolean isCellEditable(int row, int column) {
            return false;
          }
        });
        updateStats(startUrl, 0, 0, maxUrls);
        // Open matches log file.
        try {
          logFileWriter = new PrintWriter(new FileWriter(logFile));
        } catch (Exception e) {
          showError("Unable to open matches log file.");
          return;
        }
        // Turn crawling flag on.
        crawling = true;
        // Perform the actual crawling.
        crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
            searchString, caseCheckBox.isSelected());
        // Turn crawling flag off.
        crawling = false;
        // Close matches log file.
        try {
          logFileWriter.close();
        } catch (Exception e) {
          showError("Unable to close matches log file.");
        }
        // Mark search as done.
        crawlingLabel2.setText("Done");
        // Enable search controls.
        startTextField.setEnabled(true);
        maxComboBox.setEnabled(true);
        limitCheckBox.setEnabled(true);
        logTextField.setEnabled(true);
        searchTextField.setEnabled(true);
        caseCheckBox.setEnabled(true);
        // Switch search button back to "Search."
        searchButton.setText("Search");
        // Return to default cursor.
        setCursor(Cursor.getDefaultCursor());
        // Show message if search string not found.
        if (table.getRowCount() == 0) {
          JOptionPane.showMessageDialog(SearchCrawler.this,
              "Your Search String was not found. Please try another.",
              "Search String Not Found",
              JOptionPane.WARNING_MESSAGE);
        }
      }
    });
    thread.start();
  }
  // Show dialog box with error message.
  private void showError(String message) {
    JOptionPane.showMessageDialog(this, message, "Error",
        JOptionPane.ERROR_MESSAGE);
  }
  // Update crawling stats.
  private void updateStats(String crawling, int crawled, int toCrawl,
      int maxUrls) {
    crawlingLabel2.setText(crawling);
    crawledLabel2.setText("" + crawled);
    toCrawlLabel2.setText("" + toCrawl);
    // Update progress bar.
    if (maxUrls == -1) {
      progressBar.setMaximum(crawled + toCrawl);
    } else {
      progressBar.setMaximum(maxUrls);
    }
    progressBar.setValue(crawled);
    matchesLabel2.setText("" + table.getRowCount());
  }
  // Add match to matches table and log file.
  private void addMatch(String url) {
    // Add URL to matches table.
    DefaultTableModel model = (DefaultTableModel) table.getModel();
    model.addRow(new Object[] { url });
    // Add URL to matches log file.
    try {
      logFileWriter.println(url);
    } catch (Exception e) {
      showError("Unable to log match.");
    }
  }
  // Verify URL format.
  private URL verifyUrl(String url) {
    // Only allow HTTP URLs.
    if (!url.toLowerCase().startsWith("http://"))
      return null;
    // Verify format of URL.
    URL verifiedUrl = null;
    try {
      verifiedUrl = new URL(url);
    } catch (Exception e) {
      return null;
    }
    return verifiedUrl;
  }
  // Check if robot is allowed to access the given URL.
  private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();
    // Retrieve host's disallow list from cache.
    ArrayList disallowList = (ArrayList) disallowListCache.get(host);
    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
      disallowList = new ArrayList();
      try {
        URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
        // Open connection to robot file URL for reading.
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(robotsFileUrl.openStream()));
        // Read robot file, creating list of disallowed paths.
        String line;
        while ((line = reader.readLine()) != null) {
          if (line.indexOf("Disallow:") == 0) {
            String disallowPath = line.substring("Disallow:".length());
            // Check disallow path for comments and remove if present.
            int commentIndex = disallowPath.indexOf("#");
            if (commentIndex != -1) {
              disallowPath = disallowPath.substring(0, commentIndex);
            }
            // Remove leading or trailing spaces from disallow path.
            disallowPath = disallowPath.trim();
            // Add disallow path to list.
            disallowList.add(disallowPath);
          }
        }
        // Add new disallow list to cache.
        disallowListCache.put(host, disallowList);
      } catch (Exception e) {
        /*
         * Assume robot is allowed since an exception is thrown if the
         * robot file doesn't exist.
         */
        return true;
      }
    }
    /*
     * Loop through disallow list to see if crawling is allowed for the
     * given URL.
     */
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
      String disallow = (String) disallowList.get(i);
      if (file.startsWith(disallow)) {
        return false;
      }
    }
    return true;
  }
  // Download page at given URL.
  private String downloadPage(URL pageUrl) {
    try {
      // Open connection to URL for reading.
      BufferedReader reader = new BufferedReader(new InputStreamReader(
          pageUrl.openStream()));
      // Read page into buffer.
      String line;
      StringBuffer pageBuffer = new StringBuffer();
      while ((line = reader.readLine()) != null) {
        pageBuffer.append(line);
      }
      return pageBuffer.toString();
    } catch (Exception e) {
    }
    return null;
  }
  // Remove leading "www" from a URL's host if present.
  private String removeWwwFromUrl(String url) {
    int index = url.indexOf("://www.");
    if (index != -1) {
      return url.substring(0, index + 3) + url.substring(index + 7);
    }
    return (url);
  }
  // Parse through page contents and retrieve links.
  private ArrayList retrieveLinks(URL pageUrl, String pageContents,
      HashSet crawledList, boolean limitHost) {
    // Compile link matching pattern.
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
        Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(pageContents);
    // Create list of link matches.
    ArrayList linkList = new ArrayList();
    while (m.find()) {
      String link = m.group(1).trim();
      // Skip empty links.
      if (link.length() < 1) {
        continue;
      }
      // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
        continue;
      }
      // Skip mailto links.
      if (link.indexOf("mailto:") != -1) {
        continue;
      }
      // Skip JavaScript links.
      if (link.toLowerCase().indexOf("javascript") != -1) {
        continue;
      }
      // Prefix absolute and relative URLs if necessary.
      if (link.indexOf("://") == -1) {
        // Handle absolute URLs.
        if (link.charAt(0) == '/') {
          link = "http://" + pageUrl.getHost() + link;
        // Handle relative URLs.
        } else {
          String file = pageUrl.getFile();
          if (file.indexOf('/') == -1) {
            link = "http://" + pageUrl.getHost() + "/" + link;
          } else {
            String path = file.substring(0, file.lastIndexOf('/') + 1);
            link = "http://" + pageUrl.getHost() + path + link;
          }
        }
      }
      // Remove anchors from link.
      int index = link.indexOf('#');
      if (index != -1) {
        link = link.substring(0, index);
      }
      // Remove leading "www" from URL's host if present.
      link = removeWwwFromUrl(link);
      // Verify link and skip if invalid.
      URL verifiedLink = verifyUrl(link);
      if (verifiedLink == null) {
        continue;
      }
      /*
       * If specified, limit links to those having the same host as the
       * start URL.
       */
      if (limitHost
          && !pageUrl.getHost().toLowerCase().equals(
              verifiedLink.getHost().toLowerCase())) {
        continue;
      }
      // Skip link if it has already been crawled.
      if (crawledList.contains(link)) {
        continue;
      }
      // Add link to list.
      linkList.add(link);
    }
    return (linkList);
  }
  /*
   * Determine whether or not the search string is matched in the given
   * page contents.
   */
  private boolean searchStringMatches(String pageContents,
      String searchString, boolean caseSensitive) {
    String searchContents = pageContents;
    /*
     * If the search is not case-sensitive, lowercase the page contents
     * for comparison.
     */
    if (!caseSensitive) {
      searchContents = pageContents.toLowerCase();
    }
    // Split search string into individual terms.
    Pattern p = Pattern.compile("[\\s]+");
    String[] terms = p.split(searchString);
    // Check to see if each term matches.
    for (int i = 0; i < terms.length; i++) {
      if (caseSensitive) {
        if (searchContents.indexOf(terms[i]) == -1) {
          return false;
        }
      } else {
        if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
          return false;
        }
      }
    }
    return true;
  }
  // Perform the actual crawling, searching for the search string.
  public void crawl(String startUrl, int maxUrls, boolean limitHost,
      String searchString, boolean caseSensitive) {
    // Set up crawl lists.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();
    // Add start URL to the To Crawl list.
    toCrawlList.add(startUrl);
    // Perform actual crawling by looping through the To Crawl list.
    while (crawling && toCrawlList.size() > 0) {
      /*
       * Check to see if the max URL count has been reached, if it was
       * specified.
       */
      if (maxUrls != -1) {
        if (crawledList.size() == maxUrls) {
          break;
        }
      }
      // Get URL at bottom of the list.
      String url = (String) toCrawlList.iterator().next();
      // Remove URL from the To Crawl list.
      toCrawlList.remove(url);
      // Convert string url to URL object.
      URL verifiedUrl = verifyUrl(url);
      // Skip URL if robots are not allowed to access it.
      if (!isRobotAllowed(verifiedUrl)) {
        continue;
      }
      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
      // Add page to the crawled list.
      crawledList.add(url);
      // Download the page at the given URL.
      String pageContents = downloadPage(verifiedUrl);
      /*
       * If the page was downloaded successfully, retrieve all its links
       * and then see if it contains the search string.
       */
      if (pageContents != null && pageContents.length() > 0) {
        // Retrieve list of valid links from page.
        ArrayList links = retrieveLinks(verifiedUrl, pageContents,
            crawledList, limitHost);
        // Add links to the To Crawl list.
        toCrawlList.addAll(links);
        /*
         * Check if search string is present in page, and if so, record
         * a match.
         */
        if (searchStringMatches(pageContents, searchString,
            caseSensitive)) {
          addMatch(url);
        }
      }
      // Update crawling stats.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
    }
  }
  // Run the Search Crawler.
  public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    crawler.setVisible(true);
  }
}
/*
A quantifier determines how many times an expression is matched. The
quantifiers are shown here (a short demo class follows):
+   Match one or more.
*   Match zero or more.
?   Match zero or one.
*/
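// A minimal sketch (not part of the original listing) that exercises each
// quantifier above so its behavior can be verified directly; the class
// name and sample strings are invented for illustration.
class QuantifierDemo {
  public static void main(String[] args) {
    // "o+" requires at least one 'o', so "lo+k" matches "look".
    System.out.println(Pattern.compile("lo+k").matcher("look").find());   // true
    // "o*" allows zero 'o' characters, so "lo*k" matches "lk".
    System.out.println(Pattern.compile("lo*k").matcher("lk").find());     // true
    // "o?" allows at most one 'o', so the anchored "^lo?k$" rejects "look".
    System.out.println(Pattern.compile("^lo?k$").matcher("look").find()); // false
  }
}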
/*
The link-matching pattern used in retrieveLinks() breaks down as follows
(a short demo class follows):

Character sequence   Explanation
<a                   Look for the characters "<a".
\\s+                 Look for one or more space characters.
href                 Look for the characters "href".
\\s*                 Look for zero or more space characters.
=                    Look for the "=" character.
\\s*                 Look for zero or more space characters.
\"?                  Look for zero or one quote character.
(.*?)                Look for zero or more of any character, as few as
                     possible, until the next part of the pattern matches,
                     and place the result in a group.
[\">]                Look for a quote character or a greater-than (">")
                     character.
*/
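// A minimal sketch (not part of the original listing) applying the link
// pattern explained above to an invented HTML fragment; it prints the
// captured group for both the quoted and the unquoted form of href.
class LinkPatternDemo {
  public static void main(String[] args) {
    String html = "<p><A HREF=\"http://host.com/a.html\">one</a> "
        + "<a href=page2.html>two</a></p>";
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\">]",
        Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(html);
    while (m.find()) {
      // Prints "http://host.com/a.html", then "page2.html".
      System.out.println(m.group(1));
    }
  }
}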