// Exercise 4.2.39 (Solution published at http://algs4.cs.princeton.edu/)
package algs42;
import stdlib.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import algs13.Queue;
import algs35.SET;
/* ***********************************************************************
 *  Compilation:  javac XWebCrawler.java In.java
 *  Execution:    java XWebCrawler url
 *
 *  Downloads a web page and prints all URLs found on it.
 *  Gives an idea of how Google's spider crawls the web. Instead of
 *  looking for hyperlinks, we simply look for patterns of the form
 *  http:// followed by an alternating sequence of alphanumeric
 *  characters and dots, ending with a sequence of alphanumeric
 *  characters.
 *
 *  % java XWebCrawler http://www.slashdot.org
 *  http://www.slashdot.org
 *  http://www.osdn.com
 *  http://sf.net
 *  http://thinkgeek.com
 *  http://freshmeat.net
 *  http://newsletters.osdn.com
 *  http://slashdot.org
 *  http://osdn.com
 *  http://ads.osdn.com
 *  http://sourceforge.net
 *  http://www.msnbc.msn.com
 *  http://www.rhythmbox.org
 *  http://www.apple.com
 *  ...
 *
 *  % java XWebCrawler http://www.cs.princeton.edu
 *  http://www.cs.princeton.edu
 *  http://www.w3.org
 *  http://maps.yahoo.com
 *  http://www.princeton.edu
 *  http://www.Princeton.EDU
 *  http://ncstrl.cs.Princeton.EDU
 *  http://www.genomics.princeton.edu
 *  http://www.math.princeton.edu
 *  http://libweb.Princeton.EDU
 *  http://libweb2.princeton.edu
 *  http://www.acm.org
 *  ...
 *
 *  Instead of setting the system properties in the code, you could set
 *  them on the command line:
 *  % java -Dsun.net.client.defaultConnectTimeout=250 XWebCrawler http://www.cs.princeton.edu
 *
 *************************************************************************/

public class XWebCrawler {

    public static void main(String[] args) {
        //args = new String[] { "http://www.slashdot.org" };
        //args = new String[] { "http://www.cdm.depaul.edu" };
        args = new String[] { "http://fpl.cs.depaul.edu/jriely" };

        // time out connections after 500 milliseconds, reads after 1000 milliseconds
        System.setProperty("sun.net.client.defaultConnectTimeout", "500");
        System.setProperty("sun.net.client.defaultReadTimeout", "1000");

        // initial web page
        String s = args[0];

        // queue of web pages to be examined
        Queue<String> queue = new Queue<>();
        queue.enqueue(s);

        // set of already-discovered web pages
        SET<String> marked = new SET<>();
        marked.add(s);

        // breadth-first search crawl of the web
        while (!queue.isEmpty()) {
            String v = queue.dequeue();
            StdOut.println(v);

            In in = new In(v);

            // only needed in case the website does not respond
            if (!in.exists()) continue;

            String input = in.readAll();
            if (input == null) continue;

            /* ***********************************************************
             *  Find links of the form: http://xxx.yyy.zzz
             *  \\w+  matches one or more alphanumeric characters
             *  \\.   matches a literal dot
             *  The first two statements below could be hoisted out of
             *  the loop.
             *************************************************************/
            String regexp = "http://(\\w+\\.)*(\\w+)";
            Pattern pattern = Pattern.compile(regexp);
            Matcher matcher = pattern.matcher(input);

            // enqueue and mark each matched URL not seen before
            while (matcher.find()) {
                String w = matcher.group();
                if (!marked.contains(w)) {
                    queue.enqueue(w);
                    marked.add(w);
                }
            }
        }
    }
}
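/* A minimal sketch (not part of the original exercise) showing what the
 * URL pattern above matches, with no network access required. The class
 * name XWebCrawlerRegexDemo and the sample text are assumptions made for
 * illustration only; the pattern itself is the one used by the crawler.
 */
class XWebCrawlerRegexDemo {
    public static void main(String[] args) {
        String input = "See http://www.cs.princeton.edu/courses and http://sf.net for details.";
        Pattern pattern = Pattern.compile("http://(\\w+\\.)*(\\w+)");
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            // prints http://www.cs.princeton.edu and http://sf.net;
            // each match stops before '/', which is not a word character,
            // so paths and query strings are never captured
            StdOut.println(matcher.group());
        }
    }
}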