001// Exercise 4.2.39 (Solution published at http://algs4.cs.princeton.edu/)
002package algs42;
003import stdlib.*;
004import java.util.regex.Matcher;
005import java.util.regex.Pattern;
006import algs13.Stack;
007import algs35.SET;
008/* ***********************************************************************
009 *  Compilation:  javac WebCrawler.java In.java
010 *  Execution:    java WebCrawler url
011 *
012 *  Downloads the web page and prints out all urls on the web page.
013 *  Gives an idea of how Google's spider crawls the web. Instead of
014 *  looking for hyperlinks, we just look for patterns of the form:
015 *  http:// followed by an alternating sequence of alphanumeric
016 *  characters and dots, ending with a sequence of alphanumeric
017 *  characters.
018 *
019 *  % java WebCrawler http://www.slashdot.org
020 *  http://www.slashdot.org
021 *  http://www.osdn.com
022 *  http://sf.net
023 *  http://thinkgeek.com
024 *  http://freshmeat.net
025 *  http://newsletters.osdn.com
026 *  http://slashdot.org
027 *  http://osdn.com
028 *  http://ads.osdn.com
029 *  http://sourceforge.net
030 *  http://www.msnbc.msn.com
031 *  http://www.rhythmbox.org
032 *  http://www.apple.com
033 *  ...
034 *
035 *  % java WebCrawler http://www.cs.princeton.edu
036 *  http://www.cs.princeton.edu
037 *  http://www.w3.org
038 *  http://maps.yahoo.com
039 *  http://www.princeton.edu
040 *  http://www.Princeton.EDU
041 *  http://ncstrl.cs.Princeton.EDU
042 *  http://www.genomics.princeton.edu
043 *  http://www.math.princeton.edu
044 *  http://libweb.Princeton.EDU
045 *  http://libweb2.princeton.edu
046 *  http://www.acm.org
047 *  ...
048 *
049 *
050 *  Instead of setting the system property in the code, you could do it
051 *  from the commandline
052 *  % java -Dsun.net.client.defaultConnectTimeout=250 YYWebCrawler http://www.cs.princeton.edu
053
054 *
055 *************************************************************************/
056
057public class XWebCrawlerDFS {
058
059        public static void main(String[] args) {
060                //args = new String[] { "http://www.slashdot.org" };
061                //args = new String[] { "http://www.cdm.depaul.edu" };
062                args = new String[] { "http://fpl.cs.depaul.edu/jriely" };
063
064                // timeout connection after 500 miliseconds
065                System.setProperty("sun.net.client.defaultConnectTimeout", "500");
066                System.setProperty("sun.net.client.defaultReadTimeout",    "1000");
067
068                // initial web page
069                String s = args[0];
070
071                // list of web pages to be examined
072                Stack<String> queue = new Stack<>();
073                queue.push(s);
074
075                // set of examined web pages
076                SET<String> marked = new SET<>();
077                marked.add(s);
078
079                // breadth first search crawl of web
080                while (!queue.isEmpty()) {
081                        String v = queue.pop();
082                        StdOut.println(v);
083
084                        In in = new In(v);
085
086                        // only needed in case website does not respond
087                        if (!in.exists()) continue;
088
089                        String input = in.readAll();
090                        if (input == null) continue;
091
092
093                        /* ***********************************************************
094                         *  Find links of the form: http://xxx.yyy.zzz
095                         *  \\w+ for one or more alpha-numeric characters
096                         *  \\. for dot
097                         *  could take first two statements out of loop
098                         *************************************************************/
099                        String regexp = "http://(\\w+\\.)+(\\w+)";
100                        Pattern pattern = Pattern.compile(regexp);
101                        Matcher matcher = pattern.matcher(input);
102
103                        // find and print all matches
104                        while (matcher.find()) {
105                                String w = matcher.group();
106                                if (!marked.contains(w)) {
107                                        queue.push(w);
108                                        marked.add(w);
109                                }
110                        }
111
112                }
113        }
114}