1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler;
28
29 import java.util.Collection;
30 import java.util.Date;
31 import org.apache.log4j.Logger;
32 import org.smartcrawler.common.Context;
33 import org.smartcrawler.extractor.LinksExtractor;
34 import org.smartcrawler.extractor.RegExpLinksExtractor;
35 import org.smartcrawler.filter.FilterManager;
36 import org.smartcrawler.persistence.Persister;
37 import org.smartcrawler.common.Link;
38 import org.smartcrawler.common.Provider;
39 import org.smartcrawler.common.ProviderFactory;
40 import org.smartcrawler.common.SCLogger;
41 import org.smartcrawler.retriever.Content;
42 import org.smartcrawler.retriever.HttpCall;
43 import org.smartcrawler.retriever.Response;
44 import org.smartcrawler.retriever.Retriever;
45
46 /***
47 * The engine thread which is started by the {@link org.smartcrawler.Crawler}
48 *
49 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
50 * @version <tt>$Revision: 1.8 $</tt>
51 */
52 public class DownloadEngine extends Thread {
53
54 private Provider linksProv;
55 private Context conf;
56 private static Logger log = SCLogger.getLogger(DownloadEngine.class);
57 private static Logger logCons = SCLogger.getConsoleLogger();
58 private Retriever retriever;
59 private FilterManager fMan;
60 private Persister persister;
61 /***
62 * Creates a new instance of Engine
63 *
64 * @param conf The the {@link org.smartcrawler.common.SiteConfiguration}
65 * @param retriever The supplied {@link org.smartcrawler.retriever.Retriever}
66 */
67 public DownloadEngine(Context conf) {
68 this.conf = conf;
69 this.retriever = this.conf.getRetriever();
70 this.persister = this.conf.getPersister();
71 this.linksProv = ProviderFactory.instance().create();
72 Collection cPrec = conf.getPrecFiltersList();
73 Collection cPost = conf.getPostFiltersList();
74 log.debug("DownloadEngine() cPrec.size()="+cPrec.size()+
75 " cPost.size()="+cPost.size());
76 this.fMan = new FilterManager();
77 this.fMan.addPrecFilters(cPrec);
78 this.fMan.addPostFilters(cPost);
79 }
80
81 /***
82 *
83 * The main method of the thread. While the {@link org.smartcrawler.common.LinksProvider}
84 * contains a link to process, the engine retrieves and fetches it.
85 */
86 public void run() {
87 log.debug("run(): BEGIN");
88 long startTime = (new Date()).getTime();
89
90 log.info("Started.");
91 logCons.info("Started.");
92 while (!linksProv.isEmpty()) {
93 if (linksProv.size() > 0) {
94
95 Link link = linksProv.next();
96 if (link == null){
97 continue;
98 }
99
100 log.debug(" Processing " + link);
101 HttpCall call = new HttpCall(link);
102 Response result = retriever.execute(call);
103 if (result.isRedirected()) {
104 log.info("The link " +
105 link + " redirects to " + result.getRedirection());
106 logCons.info("The link " +
107 link + " redirects to " + result.getRedirection());
108 if (this.fMan.isPermitted(this.conf,
109 result.getRedirection())) {
110 linksProv.store(result.getRedirection());
111 }
112 }
113
114 if (result.isFound()) {
115 Content content = result.getContent();
116 if (this.fMan.isPermitted(this.conf, content)) {
117
118 persister.persist(content);
119 }
120
121
122 LinksExtractor extractor = new RegExpLinksExtractor(link);
123 Link[] links = extractor.extract(content);
124
125
126 for (Link newLink : links) {
127
128
129 if (this.fMan.isPermitted(this.conf, newLink)) {
130 linksProv.store(newLink);
131 }
132 }
133 }
134 linksProv.confirm(link);
135 }
136
137 }
138 long endTime = (new Date()).getTime();
139 long totTimeMins = (endTime - startTime)/1000;
140
141 log.debug("run(): END");
142 logCons.info("Shutted down [elapsed time: "
143 + totTimeMins + "sec.].");
144 }
145 }