1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.persistence;
28
29 import java.io.File;
30 import java.io.FileOutputStream;
31 import java.io.IOException;
32 import java.net.URL;
33 import java.nio.ByteBuffer;
34 import java.nio.channels.WritableByteChannel;
35 import java.util.StringTokenizer;
36 import org.apache.log4j.Logger;
37 import org.smartcrawler.common.AbstractParametrizableComponent;
38 import org.smartcrawler.retriever.Content;
39 import org.smartcrawler.common.Link;
40 import org.smartcrawler.common.SCLogger;
41 import org.smartcrawler.extractor.MimeTypeTranslator;
42
43 /***
44 *
45 *
46 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
47 * @version <tt>$Revision: 1.15 $</tt>
48 */
49 public class FileSystemPersister extends AbstractParametrizableComponent implements Persister {
50
51 private static Logger log = SCLogger.getLogger(FileSystemPersister.class);
52 private static Logger logPers = SCLogger.getPersisterLogger();
53
54 public static final String FILE_SEPARATOR =
55 System.getProperty("file.separator");
56
57 /***
58 * Creates a new instance of FileSystemPersister
59 * @param conf
60 */
61 public FileSystemPersister() {
62 log.info("Created persister");
63 }
64
65 /***
66 *
67 * @param content
68 */
69 public void persist(Content content) {
70 Link link = content.getLink();
71 byte[] buffer = content.getBuffer();
72 if (buffer!= null){
73
74 File rootDir;
75 if (getParameter("rootDir") != null) {
76 rootDir = new File(getParameter("rootDir"));
77 } else {
78 rootDir = new File(".");
79 }
80 if (!rootDir.exists()) {
81 rootDir.mkdirs();
82 }
83 File file = linkToFilePath(
84 link, rootDir,content.getContentType());
85 if (file == null){
86 log.error("persist(): Unable to convert url "
87 + link + " to a correct file name");
88 logPers.error("Unable to convert url "
89 + link + " to a correct file name");
90 return;
91 }
92 log.debug("persist(): File is " + file.getAbsolutePath());
93 if (file.exists() && file.length() == buffer.length){
94 log.info("persist(): File " +
95 file.getAbsolutePath() + " exists");
96 } else {
97 try {
98 log.debug("persist(): Allocating buffer of size "
99 + buffer.length);
100 ByteBuffer bbuf = ByteBuffer.allocate(buffer.length);
101 bbuf.put(buffer);
102 bbuf.flip();
103
104 WritableByteChannel wChannel =
105 new FileOutputStream(file).getChannel();
106 log.debug("persist(): Got channel for file " +
107 file.getAbsolutePath());
108
109
110
111 int numWritten = wChannel.write(bbuf);
112 log.debug("persist(): Wrote " + numWritten
113 + " bytes on channel for file "
114 + file.getAbsolutePath());
115
116
117 wChannel.close();
118 log.debug("persist(): Closed channel for file " +
119 file.getAbsolutePath());
120 bbuf.flip();
121
122
123
124 bbuf.clear();
125 logPers.info("The buffer for url " + link
126 + " was successfully "
127 +"saved on file " + file.getAbsolutePath());
128
129 } catch (IOException e) {
130 log.error("persist(): Problem saving buffer for file " +
131 file.getAbsolutePath(), e);
132 logPers.error("Error saving buffer for file " +
133 file.getAbsolutePath()+ ": " + e.getMessage());
134
135 }
136 }
137 } else {
138 log.warn("persist(): the buffer for url " + link + " is NULL");
139 logPers.warn("The buffer for url " + link + " is NULL");
140 }
141
142 }
143
144 /***
145 *
146 * @param link
147 * @param rootDir
148 * @param cType
149 * @return
150 */
151 protected File linkToFilePath(Link link, File rootDir, String cType) {
152 log.debug("linkToFilePath(): BEGIN");
153 String urlStr = link.toString();
154 String fileName = null;
155 try {
156 log.debug("linkToFilePath(): url string is "
157 + urlStr +" cType=" + cType);
158
159
160 if (urlStr.toLowerCase().startsWith("http://")) {
161 urlStr = urlStr.substring(7);
162 }
163
164 if (urlStr.toLowerCase().endsWith("/")) {
165 urlStr = urlStr.substring(0, urlStr.length() - 1);
166 }
167
168 URL url = link.getURL();
169 String qs = url.getQuery();
170
171 String ulrPath = url.getHost() + "/" + url.getPath();
172 if (qs != null) {
173 qs = qs.replaceAll("////|/", "_");
174 urlStr = ulrPath + "_" + qs;
175 }
176
177 urlStr = urlStr.replaceAll(":|<|>|//||//*", "_");
178
179 StringTokenizer st = new StringTokenizer(urlStr, "/");
180 int tokensNum = st.countTokens();
181 int counter = 0;
182 String path = rootDir.getAbsolutePath() + FILE_SEPARATOR;
183 File file = null;
184
185 while(st.hasMoreElements()) {
186 counter++;
187 String elem = (String)st.nextElement();
188 if (elem.length() == 0)
189 continue;
190 log.debug("linkToFilePath(): str=" + urlStr
191 + "|token=" + elem
192 + "|tokensNum=" + tokensNum
193 + "|counter=" + counter);
194
195 if (counter == tokensNum &&
196 elem.indexOf(".") >= 0 && tokensNum > 1) {
197
198
199 fileName = elem;
200 } else if (getParameter("preservePath") != null &&
201 getParameter("preservePath").equals("true")) {
202
203 String dirName = path + elem;
204 File dir = new File(dirName);
205 if (!dir.exists()) {
206 dir.mkdir();
207 log.debug("linkToFilePath(): Created dir "
208 + dir.getAbsolutePath());
209 }
210 path += elem + FILE_SEPARATOR;
211 }
212 }
213
214
215
216
217
218
219
220
221 String ext = "unknown";
222 try {
223 ext = MimeTypeTranslator.getFileExtension(cType);
224 } catch (Exception e) {
225 log.warn(e.getMessage());
226 }
227 if (fileName == null) {
228 fileName = FILE_SEPARATOR + "index." + ext;
229 } else if (!fileName.toLowerCase().endsWith(ext)) {
230 fileName += "." + ext;
231 }
232
233 fileName = path + fileName;
234
235 file = new File(fileName);
236
237 if (!file.exists()) {
238 file.createNewFile();
239 log.debug("linkToFilePath(): created file " +
240 file.getAbsolutePath() + " for url: " + urlStr);
241 } else {
242 log.warn("linkToFilePath(): The file " + fileName
243 + " already exists!");
244 }
245 return file;
246
247 } catch(Exception e) {
248 log.error("linkToFilePath(): Unable to create file: " + fileName +
249 " Error: " + e.getMessage());
250 return null;
251 } finally{
252 log.debug("linkToFilePath(): END");
253 }
254 }
255
256 }