View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: FileSystemPersister.java,v 1.15 2005/08/05 15:55:53 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.persistence;
28  
29  import java.io.File;
30  import java.io.FileOutputStream;
31  import java.io.IOException;
32  import java.net.URL;
33  import java.nio.ByteBuffer;
34  import java.nio.channels.WritableByteChannel;
35  import java.util.StringTokenizer;
36  import org.apache.log4j.Logger;
37  import org.smartcrawler.common.AbstractParametrizableComponent;
38  import org.smartcrawler.retriever.Content;
39  import org.smartcrawler.common.Link;
40  import org.smartcrawler.common.SCLogger;
41  import org.smartcrawler.extractor.MimeTypeTranslator;
42  
43  /***
44   *
45   *
46   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
47   * @version <tt>$Revision: 1.15 $</tt>
48   */
49  public class FileSystemPersister extends AbstractParametrizableComponent implements Persister {
50  
51      private static Logger log = SCLogger.getLogger(FileSystemPersister.class);
52      private static Logger logPers = SCLogger.getPersisterLogger();
53  
54      public static final String FILE_SEPARATOR =
55              System.getProperty("file.separator");
56  
57      /***
58       * Creates a new instance of FileSystemPersister
59       * @param conf
60       */
61      public FileSystemPersister() {
62          log.info("Created persister");
63      }
64  
65      /***
66       *
67       * @param content
68       */
69      public void persist(Content content) {
70          Link link = content.getLink();
71          byte[] buffer = content.getBuffer();
72          if (buffer!= null){
73              //String cType = content.getContentType();
74              File rootDir;
75              if (getParameter("rootDir") != null) {
76                  rootDir = new File(getParameter("rootDir"));
77              } else {
78                  rootDir = new File(".");
79              }
80              if (!rootDir.exists()) {
81                  rootDir.mkdirs();
82              }
83              File file = linkToFilePath(
84                      link, rootDir,content.getContentType());
85              if (file == null){
86                  log.error("persist(): Unable to convert url "
87                          + link + " to a correct file name");
88                  logPers.error("Unable to convert url "
89                          + link + " to a correct file name");
90                  return;
91              }
92              log.debug("persist(): File is " + file.getAbsolutePath());
93              if (file.exists() && file.length() == buffer.length){
94                  log.info("persist(): File " +
95                          file.getAbsolutePath() + " exists");
96              } else {
97                  try {
98                      log.debug("persist(): Allocating buffer of size "
99                              + buffer.length);
100                     ByteBuffer bbuf = ByteBuffer.allocate(buffer.length);
101                     bbuf.put(buffer);
102                     bbuf.flip();
103                     // Create a writable file channel
104                     WritableByteChannel wChannel =
105                             new FileOutputStream(file).getChannel();
106                     log.debug("persist(): Got channel for file " +
107                             file.getAbsolutePath());
108 
109                     // Write the ByteBuffer contents; the bytes between the
110                     //ByteBuffer's position and the limit is written to the file
111                     int numWritten = wChannel.write(bbuf);
112                     log.debug("persist(): Wrote " + numWritten
113                             + " bytes on channel for file "
114                             + file.getAbsolutePath());
115 
116                     // Close the file
117                     wChannel.close();
118                     log.debug("persist(): Closed channel for file " +
119                             file.getAbsolutePath());
120                     bbuf.flip();
121                     /*
122                     res.setLinks(extractLinks(bbuf));
123                      */
124                     bbuf.clear();
125                     logPers.info("The buffer for url " + link
126                             + " was successfully "
127                             +"saved on file " + file.getAbsolutePath());
128 
129                 } catch (IOException e) {
130                     log.error("persist(): Problem saving buffer for file " +
131                             file.getAbsolutePath(), e);
132                     logPers.error("Error saving buffer for file " +
133                             file.getAbsolutePath()+ ": " + e.getMessage());
134 
135                 }
136             }
137         } else {
138             log.warn("persist(): the buffer for url " + link + " is NULL");
139             logPers.warn("The buffer for url " + link + " is NULL");
140         }
141 
142     }
143 
144     /***
145      *
146      * @param link
147      * @param rootDir
148      * @param cType
149      * @return
150      */
151     protected File linkToFilePath(Link link, File rootDir, String cType) {
152         log.debug("linkToFilePath(): BEGIN");
153         String  urlStr = link.toString();
154         String fileName = null;
155         try {
156             log.debug("linkToFilePath(): url string is "
157                     + urlStr +" cType=" + cType);
158 
159             //remove the http:// prefix
160             if (urlStr.toLowerCase().startsWith("http://")) {
161                 urlStr = urlStr.substring(7);
162             }
163             //remove the trailing /
164             if (urlStr.toLowerCase().endsWith("/")) {
165                 urlStr = urlStr.substring(0, urlStr.length() - 1);
166             }
167 
168             URL url = link.getURL();
169             String qs = url.getQuery();
170 
171             String ulrPath = url.getHost() + "/" + url.getPath();
172             if (qs != null) {
173                 qs = qs.replaceAll("////|/", "_");
174                 urlStr = ulrPath + "_" + qs;
175             }
176 
177             urlStr = urlStr.replaceAll(":|<|>|//||//*", "_");
178 
179             StringTokenizer st = new StringTokenizer(urlStr, "/");
180             int tokensNum = st.countTokens();
181             int counter = 0;
182             String path = rootDir.getAbsolutePath() + FILE_SEPARATOR;
183             File file = null;
184 
185             while(st.hasMoreElements()) {
186                 counter++;
187                 String elem = (String)st.nextElement();
188                 if (elem.length() == 0)
189                     continue;
190                 log.debug("linkToFilePath(): str=" + urlStr
191                         + "|token=" + elem
192                         + "|tokensNum=" + tokensNum
193                         + "|counter=" + counter);
194 
195                 if (counter == tokensNum &&
196                         elem.indexOf(".") >= 0 && tokensNum > 1) {
197                     //in order to be identified the filename
198                     //has to be the last token and has to contain a dot
199                     fileName = elem;
200                 } else if (getParameter("preservePath") != null &&
201                         getParameter("preservePath").equals("true")) {
202                 //} else {
203                     String dirName = path + elem;
204                     File dir = new File(dirName);
205                     if (!dir.exists()) {
206                         dir.mkdir();
207                         log.debug("linkToFilePath(): Created dir "
208                                 + dir.getAbsolutePath());
209                     }
210                     path += elem + FILE_SEPARATOR;
211                 }
212             }//while(st.hasMoreElements()) {
213 
214             //file creation
215             /*
216             fileName = fileName == null ?
217                 FILE_SEPARATOR + "index."
218                     + MimeTypeTranslator.getFileExtension(cType) :
219                 fileName;
220             */
221             String ext = "unknown";
222             try {
223                 ext = MimeTypeTranslator.getFileExtension(cType);
224             } catch (Exception e) {
225                 log.warn(e.getMessage());
226             }
227             if (fileName == null) {
228                 fileName = FILE_SEPARATOR + "index." + ext;
229             } else if (!fileName.toLowerCase().endsWith(ext)) {
230                 fileName += "." + ext;
231             }
232 
233             fileName = path + fileName;
234 
235             file = new File(fileName);
236 
237             if (!file.exists()) {
238                 file.createNewFile();
239                 log.debug("linkToFilePath(): created file " +
240                         file.getAbsolutePath() + " for url: " + urlStr);
241             } else {
242                 log.warn("linkToFilePath(): The file " + fileName
243                         + " already exists!");
244             }
245             return file;
246 
247         } catch(Exception e) {
248             log.error("linkToFilePath(): Unable to create file: " + fileName +
249                     " Error: " + e.getMessage());
250             return null;
251         } finally{
252             log.debug("linkToFilePath(): END");
253         }
254     }
255 
256 }