View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: MimeTypeTranslator.java,v 1.6 2005/08/05 15:55:53 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.extractor;
28  
29  /***
30   *
31   *
32   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
33   * @version <tt>$Revision: 1.6 $</tt>
34   */
35  public class MimeTypeTranslator {
36  
37      private static final String[][] translationMatrix = {
38          {"323","text/h323"},
39          {"acx","application/internet-property-stream"},
40          {"ai","application/postscript"},
41          {"aif","audio/x-aiff"},
42          {"aifc","audio/x-aiff"},
43          {"aiff","audio/x-aiff"},
44          {"asf","video/x-ms-asf"},
45          {"asr","video/x-ms-asf"},
46          {"asx","video/x-ms-asf"},
47          {"au","audio/basic"},
48          {"avi","video/x-msvideo"},
49          {"axs","application/olescript"},
50          {"bas","text/plain"},
51          {"bcpio","application/x-bcpio"},
52          {"bin","application/octet-stream"},
53          {"bmp","image/bmp"},
54          {"c","text/plain"},
55          {"cat","application/vnd.ms-pkiseccat"},
56          {"cdf","application/x-cdf"},
57          {"cer","application/x-x509-ca-cert"},
58          {"class","application/octet-stream"},
59          {"clp","application/x-msclip"},
60          {"cmx","image/x-cmx"},
61          {"cod","image/cis-cod"},
62          {"cpio","application/x-cpio"},
63          {"crd","application/x-mscardfile"},
64          {"crl","application/pkix-crl"},
65          {"crt","application/x-x509-ca-cert"},
66          {"csh","application/x-csh"},
67          {"css","text/css"},
68          {"dcr","application/x-director"},
69          {"der","application/x-x509-ca-cert"},
70          {"dir","application/x-director"},
71          {"dll","application/x-msdownload"},
72          {"dms","application/octet-stream"},
73          {"doc","application/msword"},
74          {"dot","application/msword"},
75          {"dvi","application/x-dvi"},
76          {"dxr","application/x-director"},
77          {"eps","application/postscript"},
78          {"etx","text/x-setext"},
79          {"evy","application/envoy"},
80          {"exe","application/octet-stream"},
81          {"fif","application/fractals"},
82          {"flr","x-world/x-vrml"},
83          {"gif","image/gif"},
84          {"gtar","application/x-gtar"},
85          {"gz","application/x-gzip"},
86          {"h","text/plain"},
87          {"hdf","application/x-hdf"},
88          {"hlp","application/winhlp"},
89          {"hqx","application/mac-binhex40"},
90          {"hta","application/hta"},
91          {"htc","text/x-component"},
92          {"html","text/html"},
93          {"htm","text/html"},
94          {"htt","text/webviewhtml"},
95          {"ico","image/x-icon"},
96          {"ief","image/ief"},
97          {"iii","application/x-iphone"},
98          {"ins","application/x-internet-signup"},
99          {"isp","application/x-internet-signup"},
100         {"jfif","image/pipeg"},
101         {"jpe","image/jpeg"},
102         {"jpeg","image/jpeg"},
103         {"jpg","image/jpeg"},
104         {"js","application/x-javascript"},
105         {"latex","application/x-latex"},
106         {"lha","application/octet-stream"},
107         {"lsf","video/x-la-asf"},
108         {"lsx","video/x-la-asf"},
109         {"lzh","application/octet-stream"},
110         {"m13","application/x-msmediaview"},
111         {"m14","application/x-msmediaview"},
112         {"m3u","audio/x-mpegurl"},
113         {"man","application/x-troff-man"},
114         {"mdb","application/x-msaccess"},
115         {"me","application/x-troff-me"},
116         {"mht","message/rfc822"},
117         {"mhtml","message/rfc822"},
118         {"mid","audio/mid"},
119         {"mny","application/x-msmoney"},
120         {"mov","video/quicktime"},
121         {"movie","video/x-sgi-movie"},
122         {"mp2","video/mpeg"},
123         {"mp3","audio/mpeg"},
124         {"mpa","video/mpeg"},
125         {"mpe","video/mpeg"},
126         {"mpeg","video/mpeg"},
127         {"mpg","video/mpeg"},
128         {"mpp","application/vnd.ms-project"},
129         {"mpv2","video/mpeg"},
130         {"ms","application/x-troff-ms"},
131         {"mvb","application/x-msmediaview"},
132         {"nws","message/rfc822"},
133         {"oda","application/oda"},
134         {"p10","application/pkcs10"},
135         {"p12","application/x-pkcs12"},
136         {"p7b","application/x-pkcs7-certificates"},
137         {"p7c","application/x-pkcs7-mime"},
138         {"p7m","application/x-pkcs7-mime"},
139         {"p7r","application/x-pkcs7-certreqresp"},
140         {"p7s","application/x-pkcs7-signature"},
141         {"pbm","image/x-portable-bitmap"},
142         {"pdf","application/pdf"},
143         {"pfx","application/x-pkcs12"},
144         {"pgm","image/x-portable-graymap"},
145         {"pko","application/ynd.ms-pkipko"},
146         {"pma","application/x-perfmon"},
147         {"pmc","application/x-perfmon"},
148         {"pml","application/x-perfmon"},
149         {"pmr","application/x-perfmon"},
150         {"pmw","application/x-perfmon"},
151         {"pnm","image/x-portable-anymap"},
152         {"pot,","application/vnd.ms-powerpoint"},
153         {"ppm","image/x-portable-pixmap"},
154         {"pps","application/vnd.ms-powerpoint"},
155         {"ppt","application/vnd.ms-powerpoint"},
156         {"prf","application/pics-rules"},
157         {"ps","application/postscript"},
158         {"pub","application/x-mspublisher"},
159         {"qt","video/quicktime"},
160         {"ra","audio/x-pn-realaudio"},
161         {"ram","audio/x-pn-realaudio"},
162         {"ras","image/x-cmu-raster"},
163         {"rgb","image/x-rgb"},
164         {"rmi","audio/mid"},
165         {"roff","application/x-troff"},
166         {"rtf","application/rtf"},
167         {"rtx","text/richtext"},
168         {"scd","application/x-msschedule"},
169         {"sct","text/scriptlet"},
170         {"setpay","application/set-payment-initiation"},
171         {"setreg","application/set-registration-initiation"},
172         {"sh","application/x-sh"},
173         {"shar","application/x-shar"},
174         {"sit","application/x-stuffit"},
175         {"snd","audio/basic"},
176         {"spc","application/x-pkcs7-certificates"},
177         {"spl","application/futuresplash"},
178         {"src","application/x-wais-source"},
179         {"sst","application/vnd.ms-pkicertstore"},
180         {"stl","application/vnd.ms-pkistl"},
181         {"stm","text/html"},
182         {"sv4cpio","application/x-sv4cpio"},
183         {"sv4crc","application/x-sv4crc"},
184         {"t","application/x-troff"},
185         {"tar","application/x-tar"},
186         {"tcl","application/x-tcl"},
187         {"tex","application/x-tex"},
188         {"texi","application/x-texinfo"},
189         {"texinfo","application/x-texinfo"},
190         {"tgz","application/x-compressed"},
191         {"tif","image/tiff"},
192         {"tiff","image/tiff"},
193         {"tr","application/x-troff"},
194         {"trm","application/x-msterminal"},
195         {"tsv","text/tab-separated-values"},
196         {"txt","text/plain"},
197         {"uls","text/iuls"},
198         {"ustar","application/x-ustar"},
199         {"vcf","text/x-vcard"},
200         {"vrml","x-world/x-vrml"},
201         {"wav","audio/x-wav"},
202         {"wcm","application/vnd.ms-works"},
203         {"wdb","application/vnd.ms-works"},
204         {"wks","application/vnd.ms-works"},
205         {"wmf","application/x-msmetafile"},
206         {"wps","application/vnd.ms-works"},
207         {"wri","application/x-mswrite"},
208         {"wrl","x-world/x-vrml"},
209         {"wrz","x-world/x-vrml"},
210         {"xaf","x-world/x-vrml"},
211         {"xbm","image/x-xbitmap"},
212         {"xla","application/vnd.ms-excel"},
213         {"xlc","application/vnd.ms-excel"},
214         {"xlm","application/vnd.ms-excel"},
215         {"xls","application/vnd.ms-excel"},
216         {"xlt","application/vnd.ms-excel"},
217         {"xlw","application/vnd.ms-excel"},
218         {"xof","x-world/x-vrml"},
219         {"xpm","image/x-xpixmap"},
220         {"xwd","image/x-xwindowdump"},
221         {"z","application/x-compress"},
222         {"zip","application/zip"}
223     };
224 
225     /*** Creates a new instance of MimeTypeTranslator */
226     private MimeTypeTranslator() {
227     }
228 
229     /***
230      *
231      * @param extension
232      * @throws org.smartcrawler.extractor.UnhandledMimeTypeException
233      * @return
234      */
235     public static String getMimeType(String extension)
236     throws UnhandledMimeTypeException {
237 
238         if (extension == null) {
239             throw new NullPointerException("Unable to parse null file extension");
240         }
241         if (extension.startsWith(".")) {
242             extension = extension.substring(1);
243         }
244         for (int i = 0; i < translationMatrix.length; i++) {
245             if (translationMatrix[i][0].equalsIgnoreCase(extension)) {
246                 return translationMatrix[i][1];
247             }
248         }
249         throw new UnhandledMimeTypeException("Unhadled file extension: "
250                 + extension);
251     }
252 
253     /***
254      *
255      * @param contentType
256      * @throws org.smartcrawler.extractor.UnhandledMimeTypeException
257      * @return
258      */
259     public static String getFileExtension(String contentType)
260     throws UnhandledMimeTypeException {
261 
262         if (contentType == null) {
263             throw new NullPointerException("Unable to parse null Content Type");
264         }
265         String mime = null;
266         int endIndex = contentType.indexOf(";");
267         if (endIndex > 0)
268             mime = contentType.substring(0, endIndex);
269         else
270             mime = contentType;
271         for (int i = 0; i < translationMatrix.length; i++) {
272             if (translationMatrix[i][1].equalsIgnoreCase(mime)) {
273                 return translationMatrix[i][0];
274             }
275         }
276         throw new UnhandledMimeTypeException("Unhadled Content Type: "
277                 + mime);
278     }
279 }
280