1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler.extractor;
28
29 /***
30 *
31 *
32 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
33 * @version <tt>$Revision: 1.6 $</tt>
34 */
35 public class MimeTypeTranslator {
36
37 private static final String[][] translationMatrix = {
38 {"323","text/h323"},
39 {"acx","application/internet-property-stream"},
40 {"ai","application/postscript"},
41 {"aif","audio/x-aiff"},
42 {"aifc","audio/x-aiff"},
43 {"aiff","audio/x-aiff"},
44 {"asf","video/x-ms-asf"},
45 {"asr","video/x-ms-asf"},
46 {"asx","video/x-ms-asf"},
47 {"au","audio/basic"},
48 {"avi","video/x-msvideo"},
49 {"axs","application/olescript"},
50 {"bas","text/plain"},
51 {"bcpio","application/x-bcpio"},
52 {"bin","application/octet-stream"},
53 {"bmp","image/bmp"},
54 {"c","text/plain"},
55 {"cat","application/vnd.ms-pkiseccat"},
56 {"cdf","application/x-cdf"},
57 {"cer","application/x-x509-ca-cert"},
58 {"class","application/octet-stream"},
59 {"clp","application/x-msclip"},
60 {"cmx","image/x-cmx"},
61 {"cod","image/cis-cod"},
62 {"cpio","application/x-cpio"},
63 {"crd","application/x-mscardfile"},
64 {"crl","application/pkix-crl"},
65 {"crt","application/x-x509-ca-cert"},
66 {"csh","application/x-csh"},
67 {"css","text/css"},
68 {"dcr","application/x-director"},
69 {"der","application/x-x509-ca-cert"},
70 {"dir","application/x-director"},
71 {"dll","application/x-msdownload"},
72 {"dms","application/octet-stream"},
73 {"doc","application/msword"},
74 {"dot","application/msword"},
75 {"dvi","application/x-dvi"},
76 {"dxr","application/x-director"},
77 {"eps","application/postscript"},
78 {"etx","text/x-setext"},
79 {"evy","application/envoy"},
80 {"exe","application/octet-stream"},
81 {"fif","application/fractals"},
82 {"flr","x-world/x-vrml"},
83 {"gif","image/gif"},
84 {"gtar","application/x-gtar"},
85 {"gz","application/x-gzip"},
86 {"h","text/plain"},
87 {"hdf","application/x-hdf"},
88 {"hlp","application/winhlp"},
89 {"hqx","application/mac-binhex40"},
90 {"hta","application/hta"},
91 {"htc","text/x-component"},
92 {"html","text/html"},
93 {"htm","text/html"},
94 {"htt","text/webviewhtml"},
95 {"ico","image/x-icon"},
96 {"ief","image/ief"},
97 {"iii","application/x-iphone"},
98 {"ins","application/x-internet-signup"},
99 {"isp","application/x-internet-signup"},
100 {"jfif","image/pipeg"},
101 {"jpe","image/jpeg"},
102 {"jpeg","image/jpeg"},
103 {"jpg","image/jpeg"},
104 {"js","application/x-javascript"},
105 {"latex","application/x-latex"},
106 {"lha","application/octet-stream"},
107 {"lsf","video/x-la-asf"},
108 {"lsx","video/x-la-asf"},
109 {"lzh","application/octet-stream"},
110 {"m13","application/x-msmediaview"},
111 {"m14","application/x-msmediaview"},
112 {"m3u","audio/x-mpegurl"},
113 {"man","application/x-troff-man"},
114 {"mdb","application/x-msaccess"},
115 {"me","application/x-troff-me"},
116 {"mht","message/rfc822"},
117 {"mhtml","message/rfc822"},
118 {"mid","audio/mid"},
119 {"mny","application/x-msmoney"},
120 {"mov","video/quicktime"},
121 {"movie","video/x-sgi-movie"},
122 {"mp2","video/mpeg"},
123 {"mp3","audio/mpeg"},
124 {"mpa","video/mpeg"},
125 {"mpe","video/mpeg"},
126 {"mpeg","video/mpeg"},
127 {"mpg","video/mpeg"},
128 {"mpp","application/vnd.ms-project"},
129 {"mpv2","video/mpeg"},
130 {"ms","application/x-troff-ms"},
131 {"mvb","application/x-msmediaview"},
132 {"nws","message/rfc822"},
133 {"oda","application/oda"},
134 {"p10","application/pkcs10"},
135 {"p12","application/x-pkcs12"},
136 {"p7b","application/x-pkcs7-certificates"},
137 {"p7c","application/x-pkcs7-mime"},
138 {"p7m","application/x-pkcs7-mime"},
139 {"p7r","application/x-pkcs7-certreqresp"},
140 {"p7s","application/x-pkcs7-signature"},
141 {"pbm","image/x-portable-bitmap"},
142 {"pdf","application/pdf"},
143 {"pfx","application/x-pkcs12"},
144 {"pgm","image/x-portable-graymap"},
145 {"pko","application/ynd.ms-pkipko"},
146 {"pma","application/x-perfmon"},
147 {"pmc","application/x-perfmon"},
148 {"pml","application/x-perfmon"},
149 {"pmr","application/x-perfmon"},
150 {"pmw","application/x-perfmon"},
151 {"pnm","image/x-portable-anymap"},
152 {"pot,","application/vnd.ms-powerpoint"},
153 {"ppm","image/x-portable-pixmap"},
154 {"pps","application/vnd.ms-powerpoint"},
155 {"ppt","application/vnd.ms-powerpoint"},
156 {"prf","application/pics-rules"},
157 {"ps","application/postscript"},
158 {"pub","application/x-mspublisher"},
159 {"qt","video/quicktime"},
160 {"ra","audio/x-pn-realaudio"},
161 {"ram","audio/x-pn-realaudio"},
162 {"ras","image/x-cmu-raster"},
163 {"rgb","image/x-rgb"},
164 {"rmi","audio/mid"},
165 {"roff","application/x-troff"},
166 {"rtf","application/rtf"},
167 {"rtx","text/richtext"},
168 {"scd","application/x-msschedule"},
169 {"sct","text/scriptlet"},
170 {"setpay","application/set-payment-initiation"},
171 {"setreg","application/set-registration-initiation"},
172 {"sh","application/x-sh"},
173 {"shar","application/x-shar"},
174 {"sit","application/x-stuffit"},
175 {"snd","audio/basic"},
176 {"spc","application/x-pkcs7-certificates"},
177 {"spl","application/futuresplash"},
178 {"src","application/x-wais-source"},
179 {"sst","application/vnd.ms-pkicertstore"},
180 {"stl","application/vnd.ms-pkistl"},
181 {"stm","text/html"},
182 {"sv4cpio","application/x-sv4cpio"},
183 {"sv4crc","application/x-sv4crc"},
184 {"t","application/x-troff"},
185 {"tar","application/x-tar"},
186 {"tcl","application/x-tcl"},
187 {"tex","application/x-tex"},
188 {"texi","application/x-texinfo"},
189 {"texinfo","application/x-texinfo"},
190 {"tgz","application/x-compressed"},
191 {"tif","image/tiff"},
192 {"tiff","image/tiff"},
193 {"tr","application/x-troff"},
194 {"trm","application/x-msterminal"},
195 {"tsv","text/tab-separated-values"},
196 {"txt","text/plain"},
197 {"uls","text/iuls"},
198 {"ustar","application/x-ustar"},
199 {"vcf","text/x-vcard"},
200 {"vrml","x-world/x-vrml"},
201 {"wav","audio/x-wav"},
202 {"wcm","application/vnd.ms-works"},
203 {"wdb","application/vnd.ms-works"},
204 {"wks","application/vnd.ms-works"},
205 {"wmf","application/x-msmetafile"},
206 {"wps","application/vnd.ms-works"},
207 {"wri","application/x-mswrite"},
208 {"wrl","x-world/x-vrml"},
209 {"wrz","x-world/x-vrml"},
210 {"xaf","x-world/x-vrml"},
211 {"xbm","image/x-xbitmap"},
212 {"xla","application/vnd.ms-excel"},
213 {"xlc","application/vnd.ms-excel"},
214 {"xlm","application/vnd.ms-excel"},
215 {"xls","application/vnd.ms-excel"},
216 {"xlt","application/vnd.ms-excel"},
217 {"xlw","application/vnd.ms-excel"},
218 {"xof","x-world/x-vrml"},
219 {"xpm","image/x-xpixmap"},
220 {"xwd","image/x-xwindowdump"},
221 {"z","application/x-compress"},
222 {"zip","application/zip"}
223 };
224
225 /*** Creates a new instance of MimeTypeTranslator */
226 private MimeTypeTranslator() {
227 }
228
229 /***
230 *
231 * @param extension
232 * @throws org.smartcrawler.extractor.UnhandledMimeTypeException
233 * @return
234 */
235 public static String getMimeType(String extension)
236 throws UnhandledMimeTypeException {
237
238 if (extension == null) {
239 throw new NullPointerException("Unable to parse null file extension");
240 }
241 if (extension.startsWith(".")) {
242 extension = extension.substring(1);
243 }
244 for (int i = 0; i < translationMatrix.length; i++) {
245 if (translationMatrix[i][0].equalsIgnoreCase(extension)) {
246 return translationMatrix[i][1];
247 }
248 }
249 throw new UnhandledMimeTypeException("Unhadled file extension: "
250 + extension);
251 }
252
253 /***
254 *
255 * @param contentType
256 * @throws org.smartcrawler.extractor.UnhandledMimeTypeException
257 * @return
258 */
259 public static String getFileExtension(String contentType)
260 throws UnhandledMimeTypeException {
261
262 if (contentType == null) {
263 throw new NullPointerException("Unable to parse null Content Type");
264 }
265 String mime = null;
266 int endIndex = contentType.indexOf(";");
267 if (endIndex > 0)
268 mime = contentType.substring(0, endIndex);
269 else
270 mime = contentType;
271 for (int i = 0; i < translationMatrix.length; i++) {
272 if (translationMatrix[i][1].equalsIgnoreCase(mime)) {
273 return translationMatrix[i][0];
274 }
275 }
276 throw new UnhandledMimeTypeException("Unhadled Content Type: "
277 + mime);
278 }
279 }
280