1
2 /*
3 * SmartCrawler
4 *
5 * $Id: LinkBuilderImpl.java,v 1.7 2005/07/08 12:09:08 vincool Exp $
6 * Copyright 2005 Davide Pozza
7 *
8 * This program is free software; you can redistribute it
9 * and/or modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation;
11 * either version 2 of the License, or (at your option) any
12 * later version.
13 *
14 * This program is distributed in the hope that it will be
15 * useful, but WITHOUT ANY WARRANTY; without even the implied
16 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17 * PURPOSE. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the Free
22 * Software Foundation, Inc., 59 Temple Place, Suite 330,
23 * Boston, MA 02111-1307 USA
24 *
25 */
26
27 package org.smartcrawler.extractor;
28
29 import org.apache.log4j.Logger;
30 import org.smartcrawler.common.Link;
31 import org.smartcrawler.common.MalformedLinkException;
32 import org.smartcrawler.common.SCLogger;
33
34 /***
35 *
36 *
37 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
38 * @version <tt>$Revision: 1.7 $</tt>
39 */
40 public class LinkBuilderImpl implements LinkBuilder {
41
42
43 private static Logger log = SCLogger.getLogger(LinkBuilderImpl.class);
44 private static Logger logLink = SCLogger.getLinkLogger();
45
46 private Link parsedPageLink;
47 private String parsedPagePath;
48 private String hostName;
49
50 /***
51 * Creates a new instance of LinkBuilder
52 * @param parsedPageLink
53 */
54 public LinkBuilderImpl(Link parsedPageLink) {
55 this.parsedPageLink = parsedPageLink;
56 try {
57 hostName = parsedPageLink.getHost();
58 log.debug("LinkBuilderImpl(): hostName="+hostName);
59 parsedPagePath = parsedPageLink.getPath(false);
60
61 } catch (Exception e){
62 hostName = null;
63 log.debug("LinkBuilderImpl(): Invalid link " + parsedPageLink);
64 }
65 }
66
67 /***
68 *
69 * @param htmlURL
70 * @return
71 */
72 public Link buildLink(HtmlURL htmlURL) throws MalformedLinkException {
73
74 log.debug("buildLink(): BEGIN");
75
76 String extractedURL = htmlURL.getCleanedLinkAsString();
77
78 Link res = null;
79 //String strCurrItemLink = currItemLink.toString();
80 log.debug("buildLink(): normalizing: " + extractedURL
81 + " of type " + HtmlURL.LINK_ABSOLUTE_URI);
82 //String tmpExtractedLink = this.cleanedURL;
83 String tmpLinkStr = "";
84
85 //validity check
86 if(!htmlURL.isValid()) {
87 log.debug("buildLink(): Invalid link " + extractedURL);
88 return null;
89
90 }else if (htmlURL.getType() == HtmlURL.LINK_ABSOLUTE_URI) {
91 //
92 // ex. "/path1/file.htm"
93 //
94 tmpLinkStr = HtmlURL.PROTOCOL_PREF +
95 hostName + extractedURL;
96
97 } else if (htmlURL.getType() == HtmlURL.LINK_ABSOLUTE_URL) {
98 //
99 //ex. "http://www.satollo.com/path1/file.htm"
100 //
101 tmpLinkStr = extractedURL;
102
103 } else if (htmlURL.getType() == HtmlURL.LINK_RELATIVE) {
104 //
105 // ex. "../path1/file.htm" or "path1/file.htm" and not "/path1/file.htm"
106 //
107
108 tmpLinkStr = HtmlURL.PROTOCOL_PREF + hostName;//Ex. http://www.brucalipto.org
109 String tmpExtractedURL = extractedURL;//ex. images/95.png
110 String newLinkPath = parsedPagePath;//ex. /
111
112
113 //log.debug("buildLink(): tmpLinkStr=" + tmpLinkStr);
114 //log.debug("buildLink(): tmpExtractedURL=" + tmpExtractedURL);
115 //log.debug("buildLink(): newLinkPath=" + newLinkPath);
116 //log.debug("buildLink(): parsedPagePath=" + parsedPagePath);
117
118 if (!tmpExtractedURL.startsWith("../")) {
119 tmpLinkStr += newLinkPath + HtmlURL.PATH_SEP + tmpExtractedURL;
120
121 } else {
122 //int dummy = 0;
123 while(tmpExtractedURL.startsWith("../")) {
124 //remove the last level on newLinkPath and
125 //the first ../ on the link
126
127 /*
128 logLink.info(
129 "WHILE BEFORE: " + (++dummy) +
130 " parsedPageLink=" + parsedPageLink +
131 " parsedPagePath=" + parsedPagePath +
132 " tmpExtractedURL="+tmpExtractedURL +
133 " newLinkPath=" + newLinkPath +
134 " tmpLinkStr=" + tmpLinkStr);
135 */
136 if (newLinkPath.length() > 0) {
137 int idx = newLinkPath.lastIndexOf(HtmlURL.PATH_SEP);
138 if (idx >= 0) newLinkPath = newLinkPath.substring(0, idx);
139 }
140 tmpExtractedURL = tmpExtractedURL.substring(3);
141 tmpLinkStr += HtmlURL.PATH_SEP + newLinkPath;
142 /*
143 logLink.info(
144 "WHILE AFTER: " + (dummy) +
145 " parsedPageLink=" + parsedPageLink +
146 " parsedPagePath=" + parsedPagePath +
147 " tmpExtractedURL="+tmpExtractedURL +
148 " newLinkPath=" + newLinkPath +
149 " tmpLinkStr=" + tmpLinkStr);
150 */
151
152 }// while(tmpExtractedURL.startsWith("../")) {
153
154 tmpLinkStr += tmpExtractedURL;
155 }
156
157 } else {
158 log.warn("buildLink(): url " + extractedURL + " UNHANDLED!! ");
159 }
160 res = new Link(tmpLinkStr);
161
162 logLink.info(parsedPageLink + " " + parsedPagePath
163 + " " + extractedURL + " " + res);
164
165 log.debug("buildLink(): curr. level: " + parsedPageLink +
166 " orig. link: " + extractedURL +
167 "; normalized: " + res);
168
169 log.debug("buildLink(): END");
170 return res;
171 }
172
173 }