Crawler.java
/** A simple web crawler that starts from a given URL and collects pages containing the given key words.
*
* @version 1.0 Aug 2012
*
* @author Xiaomou Wang
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Crawler implements Runnable {
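// per-host disallow lists parsed from each host's robots.txt (host -> disallowed paths)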
private HashMap<String, ArrayList<String>> disallowLinks = new HashMap<String, ArrayList<String>>();
private String startUrl = null;
private String searchString = null;
private int maxUrlNum = 0;
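// URLs whose pages contain the search string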
ArrayList<String> result = new ArrayList<String>();
boolean caseSensitive = false;
/**
* @param startUrl the first URL to crawl
* @param searchString the key words
* @param maxUrlNum the number of URLs we want to get
*/
public Crawler(String startUrl, String searchString, int maxUrlNum) {
this.startUrl = startUrl;
this.searchString = searchString;
this.maxUrlNum = maxUrlNum;
}
/**
*
* @return the list of URLs whose pages contain the search string
*/
public ArrayList<String> getResult() {
return result;
}
/**
*
* @param urlToVerify the URL to be checked
* @return whether the URL is allowed to be crawled
*/
private boolean ifURLAllowedToCrawl(URL urlToVerify) {
String host = urlToVerify.getHost().toLowerCase();
ArrayList<String> disallowList = disallowLinks.get(host);
// if robots.txt for this host has not been fetched yet, fetch and parse it
if (disallowList == null) {
disallowList = new ArrayList<String>();
try {
URL robotUrl = new URL("http://" + host + "/robots.txt");
BufferedReader reader = new BufferedReader(
new InputStreamReader(robotUrl.openStream()));
// read the file line by line
String line;
while ((line = reader.readLine()) != null) {
// collect every path that is forbidden to visit
int index = line.indexOf("Disallow:");
if (index == 0) {
String file = line.substring("Disallow:".length());
// cut off any trailing comment
index = file.indexOf("#");
if (index != -1) {
file = file.substring(0, index);
}
file = file.trim();
// add the path to the disallow list for this host
disallowList.add(file);
}
}
reader.close();
} catch (IOException e) {
// no robots.txt found: assume everything on this host may be crawled
System.out.println("No robots.txt found for " + host + " " + e);
return true;
}
disallowLinks.put(host, disallowList);
}
// check whether the path of this URL starts with any disallowed path
String path = urlToVerify.getPath();
for (String disallowed : disallowList) {
if (disallowed.length() > 0 && path.startsWith(disallowed))
return false;
}
return true;
}
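/* For reference, a robots.txt entry such as (illustrative values only):
     User-agent: *
     Disallow: /private/
   adds "/private/" to the disallow list for that host, so any URL on the host whose path
   starts with /private/ is skipped by the crawler. */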
/**
*
* @param url the string to be verified
* @return the parsed URL if the string is a well-formed http URL, otherwise null
*/
private URL getUrl(String url) {
// only accept absolute URLs that use the http:// protocol
if (!url.toLowerCase().startsWith("http://"))
return null;
URL verifiedUrl = null;
try {
verifiedUrl = new URL(url);
} catch (MalformedURLException e) {
//System.out.println("Error in getUrl " + e);
//System.out.println("URL is " + url);
return null;
}
return verifiedUrl;
}
/**
*
* @param pageUrl the URL of the web page to be downloaded
* @return the web page content
* @throws IOException
*/
private String downLoad(URL pageUrl) throws IOException {
try {
// download the web page at pageUrl
BufferedReader reader = new BufferedReader(new InputStreamReader(
pageUrl.openStream()));
StringBuffer page = new StringBuffer();
String s = null;
while ((s = reader.readLine()) != null) {
// keep a separator so words on adjacent lines are not glued together
page.append(s).append("\n");
}
reader.close();
return page.toString();
} catch (IOException e) {
System.out.println("Error in downloading " + pageUrl + ": " + e);
return null;
}
}
/**
*
* @param url the URL of the page being scanned
* @param pageContent the content of that page
* @param crawledUrl the set of URLs that have already been crawled
* @return all links on this page
*/
private ArrayList<String> retrieveLink(URL url, String pageContent,
HashSet<String> crawledUrl) {
// regular expression that captures the href value of each anchor tag
Pattern p = Pattern.compile("<a\\s+href=\"(.*?)[\"|>]",
Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(pageContent);
// to maintain the links on this page
ArrayList<String> listOfLink = new ArrayList<String>();
while (m.find()) {
String link = m.group(1).trim();
// if not a proper link
if (link.length() < 1) {
continue;
}
// ignore links that point to a location within the same page
if (link.charAt(0) == '#') {
continue;
}
// ignore the link for email and javascript
if ((link.indexOf("mailto") != -1)
|| (link.toLowerCase().indexOf("javascript") != -1)) {
continue;
}
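// getUrl only accepts absolute http:// URLs, so relative links are skipped here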
URL verifiedLink = getUrl(link);
// if not a correct url
if (verifiedLink == null) {
continue;
}
// if the link has been visited
if (crawledUrl.contains(link)) {
continue;
}
listOfLink.add(link);
}
return listOfLink;
}
/**
*
* @param searchString the key words
* @param pageContent the page content
* @param isCaseSensitive whether the search is case sensitive
* @return whether the page contains all of the key words
*/
private boolean searchStringMatchs(String searchString, String pageContent,
boolean isCaseSensitive) {
// an empty search string or page can never match
if (searchString.length() < 1 || pageContent.length() < 1) {
return false;
}
String pageToSearch = pageContent;
if (!isCaseSensitive) {
pageToSearch = pageContent.toLowerCase();
}
// split the key words on whitespace
Pattern p = Pattern.compile("[\\s]+");
String[] terms = p.split(searchString);
// the page must contain every key word
for (int i = 0; i < terms.length; ++i) {
if (isCaseSensitive) {
if (pageToSearch.indexOf(terms[i]) == -1) {
return false;
}
} else {
if (pageToSearch.indexOf(terms[i].toLowerCase()) == -1) {
return false;
}
}
}
return true;
}
/**
*
* @param verifiedUrl the verified URL to download and search
* @param crawledList URLs that have already been checked
* @param toCrawlList URLs that have not yet been checked
* @param tmpUrl the original URL string taken from the to-crawl list
*/
private void checkIfUrlContainKey(URL verifiedUrl, HashSet<String> crawledList, ArrayList<String> toCrawlList, String tmpUrl) {
String pageContent;
try {
// download the page
pageContent = this.downLoad(verifiedUrl);
if (pageContent == null) {
System.out.println("Download failed");
return;
}
// add the URL to the crawled list
crawledList.add(tmpUrl);
if (pageContent.length() > 0) {
// extract all links on this page
ArrayList<String> links = retrieveLink(verifiedUrl,
pageContent, crawledList);
// add them to the to-crawl list
toCrawlList.addAll(links);
// check whether the page contains all the key words
if (searchStringMatchs(searchString, pageContent, caseSensitive)) {
result.add(tmpUrl);
System.out.println(tmpUrl);
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
/**
*
* @param startUrl the start url
* @param maxUrlNum the max number of urls we want
* @param searchString the key words
* @param isCaseSensitive whether the search is case sensitive
* @return result set
*/
public ArrayList<String> crawl(String startUrl, int maxUrlNum,
String searchString, boolean isCaseSensitive) {
System.out.println("seach String " + searchString);
// the links visited
HashSet<String> crawledList = new HashSet<String>();
// the links to be visit
ArrayList<String> toCrawlList = new ArrayList<String>();
if (startUrl.length() < 1) {
System.out.println("wrong strat URL");
return null;
}
if (maxUrlNum < 1) {
System.out.println("max is less than one");
return null;
}
toCrawlList.add(startUrl);
while (toCrawlList.size() > 0) {
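// stop once the requested number of URLs has been crawled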
if (crawledList.size() == maxUrlNum)
break;
// take the next URL from the to-crawl list
String tmpUrl = toCrawlList.remove(0);
// check that it is a well-formed URL
URL verifiedUrl = getUrl(tmpUrl);
if (verifiedUrl == null)
continue;
// skip URLs that robots.txt disallows for this host
if (!ifURLAllowedToCrawl(verifiedUrl))
continue;
// download the page and check it for the key words
checkIfUrlContainKey(verifiedUrl, crawledList, toCrawlList, tmpUrl);
}
return this.result;
}
/**
* Runs the crawler in its own thread using the parameters given to the constructor.
*/
public void run() {
System.out.println("thread created");
crawl(startUrl, maxUrlNum, searchString, caseSensitive);
System.out.println("thread exit");
}
/**
*
* @param args input arguments: start URL, max number of URLs, and key words
*/
public static void main(String[] args) {
if (args.length != 3) {
System.out.println("Input should be start Url, max number of urls and key words");
return;
}
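// note: Integer.parseInt throws NumberFormatException if args[1] is not an integer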
int max = Integer.parseInt(args[1]);
Crawler crawler = new Crawler(args[0], args[2], max);
Thread worker = new Thread(crawler);
System.out.println("Start searching...");
System.out.println("result:");
worker.start();
}
}
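/* Example invocation (illustrative values only; any reachable start URL and key words will do):
     javac Crawler.java
     java Crawler http://www.example.com 50 "open source"
   This starts at http://www.example.com, follows links until 50 pages have been crawled,
   and prints every URL whose page contains all of the key words. */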