-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.js
93 lines (70 loc) · 2.44 KB
/
crawler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
var casper = require('casper').create();
var fs = require('fs');
// Input file listing the sites to crawl, one URL per line.
// The default is "links.txt"; if you use another file, change it here.
var file = "links.txt";
// File the discovered email addresses are saved to.
var output = "emails.txt";
// URLs read from `file`, in file order.
var links = [];
var openStream = fs.open(file, 'r');
console.log("The Friendly Ghost is a PR-bot that scrapes and saves email addresses into an emails.txt file. Sites to search:");
// Read the link file line by line, logging each URL and queueing it for the crawl.
try {
    while (!openStream.atEnd()) {
        var rawLine = openStream.readLine();
        // Trim stray whitespace (e.g. a trailing "\r") and skip blank lines so
        // that an empty string is never queued as a URL to open.
        var line = typeof rawLine === 'string' ? rawLine.trim() : '';
        if (line.length > 0) {
            links.push(line);
            console.log(line);
        }
    }
} finally {
    // Release the file handle even if reading fails part-way through.
    openStream.close();
}
// Matches email addresses: a dot-separated local part, "@", and a dotted domain.
// Written as a regex literal so that `\.` really is a literal dot (inside a
// string passed to `new RegExp` the backslash is dropped and `.` matches any
// character). The `g` flag makes match() return every occurrence and the `i`
// flag accepts upper-case addresses.
var EMAIL_PATTERN = /[a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/gi;

// Set once the output file has been written during this run, so that later
// pages append to it instead of overwriting the results of earlier pages.
var hasWrittenOutput = false;

/**
 * Returns every email address contained in a value.
 *
 * Only the matched address is returned, so a leading "mailto:" or any other
 * text surrounding the address is stripped automatically.
 *
 * @param {*} value - Candidate text (an href or an element's text content).
 * @returns {string[]} The addresses found; empty when `value` is not a string
 *     or contains no address.
 */
function extractEmails(value) {
    if (typeof value !== 'string') {
        return [];
    }
    return value.match(EMAIL_PATTERN) || [];
}

// Queue one step per link: open the page, collect emails from its anchors and
// paragraphs, then log them and save them to the output file.
casper.start().each(links, function (self, link) {
    self.thenOpen(link, function () {
        // GRAB EMAILS FROM LINKS AND HTML
        // grab the "href" HTML attributes and the paragraph elements
        var hrefs;
        var paragraphs;
        try {
            hrefs = this.getElementsAttribute('a', 'href');
            paragraphs = this.getElementsInfo('p');
            this.echo('Searching ' + this.getCurrentUrl());
        } catch (e) {
            console.log("Problem accessing page " + link + ' ' + e);
            return;
        }

        // Emails found on this page, hrefs first and then paragraph text.
        var emailArray = [];
        var i;
        for (i = 0; i < hrefs.length; i++) {
            emailArray = emailArray.concat(extractEmails(hrefs[i]));
        }
        for (i = 0; i < paragraphs.length; i++) {
            emailArray = emailArray.concat(extractEmails(paragraphs[i].text));
        }

        // print out and write to file all the emails discovered for that url
        if (emailArray.length === 0) {
            console.log("No emails found.");
            return;
        }

        // The first write of a run truncates the file ('w'); every later page
        // appends ('a') so earlier results are kept.
        var downloadStream = fs.open(output, hasWrittenOutput ? 'a' : 'w');
        hasWrittenOutput = true;
        try {
            for (i = 0; i < emailArray.length; i++) {
                downloadStream.writeLine(emailArray[i]);
                console.log(emailArray[i]);
            }
            downloadStream.flush();
        } finally {
            // Always release the file handle, even if a write fails.
            downloadStream.close();
        }
    });
});
casper.run();