-
Notifications
You must be signed in to change notification settings - Fork 1
/
feeds.expat.js
145 lines (122 loc) · 4.07 KB
/
feeds.expat.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/*******************************************************************************
*
* An expat based feed converter.
* Takes a url, retrieves the content (assumes it is XML).
* Parses it with Expat and then converts it to a JSON object.
* Sends it back.
*
* Inspired by, and in part heavily borrowed from:
*
* https://github.com/ibrow/node-rss
* https://github.com/maqr/node-xml2js
*
******************************************************************************/
var util = require('util'),
events = require('events'),
expat = require('node-expat');
/**
 * Parser - turns an XML document into a plain JavaScript object using expat.
 *
 * Every element becomes an object containing:
 *   - '#name'      : the element name
 *   - this.ATTRKEY : an object of the element's attributes (only when it has any)
 *   - this.CHARKEY : the element's text with whitespace runs collapsed
 *                    (the key is removed when the text is blank)
 *   - <child name> : the child element, or an array of them when the name repeats
 *
 * When the outermost element closes, the finished object is stored in
 * `resultObject` and emitted as an 'end' event.
 */
var Parser = function() {
    // Set up EventEmitter state on this instance; 'end' is emitted further down.
    events.EventEmitter.call(this);

    var that = this;
    var stack = []; // currently open elements, innermost last

    this.resultObject = null;
    this.EXPLICIT_CHARKEY = false; // keep the CHARKEY wrapper even if there are no other keys
    this.CHARKEY = 'text';         // key under which an element's character data is stored
    this.ATTRKEY = '@';            // key under which an element's attributes are stored
    // Matches a single character outside printable ASCII (0x20-0x7E).
    this.cleaner = /[^\x20-\x7E]/;

    /**
     * Attach `child` to `parent` under `key`; a repeated key is promoted to an array.
     *
     * Only OWN properties count as "already present". Element names come from the
     * document, so a name such as `constructor` or `toString` must not be confused
     * with the member inherited from Object.prototype, and `__proto__` must become
     * an ordinary data property instead of replacing the prototype.
     */
    function attachChild(parent, key, child) {
        if (!Object.prototype.hasOwnProperty.call(parent, key)) {
            Object.defineProperty(parent, key, {
                value: child,
                writable: true,
                enumerable: true,
                configurable: true
            });
        } else if (Array.isArray(parent[key])) {
            parent[key].push(child);
        } else {
            // NOTE(review): a child element literally named like CHARKEY ('text')
            // lands here and merges with the parent's own text buffer - confirm
            // whether such documents need supporting.
            parent[key] = [parent[key], child];
        }
    }

    // Create an expat parser
    this.parser = new expat.Parser();

    // Opening tag: start a fresh node and push it on the stack.
    this.parser.addListener('startElement', function(name, attrs) {
        var node = {};
        node[that.CHARKEY] = "";
        for (var key in attrs) {
            // Create the attribute container lazily so attribute-less
            // elements carry no ATTRKEY at all.
            if (typeof node[that.ATTRKEY] === 'undefined') {
                node[that.ATTRKEY] = {};
            }
            node[that.ATTRKEY][key] = attrs[key];
        }
        node['#name'] = name; // store the node name
        stack.push(node);
    });

    // Closing tag: finalise the node's text and hand it to its parent.
    this.parser.addListener('endElement', function(name) {
        var node = stack.pop();
        var parent = stack[stack.length - 1];

        if (node[that.CHARKEY].match(/^\s*$/)) {
            // remove the CHARKEY entry altogether if it's blank
            delete node[that.CHARKEY];
        } else {
            // turn 2 or more whitespace characters into one space
            node[that.CHARKEY] = node[that.CHARKEY].replace(/\s{2,}/g, " ").trim();
            // Collapse the node to its bare text when CHARKEY is its only key,
            // unless EXPLICIT_CHARKEY is set.
            // NOTE(review): '#name' is always added in startElement and never
            // removed, so with the default keys a node has at least two keys here
            // and this branch looks unreachable - confirm what consumers expect
            // before changing it.
            if (Object.keys(node).length === 1 && that.CHARKEY in node && !(that.EXPLICIT_CHARKEY)) {
                node = node[that.CHARKEY];
            }
        }

        if (stack.length > 0) {
            // set up the parent element relationship
            attachChild(parent, name, node);
        } else {
            // The outermost element just closed: publish the result.
            that.resultObject = node;
            that.emit("end", that.resultObject);
        }
    });

    // Character data: append to the innermost open element.
    this.parser.addListener('text', function(t) {
        var current = stack[stack.length - 1];
        if (current) {
            // NOTE(review): `cleaner` has no `g` flag, so only the FIRST
            // non-printable-ASCII character of each chunk is removed - confirm
            // whether stripping every occurrence was intended.
            t = t.replace(that.cleaner, "");
            current[that.CHARKEY] += t;
        }
    });
};
/**
* parseURL() Parses an RSS feed from a URL.
*
* @param url -
* URL of the RSS feed file
* @param cb -
* callback function to be triggered at end of parsing
*
* @TODO - decent error checking
*/
exports.parseURL = function(url, cb) {
var u = require('url');
var parts = u.parse(url);
if(parts.protocol === 'https:') {
client = require('https');
} else {
client = require('http');
if(!parts.port) {
parts.port = 80;
}
}
client.get({ host: parts.hostname, port: parts.port, path: parts.pathname }, function(res) {
var data = '';
res.setEncoding('utf8');
res.on('data', function(d) {
data += d;
});
res.on('end', function() {
var parser = new Parser();
parser.addListener('end', function(data) {
cb(null,data);
});
parser.parse(data);
});
}).on('error', function(err) {
cb(err,null);
});
}
// Parser emits an 'end' event, so it has to inherit from EventEmitter.
// `util` is the module this file requires; no `sys` binding is defined here.
util.inherits(Parser, events.EventEmitter);

/**
 * Feed an XML string to the underlying expat parser.
 *
 * @param str - XML text to parse
 * @returns whatever the underlying parser's parse() returns
 *          (presumably false on a parse error - confirm against the
 *          installed node-expat version)
 */
Parser.prototype.parse = function(str) {
    return this.parser.parse(str);
};

exports.Parser = Parser;