Myspider.py
import re
from bs4 import BeautifulSoup
from urllib import request
import chardet
import os
import gzip
import threading
import _thread
import socket
"""
这是一个基于urllib实现全网爬取的多线程爬虫
Create by Jack
"""
#List of keywords to crawl for
keywords = ['人工智能','AI','ai']
#Seed (entry-point) url list
urls = [
'http://www.ailab.cn/news/ainew/',
'http://news.chinaso.com/search?wd='+request.quote('人工智能+行业'),
'https://www.baidu.com/s?ie=utf-8&cus_sid=18364444163281065044&tn=SE_pscse_053x7tyx&wd=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%20%E8%A1%8C%E4%B8%9A'
'http://search.ifeng.com/sofeng/search.action?q=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD+%E8%A1%8C%E4%B8%9A&c=1&chel='
'http://search.southcn.com/web/search?channelid=216505&&searchword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%20%E8%A1%8C%E4%B8%9A',
'https://www.baidu.com/s?ie=utf-8&cus_f=0&wd=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%01%E8%A1%8C%E4%B8%9A&tn=71032170_cpr&rsv_lu=3_sb&si=china.com&ct=2097152&fenlei=mv6qUAdxTZPxTvb0IZRqIHDLnjmdnWc0T1YYPARYrHN-nAR4uhD1uHTL0AGo5HmLPH7hP1u9PHD4Pj9bnhm0IAYqnWm3PjfkPHRz0APh5Hn0Thcqn0K_IyVG5HD0mv4YUWYLnH01nWDLn7qWTZc0TLPs5HD0TLPsnWYk0ZwYTjYk0A4YIZ0qnfKLpHYY0Au1mv9VujYz0Zwb5HDLnjmdnWc0IgF_5y9YIZ0-nYD-nbm-nbuYuyPCFHF7mv9GUhD-nbNWUvY-nbm0',
'http://search.sina.com.cn/?q=%C8%CB%B9%A4%D6%C7%C4%DC+%D0%D0%D2%B5&range=all&c=news&sort=rel',
'http://sou.chinanews.com.cn/search.do?q=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD+%E8%A1%8C%E4%B8%9A&ps=10&time_scope=0&channel=all&sort=_score',
'http://news.sogou.com/news?query=%C8%CB%B9%A4%D6%C7%C4%DC+%D0%D0%D2%B5',
'https://www.sogou.com/sogou?site=news.qq.com&query=%C8%CB%B9%A4%D6%C7%C4%DC+%D0%D0%D2%B5&pid=sogou-wsse-b58ac8403eb9cf17-0004&sourceid=&idx=f&idx=f'
]
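#Note: the sina and sogou entries percent-encode the query in GBK (e.g. %C8%CB%B9%A4%D6%C7%C4%DC
#appears to be '人工智能' in GBK) rather than UTF-8, matching what those search endpoints expect.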
#A monotonically increasing number used to name the saved files
counter = 0
#Mutex guarding the threads' access to the shared counter
mutex = threading.Lock()
"""
@param: values 正文段列表
k 每次选取的段数
@return:left 正文起始段
right 正文结束段
"""
def findEnd(values,k=5):
    left,right = 0,0
    #slide a k-segment window: a dense window (>180 chars in total) whose first
    #segment is itself non-trivial (>15 chars) marks body text
    for i in range(len(values)):
        strCnt = sum(values[i:i+k])
        if strCnt > 180 and values[i] > 15:
            if left == 0:
                left = i
            right = i
    #extend the end over up to k further non-trivial segments
    cnt = 0
    for i in range(right,len(values)):
        if cnt >= k:
            break
        if values[i] > 15:
            right += 1
            cnt += 1
    return left,right
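#A worked example (hypothetical numbers) of how findEnd picks the dense window:
#with the default k=5,
#   findEnd([2, 40, 50, 60, 45, 30, 3, 1])
#returns (1, 6): the 5-segment window sum first exceeds 180 at index 1, and the
#trailing scan then extends right across each segment longer than 15 characters,
#so slicing segments[1:6] keeps exactly the dense, content-bearing block.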
"""
找出正文的起始位置和结束位置
@param: content 过滤标签后的网页
k 每段选取的行数
@return:left 正文起始段
right 正文结束段
"""
def filt(content,k=1):
if not content:
return None,None
lines = content.split('\n')
group_value = []
for i in range(0,len(lines),k):
group = '\n'.join(lines[i:i+k]).strip()
group_value.append(len(group))
    left,right = findEnd(group_value)
return left,right
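#With the default k=1 every line is its own segment, so group_value is simply
#the per-line character-count sequence that findEnd scans for a dense window.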
"""
基于正文提取算法(行快分布)实现的提取函数
@param: content 过滤标签后的网页
@return:res 正文
"""
def extract(content):
    if content.count('\n') < 10: #fewer than 10 newlines: assume the html is minified, so insert line breaks after tags
        content = content.replace('>','>\n')
    content = tool.replace(content)
    left,right = filt(content)
    return '\n'.join(content.split('\n')[left:right])
"""
为特殊网页结构而特别设计的提取函数
@param: content 过滤标签后的网页
@return:res 正文
"""
def extractForAilab(content):
    try:
        soup = BeautifulSoup(content,'html.parser')
        main = soup.find(attrs={'id':'mainDiv'})
        [s.extract() for s in main(['div'])] #drop nested <div>s (navigation, ads)
        s = main.get_text()
        r = re.compile(r'<[^>]+>',re.M|re.S)
        s = r.sub('',s).strip() #strip any remaining HTML tags
        r = re.compile(r'^\s+$', re.M|re.S) #remove whitespace-only lines
        s = r.sub('', s)
        r = re.compile(r'\n+',re.M|re.S) #collapse consecutive newlines into one
        s = r.sub('\n',s)
        return s
    except Exception as e:
        print(e)
        return ''
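#This assumes ailab.cn article pages wrap the body in <div id="mainDiv"> and keep
#navigation and ads in nested <div>s, which the main(['div']) extraction strips away.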
"""
子线程抓取页面,并以文本文件的方式将其存到本地磁盘中
@param: url 当前要抓取的页面
"""
def loop(url):
    try:
        global counter,mutex
        print('fetching %s...' %url)
        page = tool.fetchPage(url)
        if page == '':
            print('drop:%s'%url)
            tool.save(url+'\n','a','dropedUrls')
            return
        r = re.compile(r'<title>(.*?)</title>',re.M|re.S)
        title = ''.join(re.findall(r,page))
        if spider.getHost(url) == 'http://www.ailab.cn':
            body = extractForAilab(page)
        else:
            body = extract(page)
        if body == '':
            print('drop:%s'%url)
            tool.save(url+'\n','a','dropedUrls')
        else:
            with mutex: #critical section: counter is shared across threads
                counter += 1
                tool.save('标题:'+title+'\n\n'+body,name=str(counter)) #'标题' is the 'Title:' label prepended to the saved file
    except Exception as e:
        print('loop error:',e)
class MySpider:
def __init__(self,urls):
self.cur_depth = 0
self.queue = MyQueue()
tool.remove('urls')
tool.remove('important')
tool.remove('dropedUrls')
self.url_dict = dict()
if isinstance(urls,str):
self.queue.addUnvisitedUrl(urls,1)
            self.url_dict[urls] = 1
elif isinstance(urls,list):
for url in urls:
self.queue.addUnvisitedUrl(url,1)
self.url_dict[url] = 1
"""
通过bfs进行搜索爬取感兴趣的页面
@param: depth 爬取的深度
"""
    def crawl(self,depth):
        queue = self.queue
        for i in range(depth):
            unvisited_links = []
            while not queue.isUnvisitedEmpty():
                try:
                    cur_url = queue.unVisitedDequeue()
                    print('pop link:%s' % cur_url)
                    links = []
                    if cur_url is not None and cur_url != '':
                        links = self.fetchLinks(cur_url)
                    unvisited_links.extend(links)
                    print('get %d new links'%len(links))
                    queue.addVisitedUrl(cur_url)
                    print('visited count:%d' % queue.getVisitedNums())
                    print('current depth:%d' % (i+1))
                except Exception as e:
                    print('crawl error:',e)
                    continue
            if len(unvisited_links) > 0:
                for link in unvisited_links:
                    level = self.url_dict[link]
                    queue.addUnvisitedUrl(link,level)
            else:
                break
        print('unVisited count:%d' % queue.getunVisitedNums())
"""
解析当前页面中的全部url连接,抓取感兴趣的页面,并将过滤后的url列表加入待爬取队列中
这里用了一个简单的思想:1. 如果当前页面中的子页面是我们感兴趣的,则认为这个页面是重要的
2. 如果当前页面中我们感兴趣的子页面超过10个,则认为这个页面也是重要的
@param: depth 爬取的深度
"""
    def fetchLinks(self,url):
        links = []
        page = tool.fetchPage(url)
        page_num = 0
        url_dict = self.url_dict
        if page != '':
            try:
                soup = BeautifulSoup(page,'html.parser')
                items = soup.find_all('a')
                for item in items:
                    link = item.get('href')
                    title = item.get_text()
                    newlink = ''
                    pageType = 0
                    if link is not None and link != '' and len(link) > 3 and 'javascript' not in link:
                        for k in keywords:
                            if k in title: #anchor text matches a keyword: interesting page
                                pageType = 1
                                break
                        if link[:4] != 'http': #resolve relative links against the host
                            host = self.getHost(url)
                            if link[0] in ['/','?']:
                                newlink = host + link
                            else:
                                newlink = host + '/' + link
                        elif link[:4] == 'http':
                            newlink = link
                        if newlink != '':
                            if newlink not in url_dict:
                                links.append(newlink)
                                tool.save(newlink+'\n',option='a',name='urls')
                                if pageType == 1:
                                    _thread.start_new_thread(loop,(newlink,))
                                    page_num += 1
            except Exception as e:
                print('extract error:',e)
                return []
        if page_num >= 10: #a page with ten or more interesting children is itself important, and all its children are marked important
            tool.save(url+'\n','a',name="important")
            url_dict[url] = 1
        else:
            url_dict[url] = 0
        level = url_dict[url]
        for link in links: #children inherit the parent's importance level
            url_dict[link] = level
        return links
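    #Note: relative hrefs are resolved against the scheme+host only, e.g. on a page at
    #http://www.ailab.cn/news/ainew/ the href 'ainew/123.html' becomes
    #http://www.ailab.cn/ainew/123.html; the base url's path is dropped here
    #(urllib.parse.urljoin would keep it).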
"""
获取url中的域名
@param: url
"""
def getHost(self,url):
r = re.compile(r'(https?://)?[^/\s]*',re.S)
g = r.match(url)
host = ''.join(g.group())
return host
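    #e.g. getHost('http://www.ailab.cn/news/ainew/') -> 'http://www.ailab.cn'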
"""
判断url是否为html类型的
@param: url
"""
    def judgePage(self,url):
        r = re.compile(r'[xs]?htm[l]?$')
        s = re.findall(r,url)
        return len(s) == 1
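    #e.g. judgePage('http://www.ailab.cn/news/1.html') -> True (the suffix matches)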
"""
一个模拟队列的数据结构,这里做了个优化:
通过使用两个unVisited列表(first,second)实现多级队列,从而使感兴趣的内容能够尽快被访问到,而不是被其他无用的内容抢先
"""
class MyQueue:
    def __init__(self):
        self.visited = []
        self.unVisitedFirst = [] #two lists implement a two-level priority queue
        self.unVisitedSecond = []
def getVisitedUrls(self):
return self.visited
    def getUnvisitedUrls(self):
        return self.unVisitedFirst + self.unVisitedSecond
def addVisitedUrl(self,url):
self.visited.append(url)
def removeVisitedUrl(self,url):
self.visited.remove(url)
    #pop the last element (i.e. the queue head) off the list
    def unVisitedDequeue(self):
        try:
            if len(self.unVisitedFirst) > 0: #serve the second-level queue only when the first-level one is empty
                return self.unVisitedFirst.pop()
            else:
                return self.unVisitedSecond.pop()
except:
return None
    #insert a new url at the head of the list (i.e. the queue tail): first in, first out
def addUnvisitedUrl(self,url,level):
if url!='' :
if level == 1:
self.unVisitedFirst.insert(0,url)
else:
self.unVisitedSecond.insert(0,url)
def getVisitedNums(self):
return len(self.visited)
def getunVisitedNums(self):
return len(self.unVisitedFirst) + len(self.unVisitedSecond)
def isUnvisitedEmpty(self):
return len(self.unVisitedFirst)==0 and len(self.unVisitedSecond) == 0
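#A quick sketch of the two-level priority behaviour (hypothetical urls):
#   q = MyQueue()
#   q.addUnvisitedUrl('http://example.com/boring', 0)
#   q.addUnvisitedUrl('http://example.com/ai-news', 1)
#   q.unVisitedDequeue()  # -> 'http://example.com/ai-news' (level 1 served first)
#   q.unVisitedDequeue()  # -> 'http://example.com/boring'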
"""
类如其名,一个封装了各种操作的工具类
包括: 1.html页面标签过滤
2.解压gzip压缩的页面
3.保存文件到磁盘中
4.删除特定目录下的文件
5.通过url抓取页面,经过各种处理后,返回字符串
"""
class Tool:
    path = os.getcwd()+os.sep +'news'+os.sep
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Referer':'http://www.google.com',
    }
    def replace(self,content):
        r = re.compile(r'<script.*?</script>',re.I|re.M|re.S) #remove JavaScript
        s = r.sub('',content)
        r = re.compile(r'<style.*?</style>',re.I|re.M|re.S) #remove css
        s = r.sub('', s)
        r = re.compile(r'<!--.*?-->', re.I|re.M|re.S) #remove comments
        s = r.sub('',s)
        r = re.compile(r'<meta.*?>', re.I|re.M|re.S) #remove meta tags
        s = r.sub('',s)
        r = re.compile(r'<ins.*?</ins>', re.I|re.M|re.S) #remove ins tags
        s = r.sub('',s)
        r = re.compile(r'<[^>]+>',re.M|re.S)
        s = r.sub('',s).strip() #strip the remaining HTML tags
        r = re.compile(r'^\s+$', re.M|re.S) #remove whitespace-only lines
        s = r.sub('', s)
        r = re.compile(r'\n+',re.M|re.S) #collapse consecutive newlines into one
        s = r.sub('\n',s)
        r = re.compile(r'\s+',re.I|re.M|re.S)
        s = r.sub('\n',s)
        return s
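    #Note that the final r'\s+' substitution also converts ordinary spaces, so e.g.
    #   tool.replace('<p>Hello <b>world</b></p><script>x()</script>')
    #yields 'Hello\nworld': every whitespace run becomes a single newline.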
    #decompress a gzip-compressed page
    def gzdecode(self,data):
        charset = chardet.detect(data)['encoding']
        if charset is None:
            charset = 'utf-8'
        if charset.lower() == 'gb2312': #gb18030 is a superset of gb2312 and decodes more characters
            charset = 'gb18030'
        try:
            html = gzip.decompress(data).decode(charset)
        except OSError: #not actually gzipped: decode the raw bytes
            html = data.decode(charset)
        return html
    def save(self,data,option='w',name='out'):
        path = self.path
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path+name+'.txt',option,encoding='utf-8') as f:
            f.write(data)
    def remove(self,name):
        path = self.path+name+'.txt'
        if os.path.exists(path):
            print('remove...')
            os.remove(path)
    def fetchPage(self,url):
        page = ''
        for i in range(3): #retry up to three times on timeout
            try:
                socket.setdefaulttimeout(5)
                req = request.Request(url,headers=self.headers)
                page = request.urlopen(req).read()
                page = self.gzdecode(page)
                return page
            except request.HTTPError as e:
                print(e.reason)
                return ''
            except request.URLError as e:
                print(e.reason)
                return ''
            except socket.timeout:
                if i < 2:
                    continue
                else:
                    print('timeout')
                    return ''
            except Exception as e:
                print('other error:',e)
                return ''
if __name__ == '__main__':
tool = Tool()
spider = MySpider(urls)
spider.crawl(10)
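#Running this module crawls breadth-first to depth 10: extracted articles are
#written to ./news/<counter>.txt, dropped urls are appended to ./news/dropedUrls.txt,
#and pages linking to 10+ interesting children are logged in ./news/important.txt.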