-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.py
101 lines (82 loc) · 2.62 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/python
from csv import reader
import time
import random
from bs4 import BeautifulSoup as bs
import pandas as pd
from login import driver
from login import Login
# Ask for the CSV base name (the '.csv' extension is appended here) and load
# every row into memory before any scraping starts.
name = input('\nInsert name of the database... ')
# The with-block closes the file handle as soon as the rows are read;
# newline='' is the mode the csv module documents for files given to reader().
with open(name + '.csv', newline='') as csv_file:
    db = list(reader(csv_file))
# Login is imported from the local `login` module and is called once, before
# any profile is fetched. NOTE(review): presumably it signs the shared
# `driver` in -- confirm in login.py.
Login()
def scroll( sleep ):
    """Scroll the page loaded in `driver` downwards for roughly `sleep` seconds.

    The page is moved 1000 pixels further down on every pass, with a
    3-second pause after each move. The loop always runs at least once and
    stops once the rounded elapsed time exceeds `sleep`.

    Args:
        sleep: Minimum number of seconds to keep scrolling.
    """
    started = time.time()
    target_y = 1000
    while True:
        # window.scrollTo takes (x, y). The horizontal coordinate stays at 0
        # so only the vertical position advances; passing the previous
        # offset as x would also push the page sideways.
        driver.execute_script(f"window.scrollTo(0, {target_y})")
        target_y += 1000
        time.sleep(3)
        if round(time.time() - started) > sleep:
            break
def _find_text(soup, tag, css_class=None, child=None):
    """Return the text of the first matching element, or '' when it is absent."""
    # class_ is only passed when a class is requested: in BeautifulSoup,
    # class_=None would match elements that have no class attribute.
    if css_class is None:
        node = soup.find(tag)
    else:
        node = soup.find(tag, class_=css_class)
    if node is not None and child is not None:
        node = node.find(child)
    return node.get_text() if node is not None else ''


def _parse_followers(text):
    """Turn text such as '1,234 followers' into an int; return 0 if it is not a number."""
    digits = text.replace(' followers', '').replace(',', '').strip()
    try:
        return int(digits)
    except ValueError:
        return 0


def GetData( url ):
    """Open `url` in the shared driver and extract the profile fields.

    The page is loaded, scrolled for a random 10-25 seconds via `scroll`,
    and the resulting page source is parsed with BeautifulSoup.

    Args:
        url: Address of the profile page to load.

    Returns:
        A dict with the keys 'name', 'title', 'company', 'location',
        'about', 'followers' and 'url'. A text field whose element is not
        found is '', and 'followers' is 0 when it is missing or cannot be
        parsed as a number.
    """
    driver.get(url)
    print(url+'...')
    ranint = random.randint(10,25)
    print('Sleeping '+str(ranint)+' seconds...')
    scroll( ranint )

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and to avoid bs4's parser warning).
    soup = bs(driver.page_source, 'html.parser')

    name = _find_text(soup, 'h1')
    # strip() removes the newline/indent padding around these values
    # without relying on exact whitespace sequences.
    title = _find_text(soup, 'div', 'text-body-medium break-words').strip()
    company = _find_text(
        soup,
        'h2',
        'pv-text-details__right-panel-item-text hoverable-link-text break-words text-body-small inline',
    ).strip()
    location = _find_text(
        soup,
        'span',
        'text-body-small inline t-black--light break-words',
    ).strip()
    about = _find_text(
        soup,
        'div',
        'pv-shared-text-with-see-more t-14 t-normal t-black display-flex align-items-center',
        child='span',
    )
    followers = _parse_followers(
        _find_text(soup, 'p', 'pvs-header__subtitle text-body-small', child='span')
    )

    # The key is spelled 'company' so it lands in the DataFrame column of
    # the same name built by the caller.
    return {
        'name': name,
        'title': title,
        'company': company,
        'location': location,
        'about': about,
        'followers': followers,
        'url': url,
    }
# Scrape every profile listed in the CSV (first column = profile URL) and
# collect the results into one DataFrame.
columns = ['name','title','company','location','about','followers','url']
rows = []
for profile in db:
    # csv.reader yields [] for a blank line; skip it instead of failing on profile[0].
    if not profile:
        continue
    rows.append(GetData(profile[0]))
# Build the frame once from the collected dicts: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df = pd.DataFrame(rows)
# Declared columns come first; any other key GetData returns is kept as a
# trailing column, which is how DataFrame.append treated unknown keys.
df = df.reindex(columns=columns + [c for c in df.columns if c not in columns])
print(df)