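# sectorCodeCrawler.py
"""Crawl sector classifications for US tickers from nasdaq.com symbol pages.

Forks a pool of worker processes, each of which fetches its share of tickers,
writes a per-worker CSV under sector_code/, and exits; the parent then merges
the per-worker files into sectorCode.csv.
"""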
import os
import random
import shutil
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# NASDAQ sector names -> internal sector codes. The original mapping used
# inconsistent casing ('consumerStaples', 'consumerDiscretionary'); the codes
# are normalized here so downstream grouping is not split by case.
SECTOR_CODES = {
    'ETF': 'ETF',
    'Finance': 'Financials',
    'Basic Industries': 'Materials',
    'Capital Goods': 'Industrials',
    'Consumer Durables': 'ConsumerDiscretionary',
    'Consumer Non-Durables': 'ConsumerStaples',
    'Consumer Services': 'ConsumerDiscretionary',
    'Energy': 'Energy',
    'Health Care': 'HealthCare',
    'Miscellaneous': 'Unknown',
    'Public Utilities': 'Utilities',
    'Technology': 'InformationTechnology',
    'Transportation': 'Industrials',
}


def getSectorCode(str_sectorName):
    """Map a NASDAQ sector name to an internal sector code ('Unknown' if unmapped)."""
    return SECTOR_CODES.get(str_sectorName, 'Unknown')
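
# Example: getSectorCode('Finance') -> 'Financials'; unmapped names such as
# 'No Such Sector' fall back to 'Unknown' instead of returning None.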

def crawl_sector_code(ticker, list_dict_sectorCode):
    """Fetch the nasdaq.com symbol page for ticker and append its sector rows.

    Any download failure is recorded with sector 'Unknown', so every ticker
    appears in the output at least once.
    """
    # Jitter requests by up to 3 seconds to avoid hammering the site.
    time.sleep(random.random() * 3)
    url = "http://www.nasdaq.com/symbol/%s" % ticker
    try:
        res = requests.get(url, timeout=10)
        if res.status_code != requests.codes.ok:
            print("ERROR: cannot download %s; status_code: %d" % (ticker, res.status_code))
            list_dict_sectorCode.append({'AbbrevSymbol': ticker, 'SectorCodeName': 'Unknown', 'SectorCode': 'Unknown'})
            return
    except requests.RequestException as e:
        print("%s exception: %s" % (ticker, e))
        list_dict_sectorCode.append({'AbbrevSymbol': ticker, 'SectorCodeName': 'Unknown', 'SectorCode': 'Unknown'})
        return
    soup = BeautifulSoup(res.text, "html.parser")
    label = soup.find(id='qbar_sectorLabel')
    if label is None:
        # Page downloaded but carries no sector label; record as Unknown
        # rather than silently dropping the ticker.
        list_dict_sectorCode.append({'AbbrevSymbol': ticker, 'SectorCodeName': 'Unknown', 'SectorCode': 'Unknown'})
    else:
        entries = label.find_all('a')
        if not entries:
            # A quote bar with no sector links is how NASDAQ renders ETFs.
            list_dict_sectorCode.append({'AbbrevSymbol': ticker, 'SectorCodeName': 'ETF', 'SectorCode': 'ETF'})
        for entry in entries:
            str_sectorName = entry.get_text()
            str_sectorCode = getSectorCode(str_sectorName)
            list_dict_sectorCode.append({'AbbrevSymbol': ticker, 'SectorCodeName': str_sectorName, 'SectorCode': str_sectorCode})
    print("sector code for %s has been processed." % ticker)
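
# Optional hardening (a sketch, not part of the original script): retry
# transient HTTP failures with a linear backoff before giving up on a ticker.
# The max_retries and backoff values below are illustrative assumptions.
def fetch_with_retries(url, max_retries=3, backoff=2.0):
    """Return a Response for url, or None if every attempt fails."""
    for attempt in range(max_retries):
        try:
            res = requests.get(url, timeout=10)
            if res.status_code == requests.codes.ok:
                return res
        except requests.RequestException:
            pass
        # Wait a little longer after each failed attempt.
        time.sleep(backoff * (attempt + 1))
    return None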

def crawl_all_stocks():
    print("sector code crawler starts processing...")
    if not os.path.exists('sector_code'):
        os.mkdir('sector_code')
    int_numOfProcess = 20
    list_int_children = []
    df_metadata = pd.read_csv("/etc/rq/hd/Instruments/latest/us/US_Instruments.csv")
    # Use a local copy of the instrument list for debugging purposes only:
    # df_metadata = pd.read_csv("US_Instruments.csv")
    int_len_of_meta = len(df_metadata.AbbrevSymbol)
    print("length of metadata is %d." % int_len_of_meta)
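    # Fan out across int_numOfProcess forked workers. Worker i handles the
    # tickers whose row index j satisfies j % int_numOfProcess == i, writes
    # its results to sector_code/i.csv, and exits without returning here.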
    for i in range(int_numOfProcess):
        pid = os.fork()
        if pid:
            # Parent: remember the child pid so we can wait on it later.
            print("child pid is %d" % pid)
            list_int_children.append(pid)
        else:
            # Child: crawl this worker's slice and write it out.
            list_dict_sectorCode = []
            print("sub_process %d starts." % i)
            for j in range(int_len_of_meta):
                if j % int_numOfProcess == i:
                    crawl_sector_code(df_metadata['AbbrevSymbol'].iloc[j], list_dict_sectorCode)
                    print("%d stocks have been processed." % j)
            df = pd.DataFrame(list_dict_sectorCode)
            df.to_csv('sector_code/%d.csv' % i, index=False)
            # os._exit() skips interpreter cleanup, which is what we want in a
            # forked child; a plain sys.exit() could run the parent's exit hooks.
            os._exit(0)
    # Parent: wait for all workers, then merge their per-worker CSVs.
    for child in list_int_children:
        os.waitpid(child, 0)
    list_df_sectorCode = []
    for file_ in os.listdir('sector_code'):
        df_temp = pd.read_csv('sector_code/' + file_)
        list_df_sectorCode.append(df_temp)
    df_out = pd.concat(list_df_sectorCode)
    df_out.to_csv("sectorCode.csv", index=False)
    # The per-worker directory is only an intermediate; remove it.
    shutil.rmtree('sector_code')
    print("all sector codes are crawled.")
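
# A more portable alternative (a sketch, assuming the same nasdaq.com pages
# are being fetched): multiprocessing.Pool distributes tickers across workers
# without manual os.fork() bookkeeping, and also runs on platforms that lack
# fork(). crawl_one_ticker is a hypothetical helper introduced here so each
# task returns its rows instead of appending to a shared list.
def crawl_one_ticker(ticker):
    rows = []
    crawl_sector_code(ticker, rows)
    return rows


def crawl_all_stocks_pool(csv_path="US_Instruments.csv", num_workers=20):
    from multiprocessing import Pool
    df_metadata = pd.read_csv(csv_path)
    with Pool(num_workers) as pool:
        nested = pool.map(crawl_one_ticker, df_metadata['AbbrevSymbol'])
    # Flatten the per-ticker row lists and write a single CSV.
    rows = [row for chunk in nested for row in chunk]
    pd.DataFrame(rows).to_csv("sectorCode.csv", index=False)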

if __name__ == '__main__':
    crawl_all_stocks()