-
Notifications
You must be signed in to change notification settings - Fork 0
/
dailyScrape.py
116 lines (76 loc) · 3.76 KB
/
dailyScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 12 17:52:59 2018
@author: Kelis
"""
from bs4 import BeautifulSoup
import pandas as pd
import requests
import numpy as np
from datetime import datetime
def DailyScrape(day, month, year, box_file, first_run = False):
    """Scrape all college-basketball box scores for one calendar day from
    sports-reference.com and append them as rows to a CSV file.

    Parameters
    ----------
    day, month, year : int
        Date to scrape; substituted directly into the site's query string.
    box_file : str
        Path of the CSV file to append rows to.
    first_run : bool, optional
        When True, a header row of column names is written as well.

    Returns None. Side effects: HTTP GET requests and appending to box_file.
    """
    # 49 = 2 team names + 22 stats * 2 teams + day/month/year for the table
    # layout this site serves; vstack below fails if a game's table differs.
    overres = np.empty((0, 49))
    # Index page listing every game played on the given day.
    # Bug fix: the original URL contained a stray space in ' &year=',
    # which corrupted the query string. A timeout prevents hanging forever.
    page = requests.get(
        'http://www.sports-reference.com/cbb/boxscores/index.cgi?month='
        + str(month) + '&day=' + str(day) + '&year=' + str(year),
        timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Collect the relative URL of each game's box-score page.
    final_links = soup.find_all('td', {'class': 'right gamelink'})
    links = [cell.a["href"] for cell in final_links]
    # Loop through each individual game and pull out its box score.
    for m in links:
        page = requests.get('http://www.sports-reference.com/' + str(m),
                            timeout=30)
        soup = BeautifulSoup(page.content, 'html.parser')
        # Team names come first in the results row (presumably away then
        # home, to match the A/H column order built below — TODO confirm).
        teams = soup.find_all('a', {'itemprop': 'name'})
        results = np.empty((0, 0))
        for team in teams:
            results = np.append(results, team.string)
        # Use one table only to discover the stat column names.
        basictab = soup.find('table', {"class": "sortable"})
        columnName = [item['data-stat']
                      for item in basictab.find_all(attrs={'data-stat': True})]
        columnName = pd.unique(columnName)
        # Minutes >= 200 marks a team-total row; the first such row is the
        # end of team 1's player block. The final cell is deliberately
        # excluded, as in the original (assumed to be team 2's total).
        minutes = soup.find_all('td', attrs={'data-stat': str(columnName[2])})
        rawminutes = np.asarray(
            [int(cell.text.strip()) for cell in minutes[:-1]])
        endteam1 = np.where(rawminutes >= 200)[0]
        # Bug fix: the original tested `not endteam1[0]`, which is wrongly
        # True when the first match is index 0 (0 is falsy) and raises
        # ValueError when more than one row matches; test size explicitly.
        if endteam1.size == 0:
            continue
        endteam1 = endteam1[0]
        # For each stat column, record team 1's total row followed by the
        # last row (team 2's total), giving alternating per-team columns.
        for cName in columnName[2:]:
            stat = soup.find_all('td', attrs={'data-stat': str(cName)})
            results = np.append(results, stat[endteam1].text.strip())
            results = np.append(results, stat[-1].text.strip())
        # Append the date so each CSV row is self-describing.
        results = np.append(results, day)
        results = np.append(results, month)
        results = np.append(results, year)
        # np.matrix is deprecated; a 1-row 2-D array stacks identically.
        overres = np.vstack([overres, results.reshape(1, -1)])
    # Robustness fix: with no games scraped, the original raised NameError
    # on `columnName` when first_run=True; there is nothing to write anyway.
    if overres.shape[0] == 0:
        return
    # Add column names only the first time the file is written.
    if first_run:
        home_cols = [c + "_H" for c in columnName[2:]]
        away_cols = [c + "_A" for c in columnName[2:]]
        # Combine home and away into alternating columns to match the data.
        res_cols = [x for xs in zip(away_cols, home_cols) for x in xs]
        # Combining columns from site and manually created names.
        colnames = ["Away", "Home"] + res_cols + ["Day", "Month", "Year"]
        box_score = pd.DataFrame(overres, columns=colnames)
        box_score.to_csv(box_file, mode='a', header=True, index=False)
    else:
        box_score = pd.DataFrame(overres)
        box_score.to_csv(box_file, mode='a', header=False, index=False)
#day = 12
#month = 1
#year = 2017
#box_file = "box_scores.csv"
#first_run = True
#
#DailyScrape(day, month, year, box_file, first_run = False)