-
Notifications
You must be signed in to change notification settings - Fork 0
/
event_crawling.py
150 lines (120 loc) · 4.11 KB
/
event_crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import requests
import os
import re
from bs4 import BeautifulSoup
def get_html(url, timeout=10):
    """
    Download the page at ``url`` and return its body as text.

    param url -> address of the page to fetch / str
    param timeout -> seconds to wait for the server before giving up / number
    return str
    """
    # requests never times out on its own; without an explicit timeout a
    # stalled connection would block the crawler indefinitely.
    response = requests.get(url, timeout=timeout)
    return response.text
def split_event_html(html, start=6, count=3):
    """
    Find the event part of the page and return the list items inside it.

    The page is cut at every ``<h2`` marker; ``count`` consecutive chunks
    beginning at chunk index ``start`` are joined and parsed. The defaults
    (6 and 3) select the same three chunks that used to be hard-coded.

    param html -> html of the page / str
    param start -> index of the first ``<h2`` chunk to keep / int
    param count -> number of consecutive chunks to keep / int
    return soup Object List -> every <li> element found in the kept chunks
    """
    sections = html.split('<h2')
    # Slicing, unlike indexing, tolerates a page with fewer chunks than
    # expected: the result is then shorter (possibly empty) rather than an
    # IndexError.
    event_html = ''.join(sections[start:start + count])
    soup = BeautifulSoup(event_html, 'html.parser')
    return soup.find_all("li")
def find_day_by_body(body):
    """
    Find the dates of an event inside its date element.

    The element is rendered to a string and cut at every '.'; each pair of
    neighbouring pieces is handed to find_day_by_stub, which reads the
    month from the end of the left piece and the day from the start of the
    right piece.

    expected pieces
    '<li>...: 02', ' 04(...) 14:00 / 02', ' 05(...) 14:00</li>'

    param body -> date element of the event / soup Object
    return -> arr of str, dates in reverse order of appearance, always
              followed by two "0" placeholders
        arr[0] = due_day
        arr[1] = start_day ("0" when only one date was found)
    """
    pieces = str(body).split('.')
    found = []
    for mon_stub, day_stub in zip(pieces, pieces[1:]):
        try:
            found.append(find_day_by_stub(mon_stub, day_stub))
        except IndexError:
            # A '.' with no digits next to it is not part of a date; it
            # must not abort the whole crawl, so the pair is ignored.
            continue
    # Last date first, then the placeholders callers index into.
    found.reverse()
    return found + ["0", "0"]
def find_day_by_stub(mon_stub, day_stub):
    """
    Build a month+day string from the text on both sides of a '.'.

    The month is the first run of digits in the last three characters of
    ``mon_stub``; the day is the first run of digits in the first three
    characters of ``day_stub``. Errors raised by get_number_by_string for
    a stub without digits are not caught here.

    param mon_stub -> text ending with the month / str
    param day_stub -> text starting with the day / str
    return -> month followed by day, e.g. "0204" / str
    """
    month = get_number_by_string(mon_stub[-3:])
    days = get_number_by_string(day_stub[:3])
    # Pad each part to two digits: "2" and "4" must give "0204", not "24",
    # because callers convert the result to a month*100 + day integer.
    return month.zfill(2) + days.zfill(2)
def get_number_by_string(str: str):
    """
    Return the first run of decimal digits contained in the given text.

    param str -> text to search; the name shadows the builtin but is kept
                 so existing callers keep working / str
    return -> the digits exactly as written, e.g. "02" / str
    raises IndexError -> when the text contains no digit at all
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    # re.search stops at the first hit instead of collecting every match.
    match = re.search(r"\d+", str)
    if match is None:
        # Same exception type the former ``findall(...)[0]`` produced.
        raise IndexError(f"no digits found in {str!r}")
    return match.group()
def get_event_script(event):
    """
    Collect the displayable fields of one event entry.

    param event -> one event entry / soup Object
    return arr(str)
        arr[0] = title (text of the <strong> element)
        arr[1] = link (href of the first <a> inside the title)
        arr[2] = date (text of the third nested <li>)
        arr[3] = host (text of the second nested <li>)
        arr[4] = due
        arr[5] = start
    """
    nested_items = event.findAll("li")
    title_tag = event.find("strong")

    # The link lives on the first anchor inside the title element.
    first_anchor = title_tag.select("a")[0]
    link = first_anchor.attrs['href']

    date_item = nested_items[2]
    date = date_item.text
    host = nested_items[1].text

    # find_day_by_body yields the due day first, then the start day.
    day_info = find_day_by_body(date_item)
    return [title_tag.text, link, date, host, day_info[0], day_info[1]]
def is_activate_event(first_date, second_date, today) -> bool:
    """
    Tell whether an event is still open and starts within about a month.

    All dates are month*100 + day values (e.g. 206 for Feb 6). They carry
    no year, so an event whose start is numerically larger than its end is
    taken to run across New Year.

    param first_date -> end date of the event / string
    param second_date -> start date of the event, '0' when there is none
                         (the end date is then used as the start) / string
    param today -> current date, the reference for filtering / int
    return bool -> True when today is not past the end date and the start
                   date is at most 100 (one month in this encoding) ahead
    """
    event_end_date = int(first_date)
    if second_date == '0':
        event_start_date = event_end_date
    else:
        event_start_date = int(second_date)

    if event_start_date > event_end_date:
        # The event wraps into the next year. Only a `today` that already
        # lies in the new-year part (on or before the un-shifted end date)
        # belongs on the shifted scale; shifting a December `today` too
        # would push it past the end date and hide a running event.
        if today <= event_end_date:
            today += 1200
        event_end_date += 1200

    date_range = today + 100
    return today <= event_end_date and date_range >= event_start_date
def content_list(script_title, events, today):
    """
    Extract the event data and arrange it as the body of an issue.

    param script_title -> heading placed on the first line of the body / str
    param events -> list of events, junk entries are present / soup Object List
    param today -> current date, the reference for dropping closed events / int
    return str
    """
    parts = [f"{script_title} \n \n"]
    for event in events:
        # get_event_script reads the second and third nested <li>, so an
        # entry with fewer than three of them is junk and is skipped
        # instead of raising IndexError.
        if len(event.find_all("li")) < 3:
            continue
        title, link, date, host, due, start = get_event_script(event)
        if is_activate_event(due, start, today):
            parts.append(f"[{title}]({link})\n -{date}\n -{host} <br/>\n ")
    return "".join(parts)
def __main__():
    """
    Crawl the Dev-Event page and print the events relevant for today.
    """
    # Local import: the module does not import datetime at the top.
    from datetime import date

    url = 'https://github.com/brave-people/Dev-Event'
    # Current date as an int in month*100 + day form (e.g. 206 for Feb 6),
    # the encoding is_activate_event compares against.
    current = date.today()
    date_now = current.month * 100 + current.day
    html = get_html(url)
    event = split_event_html(html)
    print(content_list("테스트 공지", event, date_now))


if __name__ == '__main__':
    __main__()