-
Notifications
You must be signed in to change notification settings - Fork 1
/
reuters.go
100 lines (90 loc) · 2.94 KB
/
reuters.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
package crawler
import (
"fmt"
"regexp"
"strings"
"github.com/go-some/txtanalyzer"
"github.com/gocolly/colly"
)
// Reuters is the crawler for reuters.com. It carries no state of its own;
// all of the work happens in its Run method.
type Reuters struct{}
// Run crawls the Reuters finance and business-news sections and hands every
// article it scrapes to wtr.
//
// Two collectors cooperate: a listing collector that follows links (bounded by
// MaxDepth and the URL filters below) and an article collector that scrapes
// the individual article pages. Each article is written as soon as it has been
// parsed, via wtr.WriteDocs; the outcome of every write is printed to stdout.
// Run returns nothing, so a failure to start the crawl is printed as well.
func (rc *Reuters) Run(wtr DocsWriter) {
	// Listing collector: only the finance landing page and the business-news
	// archive are allowed, at most two levels deep.
	c := colly.NewCollector(
		colly.MaxDepth(2),
		colly.URLFilters(
			regexp.MustCompile(`https://www\.reuters\.com/finance`),
			regexp.MustCompile(`https://www\.reuters\.com/news/archive/businessnews.+`),
		),
		// Anything below /finance/ is excluded again.
		colly.DisallowedURLFilters(
			regexp.MustCompile(`https://www\.reuters\.com/finance/.+`),
		),
	)
	c.AllowURLRevisit = false

	// A second collector scrapes the article pages themselves, so the
	// listing collector's URL filters and depth limit do not apply to them.
	articleCollector := colly.NewCollector()

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link == "" {
			// The href could not be resolved to an absolute URL.
			return
		}

		// Visit errors are discarded on purpose: while following every link
		// on a page, rejections (already visited, filtered out, depth limit
		// reached) are the normal case rather than failures.
		if strings.Contains(link, "reuters.com/article") {
			// Article page: hand it to the article collector.
			_ = articleCollector.Visit(link)
			return
		}
		// Any other page: keep following links. The visit has to go through
		// e.Request.Visit so that MaxDepth is honoured.
		_ = e.Request.Visit(link)
	})

	// Representative image (og:image) of the article page being scraped,
	// stored together with the URL of the page it was read from.
	pageURL := ""
	imgSrc := ""
	// The og:image meta tag is expected in the document head, so the lookup is
	// anchored at <html>: ChildAttr only searches descendants of the matched
	// element and an anchor at <body> would not see tags inside <head>.
	articleCollector.OnHTML("html", func(e *colly.HTMLElement) {
		pageURL = e.Request.URL.String()
		imgSrc = e.ChildAttr(`meta[property="og:image"]`, "content")
	})

	articleCollector.OnHTML("div.StandardArticle_inner-container", func(e *colly.HTMLElement) {
		// Leaf of the crawl: parse the article and store it right away.
		// Per the original note, duplicate detection is left to WriteDocs.
		articleURL := e.Request.URL.String()
		date := DateParser(e.ChildText(".ArticleHeader_date"))

		// Keep the image only if it was recorded for this very page and
		// CheckGraphImage accepts it; otherwise store no image at all.
		var (
			articleImg  string
			hasGraphImg bool
		)
		if pageURL == articleURL && CheckGraphImage(imgSrc) {
			articleImg = imgSrc
			hasGraphImg = true
		}

		title := e.ChildText(".ArticleHeader_headline")
		body := e.ChildText("div.StandardArticleBody_body")

		// Entity lists and the summary are produced by txtanalyzer from the
		// title and body text.
		entitiesInTitle, personList, orgList, prodList := txtanalyzer.NEROnDoc(title, body)
		bodySum := txtanalyzer.SumOnDoc(title, body)

		doc := News{
			Title:           title,
			Body:            body,
			Time:            date,
			Url:             articleURL,
			Origin:          "reuters",
			ImgUrl:          articleImg,
			HasGraphImg:     hasGraphImg,
			EntitiesInTitle: entitiesInTitle,
			PersonList:      personList,
			OrgList:         orgList,
			ProdList:        prodList,
			BodySum:         bodySum,
		}

		cnt, err := wtr.WriteDocs([]News{doc})
		if err != nil {
			fmt.Println(err)
			return
		}
		fmt.Println(cnt, "docs saved")
	})

	// Start from the finance landing page; report a failed start instead of
	// returning silently.
	if err := c.Visit("https://www.reuters.com/finance"); err != nil {
		fmt.Println(err)
	}
}