Skip to content

Commit

Permalink
Merge pull request #525 from songzy12/master
Browse files Browse the repository at this point in the history
Fix the tool to run correctly when there are 2 pinned weibo.
  • Loading branch information
dataabc committed May 19, 2023
2 parents ad66af1 + 1353dd6 commit 40c505a
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 14 deletions.
8 changes: 4 additions & 4 deletions tests/testdata/url_map.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
{
"https://weibo.cn/1669879400": "tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html",
"https://weibo.cn/1669879400/profile": "tests/testdata/a4437630f3bdfa2757bae1595186ac063fe5ec25cf2f98116ece83cb.html",
"https://weibo.cn/1669879400/info": "tests/testdata/ca5f2a555e8d62f728c66fa90afb2d54d19f8c898e164204a61bdf03.html",
"https://weibo.cn/1669879400?page=1": "tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html",
"https://weibo.cn/1669879400/profile?page=1": "tests/testdata/4957814af5a123b82e974b5537dea736dfb34e48d8835203a45d2e67.html",
"https://weibo.cn/mblog/picAll/J6k49kbTc?rl=1": "tests/testdata/e97222acd5bc7d8d1bfbd3f352f8cad3e36fdd19e40b69e1c33fb3c3.html",
"https://weibo.cn/mblog/picAll/J5ZcSnCAg?rl=1": "tests/testdata/63a98849ec82b2c87ec55bca03cbf5988f7eac233a23d86b4fdf5ffd.html",
"https://weibo.cn/1669879400?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html",
"https://weibo.cn/1669879400?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html",
"https://weibo.cn/1669879400/profile?page=2": "tests/testdata/2f62165fa3ca1e85e0d398d385c377a068b76eb95765f7020ffffd3e.html",
"https://weibo.cn/1669879400/profile?page=3": "tests/testdata/d486235d4a17dd0accb0f2cc77b3648abfa03580b9e0cdb61f1e618f.html",
"https://weibo.cn/mblog/picAll/J3xfm61AZ?rl=1": "tests/testdata/76233b3f90394581aac6f19cfa5d674a610e8b442b1f83de7673ab49.html",
"https://weibo.cn/comment/J5cVGuUNq": "tests/testdata/4d5ed0a3ebd0303cb45edd544dbc0ab5e86d43e103405f0c60515884.html",
"https://weibo.cn/1980768563/photo?tf=6_008": "tests/testdata/e4d541ecb02253c14abc1d52605fc00d91279df9ac4c1465c85b91b3.html",
Expand Down
18 changes: 8 additions & 10 deletions weibo_spider/parser/page_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from .parser import Parser
from .util import handle_garbled, handle_html, to_video_download_url

MAX_PINNED_COUNT = 2

logger = logging.getLogger('spider.page_parser')


Expand Down Expand Up @@ -58,6 +60,7 @@ def __init__(self, cookie, user_config, page, filter):

def get_one_page(self, weibo_id_list):
"""获取第page页的全部微博"""
cur_pinned_count = 0
try:
info = self.selector.xpath("//div[@class='c']")
is_exist = info[0].xpath("div/span[@class='ctt']")
Expand All @@ -72,8 +75,11 @@ def get_one_page(self, weibo_id_list):
publish_time = datetime_util.str_to_time(
weibo.publish_time)

if publish_time < since_date:
if self.is_pinned_weibo(info[i]):
if publish_time < since_date:
# As of 2023-05, a profile can have at most 2 pinned weibo, and they are only
# skipped on page 1: `continue` past at most 2 too-old posts before returning.
if self.page == 1 and cur_pinned_count < MAX_PINNED_COUNT:
cur_pinned_count += 1
continue
else:
return weibos, weibo_id_list, False
Expand Down Expand Up @@ -301,14 +307,6 @@ def get_video_url(self, info):

return video_url

def is_pinned_weibo(self, info):
    """Tell whether a weibo node is a pinned ("置顶") post.

    Args:
        info: Node for a single weibo; it only needs to expose an
            ``xpath(query)`` method returning a list of text strings
            (presumably an lxml element -- confirm against the caller).

    Returns:
        bool: True when the first ``span[@class='kt']`` text found under
        ``info`` is exactly '置顶'; False when there is no such span or
        its text differs.
    """
    kt = info.xpath(".//span[@class='kt']/text()")
    # Only the first 'kt' label is inspected; an empty result means the
    # node carries no label at all, so it cannot be pinned.
    return bool(kt) and kt[0] == u'置顶'

def get_one_weibo(self, info):
"""获取一条微博的全部信息"""
try:
Expand Down

0 comments on commit 40c505a

Please sign in to comment.