From 1523bfd8a389bf3226fb2bf1152e96eb62edd895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sun, 28 Apr 2024 11:12:12 +0800 Subject: [PATCH 1/2] =?UTF-8?q?issues=5Ffeature=5Fpost=5Fapi=5F576=20?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E9=80=9A=E8=BF=87POST=E6=96=B9=E5=BC=8F?= =?UTF-8?q?=E5=B0=86=E6=95=B0=E6=8D=AE=E6=8E=A8=E9=80=81=E5=88=B0=E8=87=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/settings.md | 12 +++++++ weibo_spider/config_sample.json | 4 +++ weibo_spider/config_util.py | 4 +-- weibo_spider/spider.py | 6 ++++ weibo_spider/writer/__init__.py | 3 +- weibo_spider/writer/post_writer.py | 57 ++++++++++++++++++++++++++++++ 6 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 weibo_spider/writer/post_writer.py diff --git a/docs/settings.md b/docs/settings.md index 41fa065f..03733670 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -239,3 +239,15 @@ MySQL和MongDB数据库的写入内容一样。程序首先会创建一个名为 - **publish_tool**:存储微博的发布工具。 + +## 设置API接口POST联动(可选) + +本部分是可选部分,如果不需要将爬取信息通过POST请求发送到指定API接口,可跳过这一步 + +请求数据格式为 `content-type : application/json`,接口响应返回也需要是 `content-type : application/json`,HTTP状态码为 `200` + +数据主体与 `write_mode` 配置的 `json` 输出格式一致,是整页获取数据json,每页POST发送一次 + +`api_url` 为指定的API接口地址 + +`api_token` 为接口鉴权TOKEN,将在 Request Headers 中添加 `api-token` 字段,根据需要配置 \ No newline at end of file diff --git a/weibo_spider/config_sample.json b/weibo_spider/config_sample.json index 330e2a10..262398d9 100644 --- a/weibo_spider/config_sample.json +++ b/weibo_spider/config_sample.json @@ -29,5 +29,9 @@ "connection_string": "mongodb://admin:password@localhost:27017/weibo", "dba_name": "", "dba_password": "" + }, + "post_config": { + "api_url": "", + "api_token": "" } } diff --git a/weibo_spider/config_util.py b/weibo_spider/config_util.py index 55e4bdd8..ba4676b3 100644 --- a/weibo_spider/config_util.py +++ 
b/weibo_spider/config_util.py @@ -85,14 +85,14 @@ def validate_config(config): sys.exit() # 验证write_mode - write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka'] + write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka','post'] if not isinstance(config['write_mode'], list): logger.warning(u'write_mode值应为list类型') sys.exit() for mode in config['write_mode']: if mode not in write_mode: logger.warning( - u'%s为无效模式,请从txt、csv、json、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode', + u'%s为无效模式,请从txt、csv、json、post、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode', mode) sys.exit() diff --git a/weibo_spider/spider.py b/weibo_spider/spider.py index dabd6f77..e0d2e41e 100644 --- a/weibo_spider/spider.py +++ b/weibo_spider/spider.py @@ -74,6 +74,7 @@ def __init__(self, config): self.sqlite_config = config.get('sqlite_config') self.kafka_config = config.get('kafka_config') self.mongo_config = config.get('mongo_config') + self.post_config = config.get('post_config') self.user_config_file_path = '' user_id_list = config['user_id_list'] if FLAGS.user_id_list: @@ -284,6 +285,11 @@ def initialize_info(self, user_config): self.writers.append(KafkaWriter(self.kafka_config)) + if 'post' in self.write_mode: + from .writer import PostWriter + + self.writers.append(PostWriter(self.post_config)) + self.downloaders = [] if self.pic_download == 1: from .downloader import (OriginPictureDownloader, diff --git a/weibo_spider/writer/__init__.py b/weibo_spider/writer/__init__.py index 5868f1ac..f6b24bd6 100644 --- a/weibo_spider/writer/__init__.py +++ b/weibo_spider/writer/__init__.py @@ -5,5 +5,6 @@ from .txt_writer import TxtWriter from .sqlite_writer import SqliteWriter from .kafka_writer import KafkaWriter +from .post_writer import PostWriter -__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter] +__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter, PostWriter] diff --git 
a/weibo_spider/writer/post_writer.py b/weibo_spider/writer/post_writer.py
new file mode 100644
index 00000000..7446fbea
--- /dev/null
+++ b/weibo_spider/writer/post_writer.py
@@ -0,0 +1,108 @@
+import codecs
+import json
+import logging
+import os
+import requests
+
+from .writer import Writer
+
+logger = logging.getLogger('spider.post_writer')
+
+class PostWriter(Writer):
+    """Writer that sends every crawled page of weibos to an HTTP API via POST.
+
+    The payload has the same layout as the ``json`` write_mode: a ``user``
+    object plus a ``weibo`` list, sent once per page.
+    """
+
+    # Retry policy and per-request timeout (seconds) used by write_weibo().
+    MAX_RETRIES = 3
+    BACKOFF_FACTOR = 2
+    REQUEST_TIMEOUT = 10
+
+    def __init__(self, post_config):
+        """Store the target URL and the optional token from ``post_config``.
+
+        Args:
+            post_config: dict with ``api_url`` (required) and ``api_token``
+                (optional).
+
+        Raises:
+            ValueError: if ``post_config`` is missing or ``api_url`` is empty.
+        """
+        if not post_config or not post_config.get('api_url'):
+            # Fail fast instead of retrying every page against an empty URL.
+            raise ValueError(
+                'post_config.api_url is required for the "post" write_mode')
+        self.post_config = post_config
+        self.api_url = post_config['api_url']
+        self.api_token = post_config.get('api_token')
+        # Set by write_user(); None until the spider has reported a user.
+        self.user = None
+
+    def write_user(self, user):
+        """Remember the user that subsequent pages of weibos belong to."""
+        self.user = user
+
+    def _update_json_data(self, data, weibo_info):
+        """Merge the user and ``weibo_info`` into ``data`` and return it."""
+        data['user'] = self.user.__dict__ if self.user is not None else {}
+        if data.get('weibo'):
+            data['weibo'] += weibo_info
+        else:
+            data['weibo'] = weibo_info
+        return data
+
+    def send_post_request_with_token(self, url, data, token, max_retries,
+                                     backoff_factor, timeout=REQUEST_TIMEOUT):
+        """POST ``data`` as JSON to ``url``, retrying with a growing delay.
+
+        Args:
+            url: API endpoint.
+            data: JSON-serializable payload.
+            token: value for the ``api-token`` header; the header is left out
+                when this is empty or None.
+            max_retries: number of extra attempts after the first failure.
+            backoff_factor: retry ``n`` waits ``backoff_factor * n`` seconds.
+            timeout: seconds to wait for the server on each attempt.
+
+        Returns:
+            The decoded JSON response, or None when every attempt failed.
+        """
+        headers = {'Content-Type': 'application/json'}
+        if token:
+            headers['api-token'] = str(token)
+        for attempt in range(max_retries + 1):
+            try:
+                response = requests.post(url, json=data, headers=headers,
+                                         timeout=timeout)
+                if response.status_code != requests.codes.ok:
+                    raise RequestException(
+                        f"Unexpected response status: {response.status_code}")
+                return response.json()
+            except (RequestException, ValueError) as e:
+                # ValueError covers a non-JSON body on requests releases whose
+                # decode error is not a RequestException.
+                if attempt < max_retries:
+                    # Wait longer after each failure to avoid rapid retries.
+                    sleep(backoff_factor * (attempt + 1))
+                    continue
+                logger.error(u'在尝试%d次发出POST连接后,请求失败:%s',
+                             max_retries + 1, e)
+        return None
+
+    def write_weibo(self, weibos):
+        """POST one page of crawled weibos to the configured API."""
+        if not weibos:
+            logger.info(u'没有获取到微博,略过API POST')
+            return
+        data = self._update_json_data({}, [w.__dict__ for w in weibos])
+        result = self.send_post_request_with_token(
+            self.api_url, data, self.api_token, self.MAX_RETRIES,
+            self.BACKOFF_FACTOR)
+        # None means every attempt failed (a bare JSON ``null`` reply is
+        # indistinguishable from that).
+        if result is None:
+            logger.warning(u'%d条微博POST发送到 %s 失败', len(weibos), self.api_url)
+        else:
+            logger.info(u'%d条微博通过POST发送到 %s', len(weibos), self.api_url)
From c0c75257a488f9448870d6b4821f217ca4560ec2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com>
Date: Sun, 28 Apr 2024 17:04:19
+0800 Subject: [PATCH 2/2] =?UTF-8?q?issues=5Ffeature=5Fpost=5Fapi=5F576=20?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E9=80=9A=E8=BF=87POST=E6=96=B9=E5=BC=8F?= =?UTF-8?q?=E5=B0=86=E6=95=B0=E6=8D=AE=E6=8E=A8=E9=80=81=E5=88=B0=E8=87=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 2 +- weibo_spider/writer/post_writer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 6e06c776..c0117d80 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -33,7 +33,7 @@ def get_long_weibo(self): # 3. 去掉所有 HTML 标签,但保留标签内的有效文本 new_content = fromstring(html_string).text_content() # 4. 替换多个连续的 \n 为一个 \n - new_content = re.sub(r'\n+', '\n', new_content) + new_content = re.sub(r'\n+\s*', '\n', new_content) weibo_content = handle_garbled(new_content) if weibo_content is not None: return weibo_content diff --git a/weibo_spider/writer/post_writer.py b/weibo_spider/writer/post_writer.py index 7446fbea..af536623 100644 --- a/weibo_spider/writer/post_writer.py +++ b/weibo_spider/writer/post_writer.py @@ -5,6 +5,8 @@ import requests from .writer import Writer +from time import sleep +from requests.exceptions import RequestException logger = logging.getLogger('spider.post_writer')