From bc158523e3071991c9c8e0c05998bdc462e1ba81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sat, 27 Apr 2024 18:03:18 +0800 Subject: [PATCH 1/2] =?UTF-8?q?issues=5Fbug=5F574=20=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A=E9=95=BF?= =?UTF-8?q?=E6=96=87=EF=BC=8C=E5=B0=9D=E8=AF=95=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 28e75e3..6e06c77 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -1,8 +1,11 @@ import logging import random import requests +import re from time import sleep - +from lxml.html import tostring +from lxml.html import fromstring +from lxml import etree from .parser import Parser from .util import handle_garbled, handle_html @@ -21,11 +24,17 @@ def get_long_weibo(self): for i in range(5): self.selector = handle_html(self.cookie, self.url) if self.selector is not None: - info = self.selector.xpath("//div[@class='c']")[1] - wb_content = handle_garbled(info) - wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(wb_time)] + info_div = self.selector.xpath("//div[@class='c' and @id='M_']")[0] + info_span = info_div.xpath("//span[@class='ctt']")[0] + # 1. 获取 info_span 中的所有 HTML 代码作为字符串 + html_string = etree.tostring(info_span, encoding='unicode', method='html') + # 2. 将<br />替换为 \n + html_string = html_string.replace('<br />', '\n') + # 3. 去掉所有 HTML 标签,但保留标签内的有效文本 + new_content = fromstring(html_string).text_content() + # 4. 替换多个连续的 \n 为一个 \n + new_content = re.sub(r'\n+', '\n', new_content) + weibo_content = handle_garbled(new_content) if weibo_content is not None: return weibo_content sleep(random.randint(6, 10)) From 241d1098e708b39806b365c5e7c54fca8e813a08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sat, 27 Apr 2024 18:43:06 +0800 Subject: [PATCH 2/2] =?UTF-8?q?issues=5Fbug=5F574=20=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E8=8E=B7=E5=8F=96=E5=BE=AE=E5=8D=9A=E9=95=BF?= =?UTF-8?q?=E6=96=87=EF=BC=8C=E5=B0=9D=E8=AF=95=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 3169f24..81aa429 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -48,8 +48,13 @@ def handle_html(cookie, url): def handle_garbled(info): """处理乱码""" try: - info = (info.xpath('string(.)').replace(u'\u200b', '').encode( sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)) + if hasattr(info, 'xpath'): # 检查 info 是否具有 xpath 方法 + info_str = info.xpath('string(.)') # 提取字符串内容 + else: + info_str = str(info) # 若不支持 xpath,将其转换为字符串 + + info = info_str.replace(u'\u200b', '').encode( sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding) return info except Exception as e: logger.exception(e)