Commit: New version
NaiboWang committed Oct 19, 2022
1 parent 3646513 commit f125db1
Showing 33 changed files with 2,398 additions and 18 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
712 changes: 712 additions & 0 deletions ExecuteStage/ServiceWrapper_ExecuteStage.py

Large diffs are not rendered by default.

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import atexit # code to run when exiting on an error
import json
from lib2to3.pgen2 import driver
import re
import sys
from urllib import parse
@@ -64,7 +65,7 @@ def scrollDown(para, rt="")
for i in range(para["scrollCount"]):
time.sleep(1) # wait 1 second after scrolling down
Log("Wait 1 second after scrolling down")
body = browser.find_element_by_css_selector("body")
body = browser.find_element(By.CSS_SELECTOR, "body")
if para["scrollType"] == 1:
body.send_keys(Keys.PGDN)
else:
@@ -77,7 +78,7 @@ def scrollDown(para, rt="")
for i in range(para["scrollCount"]):
time.sleep(1) # wait 1 second after scrolling down
Log("Wait 1 second after scrolling down")
body = browser.find_element_by_css_selector("body")
body = browser.find_element(By.CSS_SELECTOR, "body")
if para["scrollType"] == 1:
body.send_keys(Keys.PGDN)
else:
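
For reference, the two scrollDown hunks above migrate from the deprecated find_element_by_css_selector helper (removed in Selenium 4) to the By-based locator API. A minimal standalone sketch of the new call, assuming Selenium 4 with Chrome and chromedriver available; the driver construction and URL here are illustrative and not part of this commit:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Chrome()                          # assumes chromedriver is on PATH
browser.get("https://example.com")
body = browser.find_element(By.CSS_SELECTOR, "body")  # new-style lookup used throughout this commit
body.send_keys(Keys.PAGE_DOWN)                        # one page-down scroll, as scrollDown() performs
browser.quit()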
@@ -283,6 +284,18 @@ def openPage(para, loopValue):
global links
global urlId
global history
# try:
# firstTime = True
# for handle in browser.window_handles:
# browser.switch_to.window(handle)
# if (not firstTime):
# browser.close()
# firstTime = False
# except:
# return
if len(browser.window_handles) > 1:
browser.switch_to.window(browser.window_handles[-1]) # switch to the newest tab so it can be closed
browser.close()
browser.switch_to.window(browser.window_handles[0]) # the open-page operation always starts from the first tab
history["handle"] = browser.current_window_handle
if para["useLoop"]:
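
The block added to openPage above keeps only the first tab alive before navigating: if extra window handles exist, the newest one is closed and control returns to the first handle, which history["handle"] then records. A self-contained sketch of that pattern; the URLs are placeholders and the driver setup is assumed, not taken from this diff:

from selenium import webdriver

browser = webdriver.Chrome()                                   # assumes chromedriver is on PATH
browser.get("https://example.com")
browser.execute_script("window.open('https://example.org')")   # simulate a click that opened a second tab

if len(browser.window_handles) > 1:
    browser.switch_to.window(browser.window_handles[-1])       # jump to the newest tab
    browser.close()                                            # and close it
browser.switch_to.window(browser.window_handles[0])            # continue from the first tab
browser.quit()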
@@ -306,15 +319,15 @@ def openPage(para, loopValue):
if containJudge:
global bodyText # bodyText must be refreshed after every click, input, and open-page operation
try:
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
except TimeoutException:
Log('time out after 10 seconds when getting body text: ' + url)
recordLog('time out after 10 seconds when getting body text: ' + url)
browser.execute_script('window.stop()')
time.sleep(1)
Log("获得bodytext等待1秒")
# 再执行一遍
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
rt.end()
except Exception as e:
Log(e)
@@ -340,7 +353,7 @@ def inputInfo(para, loopValue):
else:
textbox.send_keys(para["value"])
global bodyText # bodyText must be refreshed after every click, input, and open-page operation
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
rt.end()


@@ -391,15 +404,15 @@ def clickElement(para, loopElement=None, clickPath="", index=0):
if containJudge: # 有判断语句才执行以下操作
global bodyText # bodyText must be refreshed after every click, input, and open-page operation
try:
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
except TimeoutException:
Log('time out after 10 seconds when getting body text')
recordLog('time out after 10 seconds when getting body text')
browser.execute_script('window.stop()')
time.sleep(1)
Log("bodytext等待1秒")
# 再执行一遍
bodyText = browser.find_element_by_css_selector("body").text
bodyText = browser.find_element(By.CSS_SELECTOR, "body").text
rt.end()
except Exception as e:
Log(e)
@@ -594,18 +607,25 @@ def clean():
f_csv.writerow(line)
f.close()
browser.quit()
sys.exit(saveName + '.csv')


if __name__ == '__main__':
options = Options()
exe_path = "chromedriver.exe"
if os.path.exists(os.getcwd()+"/ServiceWrapper"):
print("Finding chromedriver in ServiceWrapper",
os.getcwd()+"/ServiceWrapper")
options.binary_location = "ServiceWrapper/Chrome/chrome.exe" # 指定chrome位置
exe_path = "ServiceWrapper/Chrome/chromedriver.exe"
elif os.path.exists(os.getcwd()+"/Debug"):
print("Finding chromedriver in ServiceWrapper",
os.getcwd()+"/Debug")
options.binary_location = "Debug/Chrome/chrome.exe" # 指定chrome位置
exe_path = "Debug/Chrome/chromedriver.exe"
elif os.getcwd().find("ExcuteStage") >= 0: # 如果直接执行
elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
print("Finding chromedriver in ServiceWrapper",
os.getcwd()+"/Debug")
options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
exe_path = "./Chrome/chromedriver.exe"
else:
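
The branches above only select a Chrome binary (options.binary_location) and a chromedriver path (exe_path); how they are consumed lies outside this hunk. A hedged sketch of how those two values would typically feed a driver, using the Selenium 4 Service object (older Selenium 3 code would pass executable_path directly to webdriver.Chrome); the paths shown are the ServiceWrapper ones from the first branch:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.binary_location = "ServiceWrapper/Chrome/chrome.exe"  # Chrome binary chosen by the branch
exe_path = "ServiceWrapper/Chrome/chromedriver.exe"           # matching chromedriver

browser = webdriver.Chrome(service=Service(executable_path=exe_path), options=options)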
@@ -620,17 +640,23 @@ def clean():
id = 7 # set a default value
print("id:", id)
if len(sys.argv) > 2:
backEndAddress = sys.argv[2]
else:
backEndAddress = "http://servicewrapper.naibo.wang"
if len(sys.argv) > 3:
saveName = "task_" + str(id) + "_" + sys.argv[3] # 保存文件的名字
saveName = "task_" + str(id) + "_" + sys.argv[2] # 保存文件的名字
else:
saveName = "task_" + str(id) + "_" + \
str(random.randint(0, 999999999)) # 保存文件的名字
content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
service = json.loads(content.text) # load the service definition
print("name:", service["name"])
print("saveName is:", saveName, sys.argv, len(sys.argv) > 2)
if len(sys.argv) > 3:
backEndAddress = sys.argv[3]
else:
backEndAddress = "http://servicewrapper.naibo.wang"

# content = requests.get(backEndAddress + "/backEnd/queryTask?id=" + str(id))
with open("tasks/" + str(id) + ".json", 'r') as f:
content = f.read()
service = json.loads(content)
# print(service)
# service = json.loads() # load the service definition
print("name:", service["name"])
procedure = service["graph"] # 程序执行流程
links = list(filter(isnull, service["links"].split("\n"))) # 要执行的link的列表
OUTPUT = [] # 采集的数据
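
The hunk above changes two things: saveName now comes from sys.argv[2] (with backEndAddress moved to sys.argv[3]), and the task definition is loaded from a local tasks/<id>.json file instead of the backEnd/queryTask HTTP endpoint. A minimal sketch of the new loading path; the field names are the ones read above, the id is an arbitrary example, and isnull() is the filter defined elsewhere in this script:

import json

task_id = 7                                  # e.g. python ServiceWrapper_ExecuteStage.py 7 myRun
with open("tasks/" + str(task_id) + ".json", "r") as f:
    service = json.load(f)                   # equivalent to json.loads(f.read()) in the diff

print("name:", service["name"])              # task name
procedure = service["graph"]                 # execution flow
links = service["links"].split("\n")         # raw link list, filtered with isnull() in the script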
@@ -663,4 +689,3 @@ def clean():
f.close()
SAVED = True
browser.quit()
sys.exit(0)
Binary file added ExecuteStage/all_data.xls
Binary file not shown.
151 changes: 151 additions & 0 deletions ExecuteStage/author_crawl.py
@@ -0,0 +1,151 @@
# _*_coding:utf-8_*_
from hashlib import new
import json
import os
import sys
import time
from multiprocessing import Process
import time
from datetime import datetime, timedelta
import os
import pickle
import calendar
import re
from copy import deepcopy
import requests
import csv
from commandline_config import Config
from service_invoke import invokeService


class TimeUtil(object):
@classmethod
def parse_timezone(cls, timezone):
"""
Parse a timezone offset string.
:param timezone: str, e.g. "+8"
:return: dict{symbol, offset}
"""
result = re.match(r'(?P<symbol>[+-])(?P<offset>\d+)', timezone)
symbol = result.groupdict()['symbol']
offset = int(result.groupdict()['offset'])

return {
'symbol': symbol,
'offset': offset
}

@classmethod
def convert_timezone(cls, dt, timezone="+0"):
"""默认是utc时间,需要"""
result = cls.parse_timezone(timezone)
symbol = result['symbol']

offset = result['offset']

if symbol == '+':
return dt + timedelta(hours=offset)
elif symbol == '-':
return dt - timedelta(hours=offset)
else:
raise Exception('cannot parse timezone format')


def generate_timestamp():
current_GMT = time.gmtime()
# ts stores timestamp
ts = calendar.timegm(current_GMT)

current_time = datetime.utcnow()
convert_now = TimeUtil.convert_timezone(current_time, '+8')
print("current_time: " + str(convert_now))
return str(convert_now)
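
Taken together, TimeUtil and generate_timestamp shift the current UTC time to UTC+8 and return it as a string, which main() later sanitizes into a filename. A small usage example, assuming the definitions above are in scope; the printed values are illustrative:

from datetime import datetime

offset = TimeUtil.parse_timezone("+8")               # {'symbol': '+', 'offset': 8}
local = TimeUtil.convert_timezone(datetime.utcnow(), "+8")
stamp = generate_timestamp()                         # e.g. "2022-10-17 23:35:40.881448"
filename = stamp.replace(" ", "").replace(":", "-")  # same transformation main() applies
print(offset, local, filename)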


def main():
# result = os.popen('python ServiceWrapper_ExecuteStage.py 38')
# res = result.read()
# for line in res.splitlines():
# print("\n\n\n\nfinename:\n\n\n\n\n", line)
config = {
"pages": 5,
"test": False,
"test_pages": 3,
}
c = Config(config)
print(c)
csv_reader = csv.reader(open("./关键词.csv", encoding='utf-8'))
keywords = []
i = 0
for line in csv_reader:
if i < c.test_pages:
print(line)
i += 1
keywords.append(line[0])
urlList = ""
i = 0
for keyword in keywords:
url = "https://so.toutiao.com/search?dvpf=pc&source=pagination&filter_vendor=site&keyword=%s&pd=synthesis&filter_vendor=site&action_type=pagination&page_num=0\r\n" % keyword
# print(url)
urlList += url
i += 1
if c.test and i > c.test_pages:
break
print(urlList)

# result = requests.post(
# "http://servicewrapper.naibo.wang/backEnd/invokeService",
# data={"id": 6, # serviceID
# "paras": json.dumps({"loopTimes_Loop_Click_1": c.pages,
# "urlList_0": urlList,
# }),
# })
# authorTaskID = int(result.text)
authorTaskID = invokeService(
0, {"loopTimes_Loop_Click_1": c.pages, "urlList_0": urlList})
print("authorTaskID: " + str(authorTaskID))
# exit(0)
filename = generate_timestamp().replace(" ", "").replace(":", "-")
print("filename:", filename)

command = 'python ServiceWrapper_ExecuteStage_local.py ' + \
str(authorTaskID) + ' ' + filename
result = os.system(command)

# authorTaskID = 53
file_name = "task_" + str(authorTaskID) + "_" + filename + ".csv"
# file_name = "task_53_2022-10-1723-35-40.881448.csv"
print("file_name:", file_name)
csv_reader = csv.reader(
open("./Data/"+file_name, encoding='utf-8')) # taskID
new_author_list = []
i = 0
for line in csv_reader:
# print(line)
if i > 0:
new_author_list.append(line[0])
i += 1
# print(new_author_list)
new_author_list = list(set(new_author_list)) # deduplicate

csv_reader = csv.reader(open("./author_list.csv", encoding='utf-8'))
author_list = []
for line in csv_reader:
author_list.append(line[0])
author_list = list(set(author_list)) # deduplicate

print("author_list:", author_list)
print("new_author_list:", new_author_list)

real_new_author_list = list(
set(new_author_list).difference(set(author_list)))
print("real_new_author_list:", real_new_author_list)
with open("author_list.csv", "a", encoding='utf-8', newline='') as csvfile:
writer = csv.writer(csvfile)
for row in real_new_author_list:
writer.writerow([row])



if __name__ == '__main__':
main()
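
main() treats author_list.csv as the running registry of known authors: the names scraped in this run are deduplicated with set(), compared against the registry via set.difference(), and only genuinely new names are appended. The core of that bookkeeping as a standalone sketch, with illustrative names in place of the CSV contents:

existing = ["alice", "bob"]              # previously stored author_list.csv entries (illustrative)
scraped = ["bob", "carol", "carol"]      # names pulled from the new task CSV (illustrative)

scraped = list(set(scraped))             # drop duplicates within this run
new_authors = list(set(scraped).difference(set(existing)))
print(new_authors)                       # ['carol'] -> the rows appended to author_list.csv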