Sourcery refactored python3 branch (#179)

Branch `python3` refactored by [Sourcery](https://sourcery.ai/github/).

If you're happy with these changes, merge this Pull Request using the
*Squash and merge* strategy.

See our documentation
[here](https://docs.sourcery.ai/GitHub/Using-Sourcery-for-GitHub/).

<details>
<summary>Run Sourcery locally</summary>
<p>
Shorten the feedback loop during development by using the Sourcery editor
plugin:
</p>
<ul>
<li><a href="https://sourcery.ai/download/?editor=vscode">VS
Code</a></li>
<li><a
href="https://sourcery.ai/download/?editor=pycharm">PyCharm</a></li>
</ul>
</details>

<details>
<summary>Review changes via command line</summary>
<p>To manually merge these changes, make sure you're on the
<code>python3</code> branch, then run:</p>
<pre>
git fetch origin sourcery/python3
git merge --ff-only FETCH_HEAD
git reset HEAD^
</pre>
</details>

Help us
[improve](https://research.typeform.com/to/j06Spdfr?type=branch_refactor&github_login=elsiehupp&base_repo=https%3A%2F%2Fgithub.com%2Fmediawiki-client-tools%2Fmediawiki-scraper.git&base_remote_ref=python3&base_ref=python3&base_sha=6d044c0c62c509751f57dfcb8edeca0906a974ab&head_repo=https%3A%2F%2Fgithub.com%2Fmediawiki-client-tools%2Fmediawiki-scraper.git&head_ref=sourcery%2Fpython3)
this pull request!

---------

Co-authored-by: Sourcery AI <>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
sourcery-ai[bot] and pre-commit-ci[bot] authored Aug 29, 2023
1 parent 6d044c0 commit 69cb2eb
Showing 54 changed files with 552 additions and 970 deletions.
10 changes: 5 additions & 5 deletions resources/listsofwikis/mediawiki/checkalive.py
@@ -39,18 +39,18 @@ def checkcore(api):
raw = urllib.request.urlopen(req, None, delay).read()
except URLError as reason: # https://docs.python.org/3/library/urllib.error.html
if reason.isinstance(HTTPError):
print(api + "is dead or has errors because:")
print(f"{api}is dead or has errors because:")
print(
"Error code "
+ HTTPError.code
+ ": "
+ BaseHTTPRequestHandler.responses[HTTPError.code].shortmessage
)
print(BaseHTTPRequestHandler.responses[HTTPError.code].longmessage)
print("Reason: " + HTTPError.reason)
print(f"Reason: {HTTPError.reason}")
print("HTTP Headers:\n" + HTTPError.headers)
else:
print(api + "is dead or has errors because:" + reason)
print(f"{api}is dead or has errors because:{reason}")
return
# RSD is available since 1.17, bug 25648
rsd = re.search(
@@ -69,7 +69,7 @@ def checkcore(api):
if "This is an auto-generated MediaWiki API documentation page" in raw:
printapi(api)
elif rsd and rsd.group(1):
api = "http:" + rsd.group(1)
api = f"http:{rsd.group(1)}"
printapi(api)
elif feed and feed.group(1) and domain and domain.group(1):
index = domain.group(1) + feed.group(1)
@@ -90,7 +90,7 @@ def check(apis):

apis = []
for api in open("wikistocheck.txt").read().strip().splitlines():
if not api in apis:
if api not in apis:
apis.append(api)
if len(apis) >= limit:
check(apis)
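
Two patterns recur throughout this file (and most of the diff): string concatenation replaced by f-strings, and `not x in y` rewritten as the idiomatic `x not in y`. The sketch below illustrates the equivalence with made-up values; it is not code from the repository.

```python
# Hypothetical API URL, used only to demonstrate the refactoring patterns.
api = "https://example.org/w/api.php"

# Old style: string concatenation.
message_old = api + " is dead or has errors because:"

# New style: f-string, as produced by the refactor.
message_new = f"{api} is dead or has errors because:"

assert message_old == message_new

# `not api in apis` and `api not in apis` are equivalent,
# but the latter is the idiomatic (PEP 8) spelling that Sourcery emits.
apis = []
if api not in apis:
    apis.append(api)
print(apis)
```
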
7 changes: 3 additions & 4 deletions resources/listsofwikis/mediawiki/fandom-spider.py
@@ -37,10 +37,10 @@ def main():
wikis = []
for lvl3 in tqdm(map_lvl3):
time.sleep(0.3)
req = requests.get("https://community.fandom.com%s" % lvl3)
req = requests.get(f"https://community.fandom.com{lvl3}")
if req.status_code != 200:
time.sleep(5)
req = requests.get("https://community.fandom.com%s" % lvl3)
req = requests.get(f"https://community.fandom.com{lvl3}")
wikis.extend(
[
wiki.replace("http://", "https://")
@@ -50,8 +50,7 @@ def main():
]
)

wikis = list(set(wikis))
wikis.sort()
wikis = sorted(set(wikis))
with open("fandom.com", "w") as f:
for wiki in wikis:
f.write(parse.urljoin(wiki, "api.php") + "\n")
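
Several of the spiders in this commit collapse the two-step "deduplicate, then sort in place" idiom into a single `sorted(set(...))` call. A minimal, self-contained sketch of why the two forms produce the same list (the URLs are invented):

```python
# Invented sample data containing a duplicate.
wikis = [
    "https://b.example.fandom.com",
    "https://a.example.fandom.com",
    "https://b.example.fandom.com",
]

# Old approach: build a deduplicated list, then sort it in place.
deduped = list(set(wikis))
deduped.sort()

# New approach: sorted() accepts any iterable and returns a new sorted list,
# so deduplication and ordering happen in one expression.
wikis_sorted = sorted(set(wikis))

assert deduped == wikis_sorted == [
    "https://a.example.fandom.com",
    "https://b.example.fandom.com",
]
```
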
3 changes: 1 addition & 2 deletions resources/listsofwikis/mediawiki/miraheze-spider.py
@@ -57,8 +57,7 @@ def main():
)
)

wikis = list(set(wikis))
wikis.sort()
wikis = sorted(set(wikis))
with open("miraheze.org", "w") as f:
for wiki in wikis:
f.write(urljoin(wiki, "w/api.php") + "\n")
3 changes: 1 addition & 2 deletions resources/listsofwikis/mediawiki/neoseeker-spider.py
@@ -29,8 +29,7 @@ def main():
raw = r.text
m = re.findall(r"<li><a href=\'([^>]+?)/wiki/\'>", raw)
m = [w.replace("http://", "https://") + "/w/api.php" for w in m]
m = list(set(m))
m.sort()
m = sorted(set(m))
with open("neoseeker.com", "w") as f:
f.write("\n".join(m))

2 changes: 1 addition & 1 deletion resources/listsofwikis/mediawiki/orain-spider.py
@@ -29,7 +29,7 @@ def main():
raw = r.text
m = re.findall(r'<tr><td><a href="//([^>]+?)/">[^<]+</a></td></tr>', raw)
for i in m:
print("http://" + i + "/w/api.php")
print(f"http://{i}/w/api.php")


if __name__ == "__main__":
11 changes: 3 additions & 8 deletions resources/listsofwikis/mediawiki/shoutwiki-spider.py
@@ -44,9 +44,7 @@ def main():
json = requests.get(url, params=params, headers=headers).json()
gcont = json["continue"]["gcmcontinue"] if "continue" in json else ""
query = json["query"]["pages"]
for wiki in query:
ids.append(wiki)

ids.extend(iter(query))
# grab wiki API
params = {
"action": "query",
@@ -64,15 +62,12 @@ def main():
for val in wiki["revisions"][0]["slots"]["main"]["content"].split("\n|"):
if "subdomain" in val:
wikis.append(
"http://%s.shoutwiki.com/w/api.php"
% val.split("subdomain =")[-1].strip()
f'http://{val.split("subdomain =")[-1].strip()}.shoutwiki.com/w/api.php'
)
break

time.sleep(0.3)
wikis = list(set(wikis))
wikis.sort()

wikis = sorted(set(wikis))
with open("shoutwiki.com", "w") as f:
f.write("\n".join(wikis))

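
The per-key `append` loop over the API's page listing becomes a single `extend` call here. `list.extend` accepts any iterable, and iterating a dict yields its keys, so the result is the same; the `iter()` wrapper Sourcery inserts is harmless but not strictly necessary. A sketch with a made-up `query` mapping (not the real ShoutWiki API response):

```python
# Stand-in for the API's "pages" mapping (page id -> page data).
query = {"101": {"title": "Main Page"}, "102": {"title": "Help"}}

# Old approach: append each key one at a time.
ids_old = []
for wiki in query:
    ids_old.append(wiki)

# New approach: pass the iterable straight to extend().
# iter(query) and query behave identically here, since both yield the keys.
ids_new = []
ids_new.extend(iter(query))

assert ids_old == ids_new == ["101", "102"]
```
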
3 changes: 1 addition & 2 deletions resources/listsofwikis/mediawiki/wiki-site-spider.py
@@ -34,8 +34,7 @@ def main():
req = requests.get(url, headers=headers)
wikis.extend(re.findall(r'<td><a href="([^>]+?)"', req.text))

wikis = list(set(wikis))
wikis.sort()
wikis = sorted(set(wikis))
with open("wiki-site.com", "w") as f:
for wiki in wikis:
f.write(parse.urljoin(wiki, "api.php") + "\n")
48 changes: 1 addition & 47 deletions resources/listsofwikis/mediawiki/wikia.py
@@ -45,8 +45,7 @@ def getall():
# This API module has no query continuation facility
print("Getting list of active domains...")
while True:
list = getlist(wikia, offset, offset + limit)
if list:
if list := getlist(wikia, offset, offset + limit):
print(offset)
domains = dict(domains.items() + list.items())
empty = 0
@@ -69,51 +68,6 @@ def main():
# assumed to be undumped.
return

undumped = []
# Or we could iterate over each sublist while we get it?
for i in domains:
dbname = re.sub("[-_.]", "", domains[i]["domain"].replace(".wikia.com", ""))
dbname = re.escape(dbname)
print(dbname)
first = dbname[0]
# There are one-letter dbnames; the second letter is replaced by an underscore
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
try:
second = dbname[1]
except:
second = "_"
base = (
"http://s3.amazonaws.com/wikia_xml_dumps/"
+ first
+ "/"
+ first
+ second
+ "/"
+ dbname
)
full = base + "_pages_full.xml.7z"
print(full)
current = base + "_pages_current.xml.7z"
images = base + "_images.tar"
try:
# subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list.
subprocess.call(["curl", "-I", "--fail", full])
except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22:
print("Missing: " + domains[i]["domain"])
undumped.append(domains[i]["domain"])

# try:
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', current])
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
# except:
# pass

with open("wikia.com-unarchived", "w+") as out:
out.write("\n".join(str(domain) for domain in undumped))


if __name__ == "__main__":
main()
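
In `getall()` the fetch-then-test pair becomes a single assignment expression (the `:=` "walrus" operator, available from Python 3.8). The sketch below shows the pattern with a stub in place of the real `getlist()`; names and data are illustrative, not the repository's API, and the dict merge uses `update()` rather than the script's `dict(a.items() + b.items())`:

```python
def getlist_stub(offset, limit):
    """Stand-in for getlist(): returns batches of fake domains, then an empty dict."""
    if offset >= 30:
        return {}
    return {
        f"wiki{i}": {"domain": f"w{i}.example"}
        for i in range(offset, min(offset + limit, 30))
    }


def collect_domains(limit=10):
    domains = {}
    offset = 0
    while True:
        # Old form: assign first, then test.
        #     batch = getlist_stub(offset, limit)
        #     if batch: ...
        # New form (Python 3.8+): bind and test in one expression.
        if batch := getlist_stub(offset, limit):
            domains.update(batch)  # merge in place; Python 3 friendly
            offset += limit
        else:
            break
    return domains


print(len(collect_domains()), "domains collected")  # -> 30 domains collected
```

Note that the refactored script keeps the original variable name `list`, which shadows the builtin; the sketch uses `batch` instead, but renaming is outside the scope of an automated refactor.
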
13 changes: 5 additions & 8 deletions resources/listsofwikis/wikidot/wikidot-duckduckgo.py
@@ -26,11 +26,8 @@ def main():
opener.addheaders = [("User-agent", "Mozilla/5.1")]
urllib.request.install_opener(opener)

for i in range(1, 100000):
url = "https://duckduckgo.com/html/?q={}%20{}%20site:wikidot.com".format(
random.randint(100, 5000),
random.randint(1000, 9999),
)
for _ in range(1, 100000):
url = f"https://duckduckgo.com/html/?q={random.randint(100, 5000)}%20{random.randint(1000, 9999)}%20site:wikidot.com"
print("URL search", url)
try:
html = urllib.request.urlopen(url).read().decode("utf-8")
@@ -42,16 +39,16 @@ def main():
m = re.findall(r"://([^/]+?\.wikidot\.com)", html)
wikis = []
for wiki in m:
wiki = "https://" + wiki
if not wiki in wikis:
wiki = f"https://{wiki}"
if wiki not in wikis:
wikis.append(wiki)
print(wiki)
wikis.sort()
with open("wikidot-duckduckgo.txt", "w") as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https?://www\.", "http://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
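
Both DuckDuckGo loops now name their unused counter `_`, the conventional signal that the value is intentionally ignored, and build the search URL with a single f-string instead of a chained `format()` call. A short, runnable sketch of the same pattern (three iterations only, and the query template is illustrative):

```python
import random

# `_` signals that the loop index is never used in the body.
for _ in range(3):
    # f-strings can embed arbitrary expressions, so the two random numbers
    # are interpolated directly instead of being passed to .format().
    url = (
        "https://duckduckgo.com/html/"
        f"?q={random.randint(100, 5000)}%20{random.randint(1000, 9999)}%20site:wikidot.com"
    )
    print("URL search", url)
```
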
8 changes: 4 additions & 4 deletions resources/listsofwikis/wikidot/wikidot-spider.py
@@ -30,7 +30,7 @@ def main():
with open("wikidot-spider.txt") as f:
wikis = f.read().strip().splitlines()

for i in range(1, 1000000):
for _ in range(1, 1000000):
url = random.choice(wikis)
print("URL search", url)
try:
@@ -42,16 +42,16 @@ def main():
html = urllib.parse.unquote(html)
m = re.findall(r"://([^/]+?\.wikidot\.com)", html)
for wiki in m:
wiki = "http://" + wiki
if not wiki in wikis:
wiki = f"http://{wiki}"
if wiki not in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open("wikidot-spider.txt", "w") as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https?://www\.", "http://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
10 changes: 4 additions & 6 deletions resources/listsofwikis/wikidot/wikidot-spider2.py
@@ -27,14 +27,12 @@ def main():
with open("wikidot-spider2.txt") as f:
wikis = f.read().strip().splitlines()

for i in range(1, 1000000):
for _ in range(1, 1000000):
url = random.choice(wikis)
urlrandom = (
url.endswith("/")
and (url + "random-site.php")
or (url + "/" + "random-site.php")
url.endswith("/") and f"{url}random-site.php" or f"{url}/random-site.php"
)
print("URL exploring %s" % urlrandom)
print(f"URL exploring {urlrandom}")
try:
r = requests.get(urlrandom)
except:
@@ -51,7 +49,7 @@ def main():
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https?://www\.", "http://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
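
This hunk keeps the script's pre-existing `condition and a or b` construction and only converts the operands to f-strings. That idiom predates Python's conditional expression and silently falls through to `b` whenever `a` is falsy; a plain conditional expression avoids the pitfall. A hedged sketch comparing the two forms on invented URLs:

```python
def random_site_url(url: str) -> str:
    """Conditional-expression form: unambiguous regardless of operand truthiness."""
    return f"{url}random-site.php" if url.endswith("/") else f"{url}/random-site.php"


def random_site_url_and_or(url: str) -> str:
    """The and/or form used in the script. It works here only because the
    f-string operands are always non-empty (truthy)."""
    return url.endswith("/") and f"{url}random-site.php" or f"{url}/random-site.php"


for base in ("http://example.wikidot.com", "http://example.wikidot.com/"):
    assert random_site_url(base) == random_site_url_and_or(base)
    print(random_site_url(base))
```
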
23 changes: 6 additions & 17 deletions resources/listsofwikis/wikispaces/wikispaces-duckduckgo.py
@@ -38,7 +38,7 @@ def main():
wikis.sort()
print("Loaded %d wikis from file" % (len(wikis)))

for i in range(1, 100):
for _ in range(1, 100):
random.shuffle(words)
for word in words:
print("Word", word)
@@ -51,21 +51,10 @@ def main():
)
elif r == 1:
url = "https://duckduckgo.com/html/?q=%s%%20wikispaces.com" % (word_)
elif r == 2:
url = "https://duckduckgo.com/html/?q={}%20{}%20wikispaces.com".format(
word_,
random.randint(100, 3000),
)
elif r == 3:
url = "https://duckduckgo.com/html/?q={}%20{}%20wikispaces.com".format(
random.randint(100, 3000),
word_,
)
url = f"https://duckduckgo.com/html/?q={random.randint(100, 3000)}%20{word_}%20wikispaces.com"
else:
url = "https://duckduckgo.com/html/?q={}%20{}%20wikispaces.com".format(
word_,
random.randint(100, 3000),
)
url = f"https://duckduckgo.com/html/?q={word_}%20{random.randint(100, 3000)}%20wikispaces.com"
print("URL search", url)
try:
html = urllib.request.urlopen(url).read().decode("utf-8")
@@ -75,16 +64,16 @@ def main():
html = urllib.parse.unquote(html)
m = re.findall(r"://([^/]+?\.wikispaces\.com)", html)
for wiki in m:
wiki = "https://" + wiki
if not wiki in wikis:
wiki = f"https://{wiki}"
if wiki not in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open("wikispaces-duckduckgo.txt", "w") as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https://www\.", "https://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
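
Besides the f-string conversion, the `r == 2` branch disappears from this file: its body was identical to the final `else`, so folding it into the catch-all does not change which URLs get built. A small sketch of the same simplification with placeholder query strings (not the script's real templates):

```python
def build_query(r: int, word: str) -> str:
    # Before the refactor, an explicit `elif r == 2` branch returned the same
    # string as the final else, so it was redundant and has been folded away.
    if r == 1:
        return f"only-{word}"
    elif r == 3:
        return f"number-then-{word}"
    else:  # covers r == 2 and every other value
        return f"{word}-then-number"


for r in range(1, 5):
    print(r, build_query(r, "wikispaces"))
```
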