Sourcery refactored python3 branch (#179)

Branch `python3` refactored by [Sourcery](https://sourcery.ai/github/).

If you're happy with these changes, merge this Pull Request using the
*Squash and merge* strategy.

See our documentation
[here](https://docs.sourcery.ai/GitHub/Using-Sourcery-for-GitHub/).

<details>
<summary>Run Sourcery locally</summary>
<p>
Shorten the feedback loop during development by using the Sourcery editor
plugin:
</p>
<ul>
<li><a href="https://sourcery.ai/download/?editor=vscode">VS
Code</a></li>
<li><a
href="https://sourcery.ai/download/?editor=pycharm">PyCharm</a></li>
</ul>
</details>

<details>
<summary>Review changes via command line</summary>
<p>To manually merge these changes, make sure you're on the
<code>python3</code> branch, then run:</p>
<pre>
git fetch origin sourcery/python3
git merge --ff-only FETCH_HEAD
git reset HEAD^
</pre>
</details>

Help us
[improve](https://research.typeform.com/to/j06Spdfr?type=branch_refactor&github_login=elsiehupp&base_repo=https%3A%2F%2Fgithub.com%2Fmediawiki-client-tools%2Fmediawiki-scraper.git&base_remote_ref=python3&base_ref=python3&base_sha=6d044c0c62c509751f57dfcb8edeca0906a974ab&head_repo=https%3A%2F%2Fgithub.com%2Fmediawiki-client-tools%2Fmediawiki-scraper.git&head_ref=sourcery%2Fpython3)
this pull request!

---------

Co-authored-by: Sourcery AI <>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
sourcery-ai[bot] and pre-commit-ci[bot] authored Aug 29, 2023
1 parent 6d044c0 commit 69cb2eb
Showing 54 changed files with 552 additions and 970 deletions.
10 changes: 5 additions & 5 deletions resources/listsofwikis/mediawiki/checkalive.py
@@ -39,18 +39,18 @@ def checkcore(api):
raw = urllib.request.urlopen(req, None, delay).read()
except URLError as reason: # https://docs.python.org/3/library/urllib.error.html
if reason.isinstance(HTTPError):
print(api + "is dead or has errors because:")
print(f"{api}is dead or has errors because:")
print(
"Error code "
+ HTTPError.code
+ ": "
+ BaseHTTPRequestHandler.responses[HTTPError.code].shortmessage
)
print(BaseHTTPRequestHandler.responses[HTTPError.code].longmessage)
print("Reason: " + HTTPError.reason)
print(f"Reason: {HTTPError.reason}")
print("HTTP Headers:\n" + HTTPError.headers)
else:
print(api + "is dead or has errors because:" + reason)
print(f"{api}is dead or has errors because:{reason}")
return
# RSD is available since 1.17, bug 25648
rsd = re.search(
@@ -69,7 +69,7 @@ def checkcore(api):
if "This is an auto-generated MediaWiki API documentation page" in raw:
printapi(api)
elif rsd and rsd.group(1):
api = "http:" + rsd.group(1)
api = f"http:{rsd.group(1)}"
printapi(api)
elif feed and feed.group(1) and domain and domain.group(1):
index = domain.group(1) + feed.group(1)
@@ -90,7 +90,7 @@ def check(apis):

apis = []
for api in open("wikistocheck.txt").read().strip().splitlines():
if not api in apis:
if api not in apis:
apis.append(api)
if len(apis) >= limit:
check(apis)
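
Two patterns recur throughout this file (and most of the diff): string concatenation replaced by f-strings, and `not x in y` rewritten as the idiomatic `x not in y`. The sketch below illustrates the equivalence with made-up values; it is not code from the repository.

```python
# Hypothetical API URL, used only to demonstrate the refactoring patterns.
api = "https://example.org/w/api.php"

# Old style: string concatenation.
message_old = api + " is dead or has errors because:"

# New style: f-string, as produced by the refactor.
message_new = f"{api} is dead or has errors because:"

assert message_old == message_new

# `not api in apis` and `api not in apis` are equivalent,
# but the latter is the idiomatic (PEP 8) spelling that Sourcery emits.
apis = []
if api not in apis:
    apis.append(api)
print(apis)
```
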
7 changes: 3 additions & 4 deletions resources/listsofwikis/mediawiki/fandom-spider.py
@@ -37,10 +37,10 @@ def main():
wikis = []
for lvl3 in tqdm(map_lvl3):
time.sleep(0.3)
req = requests.get("https://community.fandom.com%s" % lvl3)
req = requests.get(f"https://community.fandom.com{lvl3}")
if req.status_code != 200:
time.sleep(5)
req = requests.get("https://community.fandom.com%s" % lvl3)
req = requests.get(f"https://community.fandom.com{lvl3}")
wikis.extend(
[
wiki.replace("http://", "https://")
@@ -50,8 +50,7 @@ def main():
]
)

wikis = list(set(wikis))
wikis.sort()
wikis = sorted(set(wikis))
with open("fandom.com", "w") as f:
for wiki in wikis:
f.write(parse.urljoin(wiki, "api.php") + "\n")
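
Several of the spiders in this commit collapse the two-step "deduplicate, then sort in place" idiom into a single `sorted(set(...))` call. A minimal, self-contained sketch of why the two forms produce the same list (the URLs are invented):

```python
# Invented sample data containing a duplicate.
wikis = [
    "https://b.example.fandom.com",
    "https://a.example.fandom.com",
    "https://b.example.fandom.com",
]

# Old approach: build a deduplicated list, then sort it in place.
deduped = list(set(wikis))
deduped.sort()

# New approach: sorted() accepts any iterable and returns a new sorted list,
# so deduplication and ordering happen in one expression.
wikis_sorted = sorted(set(wikis))

assert deduped == wikis_sorted == [
    "https://a.example.fandom.com",
    "https://b.example.fandom.com",
]
```
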
3 changes: 1 addition & 2 deletions resources/listsofwikis/mediawiki/miraheze-spider.py
@@ -57,8 +57,7 @@ def main():
)
)

wikis = list(set(wikis))
wikis.sort()
wikis = sorted(set(wikis))
with open("miraheze.org", "w") as f:
for wiki in wikis:
f.write(urljoin(wiki, "w/api.php") + "\n")
3 changes: 1 addition & 2 deletions resources/listsofwikis/mediawiki/neoseeker-spider.py
@@ -29,8 +29,7 @@ def main():
raw = r.text
m = re.findall(r"<li><a href=\'([^>]+?)/wiki/\'>", raw)
m = [w.replace("http://", "https://") + "/w/api.php" for w in m]
m = list(set(m))
m.sort()
m = sorted(set(m))
with open("neoseeker.com", "w") as f:
f.write("\n".join(m))

2 changes: 1 addition & 1 deletion resources/listsofwikis/mediawiki/orain-spider.py
@@ -29,7 +29,7 @@ def main():
raw = r.text
m = re.findall(r'<tr><td><a href="//([^>]+?)/">[^<]+</a></td></tr>', raw)
for i in m:
print("http://" + i + "/w/api.php")
print(f"http://{i}/w/api.php")


if __name__ == "__main__":
11 changes: 3 additions & 8 deletions resources/listsofwikis/mediawiki/shoutwiki-spider.py
@@ -44,9 +44,7 @@ def main():
json = requests.get(url, params=params, headers=headers).json()
gcont = json["continue"]["gcmcontinue"] if "continue" in json else ""
query = json["query"]["pages"]
for wiki in query:
ids.append(wiki)

ids.extend(iter(query))
# grab wiki API
params = {
"action": "query",
@@ -64,15 +62,12 @@ def main():
for val in wiki["revisions"][0]["slots"]["main"]["content"].split("\n|"):
if "subdomain" in val:
wikis.append(
"http://%s.shoutwiki.com/w/api.php"
% val.split("subdomain =")[-1].strip()
f'http://{val.split("subdomain =")[-1].strip()}.shoutwiki.com/w/api.php'
)
break

time.sleep(0.3)
wikis = list(set(wikis))
wikis.sort()

wikis = sorted(set(wikis))
with open("shoutwiki.com", "w") as f:
f.write("\n".join(wikis))

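
The per-key `append` loop over the API's page listing becomes a single `extend` call here. `list.extend` accepts any iterable, and iterating a dict yields its keys, so the result is the same; the `iter()` wrapper Sourcery inserts is harmless but not strictly necessary. A sketch with a made-up `query` mapping (not the real ShoutWiki API response):

```python
# Stand-in for the API's "pages" mapping (page id -> page data).
query = {"101": {"title": "Main Page"}, "102": {"title": "Help"}}

# Old approach: append each key one at a time.
ids_old = []
for wiki in query:
    ids_old.append(wiki)

# New approach: pass the iterable straight to extend().
# iter(query) and query behave identically here, since both yield the keys.
ids_new = []
ids_new.extend(iter(query))

assert ids_old == ids_new == ["101", "102"]
```
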
3 changes: 1 addition & 2 deletions resources/listsofwikis/mediawiki/wiki-site-spider.py
@@ -34,8 +34,7 @@ def main():
req = requests.get(url, headers=headers)
wikis.extend(re.findall(r'<td><a href="([^>]+?)"', req.text))

wikis = list(set(wikis))
wikis.sort()
wikis = sorted(set(wikis))
with open("wiki-site.com", "w") as f:
for wiki in wikis:
f.write(parse.urljoin(wiki, "api.php") + "\n")
48 changes: 1 addition & 47 deletions resources/listsofwikis/mediawiki/wikia.py
@@ -45,8 +45,7 @@ def getall():
# This API module has no query continuation facility
print("Getting list of active domains...")
while True:
list = getlist(wikia, offset, offset + limit)
if list:
if list := getlist(wikia, offset, offset + limit):
print(offset)
domains = dict(domains.items() + list.items())
empty = 0
@@ -69,51 +68,6 @@ def main():
# assumed to be undumped.
return

undumped = []
# Or we could iterate over each sublist while we get it?
for i in domains:
dbname = re.sub("[-_.]", "", domains[i]["domain"].replace(".wikia.com", ""))
dbname = re.escape(dbname)
print(dbname)
first = dbname[0]
# There are one-letter dbnames; the second letter is replaced by an underscore
# http://s3.amazonaws.com/wikia_xml_dumps/n/n_/n_pages_full.xml.7z
try:
second = dbname[1]
except:
second = "_"
base = (
"http://s3.amazonaws.com/wikia_xml_dumps/"
+ first
+ "/"
+ first
+ second
+ "/"
+ dbname
)
full = base + "_pages_full.xml.7z"
print(full)
current = base + "_pages_current.xml.7z"
images = base + "_images.tar"
try:
# subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list.
subprocess.call(["curl", "-I", "--fail", full])
except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22:
print("Missing: " + domains[i]["domain"])
undumped.append(domains[i]["domain"])

# try:
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', current])
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
# except:
# pass

with open("wikia.com-unarchived", "w+") as out:
out.write("\n".join(str(domain) for domain in undumped))


if __name__ == "__main__":
main()
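
In `getall()` the fetch-then-test pair becomes a single assignment expression (the `:=` "walrus" operator, available from Python 3.8). The sketch below shows the pattern with a stub in place of the real `getlist()`; names and data are illustrative, not the repository's API, and the dict merge uses `update()` rather than the script's `dict(a.items() + b.items())`:

```python
def getlist_stub(offset, limit):
    """Stand-in for getlist(): returns batches of fake domains, then an empty dict."""
    if offset >= 30:
        return {}
    return {
        f"wiki{i}": {"domain": f"w{i}.example"}
        for i in range(offset, min(offset + limit, 30))
    }


def collect_domains(limit=10):
    domains = {}
    offset = 0
    while True:
        # Old form: assign first, then test.
        #     batch = getlist_stub(offset, limit)
        #     if batch: ...
        # New form (Python 3.8+): bind and test in one expression.
        if batch := getlist_stub(offset, limit):
            domains.update(batch)  # merge in place; Python 3 friendly
            offset += limit
        else:
            break
    return domains


print(len(collect_domains()), "domains collected")  # -> 30 domains collected
```

Note that the refactored script keeps the original variable name `list`, which shadows the builtin; the sketch uses `batch` instead, but renaming is outside the scope of an automated refactor.
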
13 changes: 5 additions & 8 deletions resources/listsofwikis/wikidot/wikidot-duckduckgo.py
@@ -26,11 +26,8 @@ def main():
opener.addheaders = [("User-agent", "Mozilla/5.1")]
urllib.request.install_opener(opener)

for i in range(1, 100000):
url = "https://duckduckgo.com/html/?q={}%20{}%20site:wikidot.com".format(
random.randint(100, 5000),
random.randint(1000, 9999),
)
for _ in range(1, 100000):
url = f"https://duckduckgo.com/html/?q={random.randint(100, 5000)}%20{random.randint(1000, 9999)}%20site:wikidot.com"
print("URL search", url)
try:
html = urllib.request.urlopen(url).read().decode("utf-8")
@@ -42,16 +39,16 @@ def main():
m = re.findall(r"://([^/]+?\.wikidot\.com)", html)
wikis = []
for wiki in m:
wiki = "https://" + wiki
if not wiki in wikis:
wiki = f"https://{wiki}"
if wiki not in wikis:
wikis.append(wiki)
print(wiki)
wikis.sort()
with open("wikidot-duckduckgo.txt", "w") as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https?://www\.", "http://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
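
Both DuckDuckGo loops now name their unused counter `_`, the conventional signal that the value is intentionally ignored, and build the search URL with a single f-string instead of a chained `format()` call. A short, runnable sketch of the same pattern (three iterations only, and the query template is illustrative):

```python
import random

# `_` signals that the loop index is never used in the body.
for _ in range(3):
    # f-strings can embed arbitrary expressions, so the two random numbers
    # are interpolated directly instead of being passed to .format().
    url = (
        "https://duckduckgo.com/html/"
        f"?q={random.randint(100, 5000)}%20{random.randint(1000, 9999)}%20site:wikidot.com"
    )
    print("URL search", url)
```
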
8 changes: 4 additions & 4 deletions resources/listsofwikis/wikidot/wikidot-spider.py
@@ -30,7 +30,7 @@ def main():
with open("wikidot-spider.txt") as f:
wikis = f.read().strip().splitlines()

for i in range(1, 1000000):
for _ in range(1, 1000000):
url = random.choice(wikis)
print("URL search", url)
try:
@@ -42,16 +42,16 @@ def main():
html = urllib.parse.unquote(html)
m = re.findall(r"://([^/]+?\.wikidot\.com)", html)
for wiki in m:
wiki = "http://" + wiki
if not wiki in wikis:
wiki = f"http://{wiki}"
if wiki not in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open("wikidot-spider.txt", "w") as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https?://www\.", "http://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
10 changes: 4 additions & 6 deletions resources/listsofwikis/wikidot/wikidot-spider2.py
@@ -27,14 +27,12 @@ def main():
with open("wikidot-spider2.txt") as f:
wikis = f.read().strip().splitlines()

for i in range(1, 1000000):
for _ in range(1, 1000000):
url = random.choice(wikis)
urlrandom = (
url.endswith("/")
and (url + "random-site.php")
or (url + "/" + "random-site.php")
url.endswith("/") and f"{url}random-site.php" or f"{url}/random-site.php"
)
print("URL exploring %s" % urlrandom)
print(f"URL exploring {urlrandom}")
try:
r = requests.get(urlrandom)
except:
@@ -51,7 +49,7 @@ def main():
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https?://www\.", "http://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
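
This hunk keeps the script's pre-existing `condition and a or b` construction and only converts the operands to f-strings. That idiom predates Python's conditional expression and silently falls through to `b` whenever `a` is falsy; a plain conditional expression avoids the pitfall. A hedged sketch comparing the two forms on invented URLs:

```python
def random_site_url(url: str) -> str:
    """Conditional-expression form: unambiguous regardless of operand truthiness."""
    return f"{url}random-site.php" if url.endswith("/") else f"{url}/random-site.php"


def random_site_url_and_or(url: str) -> str:
    """The and/or form used in the script. It works here only because the
    f-string operands are always non-empty (truthy)."""
    return url.endswith("/") and f"{url}random-site.php" or f"{url}/random-site.php"


for base in ("http://example.wikidot.com", "http://example.wikidot.com/"):
    assert random_site_url(base) == random_site_url_and_or(base)
    print(random_site_url(base))
```
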
23 changes: 6 additions & 17 deletions resources/listsofwikis/wikispaces/wikispaces-duckduckgo.py
@@ -38,7 +38,7 @@ def main():
wikis.sort()
print("Loaded %d wikis from file" % (len(wikis)))

for i in range(1, 100):
for _ in range(1, 100):
random.shuffle(words)
for word in words:
print("Word", word)
@@ -51,21 +51,10 @@ def main():
)
elif r == 1:
url = "https://duckduckgo.com/html/?q=%s%%20wikispaces.com" % (word_)
elif r == 2:
url = "https://duckduckgo.com/html/?q={}%20{}%20wikispaces.com".format(
word_,
random.randint(100, 3000),
)
elif r == 3:
url = "https://duckduckgo.com/html/?q={}%20{}%20wikispaces.com".format(
random.randint(100, 3000),
word_,
)
url = f"https://duckduckgo.com/html/?q={random.randint(100, 3000)}%20{word_}%20wikispaces.com"
else:
url = "https://duckduckgo.com/html/?q={}%20{}%20wikispaces.com".format(
word_,
random.randint(100, 3000),
)
url = f"https://duckduckgo.com/html/?q={word_}%20{random.randint(100, 3000)}%20wikispaces.com"
print("URL search", url)
try:
html = urllib.request.urlopen(url).read().decode("utf-8")
@@ -75,16 +64,16 @@ def main():
html = urllib.parse.unquote(html)
m = re.findall(r"://([^/]+?\.wikispaces\.com)", html)
for wiki in m:
wiki = "https://" + wiki
if not wiki in wikis:
wiki = f"https://{wiki}"
if wiki not in wikis:
wikis.append(wiki)
wikis.sort()
print(wiki)
with open("wikispaces-duckduckgo.txt", "w") as f:
wikis2 = []
for wiki in wikis:
wiki = re.sub(r"https://www\.", "https://", wiki)
if not wiki in wikis2:
if wiki not in wikis2:
wikis2.append(wiki)
wikis = wikis2
wikis.sort()
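
Besides the f-string conversion, the `r == 2` branch disappears from this file: its body was identical to the final `else`, so folding it into the catch-all does not change which URLs get built. A small sketch of the same simplification with placeholder query strings (not the script's real templates):

```python
def build_query(r: int, word: str) -> str:
    # Before the refactor, an explicit `elif r == 2` branch returned the same
    # string as the final else, so it was redundant and has been folded away.
    if r == 1:
        return f"only-{word}"
    elif r == 3:
        return f"number-then-{word}"
    else:  # covers r == 2 and every other value
        return f"{word}-then-number"


for r in range(1, 5):
    print(r, build_query(r, "wikispaces"))
```
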