Skip to content

Commit

Permalink
Scrapper.py now creates one file per author
Browse files Browse the repository at this point in the history
Pubmed_citation.py fetches PubMed Record and updates scrapper_citations files
  • Loading branch information
DaveMorais committed Nov 10, 2018
1 parent 8a6d5ab commit d0538f4
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 24 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,4 @@ erro.txt
author.*
.idea/*
runs/*
scrapper_citations/*
37 changes: 26 additions & 11 deletions Alternative_citation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import sys
import os


def ensure_dir(dir_path):
    """Create dir_path (and any missing parents) if it does not exist.

    :param dir_path: path of the directory to guarantee
    """
    if not os.path.exists(dir_path):
        print("Creating", dir_path)
        # exist_ok guards against the race where another process creates the
        # directory between the exists() check above and this call.
        os.makedirs(dir_path, exist_ok=True)

def create_list_of_citations(file_name):
return True


def create_list_of_citations(file_name, dir_path, author):
citations = dict()
with open(file_name, 'r') as f:
for line in f:
Expand All @@ -23,25 +30,33 @@ def create_list_of_citations(file_name):
else:
citations[last_names[0]] = {name}

with open(os.path.join(dir_path, author + "_Alternative_citation.txt"), 'a') as f:
for k, v in citations.items():


for k,v in citations.items():

# Print only if there is an alternative citation
if len(v) > 1:
print(k + ' : '+ "; ".join(v))
# Print only if there is an alternative citation
if len(v) > 1:
text = k + ' : ' + '; '.join(v)
f.write(text)
f.write('\n\n')


def main():
    """CLI entry point: read a Scrapper.py output file and write the
    per-author alternative-citation file under ./alternative_citations.

    Exits with a usage message unless exactly one argument (the input
    file) is given.
    """
    if (len(sys.argv) != 2) or sys.argv[1] == "-h":
        # NOTE: a space is required at the end of "writes to " — the two
        # adjacent string literals are concatenated by the parser.
        print("Usage:\npython " + sys.argv[0] + " <File created by Scrapper.py>\n"
              "The script now creates a dir from the CWD and writes to "
              "files named after the author.")
        sys.exit()

    # All per-author output files live under ./alternative_citations.
    dir_path = os.path.join(os.getcwd(), "alternative_citations")
    ensure_dir(dir_path)

    file_name = sys.argv[1]
    # The author name is everything before the first "." of the input
    # file's base name (Scrapper.py names files "First_Last_citations.txt").
    author = os.path.basename(file_name).split('.')[0]

    create_list_of_citations(file_name, dir_path, author)


if __name__ == '__main__':
    main()
34 changes: 34 additions & 0 deletions Pubmed_citation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from util.pubmedcrawler import get_pmid
from util.util import get_titles, ensure_dir
import sys
import os
from subprocess import call


def main():
    """CLI entry point: fetch PubMed records for a list of titles and
    normalize the matching citations for one author.

    argv[1] -- file containing one article title per line
    argv[2] -- full author name (quoted, since it contains spaces)

    Unmatched titles are appended to ./pubmed_problems/<author>_pubmed_error.txt.
    """
    if (len(sys.argv) != 3) or sys.argv[1] == "-h":
        print("\nThis script fetches citations from pubmed based on "
              "their titles and parses the output\nUsage\n\npython " + sys.argv[0] + " file_name author_name\n\n")
        sys.exit()

    # Titles that could not be matched on PubMed end up under ./pubmed_problems.
    dir_path = os.path.join(os.getcwd(), "pubmed_problems")
    ensure_dir(dir_path)

    file_name = sys.argv[1]
    author = sys.argv[2]

    titles = get_titles(file_name)
    get_pmid(titles, author, dir_path)

    # TODO(review): traverse pubmed_problems and call
    # Fix_problems_with_citation.py on every *_pubmed.txt file to put those
    # citations in the right format:
    # for _, _, file_list in os.walk(dir_path):
    #     for fname in file_list:
    #         if '_pubmed.txt' in fname:
    #             print('Normalizing Pubmed Files')
    #             call(['python', os.path.join(os.getcwd(), 'Fix_problems_with_citation.py'), os.path.join(dir_path, fname)])


if __name__ == '__main__':
    main()
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,17 @@ python Alternative_citation.py <File created by Scapper.py>
```

* Pubmed_citation.py - This script fetches PubMed records based on their titles and parses them.

```
python Pubmed_citation.py file_name "author_name"
# File_name contains a list of article titles retrieved from the citations_with_problems file
# Author_name is the full author name as a string (surrounded by double quotes)
# Notes: The script updates the record in scrapper_citations/author_name.txt and creates a
# pubmed_problems/author_pubmed_error.txt
```
18 changes: 16 additions & 2 deletions Scrapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import argparse
from util.util import get_list_autor_id
from util.util import get_list_autor_id, ensure_dir
from util.crawler import get_publication
import sys
import os






def main():
Expand All @@ -16,10 +21,19 @@ def main():
help="Given a list of Lattes id extract the list of publications of an CV",
required=False)

parser.add_argument('-d', '--dir', action='store_true',
default=os.getcwd(),
help="A path to the output of scrapper. If none provide use current working dir ",
required=False)

args = parser.parse_args()
if args.pub:

dir_path = os.path.join(args.dir,"scrapper_citations")
ensure_dir(dir_path)

list_authors = get_list_autor_id(args.file)
get_publication(list_authors)
get_publication(list_authors,dir_path)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion list_author.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Each field is tab separated
# Remove the commented lines First
Author_name id
Author_name id
39 changes: 29 additions & 10 deletions util/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
import re
import sys
import time
import os
from util.util import ensure_dir


def get_publication(list_authors):
def get_publication(list_authors, dir_path):
'''
Fetch Publications from url
:param dir_path: A path to a directory where scrapper will output the citations
:param list_authors: list of tuples (name, id)
:return: True
'''
Expand All @@ -27,7 +29,11 @@ def get_publication(list_authors):
unwanted_ano = div_layout5.find('span', {'data-tipo-ordenacao': 'ano'})
unwanted_ano.extract()

_splitter(record[0], div_layout5.getText(separator=u' ').strip())

pub = _splitter(record[0], div_layout5.getText(separator=u' ').strip())

if isinstance(pub, str):
normalize_names(pub,dir_path)

time.sleep(30)
return True
Expand All @@ -39,7 +45,7 @@ def _splitter(author, pub):
:param author: full author's name string
:param pub: publication record string
:return: True
:return: a string with author and publication records
'''
record = pub.strip().split(' . ')

Expand All @@ -48,17 +54,18 @@ def _splitter(author, pub):
return True

title = record[1].split('.')
normalize_names(author + "|" + record[0] + "|" + title[0] + "| " + "".join(title[1:]))
pub = author + "|" + record[0] + "|" + title[0] + "| " + "".join(title[1:])
return pub

return True


def normalize_names(pub):
def normalize_names(pub,dir_path):
'''
Remove unwanted char from names and standardize the citation
:param pub: string in the format specified by the function splitter
:return:
:param dir_path: A path to a directory where scrapper will output the citation
:return: True
'''

# Some publication contain a new line within. It needs to be removed
Expand All @@ -76,9 +83,11 @@ def normalize_names(pub):
r'\s+DI$'
r'\s+DO$',
r'\s+E\s+',
r'JR',
r'\d+'

)

regex = re.compile("|".join(regexes), re.IGNORECASE)
record = pub.strip().split("|")

Expand Down Expand Up @@ -112,7 +121,13 @@ def normalize_names(pub):
authors[i] = " ".join(names)

record[1] = "; ".join(authors)
print("|".join(record), end="\n\n")
citation = "|".join(record)


full_name = record[0].split(' ')
with open( os.path.join(dir_path,"_".join(full_name) + "_citations.txt"), 'a') as f:
f.write(citation)
f.write("\n\n")

return True

Expand Down Expand Up @@ -174,8 +189,12 @@ def _create_initials(author_name):


def _print_problems(author, pub):
    '''
    Append a citation that could not be parsed to a per-author file under
    ./citations_with_problems so it can be inspected and fixed later.
    :param author: full author's name string
    :param pub: raw publication record string
    '''
    dir_path = os.path.join(os.getcwd(),"citations_with_problems")
    ensure_dir(dir_path)

    # File name is the author's full name with spaces replaced by underscores.
    full_name = author.split(' ')
    with open( os.path.join(dir_path,"_".join(full_name) + "_citations.txt"), 'a') as f:
        line = author + '|' + pub
        f.write(line)
        # Blank line separates records in the output file.
        f.write("\n\n")
Expand Down
82 changes: 82 additions & 0 deletions util/pubmedcrawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import requests
from bs4 import BeautifulSoup
import re
import sys
import os
from util.crawler import normalize_names


def get_pmid(references, author, dir_path):
    '''
    Fetch a full citation for each title from NCBI PubMed.

    Titles that cannot be confidently matched (fetched title does not contain
    the searched title, or any of title/co-authors/journal came back empty)
    are appended to <dir_path>/<Author_Name>_pubmed_error.txt; good matches
    are handed to normalize_names(), which appends them to the author's file
    under ./scrapper_citations.

    :param references: (list) article titles to search for
    :param author: (str) full author name
    :param dir_path: (str) path to the output dir for problem records
    :return: True
    '''
    regex = re.compile(" - PubMed - NCBI")
    comma_regex = re.compile(',')

    full_name = author.split(' ')
    # "with" guarantees the problem file is closed even if a request raises.
    with open(os.path.join(dir_path, "_".join(full_name) + "_pubmed_error.txt"), 'a') as file_prob:

        for record in references:

            len_record = list()
            data = requests.get('https://www.ncbi.nlm.nih.gov/pubmed/?term=' + record)
            soup = BeautifulSoup(data.text, 'html.parser')

            # Get and parse the title.  Prefer the result's <p class="title">
            # (an alternative title that is not the same as the search term);
            # otherwise fall back to the page's real full <title>.
            if soup.find('p', class_="title"):
                title = soup.find('p', class_="title").text
            elif soup.find("title"):
                title = soup.find("title").text
            else:
                # No title at all: use an empty string so len_record gets a 0
                # and the record is routed to the problem file, instead of
                # crashing in re.sub() on None.
                title = ''

            title = re.sub(regex, " ", title).strip()
            len_record.append(len(title))

            # Get and parse co-authors: PubMed separates them with commas,
            # the rest of the pipeline expects semicolons.
            coauthors = soup.find('div', class_="auths").text if soup.find('div', class_="auths") else ''
            coauthors = re.sub(comma_regex, ";", coauthors).strip()
            coauthors_list = coauthors.split(';')
            fixed_list = list()

            # Fix cases where initials are not separated by . or space
            for person in coauthors_list:
                names = person.strip().split(' ')
                if len(names[-1]) == 2 and 'Jr' not in names[-1]:
                    names[-1] = ". ".join(list(names[-1])) + "."

                person = " ".join(names)
                fixed_list.append(person)

            coauthors = ";".join(fixed_list)

            len_record.append(len(coauthors))

            # Get and parse journal
            journal = soup.find('div', class_="cit").text if soup.find('div', class_="cit") else ''
            len_record.append(len(journal))

            # Check whether the fetched title actually matches the searched
            # one and that no field came back empty.
            if record.upper() not in title.upper() or any(x == 0 for x in len_record):
                citation = record + "|" + author + "|" + coauthors + "|" + title + "|" + journal + "\n\n"
                file_prob.write(citation)

            else:
                print("Normalizing citation")
                citation = author + "|" + coauthors + "|" + title + "|" + journal
                normalize_names(citation, os.path.join(os.getcwd(), "scrapper_citations"))

    return True
16 changes: 16 additions & 0 deletions util/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
def get_list_autor_id(file_name):
'''
Expand All @@ -7,3 +8,18 @@ def get_list_autor_id(file_name):
'''
with open(file_name, 'r') as file_obj:
return [(line[0], line[1]) for line in (line.strip().split(",") for line in file_obj if line.strip())]


def get_titles(file_name):
    """Read file_name and return its non-blank lines, stripped, as a list.

    :param file_name: path to a text file with one article title per line
    :return: list of title strings
    """
    titles = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                titles.append(cleaned)
    return titles

def ensure_dir(dir_path):
    """Create dir_path (and any missing parents) if it does not exist.

    :param dir_path: path of the directory to guarantee
    :return: True
    """
    if not os.path.exists(dir_path):
        print("Creating", dir_path)
        # exist_ok avoids a crash if another process creates the directory
        # between the exists() check above and this call (TOCTOU race).
        os.makedirs(dir_path, exist_ok=True)

    return True

0 comments on commit d0538f4

Please sign in to comment.