Skip to content

Commit

Permalink
Scrapper.py now creates one file per author
Browse files Browse the repository at this point in the history
Pubmed_citation.py fetches PubMed Record and updates scrapper_citations files
  • Loading branch information
DaveMorais committed Nov 10, 2018
1 parent 8a6d5ab commit d0538f4
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 24 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,4 @@ erro.txt
author.*
.idea/*
runs/*
scrapper_citations/*
37 changes: 26 additions & 11 deletions Alternative_citation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import sys
import os


def ensure_dir(dir_path):
    """Create dir_path (and any missing parents) if it does not exist.

    :param dir_path: path of the directory to guarantee
    """
    if not os.path.exists(dir_path):
        print("Creating", dir_path)
        # exist_ok guards against the race where another process creates the
        # directory between the exists() check above and this call.
        os.makedirs(dir_path, exist_ok=True)

def create_list_of_citations(file_name):
return True


def create_list_of_citations(file_name, dir_path, author):
citations = dict()
with open(file_name, 'r') as f:
for line in f:
Expand All @@ -23,25 +30,33 @@ def create_list_of_citations(file_name):
else:
citations[last_names[0]] = {name}

with open(os.path.join(dir_path, author + "_Alternative_citation.txt"), 'a') as f:
for k, v in citations.items():


for k,v in citations.items():

# Print only if there is an alternative citation
if len(v) > 1:
print(k + ' : '+ "; ".join(v))
# Print only if there is an alternative citation
if len(v) > 1:
text = k + ' : ' + '; '.join(v)
f.write(text)
f.write('\n\n')


def main():
    """CLI entry point: read a Scrapper.py output file and write the
    per-author alternative-citation file under ./alternative_citations.

    Exits with a usage message unless exactly one argument (the input
    file) is given.
    """
    if (len(sys.argv) != 2) or sys.argv[1] == "-h":
        # NOTE: a space is required at the end of "writes to " — the two
        # adjacent string literals are concatenated by the parser.
        print("Usage:\npython " + sys.argv[0] + " <File created by Scrapper.py>\n"
              "The script now creates a dir from the CWD and writes to "
              "files named after the author.")
        sys.exit()

    # All per-author output files live under ./alternative_citations.
    dir_path = os.path.join(os.getcwd(), "alternative_citations")
    ensure_dir(dir_path)

    file_name = sys.argv[1]
    # The author name is everything before the first "." of the input
    # file's base name (Scrapper.py names files "First_Last_citations.txt").
    author = os.path.basename(file_name).split('.')[0]

    create_list_of_citations(file_name, dir_path, author)


if __name__ == '__main__':
    main()
34 changes: 34 additions & 0 deletions Pubmed_citation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from util.pubmedcrawler import get_pmid
from util.util import get_titles, ensure_dir
import sys
import os
from subprocess import call


def main():
    """CLI entry point: fetch PubMed records for a list of titles and
    normalize the matching citations for one author.

    argv[1] -- file containing one article title per line
    argv[2] -- full author name (quoted, since it contains spaces)

    Unmatched titles are appended to ./pubmed_problems/<author>_pubmed_error.txt.
    """
    if (len(sys.argv) != 3) or sys.argv[1] == "-h":
        print("\nThis script fetches citations from pubmed based on "
              "their titles and parses the output\nUsage\n\npython " + sys.argv[0] + " file_name author_name\n\n")
        sys.exit()

    # Titles that could not be matched on PubMed end up under ./pubmed_problems.
    dir_path = os.path.join(os.getcwd(), "pubmed_problems")
    ensure_dir(dir_path)

    file_name = sys.argv[1]
    author = sys.argv[2]

    titles = get_titles(file_name)
    get_pmid(titles, author, dir_path)

    # TODO(review): traverse pubmed_problems and call
    # Fix_problems_with_citation.py on every *_pubmed.txt file to put those
    # citations in the right format:
    # for _, _, file_list in os.walk(dir_path):
    #     for fname in file_list:
    #         if '_pubmed.txt' in fname:
    #             print('Normalizing Pubmed Files')
    #             call(['python', os.path.join(os.getcwd(), 'Fix_problems_with_citation.py'), os.path.join(dir_path, fname)])


if __name__ == '__main__':
    main()
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,17 @@ python Alternative_citation.py <File created by Scapper.py>
```

* Pubmed_citation.py - This script fetches PubMed records based on their titles and parses them.

```
python Pubmed_citation.py file_name "author_name"
# File_name contains a list of article titles retrieved from the citations_with_problems file
# Author_name is the full author name as a string (surrounded by double quotes)
# Notes: The script updates the record in scrapper_citations/author_name.txt and creates a
# pubmed_problems/author_pubmed_error.txt
```
18 changes: 16 additions & 2 deletions Scrapper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import argparse
from util.util import get_list_autor_id
from util.util import get_list_autor_id, ensure_dir
from util.crawler import get_publication
import sys
import os






def main():
Expand All @@ -16,10 +21,19 @@ def main():
help="Given a list of Lattes id extract the list of publications of an CV",
required=False)

parser.add_argument('-d', '--dir', action='store_true',
default=os.getcwd(),
help="A path to the output of scrapper. If none provide use current working dir ",
required=False)

args = parser.parse_args()
if args.pub:

dir_path = os.path.join(args.dir,"scrapper_citations")
ensure_dir(dir_path)

list_authors = get_list_autor_id(args.file)
get_publication(list_authors)
get_publication(list_authors,dir_path)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion list_author.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Each field is tab separated
# Remove the commented lines First
Author_name id
Author_name id
39 changes: 29 additions & 10 deletions util/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
import re
import sys
import time
import os
from util.util import ensure_dir


def get_publication(list_authors):
def get_publication(list_authors, dir_path):
'''
Fetch Publications from url
:param dir_path: A path to a directory where scrapper will output the citations
:param list_authors: list of tuples (name, id)
:return: True
'''
Expand All @@ -27,7 +29,11 @@ def get_publication(list_authors):
unwanted_ano = div_layout5.find('span', {'data-tipo-ordenacao': 'ano'})
unwanted_ano.extract()

_splitter(record[0], div_layout5.getText(separator=u' ').strip())

pub = _splitter(record[0], div_layout5.getText(separator=u' ').strip())

if isinstance(pub, str):
normalize_names(pub,dir_path)

time.sleep(30)
return True
Expand All @@ -39,7 +45,7 @@ def _splitter(author, pub):
:param author: full author's name string
:param pub: publication record string
:return: True
:return: a string with author and publication records
'''
record = pub.strip().split(' . ')

Expand All @@ -48,17 +54,18 @@ def _splitter(author, pub):
return True

title = record[1].split('.')
normalize_names(author + "|" + record[0] + "|" + title[0] + "| " + "".join(title[1:]))
pub = author + "|" + record[0] + "|" + title[0] + "| " + "".join(title[1:])
return pub

return True


def normalize_names(pub):
def normalize_names(pub,dir_path):
'''
Remove unwanted char from names and standardize the citation
:param pub: string in the format specified by the function splitter
:return:
:param dir_path: A path to a directory where scrapper will output the citation
:return: True
'''

# Some publication contain a new line within. It needs to be removed
Expand All @@ -76,9 +83,11 @@ def normalize_names(pub):
r'\s+DI$'
r'\s+DO$',
r'\s+E\s+',
r'JR',
r'\d+'

)

regex = re.compile("|".join(regexes), re.IGNORECASE)
record = pub.strip().split("|")

Expand Down Expand Up @@ -112,7 +121,13 @@ def normalize_names(pub):
authors[i] = " ".join(names)

record[1] = "; ".join(authors)
print("|".join(record), end="\n\n")
citation = "|".join(record)


full_name = record[0].split(' ')
with open( os.path.join(dir_path,"_".join(full_name) + "_citations.txt"), 'a') as f:
f.write(citation)
f.write("\n\n")

return True

Expand Down Expand Up @@ -174,8 +189,12 @@ def _create_initials(author_name):


def _print_problems(author, pub):
    '''
    Append a citation that could not be parsed to a per-author file under
    ./citations_with_problems so it can be inspected and fixed later.
    :param author: full author's name string
    :param pub: raw publication record string
    '''
    dir_path = os.path.join(os.getcwd(),"citations_with_problems")
    ensure_dir(dir_path)

    # File name is the author's full name with spaces replaced by underscores.
    full_name = author.split(' ')
    with open( os.path.join(dir_path,"_".join(full_name) + "_citations.txt"), 'a') as f:
        line = author + '|' + pub
        f.write(line)
        # Blank line separates records in the output file.
        f.write("\n\n")
Expand Down
82 changes: 82 additions & 0 deletions util/pubmedcrawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import requests
from bs4 import BeautifulSoup
import re
import sys
import os
from util.crawler import normalize_names


def get_pmid(references, author, dir_path):
    '''
    Fetch a full citation for each title from NCBI PubMed.

    Titles that cannot be confidently matched (fetched title does not contain
    the searched title, or any of title/co-authors/journal came back empty)
    are appended to <dir_path>/<Author_Name>_pubmed_error.txt; good matches
    are handed to normalize_names(), which appends them to the author's file
    under ./scrapper_citations.

    :param references: (list) article titles to search for
    :param author: (str) full author name
    :param dir_path: (str) path to the output dir for problem records
    :return: True
    '''
    regex = re.compile(" - PubMed - NCBI")
    comma_regex = re.compile(',')

    full_name = author.split(' ')
    # "with" guarantees the problem file is closed even if a request raises.
    with open(os.path.join(dir_path, "_".join(full_name) + "_pubmed_error.txt"), 'a') as file_prob:

        for record in references:

            len_record = list()
            data = requests.get('https://www.ncbi.nlm.nih.gov/pubmed/?term=' + record)
            soup = BeautifulSoup(data.text, 'html.parser')

            # Get and parse the title.  Prefer the result's <p class="title">
            # (an alternative title that is not the same as the search term);
            # otherwise fall back to the page's real full <title>.
            if soup.find('p', class_="title"):
                title = soup.find('p', class_="title").text
            elif soup.find("title"):
                title = soup.find("title").text
            else:
                # No title at all: use an empty string so len_record gets a 0
                # and the record is routed to the problem file, instead of
                # crashing in re.sub() on None.
                title = ''

            title = re.sub(regex, " ", title).strip()
            len_record.append(len(title))

            # Get and parse co-authors: PubMed separates them with commas,
            # the rest of the pipeline expects semicolons.
            coauthors = soup.find('div', class_="auths").text if soup.find('div', class_="auths") else ''
            coauthors = re.sub(comma_regex, ";", coauthors).strip()
            coauthors_list = coauthors.split(';')
            fixed_list = list()

            # Fix cases where initials are not separated by . or space
            for person in coauthors_list:
                names = person.strip().split(' ')
                if len(names[-1]) == 2 and 'Jr' not in names[-1]:
                    names[-1] = ". ".join(list(names[-1])) + "."

                person = " ".join(names)
                fixed_list.append(person)

            coauthors = ";".join(fixed_list)

            len_record.append(len(coauthors))

            # Get and parse journal
            journal = soup.find('div', class_="cit").text if soup.find('div', class_="cit") else ''
            len_record.append(len(journal))

            # Check whether the fetched title actually matches the searched
            # one and that no field came back empty.
            if record.upper() not in title.upper() or any(x == 0 for x in len_record):
                citation = record + "|" + author + "|" + coauthors + "|" + title + "|" + journal + "\n\n"
                file_prob.write(citation)

            else:
                print("Normalizing citation")
                citation = author + "|" + coauthors + "|" + title + "|" + journal
                normalize_names(citation, os.path.join(os.getcwd(), "scrapper_citations"))

    return True
16 changes: 16 additions & 0 deletions util/util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
def get_list_autor_id(file_name):
'''
Expand All @@ -7,3 +8,18 @@ def get_list_autor_id(file_name):
'''
with open(file_name, 'r') as file_obj:
return [(line[0], line[1]) for line in (line.strip().split(",") for line in file_obj if line.strip())]


def get_titles(file_name):
    """Read file_name and return its non-blank lines, stripped, as a list.

    :param file_name: path to a text file with one article title per line
    :return: list of title strings
    """
    titles = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                titles.append(cleaned)
    return titles

def ensure_dir(dir_path):
    """Create dir_path (and any missing parents) if it does not exist.

    :param dir_path: path of the directory to guarantee
    :return: True
    """
    if not os.path.exists(dir_path):
        print("Creating", dir_path)
        # exist_ok avoids a crash if another process creates the directory
        # between the exists() check above and this call (TOCTOU race).
        os.makedirs(dir_path, exist_ok=True)

    return True

0 comments on commit d0538f4

Please sign in to comment.