-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Scrapper.py now creates one file per author
Pubmed_citation.py fetches PubMed Record and updates scrapper_citations files
- Loading branch information
DaveMorais
committed
Nov 10, 2018
1 parent
8a6d5ab
commit d0538f4
Showing
9 changed files
with
218 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,3 +113,4 @@ erro.txt | |
author.* | ||
.idea/* | ||
runs/* | ||
scrapper_citations/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from util.pubmedcrawler import get_pmid | ||
from util.util import get_titles, ensure_dir | ||
import sys | ||
import os | ||
from subprocess import call | ||
|
||
|
||
def main():
    '''
    Command-line entry point: look up the article titles of one author on PubMed.

    Expects exactly two arguments, ``file_name`` and ``author_name``. Prints the
    usage text and exits when the argument count is wrong or the first argument
    is ``-h``.

    The titles are read from ``file_name`` with ``get_titles`` and handed to
    ``get_pmid`` together with the author name and the ``pubmed_problems``
    directory located under the current working directory.
    '''
    # The length test comes first so sys.argv[1] is never read when it is missing.
    if (len(sys.argv) != 3) or sys.argv[1] == "-h":
        print("\nThis script fetches citations from pubmed base on "
              "its title and parses the output\nUsage\n\npython " + sys.argv[0] + " file_name author_name\n\n")
        sys.exit()

    # Output directory passed to get_pmid; presumably ensure_dir creates it when
    # it does not exist yet (helper is defined in util.util -- confirm there).
    dir_path = os.path.join(os.getcwd(), "pubmed_problems")
    ensure_dir(dir_path)

    file_name = sys.argv[1]
    author = sys.argv[2]

    titles = get_titles(file_name)
    get_pmid(titles, author, dir_path)

    # NOTE: a post-processing pass that ran Fix_problems_with_citation.py on every
    # '*_pubmed.txt' file found under dir_path used to follow here; it is disabled.


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# Each field is tab separated | ||
# Remove the commented lines First | ||
Author_name id | ||
Author_name id |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import re | ||
import sys | ||
import os | ||
from util.crawler import normalize_names | ||
|
||
|
||
def _expand_initials(coauthors):
    '''
    Normalise a raw co-author string into a ';'-separated list.

    Commas are treated as separators as well. When the last token of a name is
    exactly two characters long and is not 'Jr', it is taken to be a pair of
    fused initials and rewritten as e.g. 'AB' -> 'A. B.'.

    :param coauthors: (str) raw author text scraped from the page
    :return: (str) ';'-joined names with fused initials expanded
    '''
    fixed_list = list()

    for person in coauthors.replace(',', ';').strip().split(';'):
        names = person.strip().split(' ')
        if len(names[-1]) == 2 and 'Jr' not in names[-1]:
            names[-1] = ". ".join(list(names[-1])) + "."
        fixed_list.append(" ".join(names))

    return ";".join(fixed_list)


def _tag_text(soup, name, css_class):
    '''
    Return the text of the first ``name`` tag with class ``css_class``, or ''.
    '''
    tag = soup.find(name, class_=css_class)
    return tag.text if tag else ''


def get_pmid(references, author, dir_path):
    '''
    This function fetches a full citation from NCBI pubmed.

    Every title in ``references`` is sent to the PubMed search page. From the
    returned HTML the title, the co-authors (div class "auths") and the journal
    (div class "cit") are extracted.

    A record is appended to ``<dir_path>/<author with '_' for spaces>_pubmed_error.txt``
    when the searched title is not contained (case-insensitively) in the title
    found on the page, or when any of the three extracted fields is empty.
    Otherwise the citation ``author|coauthors|title|journal`` is passed to
    ``normalize_names`` together with the ``scrapper_citations`` directory under
    the current working directory.

    :param references: (list) article titles
    :param author: (str) full author name
    :param dir_path: (str) path to output dir
    :return: True
    '''

    regex = re.compile(" - PubMed - NCBI")

    full_name = author.split(' ')
    error_path = os.path.join(dir_path, "_".join(full_name) + "_pubmed_error.txt")

    # The context manager guarantees the error file is closed even when a
    # request or the parsing of one record raises.
    with open(error_path, 'a') as file_prob:

        for record in references:

            # Let requests build the query string so that characters such as
            # '&', '#' or '+' inside a title are percent-encoded instead of
            # being interpreted as URL syntax.
            data = requests.get('https://www.ncbi.nlm.nih.gov/pubmed/', params={'term': record})
            soup = BeautifulSoup(data.text, 'html.parser')

            # Prefer the title in the <p class="title"> tag, which may differ
            # from the search term; otherwise use the page <title>. When
            # neither exists fall back to '' so the record is reported as a
            # problem instead of crashing in re.sub.
            title_tag = soup.find('p', class_="title") or soup.find("title")
            title = title_tag.text if title_tag else ''
            title = re.sub(regex, " ", title).strip()

            # Get and parse co-authors
            coauthors = _expand_initials(_tag_text(soup, 'div', 'auths'))

            # Get and parse journal
            journal = _tag_text(soup, 'div', 'cit')

            has_empty_field = not (title and coauthors and journal)

            # Check if the title is actually right
            if record.upper() not in title.upper() or has_empty_field:
                citation = record + "|" + author + "|" + coauthors + "|" + title + "|" + journal + "\n\n"
                file_prob.write(citation)

            else:
                print("Normalizing citation")
                citation = author + "|" + coauthors + "|" + title + "|" + journal
                normalize_names(citation, os.path.join(os.getcwd(), "scrapper_citations"))

    return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters