-
Notifications
You must be signed in to change notification settings - Fork 0
/
parserACM.py
42 lines (34 loc) · 1.05 KB
/
parserACM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""
This is the specific Parser implementation for the ACM Digital Library.
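URI spec:
    URL of an ACM Digital Library page. read() appends "preflayout=flat"
    to the query string when it is not already present, so that the
    references are included in the response.
File spec:
    Path to a local file, used as the input when no URI is set.
Known bugs:
    This parser is incomplete: process() relies on ACMReferencesParser,
    which is not defined or imported in this module.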
"""
#HTML Parsing: see http://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup as BS
#simple HTTP library: see http://docs.python-requests.org/en/latest/
import requests
import re
from paper import Paper
from parser import Parser
class ACMParser(Parser):
    """Parser for ACM Digital Library pages.

    Reads its input from ``self.uri`` or, failing that, ``self.myFile``.
    Both attributes are presumably set up by the ``Parser`` base class --
    TODO confirm against ``parser.Parser``.
    """

    def read(self):
        """Load the raw page content into ``self.text``.

        When ``self.uri`` is set, the page is fetched over HTTP after making
        sure the ``preflayout=flat`` query parameter is present
        (``self.uri`` is updated in place).  Otherwise, when ``self.myFile``
        is set, the content of that file is read.

        Raises:
            Exception: if neither ``self.uri`` nor ``self.myFile`` is set.
        """
        if self.uri:
            # use the flat layout to make sure the references are received
            if "preflayout=flat" not in self.uri:
                if "?" not in self.uri:
                    # No query string yet: start one.
                    self.uri += "?"
                elif not self.uri.endswith(("?", "&")):
                    # Existing query parameters: add a separator.
                    self.uri += "&"
                self.uri += "preflayout=flat"
            resp = requests.get(self.uri)
            # Keep the result on the instance; process() reads self.text.
            self.text = resp.text
        elif self.myFile:
            # Read the content (not the file object) and close the handle.
            with open(self.myFile) as handle:
                self.text = handle.read()
        else:
            raise Exception("invalid state!")

    def process(self):
        """Build ``self.paper`` from the text loaded by ``read()``.

        Raises:
            Exception: if no text has been loaded yet (``read()`` was not
                called, or it produced empty content).
        """
        # getattr: report the intended "Invalid state!" error even when
        # read() was never called and the attribute does not exist.
        if not getattr(self, "text", None):
            raise Exception("Invalid state!")
        # NOTE(review): ACMReferencesParser is not defined or imported in
        # the visible part of this file -- confirm where it should come
        # from; as written this line raises NameError.
        self.paper = ACMReferencesParser(self.text)