diff --git a/.travis.yml b/.travis.yml index b0ebc5ed..b542c481 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ env: before_install: # work around https://github.com/travis-ci/travis-ci/issues/8363 - - pyenv global system 3.5 + - pyenv global system 3.6 install: - travis_retry pip install -U pip wheel tox diff --git a/Makefile b/Makefile index 0a28f375..3daf2d1d 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,9 @@ # Makefile to help automate tasks WD := $(shell pwd) -PY := .env/bin/python -PIP := .env/bin/pip -PEP8 := .env/bin/pep8 -NOSE := .env/bin/nosetests - +PY := .venv/bin/python +PIP := .venv/bin/pip +PEP8 := .venv/bin/pep8 +NOSE := .venv/bin/nosetests # ########### # Tests rule! @@ -22,16 +21,17 @@ $(NOSE): .PHONY: all all: venv develop -venv: bin/python -bin/python: - virtualenv .env +venv: .venv/bin/python + +.venv/bin/python: + virtualenv .venv .PHONY: clean_venv clean_venv: - rm -rf .env + rm -rf .venv -develop: .env/lib/python*/site-packages/readability-lxml.egg-link -.env/lib/python*/site-packages/readability-lxml.egg-link: +develop: .venv/lib/python*/site-packages/readability-lxml.egg-link +.venv/lib/python*/site-packages/readability-lxml.egg-link: $(PY) setup.py develop diff --git a/README.rst b/README.rst index 51eac4af..518c7553 100644 --- a/README.rst +++ b/README.rst @@ -35,13 +35,15 @@ Usage Change Log ---------- -- 0.3 Added Document.encoding, positive\_keywords and - negative\_keywords -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and - 3.4 +- 0.7 Improved HTML5 tags handling. Heuristics were changed for a lot of sites: fixed an important + bug with stripping unwanted HTML nodes (only the first matching node was removed before).
- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and + 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and + negative\_keywords Licensing ========= diff --git a/readability/readability.py b/readability/readability.py index 90fbc138..aff79e2f 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -182,10 +182,10 @@ def summary(self, html_partial=False): if ruthless: self.remove_unlikely_candidates() self.transform_misused_divs_into_paragraphs() + candidates = self.score_paragraphs() best_candidate = self.select_best_candidate(candidates) - if best_candidate: article = self.get_article(candidates, best_candidate, html_partial=html_partial) @@ -381,13 +381,13 @@ def class_weight(self, e): def score_node(self, elem): content_score = self.class_weight(elem) name = elem.tag.lower() - if name == "div": + if name in ["div", "article"]: content_score += 5 elif name in ["pre", "td", "blockquote"]: content_score += 3 - elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]: + elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]: content_score -= 3 - elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]: + elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th", "header", "footer", "nav"]: content_score -= 5 return { 'content_score': content_score, @@ -400,8 +400,10 @@ def remove_unlikely_candidates(self): if len(s) < 2: continue if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: + #print("Removing", describe(elem)) log.debug("Removing unlikely candidate - %s" % describe(elem)) elem.drop_tree() + #print("After removal: {}".format(tostring(self.html))) def transform_misused_divs_into_paragraphs(self): for elem in self.tags(self.html, 'div'): @@ -463,7 +465,7 @@ def 
sanitize(self, node, candidates): allowed = {} # Conditionally clean s,